From 5878c9a31a899676c26358be1f2f3f3109e1b169 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 31 Oct 2022 12:59:42 +0800 Subject: [PATCH 01/42] fix: impl ring log buffer --- include/libs/sync/sync.h | 6 +- include/libs/wal/wal.h | 2 +- include/util/tdef.h | 1 + source/dnode/mnode/impl/src/mndSync.c | 5 +- source/dnode/vnode/src/inc/vnd.h | 2 +- source/dnode/vnode/src/tsdb/tsdbUtil.c | 4 +- source/dnode/vnode/src/vnd/vnodeSync.c | 8 +- source/libs/sync/inc/syncInt.h | 48 +- source/libs/sync/inc/syncRaftEntry.h | 2 +- source/libs/sync/inc/syncReplication.h | 4 +- source/libs/sync/src/syncAppendEntries.c | 579 +++++++++++++++++- source/libs/sync/src/syncAppendEntriesReply.c | 123 +++- source/libs/sync/src/syncCommit.c | 115 +++- source/libs/sync/src/syncMain.c | 365 ++++++++++- source/libs/sync/src/syncRaftEntry.c | 8 +- source/libs/sync/src/syncRaftLog.c | 17 +- source/libs/sync/src/syncReplication.c | 21 +- source/libs/sync/src/syncSnapshot.c | 2 +- source/libs/tdb/src/db/tdbBtree.c | 3 +- source/libs/wal/src/walWrite.c | 9 +- 20 files changed, 1230 insertions(+), 94 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index ff14e637d0..7ed8414906 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -45,7 +45,7 @@ extern bool gRaftDetailLog; #define SYNC_MAX_BATCH_SIZE 1 #define SYNC_INDEX_BEGIN 0 #define SYNC_INDEX_INVALID -1 -#define SYNC_TERM_INVALID 0xFFFFFFFFFFFFFFFF +#define SYNC_TERM_INVALID -1 // 0xFFFFFFFFFFFFFFFF typedef enum { SYNC_STRATEGY_NO_SNAPSHOT = 0, @@ -56,7 +56,7 @@ typedef enum { typedef uint64_t SyncNodeId; typedef int32_t SyncGroupId; typedef int64_t SyncIndex; -typedef uint64_t SyncTerm; +typedef int64_t SyncTerm; typedef struct SSyncNode SSyncNode; typedef struct SSyncBuffer SSyncBuffer; @@ -201,7 +201,7 @@ typedef struct SSyncInfo { int32_t syncInit(); void syncCleanUp(); int64_t syncOpen(SSyncInfo* pSyncInfo); -void syncStart(int64_t rid); +int32_t syncStart(int64_t rid); void 
syncStop(int64_t rid); int32_t syncSetStandby(int64_t rid); ESyncState syncGetMyRole(int64_t rid); diff --git a/include/libs/wal/wal.h b/include/libs/wal/wal.h index adf244e32a..e908ae6d88 100644 --- a/include/libs/wal/wal.h +++ b/include/libs/wal/wal.h @@ -170,7 +170,7 @@ int32_t walWriteWithSyncInfo(SWal *, int64_t index, tmsg_t msgType, SWalSyncInfo // Assign version automatically and return to caller, // -1 will be returned for failed writes -int64_t walAppendLog(SWal *, tmsg_t msgType, SWalSyncInfo syncMeta, const void *body, int32_t bodyLen); +int64_t walAppendLog(SWal *, int64_t index, tmsg_t msgType, SWalSyncInfo syncMeta, const void *body, int32_t bodyLen); void walFsync(SWal *, bool force); diff --git a/include/util/tdef.h b/include/util/tdef.h index 936fbdf0d5..0b7f9b28fa 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -281,6 +281,7 @@ typedef enum ELogicConditionType { #define TSDB_DNODE_ROLE_VNODE 2 #define TSDB_MAX_REPLICA 5 +#define TSDB_SYNC_LOG_BUFFER_SIZE 500 #define TSDB_TBNAME_COLUMN_INDEX (-1) #define TSDB_MULTI_TABLEMETA_MAX_NUM 100000 // maximum batch size allowed to load table meta diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index cd6fe380e1..74f9d8bbdb 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -309,8 +309,11 @@ int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) { void mndSyncStart(SMnode *pMnode) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; + if (syncStart(pMgmt->sync) < 0) { + mError("vgId:1, failed to start sync subsystem"); + return; + } syncSetMsgCb(pMgmt->sync, &pMnode->msgCb); - syncStart(pMgmt->sync); mInfo("vgId:1, sync started, id:%" PRId64, pMgmt->sync); } diff --git a/source/dnode/vnode/src/inc/vnd.h b/source/dnode/vnode/src/inc/vnd.h index 988ecc5dd3..fe23087c25 100644 --- a/source/dnode/vnode/src/inc/vnd.h +++ b/source/dnode/vnode/src/inc/vnd.h @@ -97,7 +97,7 @@ bool vnodeShouldRollback(SVnode* 
pVnode); // vnodeSync.c int32_t vnodeSyncOpen(SVnode* pVnode, char* path); -void vnodeSyncStart(SVnode* pVnode); +int32_t vnodeSyncStart(SVnode* pVnode); void vnodeSyncClose(SVnode* pVnode); void vnodeRedirectRpcMsg(SVnode* pVnode, SRpcMsg* pMsg); bool vnodeIsLeader(SVnode* pVnode); diff --git a/source/dnode/vnode/src/tsdb/tsdbUtil.c b/source/dnode/vnode/src/tsdb/tsdbUtil.c index 4e02a28cdf..a2f3ca2911 100644 --- a/source/dnode/vnode/src/tsdb/tsdbUtil.c +++ b/source/dnode/vnode/src/tsdb/tsdbUtil.c @@ -714,7 +714,8 @@ int32_t tRowMergerAdd(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) { taosArraySet(pMerger->pArray, iCol, pColVal); } } else { - ASSERT(0); + // ASSERT(0); + tsdbError("dup key accounted: key version:%" PRId64 ", merger version:%" PRId64, key.version, pMerger->version); } } @@ -888,7 +889,6 @@ int32_t tsdbBuildDeleteSkyline(SArray *aDelData, int32_t sidx, int32_t eidx, SAr code = TSDB_CODE_OUT_OF_MEMORY; goto _clear; } - midx = (sidx + eidx) / 2; code = tsdbBuildDeleteSkyline(aDelData, sidx, midx, aSkyline1); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 1863203f4a..c3ccbddc53 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -665,9 +665,13 @@ int32_t vnodeSyncOpen(SVnode *pVnode, char *path) { return 0; } -void vnodeSyncStart(SVnode *pVnode) { +int32_t vnodeSyncStart(SVnode *pVnode) { + if (syncStart(pVnode->sync) < 0) { + vError("vgId:%d, failed to start sync subsystem since %s", pVnode->config.vgId, terrstr()); + return -1; + } syncSetMsgCb(pVnode->sync, &pVnode->msgCb); - syncStart(pVnode->sync); + return 0; } void vnodeSyncClose(SVnode *pVnode) { syncStop(pVnode->sync); } diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index a158430a0f..bd97187ce7 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -88,6 +88,44 @@ typedef struct SPeerState { int64_t lastSendTime; } SPeerState; 
+typedef struct SSyncLogBufEntry { + SSyncRaftEntry* pItem; + SyncIndex prevLogIndex; + SyncTerm prevLogTerm; +} SSyncLogBufEntry; + +typedef struct SSyncLogBuffer { + SSyncLogBufEntry entries[TSDB_SYNC_LOG_BUFFER_SIZE]; + int64_t startIndex; + int64_t commitIndex; + int64_t matchIndex; + int64_t endIndex; + int64_t size; + TdThreadMutex mutex; +} SSyncLogBuffer; + +SSyncLogBuffer* syncLogBufferCreate(); +void syncLogBufferDestroy(SSyncLogBuffer* pBuf); +int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); + +// access +int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); +int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry); +int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm); +int64_t syncLogBufferLoad(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex); +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); +int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex); + +int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); +SyncAppendEntries* syncLogToAppendEntries(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index); + +// private +int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); +int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); +int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index); +void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); +bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); + typedef struct SSyncNode { // init by SSyncInfo SyncGroupId vgId; @@ -97,6 +135,7 @@ typedef struct SSyncNode { char configPath[TSDB_FILENAME_LEN * 2]; // sync io + SSyncLogBuffer* pLogBuf; SWal* pWal; const SMsgCb* msgcb; int32_t (*FpSendMsg)(const SEpSet* pEpSet, SRpcMsg* pMsg); @@ -186,7 +225,7 @@ typedef struct SSyncNode { SSyncRespMgr* pSyncRespMgr; // restore state - 
bool restoreFinish; + _Atomic bool restoreFinish; // SSnapshot* pSnapshot; SSyncSnapshotSender* senders[TSDB_MAX_REPLICA]; SSyncSnapshotReceiver* pNewNodeReceiver; @@ -208,10 +247,11 @@ typedef struct SSyncNode { // open/close -------------- SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo); -void syncNodeStart(SSyncNode* pSyncNode); -void syncNodeStartStandBy(SSyncNode* pSyncNode); +int32_t syncNodeStart(SSyncNode* pSyncNode); +int32_t syncNodeStartStandBy(SSyncNode* pSyncNode); void syncNodeClose(SSyncNode* pSyncNode); int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak); +int32_t syncNodeRestore(SSyncNode* pSyncNode); // option bool syncNodeSnapshotEnable(SSyncNode* pSyncNode); @@ -298,7 +338,7 @@ int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta); int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta); void syncStartNormal(int64_t rid); -void syncStartStandBy(int64_t rid); +int32_t syncStartStandBy(int64_t rid); bool syncNodeCanChange(SSyncNode* pSyncNode); bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pNewCfg); diff --git a/source/libs/sync/inc/syncRaftEntry.h b/source/libs/sync/inc/syncRaftEntry.h index bab1dcc661..75ecd2d2a1 100644 --- a/source/libs/sync/inc/syncRaftEntry.h +++ b/source/libs/sync/inc/syncRaftEntry.h @@ -47,7 +47,7 @@ SSyncRaftEntry* syncEntryBuild2(SyncClientRequest* pMsg, SyncTerm term, SyncInde SSyncRaftEntry* syncEntryBuild3(SyncClientRequest* pMsg, SyncTerm term, SyncIndex index); SSyncRaftEntry* syncEntryBuild4(SRpcMsg* pOriginalMsg, SyncTerm term, SyncIndex index); SSyncRaftEntry* syncEntryBuildNoop(SyncTerm term, SyncIndex index, int32_t vgId); -void syncEntryDestory(SSyncRaftEntry* pEntry); +void syncEntryDestroy(SSyncRaftEntry* pEntry); char* syncEntrySerialize(const SSyncRaftEntry* pEntry, uint32_t* len); // step 5 SSyncRaftEntry* syncEntryDeserialize(const char* buf, uint32_t len); // step 6 cJSON* syncEntry2Json(const 
SSyncRaftEntry* pEntry); diff --git a/source/libs/sync/inc/syncReplication.h b/source/libs/sync/inc/syncReplication.h index 4f15a45cec..1d34a41456 100644 --- a/source/libs/sync/inc/syncReplication.h +++ b/source/libs/sync/inc/syncReplication.h @@ -57,8 +57,8 @@ int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* pDestId, cons int32_t syncNodeReplicate(SSyncNode* pSyncNode); int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId); -int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* pDestId, const SyncAppendEntries* pMsg); -int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* pDestId, const SyncAppendEntries* pMsg); +int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* pDestId, SyncAppendEntries* pMsg); +int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* pDestId, SyncAppendEntries* pMsg); #ifdef __cplusplus } diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 170a57a7a9..9560ea269b 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -118,7 +118,7 @@ static int32_t syncNodeMakeLogSame(SSyncNode* ths, SyncAppendEntries* pMsg) { rpcFreeCont(rpcMsg.pCont); } - syncEntryDestory(pRollBackEntry); + syncEntryDestroy(pRollBackEntry); } } @@ -161,7 +161,7 @@ static int32_t syncNodeDoMakeLogSame(SSyncNode* ths, SyncIndex FromIndex) { rpcFreeCont(rpcMsg.pCont); } - syncEntryDestory(pRollBackEntry); + syncEntryDestroy(pRollBackEntry); } } @@ -308,7 +308,551 @@ int32_t syncNodeFollowerCommit(SSyncNode* ths, SyncIndex newCommitIndex) { return 0; } +SSyncRaftEntry* syncEntryBuildDummy(SyncTerm term, SyncIndex index, int32_t vgId) { + return syncEntryBuildNoop(term, index, vgId); +} + +int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + ASSERT(pNode->pLogStore != NULL && "log store not created"); + ASSERT(pNode->pFsm != NULL && 
"pFsm not registered"); + ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); + + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { + sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); + goto _err; + } + + SyncIndex commitIndex = snapshot.lastApplyIndex; + SyncTerm commitTerm = snapshot.lastApplyTerm; + SyncIndex toIndex = TMAX(lastVer, commitIndex); + + // update match index + pBuf->commitIndex = commitIndex; + pBuf->matchIndex = toIndex; + pBuf->endIndex = toIndex + 1; + + // load log entries in reverse order + SSyncLogStore* pLogStore = pNode->pLogStore; + SyncIndex index = toIndex; + SSyncRaftEntry* pEntry = NULL; + bool takeDummy = false; + + while (true) { + if (index <= pBuf->commitIndex) { + takeDummy = true; + break; + } + + if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. 
index:%" PRId64 "", pNode->vgId, terrstr(), index); + ASSERT(0); + break; + } + + bool taken = false; + if (toIndex <= index + pBuf->size - 1) { + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = -1, .prevLogTerm = -1}; + pBuf->entries[index % pBuf->size] = tmp; + taken = true; + } + + if (index < toIndex) { + pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = pEntry->index; + pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = pEntry->term; + } + + if (!taken) { + syncEntryDestroy(pEntry); + pEntry = NULL; + break; + } + + index--; + } + + // put a dummy record at commitIndex if present in log buffer + if (takeDummy) { + ASSERT(index == pBuf->commitIndex); + + SSyncRaftEntry* pDummy = syncEntryBuildDummy(commitTerm, commitIndex, pNode->vgId); + if (pDummy == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } + SSyncLogBufEntry tmp = {.pItem = pDummy, .prevLogIndex = commitIndex - 1, .prevLogTerm = commitTerm}; + pBuf->entries[(commitIndex + pBuf->size) % pBuf->size] = tmp; + + if (index < toIndex) { + pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = commitIndex; + pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = commitTerm; + } + } + + // update startIndex + pBuf->startIndex = index; + + // validate + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return 0; + +_err: + taosThreadMutexUnlock(&pBuf->mutex); + return -1; +} + +int64_t syncLogBufferLoadOld(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + SSyncLogStore* pLogStore = pNode->pLogStore; + ASSERT(pBuf->startIndex <= pBuf->matchIndex); + ASSERT(pBuf->matchIndex + 1 == pBuf->endIndex); + SyncIndex index = pBuf->endIndex; + SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; + ASSERT(pMatch != NULL); + + while (index - pBuf->startIndex < pBuf->size && index <= toIndex) { + SSyncRaftEntry* pEntry = NULL; + if 
(pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + ASSERT(0); + break; + } + ASSERT(pMatch->index + 1 == pEntry->index); + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; + pBuf->entries[pBuf->endIndex % pBuf->size] = tmp; + + sInfo("vgId:%d, loaded log entry into log buffer. index: %" PRId64 ", term: %" PRId64, pNode->vgId, pEntry->index, + pEntry->term); + + pBuf->matchIndex = index; + pBuf->endIndex = index + 1; + pMatch = pEntry; + index++; + } + + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return index; +} + +int32_t syncLogBufferInitOld(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + ASSERT(pNode->pLogStore != NULL && "log store not created"); + ASSERT(pNode->pFsm != NULL && "pFsm not registered"); + ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); + + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { + sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); + goto _err; + } + SyncIndex commitIndex = snapshot.lastApplyIndex; + SyncTerm commitTerm = snapshot.lastApplyTerm; + + // init log buffer indexes + pBuf->startIndex = commitIndex; + pBuf->matchIndex = commitIndex; + pBuf->commitIndex = commitIndex; + pBuf->endIndex = commitIndex + 1; + + // put a dummy record at initial commitIndex + SSyncRaftEntry* pDummy = syncEntryBuildDummy(commitTerm, commitIndex, pNode->vgId); + if (pDummy == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } + SSyncLogBufEntry tmp = {.pItem = pDummy, .prevLogIndex = commitIndex - 1, .prevLogTerm = commitTerm}; + pBuf->entries[(commitIndex + pBuf->size) % pBuf->size] = tmp; + + taosThreadMutexUnlock(&pBuf->mutex); + return 0; + +_err: + taosThreadMutexUnlock(&pBuf->mutex); + return -1; +} + 
+int32_t syncLogBufferRollbackMatchIndex(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex) { + if (toIndex <= pBuf->commitIndex) { + sError("vgId:%d, cannot rollback across commit index:%" PRId64 ", to index:%" PRId64 "", pNode->vgId, + pBuf->commitIndex, toIndex); + return -1; + } + + pBuf->matchIndex = TMIN(pBuf->matchIndex, toIndex - 1); + + // update my match index + syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); + return 0; +} + +int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + int32_t ret = 0; + SyncIndex index = pEntry->index; + SyncIndex prevIndex = pEntry->index - 1; + if (index <= pBuf->commitIndex || index - pBuf->startIndex > pBuf->size) { + sInfo("vgId:%d, cannot accept index:%" PRId64 " into log buffer. start index: %" PRId64 ", commit index: %" PRId64 + ", end index:%" PRId64 ")", + pNode->vgId, index, pBuf->startIndex, pBuf->commitIndex, pBuf->endIndex); + ret = (index <= pBuf->commitIndex) ? 0 : -1; + goto _out; + } + + // check current in buffer + SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; + if (pExist != NULL) { + ASSERT(pEntry->index == pExist->index); + + if (pEntry->term > pExist->term) { + (void)syncLogBufferRollback(pBuf, index); + } else { + sInfo("vgId:%d, %s raft entry received. index:%" PRId64 ", term: %" PRId64 "", pNode->vgId, + ((pEntry->term < pExist->term) ? "stale" : "duplicate"), pEntry->index, pEntry->term); + SyncTerm existPrevTerm = pBuf->entries[index % pBuf->size].prevLogTerm; + ASSERT(pEntry->term < pExist->term || (pEntry->term == pExist->term && prevTerm == existPrevTerm)); + ret = (pEntry->term < pExist->term) ? 
0 : -1; + goto _out; + } + } + + // update + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = prevIndex, .prevLogTerm = prevTerm}; + pEntry = NULL; + pBuf->entries[index % pBuf->size] = tmp; + + // update end index + pBuf->endIndex = TMAX(index + 1, pBuf->endIndex); + +_out: + syncEntryDestroy(pEntry); + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + +SSyncRaftEntry* syncLogAppendEntriesToRaftEntry(const SyncAppendEntries* pMsg) { + SSyncRaftEntry* pEntry = taosMemoryMalloc(pMsg->dataLen); + if (pEntry == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + (void)memcpy(pEntry, pMsg->data, pMsg->dataLen); + ASSERT(pEntry->bytes == pMsg->dataLen); + return pEntry; +} + +int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { + SyncIndex lastVer = pLogStore->syncLogLastIndex(pLogStore); + if (lastVer >= pEntry->index && pLogStore->syncLogTruncate(pLogStore, pEntry->index) < 0) { + sError("failed to truncate log store since %s. from index:%" PRId64 "", terrstr(), pEntry->index); + return -1; + } + lastVer = pLogStore->syncLogLastIndex(pLogStore); + ASSERT(pEntry->index == lastVer + 1); + + if (pLogStore->syncLogAppendEntry(pLogStore, pEntry) < 0) { + sError("failed to append raft log entry since %s. 
index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index, + pEntry->term); + return -1; + } + return 0; +} + +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + SSyncLogStore* pLogStore = pNode->pLogStore; + int64_t matchIndex = pBuf->matchIndex; + + while (pBuf->matchIndex + 1 < pBuf->endIndex) { + int64_t index = pBuf->matchIndex + 1; + ASSERT(index >= 0); + + // try to proceed + SSyncLogBufEntry* pBufEntry = &pBuf->entries[index % pBuf->size]; + SyncIndex prevLogIndex = pBufEntry->prevLogIndex; + SyncTerm prevLogTerm = pBufEntry->prevLogTerm; + SSyncRaftEntry* pEntry = pBufEntry->pItem; + if (pEntry == NULL) { + sDebug("vgId:%d, cannot proceed match index in log buffer. no raft entry at next pos of matchIndex:%" PRId64, + pNode->vgId, pBuf->matchIndex); + goto _out; + } + + ASSERT(index == pEntry->index); + + // match + SSyncRaftEntry* pMatch = pBuf->entries[(pBuf->matchIndex + pBuf->size) % pBuf->size].pItem; + ASSERT(pMatch != NULL); + ASSERT(pMatch->index == pBuf->matchIndex); + ASSERT(pMatch->index + 1 == pEntry->index); + ASSERT(prevLogIndex == pMatch->index); + + if (pMatch->term != prevLogTerm) { + sError( + "vgId:%d, mismatching raft log entries encountered. " + "{ index:%" PRId64 ", term:%" PRId64 + " } " + "{ index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 ", prevLogTerm:%" PRId64 " } ", + pNode->vgId, pMatch->index, pMatch->term, pEntry->index, pEntry->term, prevLogIndex, prevLogTerm); + goto _out; + } + + // replicate on demand + if (pNode->state == TAOS_SYNC_STATE_LEADER && pNode->replicaNum > 1) { + (void)syncLogBufferReplicate(pBuf, pNode, index); + } + + // persist + if (syncLogStorePersist(pLogStore, pEntry) < 0) { + sError("vgId:%d, failed to persist raft log entry from log buffer since %s. 
index:%" PRId64, pNode->vgId, + terrstr(), pEntry->index); + goto _out; + } + + // increment + pBuf->matchIndex = index; + matchIndex = pBuf->matchIndex; + + // update my match index + syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); + } // end of while + +_out: + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return matchIndex; +} + +int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { + ASSERT(pFsm->FpCommitCb != NULL && "No commit cb registered for the FSM"); + + SRpcMsg rpcMsg; + syncEntry2OriginalRpc(pEntry, &rpcMsg); + + SFsmCbMeta cbMeta = {0}; + cbMeta.index = pEntry->index; + cbMeta.lastConfigIndex = -1; + cbMeta.isWeak = pEntry->isWeak; + cbMeta.code = 0; + cbMeta.state = role; + cbMeta.seqNum = pEntry->seqNum; + cbMeta.term = pEntry->term; + cbMeta.currentTerm = term; + cbMeta.flag = -1; + + pFsm->FpCommitCb(pFsm, &rpcMsg, cbMeta); + return 0; +} + +int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf) { + ASSERT(pBuf->startIndex <= pBuf->matchIndex); + ASSERT(pBuf->commitIndex <= pBuf->matchIndex); + ASSERT(pBuf->matchIndex < pBuf->endIndex); + ASSERT(pBuf->endIndex - pBuf->startIndex <= pBuf->size); + for (SyncIndex index = pBuf->commitIndex; index <= pBuf->matchIndex; index++) { + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + } + return 0; +} + +int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + SSyncLogStore* pLogStore = pNode->pLogStore; + SSyncFSM* pFsm = pNode->pFsm; + ESyncState role = pNode->state; + SyncTerm term = pNode->pRaftStore->currentTerm; + SyncGroupId vgId = pNode->vgId; + int32_t ret = 0; + int64_t upperIndex = TMIN(commitIndex, pBuf->matchIndex); + SSyncRaftEntry* pEntry = NULL; + bool inBuf = false; + + if (commitIndex <= pBuf->commitIndex) { + sDebug("vgId:%d, 
stale commit update. current:%" PRId64 ", notified:%" PRId64 "", vgId, pBuf->commitIndex, + commitIndex); + ret = 0; + goto _out; + } + + sDebug("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". start index:%" PRId64 ", commit index:%" PRId64 + ", match index: %" PRId64 ", end index:%" PRId64 "", + pNode->vgId, role, term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + + // execute in fsm + for (int64_t index = pBuf->commitIndex + 1; index <= upperIndex; index++) { + // get a log entry + if (index >= pBuf->startIndex) { + inBuf = true; + pEntry = pBuf->entries[index % pBuf->size].pItem; + } else { + inBuf = false; + if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + ret = -1; + goto _out; + } + } + + ASSERT(pEntry != NULL); + + // execute it + if (!syncUtilUserCommit(pEntry->originalRpcType)) { + sInfo("vgId:%d, non-user msg in raft log entry. index: %" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, + pEntry->term); + pBuf->commitIndex = index; + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + continue; + } + + if (syncLogFsmExecute(pFsm, role, term, pEntry) != 0) { + sError("vgId:%d, failed to execute raft entry in FSM. 
log index:%" PRId64 ", term:%" PRId64 "", vgId, + pEntry->index, pEntry->term); + ret = -1; + goto _out; + } + pBuf->commitIndex = index; + + sInfo("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, + pEntry->index, pEntry->term, role, term); + + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + } + + // recycle + // TODO: with a grace period of one third of free space before commitIndex in ring buffer + SyncIndex until = pBuf->commitIndex; + for (SyncIndex index = pBuf->startIndex; index < until; index++) { + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + syncEntryDestroy(pEntry); + memset(&pBuf->entries[(index + pBuf->size) % pBuf->size], 0, sizeof(pBuf->entries[0])); + pBuf->startIndex = index + 1; + } + +_out: + // mark as restored if needed + if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { + pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); + pNode->restoreFinish = true; + sInfo("vgId:%d, restore finished. 
commit index:%" PRId64 ", match index:%" PRId64 ", last index:%" PRId64 "", + pNode->vgId, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex - 1); + } + + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { + SyncAppendEntriesReply* pReply = NULL; + // if already drop replica, do not process + if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { + syncLogRecvAppendEntries(ths, pMsg, "not in my config"); + goto _IGNORE; + } + + // prepare response msg + pReply = syncAppendEntriesReplyBuild(ths->vgId); + pReply->srcId = ths->myRaftId; + pReply->destId = pMsg->srcId; + pReply->term = ths->pRaftStore->currentTerm; + pReply->success = false; + pReply->matchIndex = SYNC_INDEX_INVALID; + pReply->lastSendIndex = pMsg->prevLogIndex + 1; + pReply->privateTerm = ths->pNewNodeReceiver->privateTerm; + pReply->startTime = ths->startTime; + + if (pMsg->term < ths->pRaftStore->currentTerm) { + goto _SEND_RESPONSE; + } + + if (pMsg->term > ths->pRaftStore->currentTerm) { + pReply->term = pMsg->term; + } + + syncNodeStepDown(ths, pMsg->term); + syncNodeResetElectTimer(ths); + + // update commit index + (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); + + if (pMsg->dataLen < (int32_t)sizeof(SSyncRaftEntry)) { + sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d", + ths->vgId, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->dataLen); + goto _IGNORE; + } + + SSyncRaftEntry* pEntry = syncLogAppendEntriesToRaftEntry(pMsg); + + if (pEntry == NULL) { + sError("vgId:%d, failed to get raft entry from append entries since %s", ths->vgId, terrstr()); + goto _IGNORE; + } + + if (pMsg->prevLogIndex + 1 != pEntry->index) { + sError("vgId:%d, invalid previous log index in msg. 
index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 + ", prevLogTerm:%" PRId64, + ths->vgId, pEntry->index, pEntry->term, pMsg->prevLogIndex, pMsg->prevLogTerm); + goto _IGNORE; + } + + sInfo("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 + ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", + pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); + + // accept + if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { + sWarn("vgId:%d, failed to accept raft entry into log buffer. index:%" PRId64 ", term:%" PRId64, ths->vgId, + pEntry->index, pEntry->term); + goto _SEND_RESPONSE; + } + pReply->success = true; + +_SEND_RESPONSE: + // update match index + pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + + // ack, i.e. send response + SRpcMsg rpcMsg; + syncAppendEntriesReply2RpcMsg(pReply, &rpcMsg); + (void)syncNodeSendMsgById(&pReply->destId, ths, &rpcMsg); + + // commit index, i.e. 
leader notice me + if (syncLogBufferCommit(ths->pLogBuf, ths, pMsg->commitIndex) < 0) { + sError("vgId:%d, failed to commit raft fsm log since %s.", ths->vgId, terrstr()); + goto _out; + } + +_out: +_IGNORE: + syncAppendEntriesReplyDestroy(pReply); + return 0; +} + +int32_t syncNodeOnAppendEntriesOld(SSyncNode* ths, SyncAppendEntries* pMsg) { // if already drop replica, do not process if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { syncLogRecvAppendEntries(ths, pMsg, "not in my config"); @@ -386,6 +930,8 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { goto _IGNORE; } + ASSERT(pAppendEntry->index == appendIndex); + // append code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry); if (code != 0) { @@ -431,34 +977,11 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { } } -#if 0 - if (code != 0 && terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { - code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex); - ASSERT(code == 0); - - code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry); - ASSERT(code == 0); - - } else { - ASSERT(code == 0); - - if (pLocalEntry->term == pAppendEntry->term) { - // do nothing - } else { - code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex); - ASSERT(code == 0); - - code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry); - ASSERT(code == 0); - } - } -#endif - // update match index pReply->matchIndex = pAppendEntry->index; - syncEntryDestory(pLocalEntry); - syncEntryDestory(pAppendEntry); + syncEntryDestroy(pLocalEntry); + syncEntryDestroy(pAppendEntry); } else { // no append entries, do nothing @@ -489,4 +1012,4 @@ _SEND_RESPONSE: syncAppendEntriesReplyDestroy(pReply); return 0; -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 5e6c9f1534..e37c40455c 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ 
b/source/libs/sync/src/syncAppendEntriesReply.c @@ -84,6 +84,70 @@ static void syncNodeStartSnapshotOnce(SSyncNode* ths, SyncIndex beginIndex, Sync } } +int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex) { + ths->commitIndex = TMAX(commitIndex, ths->commitIndex); + SyncIndex lastVer = ths->pLogStore->syncLogLastIndex(ths->pLogStore); + commitIndex = TMIN(ths->commitIndex, lastVer); + ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, commitIndex); + return commitIndex; +} + +int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { + if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { + SyncIndex commitIndex = indexLikely; + syncNodeUpdateCommitIndex(ths, commitIndex); + sInfo("vgId:%d, agreed upon. role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, + ths->pRaftStore->currentTerm, commitIndex); + } + return ths->commitIndex; +} + +int32_t syncLogBufferCatchingUpReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex fromIndex, SRaftId destId) { + taosThreadMutexLock(&pBuf->mutex); + SyncAppendEntries* pMsgOut = NULL; + SyncIndex index = fromIndex; + + if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum <= 1) { + goto _out; + } + + if (index < pBuf->startIndex) { + sError("vgId:%d, (not implemented yet) replication fromIndex: %" PRId64 + " that is less than pBuf->startIndex: %" PRId64 ". destId: 0x%016" PRId64 "", + pNode->vgId, fromIndex, pBuf->startIndex, destId.addr); + goto _out; + } + + if (index > pBuf->matchIndex) { + goto _out; + } + + do { + pMsgOut = syncLogToAppendEntries(pBuf, pNode, index); + if (pMsgOut == NULL) { + sError("vgId:%d, failed to assembly append entries msg since %s. index: %" PRId64 "", pNode->vgId, terrstr(), + index); + goto _out; + } + + if (syncNodeSendAppendEntries(pNode, &destId, pMsgOut) < 0) { + sWarn("vgId:%d, failed to send append entries msg since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", + pNode->vgId, terrstr(), index, destId.addr); + goto _out; + } + + index += 1; + syncAppendEntriesDestroy(pMsgOut); + pMsgOut = NULL; + } while (false && index <= pBuf->commitIndex); + +_out: + syncAppendEntriesDestroy(pMsgOut); + pMsgOut = NULL; + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMsg) { int32_t ret = 0; @@ -99,6 +163,63 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMs return 0; } + if (ths->state == TAOS_SYNC_STATE_LEADER) { + if (pMsg->term > ths->pRaftStore->currentTerm) { + syncLogRecvAppendEntriesReply(ths, pMsg, "error term"); + syncNodeStepDown(ths, pMsg->term); + return -1; + } + + ASSERT(pMsg->term == ths->pRaftStore->currentTerm); + + sInfo("vgId:%d received append entries reply. srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", + pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); + + if (pMsg->success) { + SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId)); + if (pMsg->matchIndex > oldMatchIndex) { + syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex); + } + + // commit if needed + SyncIndex indexLikely = TMIN(pMsg->matchIndex, ths->pLogBuf->matchIndex); + SyncIndex commitIndex = syncNodeCheckCommitIndex(ths, indexLikely); + (void)syncLogBufferCommit(ths->pLogBuf, ths, commitIndex); + } else { + SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId)); + if (nextIndex > SYNC_INDEX_BEGIN) { + --nextIndex; + } + syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex); + } + + // send next append entries + SPeerState* pState = syncNodeGetPeerState(ths, &(pMsg->srcId)); + ASSERT(pState != NULL); + + if (pMsg->lastSendIndex == pState->lastSendIndex) { + syncNodeReplicateOne(ths, &(pMsg->srcId)); + } + } + + return 0; +} + +int32_t syncNodeOnAppendEntriesReplyOld(SSyncNode* ths, 
SyncAppendEntriesReply* pMsg) { + int32_t ret = 0; + + // if already drop replica, do not process + if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { + syncLogRecvAppendEntriesReply(ths, pMsg, "not in my config"); + return 0; + } + + // drop stale response + if (pMsg->term < ths->pRaftStore->currentTerm) { + syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response"); + return 0; + } + if (ths->state == TAOS_SYNC_STATE_LEADER) { if (pMsg->term > ths->pRaftStore->currentTerm) { syncLogRecvAppendEntriesReply(ths, pMsg, "error term"); @@ -135,4 +256,4 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMs syncLogRecvAppendEntriesReply(ths, pMsg, "process"); return 0; -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index 811a7b8e99..a96fa31f83 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -44,12 +44,100 @@ // IN commitIndex' = [commitIndex EXCEPT ![i] = newCommitIndex] // /\ UNCHANGED <> // + void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { syncNodeErrorLog(pSyncNode, "not leader, can not advance commit index"); return; } + // update commit index + SyncIndex newCommitIndex = pSyncNode->commitIndex; + for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) { + bool agree = syncAgree(pSyncNode, index); + + if (agree) { + // term + SSyncRaftEntry* pEntry = NULL; + SLRUCache* pCache = pSyncNode->pLogStore->pCache; + LRUHandle* h = taosLRUCacheLookup(pCache, &index, sizeof(index)); + if (h) { + pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h); + } else { + int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry); + if (code != 0) { + char logBuf[128]; + snprintf(logBuf, sizeof(logBuf), "advance commit index error, read wal index:%" PRId64, index); + syncNodeErrorLog(pSyncNode, logBuf); + return; 
+ } + } + // cannot commit, even if quorum agree. need check term! + if (pEntry->term <= pSyncNode->pRaftStore->currentTerm) { + // update commit index + newCommitIndex = index; + + if (h) { + taosLRUCacheRelease(pCache, h, false); + } else { + syncEntryDestroy(pEntry); + } + + break; + } else { + do { + char logBuf[128]; + snprintf(logBuf, sizeof(logBuf), "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64, + pEntry->index, pEntry->term); + syncNodeEventLog(pSyncNode, logBuf); + } while (0); + } + + if (h) { + taosLRUCacheRelease(pCache, h, false); + } else { + syncEntryDestroy(pEntry); + } + } + } + + // advance commit index as large as possible + SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore); + if (walCommitVer > newCommitIndex) { + newCommitIndex = walCommitVer; + } + + // maybe execute fsm + if (newCommitIndex > pSyncNode->commitIndex) { + SyncIndex beginIndex = pSyncNode->commitIndex + 1; + SyncIndex endIndex = newCommitIndex; + + // update commit index + pSyncNode->commitIndex = newCommitIndex; + + // call back Wal + pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex); + + // execute fsm + if (pSyncNode->pFsm != NULL) { + int32_t code = syncNodeDoCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state); + if (code != 0) { + char logBuf[128]; + snprintf(logBuf, sizeof(logBuf), "advance commit index error, do commit begin:%" PRId64 ", end:%" PRId64, + beginIndex, endIndex); + syncNodeErrorLog(pSyncNode, logBuf); + return; + } + } + } +} + +void syncMaybeAdvanceCommitIndexOld(SSyncNode* pSyncNode) { + if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { + syncNodeErrorLog(pSyncNode, "not leader, can not advance commit index"); + return; + } + // advance commit index to sanpshot first SSnapshot snapshot; pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot); @@ -93,7 +181,7 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { if (h) { taosLRUCacheRelease(pCache, h, 
false); } else { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } break; @@ -109,7 +197,7 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { if (h) { taosLRUCacheRelease(pCache, h, false); } else { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } } } @@ -245,13 +333,28 @@ bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) { } */ -bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) { +bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index) { + int count = 0; + SSyncIndexMgr* pMatches = pNode->pMatchIndex; + ASSERT(pNode->replicaNum == pMatches->replicaNum); + + for (int i = 0; i < pNode->replicaNum; i++) { + SyncIndex matchIndex = pMatches->index[i]; + if (matchIndex >= index) { + count++; + } + } + + return count >= pNode->quorum; +} + +bool syncAgree(SSyncNode* pNode, SyncIndex index) { int agreeCount = 0; - for (int i = 0; i < pSyncNode->replicaNum; ++i) { - if (syncAgreeIndex(pSyncNode, &(pSyncNode->replicasId[i]), index)) { + for (int i = 0; i < pNode->replicaNum; ++i) { + if (syncAgreeIndex(pNode, &(pNode->replicasId[i]), index)) { ++agreeCount; } - if (agreeCount >= pSyncNode->quorum) { + if (agreeCount >= pNode->quorum) { return true; } } diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 00c4ea76aa..c3267bafdc 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -105,19 +105,32 @@ int64_t syncOpen(SSyncInfo* pSyncInfo) { return pSyncNode->rid; } -void syncStart(int64_t rid) { +int32_t syncStart(int64_t rid) { SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid); if (pSyncNode == NULL) { - return; + sError("failed to acquire rid: %" PRId64 " of tsNodeReftId for pSyncNode", rid); + return -1; + } + + if (syncNodeRestore(pSyncNode) < 0) { + sError("vgId:%d, failed to restore raft log buffer since %s", pSyncNode->vgId, terrstr()); + return -1; } if (pSyncNode->pRaftCfg->isStandBy) { - syncNodeStartStandBy(pSyncNode); + if 
(syncNodeStartStandBy(pSyncNode) < 0) { + sError("vgId:%d, failed to start raft node as standby since %s", pSyncNode->vgId, terrstr()); + return -1; + } } else { - syncNodeStart(pSyncNode); + if (syncNodeStart(pSyncNode) < 0) { + sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr()); + return -1; + } } taosReleaseRef(tsNodeRefId, pSyncNode->rid); + return 0; } void syncStartNormal(int64_t rid) { @@ -130,14 +143,15 @@ void syncStartNormal(int64_t rid) { taosReleaseRef(tsNodeRefId, pSyncNode->rid); } -void syncStartStandBy(int64_t rid) { +int32_t syncStartStandBy(int64_t rid) { SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid); if (pSyncNode == NULL) { - return; + return -1; } syncNodeStartStandBy(pSyncNode); taosReleaseRef(tsNodeRefId, pSyncNode->rid); + return 0; } void syncStop(int64_t rid) { @@ -661,7 +675,7 @@ int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapsho int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry); if (code != 0) { if (pEntry != NULL) { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } taosReleaseRef(tsNodeRefId, pSyncNode->rid); return -1; @@ -673,7 +687,7 @@ int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapsho pSnapshot->lastApplyTerm = pEntry->term; pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index); - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); taosReleaseRef(tsNodeRefId, pSyncNode->rid); return 0; } @@ -1089,6 +1103,38 @@ int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) { return ret; } +SSyncLogBuffer* syncLogBufferCreate() { + SSyncLogBuffer* pBuf = taosMemoryCalloc(1, sizeof(SSyncLogBuffer)); + if (pBuf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + pBuf->size = sizeof(pBuf->entries) / sizeof(pBuf->entries[0]); + + ASSERT(pBuf->size == TSDB_SYNC_LOG_BUFFER_SIZE); + + if (taosThreadMutexInit(&pBuf->mutex, NULL) < 
0) { + sError("failed to init log buffer mutex due to %s", strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + return pBuf; + +_err: + taosMemoryFree(pBuf); + return NULL; +} + +void syncLogBufferDestroy(SSyncLogBuffer* pBuf) { + if (pBuf == NULL) { + return; + } + (void)taosThreadMutexDestroy(&pBuf->mutex); + (void)taosMemoryFree(pBuf); + return; +} + // open/close -------------- SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { SSyncInfo* pSyncInfo = (SSyncInfo*)pOldSyncInfo; @@ -1150,6 +1196,13 @@ SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { pSyncNode->FpEqMsg = pSyncInfo->FpEqMsg; pSyncNode->FpEqCtrlMsg = pSyncInfo->FpEqCtrlMsg; + // create raft log ring buffer + pSyncNode->pLogBuf = syncLogBufferCreate(); + if (pSyncNode->pLogBuf == NULL) { + sError("failed to init log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId); + goto _error; + } + // init raft config pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath); if (pSyncNode->pRaftCfg == NULL) { @@ -1362,6 +1415,12 @@ SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { // snapshotting atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID); + // init log buffer + if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) { + sError("vgId:%d, failed to init raft log buffer since %s", pSyncNode->vgId, terrstr()); + ASSERT(false); + } + syncNodeEventLog(pSyncNode, "sync open"); return pSyncNode; @@ -1387,7 +1446,48 @@ void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) { } } -void syncNodeStart(SSyncNode* pSyncNode) { +int32_t syncNodeRestore(SSyncNode* pSyncNode) { + ASSERT(pSyncNode->pLogStore != NULL && "log store not created"); + ASSERT(pSyncNode->pLogBuf != NULL && "ring log buffer not created"); + + SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); + SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore); + SyncIndex endIndex = pSyncNode->pLogBuf->endIndex; + + commitIndex = 
TMAX(pSyncNode->commitIndex, commitIndex); + + if (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) { + return -1; + } + + if (endIndex <= lastVer) { + sError("vgId:%d, failed to load log entries into log buffers. commit index:%" PRId64 ", lastVer: %" PRId64 "", + pSyncNode->vgId, commitIndex, lastVer); + return -1; + } + + return 0; +} + +int32_t syncNodeStart(SSyncNode* pSyncNode) { + // start raft + if (pSyncNode->replicaNum == 1) { + raftStoreNextTerm(pSyncNode->pRaftStore); + syncNodeBecomeLeader(pSyncNode, "one replica start"); + + // Raft 3.6.2 Committing entries from previous terms + syncNodeAppendNoop(pSyncNode); + } else { + syncNodeBecomeFollower(pSyncNode, "first start"); + } + + int32_t ret = 0; + ret = syncNodeStartPingTimer(pSyncNode); + ASSERT(ret == 0); + return ret; +} + +void syncNodeStartOld(SSyncNode* pSyncNode) { // start raft if (pSyncNode->replicaNum == 1) { raftStoreNextTerm(pSyncNode->pRaftStore); @@ -1406,7 +1506,7 @@ void syncNodeStart(SSyncNode* pSyncNode) { ASSERT(ret == 0); } -void syncNodeStartStandBy(SSyncNode* pSyncNode) { +int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) { // state change pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER; syncNodeStopHeartbeatTimer(pSyncNode); @@ -1419,6 +1519,7 @@ void syncNodeStartStandBy(SSyncNode* pSyncNode) { ret = 0; ret = syncNodeStartPingTimer(pSyncNode); ASSERT(ret == 0); + return ret; } void syncNodeClose(SSyncNode* pSyncNode) { @@ -1443,6 +1544,8 @@ void syncNodeClose(SSyncNode* pSyncNode) { pSyncNode->pMatchIndex = NULL; logStoreDestory(pSyncNode->pLogStore); pSyncNode->pLogStore = NULL; + syncLogBufferDestroy(pSyncNode->pLogBuf); + pSyncNode->pLogBuf = NULL; raftCfgClose(pSyncNode->pRaftCfg); pSyncNode->pRaftCfg = NULL; @@ -2341,6 +2444,43 @@ void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) { void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) { syncRespCleanRsp(pSyncNode->pSyncRespMgr); } +int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex 
toIndex) { + ASSERT(pBuf->commitIndex < toIndex && toIndex <= pBuf->endIndex); + + SyncIndex index = pBuf->endIndex - 1; + while (index >= toIndex) { + SSyncRaftEntry* pEntry = pBuf->entries[index % pBuf->size].pItem; + if (pEntry != NULL) { + syncEntryDestroy(pEntry); + pEntry = NULL; + memset(&pBuf->entries[index % pBuf->size], 0, sizeof(pBuf->entries[0])); + } + index--; + } + pBuf->endIndex = toIndex; + pBuf->matchIndex = TMIN(pBuf->matchIndex, index); + ASSERT(index + 1 == toIndex); + return 0; +} + +int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + ASSERT(lastVer == pBuf->matchIndex); + SyncIndex index = pBuf->endIndex - 1; + + (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); + + sInfo("vgId:%d, reset log buffer. start index: %" PRId64 ", commit index: %" PRId64 ", match Index: %" PRId64 + ", end index: %" PRId64 "", + pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + + pBuf->endIndex = pBuf->matchIndex + 1; + + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) { // maybe clear leader cache if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) { @@ -2365,6 +2505,9 @@ void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) { // min match index pSyncNode->minMatchIndex = SYNC_INDEX_INVALID; + // reset log buffer + syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode); + // trace log do { int32_t debugStrLen = strlen(debugStr); @@ -2403,7 +2546,8 @@ void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) { pSyncNode->leaderTime = taosGetTimestampMs(); // reset restoreFinish - pSyncNode->restoreFinish = false; + // TODO: disable it temporarily + // pSyncNode->restoreFinish = false; // state change pSyncNode->state = TAOS_SYNC_STATE_LEADER; @@ -2467,6 +2611,9 @@ void 
syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) { // min match index pSyncNode->minMatchIndex = SYNC_INDEX_INVALID; + // reset log buffer + syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode); + // trace log do { int32_t debugStrLen = strlen(debugStr); @@ -2490,6 +2637,17 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { syncNodeLog2("==state change syncNodeCandidate2Leader==", pSyncNode); + // Raft 3.6.2 Committing entries from previous terms + syncNodeAppendNoop(pSyncNode); +} + +void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) { + ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE); + ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted)); + syncNodeBecomeLeader(pSyncNode, "candidate to leader"); + + syncNodeLog2("==state change syncNodeCandidate2Leader==", pSyncNode); + // Raft 3.6.2 Committing entries from previous terms syncNodeAppendNoop(pSyncNode); syncMaybeAdvanceCommitIndex(pSyncNode); @@ -2941,7 +3099,46 @@ static int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, return code; } +int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { + // append to log buffer + if (syncLogBufferAppend(ths->pLogBuf, ths, pEntry) < 0) { + sError("vgId:%d, failed to enqueue log buffer. 
index:%" PRId64 "", ths->vgId, pEntry->index); + return -1; + } + + // proceed match index, with replicating on needed + SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + + // multi replica + if (ths->replicaNum > 1) { + return 0; + } + + // single replica + (void)syncNodeUpdateCommitIndex(ths, matchIndex); + + if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex); + return -1; + } + + return 0; +} + static int32_t syncNodeAppendNoop(SSyncNode* ths) { + SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf); + SyncTerm term = ths->pRaftStore->currentTerm; + + SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId); + if (pEntry == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + return syncNodeAppend(ths, pEntry); +} + +static int32_t syncNodeAppendNoopOld(SSyncNode* ths) { int32_t ret = 0; SyncIndex index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore); @@ -2963,7 +3160,7 @@ static int32_t syncNodeAppendNoop(SSyncNode* ths) { if (h) { taosLRUCacheRelease(ths->pLogStore->pCache, h, false); } else { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } return ret; @@ -3055,6 +3252,114 @@ int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, SyncHeartbeatReply* pMsg) { return 0; } +int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf) { + taosThreadMutexLock(&pBuf->mutex); + int64_t index = pBuf->endIndex; + taosThreadMutexUnlock(&pBuf->mutex); + return index; +} + +int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + SyncIndex index = pEntry->index; + + if (index - pBuf->startIndex > pBuf->size) { + sError("vgId:%d, failed to append due to log buffer full. 
index:%" PRId64 "", pNode->vgId, index); + goto _out; + } + + ASSERT(index == pBuf->endIndex); + + SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; + ASSERT(pExist == NULL); + + // initial log buffer with at least one item, e.g. commitIndex + SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; + ASSERT(pMatch != NULL && "no matched raft log entry"); + ASSERT(pMatch->index + 1 == index); + + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; + pBuf->entries[index % pBuf->size] = tmp; + pBuf->endIndex = index + 1; + + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return 0; + +_out: + syncLogBufferValidate(pBuf); + syncEntryDestroy(pEntry); + taosThreadMutexUnlock(&pBuf->mutex); + return -1; +} + +SyncTerm syncLogBufferGetTerm(SSyncLogBuffer* pBuf, SyncIndex index) { + ASSERT(pBuf->startIndex <= index && index < pBuf->endIndex); + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + return pEntry->term; +} + +SyncAppendEntries* syncLogToAppendEntries(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index) { + SyncAppendEntries* pMsg = NULL; + + if (index < pBuf->startIndex || index >= pBuf->endIndex) { + sError("vgId:%d, log entry (%" PRId64 ") out of range of log buffer [%" PRId64 ", %" PRId64 ").", pNode->vgId, + index, pBuf->startIndex, pBuf->endIndex); + return pMsg; + } + + SSyncRaftEntry* pEntry = pBuf->entries[index % pBuf->size].pItem; + if (pEntry == NULL) { + sError("vgId:%d, log entry (%" PRId64 ") not exist in log buffer [%" PRId64 ", %" PRId64 ").", pNode->vgId, index, + pBuf->startIndex, pBuf->endIndex); + return pMsg; + } + + uint32_t datalen = pEntry->bytes; + pMsg = syncAppendEntriesBuild(datalen, pNode->vgId); + if (pMsg == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + (void)memcpy(pMsg->data, pEntry, datalen); + + pMsg->prevLogIndex = index - 1; + 
pMsg->prevLogTerm = syncLogBufferGetTerm(pBuf, pMsg->prevLogIndex); + pMsg->vgId = pNode->vgId; + pMsg->srcId = pNode->myRaftId; + pMsg->term = pNode->pRaftStore->currentTerm; + pMsg->commitIndex = pNode->commitIndex; + pMsg->privateTerm = 0; + return pMsg; +} + +void syncLogReplicateAppendEntries(SSyncNode* pNode, SyncAppendEntries* pMsg) { + for (int i = 0; i < pNode->replicaNum; i++) { + SRaftId* pDestId = &pNode->peersId[i]; + if (!syncUtilSameId(pDestId, &pNode->myRaftId)) { + (void)syncNodeSendAppendEntries(pNode, pDestId, pMsg); + } + } +} + +int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index) { + SyncAppendEntries* pMsgOut = syncLogToAppendEntries(pNode->pLogBuf, pNode, index); + if (pMsgOut == NULL) { + sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, index); + goto _err; + } + + // replicate pMsgOut + (void)syncLogReplicateAppendEntries(pNode, pMsgOut); + +_err: + syncAppendEntriesDestroy(pMsgOut); + return 0; +} + // TLA+ Spec // ClientRequest(i, v) == // /\ state[i] = Leader @@ -3069,6 +3374,31 @@ int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, SyncHeartbeatReply* pMsg) { int32_t syncNodeOnClientRequest(SSyncNode* ths, SyncClientRequest* pMsg, SyncIndex* pRetIndex) { syncNodeEventLog(ths, "on client request"); + int32_t code = 0; + + SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf); + SyncTerm term = ths->pRaftStore->currentTerm; + SSyncRaftEntry* pEntry = NULL; + pEntry = syncEntryBuild2((SyncClientRequest*)pMsg, term, index); + if (pEntry == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + if (ths->state == TAOS_SYNC_STATE_LEADER) { + if (pRetIndex) { + (*pRetIndex) = index; + } + + return syncNodeAppend(ths, pEntry); + } + + return 0; +} + +int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SyncClientRequest* pMsg, SyncIndex* pRetIndex) { + syncNodeEventLog(ths, "on client request"); + int32_t ret = 0; int32_t code = 0; @@ -3085,11 +3415,11 @@ int32_t 
syncNodeOnClientRequest(SSyncNode* ths, SyncClientRequest* pMsg, SyncInd code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry); if (code != 0) { // del resp mgr, call FpCommitCb - ASSERT(0); + sError("vgId:%d, failed to append log entry since %s", ths->vgId, terrstr()); return -1; } - // if mulit replica, start replicate right now + // if multi replica, start replicate right now if (ths->replicaNum > 1) { syncNodeReplicate(ths); } @@ -3111,7 +3441,7 @@ int32_t syncNodeOnClientRequest(SSyncNode* ths, SyncClientRequest* pMsg, SyncInd if (h) { taosLRUCacheRelease(ths->pLogStore->pCache, h, false); } else { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } return ret; @@ -3305,6 +3635,7 @@ static int32_t syncNodeProposeConfigChangeFinish(SSyncNode* ths, SyncReconfigFin } bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) { + return false; return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1); } @@ -3432,7 +3763,7 @@ int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endInde if (h) { taosLRUCacheRelease(pCache, h, false); } else { - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } } } @@ -3703,4 +4034,4 @@ void syncLogRecvHeartbeatReply(SSyncNode* pSyncNode, const SyncHeartbeatReply* p snprintf(logBuf, sizeof(logBuf), "recv sync-heartbeat-reply from %s:%d {term:%" PRIu64 ", pterm:%" PRIu64 "}, %s", host, port, pMsg->term, pMsg->privateTerm, s); syncNodeEventLog(pSyncNode, logBuf); -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncRaftEntry.c b/source/libs/sync/src/syncRaftEntry.c index 940aaca055..38b8574afa 100644 --- a/source/libs/sync/src/syncRaftEntry.c +++ b/source/libs/sync/src/syncRaftEntry.c @@ -96,7 +96,7 @@ SSyncRaftEntry* syncEntryBuildNoop(SyncTerm term, SyncIndex index, int32_t vgId) return pEntry; } -void syncEntryDestory(SSyncRaftEntry* pEntry) { +void syncEntryDestroy(SSyncRaftEntry* pEntry) { if (pEntry != NULL) { taosMemoryFree(pEntry); 
} @@ -454,7 +454,7 @@ static int cmpFn(const void* p1, const void* p2) { return memcmp(p1, p2, sizeof( static void freeRaftEntry(void* param) { SSyncRaftEntry* pEntry = (SSyncRaftEntry*)param; - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } SRaftEntryCache* raftEntryCacheCreate(SSyncNode* pSyncNode, int32_t maxCount) { @@ -588,7 +588,7 @@ int32_t raftEntryCacheClear(struct SRaftEntryCache* pCache, int32_t count) { SSkipListNode* pNode = tSkipListIterGet(pIter); ASSERT(pNode != NULL); SSyncRaftEntry* pEntry = (SSyncRaftEntry*)SL_GET_NODE_DATA(pNode); - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); ++returnCnt; } tSkipListDestroyIter(pIter); @@ -617,7 +617,7 @@ int32_t raftEntryCacheClear(struct SRaftEntryCache* pCache, int32_t count) { ++returnCnt; SSyncRaftEntry* pEntry = (SSyncRaftEntry*)SL_GET_NODE_DATA(pNode); - // syncEntryDestory(pEntry); + // syncEntryDestroy(pEntry); taosRemoveRef(pCache->refMgr, pEntry->rid); } tSkipListDestroyIter(pIter); diff --git a/source/libs/sync/src/syncRaftLog.c b/source/libs/sync/src/syncRaftLog.c index 23d076cfbc..3db970ba00 100644 --- a/source/libs/sync/src/syncRaftLog.c +++ b/source/libs/sync/src/syncRaftLog.c @@ -152,7 +152,6 @@ static int32_t raftLogEntryCount(struct SSyncLogStore* pLogStore) { } static SyncIndex raftLogLastIndex(struct SSyncLogStore* pLogStore) { - SyncIndex lastIndex; SSyncLogStoreData* pData = pLogStore->data; SWal* pWal = pData->pWal; SyncIndex lastVer = walGetLastVer(pWal); @@ -207,7 +206,7 @@ static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntr syncMeta.isWeek = pEntry->isWeak; syncMeta.seqNum = pEntry->seqNum; syncMeta.term = pEntry->term; - index = walAppendLog(pWal, pEntry->originalRpcType, syncMeta, pEntry->data, pEntry->dataLen); + index = walAppendLog(pWal, pEntry->index, pEntry->originalRpcType, syncMeta, pEntry->data, pEntry->dataLen); if (index < 0) { int32_t err = terrno; const char* errStr = tstrerror(err); @@ -218,11 +217,10 @@ static int32_t 
raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntr snprintf(logBuf, sizeof(logBuf), "wal write error, index:%" PRId64 ", err:%d %X, msg:%s, syserr:%d, sysmsg:%s", pEntry->index, err, err, errStr, sysErr, sysErrStr); syncNodeErrorLog(pData->pSyncNode, logBuf); - - ASSERT(0); return -1; } - pEntry->index = index; + + ASSERT(pEntry->index == index); do { char eventLog[128]; @@ -326,8 +324,7 @@ static int32_t raftLogTruncate(struct SSyncLogStore* pLogStore, SyncIndex fromIn const char* sysErrStr = strerror(errno); sError("vgId:%d, wal truncate error, from-index:%" PRId64 ", err:%d %X, msg:%s, syserr:%d, sysmsg:%s", pData->pSyncNode->vgId, fromIndex, err, err, errStr, sysErr, sysErrStr); - - ASSERT(0); + return -1; } // event log @@ -365,7 +362,6 @@ static int32_t raftLogGetLastEntry(SSyncLogStore* pLogStore, SSyncRaftEntry** pp int32_t raftLogUpdateCommitIndex(SSyncLogStore* pLogStore, SyncIndex index) { SSyncLogStoreData* pData = pLogStore->data; SWal* pWal = pData->pWal; - // ASSERT(walCommit(pWal, index) == 0); int32_t code = walCommit(pWal, index); if (code != 0) { int32_t err = terrno; @@ -374,8 +370,7 @@ int32_t raftLogUpdateCommitIndex(SSyncLogStore* pLogStore, SyncIndex index) { const char* sysErrStr = strerror(errno); sError("vgId:%d, wal update commit index error, index:%" PRId64 ", err:%d %X, msg:%s, syserr:%d, sysmsg:%s", pData->pSyncNode->vgId, index, err, err, errStr, sysErr, sysErrStr); - - ASSERT(0); + return -1; } return 0; } @@ -427,7 +422,7 @@ cJSON* logStore2Json(SSyncLogStore* pLogStore) { raftLogGetEntry(pLogStore, i, &pEntry); cJSON_AddItemToArray(pEntries, syncEntry2Json(pEntry)); - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } } } diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index e040310e15..181b9f2b74 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -55,7 +55,11 @@ int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, 
SRaftId* pDestId) { // maybe start snapshot SyncIndex logStartIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore); SyncIndex logEndIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore); - if (nextIndex < logStartIndex || nextIndex - 1 > logEndIndex) { + if (nextIndex > logEndIndex) { + return 0; + } + + if (nextIndex < logStartIndex) { char logBuf[128]; snprintf(logBuf, sizeof(logBuf), "start snapshot for next-index:%" PRId64 ", start:%" PRId64 ", end:%" PRId64, nextIndex, logStartIndex, logEndIndex); @@ -90,7 +94,7 @@ int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId) { memcpy(pMsg->data, serialized, len); taosMemoryFree(serialized); - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } else { if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { @@ -154,8 +158,10 @@ int32_t syncNodeReplicate(SSyncNode* pSyncNode) { return 0; } -int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncAppendEntries* pMsg) { +int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) { int32_t ret = 0; + pMsg->destId = *destRaftId; + syncLogSendAppendEntries(pSyncNode, pMsg, ""); SRpcMsg rpcMsg; @@ -163,7 +169,10 @@ int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftI syncNodeSendMsgById(destRaftId, pSyncNode, &rpcMsg); SPeerState* pState = syncNodeGetPeerState(pSyncNode, destRaftId); - ASSERT(pState != NULL); + if (pState == NULL) { + sError("vgId:%d, failed to get peer state for addr:0x%016" PRIx64 "", pSyncNode->vgId, destRaftId->addr); + return -1; + } if (pMsg->dataLen > 0) { pState->lastSendIndex = pMsg->prevLogIndex + 1; @@ -173,7 +182,7 @@ int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftI return ret; } -int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncAppendEntries* pMsg) { +int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* 
destRaftId, SyncAppendEntries* pMsg) { int32_t ret = 0; if (syncNodeNeedSendAppendEntries(pSyncNode, destRaftId, pMsg)) { ret = syncNodeSendAppendEntries(pSyncNode, destRaftId, pMsg); @@ -231,4 +240,4 @@ int32_t syncNodeHeartbeatPeers(SSyncNode* pSyncNode) { } return 0; -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index a7bafa9f28..43c9ec2980 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -139,7 +139,7 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender, SSnapshotParam snapsho getLastConfig = true; rpcFreeCont(rpcMsg.pCont); - syncEntryDestory(pEntry); + syncEntryDestroy(pEntry); } else { if (pSender->snapshot.lastConfigIndex == pSender->pSyncNode->pRaftCfg->lastConfigIndex) { sTrace("vgId:%d, sync sender get cfg from local", pSender->pSyncNode->vgId); diff --git a/source/libs/tdb/src/db/tdbBtree.c b/source/libs/tdb/src/db/tdbBtree.c index 3f36a058e5..6dda38653d 100644 --- a/source/libs/tdb/src/db/tdbBtree.c +++ b/source/libs/tdb/src/db/tdbBtree.c @@ -194,7 +194,8 @@ int tdbBtreeInsert(SBTree *pBt, const void *pKey, int kLen, const void *pVal, in btc.idx++; } else if (c == 0) { // dup key not allowed - ASSERT(0); + tdbError("unable to insert dup key. 
pKey: %p, kLen: %d, btc: %p, pTxn: %p", pKey, kLen, &btc, pTxn); + // ASSERT(0); return -1; } } diff --git a/source/libs/wal/src/walWrite.c b/source/libs/wal/src/walWrite.c index 527ffa0056..7ddebc5424 100644 --- a/source/libs/wal/src/walWrite.c +++ b/source/libs/wal/src/walWrite.c @@ -519,10 +519,15 @@ END: return -1; } -int64_t walAppendLog(SWal *pWal, tmsg_t msgType, SWalSyncInfo syncMeta, const void *body, int32_t bodyLen) { +int64_t walAppendLog(SWal *pWal, int64_t index, tmsg_t msgType, SWalSyncInfo syncMeta, const void *body, + int32_t bodyLen) { taosThreadMutexLock(&pWal->mutex); - int64_t index = pWal->vers.lastVer + 1; + if (index != pWal->vers.lastVer + 1) { + terrno = TSDB_CODE_WAL_INVALID_VER; + taosThreadMutexUnlock(&pWal->mutex); + return -1; + } if (walCheckAndRoll(pWal) < 0) { taosThreadMutexUnlock(&pWal->mutex); From def4058eb113a2bf601aed2f12adabdc2b953e93 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 11 Nov 2022 22:55:21 +0800 Subject: [PATCH 02/42] feat: impl pipelining negotiation --- include/libs/sync/sync.h | 2 + include/os/osTime.h | 7 + include/util/tdef.h | 2 +- source/libs/sync/inc/syncInt.h | 67 +++- source/libs/sync/src/syncAppendEntries.c | 118 +++--- source/libs/sync/src/syncAppendEntriesReply.c | 123 +++--- source/libs/sync/src/syncIndexMgr.c | 9 + source/libs/sync/src/syncMain.c | 368 ++++++++++++++++-- source/libs/sync/src/syncReplication.c | 27 +- source/libs/transport/src/tmsgcb.c | 3 + 10 files changed, 593 insertions(+), 133 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 7ed8414906..a477cea93c 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -40,6 +40,8 @@ extern bool gRaftDetailLog; #define SYNC_MNODE_LOG_RETENTION 10000 #define SYNC_VNODE_LOG_RETENTION 500 +#define SYNC_MAX_RETRY_BACKOFF 5 +#define SYNC_LOG_REPL_RETRY_WAIT_MS 50 #define SYNC_APPEND_ENTRIES_TIMEOUT_MS 10000 #define SYNC_MAX_BATCH_SIZE 1 diff --git a/include/os/osTime.h b/include/os/osTime.h 
index 48f046d4d0..88eabd206d 100644 --- a/include/os/osTime.h +++ b/include/os/osTime.h @@ -82,6 +82,13 @@ static FORCE_INLINE int64_t taosGetTimestampNs() { return (int64_t)systemTime.tv_sec * 1000000000LL + (int64_t)systemTime.tv_nsec; } +//@return timestamp of monotonic clock in millisecond +static FORCE_INLINE int64_t taosGetMonoTimestampMs() { + struct timespec systemTime = {0}; + taosClockGetTime(CLOCK_MONOTONIC, &systemTime); + return (int64_t)systemTime.tv_sec * 1000LL + (int64_t)systemTime.tv_nsec / 1000000; +} + char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm); struct tm *taosLocalTime(const time_t *timep, struct tm *result); struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst); diff --git a/include/util/tdef.h b/include/util/tdef.h index 0b7f9b28fa..c5776e8d87 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -281,7 +281,7 @@ typedef enum ELogicConditionType { #define TSDB_DNODE_ROLE_VNODE 2 #define TSDB_MAX_REPLICA 5 -#define TSDB_SYNC_LOG_BUFFER_SIZE 500 +#define TSDB_SYNC_LOG_BUFFER_SIZE 512 #define TSDB_TBNAME_COLUMN_INDEX (-1) #define TSDB_MULTI_TABLEMETA_MAX_NUM 100000 // maximum batch size allowed to load table meta diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index bd97187ce7..58b32ed025 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -88,6 +88,60 @@ typedef struct SPeerState { int64_t lastSendTime; } SPeerState; +typedef struct SSyncReplInfo { + bool barrier; + bool acked; + int64_t timeMs; + int64_t term; +} SSyncReplInfo; + +typedef struct SSyncLogReplMgr { + SSyncReplInfo states[TSDB_SYNC_LOG_BUFFER_SIZE]; + int64_t startIndex; + int64_t matchIndex; + int64_t endIndex; + int64_t size; + bool restored; + int64_t peerStartTime; + int32_t retryBackoff; + int32_t peerId; +} SSyncLogReplMgr; + +SSyncLogReplMgr* syncLogReplMgrCreate(); +void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr); + +// access +static FORCE_INLINE 
int64_t syncLogGetRetryBackoffTimeMs(SSyncLogReplMgr* pMgr) { + return (1 << pMgr->retryBackoff) * SYNC_LOG_REPL_RETRY_WAIT_MS; +} + +static FORCE_INLINE int32_t syncLogGetNextRetryBackoff(SSyncLogReplMgr* pMgr) { + return TMIN(pMgr->retryBackoff + 1, SYNC_MAX_RETRY_BACKOFF); +} + +static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, SyncIndex index, SyncTerm term) { + if (index < pMgr->startIndex || index >= pMgr->endIndex) { + return -1; + } + pMgr->states[(index + pMgr->size) % pMgr->size].term = term; + return 0; +} + +SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SRaftId* pDestId, + bool* pBarrier); +int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr); +int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); + +// others +bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr); + typedef struct SSyncLogBufEntry { SSyncRaftEntry* pItem; SyncIndex prevLogIndex; @@ -115,14 +169,15 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt int64_t syncLogBufferLoad(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex); int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t 
commitIndex); +SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); -SyncAppendEntries* syncLogToAppendEntries(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index); +SyncAppendEntries* syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); // private int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); -int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index); +int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); @@ -225,11 +280,14 @@ typedef struct SSyncNode { SSyncRespMgr* pSyncRespMgr; // restore state - _Atomic bool restoreFinish; + bool restoreFinish; // SSnapshot* pSnapshot; SSyncSnapshotSender* senders[TSDB_MAX_REPLICA]; SSyncSnapshotReceiver* pNewNodeReceiver; + // log replication mgr + SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA]; + SPeerState peerStates[TSDB_MAX_REPLICA]; // is config changing @@ -309,6 +367,9 @@ void syncNodeCandidate2Follower(SSyncNode* pSyncNode); void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId); void syncNodeVoteForSelf(SSyncNode* pSyncNode); +// log replication +SSyncLogReplMgr* syncNodeGetLogReplMgr(SSyncNode* pNode, SRaftId* pDestId); + // snapshot -------------- bool syncNodeHasSnapshot(SSyncNode* pSyncNode); void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode); diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 9560ea269b..285981012e 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -18,6 +18,7 @@ 
#include "syncRaftCfg.h" #include "syncRaftLog.h" #include "syncRaftStore.h" +#include "syncReplication.h" #include "syncSnapshot.h" #include "syncUtil.h" #include "syncVoteMgr.h" @@ -318,16 +319,17 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { ASSERT(pNode->pFsm != NULL && "pFsm not registered"); ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); - SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); SSnapshot snapshot; if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); goto _err; } + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); SyncIndex commitIndex = snapshot.lastApplyIndex; SyncTerm commitTerm = snapshot.lastApplyTerm; SyncIndex toIndex = TMAX(lastVer, commitIndex); + ASSERT(lastVer >= commitIndex); // update match index pBuf->commitIndex = commitIndex; @@ -392,7 +394,7 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { } // update startIndex - pBuf->startIndex = index; + pBuf->startIndex = takeDummy ? 
index : index + 1; // validate syncLogBufferValidate(pBuf); @@ -491,18 +493,44 @@ int32_t syncLogBufferRollbackMatchIndex(SSyncLogBuffer* pBuf, SSyncNode* pNode, return 0; } +FORCE_INLINE SyncTerm syncLogBufferGetLastMatchTerm(SSyncLogBuffer* pBuf) { + SyncIndex index = pBuf->matchIndex; + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + return pEntry->term; +} + int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm) { taosThreadMutexLock(&pBuf->mutex); syncLogBufferValidate(pBuf); - - int32_t ret = 0; + int32_t ret = -1; SyncIndex index = pEntry->index; SyncIndex prevIndex = pEntry->index - 1; - if (index <= pBuf->commitIndex || index - pBuf->startIndex > pBuf->size) { - sInfo("vgId:%d, cannot accept index:%" PRId64 " into log buffer. start index: %" PRId64 ", commit index: %" PRId64 - ", end index:%" PRId64 ")", - pNode->vgId, index, pBuf->startIndex, pBuf->commitIndex, pBuf->endIndex); - ret = (index <= pBuf->commitIndex) ? 0 : -1; + SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); + + if (index <= pBuf->commitIndex) { + sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); + ret = 0; + goto _out; + } + + if (index - pBuf->startIndex >= pBuf->size) { + sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); + goto _out; + } + + if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { + sInfo("vgId:%d, not ready to accept raft entry (i.e. across barrier). 
index: %" PRId64 ", term: %" PRId64 + ": prevterm: %" PRId64 " /= lastmatch: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 + ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, + pBuf->matchIndex, pBuf->endIndex); goto _out; } @@ -511,14 +539,16 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt if (pExist != NULL) { ASSERT(pEntry->index == pExist->index); - if (pEntry->term > pExist->term) { + if (pEntry->term != pExist->term) { (void)syncLogBufferRollback(pBuf, index); } else { - sInfo("vgId:%d, %s raft entry received. index:%" PRId64 ", term: %" PRId64 "", pNode->vgId, - ((pEntry->term < pExist->term) ? "stale" : "duplicate"), pEntry->index, pEntry->term); + sInfo("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); SyncTerm existPrevTerm = pBuf->entries[index % pBuf->size].prevLogTerm; - ASSERT(pEntry->term < pExist->term || (pEntry->term == pExist->term && prevTerm == existPrevTerm)); - ret = (pEntry->term < pExist->term) ? 
0 : -1; + ASSERT(pEntry->term == pExist->term && prevTerm == existPrevTerm); + ret = 0; goto _out; } } @@ -531,6 +561,9 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt // update end index pBuf->endIndex = TMAX(index + 1, pBuf->endIndex); + // success + ret = 0; + _out: syncEntryDestroy(pEntry); syncLogBufferValidate(pBuf); @@ -550,6 +583,7 @@ SSyncRaftEntry* syncLogAppendEntriesToRaftEntry(const SyncAppendEntries* pMsg) { } int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { + ASSERT(pEntry->index >= 0); SyncIndex lastVer = pLogStore->syncLogLastIndex(pLogStore); if (lastVer >= pEntry->index && pLogStore->syncLogTruncate(pLogStore, pEntry->index) < 0) { sError("failed to truncate log store since %s. from index:%" PRId64 "", terrstr(), pEntry->index); @@ -563,6 +597,9 @@ int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { pEntry->term); return -1; } + + lastVer = pLogStore->syncLogLastIndex(pLogStore); + ASSERT(pEntry->index == lastVer); return 0; } @@ -607,10 +644,14 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { goto _out; } + // increase match index + pBuf->matchIndex = index; + + sInfo("vgId:%d, log buffer proceed. 
start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); + // replicate on demand - if (pNode->state == TAOS_SYNC_STATE_LEADER && pNode->replicaNum > 1) { - (void)syncLogBufferReplicate(pBuf, pNode, index); - } + (void)syncNodeReplicate(pNode); // persist if (syncLogStorePersist(pLogStore, pEntry) < 0) { @@ -618,16 +659,15 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { terrstr(), pEntry->index); goto _out; } - - // increment - pBuf->matchIndex = index; - matchIndex = pBuf->matchIndex; + ASSERT(pEntry->index == pBuf->matchIndex); // update my match index + matchIndex = pBuf->matchIndex; syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); } // end of while _out: + pBuf->matchIndex = matchIndex; syncLogBufferValidate(pBuf); taosThreadMutexUnlock(&pBuf->mutex); return matchIndex; @@ -659,7 +699,7 @@ int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf) { ASSERT(pBuf->commitIndex <= pBuf->matchIndex); ASSERT(pBuf->matchIndex < pBuf->endIndex); ASSERT(pBuf->endIndex - pBuf->startIndex <= pBuf->size); - for (SyncIndex index = pBuf->commitIndex; index <= pBuf->matchIndex; index++) { + for (SyncIndex index = pBuf->startIndex; index <= pBuf->matchIndex; index++) { SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; ASSERT(pEntry != NULL); } @@ -694,20 +734,11 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm // execute in fsm for (int64_t index = pBuf->commitIndex + 1; index <= upperIndex; index++) { // get a log entry - if (index >= pBuf->startIndex) { - inBuf = true; - pEntry = pBuf->entries[index % pBuf->size].pItem; - } else { - inBuf = false; - if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { - sError("vgId:%d, failed to get log entry since %s. 
index:%" PRId64 "", pNode->vgId, terrstr(), index); - ret = -1; - goto _out; - } + pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); + if (pEntry == NULL) { + goto _out; } - ASSERT(pEntry != NULL); - // execute it if (!syncUtilUserCommit(pEntry->originalRpcType)) { sInfo("vgId:%d, non-user msg in raft log entry. index: %" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, @@ -738,8 +769,8 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm } // recycle - // TODO: with a grace period of one third of free space before commitIndex in ring buffer - SyncIndex until = pBuf->commitIndex; + SyncIndex used = pBuf->endIndex - pBuf->startIndex; + SyncIndex until = pBuf->commitIndex - (pBuf->size - used) / 2; for (SyncIndex index = pBuf->startIndex; index < until; index++) { SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; ASSERT(pEntry != NULL); @@ -796,9 +827,6 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { syncNodeStepDown(ths, pMsg->term); syncNodeResetElectTimer(ths); - // update commit index - (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); - if (pMsg->dataLen < (int32_t)sizeof(SSyncRaftEntry)) { sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d", ths->vgId, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->dataLen); @@ -825,15 +853,17 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { // accept if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { - sWarn("vgId:%d, failed to accept raft entry into log buffer. 
index:%" PRId64 ", term:%" PRId64, ths->vgId, - pEntry->index, pEntry->term); goto _SEND_RESPONSE; } - pReply->success = true; _SEND_RESPONSE: - // update match index pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + bool matched = (pReply->matchIndex >= pReply->lastSendIndex); + pReply->success = matched; + if (matched) { + // update commit index only after matching + (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); + } // ack, i.e. send response SRpcMsg rpcMsg; @@ -841,7 +871,7 @@ _SEND_RESPONSE: (void)syncNodeSendMsgById(&pReply->destId, ths, &rpcMsg); // commit index, i.e. leader notice me - if (syncLogBufferCommit(ths->pLogBuf, ths, pMsg->commitIndex) < 0) { + if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { sError("vgId:%d, failed to commit raft fsm log since %s.", ths->vgId, terrstr()); goto _out; } diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index e37c40455c..86d8ec11b9 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -85,11 +85,11 @@ static void syncNodeStartSnapshotOnce(SSyncNode* ths, SyncIndex beginIndex, Sync } int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex) { - ths->commitIndex = TMAX(commitIndex, ths->commitIndex); SyncIndex lastVer = ths->pLogStore->syncLogLastIndex(ths->pLogStore); - commitIndex = TMIN(ths->commitIndex, lastVer); - ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, commitIndex); - return commitIndex; + commitIndex = TMAX(commitIndex, ths->commitIndex); + ths->commitIndex = TMIN(commitIndex, lastVer); + ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex); + return ths->commitIndex; } int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { @@ -102,50 +102,77 @@ int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { return ths->commitIndex; } -int32_t 
syncLogBufferCatchingUpReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex fromIndex, SRaftId destId) { - taosThreadMutexLock(&pBuf->mutex); +SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf) { + SSyncRaftEntry* pEntry = NULL; + if (index >= pBuf->endIndex) { + return NULL; + } + if (index > pBuf->startIndex) { // startIndex might be dummy + *pInBuf = true; + pEntry = pBuf->entries[index % pBuf->size].pItem; + } else { + *pInBuf = false; + if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + } + } + return pEntry; +} + +bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr) { + ASSERT(pMgr->startIndex <= pMgr->endIndex); + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + ASSERT(pMgr->states[(index + pMgr->size) % pMgr->size].barrier == false || index + 1 == pMgr->endIndex); + } + return true; +} + +static FORCE_INLINE bool syncLogIsReplicationBarrier(SSyncRaftEntry* pEntry) { + return pEntry->originalRpcType == TDMT_SYNC_NOOP; +} + +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SRaftId* pDestId, + bool* pBarrier) { + SSyncRaftEntry* pEntry = NULL; SyncAppendEntries* pMsgOut = NULL; - SyncIndex index = fromIndex; + bool inBuf = false; + int32_t ret = -1; + SyncTerm prevLogTerm = -1; + SSyncLogBuffer* pBuf = pNode->pLogBuf; - if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum <= 1) { + sInfo("vgId:%d, replicate one msg index: %" PRId64 " to dest: 0x%016" PRIx64, pNode->vgId, index, pDestId->addr); + + pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); + if (pEntry == NULL) { + sError("vgId:%d, failed to get raft entry for index: %" PRId64 "", pNode->vgId, index); + goto _out; + } + *pBarrier = syncLogIsReplicationBarrier(pEntry); + + prevLogTerm = 
syncLogReplMgrGetPrevLogTerm(pMgr, pNode, index); + if (prevLogTerm < 0 && terrno != TSDB_CODE_SUCCESS) { + sError("vgId:%d, failed to get prev log term since %s. index: %" PRId64 "", pNode->vgId, terrstr(), index); + goto _out; + } + (void)syncLogReplMgrUpdateTerm(pMgr, pEntry->index, pEntry->term); + + pMsgOut = syncLogToAppendEntries(pNode, pEntry, prevLogTerm); + if (pMsgOut == NULL) { + sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, index); goto _out; } - if (index < pBuf->startIndex) { - sError("vgId:%d, (not implemented yet) replication fromIndex: %" PRId64 - " that is less than pBuf->startIndex: %" PRId64 ". destId: 0x%016" PRId64 "", - pNode->vgId, fromIndex, pBuf->startIndex, destId.addr); - goto _out; - } - - if (index > pBuf->matchIndex) { - goto _out; - } - - do { - pMsgOut = syncLogToAppendEntries(pBuf, pNode, index); - if (pMsgOut == NULL) { - sError("vgId:%d, failed to assembly append entries msg since %s. index: %" PRId64 "", pNode->vgId, terrstr(), - index); - goto _out; - } - - if (syncNodeSendAppendEntries(pNode, &destId, pMsgOut) < 0) { - sWarn("vgId:%d, failed to send append entries msg since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", - pNode->vgId, terrstr(), index, destId.addr); - goto _out; - } - - index += 1; - syncAppendEntriesDestroy(pMsgOut); - pMsgOut = NULL; - } while (false && index <= pBuf->commitIndex); + (void)syncNodeSendAppendEntries(pNode, pDestId, pMsgOut); + ret = 0; _out: syncAppendEntriesDestroy(pMsgOut); pMsgOut = NULL; - taosThreadMutexUnlock(&pBuf->mutex); - return 0; + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + return ret; } int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMsg) { @@ -185,23 +212,15 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMs SyncIndex indexLikely = TMIN(pMsg->matchIndex, ths->pLogBuf->matchIndex); SyncIndex commitIndex = syncNodeCheckCommitIndex(ths, indexLikely); (void)syncLogBufferCommit(ths->pLogBuf, ths, commitIndex); - } else { - SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId)); - if (nextIndex > SYNC_INDEX_BEGIN) { - --nextIndex; - } - syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex); } - // send next append entries - SPeerState* pState = syncNodeGetPeerState(ths, &(pMsg->srcId)); - ASSERT(pState != NULL); - - if (pMsg->lastSendIndex == pState->lastSendIndex) { - syncNodeReplicateOne(ths, &(pMsg->srcId)); + // replicate log + SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); + ASSERT(pMgr != NULL); + if (pMgr != NULL) { + (void)syncLogReplMgrProcessReply(pMgr, ths, pMsg); } } - return 0; } diff --git a/source/libs/sync/src/syncIndexMgr.c b/source/libs/sync/src/syncIndexMgr.c index 8e78aeedc3..09137c31c7 100644 --- a/source/libs/sync/src/syncIndexMgr.c +++ b/source/libs/sync/src/syncIndexMgr.c @@ -82,6 +82,15 @@ void syncIndexMgrSetIndex(SSyncIndexMgr *pSyncIndexMgr, const SRaftId *pRaftId, index); } +SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pDestId) { + for (int i = 0; i < pNode->replicaNum; i++) { + if 
(syncUtilSameId(&(pNode->replicasId[i]), pDestId)) { + return pNode->logReplMgrs[i]; + } + } + return NULL; +} + SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pSyncIndexMgr, const SRaftId *pRaftId) { if (pSyncIndexMgr == NULL) { return SYNC_INDEX_INVALID; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index c3267bafdc..f9f6760e8c 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1103,6 +1103,252 @@ int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) { return ret; } +int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr) { + ASSERT(pMgr->startIndex >= 0); + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); + } + pMgr->startIndex = 0; + pMgr->matchIndex = 0; + pMgr->endIndex = 0; + pMgr->restored = false; + pMgr->retryBackoff = 0; + return 0; +} + +int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + if (pMgr->endIndex <= pMgr->startIndex) { + return 0; + } + + int32_t ret = -1; + bool retried = false; + int64_t retryWaitMs = syncLogGetRetryBackoffTimeMs(pMgr); + + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + int64_t pos = index % pMgr->size; + ASSERT(!pMgr->states[pos].barrier || (index == pMgr->startIndex || index + 1 == pMgr->endIndex)); + if (pMgr->states[pos].acked) { + continue; + } + int64_t nowMs = taosGetMonoTimestampMs(); + if (nowMs < pMgr->states[pos].timeMs + retryWaitMs) { + break; + } + + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + goto _out; + } + ASSERT(barrier == pMgr->states[pos].barrier); + pMgr->states[pos].timeMs = nowMs; + pMgr->states[pos].acked = false; + retried = true; + } + + ret = 0; +_out: + if (retried) { + pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); + } + return ret; +} + +int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, + SyncAppendEntriesReply* pMsg) { + SRaftId destId = pMsg->srcId; + ASSERT(pMgr->restored == false); + + if (pMgr->endIndex == 0) { + ASSERT(pMgr->startIndex == 0); + ASSERT(pMgr->matchIndex == 0); + if (pMsg->matchIndex < 0) { + pMgr->restored = true; + return 0; + } + } else { + if (pMsg->lastSendIndex < pMgr->startIndex || pMsg->lastSendIndex >= pMgr->endIndex) { + syncLogReplMgrRetryOnNeed(pMgr, pNode); + return 0; + } + + pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; + + if (pMsg->matchIndex == pMsg->lastSendIndex) { + pMgr->restored = true; + return 0; + } + + (void)syncLogResetLogReplMgr(pMgr); + } + + SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); + bool barrier = false; + ASSERT(index >= 0); + // send match index + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &destId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, destId.addr); + return -1; + } + + int64_t nowMs = taosGetMonoTimestampMs(); + pMgr->states[index % pMgr->size].barrier = barrier; + pMgr->states[index % pMgr->size].timeMs = nowMs; + pMgr->states[index % pMgr->size].acked = false; + + pMgr->matchIndex = index; + pMgr->startIndex = index; + pMgr->endIndex = index + 1; + return 0; +} + +int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + taosThreadMutexLock(&pBuf->mutex); + if (pMsg->startTime != pMgr->peerStartTime) { + syncLogResetLogReplMgr(pMgr); + pMgr->peerStartTime = pMsg->startTime; + } + + if (pMgr->restored) { + (void)syncLogReplMgrProcessReplyInNormalMode(pMgr, pNode, pMsg); + } else { + (void)syncLogReplMgrProcessReplyInRecoveryMode(pMgr, pNode, pMsg); + } + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + +int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + if (pMgr->restored) { + (void)syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); + } else { + (void)syncLogReplMgrReplicateProbeOnce(pMgr, pNode); + } + return 0; +} + +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + ASSERT(!pMgr->restored); + SyncIndex index = pNode->pLogBuf->matchIndex; + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + return -1; + } + + SSyncLogBuffer* pBuf = pNode->pLogBuf; + sInfo("vgId:%d, attempted to probe the %d'th peer. 
pMgr(restored:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, + pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + return 0; +} + +int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + ASSERT(pMgr->restored); + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + int32_t batchSize = TMAX(1, pMgr->size / 10); + int32_t count = 0; + + for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { + if (batchSize < count++ || pMgr->startIndex + pMgr->size <= index) { + break; + } + if (pMgr->startIndex + 1 < index && pMgr->states[(index - 1) % pMgr->size].barrier) { + break; + } + int64_t pos = index % pMgr->size; + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + return -1; + } + pMgr->states[pos].barrier = barrier; + pMgr->states[pos].timeMs = taosGetMonoTimestampMs(); + pMgr->states[pos].acked = false; + + pMgr->endIndex = index + 1; + if (barrier) { + break; + } + } + + SSyncLogBuffer* pBuf = pNode->pLogBuf; + sInfo("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(restored:%d): [%" PRId64 " %" PRId64 + ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + syncLogReplMgrRetryOnNeed(pMgr, pNode); + return 0; +} + +int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { + ASSERT(pMgr->restored == true); + if (pMgr->startIndex <= pMsg->lastSendIndex && pMsg->lastSendIndex < pMgr->endIndex) { + pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; + pMgr->matchIndex = TMAX(pMgr->matchIndex, pMsg->matchIndex); + for (SyncIndex index = pMgr->startIndex; index < pMgr->matchIndex; index++) { + memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); + } + pMgr->startIndex = pMgr->matchIndex; + } + + return syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); +} + +SSyncLogReplMgr* syncLogReplMgrCreate() { + SSyncLogReplMgr* pMgr = taosMemoryCalloc(1, sizeof(SSyncLogReplMgr)); + if (pMgr == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + pMgr->size = sizeof(pMgr->states) / sizeof(pMgr->states[0]); + + ASSERT(pMgr->size == TSDB_SYNC_LOG_BUFFER_SIZE); + + return pMgr; + +_err: + taosMemoryFree(pMgr); + return NULL; +} + +void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr) { + if (pMgr == NULL) { + return; + } + (void)taosMemoryFree(pMgr); + return; +} + +int32_t syncNodeLogReplMgrInit(SSyncNode* pNode) { + for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + ASSERT(pNode->logReplMgrs[i] == NULL); + pNode->logReplMgrs[i] = syncLogReplMgrCreate(); + pNode->logReplMgrs[i]->peerId = i; + ASSERT(pNode->logReplMgrs[i] != NULL && "Out of memory."); + } + return 0; +} + +void syncNodeLogReplMgrDestroy(SSyncNode* pNode) { + for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + syncLogReplMgrDestroy(pNode->logReplMgrs[i]); + pNode->logReplMgrs[i] = NULL; + } 
+} + SSyncLogBuffer* syncLogBufferCreate() { SSyncLogBuffer* pBuf = taosMemoryCalloc(1, sizeof(SSyncLogBuffer)); if (pBuf == NULL) { @@ -1397,9 +1643,13 @@ SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { // is config changing pSyncNode->changing = false; + // replication mgr + syncNodeLogReplMgrInit(pSyncNode); + // peer state syncNodePeerStateInit(pSyncNode); + // // min match index pSyncNode->minMatchIndex = SYNC_INDEX_INVALID; @@ -1532,6 +1782,7 @@ void syncNodeClose(SSyncNode* pSyncNode) { ret = raftStoreClose(pSyncNode->pRaftStore); ASSERT(ret == 0); + syncNodeLogReplMgrDestroy(pSyncNode); syncRespMgrDestroy(pSyncNode->pSyncRespMgr); pSyncNode->pSyncRespMgr = NULL; voteGrantedDestroy(pSyncNode->pVotesGranted); @@ -2477,6 +2728,11 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { pBuf->endIndex = pBuf->matchIndex + 1; + // reset repl mgr + for (int i = 0; i < pNode->replicaNum; i++) { + SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; + syncLogResetLogReplMgr(pMgr); + } taosThreadMutexUnlock(&pBuf->mutex); return 0; } @@ -2637,8 +2893,12 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { syncNodeLog2("==state change syncNodeCandidate2Leader==", pSyncNode); - // Raft 3.6.2 Committing entries from previous terms - syncNodeAppendNoop(pSyncNode); + int32_t ret = syncNodeAppendNoop(pSyncNode); + ASSERT(ret == 0); + SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); + ASSERT(lastIndex >= 0); + sInfo("vgId:%d, become leader. 
term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64 "", + pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex); } void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) { @@ -2671,22 +2931,33 @@ int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) { void syncNodeFollower2Candidate(SSyncNode* pSyncNode) { ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER); pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE; + SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); + sInfo("vgId:%d, become candidate from follower. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64, + pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex); - syncNodeEventLog(pSyncNode, "follower to candidate"); + // syncNodeEventLog(pSyncNode, "follower to candidate"); } void syncNodeLeader2Follower(SSyncNode* pSyncNode) { ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER); syncNodeBecomeFollower(pSyncNode, "leader to follower"); - syncNodeEventLog(pSyncNode, "leader to follower"); + SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); + sInfo("vgId:%d, become follower from leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64, + pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex); + + // syncNodeEventLog(pSyncNode, "leader to follower"); } void syncNodeCandidate2Follower(SSyncNode* pSyncNode) { ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE); syncNodeBecomeFollower(pSyncNode, "candidate to follower"); - syncNodeEventLog(pSyncNode, "candidate to follower"); + SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); + sInfo("vgId:%d, become follower from candidate. 
term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64, + pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex); + + // syncNodeEventLog(pSyncNode, "candidate to follower"); } // raft vote -------------- @@ -3109,6 +3380,11 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // proceed match index, with replicating on needed SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + sInfo("vgId:%d, append raft log index: %" PRId64 ", term: %" PRId64 " log buffer: [%" PRId64 " %" PRId64 " %" PRId64 + ", %" PRId64 ")", + ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, + ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); + // multi replica if (ths->replicaNum > 1) { return 0; @@ -3135,7 +3411,8 @@ static int32_t syncNodeAppendNoop(SSyncNode* ths) { return -1; } - return syncNodeAppend(ths, pEntry); + int32_t ret = syncNodeAppend(ths, pEntry); + return 0; } static int32_t syncNodeAppendNoopOld(SSyncNode* ths) { @@ -3264,7 +3541,7 @@ int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt syncLogBufferValidate(pBuf); SyncIndex index = pEntry->index; - if (index - pBuf->startIndex > pBuf->size) { + if (index - pBuf->startIndex >= pBuf->size) { sError("vgId:%d, failed to append due to log buffer full. 
index:%" PRId64 "", pNode->vgId, index); goto _out; } @@ -3294,30 +3571,57 @@ _out: return -1; } -SyncTerm syncLogBufferGetTerm(SSyncLogBuffer* pBuf, SyncIndex index) { - ASSERT(pBuf->startIndex <= index && index < pBuf->endIndex); - SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; - ASSERT(pEntry != NULL); - return pEntry->term; +SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + SSyncRaftEntry* pEntry = NULL; + SyncIndex prevIndex = index - 1; + SyncTerm prevLogTerm = -1; + terrno = TSDB_CODE_SUCCESS; + + if (prevIndex == -1) return 0; + + if (index - 1 > pBuf->matchIndex) { + terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; + return -1; + } + + ASSERT(index - 1 == prevIndex); + + if (index - 1 >= pBuf->startIndex) { + pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL && "no log entry found"); + prevLogTerm = pBuf->entries[(index + pBuf->size) % pBuf->size].prevLogTerm; + return prevLogTerm; + } + + if (pMgr->startIndex <= prevIndex && prevIndex < pMgr->endIndex) { + int64_t timeMs = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].timeMs; + ASSERT(timeMs != 0 && "no log entry found"); + prevLogTerm = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].term; + return prevLogTerm; + } + + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) == 0 && prevIndex == snapshot.lastApplyIndex) { + return snapshot.lastApplyTerm; + } + + if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, prevIndex, &pEntry) == 0) { + prevLogTerm = pEntry->term; + syncEntryDestroy(pEntry); + pEntry = NULL; + return prevLogTerm; + } + + sError("vgId:%d, failed to get log term since %s. 
index: %" PRId64 "", pNode->vgId, terrstr(), prevIndex); + terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; + return -1; } -SyncAppendEntries* syncLogToAppendEntries(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index) { +SyncAppendEntries* syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm) { SyncAppendEntries* pMsg = NULL; - - if (index < pBuf->startIndex || index >= pBuf->endIndex) { - sError("vgId:%d, log entry (%" PRId64 ") out of range of log buffer [%" PRId64 ", %" PRId64 ").", pNode->vgId, - index, pBuf->startIndex, pBuf->endIndex); - return pMsg; - } - - SSyncRaftEntry* pEntry = pBuf->entries[index % pBuf->size].pItem; - if (pEntry == NULL) { - sError("vgId:%d, log entry (%" PRId64 ") not exist in log buffer [%" PRId64 ", %" PRId64 ").", pNode->vgId, index, - pBuf->startIndex, pBuf->endIndex); - return pMsg; - } - uint32_t datalen = pEntry->bytes; + pMsg = syncAppendEntriesBuild(datalen, pNode->vgId); if (pMsg == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -3326,8 +3630,8 @@ SyncAppendEntries* syncLogToAppendEntries(SSyncLogBuffer* pBuf, SSyncNode* pNode (void)memcpy(pMsg->data, pEntry, datalen); - pMsg->prevLogIndex = index - 1; - pMsg->prevLogTerm = syncLogBufferGetTerm(pBuf, pMsg->prevLogIndex); + pMsg->prevLogIndex = pEntry->index - 1; + pMsg->prevLogTerm = prevLogTerm; pMsg->vgId = pNode->vgId; pMsg->srcId = pNode->myRaftId; pMsg->term = pNode->pRaftStore->currentTerm; @@ -3345,10 +3649,10 @@ void syncLogReplicateAppendEntries(SSyncNode* pNode, SyncAppendEntries* pMsg) { } } -int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index) { - SyncAppendEntries* pMsgOut = syncLogToAppendEntries(pNode->pLogBuf, pNode, index); +int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm) { + SyncAppendEntries* pMsgOut = syncLogToAppendEntries(pNode, pEntry, prevLogTerm); if (pMsgOut == NULL) { - sError("vgId:%d, failed to get append entries for 
index:%" PRId64 "", pNode->vgId, index); + sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, pEntry->index); goto _err; } diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 181b9f2b74..3dcd2d8cdf 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -136,7 +136,21 @@ int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId) { return 0; } -int32_t syncNodeReplicate(SSyncNode* pSyncNode) { +int32_t syncNodeReplicate(SSyncNode* pNode) { + if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum == 1) { + return -1; + } + for (int32_t i = 0; i < pNode->replicaNum; i++) { + if (syncUtilSameId(&pNode->replicasId[i], &pNode->myRaftId)) { + continue; + } + SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; + (void)syncLogBufferReplicateOnce(pMgr, pNode); + } + return 0; +} + +int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) { if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { return -1; } @@ -159,6 +173,17 @@ int32_t syncNodeReplicate(SSyncNode* pSyncNode) { } int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) { + sInfo("vgId:%d, send append entries msg index: %" PRId64 " to dest: 0x%016" PRId64, pSyncNode->vgId, + pMsg->prevLogIndex + 1, destRaftId->addr); + int32_t ret = 0; + pMsg->destId = *destRaftId; + SRpcMsg rpcMsg; + syncAppendEntries2RpcMsg(pMsg, &rpcMsg); + syncNodeSendMsgById(destRaftId, pSyncNode, &rpcMsg); + return 0; +} + +int32_t syncNodeSendAppendEntriesOld(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) { int32_t ret = 0; pMsg->destId = *destRaftId; diff --git a/source/libs/transport/src/tmsgcb.c b/source/libs/transport/src/tmsgcb.c index 1cd1903851..2007bc474f 100644 --- a/source/libs/transport/src/tmsgcb.c +++ b/source/libs/transport/src/tmsgcb.c @@ -23,6 +23,9 @@ static SMsgCb defaultMsgCb; void tmsgSetDefault(const SMsgCb* msgcb) { 
defaultMsgCb = *msgcb; } int32_t tmsgPutToQueue(const SMsgCb* msgcb, EQueueType qtype, SRpcMsg* pMsg) { + if (msgcb == NULL) { + return -1; + } int32_t code = (*msgcb->putToQueueFp)(msgcb->mgmt, qtype, pMsg); if (code != 0) { rpcFreeCont(pMsg->pCont); From c0c1cd82113cbc2694d2d1d839ca4f1c89f6db99 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 15 Nov 2022 10:24:11 +0800 Subject: [PATCH 03/42] enh: reset sync log repl mgr if restarting of the peer detected in HeartbeatReply --- source/libs/sync/inc/syncInt.h | 6 +- source/libs/sync/src/syncAppendEntries.c | 146 ++++-------------- source/libs/sync/src/syncAppendEntriesReply.c | 12 +- source/libs/sync/src/syncCommit.c | 87 ----------- source/libs/sync/src/syncElection.c | 2 +- source/libs/sync/src/syncMain.c | 60 ++++--- source/libs/sync/src/syncReplication.c | 4 +- source/libs/sync/src/syncTimeout.c | 2 +- 8 files changed, 83 insertions(+), 236 deletions(-) diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 58b32ed025..ae1c4d9e50 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -166,21 +166,21 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry); int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm); -int64_t syncLogBufferLoad(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex); int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex); -SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); SyncAppendEntries* syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm 
prevLogTerm); // private +SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); -void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); +void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); + typedef struct SSyncNode { // init by SSyncInfo SyncGroupId vgId; diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 285981012e..1111547a9a 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -324,11 +324,11 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); goto _err; } - - SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); SyncIndex commitIndex = snapshot.lastApplyIndex; SyncTerm commitTerm = snapshot.lastApplyTerm; - SyncIndex toIndex = TMAX(lastVer, commitIndex); + + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + SyncIndex toIndex = lastVer; ASSERT(lastVer >= commitIndex); // update match index @@ -406,93 +406,6 @@ _err: return -1; } -int64_t syncLogBufferLoadOld(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex) { - taosThreadMutexLock(&pBuf->mutex); - syncLogBufferValidate(pBuf); - - SSyncLogStore* pLogStore = pNode->pLogStore; - ASSERT(pBuf->startIndex <= pBuf->matchIndex); - ASSERT(pBuf->matchIndex + 1 == pBuf->endIndex); - SyncIndex index = pBuf->endIndex; - SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; - ASSERT(pMatch != NULL); 
- - while (index - pBuf->startIndex < pBuf->size && index <= toIndex) { - SSyncRaftEntry* pEntry = NULL; - if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { - sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); - ASSERT(0); - break; - } - ASSERT(pMatch->index + 1 == pEntry->index); - SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; - pBuf->entries[pBuf->endIndex % pBuf->size] = tmp; - - sInfo("vgId:%d, loaded log entry into log buffer. index: %" PRId64 ", term: %" PRId64, pNode->vgId, pEntry->index, - pEntry->term); - - pBuf->matchIndex = index; - pBuf->endIndex = index + 1; - pMatch = pEntry; - index++; - } - - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return index; -} - -int32_t syncLogBufferInitOld(SSyncLogBuffer* pBuf, SSyncNode* pNode) { - taosThreadMutexLock(&pBuf->mutex); - ASSERT(pNode->pLogStore != NULL && "log store not created"); - ASSERT(pNode->pFsm != NULL && "pFsm not registered"); - ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); - - SSnapshot snapshot; - if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { - sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); - goto _err; - } - SyncIndex commitIndex = snapshot.lastApplyIndex; - SyncTerm commitTerm = snapshot.lastApplyTerm; - - // init log buffer indexes - pBuf->startIndex = commitIndex; - pBuf->matchIndex = commitIndex; - pBuf->commitIndex = commitIndex; - pBuf->endIndex = commitIndex + 1; - - // put a dummy record at initial commitIndex - SSyncRaftEntry* pDummy = syncEntryBuildDummy(commitTerm, commitIndex, pNode->vgId); - if (pDummy == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - goto _err; - } - SSyncLogBufEntry tmp = {.pItem = pDummy, .prevLogIndex = commitIndex - 1, .prevLogTerm = commitTerm}; - pBuf->entries[(commitIndex + pBuf->size) % pBuf->size] = tmp; - - 
taosThreadMutexUnlock(&pBuf->mutex); - return 0; - -_err: - taosThreadMutexUnlock(&pBuf->mutex); - return -1; -} - -int32_t syncLogBufferRollbackMatchIndex(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex toIndex) { - if (toIndex <= pBuf->commitIndex) { - sError("vgId:%d, cannot rollback across commit index:%" PRId64 ", to index:%" PRId64 "", pNode->vgId, - pBuf->commitIndex, toIndex); - return -1; - } - - pBuf->matchIndex = TMIN(pBuf->matchIndex, toIndex - 1); - - // update my match index - syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); - return 0; -} - FORCE_INLINE SyncTerm syncLogBufferGetLastMatchTerm(SSyncLogBuffer* pBuf) { SyncIndex index = pBuf->matchIndex; SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; @@ -509,26 +422,25 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); if (index <= pBuf->commitIndex) { - sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sDebug("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); ret = 0; goto _out; } if (index - pBuf->startIndex >= pBuf->size) { - sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sDebug("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". 
log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); goto _out; } if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { - sInfo("vgId:%d, not ready to accept raft entry (i.e. across barrier). index: %" PRId64 ", term: %" PRId64 - ": prevterm: %" PRId64 " /= lastmatch: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 - ", %" PRId64 ")", + sInfo("vgId:%d, not ready to accept raft entry. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 + " != lastmatch: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); goto _out; @@ -542,10 +454,10 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt if (pEntry->term != pExist->term) { (void)syncLogBufferRollback(pBuf, index); } else { - sInfo("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sDebug("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); SyncTerm existPrevTerm = pBuf->entries[index % pBuf->size].prevLogTerm; ASSERT(pEntry->term == pExist->term && prevTerm == existPrevTerm); ret = 0; @@ -647,8 +559,8 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // increase match index pBuf->matchIndex = index; - sInfo("vgId:%d, log buffer proceed. 
start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, - pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); + sDebug("vgId:%d, log buffer proceed. start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); // replicate on demand (void)syncNodeReplicate(pNode); @@ -759,8 +671,8 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm } pBuf->commitIndex = index; - sInfo("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, - pEntry->index, pEntry->term, role, term); + sDebug("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, + pEntry->index, pEntry->term, role, term); if (!inBuf) { syncEntryDestroy(pEntry); @@ -784,8 +696,8 @@ _out: if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); pNode->restoreFinish = true; - sInfo("vgId:%d, restore finished. commit index:%" PRId64 ", match index:%" PRId64 ", last index:%" PRId64 "", - pNode->vgId, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex - 1); + sInfo("vgId:%d, restore finished. pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); } if (!inBuf) { @@ -799,6 +711,7 @@ _out: int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { SyncAppendEntriesReply* pReply = NULL; + bool accepted = false; // if already drop replica, do not process if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { syncLogRecvAppendEntries(ths, pMsg, "not in my config"); @@ -847,20 +760,21 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, SyncAppendEntries* pMsg) { goto _IGNORE; } - sInfo("vgId:%d, recv append entries msg. 
index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 - ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", - pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); + sDebug("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 + ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", + pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); // accept if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { goto _SEND_RESPONSE; } + accepted = true; _SEND_RESPONSE: pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); bool matched = (pReply->matchIndex >= pReply->lastSendIndex); - pReply->success = matched; - if (matched) { + if (accepted && matched) { + pReply->success = true; // update commit index only after matching (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); } diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 86d8ec11b9..ef20153fa7 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -96,8 +96,8 @@ int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { SyncIndex commitIndex = indexLikely; syncNodeUpdateCommitIndex(ths, commitIndex); - sInfo("vgId:%d, agreed upon. role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, - ths->pRaftStore->currentTerm, commitIndex); + sDebug("vgId:%d, agreed upon. 
role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, + ths->pRaftStore->currentTerm, commitIndex); } return ths->commitIndex; } @@ -140,7 +140,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn SyncTerm prevLogTerm = -1; SSyncLogBuffer* pBuf = pNode->pLogBuf; - sInfo("vgId:%d, replicate one msg index: %" PRId64 " to dest: 0x%016" PRIx64, pNode->vgId, index, pDestId->addr); + sDebug("vgId:%d, replicate one msg index: %" PRId64 " to dest: 0x%016" PRIx64, pNode->vgId, index, pDestId->addr); pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); if (pEntry == NULL) { @@ -199,8 +199,8 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMs ASSERT(pMsg->term == ths->pRaftStore->currentTerm); - sInfo("vgId:%d received append entries reply. srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", - pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); + sDebug("vgId:%d received append entries reply. 
srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", + pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); if (pMsg->success) { SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId)); @@ -216,7 +216,7 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMs // replicate log SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); - ASSERT(pMgr != NULL); + // ASSERT(pMgr != NULL); if (pMgr != NULL) { (void)syncLogReplMgrProcessReply(pMgr, ths, pMsg); } diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index a96fa31f83..6ba1867635 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -51,93 +51,6 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { return; } - // update commit index - SyncIndex newCommitIndex = pSyncNode->commitIndex; - for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) { - bool agree = syncAgree(pSyncNode, index); - - if (agree) { - // term - SSyncRaftEntry* pEntry = NULL; - SLRUCache* pCache = pSyncNode->pLogStore->pCache; - LRUHandle* h = taosLRUCacheLookup(pCache, &index, sizeof(index)); - if (h) { - pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h); - } else { - int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "advance commit index error, read wal index:%" PRId64, index); - syncNodeErrorLog(pSyncNode, logBuf); - return; - } - } - // cannot commit, even if quorum agree. need check term! 
- if (pEntry->term <= pSyncNode->pRaftStore->currentTerm) { - // update commit index - newCommitIndex = index; - - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - break; - } else { - do { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64, - pEntry->index, pEntry->term); - syncNodeEventLog(pSyncNode, logBuf); - } while (0); - } - - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - } - } - - // advance commit index as large as possible - SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore); - if (walCommitVer > newCommitIndex) { - newCommitIndex = walCommitVer; - } - - // maybe execute fsm - if (newCommitIndex > pSyncNode->commitIndex) { - SyncIndex beginIndex = pSyncNode->commitIndex + 1; - SyncIndex endIndex = newCommitIndex; - - // update commit index - pSyncNode->commitIndex = newCommitIndex; - - // call back Wal - pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex); - - // execute fsm - if (pSyncNode->pFsm != NULL) { - int32_t code = syncNodeDoCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "advance commit index error, do commit begin:%" PRId64 ", end:%" PRId64, - beginIndex, endIndex); - syncNodeErrorLog(pSyncNode, logBuf); - return; - } - } - } -} - -void syncMaybeAdvanceCommitIndexOld(SSyncNode* pSyncNode) { - if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { - syncNodeErrorLog(pSyncNode, "not leader, can not advance commit index"); - return; - } - // advance commit index to sanpshot first SSnapshot snapshot; pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot); diff --git a/source/libs/sync/src/syncElection.c b/source/libs/sync/src/syncElection.c index b428f4d2f2..a287727a39 100644 --- a/source/libs/sync/src/syncElection.c +++ 
b/source/libs/sync/src/syncElection.c @@ -111,4 +111,4 @@ int32_t syncNodeSendRequestVote(SSyncNode* pSyncNode, const SRaftId* destRaftId, syncRequestVote2RpcMsg(pMsg, &rpcMsg); syncNodeSendMsgById(destRaftId, pSyncNode, &rpcMsg); return ret; -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index f9f6760e8c..aaed6a10d0 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1116,6 +1116,8 @@ int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr) { return 0; } +_Atomic int64_t tsRetryCnt = 0; + int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { if (pMgr->endIndex <= pMgr->startIndex) { return 0; @@ -1147,6 +1149,7 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { pMgr->states[pos].timeMs = nowMs; pMgr->states[pos].acked = false; retried = true; + tsRetryCnt++; } ret = 0; @@ -1185,10 +1188,10 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod (void)syncLogResetLogReplMgr(pMgr); } + // send match index SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); bool barrier = false; ASSERT(index >= 0); - // send match index if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &destId, &barrier) < 0) { sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, terrstr(), index, destId.addr); @@ -1206,6 +1209,17 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod return 0; } +int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + taosThreadMutexLock(&pBuf->mutex); + if (pMsg->startTime != pMgr->peerStartTime) { + syncLogResetLogReplMgr(pMgr); + pMgr->peerStartTime = pMsg->startTime; + } + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); @@ -1245,17 +1259,19 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sInfo("vgId:%d, attempted to probe the %d'th peer. pMgr(restored:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sInfo("vgId:%d, attempted to probe the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } +_Atomic int64_t tsSendCnt = 0; + int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { ASSERT(pMgr->restored); SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - int32_t batchSize = TMAX(1, pMgr->size / 10); + int32_t batchSize = TMAX(1, pMgr->size / 20); int32_t count = 0; for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { @@ -1278,16 +1294,17 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p pMgr->states[pos].acked = false; pMgr->endIndex = index + 1; + tsSendCnt++; if (barrier) { break; } } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sInfo("vgId:%d, attempted to replicate %d msgs to the %d'th peer. pMgr(restored:%d): [%" PRId64 " %" PRId64 - ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sDebug("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); syncLogReplMgrRetryOnNeed(pMgr, pNode); return 0; } @@ -1704,18 +1721,13 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) { SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore); SyncIndex endIndex = pSyncNode->pLogBuf->endIndex; + ASSERT(endIndex == lastVer + 1); commitIndex = TMAX(pSyncNode->commitIndex, commitIndex); if (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) { return -1; } - if (endIndex <= lastVer) { - sError("vgId:%d, failed to load log entries into log buffers. commit index:%" PRId64 ", lastVer: %" PRId64 "", - pSyncNode->vgId, commitIndex, lastVer); - return -1; - } - return 0; } @@ -2722,9 +2734,8 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); - sInfo("vgId:%d, reset log buffer. start index: %" PRId64 ", commit index: %" PRId64 ", match Index: %" PRId64 - ", end index: %" PRId64 "", - pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sInfo("vgId:%d, reset log buffer. 
pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); pBuf->endIndex = pBuf->matchIndex + 1; @@ -3380,10 +3391,10 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // proceed match index, with replicating on needed SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); - sInfo("vgId:%d, append raft log index: %" PRId64 ", term: %" PRId64 " log buffer: [%" PRId64 " %" PRId64 " %" PRId64 - ", %" PRId64 ")", - ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, - ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); + sDebug("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 + ", %" PRId64 ")", + ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, + ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); // multi replica if (ths->replicaNum > 1) { @@ -3521,6 +3532,15 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, SyncHeartbeat* pMsg) { } int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, SyncHeartbeatReply* pMsg) { + SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); + if (pMgr == NULL) { + sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x016%" PRIx64 "", ths->vgId, pMsg->srcId.addr); + return -1; + } + return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg); +} + +int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, SyncHeartbeatReply* pMsg) { syncLogRecvHeartbeatReply(ths, pMsg, ""); // update last reply time, make decision whether the other node is alive or not diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 3dcd2d8cdf..1c4b997875 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -173,8 +173,8 @@ int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) { } int32_t 
syncNodeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) { - sInfo("vgId:%d, send append entries msg index: %" PRId64 " to dest: 0x%016" PRId64, pSyncNode->vgId, - pMsg->prevLogIndex + 1, destRaftId->addr); + sTrace("vgId:%d, send append entries msg index: %" PRId64 " to dest: 0x%016" PRId64, pSyncNode->vgId, + pMsg->prevLogIndex + 1, destRaftId->addr); int32_t ret = 0; pMsg->destId = *destRaftId; SRpcMsg rpcMsg; diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 17c8c14136..3fe7f08816 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -135,4 +135,4 @@ int32_t syncNodeOnTimer(SSyncNode* ths, SyncTimeout* pMsg) { } return ret; -} \ No newline at end of file +} From cf14200dfe2fbfcbd6424a9e5739fdf1fa823c40 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 15 Nov 2022 10:26:32 +0800 Subject: [PATCH 04/42] enh: get timestamp thru clock_gettime(CLOCK_MONOTONIC,) for taosGetMonotonicMs() --- source/os/src/osTimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/os/src/osTimer.c b/source/os/src/osTimer.c index d1c233ea9c..d2b8f3134a 100644 --- a/source/os/src/osTimer.c +++ b/source/os/src/osTimer.c @@ -216,7 +216,7 @@ int64_t taosGetMonotonicMs() { #if 0 return getMonotonicUs() / 1000; #else - return taosGetTimestampMs(); + return taosGetMonoTimestampMs(); #endif } From bf634a840d05f65d7d2001601668939e876c4e55 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 15 Nov 2022 17:13:31 +0800 Subject: [PATCH 05/42] enh: check lastVer and commit version during syncNodeOpen --- source/libs/sync/inc/syncInt.h | 9 ++- source/libs/sync/src/syncAppendEntries.c | 13 +++- source/libs/sync/src/syncAppendEntriesReply.c | 11 ++-- source/libs/sync/src/syncMain.c | 62 +++++++++++-------- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 
ae1c4d9e50..7f6061cf00 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -120,16 +120,15 @@ static FORCE_INLINE int32_t syncLogGetNextRetryBackoff(SSyncLogReplMgr* pMgr) { } static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, SyncIndex index, SyncTerm term) { - if (index < pMgr->startIndex || index >= pMgr->endIndex) { - return -1; - } + if (pMgr->endIndex == 0) return -1; + ASSERT(pMgr->startIndex <= index && index < pMgr->endIndex); pMgr->states[(index + pMgr->size) % pMgr->size].term = term; return 0; } SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); -int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SRaftId* pDestId, - bool* pBarrier); +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, + SRaftId* pDestId, bool* pBarrier); int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 1111547a9a..f94e7c041a 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -328,9 +328,16 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { SyncTerm commitTerm = snapshot.lastApplyTerm; SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); - SyncIndex toIndex = lastVer; - ASSERT(lastVer >= commitIndex); + if (lastVer < commitIndex) { + sError("vgId:%d, lastVer of WAL log less than tsdb commit version. 
lastVer: %" PRId64 + ", tsdb commit version: %" PRId64 "", + pNode->vgId, lastVer, commitIndex); + // TODO: terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; + goto _err; + } + ASSERT(lastVer >= commitIndex); + SyncIndex toIndex = lastVer; // update match index pBuf->commitIndex = commitIndex; pBuf->matchIndex = toIndex; @@ -547,7 +554,7 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { ASSERT(prevLogIndex == pMatch->index); if (pMatch->term != prevLogTerm) { - sError( + sInfo( "vgId:%d, mismatching raft log entries encountered. " "{ index:%" PRId64 ", term:%" PRId64 " } " diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index ef20153fa7..eddd0d51c9 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -131,8 +131,8 @@ static FORCE_INLINE bool syncLogIsReplicationBarrier(SSyncRaftEntry* pEntry) { return pEntry->originalRpcType == TDMT_SYNC_NOOP; } -int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SRaftId* pDestId, - bool* pBarrier) { +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, + SRaftId* pDestId, bool* pBarrier) { SSyncRaftEntry* pEntry = NULL; SyncAppendEntries* pMsgOut = NULL; bool inBuf = false; @@ -140,8 +140,6 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn SyncTerm prevLogTerm = -1; SSyncLogBuffer* pBuf = pNode->pLogBuf; - sDebug("vgId:%d, replicate one msg index: %" PRId64 " to dest: 0x%016" PRIx64, pNode->vgId, index, pDestId->addr); - pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); if (pEntry == NULL) { sError("vgId:%d, failed to get raft entry for index: %" PRId64 "", pNode->vgId, index); @@ -154,7 +152,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn sError("vgId:%d, failed to get prev log term since %s. 
index: %" PRId64 "", pNode->vgId, terrstr(), index); goto _out; } - (void)syncLogReplMgrUpdateTerm(pMgr, pEntry->index, pEntry->term); + if (pTerm) *pTerm = pEntry->term; pMsgOut = syncLogToAppendEntries(pNode, pEntry, prevLogTerm); if (pMsgOut == NULL) { @@ -165,6 +163,9 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn (void)syncNodeSendAppendEntries(pNode, pDestId, pMsgOut); ret = 0; + sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, + pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); + _out: syncAppendEntriesDestroy(pMsgOut); pMsgOut = NULL; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index aaed6a10d0..ccaf5b92a3 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1140,13 +1140,15 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; bool barrier = false; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, terrstr(), index, pDestId->addr); goto _out; } ASSERT(barrier == pMgr->states[pos].barrier); pMgr->states[pos].timeMs = nowMs; + pMgr->states[pos].term = term; pMgr->states[pos].acked = false; retried = true; tsRetryCnt++; @@ -1162,6 +1164,7 @@ _out: int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; SRaftId destId = pMsg->srcId; ASSERT(pMgr->restored == false); @@ -1170,6 +1173,10 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; + sInfo("vgId:%d, sync log repl mgr of the %d'th peer restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } } else { @@ -1182,6 +1189,10 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod if (pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; + sInfo("vgId:%d, sync log repl mgr of the %d'th peer restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1191,8 +1202,9 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod // send match index SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); bool barrier = false; + SyncTerm term = -1; ASSERT(index >= 0); - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &destId, &barrier) < 0) { + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, &destId, &barrier) < 0) { sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, terrstr(), index, destId.addr); return -1; @@ -1201,6 +1213,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod int64_t nowMs = taosGetMonoTimestampMs(); pMgr->states[index % pMgr->size].barrier = barrier; pMgr->states[index % pMgr->size].timeMs = nowMs; + pMgr->states[index % pMgr->size].term = term; pMgr->states[index % pMgr->size].acked = false; pMgr->matchIndex = index; @@ -1212,7 +1225,9 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg) { SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); - if (pMsg->startTime != pMgr->peerStartTime) { + if (pMsg->startTime != 0 && pMsg->startTime != pMgr->peerStartTime) { + sInfo("vgId:%d, reset sync log repl mgr in heartbeat. 
start time:%" PRId64 ", old start time:%" PRId64 "", + pNode->vgId, pMsg->startTime, pMgr->peerStartTime); syncLogResetLogReplMgr(pMgr); pMgr->peerStartTime = pMsg->startTime; } @@ -1224,6 +1239,9 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sync SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); if (pMsg->startTime != pMgr->peerStartTime) { + sInfo("vgId:%d, reset sync log repl mgr in append entries reply. start time:%" PRId64 ", old start time:%" PRId64 + "", + pNode->vgId, pMsg->startTime, pMgr->peerStartTime); syncLogResetLogReplMgr(pMgr); pMgr->peerStartTime = pMsg->startTime; } @@ -1252,17 +1270,19 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode SyncIndex index = pNode->pLogBuf->matchIndex; SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; bool barrier = false; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, terrstr(), index, pDestId->addr); return -1; } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sInfo("vgId:%d, attempted to probe the %d'th peer. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, - pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sInfo("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 + ". 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 + ")", + pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1273,6 +1293,7 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; int32_t batchSize = TMAX(1, pMgr->size / 20); int32_t count = 0; + int64_t nowMs = taosGetMonoTimestampMs(); for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { if (batchSize < count++ || pMgr->startIndex + pMgr->size <= index) { @@ -1284,13 +1305,15 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p int64_t pos = index % pMgr->size; SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; bool barrier = false; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, pDestId, &barrier) < 0) { + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, terrstr(), index, pDestId->addr); return -1; } pMgr->states[pos].barrier = barrier; - pMgr->states[pos].timeMs = taosGetMonoTimestampMs(); + pMgr->states[pos].timeMs = nowMs; + pMgr->states[pos].term = term; pMgr->states[pos].acked = false; pMgr->endIndex = index + 1; @@ -1685,7 +1708,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { // init log buffer if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) { sError("vgId:%d, failed to init raft log buffer since %s", pSyncNode->vgId, terrstr()); - ASSERT(false); + goto _error; } syncNodeEventLog(pSyncNode, "sync open"); @@ -3497,6 +3520,7 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, SyncHeartbeat* pMsg) { pMsgReply->srcId = ths->myRaftId; pMsgReply->term = ths->pRaftStore->currentTerm; pMsgReply->privateTerm = 8864; // magic number + pMsgReply->startTime = ths->startTime; SRpcMsg rpcMsg; syncHeartbeatReply2RpcMsg(pMsgReply, &rpcMsg); @@ -3618,6 +3642,7 @@ SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, S int64_t timeMs = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].timeMs; ASSERT(timeMs != 0 && "no log entry found"); prevLogTerm = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].term; + ASSERT(prevIndex == 0 || prevLogTerm != 0); return prevLogTerm; } @@ -3669,21 +3694,6 @@ void syncLogReplicateAppendEntries(SSyncNode* pNode, SyncAppendEntries* pMsg) { } } -int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm) { - SyncAppendEntries* pMsgOut = syncLogToAppendEntries(pNode, pEntry, prevLogTerm); - if (pMsgOut == NULL) { - sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, pEntry->index); - goto _err; - } - - // replicate pMsgOut - (void)syncLogReplicateAppendEntries(pNode, pMsgOut); - -_err: - syncAppendEntriesDestroy(pMsgOut); - return 0; -} - // TLA+ Spec // ClientRequest(i, v) == // /\ state[i] = Leader From 
9e8404f60aca4daba5bec1669c60380c097ee08f Mon Sep 17 00:00:00 2001 From: Minghao Li Date: Tue, 25 Oct 2022 18:03:22 +0800 Subject: [PATCH 06/42] refactor(sync): adjust timer --- source/libs/sync/inc/syncInt.h | 1 - source/libs/sync/src/syncMain.c | 84 ++++++++++-------------------- source/libs/sync/src/syncTimeout.c | 4 +- 3 files changed, 29 insertions(+), 60 deletions(-) diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 7f6061cf00..5b5f198436 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -248,7 +248,6 @@ typedef struct SSyncNode { tmr_h pElectTimer; int32_t electTimerMS; uint64_t electTimerLogicClock; - uint64_t electTimerLogicClockUser; TAOS_TMR_CALLBACK FpElectTimerCB; // Timer Fp uint64_t electTimerCounter; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index ccaf5b92a3..1e60b4950a 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1631,7 +1631,6 @@ SSyncNode* syncNodeOpen(SSyncInfo* pOldSyncInfo) { pSyncNode->pElectTimer = NULL; pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine); atomic_store_64(&pSyncNode->electTimerLogicClock, 0); - atomic_store_64(&pSyncNode->electTimerLogicClockUser, 0); pSyncNode->FpElectTimerCB = syncNodeEqElectTimer; pSyncNode->electTimerCounter = 0; @@ -1937,15 +1936,6 @@ int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) { pSyncNode->electTimerMS = ms; taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, pSyncNode, gSyncEnv->pTimerManager, &pSyncNode->pElectTimer); - atomic_store_64(&pSyncNode->electTimerLogicClock, pSyncNode->electTimerLogicClockUser); - - /* - do { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "elect timer reset, ms:%d", ms); - syncNodeEventLog(pSyncNode, logBuf); - } while (0); - */ } else { sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId); @@ -1955,11 +1945,10 
@@ int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) { int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) { int32_t ret = 0; - atomic_add_fetch_64(&pSyncNode->electTimerLogicClockUser, 1); + atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1); taosTmrStop(pSyncNode->pElectTimer); pSyncNode->pElectTimer = NULL; - // sTrace("vgId:%d, sync %s stop elect timer", pSyncNode->vgId, syncUtilState2String(pSyncNode->state)); return ret; } @@ -2185,8 +2174,6 @@ cJSON* syncNode2Json(const SSyncNode* pSyncNode) { cJSON_AddNumberToObject(pRoot, "electTimerMS", pSyncNode->electTimerMS); snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerLogicClock); cJSON_AddStringToObject(pRoot, "electTimerLogicClock", u64buf); - snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerLogicClockUser); - cJSON_AddStringToObject(pRoot, "electTimerLogicClockUser", u64buf); snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpElectTimerCB); cJSON_AddStringToObject(pRoot, "FpElectTimerCB", u64buf); snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerCounter); @@ -2288,7 +2275,7 @@ inline void syncNodeEventLog(const SSyncNode* pSyncNode, char* str) { snapshot.lastApplyTerm, pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize, pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing, pSyncNode->restoreFinish, syncNodeDynamicQuorum(pSyncNode), - pSyncNode->electTimerLogicClockUser, pSyncNode->heartbeatTimerLogicClockUser, peerStateStr, printStr); + pSyncNode->electTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser, peerStateStr, printStr); } else { snprintf(logBuf, sizeof(logBuf), "%s", str); } @@ -2312,7 +2299,7 @@ inline void syncNodeEventLog(const SSyncNode* pSyncNode, char* str) { snapshot.lastApplyTerm, pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize, pSyncNode->replicaNum, 
pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing, pSyncNode->restoreFinish, syncNodeDynamicQuorum(pSyncNode), - pSyncNode->electTimerLogicClockUser, pSyncNode->heartbeatTimerLogicClockUser, peerStateStr, printStr); + pSyncNode->electTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser, peerStateStr, printStr); } else { snprintf(s, len, "%s", str); } @@ -2362,7 +2349,7 @@ inline void syncNodeErrorLog(const SSyncNode* pSyncNode, char* str) { snapshot.lastApplyTerm, pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize, pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing, pSyncNode->restoreFinish, syncNodeDynamicQuorum(pSyncNode), - pSyncNode->electTimerLogicClockUser, pSyncNode->heartbeatTimerLogicClockUser, printStr); + pSyncNode->electTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser, printStr); } else { snprintf(logBuf, sizeof(logBuf), "%s", str); } @@ -2384,7 +2371,7 @@ inline void syncNodeErrorLog(const SSyncNode* pSyncNode, char* str) { snapshot.lastApplyTerm, pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize, pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing, pSyncNode->restoreFinish, syncNodeDynamicQuorum(pSyncNode), - pSyncNode->electTimerLogicClockUser, pSyncNode->heartbeatTimerLogicClockUser, printStr); + pSyncNode->electTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser, printStr); } else { snprintf(s, len, "%s", str); } @@ -3226,36 +3213,31 @@ static void syncNodeEqPingTimer(void* param, void* tmrId) { static void syncNodeEqElectTimer(void* param, void* tmrId) { SSyncNode* pSyncNode = (SSyncNode*)param; - if (atomic_load_64(&pSyncNode->electTimerLogicClockUser) <= atomic_load_64(&pSyncNode->electTimerLogicClock)) { - SyncTimeout* pSyncMsg = syncTimeoutBuild2(SYNC_TIMEOUT_ELECTION, atomic_load_64(&pSyncNode->electTimerLogicClock), - pSyncNode->electTimerMS, 
pSyncNode->vgId, pSyncNode); - SRpcMsg rpcMsg; - syncTimeout2RpcMsg(pSyncMsg, &rpcMsg); - syncRpcMsgLog2((char*)"==syncNodeEqElectTimer==", &rpcMsg); - if (pSyncNode->FpEqMsg != NULL) { - int32_t code = pSyncNode->FpEqMsg(pSyncNode->msgcb, &rpcMsg); - if (code != 0) { - sError("vgId:%d, sync enqueue elect msg error, code:%d", pSyncNode->vgId, code); - rpcFreeCont(rpcMsg.pCont); - syncTimeoutDestroy(pSyncMsg); - return; - } - } else { - sTrace("syncNodeEqElectTimer FpEqMsg is NULL"); - } - syncTimeoutDestroy(pSyncMsg); - // reset timer ms - if (syncEnvIsStart() && pSyncNode->electBaseLine > 0) { - pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine); - taosTmrReset(syncNodeEqElectTimer, pSyncNode->electTimerMS, pSyncNode, gSyncEnv->pTimerManager, - &pSyncNode->pElectTimer); - } else { - sError("sync env is stop, syncNodeEqElectTimer"); + SyncTimeout* pSyncMsg = syncTimeoutBuild2(SYNC_TIMEOUT_ELECTION, atomic_load_64(&pSyncNode->electTimerLogicClock), + pSyncNode->electTimerMS, pSyncNode->vgId, pSyncNode); + SRpcMsg rpcMsg; + syncTimeout2RpcMsg(pSyncMsg, &rpcMsg); + if (pSyncNode->FpEqMsg != NULL) { + int32_t code = pSyncNode->FpEqMsg(pSyncNode->msgcb, &rpcMsg); + if (code != 0) { + sError("vgId:%d, sync enqueue elect msg error, code:%d", pSyncNode->vgId, code); + rpcFreeCont(rpcMsg.pCont); + syncTimeoutDestroy(pSyncMsg); + return; } } else { - sTrace("==syncNodeEqElectTimer== electTimerLogicClock:%" PRIu64 ", electTimerLogicClockUser:%" PRIu64, - pSyncNode->electTimerLogicClock, pSyncNode->electTimerLogicClockUser); + sTrace("syncNodeEqElectTimer FpEqMsg is NULL"); + } + syncTimeoutDestroy(pSyncMsg); + + // reset timer ms + if (syncEnvIsStart() && pSyncNode->electBaseLine > 0) { + pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine); + taosTmrReset(syncNodeEqElectTimer, pSyncNode->electTimerMS, pSyncNode, gSyncEnv->pTimerManager, + &pSyncNode->pElectTimer); + } else 
{ + sError("sync env is stop, syncNodeEqElectTimer"); } } @@ -3480,16 +3462,6 @@ static int32_t syncNodeAppendNoopOld(SSyncNode* ths) { // on message ---- int32_t syncNodeOnPingCb(SSyncNode* ths, SyncPing* pMsg) { // log state - char logBuf[1024] = {0}; - snprintf(logBuf, sizeof(logBuf), - "==syncNodeOnPingCb== vgId:%d, state: %d, %s, term:%" PRIu64 " electTimerLogicClock:%" PRIu64 - ", " - "electTimerLogicClockUser:%" PRIu64 ", electTimerMS:%d", - ths->vgId, ths->state, syncUtilState2String(ths->state), ths->pRaftStore->currentTerm, - ths->electTimerLogicClock, ths->electTimerLogicClockUser, ths->electTimerMS); - - int32_t ret = 0; - syncPingLog2(logBuf, pMsg); SyncPingReply* pMsgReply = syncPingReplyBuild3(&ths->myRaftId, &pMsg->srcId, ths->vgId); SRpcMsg rpcMsg; syncPingReply2RpcMsg(pMsgReply, &rpcMsg); @@ -3503,7 +3475,7 @@ int32_t syncNodeOnPingCb(SSyncNode* ths, SyncPing* pMsg) { syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg); - return ret; + return 0; } int32_t syncNodeOnPingReplyCb(SSyncNode* ths, SyncPingReply* pMsg) { diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 3fe7f08816..fa7660dccf 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -113,10 +113,8 @@ int32_t syncNodeOnTimer(SSyncNode* ths, SyncTimeout* pMsg) { } } else if (pMsg->timeoutType == SYNC_TIMEOUT_ELECTION) { - if (atomic_load_64(&ths->electTimerLogicClockUser) <= pMsg->logicClock) { + if (atomic_load_64(&ths->electTimerLogicClock) <= pMsg->logicClock) { ++(ths->electTimerCounter); - sTrace("vgId:%d, sync timer, type:election count:%" PRIu64 ", lc-user:%" PRIu64, ths->vgId, - ths->electTimerCounter, ths->electTimerLogicClockUser); syncNodeElect(ths); } From ab2f4e974c720b8ece920713a22d6f5bff682014 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 16 Nov 2022 12:02:46 +0800 Subject: [PATCH 07/42] fix: protect against nullptr of pMsg in cliHandleExeptImpl --- source/libs/transport/src/transCli.c | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 126b0b638e..04269a3e05 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -451,7 +451,7 @@ void cliHandleExceptImpl(SCliConn* pConn, int32_t code) { if (pCtx == NULL || pCtx->pSem == NULL) { if (transMsg.info.ahandle == NULL) { - if (REQUEST_NO_RESP(&pMsg->msg) || pMsg->type == Release) destroyCmsg(pMsg); + if (pMsg == NULL || REQUEST_NO_RESP(&pMsg->msg) || pMsg->type == Release) destroyCmsg(pMsg); once = true; continue; } From 3e13cd82808653f02248dda4e5f3ce86d9388ef5 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 16 Nov 2022 14:26:18 +0800 Subject: [PATCH 08/42] fix: check if timer triggered ahead of time in syncNodeElect --- include/libs/sync/syncTools.h | 2 + include/util/tdef.h | 2 +- source/libs/sync/inc/syncInt.h | 1 + source/libs/sync/src/syncAppendEntriesReply.c | 6 +-- source/libs/sync/src/syncElection.c | 13 +++++- source/libs/sync/src/syncMain.c | 41 +++++++++++++++---- source/libs/sync/src/syncReplication.c | 7 ++++ 7 files changed, 58 insertions(+), 14 deletions(-) diff --git a/include/libs/sync/syncTools.h b/include/libs/sync/syncTools.h index d5c015bfb2..d009ea0b4f 100644 --- a/include/libs/sync/syncTools.h +++ b/include/libs/sync/syncTools.h @@ -696,6 +696,8 @@ int32_t syncNodeOnSnapshotReply(SSyncNode* ths, SyncSnapshotRsp* pMsg); int32_t syncNodeOnHeartbeat(SSyncNode* ths, SyncHeartbeat* pMsg); int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, SyncHeartbeatReply* pMsg); +void syncNodePegLastMsgRecvTime(SSyncNode* ths); + // ----------------------------------------- typedef int32_t (*FpOnPingCb)(SSyncNode* ths, SyncPing* pMsg); typedef int32_t (*FpOnPingReplyCb)(SSyncNode* ths, SyncPingReply* pMsg); diff --git a/include/util/tdef.h b/include/util/tdef.h index c5776e8d87..742b6b1da9 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -281,7 
+281,7 @@ typedef enum ELogicConditionType { #define TSDB_DNODE_ROLE_VNODE 2 #define TSDB_MAX_REPLICA 5 -#define TSDB_SYNC_LOG_BUFFER_SIZE 512 +#define TSDB_SYNC_LOG_BUFFER_SIZE 1024 #define TSDB_TBNAME_COLUMN_INDEX (-1) #define TSDB_MULTI_TABLEMETA_MAX_NUM 100000 // maximum batch size allowed to load table meta diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 5b5f198436..b341587bbe 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -298,6 +298,7 @@ typedef struct SSyncNode { int64_t startTime; int64_t leaderTime; int64_t lastReplicateTime; + int64_t lastMsgRecvTime; } SSyncNode; diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index eddd0d51c9..cbff83acf8 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -148,7 +148,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn *pBarrier = syncLogIsReplicationBarrier(pEntry); prevLogTerm = syncLogReplMgrGetPrevLogTerm(pMgr, pNode, index); - if (prevLogTerm < 0 && terrno != TSDB_CODE_SUCCESS) { + if (prevLogTerm < 0) { sError("vgId:%d, failed to get prev log term since %s. 
index: %" PRId64 "", pNode->vgId, terrstr(), index); goto _out; } @@ -163,8 +163,8 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn (void)syncNodeSendAppendEntries(pNode, pDestId, pMsgOut); ret = 0; - sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, - pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); + sDebug("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, + pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); _out: syncAppendEntriesDestroy(pMsgOut); diff --git a/source/libs/sync/src/syncElection.c b/source/libs/sync/src/syncElection.c index a287727a39..8d904ff934 100644 --- a/source/libs/sync/src/syncElection.c +++ b/source/libs/sync/src/syncElection.c @@ -34,6 +34,13 @@ int32_t syncNodeElect(SSyncNode* pSyncNode) { syncNodeEventLog(pSyncNode, "begin election"); + int64_t nowMs = taosGetMonoTimestampMs(); + if (nowMs < pSyncNode->lastMsgRecvTime + pSyncNode->electTimerMS) { + sError("vgId:%d, election timer triggered ahead of time for %" PRId64 "ms", pSyncNode->vgId, + pSyncNode->lastMsgRecvTime + pSyncNode->electTimerMS - nowMs); + return -1; + } + int32_t ret = 0; if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) { syncNodeFollower2Candidate(pSyncNode); @@ -105,7 +112,11 @@ int32_t syncNodeRequestVotePeers(SSyncNode* pSyncNode) { int32_t syncNodeSendRequestVote(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncRequestVote* pMsg) { int32_t ret = 0; - syncLogSendRequestVote(pSyncNode, pMsg, ""); + // syncLogSendRequestVote(pSyncNode, pMsg, ""); + char host[64]; + uint16_t port; + syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port); + sInfo("vgId:%d, send request vote of term: %" PRId64 " to %s:%d", pSyncNode->vgId, pMsg->term, host, port); SRpcMsg rpcMsg; syncRequestVote2RpcMsg(pMsg, &rpcMsg); diff --git a/source/libs/sync/src/syncMain.c 
b/source/libs/sync/src/syncMain.c index 1e60b4950a..7811d3ca59 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1167,15 +1167,18 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod SSyncLogBuffer* pBuf = pNode->pLogBuf; SRaftId destId = pMsg->srcId; ASSERT(pMgr->restored == false); + char host[64]; + uint16_t port; + syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port); if (pMgr->endIndex == 0) { ASSERT(pMgr->startIndex == 0); ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of the %d'th peer restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1189,9 +1192,9 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod if (pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of the %d'th peer restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1278,11 +1281,11 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sInfo("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 - ". pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 - ")", - pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sDebug("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 + ". 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 + ")", + pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1963,6 +1966,8 @@ int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) { int32_t ret = 0; int32_t electMS; + syncNodePegLastMsgRecvTime(pSyncNode); + if (pSyncNode->pRaftCfg->isStandBy) { electMS = TIMER_MAX_MS; } else { @@ -3231,6 +3236,7 @@ static void syncNodeEqElectTimer(void* param, void* tmrId) { } syncTimeoutDestroy(pSyncMsg); +#if 0 // reset timer ms if (syncEnvIsStart() && pSyncNode->electBaseLine > 0) { pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine); @@ -3239,6 +3245,7 @@ static void syncNodeEqElectTimer(void* param, void* tmrId) { } else { sError("sync env is stop, syncNodeEqElectTimer"); } +#endif } static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) { @@ -3246,6 +3253,10 @@ static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) { syncNodeEventLog(pSyncNode, "eq hb timer"); +#if 0 + sInfo("vgId:%d, heartbeat timer tick.", pSyncNode->vgId); +#endif + if (pSyncNode->replicaNum > 1) { if (atomic_load_64(&pSyncNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pSyncNode->heartbeatTimerLogicClock)) { @@ -3484,6 +3495,11 @@ int32_t syncNodeOnPingReplyCb(SSyncNode* ths, SyncPingReply* pMsg) { return ret; } +void syncNodePegLastMsgRecvTime(SSyncNode* ths) { + int64_t nowMs = taosGetMonoTimestampMs(); + ths->lastMsgRecvTime = nowMs; +} + int32_t syncNodeOnHeartbeat(SSyncNode* ths, SyncHeartbeat* pMsg) { syncLogRecvHeartbeat(ths, pMsg, ""); @@ -3497,6 +3513,13 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, SyncHeartbeat* pMsg) { SRpcMsg rpcMsg; syncHeartbeatReply2RpcMsg(pMsgReply, &rpcMsg); +#if 0 + char host[64]; + uint16_t port; + syncUtilU642Addr(pMsg->srcId.addr, host, 
sizeof(host), &port); + sInfo("vgId:%d, recv heartbeat msg from %s:%d", ths->vgId, host, port); +#endif + #if 1 if (pMsg->term >= ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) { syncNodeStepDown(ths, pMsg->term); diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 1c4b997875..0c6290180f 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -239,6 +239,13 @@ int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* destRaftId, c int32_t ret = 0; syncLogSendHeartbeat(pSyncNode, pMsg, ""); +#if 0 + char host[64]; + uint16_t port; + syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port); + sInfo("vgId:%d, send heartbeat msg to %s:%d", pSyncNode->vgId, host, port); +#endif + SRpcMsg rpcMsg; syncHeartbeat2RpcMsg(pMsg, &rpcMsg); syncNodeSendMsgById(&(pMsg->destId), pSyncNode, &rpcMsg); From 60626327222dac0a956be8715ee55dbbccb22dc5 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 16 Nov 2022 19:53:22 +0800 Subject: [PATCH 09/42] enh: turn commit-cb logging msg to debug level in vnodeSyncCommitMsg --- source/dnode/vnode/src/vnd/vnodeSync.c | 8 ++++---- source/libs/sync/src/syncAppendEntries.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index c3ccbddc53..e66bed22cd 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -418,10 +418,10 @@ static void vnodeSyncCommitMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta c rpcMsg.info.conn.applyIndex = cbMeta.index; rpcMsg.info.conn.applyTerm = cbMeta.term; - vInfo("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 - ", weak:%d, code:%d, state:%d %s, type:%s", - syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.term, rpcMsg.info.conn.applyIndex, cbMeta.isWeak, - cbMeta.code, cbMeta.state, 
syncUtilState2String(cbMeta.state), TMSG_INFO(pMsg->msgType)); + vDebug("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 + ", weak:%d, code:%d, state:%d %s, type:%s", + syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.term, rpcMsg.info.conn.applyIndex, cbMeta.isWeak, + cbMeta.code, cbMeta.state, syncUtilState2String(cbMeta.state), TMSG_INFO(pMsg->msgType)); tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg); } else { diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index f94e7c041a..2552867a99 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -618,10 +618,12 @@ int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf) { ASSERT(pBuf->commitIndex <= pBuf->matchIndex); ASSERT(pBuf->matchIndex < pBuf->endIndex); ASSERT(pBuf->endIndex - pBuf->startIndex <= pBuf->size); +#if 0 for (SyncIndex index = pBuf->startIndex; index <= pBuf->matchIndex; index++) { SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; ASSERT(pEntry != NULL); } +#endif return 0; } From da469149db61b1725c9d2adb657497f3b935f057 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 17 Nov 2022 20:27:23 +0800 Subject: [PATCH 10/42] enh: debug to info --- source/dnode/vnode/src/vnd/vnodeSync.c | 12 ++--- source/libs/sync/src/syncAppendEntries.c | 26 +++++----- source/libs/sync/src/syncAppendEntriesReply.c | 12 ++--- source/libs/sync/src/syncMain.c | 48 +++++++------------ 4 files changed, 41 insertions(+), 57 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 7ab19c5c6e..feecbc6d3b 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -307,10 +307,10 @@ static void vnodeSyncApplyMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const S rpcMsg.info.conn.applyTerm = pMeta->term; const STraceId *trace = &pMsg->info.traceId; - 
vGTrace("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 - ", weak:%d, code:%d, state:%d %s, type:%s", - pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, rpcMsg.info.conn.applyIndex, pMeta->isWeak, - pMeta->code, pMeta->state, syncStr(pMeta->state), TMSG_INFO(pMsg->msgType)); + vGInfo("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 + ", weak:%d, code:%d, state:%d %s, type:%s", + pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, rpcMsg.info.conn.applyIndex, pMeta->isWeak, + pMeta->code, pMeta->state, syncStr(pMeta->state), TMSG_INFO(pMsg->msgType)); tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg); } else { @@ -558,12 +558,12 @@ bool vnodeIsLeader(SVnode *pVnode) { } else { terrno = TSDB_CODE_APP_NOT_READY; } - vDebug("vgId:%d, vnode not ready, state:%s, restore:%d", pVnode->config.vgId, syncStr(state.state), state.restored); + vInfo("vgId:%d, vnode not ready, state:%s, restore:%d", pVnode->config.vgId, syncStr(state.state), state.restored); return false; } if (!pVnode->restored) { - vDebug("vgId:%d, vnode not restored", pVnode->config.vgId); + vInfo("vgId:%d, vnode not restored", pVnode->config.vgId); terrno = TSDB_CODE_APP_NOT_READY; return false; } diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 0d892cdd1d..8197119b87 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -245,19 +245,19 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); if (index <= pBuf->commitIndex) { - sDebug("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". 
log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); ret = 0; goto _out; } if (index - pBuf->startIndex >= pBuf->size) { - sDebug("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); goto _out; } @@ -382,8 +382,8 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // increase match index pBuf->matchIndex = index; - sDebug("vgId:%d, log buffer proceed. start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, - pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); + sInfo("vgId:%d, log buffer proceed. start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); // replicate on demand (void)syncNodeReplicate(pNode); @@ -586,9 +586,9 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { goto _IGNORE; } - sDebug("vgId:%d, recv append entries msg. 
index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 - ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", - pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); + sInfo("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 + ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", + pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); // accept if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 3a7513f276..3295f03016 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -50,8 +50,8 @@ int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { SyncIndex commitIndex = indexLikely; syncNodeUpdateCommitIndex(ths, commitIndex); - sDebug("vgId:%d, agreed upon. role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, - ths->pRaftStore->currentTerm, commitIndex); + sInfo("vgId:%d, agreed upon. 
role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, + ths->pRaftStore->currentTerm, commitIndex); } return ths->commitIndex; } @@ -113,8 +113,8 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn (void)syncNodeSendAppendEntries(pNode, pDestId, &msgOut); ret = 0; - sDebug("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, - pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); + sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, + pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); if (!inBuf) { syncEntryDestroy(pEntry); @@ -157,8 +157,8 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { ASSERT(pMsg->term == ths->pRaftStore->currentTerm); - sDebug("vgId:%d received append entries reply. srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", - pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); + sInfo("vgId:%d received append entries reply. 
srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", + pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); if (pMsg->success) { SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId)); diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index a54fbd294b..7e19b9aa5c 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -93,13 +93,13 @@ int32_t syncStart(int64_t rid) { goto _err; } - if (syncNodeStart(pSyncNode) < 0) { - sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr()); - goto _err; - } + if (syncNodeStart(pSyncNode) < 0) { + sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr()); + goto _err; + } - syncNodeRelease(pSyncNode); - return 0; + syncNodeRelease(pSyncNode); + return 0; _err: syncNodeRelease(pSyncNode); @@ -524,13 +524,8 @@ int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapsho pSnapshot->lastApplyTerm = pEntry->term; pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index); -<<<<<<< HEAD - syncEntryDestroy(pEntry); - taosReleaseRef(tsNodeRefId, pSyncNode->rid); -======= syncEntryDestroy(pEntry); syncNodeRelease(pSyncNode); ->>>>>>> 3.0 return 0; } @@ -771,7 +766,7 @@ _out: int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { SSyncLogBuffer* pBuf = pNode->pLogBuf; - SRaftId destId = pMsg->srcId; + SRaftId destId = pMsg->srcId; ASSERT(pMgr->restored == false); char host[64]; uint16_t port; @@ -2508,10 +2503,10 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // proceed match index, with replicating on needed SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); - sDebug("vgId:%d, append raft entry. 
index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 - ", %" PRId64 ")", - ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, - ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); + sInfo("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 + ", %" PRId64 ")", + ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, + ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); // multi replica if (ths->replicaNum > 1) { @@ -2646,7 +2641,7 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) { int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { SyncHeartbeatReply* pMsg = pRpcMsg->pCont; - SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); + SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); if (pMgr == NULL) { sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x016%" PRIx64 "", ths->vgId, pMsg->srcId.addr); return -1; @@ -2797,17 +2792,6 @@ int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTer return 0; } -#if 0 -void syncLogReplicateAppendEntries(SSyncNode* pNode, SyncAppendEntries* pMsg) { - for (int i = 0; i < pNode->replicaNum; i++) { - SRaftId* pDestId = &pNode->peersId[i]; - if (!syncUtilSameId(pDestId, &pNode->myRaftId)) { - (void)syncNodeSendAppendEntries(pNode, pDestId, pMsg); - } - } -} -#endif - // TLA+ Spec // ClientRequest(i, v) == // /\ state[i] = Leader @@ -2824,9 +2808,9 @@ int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIn int32_t code = 0; - SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf); - SyncTerm term = ths->pRaftStore->currentTerm; - SSyncRaftEntry* pEntry = NULL; + SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf); + SyncTerm term = ths->pRaftStore->currentTerm; + SSyncRaftEntry* pEntry = NULL; if (pMsg->msgType == 
TDMT_SYNC_CLIENT_REQUEST) { pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index); } else { @@ -2841,7 +2825,7 @@ int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIn return syncNodeAppend(ths, pEntry); } - return 0; + return -1; } int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) { From 4526ff28769554b8c0c7aa002244052e46717226 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 17 Nov 2022 23:37:02 +0800 Subject: [PATCH 11/42] enh: add syncLogBuffer.h and syncLogBuffer.c --- source/libs/sync/inc/syncInt.h | 98 +- source/libs/sync/inc/syncLogBuffer.h | 129 +++ source/libs/sync/src/syncAppendEntries.c | 394 +------- source/libs/sync/src/syncAppendEntriesReply.c | 77 +- source/libs/sync/src/syncLogBuffer.c | 944 ++++++++++++++++++ source/libs/sync/src/syncMain.c | 455 +-------- source/libs/sync/src/syncReplication.c | 3 + source/os/src/osTimer.c | 2 +- 8 files changed, 1083 insertions(+), 1019 deletions(-) create mode 100644 source/libs/sync/inc/syncLogBuffer.h create mode 100644 source/libs/sync/src/syncLogBuffer.c diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 2199b9b65a..f204f7f4fe 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -50,6 +50,8 @@ typedef struct SyncPreSnapshotReply SyncPreSnapshotReply; typedef struct SyncHeartbeatReply SyncHeartbeatReply; typedef struct SyncHeartbeat SyncHeartbeat; typedef struct SyncPreSnapshot SyncPreSnapshot; +typedef struct SSyncLogBuffer SSyncLogBuffer; +typedef struct SSyncLogReplMgr SSyncLogReplMgr; typedef struct SRaftId { SyncNodeId addr; @@ -85,102 +87,6 @@ typedef struct SPeerState { int64_t lastSendTime; } SPeerState; -typedef struct SSyncReplInfo { - bool barrier; - bool acked; - int64_t timeMs; - int64_t term; -} SSyncReplInfo; - -typedef struct SSyncLogReplMgr { - SSyncReplInfo states[TSDB_SYNC_LOG_BUFFER_SIZE]; - int64_t startIndex; - int64_t matchIndex; - int64_t 
endIndex; - int64_t size; - bool restored; - int64_t peerStartTime; - int32_t retryBackoff; - int32_t peerId; -} SSyncLogReplMgr; - -SSyncLogReplMgr* syncLogReplMgrCreate(); -void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr); - -// access -static FORCE_INLINE int64_t syncLogGetRetryBackoffTimeMs(SSyncLogReplMgr* pMgr) { - return (1 << pMgr->retryBackoff) * SYNC_LOG_REPL_RETRY_WAIT_MS; -} - -static FORCE_INLINE int32_t syncLogGetNextRetryBackoff(SSyncLogReplMgr* pMgr) { - return TMIN(pMgr->retryBackoff + 1, SYNC_MAX_RETRY_BACKOFF); -} - -static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, SyncIndex index, SyncTerm term) { - if (pMgr->endIndex == 0) return -1; - ASSERT(pMgr->startIndex <= index && index < pMgr->endIndex); - pMgr->states[(index + pMgr->size) % pMgr->size].term = term; - return 0; -} - -SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); -int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, - SRaftId* pDestId, bool* pBarrier); -int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); -int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); -int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); -int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr); -int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); - -int32_t syncNodeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); -int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, 
SRpcMsg* pRpcMsg); -void syncLogDestroyAppendEntries(SRpcMsg* pRpcMsg); - -// others -bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr); - -typedef struct SSyncLogBufEntry { - SSyncRaftEntry* pItem; - SyncIndex prevLogIndex; - SyncTerm prevLogTerm; -} SSyncLogBufEntry; - -typedef struct SSyncLogBuffer { - SSyncLogBufEntry entries[TSDB_SYNC_LOG_BUFFER_SIZE]; - int64_t startIndex; - int64_t commitIndex; - int64_t matchIndex; - int64_t endIndex; - int64_t size; - TdThreadMutex mutex; -} SSyncLogBuffer; - -SSyncLogBuffer* syncLogBufferCreate(); -void syncLogBufferDestroy(SSyncLogBuffer* pBuf); -int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); - -// access -int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); -int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry); -int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm); -int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); -int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex); - -int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); -int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg); - -// private -SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); -int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); -int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); -int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); -bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); - -void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); - typedef struct SSyncNode { // init by SSyncInfo SyncGroupId vgId; diff --git a/source/libs/sync/inc/syncLogBuffer.h b/source/libs/sync/inc/syncLogBuffer.h new file 
mode 100644 index 0000000000..4c209549b4 --- /dev/null +++ b/source/libs/sync/inc/syncLogBuffer.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _TD_LIBS_SYNC_LOG_BUFFER_H +#define _TD_LIBS_SYNC_LOG_BUFFER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "syncInt.h" + +typedef struct SSyncReplInfo { + bool barrier; + bool acked; + int64_t timeMs; + int64_t term; +} SSyncReplInfo; + +typedef struct SSyncLogReplMgr { + SSyncReplInfo states[TSDB_SYNC_LOG_BUFFER_SIZE]; + int64_t startIndex; + int64_t matchIndex; + int64_t endIndex; + int64_t size; + bool restored; + int64_t peerStartTime; + int32_t retryBackoff; + int32_t peerId; +} SSyncLogReplMgr; + +SSyncLogReplMgr* syncLogReplMgrCreate(); +void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr); +int32_t syncNodeLogReplMgrInit(SSyncNode* pNode); +void syncNodeLogReplMgrDestroy(SSyncNode* pNode); + +// access +static FORCE_INLINE int64_t syncLogGetRetryBackoffTimeMs(SSyncLogReplMgr* pMgr) { + return (1 << pMgr->retryBackoff) * SYNC_LOG_REPL_RETRY_WAIT_MS; +} + +static FORCE_INLINE int32_t syncLogGetNextRetryBackoff(SSyncLogReplMgr* pMgr) { + return TMIN(pMgr->retryBackoff + 1, SYNC_MAX_RETRY_BACKOFF); +} + +static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, SyncIndex index, SyncTerm term) { + if (pMgr->endIndex == 0) return -1; + ASSERT(pMgr->startIndex <= index && index < pMgr->endIndex); + pMgr->states[(index + pMgr->size) 
% pMgr->size].term = term; + return 0; +} + +SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, + SRaftId* pDestId, bool* pBarrier); +int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr); +int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); +int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg); + +int32_t syncNodeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); +int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); +void syncLogDestroyAppendEntries(SRpcMsg* pRpcMsg); + +// others +bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr); + +typedef struct SSyncLogBufEntry { + SSyncRaftEntry* pItem; + SyncIndex prevLogIndex; + SyncTerm prevLogTerm; +} SSyncLogBufEntry; + +typedef struct SSyncLogBuffer { + SSyncLogBufEntry entries[TSDB_SYNC_LOG_BUFFER_SIZE]; + int64_t startIndex; + int64_t commitIndex; + int64_t matchIndex; + int64_t endIndex; + int64_t size; + TdThreadMutex mutex; +} SSyncLogBuffer; + +SSyncLogBuffer* syncLogBufferCreate(); +void syncLogBufferDestroy(SSyncLogBuffer* pBuf); +int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); + +// 
access +int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); +int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry); +int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm); +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); +int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex); +int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode); + +// private +SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); +int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); +int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); +int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); + +// others +bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); +void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); +int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); +int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg); + +#ifdef __cplusplus +} +#endif + +#endif /*_TD_LIBS_SYNC_LOG_BUFFER_H*/ diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 8197119b87..4eff19ced8 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -15,6 +15,7 @@ #define _DEFAULT_SOURCE #include "syncAppendEntries.h" +#include "syncLogBuffer.h" #include "syncMessage.h" #include "syncRaftLog.h" #include "syncRaftStore.h" @@ -125,187 +126,6 @@ int32_t syncNodeFollowerCommit(SSyncNode* ths, SyncIndex newCommitIndex) { return 0; } -SSyncRaftEntry* syncEntryBuildDummy(SyncTerm term, SyncIndex index, int32_t vgId) { - return syncEntryBuildNoop(term, index, vgId); -} - -int32_t 
syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { - taosThreadMutexLock(&pBuf->mutex); - ASSERT(pNode->pLogStore != NULL && "log store not created"); - ASSERT(pNode->pFsm != NULL && "pFsm not registered"); - ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); - - SSnapshot snapshot; - if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { - sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); - goto _err; - } - SyncIndex commitIndex = snapshot.lastApplyIndex; - SyncTerm commitTerm = snapshot.lastApplyTerm; - - SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); - if (lastVer < commitIndex) { - sError("vgId:%d, lastVer of WAL log less than tsdb commit version. lastVer: %" PRId64 - ", tsdb commit version: %" PRId64 "", - pNode->vgId, lastVer, commitIndex); - terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; - goto _err; - } - - ASSERT(lastVer >= commitIndex); - SyncIndex toIndex = lastVer; - // update match index - pBuf->commitIndex = commitIndex; - pBuf->matchIndex = toIndex; - pBuf->endIndex = toIndex + 1; - - // load log entries in reverse order - SSyncLogStore* pLogStore = pNode->pLogStore; - SyncIndex index = toIndex; - SSyncRaftEntry* pEntry = NULL; - bool takeDummy = false; - - while (true) { - if (index <= pBuf->commitIndex) { - takeDummy = true; - break; - } - - if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { - sError("vgId:%d, failed to get log entry since %s. 
index:%" PRId64 "", pNode->vgId, terrstr(), index); - ASSERT(0); - break; - } - - bool taken = false; - if (toIndex <= index + pBuf->size - 1) { - SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = -1, .prevLogTerm = -1}; - pBuf->entries[index % pBuf->size] = tmp; - taken = true; - } - - if (index < toIndex) { - pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = pEntry->index; - pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = pEntry->term; - } - - if (!taken) { - syncEntryDestroy(pEntry); - pEntry = NULL; - break; - } - - index--; - } - - // put a dummy record at commitIndex if present in log buffer - if (takeDummy) { - ASSERT(index == pBuf->commitIndex); - - SSyncRaftEntry* pDummy = syncEntryBuildDummy(commitTerm, commitIndex, pNode->vgId); - if (pDummy == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - goto _err; - } - SSyncLogBufEntry tmp = {.pItem = pDummy, .prevLogIndex = commitIndex - 1, .prevLogTerm = commitTerm}; - pBuf->entries[(commitIndex + pBuf->size) % pBuf->size] = tmp; - - if (index < toIndex) { - pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = commitIndex; - pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = commitTerm; - } - } - - // update startIndex - pBuf->startIndex = takeDummy ? 
index : index + 1; - - // validate - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return 0; - -_err: - taosThreadMutexUnlock(&pBuf->mutex); - return -1; -} - -FORCE_INLINE SyncTerm syncLogBufferGetLastMatchTerm(SSyncLogBuffer* pBuf) { - SyncIndex index = pBuf->matchIndex; - SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; - ASSERT(pEntry != NULL); - return pEntry->term; -} - -int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm) { - taosThreadMutexLock(&pBuf->mutex); - syncLogBufferValidate(pBuf); - int32_t ret = -1; - SyncIndex index = pEntry->index; - SyncIndex prevIndex = pEntry->index - 1; - SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); - - if (index <= pBuf->commitIndex) { - sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); - ret = 0; - goto _out; - } - - if (index - pBuf->startIndex >= pBuf->size) { - sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); - goto _out; - } - - if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { - sInfo("vgId:%d, not ready to accept raft entry. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 - " != lastmatch: %" PRId64 ". 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, - pBuf->matchIndex, pBuf->endIndex); - goto _out; - } - - // check current in buffer - SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; - if (pExist != NULL) { - ASSERT(pEntry->index == pExist->index); - - if (pEntry->term != pExist->term) { - (void)syncLogBufferRollback(pBuf, index); - } else { - sDebug("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); - SyncTerm existPrevTerm = pBuf->entries[index % pBuf->size].prevLogTerm; - ASSERT(pEntry->term == pExist->term && prevTerm == existPrevTerm); - ret = 0; - goto _out; - } - } - - // update - SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = prevIndex, .prevLogTerm = prevTerm}; - pEntry = NULL; - pBuf->entries[index % pBuf->size] = tmp; - - // update end index - pBuf->endIndex = TMAX(index + 1, pBuf->endIndex); - - // success - ret = 0; - -_out: - syncEntryDestroy(pEntry); - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return ret; -} - SSyncRaftEntry* syncLogAppendEntriesToRaftEntry(const SyncAppendEntries* pMsg) { SSyncRaftEntry* pEntry = taosMemoryMalloc(pMsg->dataLen); if (pEntry == NULL) { @@ -317,218 +137,6 @@ SSyncRaftEntry* syncLogAppendEntriesToRaftEntry(const SyncAppendEntries* pMsg) { return pEntry; } -int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { - ASSERT(pEntry->index >= 0); - SyncIndex lastVer = pLogStore->syncLogLastIndex(pLogStore); - if (lastVer >= pEntry->index && pLogStore->syncLogTruncate(pLogStore, pEntry->index) < 0) { - sError("failed to truncate log store since %s. 
from index:%" PRId64 "", terrstr(), pEntry->index); - return -1; - } - lastVer = pLogStore->syncLogLastIndex(pLogStore); - ASSERT(pEntry->index == lastVer + 1); - - if (pLogStore->syncLogAppendEntry(pLogStore, pEntry) < 0) { - sError("failed to append raft log entry since %s. index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index, - pEntry->term); - return -1; - } - - lastVer = pLogStore->syncLogLastIndex(pLogStore); - ASSERT(pEntry->index == lastVer); - return 0; -} - -int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { - taosThreadMutexLock(&pBuf->mutex); - syncLogBufferValidate(pBuf); - - SSyncLogStore* pLogStore = pNode->pLogStore; - int64_t matchIndex = pBuf->matchIndex; - - while (pBuf->matchIndex + 1 < pBuf->endIndex) { - int64_t index = pBuf->matchIndex + 1; - ASSERT(index >= 0); - - // try to proceed - SSyncLogBufEntry* pBufEntry = &pBuf->entries[index % pBuf->size]; - SyncIndex prevLogIndex = pBufEntry->prevLogIndex; - SyncTerm prevLogTerm = pBufEntry->prevLogTerm; - SSyncRaftEntry* pEntry = pBufEntry->pItem; - if (pEntry == NULL) { - sDebug("vgId:%d, cannot proceed match index in log buffer. no raft entry at next pos of matchIndex:%" PRId64, - pNode->vgId, pBuf->matchIndex); - goto _out; - } - - ASSERT(index == pEntry->index); - - // match - SSyncRaftEntry* pMatch = pBuf->entries[(pBuf->matchIndex + pBuf->size) % pBuf->size].pItem; - ASSERT(pMatch != NULL); - ASSERT(pMatch->index == pBuf->matchIndex); - ASSERT(pMatch->index + 1 == pEntry->index); - ASSERT(prevLogIndex == pMatch->index); - - if (pMatch->term != prevLogTerm) { - sInfo( - "vgId:%d, mismatching raft log entries encountered. 
" - "{ index:%" PRId64 ", term:%" PRId64 - " } " - "{ index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 ", prevLogTerm:%" PRId64 " } ", - pNode->vgId, pMatch->index, pMatch->term, pEntry->index, pEntry->term, prevLogIndex, prevLogTerm); - goto _out; - } - - // increase match index - pBuf->matchIndex = index; - - sInfo("vgId:%d, log buffer proceed. start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, - pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); - - // replicate on demand - (void)syncNodeReplicate(pNode); - - // persist - if (syncLogStorePersist(pLogStore, pEntry) < 0) { - sError("vgId:%d, failed to persist raft log entry from log buffer since %s. index:%" PRId64, pNode->vgId, - terrstr(), pEntry->index); - goto _out; - } - ASSERT(pEntry->index == pBuf->matchIndex); - - // update my match index - matchIndex = pBuf->matchIndex; - syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); - } // end of while - -_out: - pBuf->matchIndex = matchIndex; - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return matchIndex; -} - -int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { - ASSERT(pFsm->FpCommitCb != NULL && "No commit cb registered for the FSM"); - - SRpcMsg rpcMsg; - syncEntry2OriginalRpc(pEntry, &rpcMsg); - - SFsmCbMeta cbMeta = {0}; - cbMeta.index = pEntry->index; - cbMeta.lastConfigIndex = -1; - cbMeta.isWeak = pEntry->isWeak; - cbMeta.code = 0; - cbMeta.state = role; - cbMeta.seqNum = pEntry->seqNum; - cbMeta.term = pEntry->term; - cbMeta.currentTerm = term; - cbMeta.flag = -1; - - pFsm->FpCommitCb(pFsm, &rpcMsg, &cbMeta); - return 0; -} - -int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf) { - ASSERT(pBuf->startIndex <= pBuf->matchIndex); - ASSERT(pBuf->commitIndex <= pBuf->matchIndex); - ASSERT(pBuf->matchIndex < pBuf->endIndex); - ASSERT(pBuf->endIndex - pBuf->startIndex <= pBuf->size); - 
ASSERT(pBuf->entries[(pBuf->matchIndex + pBuf->size) % pBuf->size].pItem); - return 0; -} - -int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex) { - taosThreadMutexLock(&pBuf->mutex); - syncLogBufferValidate(pBuf); - - SSyncLogStore* pLogStore = pNode->pLogStore; - SSyncFSM* pFsm = pNode->pFsm; - ESyncState role = pNode->state; - SyncTerm term = pNode->pRaftStore->currentTerm; - SyncGroupId vgId = pNode->vgId; - int32_t ret = 0; - int64_t upperIndex = TMIN(commitIndex, pBuf->matchIndex); - SSyncRaftEntry* pEntry = NULL; - bool inBuf = false; - - if (commitIndex <= pBuf->commitIndex) { - sDebug("vgId:%d, stale commit update. current:%" PRId64 ", notified:%" PRId64 "", vgId, pBuf->commitIndex, - commitIndex); - ret = 0; - goto _out; - } - - sDebug("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". start index:%" PRId64 ", commit index:%" PRId64 - ", match index: %" PRId64 ", end index:%" PRId64 "", - pNode->vgId, role, term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - - // execute in fsm - for (int64_t index = pBuf->commitIndex + 1; index <= upperIndex; index++) { - // get a log entry - pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); - if (pEntry == NULL) { - goto _out; - } - - // execute it - if (!syncUtilUserCommit(pEntry->originalRpcType)) { - sInfo("vgId:%d, non-user msg in raft log entry. index: %" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, - pEntry->term); - pBuf->commitIndex = index; - if (!inBuf) { - syncEntryDestroy(pEntry); - pEntry = NULL; - } - continue; - } - - if (syncLogFsmExecute(pFsm, role, term, pEntry) != 0) { - sError("vgId:%d, failed to execute raft entry in FSM. 
log index:%" PRId64 ", term:%" PRId64 "", vgId, - pEntry->index, pEntry->term); - ret = -1; - goto _out; - } - pBuf->commitIndex = index; - - sDebug("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, - pEntry->index, pEntry->term, role, term); - - if (!inBuf) { - syncEntryDestroy(pEntry); - pEntry = NULL; - } - } - - // recycle - SyncIndex used = pBuf->endIndex - pBuf->startIndex; - SyncIndex until = pBuf->commitIndex - (pBuf->size - used) / 2; - for (SyncIndex index = pBuf->startIndex; index < until; index++) { - SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; - ASSERT(pEntry != NULL); - syncEntryDestroy(pEntry); - memset(&pBuf->entries[(index + pBuf->size) % pBuf->size], 0, sizeof(pBuf->entries[0])); - pBuf->startIndex = index + 1; - } - -_out: - // mark as restored if needed - if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { - pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); - pNode->restoreFinish = true; - sInfo("vgId:%d, restore finished. 
pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - } - - if (!inBuf) { - syncEntryDestroy(pEntry); - pEntry = NULL; - } - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return ret; -} - int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { SyncAppendEntries* pMsg = pRpcMsg->pCont; SRpcMsg rpcRsp = {0}; diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 3295f03016..36eb6fefc7 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -17,6 +17,7 @@ #include "syncAppendEntriesReply.h" #include "syncCommit.h" #include "syncIndexMgr.h" +#include "syncLogBuffer.h" #include "syncMessage.h" #include "syncRaftEntry.h" #include "syncRaftStore.h" @@ -56,82 +57,6 @@ int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { return ths->commitIndex; } -SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf) { - SSyncRaftEntry* pEntry = NULL; - if (index >= pBuf->endIndex) { - return NULL; - } - if (index > pBuf->startIndex) { // startIndex might be dummy - *pInBuf = true; - pEntry = pBuf->entries[index % pBuf->size].pItem; - } else { - *pInBuf = false; - if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, index, &pEntry) < 0) { - sError("vgId:%d, failed to get log entry since %s. 
index:%" PRId64 "", pNode->vgId, terrstr(), index); - } - } - return pEntry; -} - -bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr) { - ASSERT(pMgr->startIndex <= pMgr->endIndex); - for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { - ASSERT(pMgr->states[(index + pMgr->size) % pMgr->size].barrier == false || index + 1 == pMgr->endIndex); - } - return true; -} - -int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, - SRaftId* pDestId, bool* pBarrier) { - SSyncRaftEntry* pEntry = NULL; - SRpcMsg msgOut = {0}; - bool inBuf = false; - int32_t ret = -1; - SyncTerm prevLogTerm = -1; - SSyncLogBuffer* pBuf = pNode->pLogBuf; - - pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); - if (pEntry == NULL) { - sError("vgId:%d, failed to get raft entry for index: %" PRId64 "", pNode->vgId, index); - goto _err; - } - *pBarrier = syncLogIsReplicationBarrier(pEntry); - - prevLogTerm = syncLogReplMgrGetPrevLogTerm(pMgr, pNode, index); - if (prevLogTerm < 0) { - sError("vgId:%d, failed to get prev log term since %s. 
index: %" PRId64 "", pNode->vgId, terrstr(), index); - goto _err; - } - if (pTerm) *pTerm = pEntry->term; - - int32_t code = syncLogToAppendEntries(pNode, pEntry, prevLogTerm, &msgOut); - if (code < 0) { - sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, index); - goto _err; - } - - (void)syncNodeSendAppendEntries(pNode, pDestId, &msgOut); - ret = 0; - - sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, - pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); - - if (!inBuf) { - syncEntryDestroy(pEntry); - pEntry = NULL; - } - return 0; - -_err: - rpcFreeCont(msgOut.pCont); - msgOut.pCont = NULL; - if (!inBuf) { - syncEntryDestroy(pEntry); - pEntry = NULL; - } - return -1; -} - int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { SyncAppendEntriesReply* pMsg = pRpcMsg->pCont; int32_t ret = 0; diff --git a/source/libs/sync/src/syncLogBuffer.c b/source/libs/sync/src/syncLogBuffer.c new file mode 100644 index 0000000000..36f33fa46e --- /dev/null +++ b/source/libs/sync/src/syncLogBuffer.c @@ -0,0 +1,944 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#define _DEFAULT_SOURCE + +#include "syncLogBuffer.h" +#include "syncInt.h" +#include "syncRaftEntry.h" +#include "syncRaftStore.h" +#include "syncReplication.h" +#include "syncUtil.h" + +int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf) { + taosThreadMutexLock(&pBuf->mutex); + int64_t index = pBuf->endIndex; + taosThreadMutexUnlock(&pBuf->mutex); + return index; +} + +int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + SyncIndex index = pEntry->index; + + if (index - pBuf->startIndex >= pBuf->size) { + sError("vgId:%d, failed to append due to log buffer full. index:%" PRId64 "", pNode->vgId, index); + goto _out; + } + + ASSERT(index == pBuf->endIndex); + + SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; + ASSERT(pExist == NULL); + + // initial log buffer with at least one item, e.g. commitIndex + SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; + ASSERT(pMatch != NULL && "no matched raft log entry"); + ASSERT(pMatch->index + 1 == index); + + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; + pBuf->entries[index % pBuf->size] = tmp; + pBuf->endIndex = index + 1; + + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return 0; + +_out: + syncLogBufferValidate(pBuf); + syncEntryDestroy(pEntry); + taosThreadMutexUnlock(&pBuf->mutex); + return -1; +} + +SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + SSyncRaftEntry* pEntry = NULL; + SyncIndex prevIndex = index - 1; + SyncTerm prevLogTerm = -1; + terrno = TSDB_CODE_SUCCESS; + + if (prevIndex == -1) return 0; + + if (index - 1 > pBuf->matchIndex) { + terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; + return -1; + } + + ASSERT(index - 1 == prevIndex); + + if (index - 1 >= pBuf->startIndex) { + 
pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL && "no log entry found"); + prevLogTerm = pBuf->entries[(index + pBuf->size) % pBuf->size].prevLogTerm; + return prevLogTerm; + } + + if (pMgr->startIndex <= prevIndex && prevIndex < pMgr->endIndex) { + int64_t timeMs = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].timeMs; + ASSERT(timeMs != 0 && "no log entry found"); + prevLogTerm = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].term; + ASSERT(prevIndex == 0 || prevLogTerm != 0); + return prevLogTerm; + } + + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) == 0 && prevIndex == snapshot.lastApplyIndex) { + return snapshot.lastApplyTerm; + } + + if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, prevIndex, &pEntry) == 0) { + prevLogTerm = pEntry->term; + syncEntryDestroy(pEntry); + pEntry = NULL; + return prevLogTerm; + } + + sError("vgId:%d, failed to get log term since %s. index: %" PRId64 "", pNode->vgId, terrstr(), prevIndex); + terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; + return -1; +} + +SSyncRaftEntry* syncEntryBuildDummy(SyncTerm term, SyncIndex index, int32_t vgId) { + return syncEntryBuildNoop(term, index, vgId); +} + +int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + ASSERT(pNode->pLogStore != NULL && "log store not created"); + ASSERT(pNode->pFsm != NULL && "pFsm not registered"); + ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); + + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { + sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); + goto _err; + } + SyncIndex commitIndex = snapshot.lastApplyIndex; + SyncTerm commitTerm = snapshot.lastApplyTerm; + + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + if (lastVer < commitIndex) { + sError("vgId:%d, lastVer of WAL log less than tsdb 
commit version. lastVer: %" PRId64 + ", tsdb commit version: %" PRId64 "", + pNode->vgId, lastVer, commitIndex); + terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; + goto _err; + } + + ASSERT(lastVer >= commitIndex); + SyncIndex toIndex = lastVer; + // update match index + pBuf->commitIndex = commitIndex; + pBuf->matchIndex = toIndex; + pBuf->endIndex = toIndex + 1; + + // load log entries in reverse order + SSyncLogStore* pLogStore = pNode->pLogStore; + SyncIndex index = toIndex; + SSyncRaftEntry* pEntry = NULL; + bool takeDummy = false; + + while (true) { + if (index <= pBuf->commitIndex) { + takeDummy = true; + break; + } + + if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + ASSERT(0); + break; + } + + bool taken = false; + if (toIndex <= index + pBuf->size - 1) { + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = -1, .prevLogTerm = -1}; + pBuf->entries[index % pBuf->size] = tmp; + taken = true; + } + + if (index < toIndex) { + pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = pEntry->index; + pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = pEntry->term; + } + + if (!taken) { + syncEntryDestroy(pEntry); + pEntry = NULL; + break; + } + + index--; + } + + // put a dummy record at commitIndex if present in log buffer + if (takeDummy) { + ASSERT(index == pBuf->commitIndex); + + SSyncRaftEntry* pDummy = syncEntryBuildDummy(commitTerm, commitIndex, pNode->vgId); + if (pDummy == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } + SSyncLogBufEntry tmp = {.pItem = pDummy, .prevLogIndex = commitIndex - 1, .prevLogTerm = commitTerm}; + pBuf->entries[(commitIndex + pBuf->size) % pBuf->size] = tmp; + + if (index < toIndex) { + pBuf->entries[(index + 1) % pBuf->size].prevLogIndex = commitIndex; + pBuf->entries[(index + 1) % pBuf->size].prevLogTerm = commitTerm; + } + } + + // update startIndex + pBuf->startIndex = takeDummy ? 
index : index + 1; + + // validate + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return 0; + +_err: + taosThreadMutexUnlock(&pBuf->mutex); + return -1; +} + +FORCE_INLINE SyncTerm syncLogBufferGetLastMatchTerm(SSyncLogBuffer* pBuf) { + SyncIndex index = pBuf->matchIndex; + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + return pEntry->term; +} + +int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + int32_t ret = -1; + SyncIndex index = pEntry->index; + SyncIndex prevIndex = pEntry->index - 1; + SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); + + if (index <= pBuf->commitIndex) { + sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); + ret = 0; + goto _out; + } + + if (index - pBuf->startIndex >= pBuf->size) { + sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); + goto _out; + } + + if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { + sInfo("vgId:%d, not ready to accept raft entry. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 + " != lastmatch: %" PRId64 ". 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, + pBuf->matchIndex, pBuf->endIndex); + goto _out; + } + + // check current in buffer + SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; + if (pExist != NULL) { + ASSERT(pEntry->index == pExist->index); + + if (pEntry->term != pExist->term) { + (void)syncLogBufferRollback(pBuf, index); + } else { + sDebug("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); + SyncTerm existPrevTerm = pBuf->entries[index % pBuf->size].prevLogTerm; + ASSERT(pEntry->term == pExist->term && prevTerm == existPrevTerm); + ret = 0; + goto _out; + } + } + + // update + SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = prevIndex, .prevLogTerm = prevTerm}; + pEntry = NULL; + pBuf->entries[index % pBuf->size] = tmp; + + // update end index + pBuf->endIndex = TMAX(index + 1, pBuf->endIndex); + + // success + ret = 0; + +_out: + syncEntryDestroy(pEntry); + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + +int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { + ASSERT(pEntry->index >= 0); + SyncIndex lastVer = pLogStore->syncLogLastIndex(pLogStore); + if (lastVer >= pEntry->index && pLogStore->syncLogTruncate(pLogStore, pEntry->index) < 0) { + sError("failed to truncate log store since %s. from index:%" PRId64 "", terrstr(), pEntry->index); + return -1; + } + lastVer = pLogStore->syncLogLastIndex(pLogStore); + ASSERT(pEntry->index == lastVer + 1); + + if (pLogStore->syncLogAppendEntry(pLogStore, pEntry) < 0) { + sError("failed to append raft log entry since %s. 
index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index, + pEntry->term); + return -1; + } + + lastVer = pLogStore->syncLogLastIndex(pLogStore); + ASSERT(pEntry->index == lastVer); + return 0; +} + +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + SSyncLogStore* pLogStore = pNode->pLogStore; + int64_t matchIndex = pBuf->matchIndex; + + while (pBuf->matchIndex + 1 < pBuf->endIndex) { + int64_t index = pBuf->matchIndex + 1; + ASSERT(index >= 0); + + // try to proceed + SSyncLogBufEntry* pBufEntry = &pBuf->entries[index % pBuf->size]; + SyncIndex prevLogIndex = pBufEntry->prevLogIndex; + SyncTerm prevLogTerm = pBufEntry->prevLogTerm; + SSyncRaftEntry* pEntry = pBufEntry->pItem; + if (pEntry == NULL) { + sDebug("vgId:%d, cannot proceed match index in log buffer. no raft entry at next pos of matchIndex:%" PRId64, + pNode->vgId, pBuf->matchIndex); + goto _out; + } + + ASSERT(index == pEntry->index); + + // match + SSyncRaftEntry* pMatch = pBuf->entries[(pBuf->matchIndex + pBuf->size) % pBuf->size].pItem; + ASSERT(pMatch != NULL); + ASSERT(pMatch->index == pBuf->matchIndex); + ASSERT(pMatch->index + 1 == pEntry->index); + ASSERT(prevLogIndex == pMatch->index); + + if (pMatch->term != prevLogTerm) { + sInfo( + "vgId:%d, mismatching raft log entries encountered. " + "{ index:%" PRId64 ", term:%" PRId64 + " } " + "{ index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 ", prevLogTerm:%" PRId64 " } ", + pNode->vgId, pMatch->index, pMatch->term, pEntry->index, pEntry->term, prevLogIndex, prevLogTerm); + goto _out; + } + + // increase match index + pBuf->matchIndex = index; + + sInfo("vgId:%d, log buffer proceed. 
start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); + + // replicate on demand + (void)syncNodeReplicate(pNode); + + // persist + if (syncLogStorePersist(pLogStore, pEntry) < 0) { + sError("vgId:%d, failed to persist raft log entry from log buffer since %s. index:%" PRId64, pNode->vgId, + terrstr(), pEntry->index); + goto _out; + } + ASSERT(pEntry->index == pBuf->matchIndex); + + // update my match index + matchIndex = pBuf->matchIndex; + syncIndexMgrSetIndex(pNode->pMatchIndex, &pNode->myRaftId, pBuf->matchIndex); + } // end of while + +_out: + pBuf->matchIndex = matchIndex; + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return matchIndex; +} + +int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { + ASSERT(pFsm->FpCommitCb != NULL && "No commit cb registered for the FSM"); + + SRpcMsg rpcMsg; + syncEntry2OriginalRpc(pEntry, &rpcMsg); + + SFsmCbMeta cbMeta = {0}; + cbMeta.index = pEntry->index; + cbMeta.lastConfigIndex = -1; + cbMeta.isWeak = pEntry->isWeak; + cbMeta.code = 0; + cbMeta.state = role; + cbMeta.seqNum = pEntry->seqNum; + cbMeta.term = pEntry->term; + cbMeta.currentTerm = term; + cbMeta.flag = -1; + + pFsm->FpCommitCb(pFsm, &rpcMsg, &cbMeta); + return 0; +} + +int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf) { + ASSERT(pBuf->startIndex <= pBuf->matchIndex); + ASSERT(pBuf->commitIndex <= pBuf->matchIndex); + ASSERT(pBuf->matchIndex < pBuf->endIndex); + ASSERT(pBuf->endIndex - pBuf->startIndex <= pBuf->size); + ASSERT(pBuf->entries[(pBuf->matchIndex + pBuf->size) % pBuf->size].pItem); + return 0; +} + +int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex) { + taosThreadMutexLock(&pBuf->mutex); + syncLogBufferValidate(pBuf); + + SSyncLogStore* pLogStore = pNode->pLogStore; + SSyncFSM* pFsm = pNode->pFsm; + ESyncState role = pNode->state; + SyncTerm term = 
pNode->pRaftStore->currentTerm; + SyncGroupId vgId = pNode->vgId; + int32_t ret = 0; + int64_t upperIndex = TMIN(commitIndex, pBuf->matchIndex); + SSyncRaftEntry* pEntry = NULL; + bool inBuf = false; + + if (commitIndex <= pBuf->commitIndex) { + sDebug("vgId:%d, stale commit update. current:%" PRId64 ", notified:%" PRId64 "", vgId, pBuf->commitIndex, + commitIndex); + ret = 0; + goto _out; + } + + sDebug("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". start index:%" PRId64 ", commit index:%" PRId64 + ", match index: %" PRId64 ", end index:%" PRId64 "", + pNode->vgId, role, term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + + // execute in fsm + for (int64_t index = pBuf->commitIndex + 1; index <= upperIndex; index++) { + // get a log entry + pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); + if (pEntry == NULL) { + goto _out; + } + + // execute it + if (!syncUtilUserCommit(pEntry->originalRpcType)) { + sInfo("vgId:%d, non-user msg in raft log entry. index: %" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, + pEntry->term); + pBuf->commitIndex = index; + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + continue; + } + + if (syncLogFsmExecute(pFsm, role, term, pEntry) != 0) { + sError("vgId:%d, failed to execute raft entry in FSM. 
log index:%" PRId64 ", term:%" PRId64 "", vgId, + pEntry->index, pEntry->term); + ret = -1; + goto _out; + } + pBuf->commitIndex = index; + + sDebug("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, + pEntry->index, pEntry->term, role, term); + + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + } + + // recycle + SyncIndex used = pBuf->endIndex - pBuf->startIndex; + SyncIndex until = pBuf->commitIndex - (pBuf->size - used) / 2; + for (SyncIndex index = pBuf->startIndex; index < until; index++) { + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + ASSERT(pEntry != NULL); + syncEntryDestroy(pEntry); + memset(&pBuf->entries[(index + pBuf->size) % pBuf->size], 0, sizeof(pBuf->entries[0])); + pBuf->startIndex = index + 1; + } + +_out: + // mark as restored if needed + if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { + pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); + pNode->restoreFinish = true; + sInfo("vgId:%d, restore finished. 
pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + } + + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + syncLogBufferValidate(pBuf); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + +int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr) { + ASSERT(pMgr->startIndex >= 0); + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); + } + pMgr->startIndex = 0; + pMgr->matchIndex = 0; + pMgr->endIndex = 0; + pMgr->restored = false; + pMgr->retryBackoff = 0; + return 0; +} + +_Atomic int64_t tsRetryCnt = 0; + +int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + if (pMgr->endIndex <= pMgr->startIndex) { + return 0; + } + + int32_t ret = -1; + bool retried = false; + int64_t retryWaitMs = syncLogGetRetryBackoffTimeMs(pMgr); + + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + int64_t pos = index % pMgr->size; + ASSERT(!pMgr->states[pos].barrier || (index == pMgr->startIndex || index + 1 == pMgr->endIndex)); + if (pMgr->states[pos].acked) { + continue; + } + int64_t nowMs = taosGetMonoTimestampMs(); + if (nowMs < pMgr->states[pos].timeMs + retryWaitMs) { + break; + } + + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + goto _out; + } + ASSERT(barrier == pMgr->states[pos].barrier); + pMgr->states[pos].timeMs = nowMs; + pMgr->states[pos].term = term; + pMgr->states[pos].acked = false; + retried = true; + tsRetryCnt++; + } + + ret = 0; +_out: + if (retried) { + pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); + } + return ret; +} + +int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, + SyncAppendEntriesReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + SRaftId destId = pMsg->srcId; + ASSERT(pMgr->restored == false); + char host[64]; + uint16_t port; + syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port); + + if (pMgr->endIndex == 0) { + ASSERT(pMgr->startIndex == 0); + ASSERT(pMgr->matchIndex == 0); + if (pMsg->matchIndex < 0) { + pMgr->restored = true; + sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + return 0; + } + } else { + if (pMsg->lastSendIndex < pMgr->startIndex || pMsg->lastSendIndex >= pMgr->endIndex) { + syncLogReplMgrRetryOnNeed(pMgr, pNode); + return 0; + } + + pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; + + if (pMsg->matchIndex == pMsg->lastSendIndex) { + pMgr->restored = true; + sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + return 0; + } + + (void)syncLogResetLogReplMgr(pMgr); + } + + // send match index + SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); + bool barrier = false; + SyncTerm term = -1; + ASSERT(index >= 0); + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, &destId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, destId.addr); + return -1; + } + + int64_t nowMs = taosGetMonoTimestampMs(); + pMgr->states[index % pMgr->size].barrier = barrier; + pMgr->states[index % pMgr->size].timeMs = nowMs; + pMgr->states[index % pMgr->size].term = term; + pMgr->states[index % pMgr->size].acked = false; + + pMgr->matchIndex = index; + pMgr->startIndex = index; + pMgr->endIndex = index + 1; + return 0; +} + +int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + taosThreadMutexLock(&pBuf->mutex); + if (pMsg->startTime != 0 && pMsg->startTime != pMgr->peerStartTime) { + sInfo("vgId:%d, reset sync log repl mgr in heartbeat. start time:%" PRId64 ", old start time:%" PRId64 "", + pNode->vgId, pMsg->startTime, pMgr->peerStartTime); + syncLogResetLogReplMgr(pMgr); + pMgr->peerStartTime = pMsg->startTime; + } + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + +int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + taosThreadMutexLock(&pBuf->mutex); + if (pMsg->startTime != pMgr->peerStartTime) { + sInfo("vgId:%d, reset sync log repl mgr in append entries reply. 
start time:%" PRId64 ", old start time:%" PRId64 + "", + pNode->vgId, pMsg->startTime, pMgr->peerStartTime); + syncLogResetLogReplMgr(pMgr); + pMgr->peerStartTime = pMsg->startTime; + } + + if (pMgr->restored) { + (void)syncLogReplMgrProcessReplyInNormalMode(pMgr, pNode, pMsg); + } else { + (void)syncLogReplMgrProcessReplyInRecoveryMode(pMgr, pNode, pMsg); + } + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + +int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + if (pMgr->restored) { + (void)syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); + } else { + (void)syncLogReplMgrReplicateProbeOnce(pMgr, pNode); + } + return 0; +} + +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + ASSERT(!pMgr->restored); + SyncIndex index = pNode->pLogBuf->matchIndex; + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + return -1; + } + + SSyncLogBuffer* pBuf = pNode->pLogBuf; + sDebug("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 + ". 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 + ")", + pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + return 0; +} + +_Atomic int64_t tsSendCnt = 0; + +int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { + ASSERT(pMgr->restored); + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + int32_t batchSize = TMAX(1, pMgr->size / 20); + int32_t count = 0; + int64_t nowMs = taosGetMonoTimestampMs(); + + for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { + if (batchSize < count++ || pMgr->startIndex + pMgr->size <= index) { + break; + } + if (pMgr->startIndex + 1 < index && pMgr->states[(index - 1) % pMgr->size].barrier) { + break; + } + int64_t pos = index % pMgr->size; + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + bool barrier = false; + SyncTerm term = -1; + if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + terrstr(), index, pDestId->addr); + return -1; + } + pMgr->states[pos].barrier = barrier; + pMgr->states[pos].timeMs = nowMs; + pMgr->states[pos].term = term; + pMgr->states[pos].acked = false; + + pMgr->endIndex = index + 1; + tsSendCnt++; + if (barrier) { + break; + } + } + + SSyncLogBuffer* pBuf = pNode->pLogBuf; + sDebug("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + syncLogReplMgrRetryOnNeed(pMgr, pNode); + return 0; +} + +int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { + ASSERT(pMgr->restored == true); + if (pMgr->startIndex <= pMsg->lastSendIndex && pMsg->lastSendIndex < pMgr->endIndex) { + pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; + pMgr->matchIndex = TMAX(pMgr->matchIndex, pMsg->matchIndex); + for (SyncIndex index = pMgr->startIndex; index < pMgr->matchIndex; index++) { + memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); + } + pMgr->startIndex = pMgr->matchIndex; + } + + return syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); +} + +SSyncLogReplMgr* syncLogReplMgrCreate() { + SSyncLogReplMgr* pMgr = taosMemoryCalloc(1, sizeof(SSyncLogReplMgr)); + if (pMgr == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + pMgr->size = sizeof(pMgr->states) / sizeof(pMgr->states[0]); + + ASSERT(pMgr->size == TSDB_SYNC_LOG_BUFFER_SIZE); + + return pMgr; + +_err: + taosMemoryFree(pMgr); + return NULL; +} + +void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr) { + if (pMgr == NULL) { + return; + } + (void)taosMemoryFree(pMgr); + return; +} + +int32_t syncNodeLogReplMgrInit(SSyncNode* pNode) { + for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + ASSERT(pNode->logReplMgrs[i] == NULL); + pNode->logReplMgrs[i] = syncLogReplMgrCreate(); + pNode->logReplMgrs[i]->peerId = i; + ASSERT(pNode->logReplMgrs[i] != NULL && "Out of memory."); + } + return 0; +} + +void syncNodeLogReplMgrDestroy(SSyncNode* pNode) { + for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + syncLogReplMgrDestroy(pNode->logReplMgrs[i]); + pNode->logReplMgrs[i] = NULL; + } +} + 
+SSyncLogBuffer* syncLogBufferCreate() { + SSyncLogBuffer* pBuf = taosMemoryCalloc(1, sizeof(SSyncLogBuffer)); + if (pBuf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + pBuf->size = sizeof(pBuf->entries) / sizeof(pBuf->entries[0]); + + ASSERT(pBuf->size == TSDB_SYNC_LOG_BUFFER_SIZE); + + if (taosThreadMutexInit(&pBuf->mutex, NULL) < 0) { + sError("failed to init log buffer mutex due to %s", strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + return pBuf; + +_err: + taosMemoryFree(pBuf); + return NULL; +} + +void syncLogBufferDestroy(SSyncLogBuffer* pBuf) { + if (pBuf == NULL) { + return; + } + (void)taosThreadMutexDestroy(&pBuf->mutex); + (void)taosMemoryFree(pBuf); + return; +} + +int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex) { + ASSERT(pBuf->commitIndex < toIndex && toIndex <= pBuf->endIndex); + + SyncIndex index = pBuf->endIndex - 1; + while (index >= toIndex) { + SSyncRaftEntry* pEntry = pBuf->entries[index % pBuf->size].pItem; + if (pEntry != NULL) { + syncEntryDestroy(pEntry); + pEntry = NULL; + memset(&pBuf->entries[index % pBuf->size], 0, sizeof(pBuf->entries[0])); + } + index--; + } + pBuf->endIndex = toIndex; + pBuf->matchIndex = TMIN(pBuf->matchIndex, index); + ASSERT(index + 1 == toIndex); + return 0; +} + +int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + ASSERT(lastVer == pBuf->matchIndex); + SyncIndex index = pBuf->endIndex - 1; + + (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); + + sInfo("vgId:%d, reset log buffer. 
pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + + pBuf->endIndex = pBuf->matchIndex + 1; + + // reset repl mgr + for (int i = 0; i < pNode->replicaNum; i++) { + SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; + syncLogResetLogReplMgr(pMgr); + } + taosThreadMutexUnlock(&pBuf->mutex); + return 0; +} + +SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf) { + SSyncRaftEntry* pEntry = NULL; + if (index >= pBuf->endIndex) { + return NULL; + } + if (index > pBuf->startIndex) { // startIndex might be dummy + *pInBuf = true; + pEntry = pBuf->entries[index % pBuf->size].pItem; + } else { + *pInBuf = false; + if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, index, &pEntry) < 0) { + sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + } + } + return pEntry; +} + +bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr) { + ASSERT(pMgr->startIndex <= pMgr->endIndex); + for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { + ASSERT(pMgr->states[(index + pMgr->size) % pMgr->size].barrier == false || index + 1 == pMgr->endIndex); + } + return true; +} + +int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, + SRaftId* pDestId, bool* pBarrier) { + SSyncRaftEntry* pEntry = NULL; + SRpcMsg msgOut = {0}; + bool inBuf = false; + int32_t ret = -1; + SyncTerm prevLogTerm = -1; + SSyncLogBuffer* pBuf = pNode->pLogBuf; + + pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); + if (pEntry == NULL) { + sError("vgId:%d, failed to get raft entry for index: %" PRId64 "", pNode->vgId, index); + goto _err; + } + *pBarrier = syncLogIsReplicationBarrier(pEntry); + + prevLogTerm = syncLogReplMgrGetPrevLogTerm(pMgr, pNode, index); + if (prevLogTerm < 0) { + sError("vgId:%d, failed to get prev log term 
since %s. index: %" PRId64 "", pNode->vgId, terrstr(), index); + goto _err; + } + if (pTerm) *pTerm = pEntry->term; + + int32_t code = syncLogToAppendEntries(pNode, pEntry, prevLogTerm, &msgOut); + if (code < 0) { + sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, index); + goto _err; + } + + (void)syncNodeSendAppendEntries(pNode, pDestId, &msgOut); + ret = 0; + + sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, + pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); + + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + return 0; + +_err: + rpcFreeCont(msgOut.pCont); + msgOut.pCont = NULL; + if (!inBuf) { + syncEntryDestroy(pEntry); + pEntry = NULL; + } + return -1; +} diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 7e19b9aa5c..33c24bcca8 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -22,6 +22,7 @@ #include "syncEnv.h" #include "syncIndexMgr.h" #include "syncInt.h" +#include "syncLogBuffer.h" #include "syncMessage.h" #include "syncRaftCfg.h" #include "syncRaftLog.h" @@ -704,327 +705,6 @@ static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) { return ret; } -int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr) { - ASSERT(pMgr->startIndex >= 0); - for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { - memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); - } - pMgr->startIndex = 0; - pMgr->matchIndex = 0; - pMgr->endIndex = 0; - pMgr->restored = false; - pMgr->retryBackoff = 0; - return 0; -} - -_Atomic int64_t tsRetryCnt = 0; - -int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { - if (pMgr->endIndex <= pMgr->startIndex) { - return 0; - } - - int32_t ret = -1; - bool retried = false; - int64_t retryWaitMs = syncLogGetRetryBackoffTimeMs(pMgr); - - for (SyncIndex index = 
pMgr->startIndex; index < pMgr->endIndex; index++) { - int64_t pos = index % pMgr->size; - ASSERT(!pMgr->states[pos].barrier || (index == pMgr->startIndex || index + 1 == pMgr->endIndex)); - if (pMgr->states[pos].acked) { - continue; - } - int64_t nowMs = taosGetMonoTimestampMs(); - if (nowMs < pMgr->states[pos].timeMs + retryWaitMs) { - break; - } - - SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - bool barrier = false; - SyncTerm term = -1; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, - terrstr(), index, pDestId->addr); - goto _out; - } - ASSERT(barrier == pMgr->states[pos].barrier); - pMgr->states[pos].timeMs = nowMs; - pMgr->states[pos].term = term; - pMgr->states[pos].acked = false; - retried = true; - tsRetryCnt++; - } - - ret = 0; -_out: - if (retried) { - pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); - } - return ret; -} - -int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, - SyncAppendEntriesReply* pMsg) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; - SRaftId destId = pMsg->srcId; - ASSERT(pMgr->restored == false); - char host[64]; - uint16_t port; - syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port); - - if (pMgr->endIndex == 0) { - ASSERT(pMgr->startIndex == 0); - ASSERT(pMgr->matchIndex == 0); - if (pMsg->matchIndex < 0) { - pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - return 0; - } - } else { - if (pMsg->lastSendIndex < pMgr->startIndex || pMsg->lastSendIndex >= pMgr->endIndex) { - syncLogReplMgrRetryOnNeed(pMgr, pNode); - return 0; - } - - pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; - - if (pMsg->matchIndex == pMsg->lastSendIndex) { - pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - return 0; - } - - (void)syncLogResetLogReplMgr(pMgr); - } - - // send match index - SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); - bool barrier = false; - SyncTerm term = -1; - ASSERT(index >= 0); - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, &destId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, - terrstr(), index, destId.addr); - return -1; - } - - int64_t nowMs = taosGetMonoTimestampMs(); - pMgr->states[index % pMgr->size].barrier = barrier; - pMgr->states[index % pMgr->size].timeMs = nowMs; - pMgr->states[index % pMgr->size].term = term; - pMgr->states[index % pMgr->size].acked = false; - - pMgr->matchIndex = index; - pMgr->startIndex = index; - pMgr->endIndex = index + 1; - return 0; -} - -int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; - taosThreadMutexLock(&pBuf->mutex); - if (pMsg->startTime != 0 && pMsg->startTime != pMgr->peerStartTime) { - sInfo("vgId:%d, reset sync log repl mgr in heartbeat. start time:%" PRId64 ", old start time:%" PRId64 "", - pNode->vgId, pMsg->startTime, pMgr->peerStartTime); - syncLogResetLogReplMgr(pMgr); - pMgr->peerStartTime = pMsg->startTime; - } - taosThreadMutexUnlock(&pBuf->mutex); - return 0; -} - -int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; - taosThreadMutexLock(&pBuf->mutex); - if (pMsg->startTime != pMgr->peerStartTime) { - sInfo("vgId:%d, reset sync log repl mgr in append entries reply. 
start time:%" PRId64 ", old start time:%" PRId64 - "", - pNode->vgId, pMsg->startTime, pMgr->peerStartTime); - syncLogResetLogReplMgr(pMgr); - pMgr->peerStartTime = pMsg->startTime; - } - - if (pMgr->restored) { - (void)syncLogReplMgrProcessReplyInNormalMode(pMgr, pNode, pMsg); - } else { - (void)syncLogReplMgrProcessReplyInRecoveryMode(pMgr, pNode, pMsg); - } - taosThreadMutexUnlock(&pBuf->mutex); - return 0; -} - -int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; - if (pMgr->restored) { - (void)syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); - } else { - (void)syncLogReplMgrReplicateProbeOnce(pMgr, pNode); - } - return 0; -} - -int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { - ASSERT(!pMgr->restored); - SyncIndex index = pNode->pLogBuf->matchIndex; - SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - bool barrier = false; - SyncTerm term = -1; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, - terrstr(), index, pDestId->addr); - return -1; - } - - SSyncLogBuffer* pBuf = pNode->pLogBuf; - sDebug("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 - ". 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 - ")", - pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - return 0; -} - -_Atomic int64_t tsSendCnt = 0; - -int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { - ASSERT(pMgr->restored); - SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - int32_t batchSize = TMAX(1, pMgr->size / 20); - int32_t count = 0; - int64_t nowMs = taosGetMonoTimestampMs(); - - for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { - if (batchSize < count++ || pMgr->startIndex + pMgr->size <= index) { - break; - } - if (pMgr->startIndex + 1 < index && pMgr->states[(index - 1) % pMgr->size].barrier) { - break; - } - int64_t pos = index % pMgr->size; - SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - bool barrier = false; - SyncTerm term = -1; - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, - terrstr(), index, pDestId->addr); - return -1; - } - pMgr->states[pos].barrier = barrier; - pMgr->states[pos].timeMs = nowMs; - pMgr->states[pos].term = term; - pMgr->states[pos].acked = false; - - pMgr->endIndex = index + 1; - tsSendCnt++; - if (barrier) { - break; - } - } - - SSyncLogBuffer* pBuf = pNode->pLogBuf; - sDebug("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - syncLogReplMgrRetryOnNeed(pMgr, pNode); - return 0; -} - -int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { - ASSERT(pMgr->restored == true); - if (pMgr->startIndex <= pMsg->lastSendIndex && pMsg->lastSendIndex < pMgr->endIndex) { - pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; - pMgr->matchIndex = TMAX(pMgr->matchIndex, pMsg->matchIndex); - for (SyncIndex index = pMgr->startIndex; index < pMgr->matchIndex; index++) { - memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); - } - pMgr->startIndex = pMgr->matchIndex; - } - - return syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); -} - -SSyncLogReplMgr* syncLogReplMgrCreate() { - SSyncLogReplMgr* pMgr = taosMemoryCalloc(1, sizeof(SSyncLogReplMgr)); - if (pMgr == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; - } - - pMgr->size = sizeof(pMgr->states) / sizeof(pMgr->states[0]); - - ASSERT(pMgr->size == TSDB_SYNC_LOG_BUFFER_SIZE); - - return pMgr; - -_err: - taosMemoryFree(pMgr); - return NULL; -} - -void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr) { - if (pMgr == NULL) { - return; - } - (void)taosMemoryFree(pMgr); - return; -} - -int32_t syncNodeLogReplMgrInit(SSyncNode* pNode) { - for (int i = 0; i < TSDB_MAX_REPLICA; i++) { - ASSERT(pNode->logReplMgrs[i] == NULL); - pNode->logReplMgrs[i] = syncLogReplMgrCreate(); - pNode->logReplMgrs[i]->peerId = i; - ASSERT(pNode->logReplMgrs[i] != NULL && "Out of memory."); - } - return 0; -} - -void syncNodeLogReplMgrDestroy(SSyncNode* pNode) { - for (int i = 0; i < TSDB_MAX_REPLICA; i++) { - syncLogReplMgrDestroy(pNode->logReplMgrs[i]); - pNode->logReplMgrs[i] = NULL; - } -} - 
-SSyncLogBuffer* syncLogBufferCreate() { - SSyncLogBuffer* pBuf = taosMemoryCalloc(1, sizeof(SSyncLogBuffer)); - if (pBuf == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; - } - - pBuf->size = sizeof(pBuf->entries) / sizeof(pBuf->entries[0]); - - ASSERT(pBuf->size == TSDB_SYNC_LOG_BUFFER_SIZE); - - if (taosThreadMutexInit(&pBuf->mutex, NULL) < 0) { - sError("failed to init log buffer mutex due to %s", strerror(errno)); - terrno = TAOS_SYSTEM_ERROR(errno); - goto _err; - } - return pBuf; - -_err: - taosMemoryFree(pBuf); - return NULL; -} - -void syncLogBufferDestroy(SSyncLogBuffer* pBuf) { - if (pBuf == NULL) { - return; - } - (void)taosThreadMutexDestroy(&pBuf->mutex); - (void)taosMemoryFree(pBuf); - return; -} - // open/close -------------- SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode)); @@ -1895,47 +1575,6 @@ void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) { void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) { syncRespCleanRsp(pSyncNode->pSyncRespMgr); } -int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex) { - ASSERT(pBuf->commitIndex < toIndex && toIndex <= pBuf->endIndex); - - SyncIndex index = pBuf->endIndex - 1; - while (index >= toIndex) { - SSyncRaftEntry* pEntry = pBuf->entries[index % pBuf->size].pItem; - if (pEntry != NULL) { - syncEntryDestroy(pEntry); - pEntry = NULL; - memset(&pBuf->entries[index % pBuf->size], 0, sizeof(pBuf->entries[0])); - } - index--; - } - pBuf->endIndex = toIndex; - pBuf->matchIndex = TMIN(pBuf->matchIndex, index); - ASSERT(index + 1 == toIndex); - return 0; -} - -int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { - taosThreadMutexLock(&pBuf->mutex); - SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); - ASSERT(lastVer == pBuf->matchIndex); - SyncIndex index = pBuf->endIndex - 1; - - (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); - - sInfo("vgId:%d, reset log buffer. 
pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - - pBuf->endIndex = pBuf->matchIndex + 1; - - // reset repl mgr - for (int i = 0; i < pNode->replicaNum; i++) { - SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; - syncLogResetLogReplMgr(pMgr); - } - taosThreadMutexUnlock(&pBuf->mutex); - return 0; -} - void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) { // maybe clear leader cache if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) { @@ -2675,96 +2314,6 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { return 0; } -int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf) { - taosThreadMutexLock(&pBuf->mutex); - int64_t index = pBuf->endIndex; - taosThreadMutexUnlock(&pBuf->mutex); - return index; -} - -int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry) { - taosThreadMutexLock(&pBuf->mutex); - syncLogBufferValidate(pBuf); - SyncIndex index = pEntry->index; - - if (index - pBuf->startIndex >= pBuf->size) { - sError("vgId:%d, failed to append due to log buffer full. index:%" PRId64 "", pNode->vgId, index); - goto _out; - } - - ASSERT(index == pBuf->endIndex); - - SSyncRaftEntry* pExist = pBuf->entries[index % pBuf->size].pItem; - ASSERT(pExist == NULL); - - // initial log buffer with at least one item, e.g. 
commitIndex - SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; - ASSERT(pMatch != NULL && "no matched raft log entry"); - ASSERT(pMatch->index + 1 == index); - - SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; - pBuf->entries[index % pBuf->size] = tmp; - pBuf->endIndex = index + 1; - - syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); - return 0; - -_out: - syncLogBufferValidate(pBuf); - syncEntryDestroy(pEntry); - taosThreadMutexUnlock(&pBuf->mutex); - return -1; -} - -SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; - SSyncRaftEntry* pEntry = NULL; - SyncIndex prevIndex = index - 1; - SyncTerm prevLogTerm = -1; - terrno = TSDB_CODE_SUCCESS; - - if (prevIndex == -1) return 0; - - if (index - 1 > pBuf->matchIndex) { - terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; - return -1; - } - - ASSERT(index - 1 == prevIndex); - - if (index - 1 >= pBuf->startIndex) { - pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; - ASSERT(pEntry != NULL && "no log entry found"); - prevLogTerm = pBuf->entries[(index + pBuf->size) % pBuf->size].prevLogTerm; - return prevLogTerm; - } - - if (pMgr->startIndex <= prevIndex && prevIndex < pMgr->endIndex) { - int64_t timeMs = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].timeMs; - ASSERT(timeMs != 0 && "no log entry found"); - prevLogTerm = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].term; - ASSERT(prevIndex == 0 || prevLogTerm != 0); - return prevLogTerm; - } - - SSnapshot snapshot; - if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) == 0 && prevIndex == snapshot.lastApplyIndex) { - return snapshot.lastApplyTerm; - } - - if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, prevIndex, &pEntry) == 0) { - prevLogTerm = pEntry->term; - syncEntryDestroy(pEntry); - pEntry = NULL; - return prevLogTerm; - } - - 
sError("vgId:%d, failed to get log term since %s. index: %" PRId64 "", pNode->vgId, terrstr(), prevIndex); - terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; - return -1; -} - int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg) { uint32_t dataLen = pEntry->bytes; uint32_t bytes = sizeof(SyncAppendEntries) + dataLen; @@ -3014,7 +2563,7 @@ int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) { } bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) { - return false; + // return false; return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1); } diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 46c7649e02..e8709f95a2 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -16,6 +16,7 @@ #define _DEFAULT_SOURCE #include "syncReplication.h" #include "syncIndexMgr.h" +#include "syncLogBuffer.h" #include "syncRaftEntry.h" #include "syncRaftStore.h" #include "syncUtil.h" @@ -45,6 +46,8 @@ // mdest |-> j]) // /\ UNCHANGED <> +int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); + int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) { // next index SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId); diff --git a/source/os/src/osTimer.c b/source/os/src/osTimer.c index d2b8f3134a..d1c233ea9c 100644 --- a/source/os/src/osTimer.c +++ b/source/os/src/osTimer.c @@ -216,7 +216,7 @@ int64_t taosGetMonotonicMs() { #if 0 return getMonotonicUs() / 1000; #else - return taosGetMonoTimestampMs(); + return taosGetTimestampMs(); #endif } From 363cbc8985dce3d223be1b6596563fa797b44042 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 18 Nov 2022 09:37:58 +0800 Subject: [PATCH 12/42] fix: update cbMeta rsp handle info in syncLogFsmExecute --- source/dnode/vnode/src/vnd/vnodeSync.c | 
2 +- source/libs/sync/inc/syncCommit.h | 5 ++ source/libs/sync/inc/syncLogBuffer.h | 47 +++++++++---------- source/libs/sync/inc/syncReplication.h | 3 ++ source/libs/sync/inc/syncSnapshot.h | 2 + source/libs/sync/src/syncAppendEntriesReply.c | 18 ------- source/libs/sync/src/syncCommit.c | 18 +++++++ source/libs/sync/src/syncLogBuffer.c | 32 ++++++------- source/libs/sync/src/syncMain.c | 1 - source/libs/transport/src/tmsgcb.c | 5 +- 10 files changed, 66 insertions(+), 67 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index feecbc6d3b..c67ee41b12 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -415,7 +415,7 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm) { walApplyVer(pVnode->pWal, pVnode->state.applied); pVnode->restored = true; - vDebug("vgId:%d, sync restore finished", pVnode->config.vgId); + vInfo("vgId:%d, sync restore finished", pVnode->config.vgId); } static void vnodeBecomeFollower(const SSyncFSM *pFsm) { diff --git a/source/libs/sync/inc/syncCommit.h b/source/libs/sync/inc/syncCommit.h index d3ba556f82..7d638a7336 100644 --- a/source/libs/sync/inc/syncCommit.h +++ b/source/libs/sync/inc/syncCommit.h @@ -47,8 +47,13 @@ extern "C" { // void syncOneReplicaAdvance(SSyncNode* pSyncNode); void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode); + bool syncAgreeIndex(SSyncNode* pSyncNode, SRaftId* pRaftId, SyncIndex index); bool syncAgree(SSyncNode* pSyncNode, SyncIndex index); +bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); + +int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex); +int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely); #ifdef __cplusplus } diff --git a/source/libs/sync/inc/syncLogBuffer.h b/source/libs/sync/inc/syncLogBuffer.h index 4c209549b4..39b4439d62 100644 --- a/source/libs/sync/inc/syncLogBuffer.h +++ b/source/libs/sync/inc/syncLogBuffer.h @@ -41,8 +41,28 @@ typedef struct 
SSyncLogReplMgr { int32_t peerId; } SSyncLogReplMgr; +typedef struct SSyncLogBufEntry { + SSyncRaftEntry* pItem; + SyncIndex prevLogIndex; + SyncTerm prevLogTerm; +} SSyncLogBufEntry; + +typedef struct SSyncLogBuffer { + SSyncLogBufEntry entries[TSDB_SYNC_LOG_BUFFER_SIZE]; + int64_t startIndex; + int64_t commitIndex; + int64_t matchIndex; + int64_t endIndex; + int64_t size; + TdThreadMutex mutex; +} SSyncLogBuffer; + +// SSyncLogRepMgr + SSyncLogReplMgr* syncLogReplMgrCreate(); void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr); +int32_t syncLogReplMgrReset(SSyncLogReplMgr* pMgr); + int32_t syncNodeLogReplMgrInit(SSyncNode* pNode); void syncNodeLogReplMgrDestroy(SSyncNode* pNode); @@ -69,35 +89,12 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); -int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr); int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg); -int32_t syncNodeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); -int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); -void syncLogDestroyAppendEntries(SRpcMsg* pRpcMsg); - -// others -bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr); - -typedef struct SSyncLogBufEntry { - SSyncRaftEntry* pItem; - SyncIndex prevLogIndex; - SyncTerm prevLogTerm; -} SSyncLogBufEntry; - -typedef struct 
SSyncLogBuffer { - SSyncLogBufEntry entries[TSDB_SYNC_LOG_BUFFER_SIZE]; - int64_t startIndex; - int64_t commitIndex; - int64_t matchIndex; - int64_t endIndex; - int64_t size; - TdThreadMutex mutex; -} SSyncLogBuffer; - +// SSyncLogBuffer SSyncLogBuffer* syncLogBufferCreate(); void syncLogBufferDestroy(SSyncLogBuffer* pBuf); int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); @@ -117,8 +114,6 @@ int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); // others -bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index); -void syncIndexMgrSetIndex(SSyncIndexMgr* pSyncIndexMgr, const SRaftId* pRaftId, SyncIndex index); int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg); diff --git a/source/libs/sync/inc/syncReplication.h b/source/libs/sync/inc/syncReplication.h index 7da610a9ed..f077306475 100644 --- a/source/libs/sync/inc/syncReplication.h +++ b/source/libs/sync/inc/syncReplication.h @@ -53,6 +53,9 @@ int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* pDestId, SRpc int32_t syncNodeReplicate(SSyncNode* pSyncNode); int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot); +int32_t syncNodeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); +int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); + #ifdef __cplusplus } #endif diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index 7b8e768391..1f9675a3cd 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -87,6 +87,8 @@ void snapshotReceiverForceStop(SSyncSnapshotReceiver *pReceive int32_t syncNodeOnSnapshot(SSyncNode *ths, const SRpcMsg *pMsg); 
int32_t syncNodeOnSnapshotReply(SSyncNode *ths, const SRpcMsg *pMsg); +SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode *pSyncNode, SyncIndex snapshotLastApplyIndex); + // start #ifdef __cplusplus diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 36eb6fefc7..234e8bffeb 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -39,24 +39,6 @@ // /\ UNCHANGED <> // -int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex) { - SyncIndex lastVer = ths->pLogStore->syncLogLastIndex(ths->pLogStore); - commitIndex = TMAX(commitIndex, ths->commitIndex); - ths->commitIndex = TMIN(commitIndex, lastVer); - ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex); - return ths->commitIndex; -} - -int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { - if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { - SyncIndex commitIndex = indexLikely; - syncNodeUpdateCommitIndex(ths, commitIndex); - sInfo("vgId:%d, agreed upon. 
role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, - ths->pRaftStore->currentTerm, commitIndex); - } - return ths->commitIndex; -} - int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { SyncAppendEntriesReply* pMsg = pRpcMsg->pCont; int32_t ret = 0; diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index 73fc3050ef..fc7ea7cc30 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -311,3 +311,21 @@ bool syncAgree(SSyncNode* pNode, SyncIndex index) { } return false; } + +int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex) { + SyncIndex lastVer = ths->pLogStore->syncLogLastIndex(ths->pLogStore); + commitIndex = TMAX(commitIndex, ths->commitIndex); + ths->commitIndex = TMIN(commitIndex, lastVer); + ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex); + return ths->commitIndex; +} + +int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { + if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { + SyncIndex commitIndex = indexLikely; + syncNodeUpdateCommitIndex(ths, commitIndex); + sInfo("vgId:%d, agreed upon. 
role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, + ths->pRaftStore->currentTerm, commitIndex); + } + return ths->commitIndex; +} diff --git a/source/libs/sync/src/syncLogBuffer.c b/source/libs/sync/src/syncLogBuffer.c index 36f33fa46e..621bce6683 100644 --- a/source/libs/sync/src/syncLogBuffer.c +++ b/source/libs/sync/src/syncLogBuffer.c @@ -16,10 +16,13 @@ #define _DEFAULT_SOURCE #include "syncLogBuffer.h" +#include "syncIndexMgr.h" #include "syncInt.h" #include "syncRaftEntry.h" #include "syncRaftStore.h" #include "syncReplication.h" +#include "syncRespMgr.h" +#include "syncSnapshot.h" #include "syncUtil.h" int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf) { @@ -384,7 +387,7 @@ _out: return matchIndex; } -int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { +int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { ASSERT(pFsm->FpCommitCb != NULL && "No commit cb registered for the FSM"); SRpcMsg rpcMsg; @@ -392,7 +395,7 @@ int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncR SFsmCbMeta cbMeta = {0}; cbMeta.index = pEntry->index; - cbMeta.lastConfigIndex = -1; + cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(pNode, pEntry->index); cbMeta.isWeak = pEntry->isWeak; cbMeta.code = 0; cbMeta.state = role; @@ -401,6 +404,7 @@ int32_t syncLogFsmExecute(SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncR cbMeta.currentTerm = term; cbMeta.flag = -1; + (void)syncRespMgrGetAndDel(pNode->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info); pFsm->FpCommitCb(pFsm, &rpcMsg, &cbMeta); return 0; } @@ -423,7 +427,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm ESyncState role = pNode->state; SyncTerm term = pNode->pRaftStore->currentTerm; SyncGroupId vgId = pNode->vgId; - int32_t ret = 0; + int32_t ret = -1; int64_t upperIndex = TMIN(commitIndex, pBuf->matchIndex); SSyncRaftEntry* 
pEntry = NULL; bool inBuf = false; @@ -459,10 +463,9 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm continue; } - if (syncLogFsmExecute(pFsm, role, term, pEntry) != 0) { + if (syncLogFsmExecute(pNode, pFsm, role, term, pEntry) != 0) { sError("vgId:%d, failed to execute raft entry in FSM. log index:%" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, pEntry->term); - ret = -1; goto _out; } pBuf->commitIndex = index; @@ -487,6 +490,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm pBuf->startIndex = index + 1; } + ret = 0; _out: // mark as restored if needed if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { @@ -505,7 +509,7 @@ _out: return ret; } -int32_t syncLogResetLogReplMgr(SSyncLogReplMgr* pMgr) { +int32_t syncLogReplMgrReset(SSyncLogReplMgr* pMgr) { ASSERT(pMgr->startIndex >= 0); for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { memset(&pMgr->states[index % pMgr->size], 0, sizeof(pMgr->states[0])); @@ -601,7 +605,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod return 0; } - (void)syncLogResetLogReplMgr(pMgr); + (void)syncLogReplMgrReset(pMgr); } // send match index @@ -633,7 +637,7 @@ int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pN if (pMsg->startTime != 0 && pMsg->startTime != pMgr->peerStartTime) { sInfo("vgId:%d, reset sync log repl mgr in heartbeat. start time:%" PRId64 ", old start time:%" PRId64 "", pNode->vgId, pMsg->startTime, pMgr->peerStartTime); - syncLogResetLogReplMgr(pMgr); + syncLogReplMgrReset(pMgr); pMgr->peerStartTime = pMsg->startTime; } taosThreadMutexUnlock(&pBuf->mutex); @@ -647,7 +651,7 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sync sInfo("vgId:%d, reset sync log repl mgr in append entries reply. 
start time:%" PRId64 ", old start time:%" PRId64 "", pNode->vgId, pMsg->startTime, pMgr->peerStartTime); - syncLogResetLogReplMgr(pMgr); + syncLogReplMgrReset(pMgr); pMgr->peerStartTime = pMsg->startTime; } @@ -861,7 +865,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // reset repl mgr for (int i = 0; i < pNode->replicaNum; i++) { SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; - syncLogResetLogReplMgr(pMgr); + syncLogReplMgrReset(pMgr); } taosThreadMutexUnlock(&pBuf->mutex); return 0; @@ -884,14 +888,6 @@ SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, return pEntry; } -bool syncLogReplMgrValidate(SSyncLogReplMgr* pMgr) { - ASSERT(pMgr->startIndex <= pMgr->endIndex); - for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { - ASSERT(pMgr->states[(index + pMgr->size) % pMgr->size].barrier == false || index + 1 == pMgr->endIndex); - } - return true; -} - int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, SRaftId* pDestId, bool* pBarrier) { SSyncRaftEntry* pEntry = NULL; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 33c24bcca8..457ef7eedd 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -58,7 +58,6 @@ static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeade static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry); static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode); -static SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex); int64_t syncOpen(SSyncInfo* pSyncInfo) { SSyncNode* pSyncNode = syncNodeOpen(pSyncInfo); diff --git a/source/libs/transport/src/tmsgcb.c b/source/libs/transport/src/tmsgcb.c index 2007bc474f..48f557c1a8 100644 --- a/source/libs/transport/src/tmsgcb.c +++ b/source/libs/transport/src/tmsgcb.c @@ -16,6 +16,7 @@ #define 
_DEFAULT_SOURCE #include "tmsgcb.h" #include "taoserror.h" +#include "transLog.h" #include "trpc.h" static SMsgCb defaultMsgCb; @@ -23,9 +24,7 @@ static SMsgCb defaultMsgCb; void tmsgSetDefault(const SMsgCb* msgcb) { defaultMsgCb = *msgcb; } int32_t tmsgPutToQueue(const SMsgCb* msgcb, EQueueType qtype, SRpcMsg* pMsg) { - if (msgcb == NULL) { - return -1; - } + ASSERT(msgcb != NULL); int32_t code = (*msgcb->putToQueueFp)(msgcb->mgmt, qtype, pMsg); if (code != 0) { rpcFreeCont(pMsg->pCont); From 03b88ff41cba1c13a6fc0417447c9eaa9ab2f841 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 18 Nov 2022 09:50:36 +0800 Subject: [PATCH 13/42] enh: rename syncLogBuffer.h|c to syncPipeline.h|c --- .../libs/sync/inc/{syncLogBuffer.h => syncPipeline.h} | 11 +++-------- source/libs/sync/src/syncAppendEntries.c | 3 ++- source/libs/sync/src/syncAppendEntriesReply.c | 2 +- source/libs/sync/src/syncMain.c | 2 +- .../libs/sync/src/{syncLogBuffer.c => syncPipeline.c} | 2 +- source/libs/sync/src/syncReplication.c | 2 +- 6 files changed, 9 insertions(+), 13 deletions(-) rename source/libs/sync/inc/{syncLogBuffer.h => syncPipeline.h} (93%) rename source/libs/sync/src/{syncLogBuffer.c => syncPipeline.c} (99%) diff --git a/source/libs/sync/inc/syncLogBuffer.h b/source/libs/sync/inc/syncPipeline.h similarity index 93% rename from source/libs/sync/inc/syncLogBuffer.h rename to source/libs/sync/inc/syncPipeline.h index 39b4439d62..666515078a 100644 --- a/source/libs/sync/inc/syncLogBuffer.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -13,8 +13,8 @@ * along with this program. If not, see . 
*/ -#ifndef _TD_LIBS_SYNC_LOG_BUFFER_H -#define _TD_LIBS_SYNC_LOG_BUFFER_H +#ifndef _TD_LIBS_SYNC_PIPELINE_H +#define _TD_LIBS_SYNC_PIPELINE_H #ifdef __cplusplus extern "C" { @@ -58,7 +58,6 @@ typedef struct SSyncLogBuffer { } SSyncLogBuffer; // SSyncLogRepMgr - SSyncLogReplMgr* syncLogReplMgrCreate(); void syncLogReplMgrDestroy(SSyncLogReplMgr* pMgr); int32_t syncLogReplMgrReset(SSyncLogReplMgr* pMgr); @@ -113,12 +112,8 @@ int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); int32_t syncLogBufferRollback(SSyncLogBuffer* pBuf, SyncIndex toIndex); int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); -// others -int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commtIndex); -int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg); - #ifdef __cplusplus } #endif -#endif /*_TD_LIBS_SYNC_LOG_BUFFER_H*/ +#endif /*_TD_LIBS_SYNC_PIPELINE_H*/ diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 4eff19ced8..f2bdf6be70 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -15,12 +15,13 @@ #define _DEFAULT_SOURCE #include "syncAppendEntries.h" -#include "syncLogBuffer.h" +#include "syncPipeline.h" #include "syncMessage.h" #include "syncRaftLog.h" #include "syncRaftStore.h" #include "syncReplication.h" #include "syncUtil.h" +#include "syncCommit.h" // TLA+ Spec // HandleAppendEntriesRequest(i, j, m) == diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 234e8bffeb..ad388e193a 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -17,7 +17,7 @@ #include "syncAppendEntriesReply.h" #include "syncCommit.h" #include "syncIndexMgr.h" -#include "syncLogBuffer.h" +#include "syncPipeline.h" #include "syncMessage.h" #include "syncRaftEntry.h" 
#include "syncRaftStore.h" diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 457ef7eedd..8dd972506a 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -22,7 +22,7 @@ #include "syncEnv.h" #include "syncIndexMgr.h" #include "syncInt.h" -#include "syncLogBuffer.h" +#include "syncPipeline.h" #include "syncMessage.h" #include "syncRaftCfg.h" #include "syncRaftLog.h" diff --git a/source/libs/sync/src/syncLogBuffer.c b/source/libs/sync/src/syncPipeline.c similarity index 99% rename from source/libs/sync/src/syncLogBuffer.c rename to source/libs/sync/src/syncPipeline.c index 621bce6683..c8bb6260e8 100644 --- a/source/libs/sync/src/syncLogBuffer.c +++ b/source/libs/sync/src/syncPipeline.c @@ -15,7 +15,7 @@ #define _DEFAULT_SOURCE -#include "syncLogBuffer.h" +#include "syncPipeline.h" #include "syncIndexMgr.h" #include "syncInt.h" #include "syncRaftEntry.h" diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index e8709f95a2..d4a046b7ae 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -16,7 +16,7 @@ #define _DEFAULT_SOURCE #include "syncReplication.h" #include "syncIndexMgr.h" -#include "syncLogBuffer.h" +#include "syncPipeline.h" #include "syncRaftEntry.h" #include "syncRaftStore.h" #include "syncUtil.h" From c9c05761d6204f186eef626fe059927e87c27dc7 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 18 Nov 2022 10:09:31 +0800 Subject: [PATCH 14/42] enh: rename syncLogToAppendEntries to syncBuildAppendEntriesFromRaftLog --- source/libs/sync/inc/syncMessage.h | 2 ++ source/libs/sync/src/syncMain.c | 27 --------------------------- source/libs/sync/src/syncMessage.c | 29 +++++++++++++++++++++++++++++ source/libs/sync/src/syncPipeline.c | 2 +- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/source/libs/sync/inc/syncMessage.h b/source/libs/sync/inc/syncMessage.h index 92e7b555a4..c5fdc27426 100644 
--- a/source/libs/sync/inc/syncMessage.h +++ b/source/libs/sync/inc/syncMessage.h @@ -243,6 +243,8 @@ int32_t syncBuildRequestVote(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildRequestVoteReply(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildAppendEntries(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId); int32_t syncBuildAppendEntriesReply(SRpcMsg* pMsg, int32_t vgId); +int32_t syncBuildAppendEntriesFromRaftLog(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, + SRpcMsg* pRpcMsg); int32_t syncBuildHeartbeat(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildHeartbeatReply(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildPreSnapshot(SRpcMsg* pMsg, int32_t vgId); diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 8dd972506a..c95a3aba62 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -2313,33 +2313,6 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { return 0; } -int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg) { - uint32_t dataLen = pEntry->bytes; - uint32_t bytes = sizeof(SyncAppendEntries) + dataLen; - pRpcMsg->contLen = bytes; - pRpcMsg->pCont = rpcMallocCont(pRpcMsg->contLen); - if (pRpcMsg->pCont == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; - } - - SyncAppendEntries* pMsg = pRpcMsg->pCont; - pMsg->bytes = pRpcMsg->contLen; - pMsg->msgType = pRpcMsg->msgType = TDMT_SYNC_APPEND_ENTRIES; - pMsg->dataLen = dataLen; - - (void)memcpy(pMsg->data, pEntry, dataLen); - - pMsg->prevLogIndex = pEntry->index - 1; - pMsg->prevLogTerm = prevLogTerm; - pMsg->vgId = pNode->vgId; - pMsg->srcId = pNode->myRaftId; - pMsg->term = pNode->pRaftStore->currentTerm; - pMsg->commitIndex = pNode->commitIndex; - pMsg->privateTerm = 0; - return 0; -} - // TLA+ Spec // ClientRequest(i, v) == // /\ state[i] = Leader diff --git a/source/libs/sync/src/syncMessage.c b/source/libs/sync/src/syncMessage.c index 
ce98419980..ef1d585a89 100644 --- a/source/libs/sync/src/syncMessage.c +++ b/source/libs/sync/src/syncMessage.c @@ -16,6 +16,7 @@ #define _DEFAULT_SOURCE #include "syncMessage.h" #include "syncRaftEntry.h" +#include "syncRaftStore.h" int32_t syncBuildTimeout(SRpcMsg* pMsg, ESyncTimeoutType timeoutType, uint64_t logicClock, int32_t timerMS, SSyncNode* pNode) { @@ -152,6 +153,34 @@ int32_t syncBuildAppendEntriesReply(SRpcMsg* pMsg, int32_t vgId) { return 0; } +int32_t syncBuildAppendEntriesFromRaftLog(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, + SRpcMsg* pRpcMsg) { + uint32_t dataLen = pEntry->bytes; + uint32_t bytes = sizeof(SyncAppendEntries) + dataLen; + pRpcMsg->contLen = bytes; + pRpcMsg->pCont = rpcMallocCont(pRpcMsg->contLen); + if (pRpcMsg->pCont == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + SyncAppendEntries* pMsg = pRpcMsg->pCont; + pMsg->bytes = pRpcMsg->contLen; + pMsg->msgType = pRpcMsg->msgType = TDMT_SYNC_APPEND_ENTRIES; + pMsg->dataLen = dataLen; + + (void)memcpy(pMsg->data, pEntry, dataLen); + + pMsg->prevLogIndex = pEntry->index - 1; + pMsg->prevLogTerm = prevLogTerm; + pMsg->vgId = pNode->vgId; + pMsg->srcId = pNode->myRaftId; + pMsg->term = pNode->pRaftStore->currentTerm; + pMsg->commitIndex = pNode->commitIndex; + pMsg->privateTerm = 0; + return 0; +} + int32_t syncBuildHeartbeat(SRpcMsg* pMsg, int32_t vgId) { int32_t bytes = sizeof(SyncHeartbeat); pMsg->pCont = rpcMallocCont(bytes); diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index c8bb6260e8..501efa8782 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -911,7 +911,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn } if (pTerm) *pTerm = pEntry->term; - int32_t code = syncLogToAppendEntries(pNode, pEntry, prevLogTerm, &msgOut); + int32_t code = syncBuildAppendEntriesFromRaftLog(pNode, pEntry, prevLogTerm, &msgOut); if (code < 0) { 
sError("vgId:%d, failed to get append entries for index:%" PRId64 "", pNode->vgId, index); goto _err; From b90ee796236f4ce551a36c93d3eaff109d81e8d3 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 21 Nov 2022 13:51:01 +0800 Subject: [PATCH 15/42] fix: use syncLogBufferCommit in syncNodeOnLocalCmd --- include/util/tdef.h | 2 +- source/dnode/vnode/src/tsdb/tsdbCommit.c | 4 +- source/dnode/vnode/src/vnd/vnodeSvr.c | 7 ++++ source/dnode/vnode/src/vnd/vnodeSync.c | 8 ++-- source/libs/sync/src/syncAppendEntries.c | 6 +-- source/libs/sync/src/syncAppendEntriesReply.c | 4 +- source/libs/sync/src/syncCommit.c | 4 +- source/libs/sync/src/syncMain.c | 38 ++++++++++++++++--- source/libs/sync/src/syncPipeline.c | 26 +++++++------ source/libs/transport/src/transSvr.c | 1 + 10 files changed, 68 insertions(+), 32 deletions(-) diff --git a/include/util/tdef.h b/include/util/tdef.h index ef0eca4db3..124d98e3dc 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -281,7 +281,7 @@ typedef enum ELogicConditionType { #define TSDB_DNODE_ROLE_VNODE 2 #define TSDB_MAX_REPLICA 5 -#define TSDB_SYNC_LOG_BUFFER_SIZE 1024 +#define TSDB_SYNC_LOG_BUFFER_SIZE 4096 #define TSDB_TBNAME_COLUMN_INDEX (-1) #define TSDB_MULTI_TABLEMETA_MAX_NUM 100000 // maximum batch size allowed to load table meta diff --git a/source/dnode/vnode/src/tsdb/tsdbCommit.c b/source/dnode/vnode/src/tsdb/tsdbCommit.c index 65a46331aa..65fd266083 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCommit.c +++ b/source/dnode/vnode/src/tsdb/tsdbCommit.c @@ -1258,7 +1258,7 @@ static int32_t tsdbCommitMergeBlock(SCommitter *pCommitter, SDataBlk *pDataBlk) } } } else { - ASSERT(0); + ASSERT(0 && "dup rows not allowed"); } if (pBDataW->nRow >= pCommitter->maxRow) { @@ -1679,4 +1679,4 @@ _exit: tsdbInfo("vgId:%d, tsdb rollback commit", TD_VID(pTsdb->pVnode)); } return code; -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 5c8c166833..4abbeb61b4 
100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -178,6 +178,13 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp return -1; } + if (version <= pVnode->state.applied) { + vError("vgId:%d, duplicate write request. version: %" PRId64 ", applied: %" PRId64 "", TD_VID(pVnode), version, + pVnode->state.applied); + pRsp->info.handle = NULL; + return -1; + } + vDebug("vgId:%d, start to process write request %s, index:%" PRId64, TD_VID(pVnode), TMSG_INFO(pMsg->msgType), version); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index c67ee41b12..6837dd1341 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -307,10 +307,10 @@ static void vnodeSyncApplyMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const S rpcMsg.info.conn.applyTerm = pMeta->term; const STraceId *trace = &pMsg->info.traceId; - vGInfo("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 - ", weak:%d, code:%d, state:%d %s, type:%s", - pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, rpcMsg.info.conn.applyIndex, pMeta->isWeak, - pMeta->code, pMeta->state, syncStr(pMeta->state), TMSG_INFO(pMsg->msgType)); + vGTrace("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 + ", weak:%d, code:%d, state:%d %s, type:%s", + pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, rpcMsg.info.conn.applyIndex, pMeta->isWeak, + pMeta->code, pMeta->state, syncStr(pMeta->state), TMSG_INFO(pMsg->msgType)); tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg); } else { diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index f2bdf6be70..9634f4ee26 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -195,9 +195,9 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const 
SRpcMsg* pRpcMsg) { goto _IGNORE; } - sInfo("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 - ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", - pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); + sTrace("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64 + ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "", + pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex); // accept if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index ad388e193a..32e424666b 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -64,8 +64,8 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { ASSERT(pMsg->term == ths->pRaftStore->currentTerm); - sInfo("vgId:%d received append entries reply. srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", - pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); + sTrace("vgId:%d received append entries reply. srcId:0x%016" PRIx64 ", term:%" PRId64 ", matchIndex:%" PRId64 "", + pMsg->vgId, pMsg->srcId.addr, pMsg->term, pMsg->matchIndex); if (pMsg->success) { SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId)); diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index fc7ea7cc30..5d4298552d 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -324,8 +324,8 @@ int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) { if (indexLikely > ths->commitIndex && syncNodeAgreedUpon(ths, indexLikely)) { SyncIndex commitIndex = indexLikely; syncNodeUpdateCommitIndex(ths, commitIndex); - sInfo("vgId:%d, agreed upon. 
role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, - ths->pRaftStore->currentTerm, commitIndex); + sTrace("vgId:%d, agreed upon. role:%d, term:%" PRId64 ", index: %" PRId64 "", ths->vgId, ths->state, + ths->pRaftStore->currentTerm, commitIndex); } return ths->commitIndex; } diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index c95a3aba62..8177f3b6db 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -595,7 +595,7 @@ void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) { tstrncpy(pEp->fqdn, pSyncNode->pRaftCfg->cfg.nodeInfo[i].nodeFqdn, TSDB_FQDN_LEN); pEp->port = (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodePort; pEpSet->numOfEps++; - sInfo("vgId:%d, sync get retry epset, index:%d %s:%d", pSyncNode->vgId, i, pEp->fqdn, pEp->port); + sDebug("vgId:%d, sync get retry epset, index:%d %s:%d", pSyncNode->vgId, i, pEp->fqdn, pEp->port); } if (pEpSet->numOfEps > 0) { pEpSet->inUse = (pSyncNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps; @@ -1028,6 +1028,12 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) { SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore); SyncIndex endIndex = pSyncNode->pLogBuf->endIndex; + if (lastVer != -1 && endIndex != lastVer + 1) { + terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; + sError("vgId:%d, failed to restore sync node since %s. expected lastLogIndex: %" PRId64 ", lastVer: %" PRId64 "", + pSyncNode->vgId, terrstr(), endIndex - 1, lastVer); + return -1; + } ASSERT(endIndex == lastVer + 1); commitIndex = TMAX(pSyncNode->commitIndex, commitIndex); @@ -2141,10 +2147,10 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // proceed match index, with replicating on needed SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); - sInfo("vgId:%d, append raft entry. 
index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 - ", %" PRId64 ")", - ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, - ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); + sDebug("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 + ", %" PRId64 ")", + ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex, + ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex); // multi replica if (ths->replicaNum > 1) { @@ -2300,6 +2306,26 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { SyncLocalCmd* pMsg = pRpcMsg->pCont; syncLogRecvLocalCmd(ths, pMsg, ""); + if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) { + syncNodeStepDown(ths, pMsg->sdNewTerm); + + } else if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) { + (void)syncNodeUpdateCommitIndex(ths, pMsg->fcIndex); + if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + sError("vgId:%d, failed to commit raft log since %s. 
commit index: %" PRId64 "", ths->vgId, terrstr(), + ths->commitIndex); + } + } else { + sError("error local cmd"); + } + + return 0; +} + +int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) { + SyncLocalCmd* pMsg = pRpcMsg->pCont; + syncLogRecvLocalCmd(ths, pMsg, ""); + if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) { syncNodeStepDown(ths, pMsg->sdNewTerm); @@ -2535,11 +2561,11 @@ int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) { } bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) { - // return false; return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1); } int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) { + ASSERT(false); if (beginIndex > endIndex) { return 0; } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 501efa8782..656b32bd7b 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -163,7 +163,6 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { if (pLogStore->syncLogGetEntry(pLogStore, index, &pEntry) < 0) { sError("vgId:%d, failed to get log entry since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); - ASSERT(0); break; } @@ -334,7 +333,7 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { SyncTerm prevLogTerm = pBufEntry->prevLogTerm; SSyncRaftEntry* pEntry = pBufEntry->pItem; if (pEntry == NULL) { - sDebug("vgId:%d, cannot proceed match index in log buffer. no raft entry at next pos of matchIndex:%" PRId64, + sTrace("vgId:%d, cannot proceed match index in log buffer. no raft entry at next pos of matchIndex:%" PRId64, pNode->vgId, pBuf->matchIndex); goto _out; } @@ -361,8 +360,8 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // increase match index pBuf->matchIndex = index; - sInfo("vgId:%d, log buffer proceed. 
start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, - pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); + sTrace("vgId:%d, log buffer proceed. start index: %" PRId64 ", match index: %" PRId64 ", end index: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); // replicate on demand (void)syncNodeReplicate(pNode); @@ -390,6 +389,10 @@ _out: int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry) { ASSERT(pFsm->FpCommitCb != NULL && "No commit cb registered for the FSM"); + if ((pNode->replicaNum == 1) && pNode->restoreFinish && pNode->vgId != 1) { + return 0; + } + SRpcMsg rpcMsg; syncEntry2OriginalRpc(pEntry, &rpcMsg); @@ -439,7 +442,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm goto _out; } - sDebug("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". start index:%" PRId64 ", commit index:%" PRId64 + sTrace("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". 
start index:%" PRId64 ", commit index:%" PRId64 ", match index: %" PRId64 ", end index:%" PRId64 "", pNode->vgId, role, term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -470,7 +473,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm } pBuf->commitIndex = index; - sDebug("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, + sTrace("vgId:%d, committed index: %" PRId64 ", term: %" PRId64 ", role: %d, current term: %" PRId64 "", pNode->vgId, pEntry->index, pEntry->term, role, term); if (!inBuf) { @@ -480,8 +483,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm } // recycle - SyncIndex used = pBuf->endIndex - pBuf->startIndex; - SyncIndex until = pBuf->commitIndex - (pBuf->size - used) / 2; + SyncIndex until = pBuf->commitIndex - (pBuf->size >> 4); for (SyncIndex index = pBuf->startIndex; index < until; index++) { SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; ASSERT(pEntry != NULL); @@ -687,7 +689,7 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sDebug("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 + sTrace("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 ". pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pMgr->peerId, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, @@ -733,7 +735,7 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p } SSyncLogBuffer* pBuf = pNode->pLogBuf; - sDebug("vgId:%d, attempted to replicate %d msgs to the %d'th peer. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sTrace("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -920,8 +922,8 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn (void)syncNodeSendAppendEntries(pNode, pDestId, &msgOut); ret = 0; - sInfo("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, - pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); + sTrace("vgId:%d, replicate one msg index: %" PRId64 " term: %" PRId64 " prevterm: %" PRId64 " to dest: 0x%016" PRIx64, + pNode->vgId, pEntry->index, pEntry->term, prevLogTerm, pDestId->addr); if (!inBuf) { syncEntryDestroy(pEntry); diff --git a/source/libs/transport/src/transSvr.c b/source/libs/transport/src/transSvr.c index 5f36d91023..ef59b54124 100644 --- a/source/libs/transport/src/transSvr.c +++ b/source/libs/transport/src/transSvr.c @@ -1246,6 +1246,7 @@ _return2: tTrace("handle %p failed to send to release handle", exh); return -1; } + int transSendResponse(const STransMsg* msg) { if (msg->info.noResp) { rpcFreeCont(msg->pCont); From 8ef5ca78eec2135e713afb0b9ba6ccd14ce9948c Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 22 Nov 2022 10:54:32 +0800 Subject: [PATCH 16/42] fix: protect syncNodeReplicate with mutex lock --- source/libs/sync/inc/syncPipeline.h | 2 +- source/libs/sync/inc/syncReplication.h | 1 + source/libs/sync/src/syncAppendEntriesReply.c | 8 +++++--- source/libs/sync/src/syncPipeline.c | 15 +++++++-------- source/libs/sync/src/syncReplication.c | 10 +++++++++- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index 666515078a..d8697da64b 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ 
b/source/libs/sync/inc/syncPipeline.h @@ -82,10 +82,10 @@ static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, Sync } SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); +int32_t syncLogReplMgrReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, SRaftId* pDestId, bool* pBarrier); int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); diff --git a/source/libs/sync/inc/syncReplication.h b/source/libs/sync/inc/syncReplication.h index f077306475..f2e240344f 100644 --- a/source/libs/sync/inc/syncReplication.h +++ b/source/libs/sync/inc/syncReplication.h @@ -52,6 +52,7 @@ int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* pDestId, SRpc int32_t syncNodeReplicate(SSyncNode* pSyncNode); int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot); +int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode); int32_t syncNodeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 32e424666b..ed38ea9559 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -81,10 +81,12 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { // 
replicate log SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); - // ASSERT(pMgr != NULL); - if (pMgr != NULL) { - (void)syncLogReplMgrProcessReply(pMgr, ths, pMsg); + if (pMgr == NULL) { + sError("vgId:%d, failed to get log repl mgr for src addr: 0x%016" PRIx64, ths->vgId, pMsg->srcId.addr); + return -1; } + ASSERT(pMgr != NULL); + (void)syncLogReplMgrProcessReply(pMgr, ths, pMsg); } return 0; } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 656b32bd7b..199ef289f3 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -234,10 +234,10 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); if (index <= pBuf->commitIndex) { - sInfo("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, - pBuf->endIndex); + sTrace("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". 
log buffer: [%" PRId64 + " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, + pBuf->endIndex); ret = 0; goto _out; } @@ -364,7 +364,7 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { pNode->vgId, pBuf->startIndex, pBuf->matchIndex, pBuf->endIndex); // replicate on demand - (void)syncNodeReplicate(pNode); + (void)syncNodeReplicateWithoutLock(pNode); // persist if (syncLogStorePersist(pLogStore, pEntry) < 0) { @@ -393,7 +393,7 @@ int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, Syn return 0; } - SRpcMsg rpcMsg; + SRpcMsg rpcMsg = {0}; syncEntry2OriginalRpc(pEntry, &rpcMsg); SFsmCbMeta cbMeta = {0}; @@ -666,8 +666,7 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sync return 0; } -int32_t syncLogBufferReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { - SSyncLogBuffer* pBuf = pNode->pLogBuf; +int32_t syncLogReplMgrReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { if (pMgr->restored) { (void)syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); } else { diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index d4a046b7ae..2623eebc23 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -135,6 +135,14 @@ int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapsh } int32_t syncNodeReplicate(SSyncNode* pNode) { + SSyncLogBuffer* pBuf = pNode->pLogBuf; + taosThreadMutexLock(&pBuf->mutex); + int32_t ret = syncNodeReplicateWithoutLock(pNode); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + +int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) { if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum == 1) { return -1; } @@ -143,7 +151,7 @@ int32_t syncNodeReplicate(SSyncNode* pNode) { continue; } SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; - 
(void)syncLogBufferReplicateOnce(pMgr, pNode); + (void)syncLogReplMgrReplicateOnce(pMgr, pNode); } return 0; } From b80095dc870547ff30cecc16e471e88ec173c338 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 22 Nov 2022 22:24:20 +0800 Subject: [PATCH 17/42] enh: snapshot during recovery of SSynclogReplMgr --- source/libs/sync/src/syncPipeline.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 199ef289f3..3dbca1210d 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -610,6 +610,15 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod (void)syncLogReplMgrReset(pMgr); } + // check existence of WAl log + SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore); + if (pMsg->matchIndex < firstVer) { + if (syncNodeStartSnapshot(pNode, &destId) < 0) { + sError("vgId:%d, failed to start snapshot for dest: 0x%016" PRIx64, pNode->vgId, destId.addr); + } + return 0; + } + // send match index SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); bool barrier = false; @@ -901,6 +910,13 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn pEntry = syncLogBufferGetOneEntry(pBuf, pNode, index, &inBuf); if (pEntry == NULL) { sError("vgId:%d, failed to get raft entry for index: %" PRId64 "", pNode->vgId, index); + if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { + SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(pNode, pDestId); + if (pMgr) { + sInfo("vgId:%d, reset log repl mgr for dest: 0x%016" PRIx64, pNode->vgId, pDestId->addr); + (void)syncLogReplMgrReset(pMgr); + } + } goto _err; } *pBarrier = syncLogIsReplicationBarrier(pEntry); From 95f8e96eb50ec251f838070e83050de6747cf259 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 22 Nov 2022 23:41:58 +0800 Subject: [PATCH 18/42] fix: align firstVer with lastVer too if no WAL logs exist --- 
source/libs/wal/src/walMeta.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/libs/wal/src/walMeta.c b/source/libs/wal/src/walMeta.c index 6cac4b6093..d77acbbb6f 100644 --- a/source/libs/wal/src/walMeta.c +++ b/source/libs/wal/src/walMeta.c @@ -285,6 +285,9 @@ void walAlignVersions(SWal* pWal) { if (pWal->vers.lastVer < pWal->vers.snapshotVer) { wWarn("vgId:%d, lastVer:%" PRId64 " is less than snapshotVer:%" PRId64 ". align with it.", pWal->cfg.vgId, pWal->vers.lastVer, pWal->vers.snapshotVer); + if (pWal->vers.lastVer < pWal->vers.firstVer) { + pWal->vers.firstVer = pWal->vers.snapshotVer + 1; + } pWal->vers.lastVer = pWal->vers.snapshotVer; } if (pWal->vers.commitVer < pWal->vers.snapshotVer) { From 006e13e6638475a7f34662fe3aee437865f69684 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 23 Nov 2022 10:24:17 +0800 Subject: [PATCH 19/42] enh: validate alignment of WAL and tsdb commit during syncLogBufferInit --- source/libs/sync/src/syncPipeline.c | 28 ++++++++++++++++++++++------ source/libs/sync/src/syncUtil.c | 7 ++++--- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 3dbca1210d..a31812e51e 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -119,6 +119,26 @@ SSyncRaftEntry* syncEntryBuildDummy(SyncTerm term, SyncIndex index, int32_t vgId return syncEntryBuildNoop(term, index, vgId); } +int32_t syncLogValidateAlignmentOfCommit(SSyncNode* pNode, SyncIndex commitIndex) { + SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore); + if (firstVer > commitIndex + 1) { + sError("vgId:%d, firstVer of WAL log greater than tsdb commit version + 1. 
firstVer: %" PRId64 + ", tsdb commit version: %" PRId64 "", + pNode->vgId, firstVer, commitIndex); + return -1; + } + + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + if (lastVer < commitIndex) { + sError("vgId:%d, lastVer of WAL log less than tsdb commit version. lastVer: %" PRId64 + ", tsdb commit version: %" PRId64 "", + pNode->vgId, lastVer, commitIndex); + return -1; + } + + return 0; +} + int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { taosThreadMutexLock(&pBuf->mutex); ASSERT(pNode->pLogStore != NULL && "log store not created"); @@ -132,16 +152,12 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { } SyncIndex commitIndex = snapshot.lastApplyIndex; SyncTerm commitTerm = snapshot.lastApplyTerm; - - SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); - if (lastVer < commitIndex) { - sError("vgId:%d, lastVer of WAL log less than tsdb commit version. lastVer: %" PRId64 - ", tsdb commit version: %" PRId64 "", - pNode->vgId, lastVer, commitIndex); + if (syncLogValidateAlignmentOfCommit(pNode, commitIndex)) { terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; goto _err; } + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); ASSERT(lastVer >= commitIndex); SyncIndex toIndex = lastVer; // update match index diff --git a/source/libs/sync/src/syncUtil.c b/source/libs/sync/src/syncUtil.c index b50336cd63..97de188253 100644 --- a/source/libs/sync/src/syncUtil.c +++ b/source/libs/sync/src/syncUtil.c @@ -238,9 +238,10 @@ void syncPrintNodeLog(const char* flags, ELogLevel level, int32_t dflag, SSyncNo taosPrintLog(flags, level, dflag, "vgId:%d, sync %s " "%s" - ", tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", min:%" PRId64 ", snap:%" PRId64 - ", snap-tm:%" PRIu64 ", sby:%d, aq:%d, bch:%d, r-num:%d, lcfg:%" PRId64 - ", chging:%d, rsto:%d, dquorum:%d, elt:%" PRId64 ", hb:%" PRId64 ", %s, %s", + ", term:%" PRIu64 ", commitIdx:%" PRId64 ", firstVer:%" PRId64 
", lastVer:%" PRId64 ", min:%" PRId64 + ", snap.lastApply:%" PRId64 ", snap.term:%" PRIu64 + ", standby:%d, aqItems:%d, batchSz:%d, replicaNum:%d, lastCfgIdx:%" PRId64 + ", changing:%d, restore:%d, quorum:%d, electTimer:%" PRId64 ", hb:%" PRId64 ", %s, %s", pNode->vgId, syncStr(pNode->state), eventLog, currentTerm, pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm, pNode->pRaftCfg->isStandBy, aqItems, pNode->pRaftCfg->batchSize, pNode->replicaNum, From f68e41a40e12d0554bbbab2344894193b7848bf0 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 23 Nov 2022 15:57:40 +0800 Subject: [PATCH 20/42] enh: initialize log buffer again after receiving a complete snapshot --- source/libs/sync/inc/syncPipeline.h | 5 +++-- source/libs/sync/src/syncMain.c | 25 +++++++++++++++++++++ source/libs/sync/src/syncPipeline.c | 35 ++++++++++++++++++++++++----- source/libs/sync/src/syncSnapshot.c | 12 +++++++++- source/libs/wal/src/walWrite.c | 2 +- 5 files changed, 69 insertions(+), 10 deletions(-) diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index d8697da64b..9a68b1e0c8 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -89,14 +89,15 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); -int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t 
syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg); +int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); // SSyncLogBuffer SSyncLogBuffer* syncLogBufferCreate(); void syncLogBufferDestroy(SSyncLogBuffer* pBuf); int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); +int32_t syncLogBufferReInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); // access int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 8177f3b6db..a4c0022d98 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -704,6 +704,28 @@ static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) { return ret; } +int32_t syncNodeLogStoreRestoreOnNeed(SSyncNode* pNode) { + ASSERT(pNode->pLogStore != NULL && "log store not created"); + ASSERT(pNode->pFsm != NULL && "pFsm not registered"); + ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); + SSnapshot snapshot; + if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) { + sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr()); + return -1; + } + SyncIndex commitIndex = snapshot.lastApplyIndex; + SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore); + SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore); + if (lastVer < commitIndex || firstVer > commitIndex + 1) { + if (pNode->pLogStore->syncLogRestoreFromSnapshot(pNode->pLogStore, commitIndex)) { + sError("vgId:%d, failed to restore log store from snapshot since %s. 
lastVer: %" PRId64 ", snapshotVer: %" PRId64, + pNode->vgId, terrstr(), lastVer, commitIndex); + return -1; + } + } + return 0; +} + // open/close -------------- SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode)); @@ -912,6 +934,9 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { } pSyncNode->commitIndex = commitIndex; + if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) { + goto _error; + } // timer ms init pSyncNode->pingBaseLine = PING_TIMER_MS; pSyncNode->electBaseLine = ELECT_TIMER_MS_MIN; diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index a31812e51e..71ac5d1464 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -139,8 +139,7 @@ int32_t syncLogValidateAlignmentOfCommit(SSyncNode* pNode, SyncIndex commitIndex return 0; } -int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { - taosThreadMutexLock(&pBuf->mutex); +int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) { ASSERT(pNode->pLogStore != NULL && "log store not created"); ASSERT(pNode->pFsm != NULL && "pFsm not registered"); ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered"); @@ -226,14 +225,37 @@ int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // validate syncLogBufferValidate(pBuf); - taosThreadMutexUnlock(&pBuf->mutex); return 0; _err: - taosThreadMutexUnlock(&pBuf->mutex); return -1; } +int32_t syncLogBufferInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + int32_t ret = syncLogBufferInitWithoutLock(pBuf, pNode); + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + +int32_t syncLogBufferReInit(SSyncLogBuffer* pBuf, SSyncNode* pNode) { + taosThreadMutexLock(&pBuf->mutex); + for (SyncIndex index = pBuf->startIndex; index < pBuf->endIndex; index++) { + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + 
if (pEntry == NULL) continue; + syncEntryDestroy(pEntry); + pEntry = NULL; + memset(&pBuf->entries[(index + pBuf->size) % pBuf->size], 0, sizeof(pBuf->entries[0])); + } + pBuf->startIndex = pBuf->commitIndex = pBuf->matchIndex = pBuf->endIndex = 0; + int32_t ret = syncLogBufferInitWithoutLock(pBuf, pNode); + if (ret < 0) { + sError("vgId:%d, failed to re-initialize sync log buffer since %s.", pNode->vgId, terrstr()); + } + taosThreadMutexUnlock(&pBuf->mutex); + return ret; +} + FORCE_INLINE SyncTerm syncLogBufferGetLastMatchTerm(SSyncLogBuffer* pBuf) { SyncIndex index = pBuf->matchIndex; SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; @@ -628,7 +650,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod // check existence of WAl log SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore); - if (pMsg->matchIndex < firstVer) { + if (pMsg->matchIndex + 1 < firstVer) { if (syncNodeStartSnapshot(pNode, &destId) < 0) { sError("vgId:%d, failed to start snapshot for dest: 0x%016" PRIx64, pNode->vgId, destId.addr); } @@ -929,7 +951,8 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(pNode, pDestId); if (pMgr) { - sInfo("vgId:%d, reset log repl mgr for dest: 0x%016" PRIx64, pNode->vgId, pDestId->addr); + sInfo("vgId:%d, reset sync log repl mgr for peer: 0x%016" PRIx64 " since %s. 
index: %" PRId64, pNode->vgId, + pDestId->addr, terrstr(), index); (void)syncLogReplMgrReset(pMgr); } } diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 222b7c4e1e..1035925c2b 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -16,6 +16,7 @@ #define _DEFAULT_SOURCE #include "syncSnapshot.h" #include "syncIndexMgr.h" +#include "syncPipeline.h" #include "syncRaftCfg.h" #include "syncRaftLog.h" #include "syncRaftStore.h" @@ -273,6 +274,11 @@ int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) { return 1; } + char host[64]; + uint16_t port; + syncUtilU642Addr(pDestId->addr, host, sizeof(host), &port); + sInfo("vgId:%d, start snapshot for peer: %s:%d", pSyncNode->vgId, host, port); + code = snapshotSenderStart(pSender); if (code != 0) { sNError(pSyncNode, "snapshot sender start error"); @@ -372,7 +378,10 @@ int32_t snapshotReceiverStartWriter(SSyncSnapshotReceiver *pReceiver, SyncSnapsh } int32_t snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pPreMsg) { - ASSERT(!snapshotReceiverIsStart(pReceiver)); + if (snapshotReceiverIsStart(pReceiver)) { + sWarn("vgId:%d, snapshot receiver has started.", pReceiver->pSyncNode->vgId); + return 0; + } pReceiver->start = true; pReceiver->ack = SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT; @@ -738,6 +747,7 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) { syncNodeOnSnapshotEnd(pSyncNode, pMsg); + (void)syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode); } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_FORCE_CLOSE) { // force close, no response diff --git a/source/libs/wal/src/walWrite.c b/source/libs/wal/src/walWrite.c index 10fb2ee97e..4a1b2d1444 100644 --- a/source/libs/wal/src/walWrite.c +++ b/source/libs/wal/src/walWrite.c @@ -70,7 +70,7 @@ int32_t walRestoreFromSnapshot(SWal *pWal, int64_t ver) { pWal->lastRollSeq = -1; 
taosArrayClear(pWal->fileInfoSet); - pWal->vers.firstVer = -1; + pWal->vers.firstVer = ver + 1; pWal->vers.lastVer = ver; pWal->vers.commitVer = ver; pWal->vers.snapshotVer = ver; From 2e640e8e68c2aae790dbe69f50237b7dffc1955e Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 24 Nov 2022 10:25:06 +0800 Subject: [PATCH 21/42] feat: start snapshot in recovery mode of sync repl mgr with lastMatchTerm in reply msg --- source/libs/sync/inc/syncMessage.h | 2 +- source/libs/sync/inc/syncPipeline.h | 2 +- source/libs/sync/src/syncAppendEntries.c | 2 +- source/libs/sync/src/syncMain.c | 2 +- source/libs/sync/src/syncPipeline.c | 60 +++++++++++++++--------- source/libs/sync/src/syncUtil.c | 4 +- 6 files changed, 45 insertions(+), 27 deletions(-) diff --git a/source/libs/sync/inc/syncMessage.h b/source/libs/sync/inc/syncMessage.h index c5fdc27426..2af2e7b5cd 100644 --- a/source/libs/sync/inc/syncMessage.h +++ b/source/libs/sync/inc/syncMessage.h @@ -105,7 +105,7 @@ typedef struct SyncAppendEntriesReply { SRaftId destId; // private data SyncTerm term; - SyncTerm privateTerm; + SyncTerm lastMatchTerm; bool success; SyncIndex matchIndex; SyncIndex lastSendIndex; diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index 9a68b1e0c8..ca07876def 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -103,7 +103,7 @@ int32_t syncLogBufferReInit(SSyncLogBuffer* pBuf, SSyncNode* pNode); int64_t syncLogBufferGetEndIndex(SSyncLogBuffer* pBuf); int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry); int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevTerm); -int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode); +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncTerm* pMatchTerm); int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t commitIndex); int32_t 
syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode); diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 9634f4ee26..d719169470 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -206,7 +206,7 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { accepted = true; _SEND_RESPONSE: - pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths, &pReply->lastMatchTerm); bool matched = (pReply->matchIndex >= pReply->lastSendIndex); if (accepted && matched) { pReply->success = true; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index a4c0022d98..108b9ab5bd 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -2170,7 +2170,7 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { } // proceed match index, with replicating on needed - SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths); + SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths, NULL); sDebug("vgId:%d, append raft entry. 
index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 71ac5d1464..a7a983a06e 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -76,17 +76,17 @@ SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, S if (prevIndex == -1) return 0; - if (index - 1 > pBuf->matchIndex) { + if (prevIndex > pBuf->matchIndex) { terrno = TSDB_CODE_WAL_LOG_NOT_EXIST; return -1; } ASSERT(index - 1 == prevIndex); - if (index - 1 >= pBuf->startIndex) { - pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + if (prevIndex >= pBuf->startIndex) { + pEntry = pBuf->entries[(prevIndex + pBuf->size) % pBuf->size].pItem; ASSERT(pEntry != NULL && "no log entry found"); - prevLogTerm = pBuf->entries[(index + pBuf->size) % pBuf->size].prevLogTerm; + prevLogTerm = pEntry->term; return prevLogTerm; } @@ -354,7 +354,7 @@ int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { return 0; } -int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { +int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncTerm* pMatchTerm) { taosThreadMutexLock(&pBuf->mutex); syncLogBufferValidate(pBuf); @@ -419,6 +419,9 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode) { _out: pBuf->matchIndex = matchIndex; + if (pMatchTerm) { + *pMatchTerm = pBuf->entries[(matchIndex + pBuf->size) % pBuf->size].pItem->term; + } syncLogBufferValidate(pBuf); taosThreadMutexUnlock(&pBuf->mutex); return matchIndex; @@ -615,16 +618,16 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod ASSERT(pMgr->restored == false); char host[64]; uint16_t port; - syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port); + syncUtilU642Addr(destId.addr, host, sizeof(host), &port); if (pMgr->endIndex == 0) { ASSERT(pMgr->startIndex == 0); 
ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sInfo("vgId:%d, sync log repl mgr of peer %s:%d (%" PRIx64 ") restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -638,9 +641,9 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod if (pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 + sInfo("vgId:%d, sync log repl mgr of peer %s:%d (%" PRIx64 ") restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, host, port, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -648,23 +651,38 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod (void)syncLogReplMgrReset(pMgr); } - // check existence of WAl log + // check last match term + SyncTerm term = -1; SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore); - if (pMsg->matchIndex + 1 < firstVer) { - if (syncNodeStartSnapshot(pNode, &destId) < 0) { - sError("vgId:%d, failed to start snapshot for dest: 0x%016" PRIx64, pNode->vgId, destId.addr); + SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); + + if (pMsg->matchIndex < pNode->pLogBuf->matchIndex) { + term = syncLogReplMgrGetPrevLogTerm(pMgr, pNode, index + 1); + + if (term < 0 || (term != pMsg->lastMatchTerm && (index + 1 == firstVer || index == firstVer))) { + ASSERT(term >= 0 || terrno == TSDB_CODE_WAL_LOG_NOT_EXIST); + if (syncNodeStartSnapshot(pNode, &destId) < 0) { + sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); + } + return 0; + } + + ASSERT(index + 1 >= firstVer); + + if (term == pMsg->lastMatchTerm) { + index = index + 1; + ASSERT(index <= pNode->pLogBuf->matchIndex); + } else { + ASSERT(index > firstVer); } - return 0; } - // send match index - SyncIndex index = TMIN(pMsg->matchIndex, pNode->pLogBuf->matchIndex); - bool barrier = false; - SyncTerm term = -1; + // attempt to replicate the raft log at index + bool barrier = false; ASSERT(index >= 0); if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, &destId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, - terrstr(), index, destId.addr); + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", peer %s:%d", pNode->vgId, + terrstr(), index, host, port); return -1; } diff --git a/source/libs/sync/src/syncUtil.c b/source/libs/sync/src/syncUtil.c index 97de188253..181ddffd5b 100644 --- a/source/libs/sync/src/syncUtil.c +++ b/source/libs/sync/src/syncUtil.c @@ -373,7 +373,7 @@ void syncLogSendAppendEntriesReply(SSyncNode* pSyncNode, const SyncAppendEntries sNTrace(pSyncNode, "send sync-append-entries-reply to %s:%d, {term:%" PRId64 ", pterm:%" PRId64 ", success:%d, lsend-index:%" PRId64 ", match:%" PRId64 "}, %s", - host, port, pMsg->term, pMsg->privateTerm, pMsg->success, pMsg->lastSendIndex, pMsg->matchIndex, s); + host, port, pMsg->term, pMsg->lastMatchTerm, pMsg->success, pMsg->lastSendIndex, pMsg->matchIndex, s); } void syncLogRecvAppendEntriesReply(SSyncNode* pSyncNode, const SyncAppendEntriesReply* pMsg, const char* s) { @@ -384,7 +384,7 @@ void syncLogRecvAppendEntriesReply(SSyncNode* pSyncNode, const SyncAppendEntries sNTrace(pSyncNode, "recv sync-append-entries-reply from %s:%d {term:%" PRId64 ", pterm:%" PRId64 ", success:%d, lsend-index:%" PRId64 ", match:%" PRId64 "}, %s", - host, port, pMsg->term, pMsg->privateTerm, pMsg->success, pMsg->lastSendIndex, pMsg->matchIndex, s); + host, port, pMsg->term, pMsg->lastMatchTerm, pMsg->success, pMsg->lastSendIndex, pMsg->matchIndex, s); } void syncLogSendHeartbeat(SSyncNode* pSyncNode, const SyncHeartbeat* pMsg, const char* s) { From 22d64b9c0befba2a0c36b6156a293a9d0d393cc3 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 24 Nov 2022 15:55:31 +0800 Subject: [PATCH 22/42] fix: remove syncNodeReplicateOne from syncNodeOnSnapshotReply --- source/dnode/vnode/src/vnd/vnodeBufPool.c | 3 +++ source/libs/sync/src/syncPipeline.c | 27 ++++++++++++----------- source/libs/sync/src/syncReplication.c | 1 + source/libs/sync/src/syncSnapshot.c | 4 
++-- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeBufPool.c b/source/dnode/vnode/src/vnd/vnodeBufPool.c index 71e926bd35..dcc323f778 100644 --- a/source/dnode/vnode/src/vnd/vnodeBufPool.c +++ b/source/dnode/vnode/src/vnd/vnodeBufPool.c @@ -176,6 +176,9 @@ void vnodeBufPoolRef(SVBufPool *pPool) { } void vnodeBufPoolUnRef(SVBufPool *pPool) { + if (pPool == NULL) { + return; + } int32_t nRef = atomic_sub_fetch_32(&pPool->nRef, 1); if (nRef == 0) { SVnode *pVnode = pPool->pVnode; diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index a7a983a06e..891547e80f 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -497,8 +497,8 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm // execute it if (!syncUtilUserCommit(pEntry->originalRpcType)) { - sInfo("vgId:%d, non-user msg in raft log entry. index: %" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, - pEntry->term); + sInfo("vgId:%d, raft mgmt msg in log entry. index: %" PRId64 ", term:%" PRId64 ", type: %s", vgId, pEntry->index, + pEntry->term, TMSG_INFO(pEntry->originalRpcType)); pBuf->commitIndex = index; if (!inBuf) { syncEntryDestroy(pEntry); @@ -539,7 +539,7 @@ _out: if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex) { pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); pNode->restoreFinish = true; - sInfo("vgId:%d, restore finished. pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + sInfo("vgId:%d, restore finished. log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); } @@ -625,8 +625,8 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d (%" PRIx64 ") restored. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), repl mgr(rs:%d): [%" PRId64 " %" PRId64 + ", %" PRId64 "), log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; @@ -641,8 +641,8 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod if (pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr of peer %s:%d (%" PRIx64 ") restored. pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 - "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), repl mgr(rs:%d): [%" PRId64 " %" PRId64 + ", %" PRId64 "), log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; @@ -663,7 +663,9 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod ASSERT(term >= 0 || terrno == TSDB_CODE_WAL_LOG_NOT_EXIST); if (syncNodeStartSnapshot(pNode, &destId) < 0) { sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); + return -1; } + sInfo("vgId:%d, snapshot replication to peer %s:%d started", pNode->vgId, host, port); return 0; } @@ -702,8 +704,8 @@ int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pN SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); if (pMsg->startTime != 0 && pMsg->startTime != pMgr->peerStartTime) { - sInfo("vgId:%d, reset sync log repl mgr in heartbeat. 
start time:%" PRId64 ", old start time:%" PRId64 "", - pNode->vgId, pMsg->startTime, pMgr->peerStartTime); + sInfo("vgId:%d, reset sync log repl mgr in heartbeat. peer: %" PRIx64 ", start time:%" PRId64 ", old:%" PRId64 "", + pNode->vgId, pMsg->srcId.addr, pMsg->startTime, pMgr->peerStartTime); syncLogReplMgrReset(pMgr); pMgr->peerStartTime = pMsg->startTime; } @@ -715,9 +717,8 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sync SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); if (pMsg->startTime != pMgr->peerStartTime) { - sInfo("vgId:%d, reset sync log repl mgr in append entries reply. start time:%" PRId64 ", old start time:%" PRId64 - "", - pNode->vgId, pMsg->startTime, pMgr->peerStartTime); + sInfo("vgId:%d, reset sync log repl mgr in append entries reply. peer: %" PRIx64 ", start time:%" PRId64 ", old:%" PRId64, + pNode->vgId, pMsg->srcId.addr, pMsg->startTime, pMgr->peerStartTime); syncLogReplMgrReset(pMgr); pMgr->peerStartTime = pMsg->startTime; } @@ -923,7 +924,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); - sInfo("vgId:%d, reset log buffer. pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + sInfo("vgId:%d, reset log buffer. 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); pBuf->endIndex = pBuf->matchIndex + 1; diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 2623eebc23..cfa6b5215e 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -49,6 +49,7 @@ int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) { + ASSERT(false && "deplicated"); // next index SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId); diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 1035925c2b..4ceb2c9fc9 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -881,8 +881,8 @@ int32_t syncNodeOnSnapshotReply(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { snapshotSenderStop(pSender, true); // update next-index - syncIndexMgrSetIndex(pSyncNode->pNextIndex, &(pMsg->srcId), pMsg->lastIndex + 1); - syncNodeReplicateOne(pSyncNode, &(pMsg->srcId), false); + // syncIndexMgrSetIndex(pSyncNode->pNextIndex, &(pMsg->srcId), pMsg->lastIndex + 1); + // syncNodeReplicateOne(pSyncNode, &(pMsg->srcId), false); return 0; } From 6729c130f8dafd9117f491c0caa026bbfa575022 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 24 Nov 2022 16:43:26 +0800 Subject: [PATCH 23/42] fix: leave some free space in log buffer during syncLogBufferInit --- source/libs/sync/src/syncPipeline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 891547e80f..f5f0ee7c69 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -182,7 +182,8 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* 
pBuf, SSyncNode* pNode) { } bool taken = false; - if (toIndex <= index + pBuf->size - 1) { + int emptySize = 5; + if (toIndex - index + 1 <= pBuf->size - emptySize) { SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = -1, .prevLogTerm = -1}; pBuf->entries[index % pBuf->size] = tmp; taken = true; From ff286e1f1c1ad01a7c751b799183629843d4b8e4 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Thu, 24 Nov 2022 18:37:31 +0800 Subject: [PATCH 24/42] enh: spool commit sync barrier as info in syncLogBufferCommit --- source/libs/sync/src/syncPipeline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index f5f0ee7c69..3c485b4872 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -498,7 +498,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm // execute it if (!syncUtilUserCommit(pEntry->originalRpcType)) { - sInfo("vgId:%d, raft mgmt msg in log entry. index: %" PRId64 ", term:%" PRId64 ", type: %s", vgId, pEntry->index, + sInfo("vgId:%d, commit sync barrier. index: %" PRId64 ", term:%" PRId64 ", type: %s", vgId, pEntry->index, pEntry->term, TMSG_INFO(pEntry->originalRpcType)); pBuf->commitIndex = index; if (!inBuf) { @@ -925,7 +925,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); - sInfo("vgId:%d, reset log buffer. log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + sInfo("vgId:%d, reset sync log buffer. 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); pBuf->endIndex = pBuf->matchIndex + 1; From d5ae1ca18a69006374f2a6b1eeb64ed5aea4d82f Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 25 Nov 2022 10:37:25 +0800 Subject: [PATCH 25/42] enh: make the range span of unconfirmed logs sent less than half of the size of log ring buffer --- include/libs/sync/sync.h | 2 +- source/libs/sync/src/syncPipeline.c | 43 ++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 55f67c430e..11e3cbd494 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -41,7 +41,7 @@ extern "C" { #define SNAPSHOT_WAIT_MS 1000 * 30 #define SYNC_MAX_RETRY_BACKOFF 5 -#define SYNC_LOG_REPL_RETRY_WAIT_MS 50 +#define SYNC_LOG_REPL_RETRY_WAIT_MS 100 #define SYNC_APPEND_ENTRIES_TIMEOUT_MS 10000 #define SYNC_MAX_BATCH_SIZE 1 diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 3c485b4872..1af31efd5d 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -566,33 +566,42 @@ int32_t syncLogReplMgrReset(SSyncLogReplMgr* pMgr) { return 0; } -_Atomic int64_t tsRetryCnt = 0; - int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { if (pMgr->endIndex <= pMgr->startIndex) { return 0; } + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; + if (pMgr->retryBackoff == SYNC_MAX_RETRY_BACKOFF) { + syncLogReplMgrReset(pMgr); + sWarn("vgId:%d, reset sync log repl mgr since retry backoff exceeding limit. 
peer: %" PRIx64, pNode->vgId, + pDestId->addr); + return -1; + } + int32_t ret = -1; bool retried = false; int64_t retryWaitMs = syncLogGetRetryBackoffTimeMs(pMgr); + int64_t nowMs = taosGetMonoTimestampMs(); + int count = 0; + int64_t firstIndex = -1; + SyncTerm term = -1; for (SyncIndex index = pMgr->startIndex; index < pMgr->endIndex; index++) { int64_t pos = index % pMgr->size; ASSERT(!pMgr->states[pos].barrier || (index == pMgr->startIndex || index + 1 == pMgr->endIndex)); - if (pMgr->states[pos].acked) { - continue; - } - int64_t nowMs = taosGetMonoTimestampMs(); + if (nowMs < pMgr->states[pos].timeMs + retryWaitMs) { break; } - SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; - bool barrier = false; - SyncTerm term = -1; + if (pMgr->states[pos].acked) { + continue; + } + + bool barrier = false; if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: 0x%016" PRIx64 "", pNode->vgId, + sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: %" PRIx64 "", pNode->vgId, terrstr(), index, pDestId->addr); goto _out; } @@ -601,13 +610,19 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { pMgr->states[pos].term = term; pMgr->states[pos].acked = false; retried = true; - tsRetryCnt++; + if (firstIndex == -1) firstIndex = index; + count++; } ret = 0; _out: if (retried) { pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); + sInfo("vgId:%d, resend %d raft log entries. 
dest: %" PRIx64 ", for indexes: %" PRId64 + " etc., maybe of term: %" PRId64 ", retryWaitMs: %" PRId64 ", repl mgr: [%" PRId64 " %" PRId64 ", %" PRId64 + ")", + pNode->vgId, count, pDestId->addr, firstIndex, term, retryWaitMs, pMgr->startIndex, pMgr->matchIndex, + pMgr->endIndex); } return ret; } @@ -771,9 +786,10 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p int32_t batchSize = TMAX(1, pMgr->size / 20); int32_t count = 0; int64_t nowMs = taosGetMonoTimestampMs(); + int64_t limit = pMgr->size >> 1; for (SyncIndex index = pMgr->endIndex; index <= pNode->pLogBuf->matchIndex; index++) { - if (batchSize < count++ || pMgr->startIndex + pMgr->size <= index) { + if (batchSize < count++ || limit <= index - pMgr->startIndex) { break; } if (pMgr->startIndex + 1 < index && pMgr->states[(index - 1) % pMgr->size].barrier) { @@ -800,12 +816,13 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p } } + syncLogReplMgrRetryOnNeed(pMgr, pNode); + SSyncLogBuffer* pBuf = pNode->pLogBuf; sTrace("vgId:%d, attempted to replicate %d msgs to the %d'th peer. 
pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pMgr->peerId, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - syncLogReplMgrRetryOnNeed(pMgr, pNode); return 0; } From 143a2e8552008e2a3fb21c2c0f435ae5816297ff Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 25 Nov 2022 18:56:14 +0800 Subject: [PATCH 26/42] feat: start snapshot replication to rollback in recovery mode --- source/libs/sync/src/syncAppendEntries.c | 2 +- source/libs/sync/src/syncAppendEntriesReply.c | 1 - source/libs/sync/src/syncPipeline.c | 23 +++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index d719169470..3a9e97a08f 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -188,7 +188,7 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { goto _IGNORE; } - if (pMsg->prevLogIndex + 1 != pEntry->index) { + if (pMsg->prevLogIndex + 1 != pEntry->index || pEntry->term < 0) { sError("vgId:%d, invalid previous log index in msg. 
index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 ", prevLogTerm:%" PRId64, ths->vgId, pEntry->index, pEntry->term, pMsg->prevLogIndex, pMsg->prevLogTerm); diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index b1ebd5d8d1..524abf3c2a 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -85,7 +85,6 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { sError("vgId:%d, failed to get log repl mgr for src addr: 0x%016" PRIx64, ths->vgId, pMsg->srcId.addr); return -1; } - ASSERT(pMgr != NULL); (void)syncLogReplMgrProcessReply(pMgr, ths, pMsg); } return 0; diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 1af31efd5d..f1d401759b 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -90,7 +90,7 @@ SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, S return prevLogTerm; } - if (pMgr->startIndex <= prevIndex && prevIndex < pMgr->endIndex) { + if (pMgr && pMgr->startIndex <= prevIndex && prevIndex < pMgr->endIndex) { int64_t timeMs = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].timeMs; ASSERT(timeMs != 0 && "no log entry found"); prevLogTerm = pMgr->states[(prevIndex + pMgr->size) % pMgr->size].term; @@ -277,7 +277,11 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); - ret = 0; + SyncTerm term = syncLogReplMgrGetPrevLogTerm(NULL, pNode, index + 1); + ASSERT(pEntry->term >= 0); + if (term == pEntry->term) { + ret = 0; + } goto _out; } @@ -655,7 +659,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; - if (pMsg->matchIndex == 
pMsg->lastSendIndex) { + if (pMsg->success && pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), repl mgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", @@ -664,6 +668,17 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod return 0; } + if (pMsg->success == false && pMsg->matchIndex >= pMsg->lastSendIndex) { + sError("vgId:%d, failed to rollback match index. peer: %s:%d, match index: %" PRId64 ", last sent: %" PRId64, pNode->vgId, + host, port, pMsg->matchIndex, pMsg->lastSendIndex); + if (syncNodeStartSnapshot(pNode, &destId) < 0) { + sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); + return -1; + } + sInfo("vgId:%d, snapshot replication to rollback. peer: %s:%d", pNode->vgId, host, port); + return 0; + } + (void)syncLogReplMgrReset(pMgr); } @@ -681,7 +696,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); return -1; } - sInfo("vgId:%d, snapshot replication to peer %s:%d started", pNode->vgId, host, port); + sInfo("vgId:%d, snapshot replication to peer %s:%d", pNode->vgId, host, port); return 0; } From b63afcd52f64fcd30c6fdd33a749ae000085fe5b Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 25 Nov 2022 23:32:32 +0800 Subject: [PATCH 27/42] enh: reset sync log repl mgr on snapshot ending --- source/libs/sync/src/syncSnapshot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 4ceb2c9fc9..8a8af1468e 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -879,11 +879,10 @@ int32_t syncNodeOnSnapshotReply(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { // receive ack is finish, close 
sender if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) { snapshotSenderStop(pSender, true); - - // update next-index - // syncIndexMgrSetIndex(pSyncNode->pNextIndex, &(pMsg->srcId), pMsg->lastIndex + 1); - // syncNodeReplicateOne(pSyncNode, &(pMsg->srcId), false); - + SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(pSyncNode, &pMsg->srcId); + if (pMgr) { + syncLogReplMgrReset(pMgr); + } return 0; } From 736a1cc017126af1f5fc8754f13bb187b07ac6c7 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 25 Nov 2022 23:35:28 +0800 Subject: [PATCH 28/42] enh: adjust some info msgs for raft pipelining --- source/libs/sync/src/syncMain.c | 8 ++-- source/libs/sync/src/syncPipeline.c | 64 ++++++++++++++--------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 47eab279a5..5a1acb4731 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -89,7 +89,7 @@ int32_t syncStart(int64_t rid) { } if (syncNodeRestore(pSyncNode) < 0) { - sError("vgId:%d, failed to restore raft log buffer since %s", pSyncNode->vgId, terrstr()); + sError("vgId:%d, failed to restore sync log buffer since %s", pSyncNode->vgId, terrstr()); goto _err; } @@ -847,7 +847,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { // create raft log ring buffer pSyncNode->pLogBuf = syncLogBufferCreate(); if (pSyncNode->pLogBuf == NULL) { - sError("failed to init log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId); + sError("failed to init sync log buffer since %s. 
vgId:%d", terrstr(), pSyncNode->vgId); goto _error; } @@ -1060,7 +1060,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { // init log buffer if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) { - sError("vgId:%d, failed to init raft log buffer since %s", pSyncNode->vgId, terrstr()); + sError("vgId:%d, failed to init sync log buffer since %s", pSyncNode->vgId, terrstr()); goto _error; } @@ -2239,7 +2239,7 @@ int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHand int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // append to log buffer if (syncLogBufferAppend(ths->pLogBuf, ths, pEntry) < 0) { - sError("vgId:%d, failed to enqueue log buffer. index:%" PRId64 "", ths->vgId, pEntry->index); + sError("vgId:%d, failed to enqueue sync log buffer. index:%" PRId64 "", ths->vgId, pEntry->index); return -1; } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index f1d401759b..cfea6e8bc6 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -38,7 +38,7 @@ int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncIndex index = pEntry->index; if (index - pBuf->startIndex >= pBuf->size) { - sError("vgId:%d, failed to append due to log buffer full. index:%" PRId64 "", pNode->vgId, index); + sError("vgId:%d, failed to append due to sync log buffer full. index:%" PRId64 "", pNode->vgId, index); goto _out; } @@ -49,7 +49,7 @@ int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt // initial log buffer with at least one item, e.g. 
commitIndex SSyncRaftEntry* pMatch = pBuf->entries[(index - 1 + pBuf->size) % pBuf->size].pItem; - ASSERT(pMatch != NULL && "no matched raft log entry"); + ASSERT(pMatch != NULL && "no matched log entry"); ASSERT(pMatch->index + 1 == index); SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = pMatch->index, .prevLogTerm = pMatch->term}; @@ -273,8 +273,8 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncTerm lastMatchTerm = syncLogBufferGetLastMatchTerm(pBuf); if (index <= pBuf->commitIndex) { - sTrace("vgId:%d, raft entry already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sTrace("vgId:%d, already committed. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 + " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); SyncTerm term = syncLogReplMgrGetPrevLogTerm(NULL, pNode, index + 1); @@ -286,15 +286,15 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt } if (index - pBuf->startIndex >= pBuf->size) { - sInfo("vgId:%d, raft entry out of buffer capacity. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 - " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, out of buffer range. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 + " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); goto _out; } if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { - sInfo("vgId:%d, not ready to accept raft entry. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 + sInfo("vgId:%d, not ready to accept. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 " != lastmatch: %" PRId64 ". 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -309,7 +309,7 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt if (pEntry->term != pExist->term) { (void)syncLogBufferRollback(pBuf, index); } else { - sDebug("vgId:%d, duplicate raft entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + sDebug("vgId:%d, duplicate log entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -349,7 +349,7 @@ int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) { ASSERT(pEntry->index == lastVer + 1); if (pLogStore->syncLogAppendEntry(pLogStore, pEntry) < 0) { - sError("failed to append raft log entry since %s. index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index, + sError("failed to append sync log entry since %s. index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index, pEntry->term); return -1; } @@ -392,7 +392,7 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncTerm* p if (pMatch->term != prevLogTerm) { sInfo( - "vgId:%d, mismatching raft log entries encountered. " + "vgId:%d, mismatching sync log entries encountered. " "{ index:%" PRId64 ", term:%" PRId64 " } " "{ index:%" PRId64 ", term:%" PRId64 ", prevLogIndex:%" PRId64 ", prevLogTerm:%" PRId64 " } ", @@ -411,8 +411,8 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncTerm* p // persist if (syncLogStorePersist(pLogStore, pEntry) < 0) { - sError("vgId:%d, failed to persist raft log entry from log buffer since %s. index:%" PRId64, pNode->vgId, - terrstr(), pEntry->index); + sError("vgId:%d, failed to persist sync log entry from buffer since %s. 
index:%" PRId64, pNode->vgId, terrstr(), + pEntry->index); goto _out; } ASSERT(pEntry->index == pBuf->matchIndex); @@ -482,15 +482,14 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm bool inBuf = false; if (commitIndex <= pBuf->commitIndex) { - sDebug("vgId:%d, stale commit update. current:%" PRId64 ", notified:%" PRId64 "", vgId, pBuf->commitIndex, + sDebug("vgId:%d, stale commit index. current:%" PRId64 ", notified:%" PRId64 "", vgId, pBuf->commitIndex, commitIndex); ret = 0; goto _out; } - sTrace("vgId:%d, log buffer info. role: %d, term: %" PRId64 ". start index:%" PRId64 ", commit index:%" PRId64 - ", match index: %" PRId64 ", end index:%" PRId64 "", - pNode->vgId, role, term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sTrace("vgId:%d, commit. log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 "), role: %d, term: %" PRId64, + pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex, role, term); // execute in fsm for (int64_t index = pBuf->commitIndex + 1; index <= upperIndex; index++) { @@ -513,7 +512,7 @@ int32_t syncLogBufferCommit(SSyncLogBuffer* pBuf, SSyncNode* pNode, int64_t comm } if (syncLogFsmExecute(pNode, pFsm, role, term, pEntry) != 0) { - sError("vgId:%d, failed to execute raft entry in FSM. log index:%" PRId64 ", term:%" PRId64 "", vgId, + sError("vgId:%d, failed to execute sync log entry in FSM. log index:%" PRId64 ", term:%" PRId64 "", vgId, pEntry->index, pEntry->term); goto _out; } @@ -605,7 +604,7 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { bool barrier = false; if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, pDestId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. index: %" PRId64 ", dest: %" PRIx64 "", pNode->vgId, + sError("vgId:%d, failed to replicate sync log entry since %s. 
index: %" PRId64 ", dest: %" PRIx64 "", pNode->vgId, terrstr(), index, pDestId->addr); goto _out; } @@ -622,9 +621,8 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { _out: if (retried) { pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); - sInfo("vgId:%d, resend %d raft log entries. dest: %" PRIx64 ", for indexes: %" PRId64 - " etc., maybe of term: %" PRId64 ", retryWaitMs: %" PRId64 ", repl mgr: [%" PRId64 " %" PRId64 ", %" PRId64 - ")", + sInfo("vgId:%d, resend %d sync log entries. dest: %" PRIx64 ", indexes: %" PRId64 " ..., likely term: %" PRId64 + ", retryWaitMs: %" PRId64 ", repl mgr: [%" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pDestId->addr, firstIndex, term, retryWaitMs, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); } @@ -645,8 +643,8 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), repl mgr(rs:%d): [%" PRId64 " %" PRId64 - ", %" PRId64 "), log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), mgr: rs(%d) [%" PRId64 " %" PRId64 + ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; @@ -661,21 +659,21 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod if (pMsg->success && pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), repl mgr(rs:%d): [%" PRId64 " %" PRId64 - ", %" PRId64 "), log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, sync log repl mgr restored. 
peer: %s:%d (%" PRIx64 "), mgr: rs(%d) [%" PRId64 " %" PRId64 + ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, host, port, destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } if (pMsg->success == false && pMsg->matchIndex >= pMsg->lastSendIndex) { - sError("vgId:%d, failed to rollback match index. peer: %s:%d, match index: %" PRId64 ", last sent: %" PRId64, pNode->vgId, - host, port, pMsg->matchIndex, pMsg->lastSendIndex); + sError("vgId:%d, failed to rollback match index. peer: %s:%d, match index: %" PRId64 ", last sent: %" PRId64, + pNode->vgId, host, port, pMsg->matchIndex, pMsg->lastSendIndex); if (syncNodeStartSnapshot(pNode, &destId) < 0) { sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); return -1; } - sInfo("vgId:%d, snapshot replication to rollback. peer: %s:%d", pNode->vgId, host, port); + sInfo("vgId:%d, snapshot replication to peer %s:%d", pNode->vgId, host, port); return 0; } @@ -748,7 +746,8 @@ int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sync SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); if (pMsg->startTime != pMgr->peerStartTime) { - sInfo("vgId:%d, reset sync log repl mgr in append entries reply. peer: %" PRIx64 ", start time:%" PRId64 ", old:%" PRId64, + sInfo("vgId:%d, reset sync log repl mgr in appendlog reply. 
peer: %" PRIx64 ", start time:%" PRId64 + ", old:%" PRId64, pNode->vgId, pMsg->srcId.addr, pMsg->startTime, pMgr->peerStartTime); syncLogReplMgrReset(pMgr); pMgr->peerStartTime = pMsg->startTime; @@ -793,8 +792,6 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode return 0; } -_Atomic int64_t tsSendCnt = 0; - int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { ASSERT(pMgr->restored); SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; @@ -825,7 +822,6 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p pMgr->states[pos].acked = false; pMgr->endIndex = index + 1; - tsSendCnt++; if (barrier) { break; } @@ -957,7 +953,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { (void)syncLogBufferRollback(pBuf, pBuf->matchIndex + 1); - sInfo("vgId:%d, reset sync log buffer. log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + sInfo("vgId:%d, reset sync log buffer. buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); pBuf->endIndex = pBuf->matchIndex + 1; @@ -1003,7 +999,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Syn if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(pNode, pDestId); if (pMgr) { - sInfo("vgId:%d, reset sync log repl mgr for peer: 0x%016" PRIx64 " since %s. index: %" PRId64, pNode->vgId, + sInfo("vgId:%d, reset sync log repl mgr of peer: %" PRIx64 " since %s. 
index: %" PRId64, pNode->vgId, pDestId->addr, terrstr(), index); (void)syncLogReplMgrReset(pMgr); } From 082428acab600583f5afc37754cb573af0a9f41f Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Sat, 26 Nov 2022 12:46:51 +0800 Subject: [PATCH 29/42] enh: reduce the number of probing msgs under stress in recovery mode --- source/libs/sync/inc/syncPipeline.h | 2 +- source/libs/sync/src/syncPipeline.c | 52 +++++++++++++++-------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index ca07876def..e7980bc2dd 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -87,7 +87,7 @@ int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, Sy SRaftId* pDestId, bool* pBarrier); int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); -int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg); diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index cfea6e8bc6..b4f2541f9c 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -612,6 +612,7 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { pMgr->states[pos].timeMs = nowMs; pMgr->states[pos].term = term; pMgr->states[pos].acked = false; + retried = true; if (firstIndex == 
-1) firstIndex = index; count++; @@ -658,6 +659,7 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; if (pMsg->success && pMsg->matchIndex == pMsg->lastSendIndex) { + pMgr->matchIndex = pMsg->matchIndex; pMgr->restored = true; sInfo("vgId:%d, sync log repl mgr restored. peer: %s:%d (%" PRIx64 "), mgr: rs(%d) [%" PRId64 " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", @@ -667,8 +669,8 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod } if (pMsg->success == false && pMsg->matchIndex >= pMsg->lastSendIndex) { - sError("vgId:%d, failed to rollback match index. peer: %s:%d, match index: %" PRId64 ", last sent: %" PRId64, - pNode->vgId, host, port, pMsg->matchIndex, pMsg->lastSendIndex); + sWarn("vgId:%d, failed to rollback match index. peer: %s:%d, match index: %" PRId64 ", last sent: %" PRId64, + pNode->vgId, host, port, pMsg->matchIndex, pMsg->lastSendIndex); if (syncNodeStartSnapshot(pNode, &destId) < 0) { sError("vgId:%d, failed to start snapshot for peer %s:%d", pNode->vgId, host, port); return -1; @@ -676,8 +678,6 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod sInfo("vgId:%d, snapshot replication to peer %s:%d", pNode->vgId, host, port); return 0; } - - (void)syncLogReplMgrReset(pMgr); } // check last match term @@ -709,24 +709,8 @@ int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNod } // attempt to replicate the raft log at index - bool barrier = false; - ASSERT(index >= 0); - if (syncLogBufferReplicateOneTo(pMgr, pNode, index, &term, &destId, &barrier) < 0) { - sError("vgId:%d, failed to replicate log entry since %s. 
index: %" PRId64 ", peer %s:%d", pNode->vgId, - terrstr(), index, host, port); - return -1; - } - - int64_t nowMs = taosGetMonoTimestampMs(); - pMgr->states[index % pMgr->size].barrier = barrier; - pMgr->states[index % pMgr->size].timeMs = nowMs; - pMgr->states[index % pMgr->size].term = term; - pMgr->states[index % pMgr->size].acked = false; - - pMgr->matchIndex = index; - pMgr->startIndex = index; - pMgr->endIndex = index + 1; - return 0; + (void)syncLogReplMgrReset(pMgr); + return syncLogReplMgrReplicateProbeOnce(pMgr, pNode, index); } int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg) { @@ -766,14 +750,23 @@ int32_t syncLogReplMgrReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { if (pMgr->restored) { (void)syncLogReplMgrReplicateAttemptedOnce(pMgr, pNode); } else { - (void)syncLogReplMgrReplicateProbeOnce(pMgr, pNode); + (void)syncLogReplMgrReplicateProbeOnce(pMgr, pNode, pNode->pLogBuf->matchIndex); } return 0; } -int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { +int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index) { ASSERT(!pMgr->restored); - SyncIndex index = pNode->pLogBuf->matchIndex; + ASSERT(pMgr->startIndex >= 0); + int64_t retryMaxWaitMs = SYNC_LOG_REPL_RETRY_WAIT_MS * (1 << SYNC_MAX_RETRY_BACKOFF); + int64_t nowMs = taosGetMonoTimestampMs(); + + if (pMgr->endIndex > pMgr->startIndex && + nowMs < pMgr->states[pMgr->startIndex % pMgr->size].timeMs + retryMaxWaitMs) { + return 0; + } + (void)syncLogReplMgrReset(pMgr); + SRaftId* pDestId = &pNode->replicasId[pMgr->peerId]; bool barrier = false; SyncTerm term = -1; @@ -783,6 +776,15 @@ int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode return -1; } + ASSERT(index >= 0); + pMgr->states[index % pMgr->size].barrier = barrier; + pMgr->states[index % pMgr->size].timeMs = nowMs; + pMgr->states[index % pMgr->size].term = term; + 
pMgr->states[index % pMgr->size].acked = false; + + pMgr->startIndex = index; + pMgr->endIndex = index + 1; + SSyncLogBuffer* pBuf = pNode->pLogBuf; sTrace("vgId:%d, attempted to probe the %d'th peer with msg of index:%" PRId64 " term: %" PRId64 ". pMgr(rs:%d): [%" PRId64 " %" PRId64 ", %" PRId64 "), pBuf: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 From 012bcf1a5d89af225cb5bc42f27f631813a9e82b Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Sat, 26 Nov 2022 13:31:43 +0800 Subject: [PATCH 30/42] enh: adjust logging levels in syncLogBufferAccept, e.g. sInfo to sWarn --- source/libs/sync/src/syncPipeline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index b4f2541f9c..994d62ca73 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -286,7 +286,7 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt } if (index - pBuf->startIndex >= pBuf->size) { - sInfo("vgId:%d, out of buffer range. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 + sWarn("vgId:%d, out of buffer range. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -294,7 +294,7 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt } if (index > pBuf->matchIndex && lastMatchTerm != prevTerm) { - sInfo("vgId:%d, not ready to accept. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 + sWarn("vgId:%d, not ready to accept. index: %" PRId64 ", term: %" PRId64 ": prevterm: %" PRId64 " != lastmatch: %" PRId64 ". 
log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -309,7 +309,7 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt if (pEntry->term != pExist->term) { (void)syncLogBufferRollback(pBuf, index); } else { - sDebug("vgId:%d, duplicate log entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 + sTrace("vgId:%d, duplicate log entry received. index: %" PRId64 ", term: %" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pEntry->index, pEntry->term, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -622,7 +622,7 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { _out: if (retried) { pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); - sInfo("vgId:%d, resend %d sync log entries. dest: %" PRIx64 ", indexes: %" PRId64 " ..., likely term: %" PRId64 + sInfo("vgId:%d, resend %d sync log entries. dest: %" PRIx64 ", indexes: %" PRId64 " ..., terms: ... 
%" PRId64 ", retryWaitMs: %" PRId64 ", repl mgr: [%" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pDestId->addr, firstIndex, term, retryWaitMs, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); From a01f34fd83fcb7c8977a1af8dde91866df8dd0ef Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Sat, 26 Nov 2022 16:20:08 +0800 Subject: [PATCH 31/42] fix: clear sync log buffer in syncLogBufferDestroy --- source/libs/sync/src/syncPipeline.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 994d62ca73..bf8f830185 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -622,7 +622,7 @@ int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { _out: if (retried) { pMgr->retryBackoff = syncLogGetNextRetryBackoff(pMgr); - sInfo("vgId:%d, resend %d sync log entries. dest: %" PRIx64 ", indexes: %" PRId64 " ..., terms: ... %" PRId64 + sInfo("vgId:%d, resent %d sync log entries. dest: %" PRIx64 ", indexes: %" PRId64 " ..., terms: ... 
%" PRId64 ", retryWaitMs: %" PRId64 ", repl mgr: [%" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pDestId->addr, firstIndex, term, retryWaitMs, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); @@ -919,10 +919,24 @@ _err: return NULL; } +void syncLogBufferClear(SSyncLogBuffer* pBuf) { + taosThreadMutexLock(&pBuf->mutex); + for (SyncIndex index = pBuf->startIndex; index < pBuf->endIndex; index++) { + SSyncRaftEntry* pEntry = pBuf->entries[(index + pBuf->size) % pBuf->size].pItem; + if (pEntry == NULL) continue; + syncEntryDestroy(pEntry); + pEntry = NULL; + memset(&pBuf->entries[(index + pBuf->size) % pBuf->size], 0, sizeof(pBuf->entries[0])); + } + pBuf->startIndex = pBuf->commitIndex = pBuf->matchIndex = pBuf->endIndex = 0; + taosThreadMutexUnlock(&pBuf->mutex); +} + void syncLogBufferDestroy(SSyncLogBuffer* pBuf) { if (pBuf == NULL) { return; } + syncLogBufferClear(pBuf); (void)taosThreadMutexDestroy(&pBuf->mutex); (void)taosMemoryFree(pBuf); return; From 4edef438eabb3c050ab36f84ba9f294ad3fa9909 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Sat, 26 Nov 2022 21:53:38 +0800 Subject: [PATCH 32/42] enh: transfer ownership of msgs while committing sync log entries --- include/libs/sync/sync.h | 10 +++++----- source/dnode/mgmt/mgmt_mnode/src/mmWorker.c | 2 ++ source/dnode/mgmt/mgmt_qnode/src/qmWorker.c | 2 ++ source/dnode/mgmt/mgmt_snode/src/smWorker.c | 1 + source/dnode/mgmt/mgmt_vnode/src/vmWorker.c | 2 +- source/dnode/mnode/impl/src/mndSync.c | 12 +++++++----- source/dnode/vnode/src/vnd/vnodeSync.c | 21 ++++++++------------- source/libs/sync/src/syncPipeline.c | 1 + source/libs/sync/src/syncRaftEntry.c | 2 +- 9 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 7fb3a890cb..00935f280a 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -134,13 +134,13 @@ typedef struct SSnapshotMeta { typedef struct SSyncFSM { void* data; - void (*FpCommitCb)(const struct 
SSyncFSM* pFsm, const SRpcMsg* pMsg, const SFsmCbMeta* pMeta); - void (*FpPreCommitCb)(const struct SSyncFSM* pFsm, const SRpcMsg* pMsg, const SFsmCbMeta* pMeta); - void (*FpRollBackCb)(const struct SSyncFSM* pFsm, const SRpcMsg* pMsg, const SFsmCbMeta* pMeta); + void (*FpCommitCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); + void (*FpPreCommitCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); + void (*FpRollBackCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); void (*FpRestoreFinishCb)(const struct SSyncFSM* pFsm); - void (*FpReConfigCb)(const struct SSyncFSM* pFsm, const SRpcMsg* pMsg, const SReConfigCbMeta* pMeta); - void (*FpLeaderTransferCb)(const struct SSyncFSM* pFsm, const SRpcMsg* pMsg, const SFsmCbMeta* pMeta); + void (*FpReConfigCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SReConfigCbMeta* pMeta); + void (*FpLeaderTransferCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); bool (*FpApplyQueueEmptyCb)(const struct SSyncFSM* pFsm); int32_t (*FpApplyQueueItems)(const struct SSyncFSM* pFsm); diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c b/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c index d3d92e1bbf..212de0bfb4 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c @@ -162,11 +162,13 @@ int32_t mmPutMsgToQueue(SMnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { SRpcMsg *pMsg = taosAllocateQitem(sizeof(SRpcMsg), RPC_QITEM); if (pMsg == NULL) return -1; memcpy(pMsg, pRpc, sizeof(SRpcMsg)); + pRpc->pCont = NULL; dTrace("msg:%p, is created and will put into %s queue, type:%s", pMsg, pWorker->name, TMSG_INFO(pRpc->msgType)); int32_t code = mmPutMsgToWorker(pMgmt, pWorker, pMsg); if (code != 0) { dTrace("msg:%p, is freed", pMsg); + rpcFreeCont(pMsg->pCont); taosFreeQitem(pMsg); } return code; diff --git a/source/dnode/mgmt/mgmt_qnode/src/qmWorker.c b/source/dnode/mgmt/mgmt_qnode/src/qmWorker.c 
index edbe9882a4..3e5ad65db7 100644 --- a/source/dnode/mgmt/mgmt_qnode/src/qmWorker.c +++ b/source/dnode/mgmt/mgmt_qnode/src/qmWorker.c @@ -61,6 +61,7 @@ int32_t qmPutRpcMsgToQueue(SQnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { SRpcMsg *pMsg = taosAllocateQitem(sizeof(SRpcMsg), RPC_QITEM); if (pMsg == NULL) return -1; memcpy(pMsg, pRpc, sizeof(SRpcMsg)); + pRpc->pCont = NULL; switch (qtype) { case QUERY_QUEUE: @@ -74,6 +75,7 @@ int32_t qmPutRpcMsgToQueue(SQnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { return 0; default: terrno = TSDB_CODE_INVALID_PARA; + rpcFreeCont(pMsg->pCont); taosFreeQitem(pMsg); return -1; } diff --git a/source/dnode/mgmt/mgmt_snode/src/smWorker.c b/source/dnode/mgmt/mgmt_snode/src/smWorker.c index f6942c8114..2d2a121795 100644 --- a/source/dnode/mgmt/mgmt_snode/src/smWorker.c +++ b/source/dnode/mgmt/mgmt_snode/src/smWorker.c @@ -151,6 +151,7 @@ int32_t smPutMsgToQueue(SSnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { pHead->contLen = htonl(pHead->contLen); pHead->vgId = SNODE_HANDLE; memcpy(pMsg, pRpc, sizeof(SRpcMsg)); + pRpc->pCont = NULL; switch (qtype) { case STREAM_QUEUE: diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c index 4aa07cad98..08ea880b97 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c @@ -246,12 +246,12 @@ int32_t vmPutRpcMsgToQueue(SVnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { pHead->contLen = htonl(pHead->contLen); pHead->vgId = htonl(pHead->vgId); memcpy(pMsg, pRpc, sizeof(SRpcMsg)); + pRpc->pCont = NULL; int32_t code = vmPutMsgToQueue(pMgmt, pMsg, qtype); if (code != 0) { dTrace("msg:%p, is freed", pMsg); rpcFreeCont(pMsg->pCont); - pRpc->pCont = NULL; taosFreeQitem(pMsg); } diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 320cc10d40..10cd6416b4 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c 
@@ -72,15 +72,11 @@ static int32_t mndSyncSendMsg(const SEpSet *pEpSet, SRpcMsg *pMsg) { return code; } -void mndSyncCommitMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { +void mndProcessWriteMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { SMnode *pMnode = pFsm->data; SSyncMgmt *pMgmt = &pMnode->syncMgmt; SSdbRaw *pRaw = pMsg->pCont; - // delete msg handle - SRpcMsg rpcMsg = {0}; - rpcMsg.info = pMsg->info; - int32_t transId = sdbGetIdFromRaw(pMnode->pSdb, pRaw); pMgmt->errCode = pMeta->code; mInfo("trans:%d, is proposed, saved:%d code:0x%x, apply index:%" PRId64 " term:%" PRIu64 " config:%" PRId64 @@ -120,6 +116,12 @@ void mndSyncCommitMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMet } } +void mndSyncCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { + mndProcessWriteMsg(pFsm, pMsg, pMeta); + rpcFreeCont(pMsg->pCont); + pMsg->pCont = NULL; +} + int32_t mndSyncGetSnapshot(const SSyncFSM *pFsm, SSnapshot *pSnapshot, void *pReaderParam, void **ppReader) { mInfo("start to read snapshot from sdb in atomic way"); SMnode *pMnode = pFsm->data; diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index bc6eb81275..f5c1e1b169 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -295,36 +295,31 @@ static int32_t vnodeSyncGetSnapshot(const SSyncFSM *pFsm, SSnapshot *pSnapshot) return 0; } -static void vnodeSyncApplyMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { +static void vnodeSyncApplyMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { SVnode *pVnode = pFsm->data; - - SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen}; - rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen); - memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen); - rpcMsg.info = pMsg->info; - rpcMsg.info.conn.applyIndex = pMeta->index; - rpcMsg.info.conn.applyTerm = pMeta->term; + 
pMsg->info.conn.applyIndex = pMeta->index; + pMsg->info.conn.applyTerm = pMeta->term; const STraceId *trace = &pMsg->info.traceId; vGTrace("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", term:%" PRIu64 ", msg-index:%" PRId64 ", weak:%d, code:%d, state:%d %s, type:%s", - pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, rpcMsg.info.conn.applyIndex, pMeta->isWeak, pMeta->code, + pVnode->config.vgId, pFsm, pMeta->index, pMeta->term, pMsg->info.conn.applyIndex, pMeta->isWeak, pMeta->code, pMeta->state, syncStr(pMeta->state), TMSG_INFO(pMsg->msgType)); - tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg); + tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, pMsg); } -static void vnodeSyncCommitMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { +static void vnodeSyncCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { vnodeSyncApplyMsg(pFsm, pMsg, pMeta); } -static void vnodeSyncPreCommitMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { +static void vnodeSyncPreCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { if (pMeta->isWeak == 1) { vnodeSyncApplyMsg(pFsm, pMsg, pMeta); } } -static void vnodeSyncRollBackMsg(const SSyncFSM *pFsm, const SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { +static void vnodeSyncRollBackMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { SVnode *pVnode = pFsm->data; vTrace("vgId:%d, rollback-cb is excuted, fsm:%p, index:%" PRId64 ", weak:%d, code:%d, state:%d %s, type:%s", pVnode->config.vgId, pFsm, pMeta->index, pMeta->isWeak, pMeta->code, pMeta->state, syncStr(pMeta->state), diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index bf8f830185..2000379160 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -455,6 +455,7 @@ int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, Syn (void)syncRespMgrGetAndDel(pNode->pSyncRespMgr, 
cbMeta.seqNum, &rpcMsg.info); pFsm->FpCommitCb(pFsm, &rpcMsg, &cbMeta); + ASSERT(rpcMsg.pCont == NULL); return 0; } diff --git a/source/libs/sync/src/syncRaftEntry.c b/source/libs/sync/src/syncRaftEntry.c index 959e00fcde..988a86cc67 100644 --- a/source/libs/sync/src/syncRaftEntry.c +++ b/source/libs/sync/src/syncRaftEntry.c @@ -20,7 +20,7 @@ SSyncRaftEntry* syncEntryBuild(int32_t dataLen) { int32_t bytes = sizeof(SSyncRaftEntry) + dataLen; - SSyncRaftEntry* pEntry = taosMemoryMalloc(bytes); + SSyncRaftEntry* pEntry = taosMemoryCalloc(1, bytes); if (pEntry == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; return NULL; From 5f4fb90f699993098ce2ca91397375f294370f06 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 28 Nov 2022 13:47:07 +0800 Subject: [PATCH 33/42] enh: reduce retryBackoff if first timeMs less than last one for half the current retryWaitMs --- source/libs/sync/src/syncPipeline.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 2000379160..00f3791ea2 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -843,6 +843,14 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg) { ASSERT(pMgr->restored == true); if (pMgr->startIndex <= pMsg->lastSendIndex && pMsg->lastSendIndex < pMgr->endIndex) { + if (pMgr->startIndex < pMgr->matchIndex && pMgr->retryBackoff > 0) { + int64_t firstSentMs = pMgr->states[pMgr->startIndex % pMgr->size].timeMs; + int64_t lastSentMs = pMgr->states[(pMgr->endIndex - 1) % pMgr->size].timeMs; + int64_t timeDiffMs = lastSentMs - firstSentMs; + if (timeDiffMs > 0 && timeDiffMs < (SYNC_LOG_REPL_RETRY_WAIT_MS << (pMgr->retryBackoff - 1))) { + pMgr->retryBackoff -= 1; + } + } pMgr->states[pMsg->lastSendIndex % pMgr->size].acked = true; pMgr->matchIndex = 
TMAX(pMgr->matchIndex, pMsg->matchIndex); for (SyncIndex index = pMgr->startIndex; index < pMgr->matchIndex; index++) { From 8475aaaaf1efd95b8cdc4928848ff3778e053ce2 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 28 Nov 2022 13:48:57 +0800 Subject: [PATCH 34/42] enh: add ASSERT info msg in tsdbStartCommit --- source/dnode/vnode/src/tsdb/tsdbCommit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbCommit.c b/source/dnode/vnode/src/tsdb/tsdbCommit.c index 520ae36c73..391e10e223 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCommit.c +++ b/source/dnode/vnode/src/tsdb/tsdbCommit.c @@ -811,7 +811,7 @@ static int32_t tsdbStartCommit(STsdb *pTsdb, SCommitter *pCommitter) { int32_t lino = 0; memset(pCommitter, 0, sizeof(*pCommitter)); - ASSERT(pTsdb->mem && pTsdb->imem == NULL); + ASSERT(pTsdb->mem && pTsdb->imem == NULL && "last tsdb commit incomplete"); taosThreadRwlockWrlock(&pTsdb->rwLock); pTsdb->imem = pTsdb->mem; From d52e7319535c59b287950ebbf4299863e834db3b Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 29 Nov 2022 09:34:28 +0800 Subject: [PATCH 35/42] fix: define CLOCK_MONOTONIC for Win OS --- include/os/osTime.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/os/osTime.h b/include/os/osTime.h index 88eabd206d..0a0a54119b 100644 --- a/include/os/osTime.h +++ b/include/os/osTime.h @@ -35,6 +35,7 @@ extern "C" { #ifdef WINDOWS #define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 0 #define MILLISECOND_PER_SECOND (1000i64) #else From 4f37947e42ab6f6e37b6759292495005bfec539f Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 29 Nov 2022 17:15:48 +0800 Subject: [PATCH 36/42] fix: minimum commitTerm as 0 in syncLogBufferInit --- source/libs/sync/inc/syncPipeline.h | 13 ++++--------- source/libs/sync/src/syncPipeline.c | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index e7980bc2dd..a59b14259a 100644 
--- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -74,22 +74,18 @@ static FORCE_INLINE int32_t syncLogGetNextRetryBackoff(SSyncLogReplMgr* pMgr) { return TMIN(pMgr->retryBackoff + 1, SYNC_MAX_RETRY_BACKOFF); } -static FORCE_INLINE int32_t syncLogReplMgrUpdateTerm(SSyncLogReplMgr* pMgr, SyncIndex index, SyncTerm term) { - if (pMgr->endIndex == 0) return -1; - ASSERT(pMgr->startIndex <= index && index < pMgr->endIndex); - pMgr->states[(index + pMgr->size) % pMgr->size].term = term; - return 0; -} - SyncTerm syncLogReplMgrGetPrevLogTerm(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); + int32_t syncLogReplMgrReplicateOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogBufferReplicateOneTo(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index, SyncTerm* pTerm, SRaftId* pDestId, bool* pBarrier); -int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode); int32_t syncLogReplMgrReplicateProbeOnce(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex index); + +int32_t syncLogReplMgrProcessReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrProcessReplyInRecoveryMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); int32_t syncLogReplMgrProcessReplyInNormalMode(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEntriesReply* pMsg); + int32_t syncLogReplMgrProcessHeartbeatReply(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncHeartbeatReply* pMsg); int32_t syncLogReplMgrRetryOnNeed(SSyncLogReplMgr* pMgr, SSyncNode* pNode); @@ -111,7 +107,6 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode); SSyncRaftEntry* syncLogBufferGetOneEntry(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncIndex index, bool* pInBuf); int32_t syncLogBufferValidate(SSyncLogBuffer* pBuf); int32_t syncLogBufferRollback(SSyncLogBuffer* 
pBuf, SyncIndex toIndex); -int32_t syncLogBufferReplicate(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm); #ifdef __cplusplus } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 00f3791ea2..1fc6798471 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -150,7 +150,7 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) { goto _err; } SyncIndex commitIndex = snapshot.lastApplyIndex; - SyncTerm commitTerm = snapshot.lastApplyTerm; + SyncTerm commitTerm = TMAX(snapshot.lastApplyTerm, 0); if (syncLogValidateAlignmentOfCommit(pNode, commitIndex)) { terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; goto _err; From f1991ac88271541ecab48be0d7674fd4dd40395b Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 29 Nov 2022 17:19:47 +0800 Subject: [PATCH 37/42] enh: improve runOneLinux and TDTestCase of python test scripts --- tests/pytest/util/cases.py | 2 +- tests/system-test/0-others/taosdShell.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pytest/util/cases.py b/tests/pytest/util/cases.py index 3eb5cf2548..82aa552359 100644 --- a/tests/pytest/util/cases.py +++ b/tests/pytest/util/cases.py @@ -63,7 +63,7 @@ class TDCases: tdLog.info("total %d Linux test case(s) executed" % (runNum)) - def runOneLinux(self, conn, fileName, replicaVar): + def runOneLinux(self, conn, fileName, replicaVar=1): testModule = self.__dynamicLoadModule(fileName) runNum = 0 diff --git a/tests/system-test/0-others/taosdShell.py b/tests/system-test/0-others/taosdShell.py index 581448a6d9..7ad7e4d0ef 100644 --- a/tests/system-test/0-others/taosdShell.py +++ b/tests/system-test/0-others/taosdShell.py @@ -166,7 +166,7 @@ class TDTestCase: # keyDict['c'] = cfgPath # keyDict['P'] = self.serverPort tdDnodes=cluster.dnodes - for i in range(5): + for i in range(len(tdDnodes)): tdDnodes[i].stoptaosd() From 
decb17fcb16c38f548bd2a1bed1258ba1400038d Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 29 Nov 2022 20:21:33 +0800 Subject: [PATCH 38/42] fix: use recursive mutex for relocking ring log buffer in syncNodeDoConfigChange --- source/libs/sync/inc/syncPipeline.h | 1 + source/libs/sync/src/syncCommit.c | 1 + source/libs/sync/src/syncMain.c | 2 +- source/libs/sync/src/syncPipeline.c | 16 +++++++++++++++- source/libs/sync/src/syncReplication.c | 2 +- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index a59b14259a..4208d40a69 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -55,6 +55,7 @@ typedef struct SSyncLogBuffer { int64_t endIndex; int64_t size; TdThreadMutex mutex; + TdThreadMutexAttr attr; } SSyncLogBuffer; // SSyncLogRepMgr diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index 3377efe12c..07b1101256 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -84,6 +84,7 @@ void syncOneReplicaAdvance(SSyncNode* pSyncNode) { } void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { + ASSERT(false && "deprecated"); if (pSyncNode == NULL) { sError("pSyncNode is NULL"); return; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 2b72c8d287..d809c28090 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1602,7 +1602,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde // Raft 3.6.2 Committing entries from previous terms syncNodeAppendNoop(pSyncNode); - syncMaybeAdvanceCommitIndex(pSyncNode); + // syncMaybeAdvanceCommitIndex(pSyncNode); } else { syncNodeBecomeFollower(pSyncNode, tmpbuf); diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 1fc6798471..e655ed13c8 100644 --- a/source/libs/sync/src/syncPipeline.c +++ 
b/source/libs/sync/src/syncPipeline.c @@ -916,11 +916,24 @@ SSyncLogBuffer* syncLogBufferCreate() { ASSERT(pBuf->size == TSDB_SYNC_LOG_BUFFER_SIZE); - if (taosThreadMutexInit(&pBuf->mutex, NULL) < 0) { + if (taosThreadMutexAttrInit(&pBuf->attr) < 0) { + sError("failed to init log buffer mutexattr due to %s", strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + + if (taosThreadMutexAttrSetType(&pBuf->attr, PTHREAD_MUTEX_RECURSIVE) < 0) { + sError("failed to set log buffer mutexattr type due to %s", strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + + if (taosThreadMutexInit(&pBuf->mutex, &pBuf->attr) < 0) { sError("failed to init log buffer mutex due to %s", strerror(errno)); terrno = TAOS_SYSTEM_ERROR(errno); goto _err; } + return pBuf; _err: @@ -947,6 +960,7 @@ void syncLogBufferDestroy(SSyncLogBuffer* pBuf) { } syncLogBufferClear(pBuf); (void)taosThreadMutexDestroy(&pBuf->mutex); + (void)taosThreadMutexAttrDestroy(&pBuf->attr); (void)taosMemoryFree(pBuf); return; } diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index ba9fe5b56a..0f56921ec7 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -49,7 +49,7 @@ int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) { - ASSERT(false && "deplicated"); + ASSERT(false && "deprecated"); // next index SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId); From 3cbe109e4b17a325c60c748c75db558726cab641 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 30 Nov 2022 22:41:33 +0800 Subject: [PATCH 39/42] fix: initialize vnode state applyTerm with commitTerm in vnodeOpen --- source/dnode/vnode/src/vnd/vnodeCommit.c | 4 ++-- source/dnode/vnode/src/vnd/vnodeOpen.c | 9 +++------ source/dnode/vnode/src/vnd/vnodeSvr.c | 1 + 
source/libs/sync/src/syncPipeline.c | 7 +++++++ source/libs/sync/src/syncTimeout.c | 2 +- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeCommit.c b/source/dnode/vnode/src/vnd/vnodeCommit.c index 7040d2d7c8..4bdaf8d353 100644 --- a/source/dnode/vnode/src/vnd/vnodeCommit.c +++ b/source/dnode/vnode/src/vnd/vnodeCommit.c @@ -209,8 +209,8 @@ int vnodeCommit(SVnode *pVnode) { SVnodeInfo info = {0}; char dir[TSDB_FILENAME_LEN]; - vInfo("vgId:%d, start to commit, commit ID:%" PRId64 " version:%" PRId64, TD_VID(pVnode), pVnode->state.commitID, - pVnode->state.applied); + vInfo("vgId:%d, start to commit, commit ID:%" PRId64 " version:%" PRId64 " term: %" PRId64, TD_VID(pVnode), + pVnode->state.commitID, pVnode->state.applied, pVnode->state.applyTerm); // persist wal before starting if (walPersist(pVnode->pWal) < 0) { diff --git a/source/dnode/vnode/src/vnd/vnodeOpen.c b/source/dnode/vnode/src/vnd/vnodeOpen.c index 77d375bc45..0ff4a46d44 100644 --- a/source/dnode/vnode/src/vnd/vnodeOpen.c +++ b/source/dnode/vnode/src/vnd/vnodeOpen.c @@ -144,9 +144,9 @@ SVnode *vnodeOpen(const char *path, STfs *pTfs, SMsgCb msgCb) { pVnode->config = info.config; pVnode->state.committed = info.state.committed; pVnode->state.commitTerm = info.state.commitTerm; - pVnode->state.applied = info.state.committed; pVnode->state.commitID = info.state.commitID; - pVnode->state.commitTerm = info.state.commitTerm; + pVnode->state.applied = info.state.committed; + pVnode->state.applyTerm = info.state.commitTerm; pVnode->pTfs = pTfs; pVnode->msgCb = msgCb; taosThreadMutexInit(&pVnode->lock, NULL); @@ -269,10 +269,7 @@ void vnodeClose(SVnode *pVnode) { } // start the sync timer after the queue is ready -int32_t vnodeStart(SVnode *pVnode) { - vnodeSyncStart(pVnode); - return 0; -} +int32_t vnodeStart(SVnode *pVnode) { return vnodeSyncStart(pVnode); } void vnodeStop(SVnode *pVnode) {} diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c 
b/source/dnode/vnode/src/vnd/vnodeSvr.c index 4248998d58..5c55296858 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -187,6 +187,7 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp vDebug("vgId:%d, start to process write request %s, index:%" PRId64, TD_VID(pVnode), TMSG_INFO(pMsg->msgType), version); + ASSERT(pVnode->state.applyTerm <= pMsg->info.conn.applyTerm); pVnode->state.applied = version; pVnode->state.applyTerm = pMsg->info.conn.applyTerm; diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index e655ed13c8..03306af925 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -224,6 +224,9 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // update startIndex pBuf->startIndex = takeDummy ? index : index + 1; + sInfo("vgId:%d, init sync log buffer. buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + // validate syncLogBufferValidate(pBuf); return 0; @@ -826,6 +829,10 @@ int32_t syncLogReplMgrReplicateAttemptedOnce(SSyncLogReplMgr* pMgr, SSyncNode* p pMgr->endIndex = index + 1; if (barrier) { + sInfo("vgId:%d, replicated sync barrier to dest: %" PRIx64 ". 
index: %" PRId64 ", term: %" PRId64 + ", repl mgr: rs(%d) [%" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pDestId->addr, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, + pMgr->endIndex); break; } } diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 73b5d7e79a..529fada467 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -52,7 +52,7 @@ static void syncNodeCleanConfigIndex(SSyncNode* ths) { } static int32_t syncNodeTimerRoutine(SSyncNode* ths) { - sNInfo(ths, "timer routines"); + sNDebug(ths, "timer routines"); // timer replicate syncNodeReplicate(ths); From bccb31aced6e5dfcb95d39c256a3810dccc7e651 Mon Sep 17 00:00:00 2001 From: 54liuyao <54liuyao@163.com> Date: Thu, 1 Dec 2022 14:15:50 +0800 Subject: [PATCH 40/42] fix:stream crash && add debug log --- source/libs/executor/inc/executorimpl.h | 1 + source/libs/executor/src/filloperator.c | 4 ++-- source/libs/executor/src/scanoperator.c | 2 +- source/libs/executor/src/timewindowoperator.c | 9 ++++++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/source/libs/executor/inc/executorimpl.h b/source/libs/executor/inc/executorimpl.h index f17d7f6468..7100be58e3 100644 --- a/source/libs/executor/inc/executorimpl.h +++ b/source/libs/executor/inc/executorimpl.h @@ -536,6 +536,7 @@ typedef struct SStreamIntervalOperatorInfo { SArray* pChildren; SStreamState* pState; SWinKey delKey; + uint64_t numOfDatapack; } SStreamIntervalOperatorInfo; typedef struct SDataGroupInfo { diff --git a/source/libs/executor/src/filloperator.c b/source/libs/executor/src/filloperator.c index e26bfe9889..5ed26f627a 100644 --- a/source/libs/executor/src/filloperator.c +++ b/source/libs/executor/src/filloperator.c @@ -1146,7 +1146,7 @@ static void doDeleteFillResult(SOperatorInfo* pOperator) { if (delTs > nextKey.ts) { break; } - endTs = delTs; + SWinKey delKey = {.groupId = delGroupId, .ts = delTs}; if (delTs == nextKey.ts) { code 
= streamStateCurNext(pOperator->pTaskInfo->streamInfo.pState, pCur); @@ -1159,7 +1159,7 @@ static void doDeleteFillResult(SOperatorInfo* pOperator) { streamStateFreeCur(pCur); pCur = streamStateGetAndCheckCur(pOperator->pTaskInfo->streamInfo.pState, &nextKey); } - endTs = TMAX(ts, nextKey.ts - 1); + endTs = TMAX(delTs, nextKey.ts - 1); if (code != TSDB_CODE_SUCCESS) { break; } diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index b6353061fb..7f0dec1959 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1113,7 +1113,7 @@ static STimeWindow getSlidingWindow(TSKEY* startTsCol, TSKEY* endTsCol, uint64_t if (hasGroup) { (*pRowIndex) += 1; } else { - while ((groupId == gpIdCol[(*pRowIndex)] && startTsCol[*pRowIndex] < endWin.ekey)) { + while ((groupId == gpIdCol[(*pRowIndex)] && startTsCol[*pRowIndex] <= endWin.ekey)) { (*pRowIndex) += 1; if ((*pRowIndex) == pDataBlockInfo->rows) { break; diff --git a/source/libs/executor/src/timewindowoperator.c b/source/libs/executor/src/timewindowoperator.c index f2c2e24a41..5cf0d31117 100644 --- a/source/libs/executor/src/timewindowoperator.c +++ b/source/libs/executor/src/timewindowoperator.c @@ -2516,9 +2516,11 @@ static SSDataBlock* doStreamFinalIntervalAgg(SOperatorInfo* pOperator) { SSDataBlock* pBlock = downstream->fpSet.getNextFn(downstream); if (pBlock == NULL) { pOperator->status = OP_RES_TO_RETURN; - qDebug("%s return data", IS_FINAL_OP(pInfo) ? "interval final" : "interval semi"); + qDebug("===stream===return data:%s. recv datablock num:%" PRIu64 , IS_FINAL_OP(pInfo) ? "interval final" : "interval semi", pInfo->numOfDatapack); + pInfo->numOfDatapack = 0; break; } + pInfo->numOfDatapack++; printDataBlock(pBlock, IS_FINAL_OP(pInfo) ? 
"interval final recv" : "interval semi recv"); ASSERT(pBlock->info.type != STREAM_INVERT); @@ -2734,6 +2736,7 @@ SOperatorInfo* createStreamFinalIntervalOperatorInfo(SOperatorInfo* downstream, pInfo->pDelWins = taosArrayInit(4, sizeof(SWinKey)); pInfo->delKey.ts = INT64_MAX; pInfo->delKey.groupId = 0; + pInfo->numOfDatapack = 0; pOperator->operatorType = pPhyNode->type; pOperator->blocking = true; @@ -4700,8 +4703,11 @@ static SSDataBlock* doStreamIntervalAgg(SOperatorInfo* pOperator) { while (1) { SSDataBlock* pBlock = downstream->fpSet.getNextFn(downstream); if (pBlock == NULL) { + qDebug("===stream===return data:single interval. recv datablock num:%" PRIu64, pInfo->numOfDatapack); + pInfo->numOfDatapack = 0; break; } + pInfo->numOfDatapack++; printDataBlock(pBlock, "single interval recv"); if (pBlock->info.type == STREAM_DELETE_DATA || pBlock->info.type == STREAM_DELETE_RESULT || @@ -4851,6 +4857,7 @@ SOperatorInfo* createStreamIntervalOperatorInfo(SOperatorInfo* downstream, SPhys pInfo->pChildren = NULL; pInfo->delKey.ts = INT64_MAX; pInfo->delKey.groupId = 0; + pInfo->numOfDatapack = 0; setOperatorInfo(pOperator, "StreamIntervalOperator", QUERY_NODE_PHYSICAL_PLAN_STREAM_INTERVAL, true, OP_NOT_OPENED, pInfo, pTaskInfo); From 704885e21245ba277374823ee279be3317d70417 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Thu, 1 Dec 2022 15:07:40 +0800 Subject: [PATCH 41/42] fix: compile error --- source/libs/sync/src/syncMain.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 2b4a435aee..f102095ce2 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -2532,6 +2532,10 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) { } int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { + const STraceId* trace = &pRpcMsg->info.traceId; + char tbuf[40] = {0}; + TRACE_TO_STR(trace, tbuf); + SyncHeartbeatReply* pMsg 
= pRpcMsg->pCont; SSyncLogReplMgr* pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId); if (pMgr == NULL) { @@ -2540,7 +2544,7 @@ int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { } int64_t tsMs = taosGetTimestampMs(); - syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp); + syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf); syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs); From 0860ac1c9bf2a8b3cb98695c466c54492ffbaad9 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Thu, 1 Dec 2022 15:14:34 +0800 Subject: [PATCH 42/42] fix: show Permission denied error in mac --- source/util/src/terror.c | 2 +- tools/shell/src/shellEngine.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 1901e48c50..2025f196f2 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -519,7 +519,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_FIRST_COLUMN, "First column must b TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_VAR_COLUMN_LEN, "Invalid binary/nchar column length") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_TAGS_NUM, "Invalid number of tag columns") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_PERMISSION_DENIED, "Permission denied") -TAOS_DEFINE_ERROR(TSDB_CODE_PAR_PERMISSION_DENIED, "Invalid stream query") +TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_STREAM_QUERY, "Invalid stream query") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_INTERNAL_PK, "Invalid _c0 or _rowts expression") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_TIMELINE_FUNC, "Invalid timeline function") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_PASSWD, "Invalid password") diff --git a/tools/shell/src/shellEngine.c b/tools/shell/src/shellEngine.c index 28578e48a2..118a6caf7a 100644 --- a/tools/shell/src/shellEngine.c +++ b/tools/shell/src/shellEngine.c @@ -935,7 +935,7 @@ void shellGetGrantInfo() { int32_t code = taos_errno(tres); if (code != TSDB_CODE_SUCCESS) { - if (code != TSDB_CODE_OPS_NOT_SUPPORT && code 
!= TSDB_CODE_MND_NO_RIGHTS) { + if (code != TSDB_CODE_OPS_NOT_SUPPORT && code != TSDB_CODE_MND_NO_RIGHTS && code != TSDB_CODE_PAR_PERMISSION_DENIED) { fprintf(stderr, "Failed to check Server Edition, Reason:0x%04x:%s\r\n\r\n", code, taos_errstr(tres)); } return;