From 2dc480adf632360694843fd181665308cf1d85e8 Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 29 Oct 2021 14:31:21 +0800 Subject: [PATCH 01/16] [TD-10645][raft]sync manager --- include/libs/sync/sync.h | 2 +- source/libs/sync/inc/{raftInt.h => raft.h} | 18 ++- source/libs/sync/inc/syncInt.h | 57 +++++++++ source/libs/sync/src/sync.c | 130 ++++++++++++++++++++- 4 files changed, 189 insertions(+), 18 deletions(-) rename source/libs/sync/inc/{raftInt.h => raft.h} (74%) create mode 100644 source/libs/sync/inc/syncInt.h diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 9ffd74c229..f9d348d77e 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -152,7 +152,7 @@ int32_t syncPropose(SSyncNode* syncNode, SSyncBuffer buffer, void* pData, bool i // int32_t syncRemoveNode(SSyncNode syncNode, const SNodeInfo *pNode); -extern int32_t syncDebugFlag; +extern int32_t sDebugFlag; #ifdef __cplusplus } diff --git a/source/libs/sync/inc/raftInt.h b/source/libs/sync/inc/raft.h similarity index 74% rename from source/libs/sync/inc/raftInt.h rename to source/libs/sync/inc/raft.h index 75c1c2187f..78c0c97ed6 100644 --- a/source/libs/sync/inc/raftInt.h +++ b/source/libs/sync/inc/raft.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 TAOS Data, Inc. + * Copyright (c) 2019 TAOS Data, Inc. * * This program is free software: you can use, redistribute, and/or modify * it under the terms of the GNU Affero General Public License, version 3 @@ -13,15 +13,11 @@ * along with this program. If not, see . 
*/ -#ifndef _TD_RAFT_INT_H_ -#define _TD_RAFT_INT_H_ +#ifndef _TD_LIBS_SYNC_RAFT_H +#define _TD_LIBS_SYNC_RAFT_H -#ifdef __cplusplus -extern "C" { -#endif +typedef struct SSyncRaft { + +} SSyncRaft; -#ifdef __cplusplus -} -#endif - -#endif /*_TD_RAFT_INT_H_*/ \ No newline at end of file +#endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h new file mode 100644 index 0000000000..33cbd836a1 --- /dev/null +++ b/source/libs/sync/inc/syncInt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_INT_H +#define _TD_LIBS_SYNC_INT_H + +#include "thash.h" +#include "os.h" +#include "sync.h" +#include "raft.h" +#include "tlog.h" + +#define TAOS_SYNC_MAX_WORKER 3 + +typedef struct SSyncWorker { + pthread_t thread; +} SSyncWorker; + +struct SSyncNode { + pthread_mutex_t mutex; + SyncGroupId vgId; + SSyncRaft raft; +}; + +typedef struct SSyncManager { + pthread_mutex_t mutex; + + // worker threads + SSyncWorker worker[TAOS_SYNC_MAX_WORKER]; + + // vgroup hash table + SHashObj* vgroupTable; + +} SSyncManager; + +extern SSyncManager* gSyncManager; + +#define syncFatal(...) do { if (sDebugFlag & DEBUG_FATAL) { taosPrintLog("SYNC FATAL ", 255, __VA_ARGS__); }} while(0) +#define syncError(...) do { if (sDebugFlag & DEBUG_ERROR) { taosPrintLog("SYNC ERROR ", 255, __VA_ARGS__); }} while(0) +#define syncWarn(...) 
do { if (sDebugFlag & DEBUG_WARN) { taosPrintLog("SYNC WARN ", 255, __VA_ARGS__); }} while(0) +#define syncInfo(...) do { if (sDebugFlag & DEBUG_INFO) { taosPrintLog("SYNC ", 255, __VA_ARGS__); }} while(0) +#define syncDebug(...) do { if (sDebugFlag & DEBUG_DEBUG) { taosPrintLog("SYNC ", sDebugFlag, __VA_ARGS__); }} while(0) +#define syncTrace(...) do { if (sDebugFlag & DEBUG_TRACE) { taosPrintLog("SYNC ", sDebugFlag, __VA_ARGS__); }} while(0) + +#endif /* _TD_LIBS_SYNC_INT_H */ \ No newline at end of file diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index 879f2d4f6d..a974a17ad2 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -13,14 +13,132 @@ * along with this program. If not, see . */ -#include "sync.h" +#include "syncInt.h" -int32_t syncInit() { return 0; } +SSyncManager* gSyncManager = NULL; -void syncCleanUp() {} +static int syncOpenWorkerPool(SSyncManager* syncManager); +static int syncCloseWorkerPool(SSyncManager* syncManager); +static void *syncWorkerMain(void *argv); -SSyncNode* syncStart(const SSyncInfo* pInfo) { return NULL; } +int32_t syncInit() { + if (gSyncManager != NULL) { + return 0; + } -void syncStop(const SSyncNode* pNode) {} + gSyncManager = (SSyncManager*)malloc(sizeof(SSyncManager)); + if (gSyncManager == NULL) { + syncError("malloc SSyncManager fail"); + return -1; + } -void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} \ No newline at end of file + pthread_mutex_init(&gSyncManager->mutex, NULL); + // init worker pool + if (syncOpenWorkerPool(gSyncManager) != 0) { + syncCleanUp(); + return -1; + } + + // init vgroup hash table + gSyncManager->vgroupTable = taosHashInit(TSDB_MIN_VNODES, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); + if (gSyncManager->vgroupTable == NULL) { + syncCleanUp(); + return -1; + } + return 0; +} + +void syncCleanUp() { + if (gSyncManager == NULL) { + return; + } + pthread_mutex_lock(&gSyncManager->mutex); + if 
(gSyncManager->vgroupTable) { + taosHashCleanup(gSyncManager->vgroupTable); + } + syncCloseWorkerPool(gSyncManager); + pthread_mutex_unlock(&gSyncManager->mutex); + pthread_mutex_destroy(&gSyncManager->mutex); + free(gSyncManager); + gSyncManager = NULL; +} + +SSyncNode* syncStart(const SSyncInfo* pInfo) { + pthread_mutex_lock(&gSyncManager->mutex); + + SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId)); + if (ppNode != NULL) { + syncInfo("vgroup %d already exist", pInfo->vgId); + pthread_mutex_unlock(&gSyncManager->mutex); + return *ppNode; + } + + SSyncNode *pNode = (SSyncNode*)malloc(sizeof(SSyncNode)); + if (pNode == NULL) { + syncInfo("malloc vgroup %d node fail", pInfo->vgId); + pthread_mutex_unlock(&gSyncManager->mutex); + return NULL; + } + + pthread_mutex_init(&pNode->mutex, NULL); + + taosHashPut(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId), &pNode, sizeof(SSyncNode *)); + + pthread_mutex_unlock(&gSyncManager->mutex); + return NULL; +} + +void syncStop(const SSyncNode* pNode) { + pthread_mutex_lock(&gSyncManager->mutex); + + SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); + if (ppNode == NULL) { + syncInfo("vgroup %d not exist", pNode->vgId); + pthread_mutex_unlock(&gSyncManager->mutex); + return; + } + assert(*ppNode == pNode); + + taosHashRemove(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); + pthread_mutex_unlock(&gSyncManager->mutex); + + pthread_mutex_destroy(&pNode->mutex); + free(*ppNode); +} + +void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} + +static int syncOpenWorkerPool(SSyncManager* syncManager) { + int i; + pthread_attr_t thattr; + + pthread_attr_init(&thattr); + pthread_attr_setdetachstate(&thattr, PTHREAD_CREATE_JOINABLE); + + for (i = 0; i < TAOS_SYNC_MAX_WORKER; ++i) { + SSyncWorker* pWorker = &(syncManager->worker[i]); + + if (pthread_create(&(pWorker->thread), &thattr, (void 
*)syncWorkerMain, pWorker) != 0) { + syncError("failed to create sync worker since %s", strerror(errno)); + + return -1; + } + } + + pthread_attr_destroy(&thattr); + + return 0; +} + +static int syncCloseWorkerPool(SSyncManager* syncManager) { + return 0; +} + +static void *syncWorkerMain(void *argv) { + SSyncWorker* pWorker = (SSyncWorker *)argv; + + taosBlockSIGPIPE(); + setThreadName("syncWorker"); + + return NULL; +} \ No newline at end of file From c319d1cb12840088e0c60fc0db54b92ef0bd17a4 Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 29 Oct 2021 16:05:25 +0800 Subject: [PATCH 02/16] [TD-10645][raft]add raft module --- include/libs/sync/sync.h | 7 ++- source/libs/sync/inc/raft.h | 12 ++++- source/libs/sync/inc/raft_message.h | 76 +++++++++++++++++++++++++++++ source/libs/sync/inc/syncInt.h | 3 ++ source/libs/sync/src/raft.c | 74 ++++++++++++++++++++++++++++ source/libs/sync/src/raft_message.c | 17 +++++++ source/libs/sync/src/sync.c | 21 +++++++- 7 files changed, 206 insertions(+), 4 deletions(-) create mode 100644 source/libs/sync/inc/raft_message.h create mode 100644 source/libs/sync/src/raft.c create mode 100644 source/libs/sync/src/raft_message.c diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index f9d348d77e..1c228675bd 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -89,6 +89,10 @@ typedef struct SSyncLogStore { // write log with given index int32_t (*logWrite)(struct SSyncLogStore* logStore, SyncIndex index, SSyncBuffer* pBuf); + // read log from given index with limit, return the actual num in nBuf + int32_t (*logRead)(struct SSyncLogStore* logStore, SyncIndex index, int limit, + SSyncBuffer* pBuf, int* nBuf); + // mark log with given index has been commtted int32_t (*logCommit)(struct SSyncLogStore* logStore, SyncIndex index); @@ -102,6 +106,7 @@ typedef struct SSyncLogStore { typedef struct SSyncServerState { SyncNodeId voteFor; SSyncTerm term; + SyncIndex commitIndex; } SSyncServerState; typedef struct 
SSyncClusterConfig { @@ -146,7 +151,7 @@ SSyncNode* syncStart(const SSyncInfo*); void syncReconfig(const SSyncNode*, const SSyncCluster*); void syncStop(const SSyncNode*); -int32_t syncPropose(SSyncNode* syncNode, SSyncBuffer buffer, void* pData, bool isWeak); +int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, bool isWeak); // int32_t syncAddNode(SSyncNode syncNode, const SNodeInfo *pNode); diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 78c0c97ed6..0df46db3fc 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -16,8 +16,18 @@ #ifndef _TD_LIBS_SYNC_RAFT_H #define _TD_LIBS_SYNC_RAFT_H +#include "sync.h" +#include "raft_message.h" + typedef struct SSyncRaft { - + // owner sync node + SSyncNode* pNode; + + SSyncInfo info; + } SSyncRaft; +int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); +int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg); + #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h new file mode 100644 index 0000000000..cb0552500a --- /dev/null +++ b/source/libs/sync/inc/raft_message.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#ifndef _TD_LIBS_SYNC_RAFT_MESSAGE_H +#define _TD_LIBS_SYNC_RAFT_MESSAGE_H + +#include "sync.h" + +/** + * below define message type which handled by Raft node thread + * internal message, which communicate in threads, start with RAFT_MSG_INTERNAL_*, + * internal message use pointer only, need not to be decode/encode + * outter message start with RAFT_MSG_*, need to implement its decode/encode functions + **/ +typedef enum RaftMessageType { + // client propose a cmd + RAFT_MSG_INTERNAL_PROP = 1, + + RAFT_MSG_APPEND, + RAFT_MSG_APPEND_RESP, + + RAFT_MSG_VOTE, + RAFT_MSG_VOTE_RESP, + + RAFT_MSG_PRE_VOTE, + RAFT_MSG_PRE_VOTE_RESP, + +} RaftMessageType; + +typedef struct RaftMsgInternal_Prop { + const SSyncBuffer *pBuf; + bool isWeak; + void* pData; +} RaftMsgInternal_Prop; + +typedef struct RaftMessage { + RaftMessageType msgType; + SSyncTerm term; + SyncNodeId from; + SyncNodeId to; + + union { + RaftMsgInternal_Prop propose; + }; +} RaftMessage; + +static FORCE_INLINE RaftMessage* syncInitPropMsg(RaftMessage* pMsg, const SSyncBuffer* pBuf, void* pData, bool isWeak) { + *pMsg = (RaftMessage) { + .msgType = RAFT_MSG_INTERNAL_PROP, + .propose = (RaftMsgInternal_Prop) { + .isWeak = isWeak, + .pBuf = pBuf, + .pData = pData, + }, + }; + + return pMsg; +} + +static FORCE_INLINE bool syncIsInternalMsg(const RaftMessage* pMsg) { + return pMsg->msgType == RAFT_MSG_INTERNAL_PROP; +} + +#endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 33cbd836a1..c1c3ed17a8 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -40,6 +40,9 @@ typedef struct SSyncManager { // worker threads SSyncWorker worker[TAOS_SYNC_MAX_WORKER]; + // sync net worker + SSyncWorker netWorker; + // vgroup hash table SHashObj* vgroupTable; diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c new file mode 100644 index 0000000000..109b08902a --- /dev/null 
+++ b/source/libs/sync/src/raft.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "raft.h" +#include "syncInt.h" + +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#endif + +#define RAFT_READ_LOG_MAX_NUM 100 + +int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { + SSyncNode* pNode = pRaft->pNode; + SSyncServerState serverState; + SStateManager* stateManager; + SSyncLogStore* logStore; + SSyncFSM* fsm; + SyncIndex initIndex = pInfo->snapshotIndex; + SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM]; + int nBuf, limit, i; + + memcpy(&pRaft->info, pInfo, sizeof(SSyncInfo)); + stateManager = &(pRaft->info.stateManager); + logStore = &(pRaft->info.logStore); + fsm = &(pRaft->info.fsm); + + // read server state + if (stateManager->readServerState(stateManager, &serverState) != 0) { + syncError("readServerState for vgid %d fail", pInfo->vgId); + return -1; + } + assert(initIndex <= serverState.commitIndex); + + // restore fsm state from snapshot index + 1, until commitIndex + ++initIndex; + while (initIndex < serverState.commitIndex) { + limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex); + + if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) { + return -1; + } + assert(limit == nBuf); + + for (i = 0; i < limit; ++i) { + fsm->applyLog(fsm, initIndex + i, &(buffer[i]), NULL); + free(buffer[i].data); + } + initIndex += nBuf; + } + assert(initIndex == 
serverState.commitIndex); + + syncInfo("restore vgid %d state: snapshot index:", pInfo->vgId); + return 0; +} + +int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg) { + if (!syncIsInternalMsg(pMsg)) { + free(pMsg); + } + return 0; +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_message.c b/source/libs/sync/src/raft_message.c new file mode 100644 index 0000000000..d35efce9db --- /dev/null +++ b/source/libs/sync/src/raft_message.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "raft_message.h" + diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index a974a17ad2..e627cf8bc1 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -75,7 +75,15 @@ SSyncNode* syncStart(const SSyncInfo* pInfo) { SSyncNode *pNode = (SSyncNode*)malloc(sizeof(SSyncNode)); if (pNode == NULL) { - syncInfo("malloc vgroup %d node fail", pInfo->vgId); + syncError("malloc vgroup %d node fail", pInfo->vgId); + pthread_mutex_unlock(&gSyncManager->mutex); + return NULL; + } + + // start raft + pNode->raft.pNode = pNode; + if (syncRaftStart(&pNode->raft, pInfo) != 0) { + syncError("raft start at %d node fail", pInfo->vgId); pthread_mutex_unlock(&gSyncManager->mutex); return NULL; } @@ -102,10 +110,19 @@ void syncStop(const SSyncNode* pNode) { taosHashRemove(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); pthread_mutex_unlock(&gSyncManager->mutex); - pthread_mutex_destroy(&pNode->mutex); + pthread_mutex_destroy(&((*ppNode)->mutex)); free(*ppNode); } +int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, bool isWeak) { + RaftMessage msg; + + pthread_mutex_lock(&syncNode->mutex); + int32_t ret = syncRaftStep(&syncNode->raft, syncInitPropMsg(&msg, pBuf, pData, isWeak)); + pthread_mutex_unlock(&syncNode->mutex); + return ret; +} + void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} static int syncOpenWorkerPool(SSyncManager* syncManager) { From 5b7261d63fab335351123b0c6b025f28aa48c9fb Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 29 Oct 2021 17:09:25 +0800 Subject: [PATCH 03/16] [TD-10645][raft]add sync node timer --- source/libs/sync/inc/raft.h | 1 + source/libs/sync/inc/raft_message.h | 2 ++ source/libs/sync/inc/syncInt.h | 5 +++++ source/libs/sync/src/raft.c | 8 ++++--- source/libs/sync/src/raft_message.c | 5 +++++ source/libs/sync/src/sync.c | 33 ++++++++++++++++++++++++++++- 6 files changed, 50 insertions(+), 4 deletions(-) diff --git 
a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 0df46db3fc..f81040658e 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -29,5 +29,6 @@ typedef struct SSyncRaft { int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg); +int32_t syncRaftTick(SSyncRaft* pRaft); #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index cb0552500a..faf14840c9 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -73,4 +73,6 @@ static FORCE_INLINE bool syncIsInternalMsg(const RaftMessage* pMsg) { return pMsg->msgType == RAFT_MSG_INTERNAL_PROP; } +void syncFreeMessage(const RaftMessage* pMsg); + #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index c1c3ed17a8..81cb686781 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -30,8 +30,10 @@ typedef struct SSyncWorker { struct SSyncNode { pthread_mutex_t mutex; + int32_t refCount; SyncGroupId vgId; SSyncRaft raft; + void* syncTimer; }; typedef struct SSyncManager { @@ -46,6 +48,9 @@ typedef struct SSyncManager { // vgroup hash table SHashObj* vgroupTable; + // timer manager + void* syncTimerManager; + } SSyncManager; extern SSyncManager* gSyncManager; diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 109b08902a..23442803c4 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -67,8 +67,10 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg) { - if (!syncIsInternalMsg(pMsg)) { - free(pMsg); - } + syncFreeMessage(pMsg); + return 0; +} + +int32_t syncRaftTick(SSyncRaft* pRaft) { return 0; } \ No newline at end of file diff --git 
a/source/libs/sync/src/raft_message.c b/source/libs/sync/src/raft_message.c index d35efce9db..912314daf2 100644 --- a/source/libs/sync/src/raft_message.c +++ b/source/libs/sync/src/raft_message.c @@ -15,3 +15,8 @@ #include "raft_message.h" +void syncFreeMessage(const RaftMessage* pMsg) { + if (!syncIsInternalMsg(pMsg)) { + free((RaftMessage*)pMsg); + } +} \ No newline at end of file diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index e627cf8bc1..a9df02f818 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -14,12 +14,16 @@ */ #include "syncInt.h" +#include "ttimer.h" SSyncManager* gSyncManager = NULL; +#define SYNC_TICK_TIMER 50 + static int syncOpenWorkerPool(SSyncManager* syncManager); static int syncCloseWorkerPool(SSyncManager* syncManager); static void *syncWorkerMain(void *argv); +static void syncNodeTick(void *param, void *tmrId); int32_t syncInit() { if (gSyncManager != NULL) { @@ -33,6 +37,14 @@ int32_t syncInit() { } pthread_mutex_init(&gSyncManager->mutex, NULL); + + // init sync timer manager + gSyncManager->syncTimerManager = taosTmrInit(1000, 50, 10000, "SYNC"); + if (gSyncManager->syncTimerManager == NULL) { + syncCleanUp(); + return -1; + } + // init worker pool if (syncOpenWorkerPool(gSyncManager) != 0) { syncCleanUp(); @@ -56,6 +68,7 @@ void syncCleanUp() { if (gSyncManager->vgroupTable) { taosHashCleanup(gSyncManager->vgroupTable); } + taosTmrCleanUp(gSyncManager->syncTimerManager); syncCloseWorkerPool(gSyncManager); pthread_mutex_unlock(&gSyncManager->mutex); pthread_mutex_destroy(&gSyncManager->mutex); @@ -80,6 +93,8 @@ SSyncNode* syncStart(const SSyncInfo* pInfo) { return NULL; } + pNode->syncTimer = taosTmrStart(syncNodeTick, SYNC_TICK_TIMER, (void*)pInfo->vgId, gSyncManager->syncTimerManager); + // start raft pNode->raft.pNode = pNode; if (syncRaftStart(&pNode->raft, pInfo) != 0) { @@ -106,7 +121,8 @@ void syncStop(const SSyncNode* pNode) { return; } assert(*ppNode == pNode); - + 
taosTmrStop(pNode->syncTimer); + taosHashRemove(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); pthread_mutex_unlock(&gSyncManager->mutex); @@ -158,4 +174,19 @@ static void *syncWorkerMain(void *argv) { setThreadName("syncWorker"); return NULL; +} + +static void syncNodeTick(void *param, void *tmrId) { + SyncGroupId vgId = (SyncGroupId)param; + SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId)); + if (ppNode == NULL) { + return; + } + SSyncNode *pNode = *ppNode; + + pthread_mutex_lock(&pNode->mutex); + syncRaftTick(&pNode->raft); + pthread_mutex_unlock(&pNode->mutex); + + pNode->syncTimer = taosTmrStart(syncNodeTick, SYNC_TICK_TIMER, (void*)pNode->vgId, gSyncManager->syncTimerManager); } \ No newline at end of file From fca35ceb29ee8556474812abf7ce25c4e5c16a19 Mon Sep 17 00:00:00 2001 From: lichuang Date: Tue, 2 Nov 2021 10:49:23 +0800 Subject: [PATCH 04/16] [TD-10645][raft]add sync rpc client and server --- source/libs/sync/CMakeLists.txt | 1 + source/libs/sync/inc/syncInt.h | 11 +++- source/libs/sync/src/sync.c | 106 +++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 5 deletions(-) diff --git a/source/libs/sync/CMakeLists.txt b/source/libs/sync/CMakeLists.txt index 124f4a1fee..37ee5194c8 100644 --- a/source/libs/sync/CMakeLists.txt +++ b/source/libs/sync/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(sync ${SYNC_SRC}) target_link_libraries( sync PUBLIC common + PUBLIC transport PUBLIC util PUBLIC wal ) diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 81cb686781..73015e87a1 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -39,12 +39,17 @@ struct SSyncNode { typedef struct SSyncManager { pthread_mutex_t mutex; + // sync server rpc + void* serverRpc; + // rpc server hash table base on FQDN:port key + SHashObj* rpcServerTable; + + // sync client rpc + void* clientRpc; + // worker threads SSyncWorker worker[TAOS_SYNC_MAX_WORKER]; - // 
sync net worker - SSyncWorker netWorker; - // vgroup hash table SHashObj* vgroupTable; diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index a9df02f818..e3d0606c08 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -14,12 +14,20 @@ */ #include "syncInt.h" +#include "trpc.h" #include "ttimer.h" SSyncManager* gSyncManager = NULL; #define SYNC_TICK_TIMER 50 +#define SYNC_ACTIVITY_TIMER 5 +#define SYNC_SERVER_WORKER 2 +static void syncProcessRsp(SRpcMsg *pMsg, SRpcEpSet *pEpSet); +static void syncProcessReqMsg(SRpcMsg *pMsg, SRpcEpSet *pEpSet); + +static int syncInitRpcServer(SSyncManager* syncManager, const SSyncCluster* pSyncCfg); +static int syncInitRpcClient(SSyncManager* syncManager); static int syncOpenWorkerPool(SSyncManager* syncManager); static int syncCloseWorkerPool(SSyncManager* syncManager); static void *syncWorkerMain(void *argv); @@ -30,7 +38,7 @@ int32_t syncInit() { return 0; } - gSyncManager = (SSyncManager*)malloc(sizeof(SSyncManager)); + gSyncManager = (SSyncManager*)calloc(sizeof(SSyncManager), 0); if (gSyncManager == NULL) { syncError("malloc SSyncManager fail"); return -1; @@ -38,6 +46,12 @@ int32_t syncInit() { pthread_mutex_init(&gSyncManager->mutex, NULL); + // init client rpc + if (syncInitRpcClient(gSyncManager) != 0) { + syncCleanUp(); + return -1; + } + // init sync timer manager gSyncManager->syncTimerManager = taosTmrInit(1000, 50, 10000, "SYNC"); if (gSyncManager->syncTimerManager == NULL) { @@ -68,7 +82,13 @@ void syncCleanUp() { if (gSyncManager->vgroupTable) { taosHashCleanup(gSyncManager->vgroupTable); } - taosTmrCleanUp(gSyncManager->syncTimerManager); + if (gSyncManager->clientRpc) { + rpcClose(gSyncManager->clientRpc); + syncInfo("sync inter-sync rpc client is closed"); + } + if (gSyncManager->syncTimerManager) { + taosTmrCleanUp(gSyncManager->syncTimerManager); + } syncCloseWorkerPool(gSyncManager); pthread_mutex_unlock(&gSyncManager->mutex); 
pthread_mutex_destroy(&gSyncManager->mutex); @@ -86,6 +106,12 @@ SSyncNode* syncStart(const SSyncInfo* pInfo) { return *ppNode; } + // init rpc server + if (syncInitRpcServer(gSyncManager, &pInfo->syncCfg) != 0) { + pthread_mutex_unlock(&gSyncManager->mutex); + return NULL; + } + SSyncNode *pNode = (SSyncNode*)malloc(sizeof(SSyncNode)); if (pNode == NULL) { syncError("malloc vgroup %d node fail", pInfo->vgId); @@ -141,6 +167,82 @@ int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, b void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} +// process rpc rsp message from other sync server +static void syncProcessRsp(SRpcMsg *pMsg, SRpcEpSet *pEpSet) { + +} + +// process rpc message from other sync server +static void syncProcessReqMsg(SRpcMsg *pMsg, SRpcEpSet *pEpSet) { + +} + +static int syncInitRpcServer(SSyncManager* syncManager, const SSyncCluster* pSyncCfg) { + if (gSyncManager->rpcServerTable == NULL) { + gSyncManager->rpcServerTable = taosHashInit(TSDB_MIN_VNODES, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (gSyncManager->rpcServerTable == NULL) { + syncError("init sync rpc server hash table error"); + return -1; + } + } + assert(pSyncCfg->selfIndex < pSyncCfg->replica && pSyncCfg->selfIndex >= 0); + const SNodeInfo* pNode = &(pSyncCfg->nodeInfo[pSyncCfg->replica]); + char buffer[20] = {'\0'}; + snprintf(buffer, sizeof(buffer), "%s:%d", &(pNode->nodeFqdn[0]), pNode->nodePort); + size_t len = strlen(buffer); + void** ppRpcServer = taosHashGet(gSyncManager->rpcServerTable, buffer, len); + if (ppRpcServer != NULL) { + // already inited + syncInfo("sync rpc server for %s already exist", buffer); + return 0; + } + + SRpcInit rpcInit; + memset(&rpcInit, 0, sizeof(rpcInit)); + rpcInit.localPort = pNode->nodePort; + rpcInit.label = "sync-server"; + rpcInit.numOfThreads = SYNC_SERVER_WORKER; + rpcInit.cfp = syncProcessReqMsg; + rpcInit.sessions = TSDB_MAX_VNODES << 4; + rpcInit.connType = 
TAOS_CONN_SERVER; + rpcInit.idleTime = SYNC_ACTIVITY_TIMER * 1000; + + void* rpcServer = rpcOpen(&rpcInit); + if (rpcServer == NULL) { + syncInfo("rpcOpen for sync rpc server for %s fail", buffer); + return -1; + } + + taosHashPut(gSyncManager->rpcServerTable, buffer, strlen(buffer), rpcServer, len); + syncInfo("sync rpc server for %s init success", buffer); + + return 0; +} + +static int syncInitRpcClient(SSyncManager* syncManager) { + char secret[TSDB_KEY_LEN] = "secret"; + SRpcInit rpcInit; + memset(&rpcInit, 0, sizeof(rpcInit)); + rpcInit.label = "sync-client"; + rpcInit.numOfThreads = 1; + rpcInit.cfp = syncProcessRsp; + rpcInit.sessions = TSDB_MAX_VNODES << 4; + rpcInit.connType = TAOS_CONN_CLIENT; + rpcInit.idleTime = SYNC_ACTIVITY_TIMER * 1000; + rpcInit.user = "t"; + rpcInit.ckey = "key"; + rpcInit.secret = secret; + + syncManager->clientRpc = rpcOpen(&rpcInit); + if (syncManager->clientRpc == NULL) { + syncError("failed to init sync rpc client"); + return -1; + } + + syncInfo("sync inter-sync rpc client is initialized"); + return 0; +} + static int syncOpenWorkerPool(SSyncManager* syncManager) { int i; pthread_attr_t thattr; From 24a0966da6afdb6dcde16b204675fe6ecf6dde3b Mon Sep 17 00:00:00 2001 From: lichuang Date: Tue, 2 Nov 2021 15:50:27 +0800 Subject: [PATCH 05/16] [TD-10645][raft]add raft progress --- include/libs/sync/sync.h | 16 +- source/libs/sync/inc/raft.h | 35 ++- source/libs/sync/inc/raft_progress.h | 181 +++++++++++++ source/libs/sync/inc/raft_unstable_log.h | 115 ++++++++ source/libs/sync/inc/syncInt.h | 1 + source/libs/sync/inc/sync_type.h | 33 +++ source/libs/sync/src/raft.c | 23 +- source/libs/sync/src/raft_progress.c | 317 +++++++++++++++++++++++ source/libs/sync/src/raft_unstable_log.c | 21 ++ 9 files changed, 727 insertions(+), 15 deletions(-) create mode 100644 source/libs/sync/inc/raft_progress.h create mode 100644 source/libs/sync/inc/raft_unstable_log.h create mode 100644 source/libs/sync/inc/sync_type.h create mode 100644 
source/libs/sync/src/raft_progress.c create mode 100644 source/libs/sync/src/raft_unstable_log.c diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 1c228675bd..ef8773f5cc 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -61,13 +61,13 @@ typedef struct { typedef struct SSyncFSM { void* pData; - // apply committed log, bufs will be free by raft module + // apply committed log, bufs will be free by sync module int32_t (*applyLog)(struct SSyncFSM* fsm, SyncIndex index, const SSyncBuffer* buf, void* pData); // cluster commit callback int32_t (*onClusterChanged)(struct SSyncFSM* fsm, const SSyncCluster* cluster, void* pData); - // fsm return snapshot in ppBuf, bufs will be free by raft module + // fsm return snapshot in ppBuf, bufs will be free by sync module // TODO: getSnapshot SHOULD be async? int32_t (*getSnapshot)(struct SSyncFSM* fsm, SSyncBuffer** ppBuf, int32_t* objId, bool* isLast); @@ -89,18 +89,24 @@ typedef struct SSyncLogStore { // write log with given index int32_t (*logWrite)(struct SSyncLogStore* logStore, SyncIndex index, SSyncBuffer* pBuf); - // read log from given index with limit, return the actual num in nBuf + /** + * read log from given index(included) with limit, return the actual num in nBuf, + * pBuf will be free in sync module + **/ int32_t (*logRead)(struct SSyncLogStore* logStore, SyncIndex index, int limit, SSyncBuffer* pBuf, int* nBuf); // mark log with given index has been commtted int32_t (*logCommit)(struct SSyncLogStore* logStore, SyncIndex index); - // prune log before given index + // prune log before given index(not included) int32_t (*logPrune)(struct SSyncLogStore* logStore, SyncIndex index); - // rollback log after given index + // rollback log after given index(included) int32_t (*logRollback)(struct SSyncLogStore* logStore, SyncIndex index); + + // return last index of log + SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore); } SSyncLogStore; typedef struct SSyncServerState { 
diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index f81040658e..869baecdda 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -17,15 +17,46 @@ #define _TD_LIBS_SYNC_RAFT_H #include "sync.h" +#include "sync_type.h" #include "raft_message.h" -typedef struct SSyncRaft { +typedef struct SSyncRaftProgress SSyncRaftProgress; + +typedef struct RaftLeaderState { + int nProgress; + SSyncRaftProgress* progress; +} RaftLeaderState; + +typedef struct SSyncRaftIOMethods { + SyncTime (*time)(SSyncRaft*); + +} SSyncRaftIOMethods; + +struct SSyncRaft { // owner sync node SSyncNode* pNode; SSyncInfo info; -} SSyncRaft; + // election timeout tick(random in [3:6] tick) + uint16_t electionTick; + + // heartbeat timeout tick(default: 1 tick) + uint16_t heartbeatTick; + + int installSnapShotTimeoutMS; + + // + int heartbeatTimeoutMS; + + bool preVote; + + SSyncRaftIOMethods io; + + RaftLeaderState leaderState; + + SSyncRaftUnstableLog *log; +}; int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg); diff --git a/source/libs/sync/inc/raft_progress.h b/source/libs/sync/inc/raft_progress.h new file mode 100644 index 0000000000..73aa9db59f --- /dev/null +++ b/source/libs/sync/inc/raft_progress.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#ifndef TD_SYNC_RAFT_PROGRESS_H +#define TD_SYNC_RAFT_PROGRESS_H + +#include "sync_type.h" + +/** + * SSyncRaftInflights is a sliding window for the inflight messages. + * Thus inflight effectively limits both the number of inflight messages + * and the bandwidth each Progress can use. + * When inflights is full, no more message should be sent. + * When a leader sends out a message, the index of the last + * entry should be added to inflights. The index MUST be added + * into inflights in order. + * When a leader receives a reply, the previous inflights should + * be freed by calling syncRaftInflightFreeTo with the index of the last + * received entry. + **/ +typedef struct SSyncRaftInflights { + /* the starting index in the buffer */ + int start; + + /* number of inflights in the buffer */ + int count; + + /* the size of the buffer */ + int size; + + /** + * buffer contains the index of the last entry + * inside one message. + **/ + SyncIndex* buffer; +} SSyncRaftInflights; + +/** + * State defines how the leader should interact with the follower. + * + * When in PROGRESS_PROBE, leader sends at most one replication message + * per heartbeat interval. It also probes actual progress of the follower. + * + * When in PROGRESS_REPLICATE, leader optimistically increases next + * to the latest entry sent after sending replication message. This is + * an optimized state for fast replicating log entries to the follower. + * + * When in PROGRESS_SNAPSHOT, leader should have sent out snapshot + * before and stops sending any replication message. + * + * PROGRESS_PROBE is the initial state. + **/ +typedef enum RaftProgressState { + PROGRESS_PROBE = 0, + PROGRESS_REPLICATE, + PROGRESS_SNAPSHOT, +} RaftProgressState; + +/** + * Progress represents a follower’s progress in the view of the leader. Leader maintains + * progresses of all followers, and sends entries to the follower based on its progress. 
+ **/ +struct SSyncRaftProgress { + SyncIndex nextIndex; + + SyncIndex matchIndex; + + RaftProgressState state; + + /** + * paused is used in PROGRESS_PROBE. + * When paused is true, raft should pause sending replication message to this peer. + **/ + bool paused; + + /** + * pendingSnapshotIndex is used in PROGRESS_SNAPSHOT. + * If there is a pending snapshot, the pendingSnapshotIndex will be set to the + * index of the snapshot. If pendingSnapshotIndex is set, the replication process of + * this Progress will be paused. raft will not resend snapshot until the pending one + * is reported to be failed. + **/ + SyncIndex pendingSnapshotIndex; + + /** + * recentActive is true if the progress is recently active. Receiving any messages + * from the corresponding follower indicates the progress is active. + * RecentActive can be reset to false after an election timeout. + **/ + bool recentActive; + + /** + * flow control sliding window + **/ + SSyncRaftInflights inflights; +}; + +int syncRaftProgressCreate(SSyncRaft* pRaft); +//int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration); + +/** + * syncRaftProgressMaybeUpdate returns false if the given lastIndex index comes from i-th node's log. + * Otherwise it updates the progress and returns true. + **/ +bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex); + +void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex); + +/** + * syncRaftProgressMaybeDecrTo returns false if the given to index comes from an out of order message. + * Otherwise it decreases the progress next index to min(rejected, last) and returns true. + **/ +bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, + SyncIndex rejected, SyncIndex lastIndex); + +/** + * syncRaftProgressIsPaused returns whether sending log entries to this node has been + * paused. 
A node may be paused because it has rejected recent + * MsgApps, is currently waiting for a snapshot, or has reached the + * MaxInflightMsgs limit. + **/ +bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i); + +void syncRaftProgressFailure(SSyncRaft* pRaft, int i); + +bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i); + +/** + * return true if i-th node's log is up-todate + **/ +bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i); + +void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i); + +void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i); + +void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex); + +int syncRaftInflightReset(SSyncRaftInflights* inflights); +bool syncRaftInflightFull(SSyncRaftInflights* inflights); +void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); +void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex); +void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights); + +#if 0 + +void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); + +SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i); + +SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i); + +void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i); + +void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i); + +bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i); + +void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i); + +bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i); + +void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); + +RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i); + +#endif + +#endif /* TD_SYNC_RAFT_PROGRESS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_unstable_log.h b/source/libs/sync/inc/raft_unstable_log.h new file mode 100644 index 0000000000..2b7b30c15a --- /dev/null +++ b/source/libs/sync/inc/raft_unstable_log.h @@ -0,0 +1,115 @@ 
+/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef TD_SYNC_RAFT_UNSTABLE_LOG_H +#define TD_SYNC_RAFT_UNSTABLE_LOG_H + +#include "sync_type.h" + +/* in-memory unstable raft log storage */ +struct SSyncRaftUnstableLog { +#if 0 + /* Circular buffer of log entries */ + RaftEntry *entries; + + /* size of Circular buffer */ + int size; + + /* Indexes of used slots [front, back) */ + int front, back; + + /* Index of first entry is offset + 1 */ + SyncIndex offset; + + /* meta data of snapshot */ + SSyncRaftUnstableLog snapshot; +#endif +}; + +/** + * return index of last in memory log, return 0 if log is empty + **/ +SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog); + +#if 0 +void raftLogInit(RaftLog* pLog); + +void raftLogClose(RaftLog* pLog); + +/** + * When startup populating log entrues loaded from disk, + * init raft memory log with snapshot index,term and log start idnex. + **/ +/* +void raftLogStart(RaftLog* pLog, + RaftSnapshotMeta snapshot, + SyncIndex startIndex); +*/ +/** + * Get the number of entries the log. + **/ +int raftLogNumEntries(const RaftLog* pLog); + + + +/** + * return last term of in memory log, return 0 if log is empty + **/ +SSyncTerm raftLogLastTerm(RaftLog* pLog); + +/** + * return term of log with the given index, return 0 if the term of index cannot be found + * , errCode will save the error code. 
+ **/ +SSyncTerm raftLogTermOf(RaftLog* pLog, SyncIndex index, RaftCode* errCode); + +/** + * Get the last index of the most recent snapshot. Return 0 if there are no * + * snapshots. + **/ +SyncIndex raftLogSnapshotIndex(RaftLog* pLog); + +/* Append a new entry to the log. */ +int raftLogAppend(RaftLog* pLog, + SSyncTerm term, + const SSyncBuffer *buf); + +/** + * acquire log from given index onwards. + **/ +/* +int raftLogAcquire(RaftLog* pLog, + SyncIndex index, + RaftEntry **ppEntries, + int *n); + +void raftLogRelease(RaftLog* pLog, + SyncIndex index, + RaftEntry *pEntries, + int n); +*/ +/* Delete all entries from the given index (included) onwards. */ +void raftLogTruncate(RaftLog* pLog, SyncIndex index); + +/** + * when taking a new snapshot, the function will update the last snapshot information and delete + * all entries up last_index - trailing (included). If the log contains no entry + * a last_index - trailing, then no entry will be deleted. + **/ +void raftLogSnapshot(RaftLog* pLog, SyncIndex index, SyncIndex trailing); + +#endif + +#endif /* TD_SYNC_RAFT_UNSTABLE_LOG_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 73015e87a1..f99fb066ae 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -19,6 +19,7 @@ #include "thash.h" #include "os.h" #include "sync.h" +#include "sync_type.h" #include "raft.h" #include "tlog.h" diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h new file mode 100644 index 0000000000..2c9f24287a --- /dev/null +++ b/source/libs/sync/inc/sync_type.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_TYPE_H +#define _TD_LIBS_SYNC_TYPE_H + +typedef int32_t SyncTime; + +typedef struct SSyncRaftUnstableLog SSyncRaftUnstableLog; + +typedef struct SSyncRaft SSyncRaft; + +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#endif + +#ifndef MAX +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + +#endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 23442803c4..42b220e642 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -16,12 +16,10 @@ #include "raft.h" #include "syncInt.h" -#ifndef MIN -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) -#endif - #define RAFT_READ_LOG_MAX_NUM 100 +static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term); + int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { SSyncNode* pNode = pRaft->pNode; SSyncServerState serverState; @@ -44,10 +42,10 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } assert(initIndex <= serverState.commitIndex); - // restore fsm state from snapshot index + 1, until commitIndex + // restore fsm state from snapshot index + 1 until commitIndex ++initIndex; - while (initIndex < serverState.commitIndex) { - limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex); + while (initIndex <= serverState.commitIndex) { + limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex + 1); if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) { return -1; @@ -62,7 +60,11 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } assert(initIndex == serverState.commitIndex); - syncInfo("restore vgid 
%d state: snapshot index:", pInfo->vgId); + pRaft->heartbeatTick = 1; + + syncRaftBecomeFollower(pRaft, 1); + + syncInfo("restore vgid %d state: snapshot index success", pInfo->vgId); return 0; } @@ -73,4 +75,9 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg) { int32_t syncRaftTick(SSyncRaft* pRaft) { return 0; +} + +static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term) { + pRaft->electionTick = taosRand() % 3 + 3; + return; } \ No newline at end of file diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/raft_progress.c new file mode 100644 index 0000000000..0f51d20531 --- /dev/null +++ b/source/libs/sync/src/raft_progress.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "raft.h" +#include "raft_unstable_log.h" +#include "raft_progress.h" +#include "sync.h" +#include "syncInt.h" + +static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state); + +static void resumeProgress(SSyncRaftProgress* progress); +static void pauseProgress(SSyncRaftProgress* progress); + +int syncRaftProgressCreate(SSyncRaft* pRaft) { + +/* + inflights->buffer = (SyncIndex*)malloc(sizeof(SyncIndex) * pRaft->maxInflightMsgs); + if (inflights->buffer == NULL) { + return RAFT_OOM; + } + inflights->size = pRaft->maxInflightMsgs; +*/ +} + +/* +int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration) { + +} +*/ + +bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + bool updated = false; + + if (progress->matchIndex < lastIndex) { + progress->matchIndex = lastIndex; + updated = true; + resumeProgress(progress); + } + if (progress->nextIndex < lastIndex + 1) { + progress->nextIndex = lastIndex + 1; + } + + return updated; +} + +void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + pRaft->leaderState.progress[i].nextIndex = nextIndex + 1; +} + +bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, + SyncIndex rejected, SyncIndex lastIndex) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + + if (progress->state == PROGRESS_REPLICATE) { + /** + * the rejection must be stale if the progress has matched and "rejected" + * is smaller than "match". 
+ **/ + if (rejected <= progress->matchIndex) { + syncDebug("match index is up to date,ignore"); + return false; + } + + /* directly decrease next to match + 1 */ + progress->nextIndex = progress->matchIndex + 1; + //syncRaftProgressBecomeProbe(raft, i); + return true; + } + + if (rejected != progress->nextIndex - 1) { + syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore" + , rejected, progress->nextIndex); + return false; + } + + progress->nextIndex = MIN(rejected, lastIndex + 1); + if (progress->nextIndex < 1) { + progress->nextIndex = 1; + } + + resumeProgress(progress); + return true; +} + +static void resumeProgress(SSyncRaftProgress* progress) { + progress->paused = false; +} + +static void pauseProgress(SSyncRaftProgress* progress) { + progress->paused = true; +} + +bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + + switch (progress->state) { + case PROGRESS_PROBE: + return progress->paused; + case PROGRESS_REPLICATE: + return syncRaftInflightFull(&progress->inflights); + case PROGRESS_SNAPSHOT: + return true; + default: + syncFatal("error sync state:%d", progress->state); + } +} + +void syncRaftProgressFailure(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + + progress->pendingSnapshotIndex = 0; +} + +bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + + return progress->state == PROGRESS_SNAPSHOT && progress->matchIndex >= progress->pendingSnapshotIndex; +} + +bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + return 
syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; +} + +void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + /** + * If the original state is ProgressStateSnapshot, progress knows that + * the pending snapshot has been sent to this peer successfully, then + * probes from pendingSnapshot + 1. + **/ + if (progress->state == PROGRESS_SNAPSHOT) { + SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; + resetProgressState(progress, PROGRESS_PROBE); + progress->nextIndex = MAX(progress->matchIndex + 1, pendingSnapshotIndex + 1); + } else { + resetProgressState(progress, PROGRESS_PROBE); + progress->nextIndex = progress->matchIndex + 1; + } +} + +void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + resetProgressState(progress, PROGRESS_REPLICATE); + progress->nextIndex = progress->matchIndex + 1; +} + +void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex) { + assert(i >= 0 && i < pRaft->leaderState.nProgress); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + resetProgressState(progress, PROGRESS_SNAPSHOT); + progress->pendingSnapshotIndex = snapshotIndex; +} + +static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) { + progress->paused = false; + progress->pendingSnapshotIndex = 0; + progress->state = state; + syncRaftInflightReset(&(progress->inflights)); +} + + +int syncRaftInflightReset(SSyncRaftInflights* inflights) { + inflights->count = 0; + inflights->start = 0; + + return 0; +} + +bool syncRaftInflightFull(SSyncRaftInflights* inflights) { + return inflights->count == inflights->size; +} + +void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) { + 
assert(!syncRaftInflightFull(inflights)); + + int next = inflights->start + inflights->count; + int size = inflights->size; + /* is next wrapped around buffer? */ + if (next >= size) { + next -= size; + } + + inflights->buffer[next] = inflightIndex; + inflights->count++; +} + +void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex) { + if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) { + return; + } + + int i, idx; + for (i = 0, idx = inflights->start; i < inflights->count; i++) { + if (toIndex < inflights->buffer[idx]) { + break; + } + + int size = inflights->size; + idx++; + if (idx >= size) { + idx -= size; + } + } + + inflights->count -= i; + inflights->start = idx; + assert(inflights->count >= 0); + if (inflights->count == 0) { + inflights->start = 0; + } +} + +void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { + syncRaftInflightFreeTo(inflights, inflights->buffer[inflights->start]); +} + + + + + +#if 0 + +SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i) { + return pRaft->leaderState.progress[i].nextIndex; +} + +SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i) { + return pRaft->leaderState.progress[i].matchIndex; +} + +void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i) { + pRaft->leaderState.progress[i].lastSend = pRaft->io.time(pRaft); +} + +void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i) { + pRaft->leaderState.progress[i].lastSendSnapshot = pRaft->io.time(pRaft); +} + +bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i) { + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + bool prev = progress->recentRecv; + progress->recentRecv = false; + return prev; +} + +void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i) { + pRaft->leaderState.progress[i].recentRecv = true; +} + +bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i) { + return pRaft->leaderState.progress[i].recentRecv; +} + +void 
syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i) { + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + resetProgressState(progress, PROGRESS_SNAPSHOT); + progress->pendingSnapshotIndex = raftLogSnapshotIndex(pRaft->log); +} + +void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + + if (progress->state == PROGRESS_SNAPSHOT) { + assert(progress->pendingSnapshotIndex > 0); + SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; + resetProgressState(progress, PROGRESS_PROBE); + progress->nextIndex = max(progress->matchIndex + 1, pendingSnapshotIndex); + } else { + resetProgressState(progress, PROGRESS_PROBE); + progress->nextIndex = progress->matchIndex + 1; + } +} + +void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) { + resetProgressState(pRaft->leaderState.progress, PROGRESS_REPLICATE); + pRaft->leaderState.progress->nextIndex = pRaft->leaderState.progress->matchIndex + 1; +} + +void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) { + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + progress->pendingSnapshotIndex = 0; + progress->state = PROGRESS_PROBE; +} + +RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) { + return pRaft->leaderState.progress[i].state; +} + + + +#endif \ No newline at end of file diff --git a/source/libs/sync/src/raft_unstable_log.c b/source/libs/sync/src/raft_unstable_log.c new file mode 100644 index 0000000000..4735242d3c --- /dev/null +++ b/source/libs/sync/src/raft_unstable_log.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "sync.h" +#include "raft_unstable_log.h" + +SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog) { + return 0; +} \ No newline at end of file From 349a6a47711dbd26dcc3d97df411b2aa2a74b185 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 3 Nov 2021 11:47:44 +0800 Subject: [PATCH 06/16] [TD-10645][raft]add raft message handle --- include/libs/sync/sync.h | 4 +- source/libs/sync/inc/raft.h | 66 +++++++++-- source/libs/sync/inc/raft_message.h | 56 ++++++--- source/libs/sync/inc/raft_progress.h | 1 + source/libs/sync/src/raft.c | 168 +++++++++++++++++++++++++-- source/libs/sync/src/raft_message.c | 4 +- source/libs/sync/src/raft_progress.c | 15 +-- source/libs/sync/src/sync.c | 2 +- 8 files changed, 271 insertions(+), 45 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index ef8773f5cc..ced9cc72fc 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -133,9 +133,9 @@ typedef struct SStateManager { int32_t (*readServerState)(struct SStateManager* stateMng, SSyncServerState* state); - // void (*saveCluster)(struct SStateManager* stateMng, const SSyncClusterConfig* cluster); + void (*saveCluster)(struct SStateManager* stateMng, const SSyncClusterConfig* cluster); - // const SSyncClusterConfig* (*readCluster)(struct SStateManager* stateMng); + const SSyncClusterConfig* (*readCluster)(struct SStateManager* stateMng); } SStateManager; typedef struct { diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 869baecdda..0e2d1769b3 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -20,6 +20,8 @@ #include "sync_type.h" #include "raft_message.h" 
+#define SYNC_NON_NODE_ID -1 + typedef struct SSyncRaftProgress SSyncRaftProgress; typedef struct RaftLeaderState { @@ -28,38 +30,84 @@ typedef struct RaftLeaderState { } RaftLeaderState; typedef struct SSyncRaftIOMethods { - SyncTime (*time)(SSyncRaft*); + } SSyncRaftIOMethods; +typedef int (*SyncRaftStepFp)(SSyncRaft* pRaft, const SSyncMessage* pMsg); +typedef void (*SyncRaftTickFp)(SSyncRaft* pRaft); + struct SSyncRaft { // owner sync node SSyncNode* pNode; SSyncInfo info; + SSyncTerm term; + SyncNodeId voteFor; + + SyncNodeId selfId; + + /** + * the leader id + **/ + SyncNodeId leaderId; + + /** + * leadTransferee is id of the leader transfer target when its value is not zero. + * Follow the procedure defined in raft thesis 3.10. + **/ + SyncNodeId leadTransferee; + + /** + * New configuration is ignored if there exists unapplied configuration. + **/ + bool pendingConf; + + ESyncRole state; + + /** + * number of ticks since it reached last electionTimeout when it is leader + * or candidate. + * number of ticks since it reached last electionTimeout or received a + * valid message from current leader when it is a follower. + **/ + uint16_t electionElapsed; + + /** + * number of ticks since it reached last heartbeatTimeout. + * only leader keeps heartbeatElapsed. 
+ **/ + uint16_t heartbeatElapsed; + // election timeout tick(random in [3:6] tick) - uint16_t electionTick; + uint16_t electionTimeoutTick; // heartbeat timeout tick(default: 1 tick) - uint16_t heartbeatTick; - - int installSnapShotTimeoutMS; - - // - int heartbeatTimeoutMS; + uint16_t heartbeatTimeoutTick; bool preVote; + bool checkQuorum; SSyncRaftIOMethods io; RaftLeaderState leaderState; SSyncRaftUnstableLog *log; + + SyncRaftStepFp stepFp; + + SyncRaftTickFp tickFp; }; int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); -int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg); +int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg); int32_t syncRaftTick(SSyncRaft* pRaft); + +void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term, SyncNodeId leaderId); +void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); +bool syncRaftIsPromotable(SSyncRaft* pRaft); +bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); + #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index faf14840c9..9e690855c7 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -28,15 +28,14 @@ typedef enum RaftMessageType { // client propose a cmd RAFT_MSG_INTERNAL_PROP = 1, - RAFT_MSG_APPEND, - RAFT_MSG_APPEND_RESP, + // node election timeout + RAFT_MSG_INTERNAL_ELECTION = 2, - RAFT_MSG_VOTE, - RAFT_MSG_VOTE_RESP, - - RAFT_MSG_PRE_VOTE, - RAFT_MSG_PRE_VOTE_RESP, + RAFT_MSG_VOTE = 3, + RAFT_MSG_VOTE_RESP = 4, + RAFT_MSG_PRE_VOTE = 5, + RAFT_MSG_PRE_VOTE_RESP = 6, } RaftMessageType; typedef struct RaftMsgInternal_Prop { @@ -45,7 +44,15 @@ typedef struct RaftMsgInternal_Prop { void* pData; } RaftMsgInternal_Prop; -typedef struct RaftMessage { +typedef struct RaftMsgInternal_Election { + +} RaftMsgInternal_Election; + +typedef struct RaftMsg_PreVoteResp { + bool reject; +} RaftMsg_PreVoteResp; + +typedef struct SSyncMessage { 
RaftMessageType msgType; SSyncTerm term; SyncNodeId from; @@ -53,12 +60,17 @@ typedef struct RaftMessage { union { RaftMsgInternal_Prop propose; - }; -} RaftMessage; -static FORCE_INLINE RaftMessage* syncInitPropMsg(RaftMessage* pMsg, const SSyncBuffer* pBuf, void* pData, bool isWeak) { - *pMsg = (RaftMessage) { + RaftMsgInternal_Election election; + + RaftMsg_PreVoteResp preVoteResp; + }; +} SSyncMessage; + +static FORCE_INLINE SSyncMessage* syncInitPropMsg(SSyncMessage* pMsg, const SSyncBuffer* pBuf, void* pData, bool isWeak) { + *pMsg = (SSyncMessage) { .msgType = RAFT_MSG_INTERNAL_PROP, + .term = 0, .propose = (RaftMsgInternal_Prop) { .isWeak = isWeak, .pBuf = pBuf, @@ -69,10 +81,24 @@ static FORCE_INLINE RaftMessage* syncInitPropMsg(RaftMessage* pMsg, const SSyncB return pMsg; } -static FORCE_INLINE bool syncIsInternalMsg(const RaftMessage* pMsg) { - return pMsg->msgType == RAFT_MSG_INTERNAL_PROP; +static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNodeId from) { + *pMsg = (SSyncMessage) { + .msgType = RAFT_MSG_INTERNAL_ELECTION, + .term = 0, + .from = from, + .election = (RaftMsgInternal_Election) { + + }, + }; + + return pMsg; } -void syncFreeMessage(const RaftMessage* pMsg); +static FORCE_INLINE bool syncIsInternalMsg(const SSyncMessage* pMsg) { + return pMsg->msgType == RAFT_MSG_INTERNAL_PROP || + pMsg->msgType == RAFT_MSG_INTERNAL_ELECTION; +} + +void syncFreeMessage(const SSyncMessage* pMsg); #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_progress.h b/source/libs/sync/inc/raft_progress.h index 73aa9db59f..159a80fa0e 100644 --- a/source/libs/sync/inc/raft_progress.h +++ b/source/libs/sync/inc/raft_progress.h @@ -148,6 +148,7 @@ void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i); void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex); +/* inflights APIs */ int syncRaftInflightReset(SSyncRaftInflights* inflights); bool 
syncRaftInflightFull(SSyncRaftInflights* inflights); void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 42b220e642..09f29cbd28 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -18,7 +18,20 @@ #define RAFT_READ_LOG_MAX_NUM 100 -static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term); +static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); + +static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); + +static void tickElection(SSyncRaft* pRaft); +static void tickHeartbeat(SSyncRaft* pRaft); + +static void abortLeaderTransfer(SSyncRaft* pRaft); + +static void resetRaft(SSyncRaft* pRaft, SSyncTerm term); int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { SSyncNode* pNode = pRaft->pNode; @@ -30,6 +43,8 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM]; int nBuf, limit, i; + memset(pRaft, 0, sizeof(SSyncRaft)); + memcpy(&pRaft->info, pInfo, sizeof(SSyncInfo)); stateManager = &(pRaft->info.stateManager); logStore = &(pRaft->info.logStore); @@ -60,15 +75,30 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } assert(initIndex == serverState.commitIndex); - pRaft->heartbeatTick = 1; + pRaft->heartbeatTimeoutTick = 1; - syncRaftBecomeFollower(pRaft, 1); + syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); syncInfo("restore vgid %d state: snapshot index success", pInfo->vgId); return 0; } -int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg) { +int32_t syncRaftStep(SSyncRaft* pRaft, const 
SSyncMessage* pMsg) { + syncDebug("from "); + if (preHandleMessage(pRaft, pMsg)) { + syncFreeMessage(pMsg); + return 0; + } + + RaftMessageType msgType = pMsg->msgType; + if (msgType == RAFT_MSG_INTERNAL_ELECTION) { + + } else if (msgType == RAFT_MSG_VOTE || msgType == RAFT_MSG_PRE_VOTE) { + + } else { + pRaft->stepFp(pRaft, pMsg); + } + syncFreeMessage(pMsg); return 0; } @@ -77,7 +107,131 @@ int32_t syncRaftTick(SSyncRaft* pRaft) { return 0; } -static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term) { - pRaft->electionTick = taosRand() % 3 + 3; - return; +void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term, SyncNodeId leaderId) { + pRaft->stepFp = stepFollower; + resetRaft(pRaft, term); + pRaft->tickFp = tickElection; + pRaft->leaderId = leaderId; + pRaft->state = TAOS_SYNC_ROLE_FOLLOWER; +} + +void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { + // electionTimeoutTick in [3,6] tick + pRaft->electionTimeoutTick = taosRand() % 4 + 3; +} + +bool syncRaftIsPromotable(SSyncRaft* pRaft) { + return pRaft->info.syncCfg.selfIndex >= 0 && + pRaft->info.syncCfg.selfIndex < pRaft->info.syncCfg.replica && + pRaft->selfId != SYNC_NON_NODE_ID; +} + +bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { + return pRaft->electionElapsed >= pRaft->electionTimeoutTick; +} + +/** + * pre-handle message, return true is no need to continue + * Handle the message term, which may result in our stepping down to a follower. + **/ +static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + // local message? 
+ if (pMsg->term == 0) { + return false; + } + + if (pMsg->term > pRaft->term) { + return preHandleNewTermMessage(pRaft, pMsg); + } + + return preHandleOldTermMessage(pRaft, pMsg);; +} + +static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + SyncNodeId leaderId = pMsg->from; + RaftMessageType msgType = pMsg->msgType; + + if (msgType == RAFT_MSG_VOTE || msgType == RAFT_MSG_PRE_VOTE) { + leaderId = SYNC_NON_NODE_ID; + } + + if (msgType == RAFT_MSG_PRE_VOTE) { + // Never change our term in response to a PreVote + } else if (msgType == RAFT_MSG_PRE_VOTE_RESP && !pMsg->preVoteResp.reject) { + /** + * We send pre-vote requests with a term in our future. If the + * pre-vote is granted, we will increment our term when we get a + * quorum. If it is not, the term comes from the node that + * rejected our vote so we should become a follower at the new + * term. + **/ + } else { + syncRaftBecomeFollower(pRaft, pMsg->term, leaderId); + } + + return false; +} + +static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + + // if receive old term message, no need to continue + return true; +} + +static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + return 0; +} + +static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + return 0; +} + +static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + return 0; +} + +/** + * tickElection is run by followers and candidates per tick. 
+ **/ +static void tickElection(SSyncRaft* pRaft) { + pRaft->electionElapsed += 1; + + if (!syncRaftIsPromotable(pRaft)) { + return; + } + + if (!syncRaftIsPastElectionTimeout(pRaft)) { + return; + } + + // election timeout + pRaft->electionElapsed = 0; + SSyncMessage msg; + syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId)); +} + +static void tickHeartbeat(SSyncRaft* pRaft) { + +} + +static void abortLeaderTransfer(SSyncRaft* pRaft) { + pRaft->leadTransferee = SYNC_NON_NODE_ID; +} + +static void resetRaft(SSyncRaft* pRaft, SSyncTerm term) { + if (pRaft->term != term) { + pRaft->term = term; + pRaft->voteFor = SYNC_NON_NODE_ID; + } + + pRaft->leaderId = SYNC_NON_NODE_ID; + + pRaft->electionElapsed = 0; + pRaft->heartbeatElapsed = 0; + + syncRaftRandomizedElectionTimeout(pRaft); + + abortLeaderTransfer(pRaft); + + pRaft->pendingConf = false; } \ No newline at end of file diff --git a/source/libs/sync/src/raft_message.c b/source/libs/sync/src/raft_message.c index 912314daf2..d17a5b732b 100644 --- a/source/libs/sync/src/raft_message.c +++ b/source/libs/sync/src/raft_message.c @@ -15,8 +15,8 @@ #include "raft_message.h" -void syncFreeMessage(const RaftMessage* pMsg) { +void syncFreeMessage(const SSyncMessage* pMsg) { if (!syncIsInternalMsg(pMsg)) { - free((RaftMessage*)pMsg); + free((SSyncMessage*)pMsg); } } \ No newline at end of file diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/raft_progress.c index 0f51d20531..ba09973f48 100644 --- a/source/libs/sync/src/raft_progress.c +++ b/source/libs/sync/src/raft_progress.c @@ -177,14 +177,6 @@ void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotI progress->pendingSnapshotIndex = snapshotIndex; } -static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) { - progress->paused = false; - progress->pendingSnapshotIndex = 0; - progress->state = state; - syncRaftInflightReset(&(progress->inflights)); -} - - int 
syncRaftInflightReset(SSyncRaftInflights* inflights) { inflights->count = 0; inflights->start = 0; @@ -240,7 +232,12 @@ void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { syncRaftInflightFreeTo(inflights, inflights->buffer[inflights->start]); } - +static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) { + progress->paused = false; + progress->pendingSnapshotIndex = 0; + progress->state = state; + syncRaftInflightReset(&(progress->inflights)); +} diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index e3d0606c08..9077be3f2d 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -157,7 +157,7 @@ void syncStop(const SSyncNode* pNode) { } int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, bool isWeak) { - RaftMessage msg; + SSyncMessage msg; pthread_mutex_lock(&syncNode->mutex); int32_t ret = syncRaftStep(&syncNode->raft, syncInitPropMsg(&msg, pBuf, pData, isWeak)); From aab981f667f930117088f1e17c65d22d1488ed32 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 3 Nov 2021 15:30:54 +0800 Subject: [PATCH 07/16] [TD-10645][raft]add raft election message handle --- include/libs/sync/sync.h | 5 +- source/libs/sync/inc/raft.h | 16 ++++- source/libs/sync/inc/raft_message.h | 17 +++-- source/libs/sync/inc/raft_unstable_log.h | 6 +- source/libs/sync/src/raft.c | 62 ++++++++++++++-- .../sync/src/raft_handle_election_message.c | 71 +++++++++++++++++++ source/libs/sync/src/raft_message.c | 2 +- 7 files changed, 162 insertions(+), 17 deletions(-) create mode 100644 source/libs/sync/src/raft_handle_election_message.c diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index ced9cc72fc..b938bbba77 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -26,12 +26,13 @@ extern "C" { typedef int32_t SyncNodeId; typedef int32_t SyncGroupId; typedef int64_t SyncIndex; -typedef uint64_t SSyncTerm; +typedef uint64_t SyncTerm; typedef enum { 
TAOS_SYNC_ROLE_FOLLOWER = 0, TAOS_SYNC_ROLE_CANDIDATE = 1, TAOS_SYNC_ROLE_LEADER = 2, + TAOS_SYNC_ROLE_PRE_CANDIDATE = 3, } ESyncRole; typedef struct { @@ -111,7 +112,7 @@ typedef struct SSyncLogStore { typedef struct SSyncServerState { SyncNodeId voteFor; - SSyncTerm term; + SyncTerm term; SyncIndex commitIndex; } SSyncServerState; diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 0e2d1769b3..702fcd00cf 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -43,10 +43,11 @@ struct SSyncRaft { SSyncInfo info; - SSyncTerm term; + SyncTerm term; SyncNodeId voteFor; SyncNodeId selfId; + SyncGroupId selfGroupId; /** * the leader id @@ -100,14 +101,25 @@ struct SSyncRaft { SyncRaftTickFp tickFp; }; +typedef enum { + SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, + SYNC_RAFT_CAMPAIGN_ELECTION = 1, +} SyncRaftCampaignType; + int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg); int32_t syncRaftTick(SSyncRaft* pRaft); -void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term, SyncNodeId leaderId); +void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId); +void syncRaftBecomePreCandidate(SSyncRaft* pRaft); +void syncRaftBecomeCandidate(SSyncRaft* pRaft); +void syncRaftBecomeLeader(SSyncRaft* pRaft); + void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); bool syncRaftIsPromotable(SSyncRaft* pRaft); bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); +int syncRaftQuorum(SSyncRaft* pRaft); +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, RaftMessageType msgType, bool accept); #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index 9e690855c7..71fe37bebd 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -17,6 +17,7 @@ #define _TD_LIBS_SYNC_RAFT_MESSAGE_H #include "sync.h" 
+#include "sync_type.h" /** * below define message type which handled by Raft node thread @@ -54,7 +55,7 @@ typedef struct RaftMsg_PreVoteResp { typedef struct SSyncMessage { RaftMessageType msgType; - SSyncTerm term; + SyncTerm term; SyncNodeId from; SyncNodeId to; @@ -94,11 +95,19 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo return pMsg; } -static FORCE_INLINE bool syncIsInternalMsg(const SSyncMessage* pMsg) { - return pMsg->msgType == RAFT_MSG_INTERNAL_PROP || - pMsg->msgType == RAFT_MSG_INTERNAL_ELECTION; +static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { + return msgType == RAFT_MSG_INTERNAL_PROP || + msgType == RAFT_MSG_INTERNAL_ELECTION; +} + +static FORCE_INLINE RaftMessageType SyncRaftVoteRespMsgType(RaftMessageType msgType) { + if (msgType == RAFT_MSG_VOTE) return RAFT_MSG_PRE_VOTE_RESP; + return RAFT_MSG_PRE_VOTE_RESP; } void syncFreeMessage(const SSyncMessage* pMsg); +// message handlers +void syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); + #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_unstable_log.h b/source/libs/sync/inc/raft_unstable_log.h index 2b7b30c15a..0c9957cb90 100644 --- a/source/libs/sync/inc/raft_unstable_log.h +++ b/source/libs/sync/inc/raft_unstable_log.h @@ -67,13 +67,13 @@ int raftLogNumEntries(const RaftLog* pLog); /** * return last term of in memory log, return 0 if log is empty **/ -SSyncTerm raftLogLastTerm(RaftLog* pLog); +SyncTerm raftLogLastTerm(RaftLog* pLog); /** * return term of log with the given index, return 0 if the term of index cannot be found * , errCode will save the error code. **/ -SSyncTerm raftLogTermOf(RaftLog* pLog, SyncIndex index, RaftCode* errCode); +SyncTerm raftLogTermOf(RaftLog* pLog, SyncIndex index, RaftCode* errCode); /** * Get the last index of the most recent snapshot. 
Return 0 if there are no * @@ -83,7 +83,7 @@ SyncIndex raftLogSnapshotIndex(RaftLog* pLog); /* Append a new entry to the log. */ int raftLogAppend(RaftLog* pLog, - SSyncTerm term, + SyncTerm term, const SSyncBuffer *buf); /** diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 09f29cbd28..87750eca9e 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -31,7 +31,7 @@ static void tickHeartbeat(SSyncRaft* pRaft); static void abortLeaderTransfer(SSyncRaft* pRaft); -static void resetRaft(SSyncRaft* pRaft, SSyncTerm term); +static void resetRaft(SSyncRaft* pRaft, SyncTerm term); int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { SSyncNode* pNode = pRaft->pNode; @@ -84,7 +84,9 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - syncDebug("from "); + syncDebug("from %d, to %d, type:%d, term:%" PRId64 ", state:%d", + pMsg->from, pMsg->to, pMsg->msgType, pMsg->term, pRaft->state); + if (preHandleMessage(pRaft, pMsg)) { syncFreeMessage(pMsg); return 0; @@ -92,7 +94,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { RaftMessageType msgType = pMsg->msgType; if (msgType == RAFT_MSG_INTERNAL_ELECTION) { - + syncRaftHandleElectionMessage(pRaft, pMsg); } else if (msgType == RAFT_MSG_VOTE || msgType == RAFT_MSG_PRE_VOTE) { } else { @@ -107,7 +109,7 @@ int32_t syncRaftTick(SSyncRaft* pRaft) { return 0; } -void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term, SyncNodeId leaderId) { +void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) { pRaft->stepFp = stepFollower; resetRaft(pRaft, term); pRaft->tickFp = tickElection; @@ -115,6 +117,40 @@ void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term, SyncNodeId leaderI pRaft->state = TAOS_SYNC_ROLE_FOLLOWER; } +void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { + /** + * Becoming a pre-candidate changes our step functions and 
state, + * but doesn't change anything else. In particular it does not increase + * r.Term or change r.Vote. + **/ + pRaft->stepFp = stepCandidate; + pRaft->tickFp = tickElection; + pRaft->state = TAOS_SYNC_ROLE_PRE_CANDIDATE; + syncInfo("[%d:%d] became pre-candidate at term %d" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); +} + +void syncRaftBecomeCandidate(SSyncRaft* pRaft) { + pRaft->stepFp = stepCandidate; + // become candidate make term+1 + resetRaft(pRaft, pRaft->term + 1); + pRaft->tickFp = tickElection; + pRaft->voteFor = pRaft->selfId; + pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; + syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); +} + +void syncRaftBecomeLeader(SSyncRaft* pRaft) { + assert(pRaft->state != TAOS_SYNC_ROLE_FOLLOWER); + + pRaft->stepFp = stepLeader; + resetRaft(pRaft, pRaft->term); + pRaft->leaderId = pRaft->selfId; /* resetRaft() cleared leaderId; the new leader is this node */ + pRaft->state = TAOS_SYNC_ROLE_LEADER; + // TODO: check if there is pending config log + + syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); +} + void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { // electionTimeoutTick in [3,6] tick pRaft->electionTimeoutTick = taosRand() % 4 + 3; @@ -130,6 +166,20 @@ bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { return pRaft->electionElapsed >= pRaft->electionTimeoutTick; } +int syncRaftQuorum(SSyncRaft* pRaft) { + return pRaft->leaderState.nProgress / 2 + 1; +} + +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, RaftMessageType msgType, bool accept) { + if (accept) { + + } else { + + } + + +} + /** * pre-handle message, return true is no need to continue * Handle the message term, which may result in our stepping down to a follower. 
**/ } else { + syncInfo("%d [term:%" PRId64 "] received a %d message with higher term from %d [term:%" PRId64 "]", + pRaft->selfId, pRaft->term, msgType, pMsg->from, pMsg->term); syncRaftBecomeFollower(pRaft, pMsg->term, leaderId); } @@ -218,7 +270,7 @@ static void abortLeaderTransfer(SSyncRaft* pRaft) { pRaft->leadTransferee = SYNC_NON_NODE_ID; } -static void resetRaft(SSyncRaft* pRaft, SSyncTerm term) { +static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { if (pRaft->term != term) { pRaft->term = term; pRaft->voteFor = SYNC_NON_NODE_ID; diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c new file mode 100644 index 0000000000..2586cd918d --- /dev/null +++ b/source/libs/sync/src/raft_handle_election_message.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "syncInt.h" +#include "raft.h" +#include "raft_message.h" + +static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType); + +void syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + if (pRaft->state == TAOS_SYNC_ROLE_LEADER) { + syncDebug("%d ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfId); + return; + } + + // TODO: is there pending uncommitted config? 
+ + syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + + if (pRaft->preVote) { + + } else { + + } +} + +static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType) { + SyncTerm term; + RaftMessageType voteMsgType; + + if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + syncRaftBecomePreCandidate(pRaft); + voteMsgType = RAFT_MSG_PRE_VOTE; + // PreVote RPCs are sent for the next term before we've incremented r.Term. + term = pRaft->term + 1; + } else { + syncRaftBecomeCandidate(pRaft); + voteMsgType = RAFT_MSG_VOTE; + term = pRaft->term; + } + + int quorum = syncRaftQuorum(pRaft); + int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, SyncRaftVoteRespMsgType(voteMsgType), true); + if (quorum <= granted) { + /** + * We won the election after voting for ourselves (which must mean that + * this is a single-node cluster). Advance to the next state. + **/ + if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + campaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); + } else { + syncRaftBecomeLeader(pRaft); + } + return; + } + + // broadcast vote message to other peers + +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_message.c b/source/libs/sync/src/raft_message.c index d17a5b732b..e706127f29 100644 --- a/source/libs/sync/src/raft_message.c +++ b/source/libs/sync/src/raft_message.c @@ -16,7 +16,7 @@ #include "raft_message.h" void syncFreeMessage(const SSyncMessage* pMsg) { - if (!syncIsInternalMsg(pMsg)) { + if (!syncIsInternalMsg(pMsg->msgType)) { free((SSyncMessage*)pMsg); } } \ No newline at end of file From 446b14f315536822ad314cd661939acd5a236a51 Mon Sep 17 00:00:00 2001 From: lichuang Date: Thu, 4 Nov 2021 09:53:52 +0800 Subject: [PATCH 08/16] [TD-10645][raft]replace SRpcEpSet to SEpSet --- source/libs/sync/src/sync.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index 9077be3f2d..fa35917668 100644 
--- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -23,8 +23,8 @@ SSyncManager* gSyncManager = NULL; #define SYNC_ACTIVITY_TIMER 5 #define SYNC_SERVER_WORKER 2 -static void syncProcessRsp(SRpcMsg *pMsg, SRpcEpSet *pEpSet); -static void syncProcessReqMsg(SRpcMsg *pMsg, SRpcEpSet *pEpSet); +static void syncProcessRsp(SRpcMsg *pMsg, SEpSet *pEpSet); +static void syncProcessReqMsg(SRpcMsg *pMsg, SEpSet *pEpSet); static int syncInitRpcServer(SSyncManager* syncManager, const SSyncCluster* pSyncCfg); static int syncInitRpcClient(SSyncManager* syncManager); @@ -168,12 +168,12 @@ int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, b void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} // process rpc rsp message from other sync server -static void syncProcessRsp(SRpcMsg *pMsg, SRpcEpSet *pEpSet) { +static void syncProcessRsp(SRpcMsg *pMsg, SEpSet *pEpSet) { } // process rpc message from other sync server -static void syncProcessReqMsg(SRpcMsg *pMsg, SRpcEpSet *pEpSet) { +static void syncProcessReqMsg(SRpcMsg *pMsg, SEpSet *pEpSet) { } From e05e6dba9aaa3448fb546eef7b0d34387ca258e6 Mon Sep 17 00:00:00 2001 From: lichuang Date: Thu, 4 Nov 2021 12:39:45 +0800 Subject: [PATCH 09/16] [TD-10645][raft]add raft election message handle --- include/libs/sync/sync.h | 1 - source/libs/sync/inc/raft.h | 35 ++++++---- source/libs/sync/inc/raft_configuration.h | 26 +++++++ source/libs/sync/inc/raft_log.h | 42 ++++++++++++ source/libs/sync/inc/raft_message.h | 49 ++++++++++--- source/libs/sync/inc/raft_unstable_log.h | 2 +- source/libs/sync/inc/sync_type.h | 10 ++- source/libs/sync/src/raft.c | 68 +++++++++++++------ source/libs/sync/src/raft_configuration.c | 25 +++++++ .../sync/src/raft_handle_election_message.c | 37 ++++++++-- source/libs/sync/src/raft_log.c | 36 ++++++++++ source/libs/sync/src/raft_unstable_log.c | 4 +- 12 files changed, 285 insertions(+), 50 deletions(-) create mode 100644 
source/libs/sync/inc/raft_configuration.h create mode 100644 source/libs/sync/inc/raft_log.h create mode 100644 source/libs/sync/src/raft_configuration.c create mode 100644 source/libs/sync/src/raft_log.c diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index b938bbba77..726fbc0621 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -32,7 +32,6 @@ typedef enum { TAOS_SYNC_ROLE_FOLLOWER = 0, TAOS_SYNC_ROLE_CANDIDATE = 1, TAOS_SYNC_ROLE_LEADER = 2, - TAOS_SYNC_ROLE_PRE_CANDIDATE = 3, } ESyncRole; typedef struct { diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 702fcd00cf..44ee6a3b69 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -29,9 +29,16 @@ typedef struct RaftLeaderState { SSyncRaftProgress* progress; } RaftLeaderState; +typedef struct RaftCandidateState { + /* votes results */ + bool votes[TSDB_MAX_REPLICA]; + + /* true if in pre-vote phase */ + bool inPreVote; +} RaftCandidateState; + typedef struct SSyncRaftIOMethods { - - + int (*send)(const SSyncMessage* pMsg, const SNodeInfo* pNode); } SSyncRaftIOMethods; typedef int (*SyncRaftStepFp)(SSyncRaft* pRaft, const SSyncMessage* pMsg); @@ -41,7 +48,10 @@ struct SSyncRaft { // owner sync node SSyncNode* pNode; - SSyncInfo info; + //SSyncInfo info; + SSyncFSM fsm; + SSyncLogStore logStore; + SStateManager stateManager; SyncTerm term; SyncNodeId voteFor; @@ -65,6 +75,8 @@ struct SSyncRaft { **/ bool pendingConf; + SSyncCluster cluster; + ESyncRole state; /** @@ -92,25 +104,22 @@ struct SSyncRaft { SSyncRaftIOMethods io; - RaftLeaderState leaderState; - - SSyncRaftUnstableLog *log; + union { + RaftLeaderState leaderState; + RaftCandidateState candidateState; + }; + + SSyncRaftLog *log; SyncRaftStepFp stepFp; SyncRaftTickFp tickFp; }; -typedef enum { - SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, - SYNC_RAFT_CAMPAIGN_ELECTION = 1, -} SyncRaftCampaignType; - int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); int32_t 
syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg); int32_t syncRaftTick(SSyncRaft* pRaft); - void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId); void syncRaftBecomePreCandidate(SSyncRaft* pRaft); void syncRaftBecomeCandidate(SSyncRaft* pRaft); @@ -120,6 +129,6 @@ void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); bool syncRaftIsPromotable(SSyncRaft* pRaft); bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); int syncRaftQuorum(SSyncRaft* pRaft); -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, RaftMessageType msgType, bool accept); +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept); #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_configuration.h b/source/libs/sync/inc/raft_configuration.h new file mode 100644 index 0000000000..ed0cc33115 --- /dev/null +++ b/source/libs/sync/inc/raft_configuration.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#ifndef _TD_LIBS_SYNC_RAFT_CONFIGURATION_H +#define _TD_LIBS_SYNC_RAFT_CONFIGURATION_H + +#include "sync.h" +#include "sync_type.h" + +int syncRaftConfigurationIndexOfVoter(SSyncRaft *pRaft, SyncNodeId id); + +int syncRaftConfigurationVoterCount(SSyncRaft *pRaft); + +#endif /* _TD_LIBS_SYNC_RAFT_CONFIGURATION_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h new file mode 100644 index 0000000000..7ffb946c82 --- /dev/null +++ b/source/libs/sync/inc/raft_log.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#ifndef _TD_LIBS_SYNC_RAFT_LOG_H +#define _TD_LIBS_SYNC_RAFT_LOG_H + +#include "sync.h" +#include "sync_type.h" + +struct SSyncRaftLog { + SyncIndex uncommittedConfigIndex; + + SyncIndex commitIndex; + + SyncIndex appliedIndex; + + +}; + +SSyncRaftLog* syncRaftLogOpen(); + +SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog); + +SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog); + +int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog); + +bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog); + +#endif /* _TD_LIBS_SYNC_RAFT_LOG_H */ diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index 71fe37bebd..d4736d6169 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -35,8 +35,7 @@ typedef enum RaftMessageType { RAFT_MSG_VOTE = 3, RAFT_MSG_VOTE_RESP = 4, - RAFT_MSG_PRE_VOTE = 5, - RAFT_MSG_PRE_VOTE_RESP = 6, + } RaftMessageType; typedef struct RaftMsgInternal_Prop { @@ -49,13 +48,21 @@ typedef struct RaftMsgInternal_Election { } RaftMsgInternal_Election; -typedef struct RaftMsg_PreVoteResp { +typedef struct RaftMsg_Vote { + SyncRaftCampaignType cType; + SyncIndex lastIndex; + SyncTerm lastTerm; +} RaftMsg_Vote; + +typedef struct RaftMsg_VoteResp { bool reject; -} RaftMsg_PreVoteResp; + SyncRaftCampaignType cType; +} RaftMsg_VoteResp; typedef struct SSyncMessage { RaftMessageType msgType; SyncTerm term; + SyncGroupId groupId; SyncNodeId from; SyncNodeId to; @@ -64,7 +71,8 @@ typedef struct SSyncMessage { RaftMsgInternal_Election election; - RaftMsg_PreVoteResp preVoteResp; + RaftMsg_Vote vote; + RaftMsg_VoteResp voteResp; }; } SSyncMessage; @@ -95,14 +103,39 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo return pMsg; } +static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, SyncNodeId to, + SyncTerm term, SyncRaftCampaignType cType, + SyncIndex lastIndex, SyncTerm lastTerm) { + SSyncMessage* pMsg = 
(SSyncMessage*)malloc(sizeof(SSyncMessage)); + if (pMsg == NULL) { + return NULL; + } + *pMsg = (SSyncMessage) { + .groupId = groupId, + .from = from, + .to = to, + .term = term, + .vote = (RaftMsg_Vote) { + .cType = cType, + .lastIndex = lastIndex, + .lastTerm = lastTerm, + }, + }; + + return pMsg; +} + static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { return msgType == RAFT_MSG_INTERNAL_PROP || msgType == RAFT_MSG_INTERNAL_ELECTION; } -static FORCE_INLINE RaftMessageType SyncRaftVoteRespMsgType(RaftMessageType msgType) { - if (msgType == RAFT_MSG_VOTE) return RAFT_MSG_PRE_VOTE_RESP; - return RAFT_MSG_PRE_VOTE_RESP; +static FORCE_INLINE bool syncIsPreVoteRespMsg(const SSyncMessage* pMsg) { + return pMsg->msgType == RAFT_MSG_VOTE_RESP && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; +} + +static FORCE_INLINE bool syncIsPreVoteMsg(const SSyncMessage* pMsg) { + return pMsg->msgType == RAFT_MSG_VOTE && pMsg->vote.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; /* vote requests carry cType in .vote, not .voteResp */ } void syncFreeMessage(const SSyncMessage* pMsg); diff --git a/source/libs/sync/inc/raft_unstable_log.h b/source/libs/sync/inc/raft_unstable_log.h index 0c9957cb90..0748a425a1 100644 --- a/source/libs/sync/inc/raft_unstable_log.h +++ b/source/libs/sync/inc/raft_unstable_log.h @@ -41,7 +41,7 @@ struct SSyncRaftUnstableLog { /** * return index of last in memory log, return 0 if log is empty **/ -SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog); +//SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog); #if 0 void raftLogInit(RaftLog* pLog); diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index 2c9f24287a..4343e607cb 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -18,10 +18,10 @@ typedef int32_t SyncTime; -typedef struct SSyncRaftUnstableLog SSyncRaftUnstableLog; - typedef struct SSyncRaft SSyncRaft; +typedef struct SSyncRaftLog SSyncRaftLog; + #ifndef MIN #define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) #endif @@ -30,4 +30,10 @@ typedef struct SSyncRaft SSyncRaft; #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #endif +typedef enum { + SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, + SYNC_RAFT_CAMPAIGN_ELECTION = 1, + SYNC_RAFT_CAMPAIGN_TRANSFER = 3, +} SyncRaftCampaignType; + #endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 87750eca9e..a6e013758e 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -14,6 +14,7 @@ */ #include "raft.h" +#include "raft_configuration.h" #include "syncInt.h" #define RAFT_READ_LOG_MAX_NUM 100 @@ -22,6 +23,7 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static int convertClear(SSyncRaft* pRaft); static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); @@ -45,11 +47,18 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { memset(pRaft, 0, sizeof(SSyncRaft)); - memcpy(&pRaft->info, pInfo, sizeof(SSyncInfo)); - stateManager = &(pRaft->info.stateManager); - logStore = &(pRaft->info.logStore); - fsm = &(pRaft->info.fsm); + memcpy(&pRaft->fsm, &pInfo->fsm, sizeof(SSyncFSM)); + memcpy(&pRaft->logStore, &pInfo->logStore, sizeof(SSyncLogStore)); + memcpy(&pRaft->stateManager, &pInfo->stateManager, sizeof(SStateManager)); + stateManager = &(pRaft->stateManager); + logStore = &(pRaft->logStore); + fsm = &(pRaft->fsm); + + // open raft log + if ((pRaft->log = syncRaftLogOpen()) == NULL) { + return -1; + } // read server state if (stateManager->readServerState(stateManager, &serverState) != 0) { syncError("readServerState for vgid %d fail", pInfo->vgId); @@ -79,7 +88,8 @@ int32_t syncRaftStart(SSyncRaft* pRaft, 
const SSyncInfo* pInfo) { syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); - syncInfo("restore vgid %d state: snapshot index success", pInfo->vgId); + syncInfo("[%d:%d] restore vgid %d state: snapshot index success", + pRaft->selfGroupId, pRaft->selfId, pInfo->vgId); return 0; } @@ -95,7 +105,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { RaftMessageType msgType = pMsg->msgType; if (msgType == RAFT_MSG_INTERNAL_ELECTION) { syncRaftHandleElectionMessage(pRaft, pMsg); - } else if (msgType == RAFT_MSG_VOTE || msgType == RAFT_MSG_PRE_VOTE) { + } else if (msgType == RAFT_MSG_VOTE) { } else { pRaft->stepFp(pRaft, pMsg); @@ -125,11 +135,13 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { **/ pRaft->stepFp = stepCandidate; pRaft->tickFp = tickElection; - pRaft->state = TAOS_SYNC_ROLE_PRE_CANDIDATE; + pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; + pRaft->candidateState.inPreVote = true; syncInfo("[%d:%d] became pre-candidate at term %d" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); } void syncRaftBecomeCandidate(SSyncRaft* pRaft) { + pRaft->candidateState.inPreVote = false; pRaft->stepFp = stepCandidate; // become candidate make term+1 resetRaft(pRaft, pRaft->term + 1); @@ -157,9 +169,7 @@ void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { } bool syncRaftIsPromotable(SSyncRaft* pRaft) { - return pRaft->info.syncCfg.selfIndex >= 0 && - pRaft->info.syncCfg.selfIndex < pRaft->info.syncCfg.replica && - pRaft->selfId != SYNC_NON_NODE_ID; + return pRaft->selfId != SYNC_NON_NODE_ID; } bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { @@ -167,17 +177,29 @@ bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { } int syncRaftQuorum(SSyncRaft* pRaft) { - return pRaft->leaderState.nProgress / 2 + 1; + return pRaft->cluster.replica / 2 + 1; } -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, RaftMessageType msgType, bool accept) { +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool 
accept) { if (accept) { - + syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "", + pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); } else { - + syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "", + pRaft->selfGroupId, pRaft->selfId, id, pRaft->term); } + int voteIndex = syncRaftConfigurationIndexOfVoter(pRaft, id); + assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0); + pRaft->candidateState.votes[voteIndex] = accept; + int granted = 0; + int i; + for (i = 0; i < pRaft->cluster.replica; ++i) { + if (pRaft->candidateState.votes[i]) granted++; + } + + return granted; } /** @@ -201,13 +223,13 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) SyncNodeId leaderId = pMsg->from; RaftMessageType msgType = pMsg->msgType; - if (msgType == RAFT_MSG_VOTE || msgType == RAFT_MSG_PRE_VOTE) { + if (msgType == RAFT_MSG_VOTE) { leaderId = SYNC_NON_NODE_ID; } - if (msgType == RAFT_MSG_PRE_VOTE) { + if (syncIsPreVoteMsg(pMsg)) { // Never change our term in response to a PreVote - } else if (msgType == RAFT_MSG_PRE_VOTE_RESP && !pMsg->preVoteResp.reject) { + } else if (syncIsPreVoteRespMsg(pMsg) && !pMsg->voteResp.reject) { /** * We send pre-vote requests with a term in our future. If the * pre-vote is granted, we will increment our term when we get a @@ -216,8 +238,8 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) * term. 
**/ } else { - syncInfo("%d [term:%" PRId64 "] received a %d message with higher term from %d [term:%" PRId64 "]", - pRaft->selfId, pRaft->term, msgType, pMsg->from, pMsg->term); + syncInfo("[%d:%d] [term:%" PRId64 "] received a %d message with higher term from %d [term:%" PRId64 "]", + pRaft->selfGroupId, pRaft->selfId, pRaft->term, msgType, pMsg->from, pMsg->term); syncRaftBecomeFollower(pRaft, pMsg->term, leaderId); } @@ -230,15 +252,23 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) return true; } +static int convertClear(SSyncRaft* pRaft) { + +} + static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + convertClear(pRaft); return 0; } static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + convertClear(pRaft); + memset(pRaft->candidateState.votes, 0, sizeof(bool) * TSDB_MAX_REPLICA); return 0; } static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + convertClear(pRaft); return 0; } diff --git a/source/libs/sync/src/raft_configuration.c b/source/libs/sync/src/raft_configuration.c new file mode 100644 index 0000000000..6f3a27e7c0 --- /dev/null +++ b/source/libs/sync/src/raft_configuration.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "raft_configuration.h" +#include "raft.h" + +int syncRaftConfigurationIndexOfVoter(SSyncRaft *pRaft, SyncNodeId id) { + return (int)(id); +} + +int syncRaftConfigurationVoterCount(SSyncRaft *pRaft) { + return pRaft->cluster.replica; +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c index 2586cd918d..0d2004dec2 100644 --- a/source/libs/sync/src/raft_handle_election_message.c +++ b/source/libs/sync/src/raft_handle_election_message.c @@ -25,34 +25,41 @@ void syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { return; } - // TODO: is there pending uncommitted config? + // if there is pending uncommitted config,cannot campaign + if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { + syncWarn("[%d:%d] cannot campaign at term %" PRId64 " since there are still pending configuration changes to apply", + pRaft->selfGroupId, pRaft->selfId, pRaft->term); + return; + } syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); if (pRaft->preVote) { - + campaign(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION); } else { - + campaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } } static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType) { SyncTerm term; + bool preVote; RaftMessageType voteMsgType; if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { syncRaftBecomePreCandidate(pRaft); - voteMsgType = RAFT_MSG_PRE_VOTE; + preVote = true; // PreVote RPCs are sent for the next term before we've incremented r.Term. 
term = pRaft->term + 1; } else { syncRaftBecomeCandidate(pRaft); voteMsgType = RAFT_MSG_VOTE; term = pRaft->term; + preVote = false; } int quorum = syncRaftQuorum(pRaft); - int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, SyncRaftVoteRespMsgType(voteMsgType), true); + int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, preVote, true); if (quorum <= granted) { /** * We won the election after voting for ourselves (which must mean that @@ -67,5 +74,25 @@ static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType) { } // broadcast vote message to other peers + int i; + SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); + SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); + for (i = 0; i < pRaft->cluster.replica; ++i) { + if (i == pRaft->cluster.selfIndex) { + continue; + } + SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; + + SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, nodeId, term, cType, lastIndex, lastTerm); + if (pMsg == NULL) { + continue; + } + + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %d] sent %d request to %d at term %" PRId64 "", + pRaft->selfGroupId, pRaft->selfId, lastTerm, + lastIndex, voteMsgType, nodeId, pRaft->term); + + pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i])); + } } \ No newline at end of file diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c new file mode 100644 index 0000000000..46c4e4b304 --- /dev/null +++ b/source/libs/sync/src/raft_log.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "raft_log.h" + +SSyncRaftLog* syncRaftLogOpen() { + return NULL; +} + +SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog) { + return 0; +} + +SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) { + return 0; +} + +int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog) { + return 0; +} + +bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog) { + return pLog->commitIndex > pLog->appliedIndex; +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_unstable_log.c b/source/libs/sync/src/raft_unstable_log.c index 4735242d3c..e798e20662 100644 --- a/source/libs/sync/src/raft_unstable_log.c +++ b/source/libs/sync/src/raft_unstable_log.c @@ -16,6 +16,8 @@ #include "sync.h" #include "raft_unstable_log.h" +/* SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog) { return 0; -} \ No newline at end of file +} +*/ \ No newline at end of file From c25d174fc2499dcc39c17f6d6789c29ba0bf4204 Mon Sep 17 00:00:00 2001 From: lichuang Date: Thu, 4 Nov 2021 14:56:21 +0800 Subject: [PATCH 10/16] [TD-10645][raft]add raft vote resp message handle --- source/libs/sync/inc/raft.h | 9 ++- source/libs/sync/inc/raft_configuration.h | 1 + source/libs/sync/inc/raft_message.h | 13 ++-- source/libs/sync/inc/sync_type.h | 8 +- source/libs/sync/src/raft.c | 48 +++++++++--- source/libs/sync/src/raft_election.c | 75 +++++++++++++++++++ .../sync/src/raft_handle_election_message.c | 75 +++---------------- .../sync/src/raft_handle_vote_resp_message.c | 57 ++++++++++++++ source/libs/sync/src/raft_progress.c | 2 +- 9 files changed, 203 insertions(+), 85 deletions(-) create mode 100644 source/libs/sync/src/raft_election.c create mode 100644 source/libs/sync/src/raft_handle_vote_resp_message.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 44ee6a3b69..cba9434414 100644 --- a/source/libs/sync/inc/raft.h +++ 
b/source/libs/sync/inc/raft.h @@ -31,7 +31,7 @@ typedef struct RaftLeaderState { typedef struct RaftCandidateState { /* votes results */ - bool votes[TSDB_MAX_REPLICA]; + SyncRaftVoteRespType votes[TSDB_MAX_REPLICA]; /* true if in pre-vote phase */ bool inPreVote; @@ -125,10 +125,15 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft); void syncRaftBecomeCandidate(SSyncRaft* pRaft); void syncRaftBecomeLeader(SSyncRaft* pRaft); +void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType); + +void syncRaftTriggerReplicate(SSyncRaft* pRaft); + void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); bool syncRaftIsPromotable(SSyncRaft* pRaft); bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); int syncRaftQuorum(SSyncRaft* pRaft); -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept); +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, + bool preVote, bool accept, int* rejectNum); #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_configuration.h b/source/libs/sync/inc/raft_configuration.h index ed0cc33115..993f863f33 100644 --- a/source/libs/sync/inc/raft_configuration.h +++ b/source/libs/sync/inc/raft_configuration.h @@ -19,6 +19,7 @@ #include "sync.h" #include "sync_type.h" +// return -1 if cannot find this id int syncRaftConfigurationIndexOfVoter(SSyncRaft *pRaft, SyncNodeId id); int syncRaftConfigurationVoterCount(SSyncRaft *pRaft); diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index d4736d6169..da2e3bc52f 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -35,7 +35,7 @@ typedef enum RaftMessageType { RAFT_MSG_VOTE = 3, RAFT_MSG_VOTE_RESP = 4, - + RAFT_MSG_APPEND = 5, } RaftMessageType; typedef struct RaftMsgInternal_Prop { @@ -49,14 +49,14 @@ typedef struct RaftMsgInternal_Election { } RaftMsgInternal_Election; typedef struct RaftMsg_Vote { - SyncRaftCampaignType cType; + 
SyncRaftElectionType cType; SyncIndex lastIndex; SyncTerm lastTerm; } RaftMsg_Vote; typedef struct RaftMsg_VoteResp { bool reject; - SyncRaftCampaignType cType; + SyncRaftElectionType cType; } RaftMsg_VoteResp; typedef struct SSyncMessage { @@ -104,7 +104,7 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo } static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, SyncNodeId to, - SyncTerm term, SyncRaftCampaignType cType, + SyncTerm term, SyncRaftElectionType cType, SyncIndex lastIndex, SyncTerm lastTerm) { SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); if (pMsg == NULL) { @@ -134,13 +134,14 @@ static FORCE_INLINE bool syncIsPreVoteRespMsg(SSyncMessage* pMsg) { return pMsg->msgType == RAFT_MSG_VOTE_RESP && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; } -static FORCE_INLINE bool syncIsPreVoteMsg(SSyncMessage* pMsg) { +static FORCE_INLINE bool syncIsPreVoteMsg(const SSyncMessage* pMsg) { return pMsg->msgType == RAFT_MSG_VOTE && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; } void syncFreeMessage(const SSyncMessage* pMsg); // message handlers -void syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index 4343e607cb..f9632f6ae8 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -34,6 +34,12 @@ typedef enum { SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, SYNC_RAFT_CAMPAIGN_ELECTION = 1, SYNC_RAFT_CAMPAIGN_TRANSFER = 3, -} SyncRaftCampaignType; +} SyncRaftElectionType; + +typedef enum { + SYNC_RAFT_VOTE_RESP_UNKNOWN = 0, + SYNC_RAFT_VOTE_RESP_GRANT = 1, + SYNC_RAFT_VOTE_RESP_REJECT = 2, +} 
SyncRaftVoteRespType; #endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index a6e013758e..83ae76fa5e 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -15,6 +15,7 @@ #include "raft.h" #include "raft_configuration.h" +#include "raft_log.h" #include "syncInt.h" #define RAFT_READ_LOG_MAX_NUM 100 @@ -120,14 +121,19 @@ int32_t syncRaftTick(SSyncRaft* pRaft) { } void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) { + convertClear(pRaft); + pRaft->stepFp = stepFollower; resetRaft(pRaft, term); pRaft->tickFp = tickElection; pRaft->leaderId = leaderId; pRaft->state = TAOS_SYNC_ROLE_FOLLOWER; + syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); } void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { + convertClear(pRaft); + memset(pRaft->candidateState.votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA); /** * Becoming a pre-candidate changes our step functions and state, * but doesn't change anything else. 
In particular it does not increase @@ -137,10 +143,13 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { pRaft->tickFp = tickElection; pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; pRaft->candidateState.inPreVote = true; - syncInfo("[%d:%d] became pre-candidate at term %d" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); } void syncRaftBecomeCandidate(SSyncRaft* pRaft) { + convertClear(pRaft); + memset(pRaft->candidateState.votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA); + pRaft->candidateState.inPreVote = false; pRaft->stepFp = stepCandidate; // become candidate make term+1 @@ -148,7 +157,7 @@ void syncRaftBecomeCandidate(SSyncRaft* pRaft) { pRaft->tickFp = tickElection; pRaft->voteFor = pRaft->selfId; pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; - syncInfo("[%d:%d] became candidate at term %d" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); } void syncRaftBecomeLeader(SSyncRaft* pRaft) { @@ -160,7 +169,11 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) { pRaft->state = TAOS_SYNC_ROLE_LEADER; // TODO: check if there is pending config log - syncInfo("[%d:%d] became leader at term %d" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); +} + +void syncRaftTriggerReplicate(SSyncRaft* pRaft) { + } void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { @@ -180,7 +193,7 @@ int syncRaftQuorum(SSyncRaft* pRaft) { return pRaft->cluster.replica / 2 + 1; } -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept) { +int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept, int* rejectNum) { if (accept) { syncInfo("[%d:%d] received 
(pre-vote %d) from %d at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); @@ -188,17 +201,20 @@ int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool acc syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, id, pRaft->term); } - + int voteIndex = syncRaftConfigurationIndexOfVoter(pRaft, id); assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0); + assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN); - pRaft->candidateState.votes[voteIndex] = accept; - int granted = 0; + pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; + int granted = 0, rejected = 0; int i; for (i = 0; i < pRaft->cluster.replica; ++i) { - if (pRaft->candidateState.votes[i]) granted++; + if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++; + else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++; } + if (rejectNum) *rejectNum = rejected; return granted; } @@ -262,8 +278,20 @@ static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { } static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - convertClear(pRaft); - memset(pRaft->candidateState.votes, 0, sizeof(bool) * TSDB_MAX_REPLICA); + /** + * Only handle vote responses corresponding to our candidacy (while in + * StateCandidate, we may get stale MsgPreVoteResp messages in this term from + * our pre-candidate state). + **/ + RaftMessageType msgType = pMsg->msgType; + + if (msgType == RAFT_MSG_INTERNAL_PROP) { + return 0; + } + + if (msgType == RAFT_MSG_VOTE_RESP) { + return 0; + } return 0; } diff --git a/source/libs/sync/src/raft_election.c b/source/libs/sync/src/raft_election.c new file mode 100644 index 0000000000..7ebeb45254 --- /dev/null +++ b/source/libs/sync/src/raft_election.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. 
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "syncInt.h" +#include "raft.h" +#include "raft_message.h" + +void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { + SyncTerm term; + bool preVote; + RaftMessageType voteMsgType; + + if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + syncRaftBecomePreCandidate(pRaft); + preVote = true; + // PreVote RPCs are sent for the next term before we've incremented r.Term. + term = pRaft->term + 1; + } else { + syncRaftBecomeCandidate(pRaft); + voteMsgType = RAFT_MSG_VOTE; + term = pRaft->term; + preVote = false; + } + + int quorum = syncRaftQuorum(pRaft); + int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, preVote, true, NULL); + if (quorum <= granted) { + /** + * We won the election after voting for ourselves (which must mean that + * this is a single-node cluster). Advance to the next state. 
+ **/ + if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); + } else { + syncRaftBecomeLeader(pRaft); + } + return; + } + + // broadcast vote message to other peers + int i; + SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); + SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); + for (i = 0; i < pRaft->cluster.replica; ++i) { + if (i == pRaft->cluster.selfIndex) { + continue; + } + + SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; + + SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, + nodeId, term, cType, lastIndex, lastTerm); + if (pMsg == NULL) { + continue; + } + + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %d] sent %d request to %d at term %" PRId64 "", + pRaft->selfGroupId, pRaft->selfId, lastTerm, + lastIndex, voteMsgType, nodeId, pRaft->term); + + pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i])); + } +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c index 0d2004dec2..19471846ba 100644 --- a/source/libs/sync/src/raft_handle_election_message.c +++ b/source/libs/sync/src/raft_handle_election_message.c @@ -15,84 +15,29 @@ #include "syncInt.h" #include "raft.h" +#include "raft_log.h" #include "raft_message.h" -static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType); - -void syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { +int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (pRaft->state == TAOS_SYNC_ROLE_LEADER) { syncDebug("%d ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfId); - return; + return 0; } - // if there is pending uncommitted config,cannot campaign + // if there is pending uncommitted config,cannot start election if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { - syncWarn("[%d:%d] cannot campaign at term %" PRId64 " 
since there are still pending configuration changes to apply", + syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - return; + return 0; } syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); if (pRaft->preVote) { - campaign(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION); + syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION); } else { - campaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); + syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } + + return 0; } - -static void campaign(SSyncRaft* pRaft, SyncRaftCampaignType cType) { - SyncTerm term; - bool preVote; - RaftMessageType voteMsgType; - - if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { - syncRaftBecomePreCandidate(pRaft); - preVote = true; - // PreVote RPCs are sent for the next term before we've incremented r.Term. - term = pRaft->term + 1; - } else { - syncRaftBecomeCandidate(pRaft); - voteMsgType = RAFT_MSG_VOTE; - term = pRaft->term; - preVote = false; - } - - int quorum = syncRaftQuorum(pRaft); - int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, preVote, true); - if (quorum <= granted) { - /** - * We won the election after voting for ourselves (which must mean that - * this is a single-node cluster). Advance to the next state. 
- **/ - if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { - campaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); - } else { - syncRaftBecomeLeader(pRaft); - } - return; - } - - // broadcast vote message to other peers - int i; - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (i == pRaft->cluster.selfIndex) { - continue; - } - - SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; - - SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, nodeId, term, cType, lastIndex, lastTerm); - if (pMsg == NULL) { - continue; - } - - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %d] sent %d request to %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, lastTerm, - lastIndex, voteMsgType, nodeId, pRaft->term); - - pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i])); - } -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c new file mode 100644 index 0000000000..e5d5d6cae7 --- /dev/null +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "syncInt.h" +#include "raft.h" +#include "raft_message.h" + +int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + int granted, rejected; + int quorum; + int voterIndex; + + voterIndex = syncRaftConfigurationIndexOfVoter(pRaft, pMsg->from); + if (voterIndex == -1) { + syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from); + return 0; + } + + if (pRaft->state != TAOS_SYNC_ROLE_CANDIDATE) { + syncError("[%d:%d] is not candidate, ignore vote resp", pRaft->selfGroupId, pRaft->selfId); + return 0; + } + + granted = syncRaftNumOfGranted(pRaft, pMsg->from, + pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION, + !pMsg->voteResp.reject, &rejected); + quorum = syncRaftQuorum(pRaft); + + syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections", + pRaft->selfGroupId, pRaft->selfId, quorum, granted, rejected); + + if (granted >= quorum) { + if (pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); + } else { + syncRaftBecomeLeader(pRaft); + syncRaftTriggerReplicate(pRaft); + } + + return 0; + } else if (rejected == quorum) { + syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); + } + return 0; +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/raft_progress.c index ba09973f48..458f829394 100644 --- a/source/libs/sync/src/raft_progress.c +++ b/source/libs/sync/src/raft_progress.c @@ -14,7 +14,7 @@ */ #include "raft.h" -#include "raft_unstable_log.h" +#include "raft_log.h" #include "raft_progress.h" #include "sync.h" #include "syncInt.h" From da106e29b2b0435d8b466f36901e610990bece99 Mon Sep 17 00:00:00 2001 From: lichuang Date: Thu, 4 Nov 2021 15:51:30 +0800 Subject: [PATCH 11/16] [TD-10645][raft]add raft vote message handle --- source/libs/sync/inc/raft.h | 2 + source/libs/sync/inc/raft_log.h | 2 + source/libs/sync/inc/raft_message.h 
| 36 ++++++++++-- source/libs/sync/src/raft.c | 4 +- source/libs/sync/src/raft_election.c | 3 +- .../libs/sync/src/raft_handle_vote_message.c | 57 +++++++++++++++++++ .../sync/src/raft_handle_vote_resp_message.c | 2 +- source/libs/sync/src/raft_log.c | 4 ++ 8 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 source/libs/sync/src/raft_handle_vote_message.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index cba9434414..2ce2dcb5de 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -38,6 +38,7 @@ typedef struct RaftCandidateState { } RaftCandidateState; typedef struct SSyncRaftIOMethods { + // send SSyncMessage to node int (*send)(const SSyncMessage* pMsg, const SNodeInfo* pNode); } SSyncRaftIOMethods; @@ -104,6 +105,7 @@ struct SSyncRaft { SSyncRaftIOMethods io; + // union different state data union { RaftLeaderState leaderState; RaftCandidateState candidateState; diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h index 7ffb946c82..3545bf7ba1 100644 --- a/source/libs/sync/inc/raft_log.h +++ b/source/libs/sync/inc/raft_log.h @@ -35,6 +35,8 @@ SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog); SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog); +bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term); + int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog); bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog); diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index da2e3bc52f..d51822f8b3 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -20,10 +20,13 @@ #include "sync_type.h" /** - * below define message type which handled by Raft node thread - * internal message, which communicate in threads, start with RAFT_MSG_INTERNAL_*, - * internal message use pointer only, need not to be decode/encode - * outter message start with RAFT_MSG_*, need to implement its decode/encode functions 
+ * below define message type which handled by Raft. + * + * internal message, which communicate between threads, start with RAFT_MSG_INTERNAL_*. + * internal message use pointer only and stack memory, need not to be decode/encode and free. + * + * outter message start with RAFT_MSG_*, which communicate between cluster peers, + * need to implement its decode/encode functions. **/ typedef enum RaftMessageType { // client propose a cmd @@ -36,6 +39,7 @@ typedef enum RaftMessageType { RAFT_MSG_VOTE_RESP = 4, RAFT_MSG_APPEND = 5, + RAFT_MSG_APPEND_RESP = 6, } RaftMessageType; typedef struct RaftMsgInternal_Prop { @@ -55,7 +59,7 @@ typedef struct RaftMsg_Vote { } RaftMsg_Vote; typedef struct RaftMsg_VoteResp { - bool reject; + bool rejected; SyncRaftElectionType cType; } RaftMsg_VoteResp; @@ -115,6 +119,7 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId .from = from, .to = to, .term = term, + .msgType = RAFT_MSG_VOTE, .vote = (RaftMsg_Vote) { .cType = cType, .lastIndex = lastIndex, @@ -125,6 +130,26 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId return pMsg; } +static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from, SyncNodeId to, + SyncRaftElectionType cType, bool rejected) { + SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); + if (pMsg == NULL) { + return NULL; + } + *pMsg = (SSyncMessage) { + .groupId = groupId, + .from = from, + .to = to, + .msgType = RAFT_MSG_VOTE_RESP, + .voteResp = (RaftMsg_VoteResp) { + .cType = cType, + .rejected = rejected, + }, + }; + + return pMsg; +} + static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { return msgType == RAFT_MSG_INTERNAL_PROP || msgType == RAFT_MSG_INTERNAL_ELECTION; @@ -142,6 +167,7 @@ void syncFreeMessage(const SSyncMessage* pMsg); // message handlers int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +int syncRaftHandleVoteMessage(SSyncRaft* pRaft, 
const SSyncMessage* pMsg); int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 83ae76fa5e..6e8e359305 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -107,7 +107,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (msgType == RAFT_MSG_INTERNAL_ELECTION) { syncRaftHandleElectionMessage(pRaft, pMsg); } else if (msgType == RAFT_MSG_VOTE) { - + syncRaftHandleVoteMessage(pRaft, pMsg); } else { pRaft->stepFp(pRaft, pMsg); } @@ -245,7 +245,7 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) if (syncIsPreVoteMsg(pMsg)) { // Never change our term in response to a PreVote - } else if (syncIsPreVoteRespMsg(pMsg) && !pMsg->voteResp.reject) { + } else if (syncIsPreVoteRespMsg(pMsg) && !pMsg->voteResp.rejected) { /** * We send pre-vote requests with a term in our future. 
If the * pre-vote is granted, we will increment our term when we get a diff --git a/source/libs/sync/src/raft_election.c b/source/libs/sync/src/raft_election.c index 7ebeb45254..bb4a7541c2 100644 --- a/source/libs/sync/src/raft_election.c +++ b/source/libs/sync/src/raft_election.c @@ -15,6 +15,7 @@ #include "syncInt.h" #include "raft.h" +#include "raft_log.h" #include "raft_message.h" void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { @@ -66,7 +67,7 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { continue; } - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %d] sent %d request to %d at term %" PRId64 "", + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent %d request to %d at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, voteMsgType, nodeId, pRaft->term); diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c new file mode 100644 index 0000000000..a575c5df1a --- /dev/null +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "syncInt.h" +#include "raft.h" +#include "raft_log.h" +#include "raft_message.h" + +static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); + +int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + SSyncMessage* pRespMsg; + int voteIndex = syncRaftConfigurationIndexOfVoter(pRaft, pMsg->from); + if (voteIndex == -1) { + return 0; + } + bool grant; + SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); + SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); + + grant = canGrantVoteMessage(pRaft, pMsg); + pRespMsg = syncNewVoteRespMsg(pRaft->selfGroupId, pRaft->selfId, pMsg->to, pMsg->vote.cType, !grant); + if (pRespMsg == NULL) { + return 0; + } + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] %s for %d" \ + "[logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] at term %" PRId64 "", + pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, pRaft->voteFor, + grant ? "grant" : "reject", + pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term); + + pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[voteIndex])); + return 0; +} + +static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + if (!(pRaft->voteFor == SYNC_NON_NODE_ID || pMsg->term > pRaft->term || pRaft->voteFor == pMsg->from)) { + return false; + } + if (!syncRaftLogIsUptodate(pRaft, pMsg->vote.lastIndex, pMsg->vote.lastTerm)) { + return false; + } + + return true; +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index e5d5d6cae7..a155f0fe63 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -35,7 +35,7 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { granted = syncRaftNumOfGranted(pRaft, pMsg->from, pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION, - 
!pMsg->voteResp.reject, &rejected); + !pMsg->voteResp.rejected, &rejected); quorum = syncRaftQuorum(pRaft); syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections", diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c index 46c4e4b304..f93595e9f3 100644 --- a/source/libs/sync/src/raft_log.c +++ b/source/libs/sync/src/raft_log.c @@ -27,6 +27,10 @@ SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) { return 0; } +bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term) { + return true; +} + int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog) { return 0; } From aee5ebd1ced03863c7b9a3267176f317ffe53b8b Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 5 Nov 2021 15:03:56 +0800 Subject: [PATCH 12/16] [TD-10645][raft]add raft append message handle --- source/libs/sync/inc/raft.h | 12 +-- source/libs/sync/inc/raft_log.h | 9 ++ source/libs/sync/inc/raft_message.h | 52 +++++++++-- source/libs/sync/inc/raft_progress.h | 43 +++++++-- source/libs/sync/inc/raft_replication.h | 25 ++++++ source/libs/sync/inc/sync_type.h | 15 +++- source/libs/sync/src/raft.c | 53 ++++++++--- source/libs/sync/src/raft_election.c | 2 +- .../libs/sync/src/raft_handle_vote_message.c | 7 +- .../sync/src/raft_handle_vote_resp_message.c | 4 +- source/libs/sync/src/raft_log.c | 9 ++ source/libs/sync/src/raft_progress.c | 16 +--- source/libs/sync/src/raft_replication.c | 90 +++++++++++++++++++ 13 files changed, 287 insertions(+), 50 deletions(-) create mode 100644 source/libs/sync/inc/raft_replication.h create mode 100644 source/libs/sync/src/raft_replication.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 2ce2dcb5de..dd3eed9e02 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -20,8 +20,6 @@ #include "sync_type.h" #include "raft_message.h" -#define SYNC_NON_NODE_ID -1 - typedef struct SSyncRaftProgress SSyncRaftProgress; typedef struct RaftLeaderState { @@ -49,7 +47,8 @@ struct 
SSyncRaft { // owner sync node SSyncNode* pNode; - //SSyncInfo info; + int maxMsgSize; + SSyncFSM fsm; SSyncLogStore logStore; SStateManager stateManager; @@ -74,7 +73,7 @@ struct SSyncRaft { /** * New configuration is ignored if there exists unapplied configuration. **/ - bool pendingConf; + bool hasPendingConf; SSyncCluster cluster; @@ -94,6 +93,9 @@ struct SSyncRaft { **/ uint16_t heartbeatElapsed; + // current tick count since start up + uint32_t currentTick; + // election timeout tick(random in [3:6] tick) uint16_t electionTimeoutTick; @@ -129,7 +131,7 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft); void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType); -void syncRaftTriggerReplicate(SSyncRaft* pRaft); +void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); bool syncRaftIsPromotable(SSyncRaft* pRaft); diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h index 3545bf7ba1..41b605b0d2 100644 --- a/source/libs/sync/inc/raft_log.h +++ b/source/libs/sync/inc/raft_log.h @@ -19,6 +19,10 @@ #include "sync.h" #include "sync_type.h" +struct SSyncRaftEntry { + +}; + struct SSyncRaftLog { SyncIndex uncommittedConfigIndex; @@ -41,4 +45,9 @@ int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog); bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog); +SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index); + +int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, + SSyncRaftEntry **ppEntries, int *n); + #endif /* _TD_LIBS_SYNC_RAFT_LOG_H */ diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index d51822f8b3..58090a31f1 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -63,12 +63,28 @@ typedef struct RaftMsg_VoteResp { SyncRaftElectionType cType; } RaftMsg_VoteResp; +typedef struct RaftMsg_Append_Entries { + // index of log entry preceeding new ones + SyncIndex prevIndex; + + // 
term of entry at prevIndex + SyncTerm prevTerm; + + // leader's commit index. + SyncIndex commitIndex; + + // size of the log entries array + int nEntries; + + // log entries array + SSyncRaftEntry* entries; +} RaftMsg_Append_Entries; + typedef struct SSyncMessage { RaftMessageType msgType; SyncTerm term; SyncGroupId groupId; SyncNodeId from; - SyncNodeId to; union { RaftMsgInternal_Prop propose; @@ -77,6 +93,8 @@ typedef struct SSyncMessage { RaftMsg_Vote vote; RaftMsg_VoteResp voteResp; + + RaftMsg_Append_Entries appendEntries; }; } SSyncMessage; @@ -107,7 +125,7 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo return pMsg; } -static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, SyncNodeId to, +static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, SyncTerm term, SyncRaftElectionType cType, SyncIndex lastIndex, SyncTerm lastTerm) { SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); @@ -117,7 +135,6 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId *pMsg = (SSyncMessage) { .groupId = groupId, .from = from, - .to = to, .term = term, .msgType = RAFT_MSG_VOTE, .vote = (RaftMsg_Vote) { @@ -130,7 +147,7 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId return pMsg; } -static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from, SyncNodeId to, +static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from, SyncRaftElectionType cType, bool rejected) { SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); if (pMsg == NULL) { @@ -139,7 +156,6 @@ static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNo *pMsg = (SSyncMessage) { .groupId = groupId, .from = from, - .to = to, .msgType = RAFT_MSG_VOTE_RESP, .voteResp = (RaftMsg_VoteResp) { .cType = cType, @@ -150,12 +166,36 @@ static FORCE_INLINE 
SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNo return pMsg; } +static FORCE_INLINE SSyncMessage* syncNewAppendMsg(SyncGroupId groupId, SyncNodeId from, + SyncTerm term, SyncIndex prevIndex, SyncTerm prevTerm, + SyncIndex commitIndex, int nEntries, SSyncRaftEntry* entries) { + SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); + if (pMsg == NULL) { + return NULL; + } + *pMsg = (SSyncMessage) { + .groupId = groupId, + .from = from, + .term = term, + .msgType = RAFT_MSG_APPEND, + .appendEntries = (RaftMsg_Append_Entries) { + .prevIndex = prevIndex, + .prevTerm = prevTerm, + .commitIndex = commitIndex, + .nEntries = nEntries, + .entries = entries, + }, + }; + + return pMsg; +} + static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { return msgType == RAFT_MSG_INTERNAL_PROP || msgType == RAFT_MSG_INTERNAL_ELECTION; } -static FORCE_INLINE bool syncIsPreVoteRespMsg(SSyncMessage* pMsg) { +static FORCE_INLINE bool syncIsPreVoteRespMsg(const SSyncMessage* pMsg) { return pMsg->msgType == RAFT_MSG_VOTE_RESP && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; } diff --git a/source/libs/sync/inc/raft_progress.h b/source/libs/sync/inc/raft_progress.h index 159a80fa0e..5840468a5d 100644 --- a/source/libs/sync/inc/raft_progress.h +++ b/source/libs/sync/inc/raft_progress.h @@ -85,6 +85,9 @@ struct SSyncRaftProgress { **/ bool paused; + // last send append message tick + uint32_t lastSendTick; + /** * pendingSnapshotIndex is used in PROGRESS_SNAPSHOT. 
* If there is a pending snapshot, the pendingSnapshotIndex will be set to the @@ -116,7 +119,9 @@ int syncRaftProgressCreate(SSyncRaft* pRaft); **/ bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex); -void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex); +static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { + progress->nextIndex = nextIndex + 1; +} /** * syncRaftProgressMaybeDecrTo returns false if the given to index comes from an out of order message. @@ -131,7 +136,35 @@ bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, * MsgApps, is currently waiting for a snapshot, or has reached the * MaxInflightMsgs limit. **/ -bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i); +bool syncRaftProgressIsPaused(SSyncRaftProgress* progress); + +static FORCE_INLINE void syncRaftProgressPause(SSyncRaftProgress* progress) { + progress->paused = true; +} + +static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) { + return progress->nextIndex; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_REPLICATE; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_SNAPSHOT; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_PROBE; +} + +static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progress) { + return progress->recentActive; +} + +static FORCE_INLINE bool syncRaftProgressUpdateSendTick(SSyncRaftProgress* progress, SyncTick current) { + return progress->lastSendTick = current; +} void syncRaftProgressFailure(SSyncRaft* pRaft, int i); @@ -159,7 +192,7 @@ void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights); void 
syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); -SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i); + SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i); @@ -171,12 +204,10 @@ bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i); void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i); -bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i); + void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); -RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i); - #endif #endif /* TD_SYNC_RAFT_PROGRESS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_replication.h b/source/libs/sync/inc/raft_replication.h new file mode 100644 index 0000000000..e457063980 --- /dev/null +++ b/source/libs/sync/inc/raft_replication.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#ifndef TD_SYNC_RAFT_REPLICATION_H +#define TD_SYNC_RAFT_REPLICATION_H + +#include "sync.h" +#include "syncInt.h" +#include "sync_type.h" + +int syncRaftReplicate(SSyncRaft* pRaft, int i); + +#endif /* TD_SYNC_RAFT_REPLICATION_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index f9632f6ae8..130243a72a 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -16,12 +16,18 @@ #ifndef _TD_LIBS_SYNC_TYPE_H #define _TD_LIBS_SYNC_TYPE_H +#define SYNC_NON_NODE_ID -1 +#define SYNC_NON_TERM 0 + typedef int32_t SyncTime; +typedef uint32_t SyncTick; typedef struct SSyncRaft SSyncRaft; typedef struct SSyncRaftLog SSyncRaftLog; +typedef struct SSyncRaftEntry SSyncRaftEntry; + #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif @@ -32,13 +38,18 @@ typedef struct SSyncRaftLog SSyncRaftLog; typedef enum { SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, - SYNC_RAFT_CAMPAIGN_ELECTION = 1, - SYNC_RAFT_CAMPAIGN_TRANSFER = 3, + SYNC_RAFT_CAMPAIGN_ELECTION = 1, + SYNC_RAFT_CAMPAIGN_TRANSFER = 2, } SyncRaftElectionType; typedef enum { + // the init vote resp status SYNC_RAFT_VOTE_RESP_UNKNOWN = 0, + + // grant the vote request SYNC_RAFT_VOTE_RESP_GRANT = 1, + + //reject the vote request SYNC_RAFT_VOTE_RESP_REJECT = 2, } SyncRaftVoteRespType; diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 6e8e359305..dca5c4cf08 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -29,6 +29,8 @@ static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static int triggerAll(SSyncRaft* pRaft); + static void tickElection(SSyncRaft* pRaft); static void tickHeartbeat(SSyncRaft* pRaft); @@ -95,8 +97,8 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) 
{ - syncDebug("from %d, to %d, type:%d, term:%" PRId64 ", state:%d", - pMsg->from, pMsg->to, pMsg->msgType, pMsg->term, pRaft->state); + syncDebug("from %d, type:%d, term:%" PRId64 ", state:%d", + pMsg->from, pMsg->msgType, pMsg->term, pRaft->state); if (preHandleMessage(pRaft, pMsg)) { syncFreeMessage(pMsg); @@ -117,6 +119,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { } int32_t syncRaftTick(SSyncRaft* pRaft) { + pRaft->currentTick += 1; return 0; } @@ -168,12 +171,22 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) { pRaft->leaderId = pRaft->leaderId; pRaft->state = TAOS_SYNC_ROLE_LEADER; // TODO: check if there is pending config log + int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log); + if (nPendingConf > 1) { + syncFatal("unexpected multiple uncommitted config entry"); + } + if (nPendingConf == 1) { + pRaft->hasPendingConf = true; + } syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + + // after become leader, send initial heartbeat + syncRaftTriggerHeartbeat(pRaft); } -void syncRaftTriggerReplicate(SSyncRaft* pRaft) { - +void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { + triggerAll(pRaft); } void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { @@ -219,7 +232,7 @@ int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool acc } /** - * pre-handle message, return true is no need to continue + * pre-handle message, return true means no need to continue * Handle the message term, which may result in our stepping down to a follower. 
**/ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { @@ -230,9 +243,11 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (pMsg->term > pRaft->term) { return preHandleNewTermMessage(pRaft, pMsg); + } else if (pMsg->term < pRaft->term) { + return preHandleOldTermMessage(pRaft, pMsg); } - return preHandleOldTermMessage(pRaft, pMsg);; + return false; } static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { @@ -240,6 +255,7 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) RaftMessageType msgType = pMsg->msgType; if (msgType == RAFT_MSG_VOTE) { + // TODO leaderId = SYNC_NON_NODE_ID; } @@ -263,7 +279,7 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) } static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - + // TODO // if receive old term message, no need to continue return true; } @@ -273,7 +289,7 @@ static int convertClear(SSyncRaft* pRaft) { } static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - convertClear(pRaft); + return 0; } @@ -290,6 +306,7 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { } if (msgType == RAFT_MSG_VOTE_RESP) { + syncRaftHandleVoteRespMessage(pRaft, pMsg); return 0; } return 0; @@ -324,6 +341,22 @@ static void tickHeartbeat(SSyncRaft* pRaft) { } +/** + * trigger I/O requests for newly appended log entries or heartbeats. 
+ **/ +static int triggerAll(SSyncRaft* pRaft) { + assert(pRaft->state == TAOS_SYNC_ROLE_LEADER); + int i; + + for (i = 0; i < pRaft->cluster.replica; ++i) { + if (i == pRaft->cluster.selfIndex) { + continue; + } + + + } +} + static void abortLeaderTransfer(SSyncRaft* pRaft) { pRaft->leadTransferee = SYNC_NON_NODE_ID; } @@ -343,5 +376,5 @@ static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { abortLeaderTransfer(pRaft); - pRaft->pendingConf = false; -} \ No newline at end of file + pRaft->hasPendingConf = false; +} diff --git a/source/libs/sync/src/raft_election.c b/source/libs/sync/src/raft_election.c index bb4a7541c2..4ffb8d0943 100644 --- a/source/libs/sync/src/raft_election.c +++ b/source/libs/sync/src/raft_election.c @@ -62,7 +62,7 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, - nodeId, term, cType, lastIndex, lastTerm); + term, cType, lastIndex, lastTerm); if (pMsg == NULL) { continue; } diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c index a575c5df1a..87ef468d57 100644 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -15,6 +15,7 @@ #include "syncInt.h" #include "raft.h" +#include "raft_configuration.h" #include "raft_log.h" #include "raft_message.h" @@ -31,12 +32,12 @@ int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); grant = canGrantVoteMessage(pRaft, pMsg); - pRespMsg = syncNewVoteRespMsg(pRaft->selfGroupId, pRaft->selfId, pMsg->to, pMsg->vote.cType, !grant); + pRespMsg = syncNewVoteRespMsg(pRaft->selfGroupId, pRaft->selfId, pMsg->vote.cType, !grant); if (pRespMsg == NULL) { return 0; } syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] %s for %d" \ - "[logterm: %" PRId64 ", 
index: %" PRId64 ", vote: %d] at term %" PRId64 "", + "[logterm: %" PRId64 ", index: %" PRId64 "] at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, pRaft->voteFor, grant ? "grant" : "reject", pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term); @@ -49,7 +50,7 @@ static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (!(pRaft->voteFor == SYNC_NON_NODE_ID || pMsg->term > pRaft->term || pRaft->voteFor == pMsg->from)) { return false; } - if (!syncRaftLogIsUptodate(pRaft, pMsg->vote.lastIndex, pMsg->vote.lastTerm)) { + if (!syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm)) { return false; } diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index a155f0fe63..6e88b03b5a 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -15,6 +15,7 @@ #include "syncInt.h" #include "raft.h" +#include "raft_configuration.h" #include "raft_message.h" int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { @@ -45,8 +46,7 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } else { - syncRaftBecomeLeader(pRaft); - syncRaftTriggerReplicate(pRaft); + syncRaftBecomeLeader(pRaft); } return 0; diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c index f93595e9f3..ee51fcbef3 100644 --- a/source/libs/sync/src/raft_log.c +++ b/source/libs/sync/src/raft_log.c @@ -37,4 +37,13 @@ int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog) { bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog) { return pLog->commitIndex > pLog->appliedIndex; +} + +SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index) { + return SYNC_NON_TERM; +} + +int 
syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, + SSyncRaftEntry **ppEntries, int *n) { + return 0; } \ No newline at end of file diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/raft_progress.c index 458f829394..8133b670ff 100644 --- a/source/libs/sync/src/raft_progress.c +++ b/source/libs/sync/src/raft_progress.c @@ -22,7 +22,6 @@ static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state); static void resumeProgress(SSyncRaftProgress* progress); -static void pauseProgress(SSyncRaftProgress* progress); int syncRaftProgressCreate(SSyncRaft* pRaft) { @@ -58,11 +57,6 @@ bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex) { return updated; } -void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - pRaft->leaderState.progress[i].nextIndex = nextIndex + 1; -} - bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, SyncIndex rejected, SyncIndex lastIndex) { assert(i >= 0 && i < pRaft->leaderState.nProgress); @@ -103,15 +97,7 @@ static void resumeProgress(SSyncRaftProgress* progress) { progress->paused = false; } -static void pauseProgress(SSyncRaftProgress* progress) { - progress->paused = true; -} - -bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - +bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { switch (progress->state) { case PROGRESS_PROBE: return progress->paused; diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c new file mode 100644 index 0000000000..02d9804f7e --- /dev/null +++ b/source/libs/sync/src/raft_replication.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. 
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "raft.h" +#include "raft_log.h" +#include "raft_progress.h" +#include "raft_replication.h" + +static int sendSnapshot(SSyncRaft* pRaft, int i); +static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex index, SyncTerm term); + +int syncRaftReplicate(SSyncRaft* pRaft, int i) { + assert(pRaft->state == TAOS_SYNC_ROLE_LEADER); + assert(i >= 0 && i < pRaft->leaderState.nProgress); + + SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + if (syncRaftProgressIsPaused(progress)) { + syncInfo("node %d paused", nodeId); + return 0; + } + + SyncIndex nextIndex = syncRaftProgressNextIndex(progress); + SyncIndex prevIndex = nextIndex - 1; + SyncTerm prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex); + + if (prevTerm == SYNC_NON_TERM && !syncRaftProgressInSnapshot(progress)) { + goto send_snapshot; + } + +send_snapshot: + if (syncRaftProgressRecentActive(progress)) { + /* Only send a snapshot when we have heard from the server */ + return sendSnapshot(pRaft, i); + } else { + /* Send empty AppendEntries RPC when we haven't heard from the server */ + prevIndex = syncRaftLogLastIndex(pRaft->log); + prevTerm = syncRaftLogLastTerm(pRaft->log); + return sendAppendEntries(pRaft, i, prevIndex, prevTerm); + } +} + +static int sendSnapshot(SSyncRaft* pRaft, int i) { + return 0; +} + +static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, 
SyncTerm prevTerm) { + SyncIndex nextIndex = prevIndex + 1; + SSyncRaftEntry *entries; + int nEntry; + SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[i]); + SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); + syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry); + + SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term, + prevIndex, prevTerm, pRaft->log->commitIndex, + nEntry, entries); + + if (msg == NULL) { + return 0; + } + + pRaft->io.send(msg, pNode); + + if (syncRaftProgressInReplicate(progress)) { + SyncIndex lastIndex = nextIndex + nEntry; + syncRaftProgressOptimisticNextIndex(progress, lastIndex); + syncRaftInflightAdd(&progress->inflights, lastIndex); + } else if (syncRaftProgressInProbe(progress)) { + syncRaftProgressPause(progress); + } else { + + } + + syncRaftProgressUpdateSendTick(progress, pRaft->currentTick); + + return 0; +} \ No newline at end of file From 4022f360a7b87822d37801afe77dc6643af158f7 Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 5 Nov 2021 16:35:07 +0800 Subject: [PATCH 13/16] [TD-10645][raft]add raft append message handle --- source/libs/sync/inc/raft_configuration.h | 2 +- source/libs/sync/inc/raft_log.h | 11 ++++ source/libs/sync/inc/raft_message.h | 34 ++++++++-- source/libs/sync/src/raft.c | 42 +++++++++++-- source/libs/sync/src/raft_configuration.c | 2 +- .../src/raft_handle_append_entries_message.c | 49 +++++++++++++++ .../libs/sync/src/raft_handle_vote_message.c | 2 +- .../sync/src/raft_handle_vote_resp_message.c | 2 +- source/libs/sync/src/raft_log.c | 9 +++ source/libs/sync/src/raft_replication.c | 63 +++++++++++++++++-- 10 files changed, 198 insertions(+), 18 deletions(-) create mode 100644 source/libs/sync/src/raft_handle_append_entries_message.c diff --git a/source/libs/sync/inc/raft_configuration.h b/source/libs/sync/inc/raft_configuration.h index 993f863f33..ac9bbb5e55 100644 --- a/source/libs/sync/inc/raft_configuration.h +++ 
b/source/libs/sync/inc/raft_configuration.h @@ -20,7 +20,7 @@ #include "sync_type.h" // return -1 if cannot find this id -int syncRaftConfigurationIndexOfVoter(SSyncRaft *pRaft, SyncNodeId id); +int syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id); int syncRaftConfigurationVoterCount(SSyncRaft *pRaft); diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h index 41b605b0d2..bab9932fb5 100644 --- a/source/libs/sync/inc/raft_log.h +++ b/source/libs/sync/inc/raft_log.h @@ -37,6 +37,8 @@ SSyncRaftLog* syncRaftLogOpen(); SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog); +SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog); + SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog); bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term); @@ -50,4 +52,13 @@ SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index); int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, SSyncRaftEntry **ppEntries, int *n); +void syncRaftLogRelease(SSyncRaftLog* pLog, SyncIndex index, + SSyncRaftEntry *pEntries, int n); + +bool syncRaftLogMatchTerm(); + +static FORCE_INLINE bool syncRaftLogIsCommitted(SSyncRaftLog* pLog, SyncIndex index) { + return pLog->commitIndex > index; +} + #endif /* _TD_LIBS_SYNC_RAFT_LOG_H */ diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h index 58090a31f1..2cb625d1fb 100644 --- a/source/libs/sync/inc/raft_message.h +++ b/source/libs/sync/inc/raft_message.h @@ -65,10 +65,10 @@ typedef struct RaftMsg_VoteResp { typedef struct RaftMsg_Append_Entries { // index of log entry preceeding new ones - SyncIndex prevIndex; + SyncIndex index; // term of entry at prevIndex - SyncTerm prevTerm; + SyncTerm term; // leader's commit index. 
SyncIndex commitIndex; @@ -80,6 +80,10 @@ typedef struct RaftMsg_Append_Entries { SSyncRaftEntry* entries; } RaftMsg_Append_Entries; +typedef struct RaftMsg_Append_Resp { + SyncIndex index; +} RaftMsg_Append_Resp; + typedef struct SSyncMessage { RaftMessageType msgType; SyncTerm term; @@ -95,6 +99,7 @@ typedef struct SSyncMessage { RaftMsg_VoteResp voteResp; RaftMsg_Append_Entries appendEntries; + RaftMsg_Append_Resp appendResp; }; } SSyncMessage; @@ -167,7 +172,7 @@ static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNo } static FORCE_INLINE SSyncMessage* syncNewAppendMsg(SyncGroupId groupId, SyncNodeId from, - SyncTerm term, SyncIndex prevIndex, SyncTerm prevTerm, + SyncTerm term, SyncIndex logIndex, SyncTerm logTerm, SyncIndex commitIndex, int nEntries, SSyncRaftEntry* entries) { SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); if (pMsg == NULL) { @@ -179,8 +184,8 @@ static FORCE_INLINE SSyncMessage* syncNewAppendMsg(SyncGroupId groupId, SyncNode .term = term, .msgType = RAFT_MSG_APPEND, .appendEntries = (RaftMsg_Append_Entries) { - .prevIndex = prevIndex, - .prevTerm = prevTerm, + .index = logIndex, + .term = logTerm, .commitIndex = commitIndex, .nEntries = nEntries, .entries = entries, @@ -190,6 +195,24 @@ static FORCE_INLINE SSyncMessage* syncNewAppendMsg(SyncGroupId groupId, SyncNode return pMsg; } +static FORCE_INLINE SSyncMessage* syncNewEmptyAppendRespMsg(SyncGroupId groupId, SyncNodeId from, SyncTerm term) { + SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); + if (pMsg == NULL) { + return NULL; + } + *pMsg = (SSyncMessage) { + .groupId = groupId, + .from = from, + .term = term, + .msgType = RAFT_MSG_APPEND_RESP, + .appendResp = (RaftMsg_Append_Resp) { + + }, + }; + + return pMsg; +} + static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { return msgType == RAFT_MSG_INTERNAL_PROP || msgType == RAFT_MSG_INTERNAL_ELECTION; @@ -209,5 +232,6 @@ void syncFreeMessage(const SSyncMessage* 
pMsg); int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); +int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); #endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index dca5c4cf08..39e7a80d0b 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -16,6 +16,7 @@ #include "raft.h" #include "raft_configuration.h" #include "raft_log.h" +#include "raft_replication.h" #include "syncInt.h" #define RAFT_READ_LOG_MAX_NUM 100 @@ -215,7 +216,7 @@ int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool acc pRaft->selfGroupId, pRaft->selfId, id, pRaft->term); } - int voteIndex = syncRaftConfigurationIndexOfVoter(pRaft, id); + int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id); assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0); assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN); @@ -279,8 +280,38 @@ static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) } static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - // TODO - // if receive old term message, no need to continue + if (pRaft->checkQuorum && pMsg->msgType == RAFT_MSG_APPEND) { + /** + * We have received messages from a leader at a lower term. It is possible + * that these messages were simply delayed in the network, but this could + * also mean that this node has advanced its term number during a network + * partition, and it is now unable to either win an election or to rejoin + * the majority on the old term. 
If checkQuorum is false, this will be + * handled by incrementing term numbers in response to MsgVote with a + * higher term, but if checkQuorum is true we may not advance the term on + * MsgVote and must generate other messages to advance the term. The net + * result of these two features is to minimize the disruption caused by + * nodes that have been removed from the cluster's configuration: a + * removed node will send MsgVotes (or MsgPreVotes) which will be ignored, + * but it will not receive MsgApp or MsgHeartbeat, so it will not create + * disruptive term increases + **/ + int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); + if (peerIndex < 0) { + return true; + } + SSyncMessage* msg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term); + if (msg == NULL) { + return true; + } + + pRaft->io.send(msg, &(pRaft->cluster.nodeInfo[peerIndex])); + } else { + // ignore other cases + syncInfo("[%d:%d] [term:%" PRId64 "] ignored a %d message with lower term from %d [term:%" PRId64 "]", + pRaft->selfGroupId, pRaft->selfId, pRaft->term, pMsg->msgType, pMsg->from, pMsg->term); + } + return true; } @@ -308,6 +339,9 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (msgType == RAFT_MSG_VOTE_RESP) { syncRaftHandleVoteRespMessage(pRaft, pMsg); return 0; + } else if (msgType == RAFT_MSG_APPEND) { + syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from); + syncRaftHandleAppendEntriesMessage(pRaft, pMsg); } return 0; } @@ -353,7 +387,7 @@ static int triggerAll(SSyncRaft* pRaft) { continue; } - + syncRaftReplicate(pRaft, i); } } diff --git a/source/libs/sync/src/raft_configuration.c b/source/libs/sync/src/raft_configuration.c index 6f3a27e7c0..e16cb34989 100644 --- a/source/libs/sync/src/raft_configuration.c +++ b/source/libs/sync/src/raft_configuration.c @@ -16,7 +16,7 @@ #include "raft_configuration.h" #include "raft.h" -int syncRaftConfigurationIndexOfVoter(SSyncRaft *pRaft, SyncNodeId id) { +int 
syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id) { return (int)(id); } diff --git a/source/libs/sync/src/raft_handle_append_entries_message.c b/source/libs/sync/src/raft_handle_append_entries_message.c new file mode 100644 index 0000000000..d4d362848f --- /dev/null +++ b/source/libs/sync/src/raft_handle_append_entries_message.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "syncInt.h" +#include "raft.h" +#include "raft_log.h" +#include "raft_configuration.h" +#include "raft_message.h" + +int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { + RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries); + + int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); + + if (peerIndex < 0) { + return 0; + } + + SSyncMessage* pRespMsg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term); + if (pRespMsg == NULL) { + return 0; + } + + RaftMsg_Append_Entries *appendResp = &(pMsg->appendResp); + // ignore committed logs + if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) { + appendResp->index = pRaft->log->commitIndex; + goto out; + } + + syncInfo("[%d:%d] recv append from %d index %" PRId64"", + pRaft->selfGroupId, pRaft->selfId, pMsg->from, appendEntries->index); + +out: + pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[peerIndex])); + return 0; +} \ No newline at end of file diff --git 
a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c index 87ef468d57..2fab8ad5a9 100644 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -23,7 +23,7 @@ static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { SSyncMessage* pRespMsg; - int voteIndex = syncRaftConfigurationIndexOfVoter(pRaft, pMsg->from); + int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); if (voteIndex == -1) { return 0; } diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index 6e88b03b5a..05464256af 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -23,7 +23,7 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { int quorum; int voterIndex; - voterIndex = syncRaftConfigurationIndexOfVoter(pRaft, pMsg->from); + voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); if (voterIndex == -1) { syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from); return 0; diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c index ee51fcbef3..a26650cbb7 100644 --- a/source/libs/sync/src/raft_log.c +++ b/source/libs/sync/src/raft_log.c @@ -23,6 +23,10 @@ SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog) { return 0; } +SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog) { + return 0; +} + SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) { return 0; } @@ -46,4 +50,9 @@ SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index) { int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, SSyncRaftEntry **ppEntries, int *n) { return 0; +} + +void syncRaftLogRelease(SSyncRaftLog* pLog, SyncIndex index, + 
SSyncRaftEntry *pEntries, int n) { + return; } \ No newline at end of file diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index 02d9804f7e..b6ff1fb329 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -33,13 +33,62 @@ int syncRaftReplicate(SSyncRaft* pRaft, int i) { } SyncIndex nextIndex = syncRaftProgressNextIndex(progress); - SyncIndex prevIndex = nextIndex - 1; - SyncTerm prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex); + SyncIndex snapshotIndex = syncRaftLogSnapshotIndex(pRaft->log); + bool inSnapshot = syncRaftProgressInSnapshot(progress); + SyncIndex prevIndex; + SyncTerm prevTerm; - if (prevTerm == SYNC_NON_TERM && !syncRaftProgressInSnapshot(progress)) { - goto send_snapshot; + /** + * From Section 3.5: + * + * When sending an AppendEntries RPC, the leader includes the index and + * term of the entry in its log that immediately precedes the new + * entries. If the follower does not find an entry in its log with the + * same index and term, then it refuses the new entries. The consistency + * check acts as an induction step: the initial empty state of the logs + * satisfies the Log Matching Property, and the consistency check + * preserves the Log Matching Property whenever logs are extended. As a + * result, whenever AppendEntries returns successfully, the leader knows + * that the follower's log is identical to its own log up through the new + * entries (Log Matching Property in Figure 3.2). + **/ + if (nextIndex == 1) { + /** + * We're including the very first entry, so prevIndex and prevTerm are + * null. If the first entry is not available anymore, send the last + * snapshot if we're not already sending one. 
+ **/ + if (snapshotIndex > 0 && !inSnapshot) { + goto send_snapshot; + } + + // otherwise send append entries from start + prevIndex = 0; + prevTerm = 0; + } else { + /** + * Set prevIndex and prevTerm to the index and term of the entry at + * nextIndex - 1. + **/ + prevIndex = nextIndex - 1; + prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex); + /** + * If the entry is not anymore in our log, send the last snapshot if we're + * not doing so already. + **/ + if (prevTerm == SYNC_NON_TERM && !inSnapshot) { + goto send_snapshot; + } } + /* Send empty AppendEntries RPC when installing a snaphot */ + if (inSnapshot) { + prevIndex = syncRaftLogLastIndex(pRaft->log); + prevTerm = syncRaftLogLastTerm(pRaft->log); + } + + return sendAppendEntries(pRaft, i, prevIndex, prevTerm); + send_snapshot: if (syncRaftProgressRecentActive(progress)) { /* Only send a snapshot when we have heard from the server */ @@ -69,7 +118,7 @@ static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncT nEntry, entries); if (msg == NULL) { - return 0; + goto err_release_log; } pRaft->io.send(msg, pNode); @@ -87,4 +136,8 @@ static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncT syncRaftProgressUpdateSendTick(progress, pRaft->currentTick); return 0; + +err_release_log: + syncRaftLogRelease(pRaft->log, nextIndex, entries, nEntry); + return 0; } \ No newline at end of file From ccf8f14fdb59a5301ccc4753dc4e3fd278685153 Mon Sep 17 00:00:00 2001 From: lichuang Date: Mon, 8 Nov 2021 15:58:19 +0800 Subject: [PATCH 14/16] [TD-10645][raft]add raft progress tracker --- source/libs/sync/inc/raft.h | 75 +++++++------ source/libs/sync/inc/raft_log.h | 14 ++- source/libs/sync/inc/raft_progress.h | 26 +++-- .../sync/inc/sync_raft_progress_tracker.h | 100 ++++++++++++++++++ source/libs/sync/inc/sync_raft_quorum_joint.h | 30 ++++++ source/libs/sync/inc/sync_type.h | 10 ++ source/libs/sync/src/raft.c | 78 ++++++++++++-- .../src/raft_handle_append_entries_message.c | 4 
+- .../libs/sync/src/raft_handle_vote_message.c | 2 +- source/libs/sync/src/raft_log.c | 4 + source/libs/sync/src/raft_progress.c | 44 +++----- source/libs/sync/src/raft_replication.c | 5 + .../sync/src/sync_raft_progress_tracker.c | 41 +++++++ 13 files changed, 349 insertions(+), 84 deletions(-) create mode 100644 source/libs/sync/inc/sync_raft_progress_tracker.h create mode 100644 source/libs/sync/inc/sync_raft_quorum_joint.h create mode 100644 source/libs/sync/src/sync_raft_progress_tracker.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index dd3eed9e02..795ea7cc99 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -20,17 +20,12 @@ #include "sync_type.h" #include "raft_message.h" -typedef struct SSyncRaftProgress SSyncRaftProgress; typedef struct RaftLeaderState { - int nProgress; - SSyncRaftProgress* progress; + } RaftLeaderState; typedef struct RaftCandidateState { - /* votes results */ - SyncRaftVoteRespType votes[TSDB_MAX_REPLICA]; - /* true if in pre-vote phase */ bool inPreVote; } RaftCandidateState; @@ -47,17 +42,34 @@ struct SSyncRaft { // owner sync node SSyncNode* pNode; - int maxMsgSize; + SSyncCluster cluster; + + SyncNodeId selfId; + SyncGroupId selfGroupId; + + SSyncRaftIOMethods io; SSyncFSM fsm; SSyncLogStore logStore; SStateManager stateManager; + union { + RaftLeaderState leaderState; + RaftCandidateState candidateState; + }; + SyncTerm term; SyncNodeId voteFor; - SyncNodeId selfId; - SyncGroupId selfGroupId; + SSyncRaftLog *log; + + int maxMsgSize; + SSyncRaftProgressTracker *tracker; + + ESyncRole state; + + // isLearner is true if the local raft node is a learner. + bool isLearner; /** * the leader id @@ -70,15 +82,23 @@ struct SSyncRaft { **/ SyncNodeId leadTransferee; - /** - * New configuration is ignored if there exists unapplied configuration. + /** + * Only one conf change may be pending (in the log, but not yet + * applied) at a time. 
This is enforced via pendingConfIndex, which + * is set to a value >= the log index of the latest pending + * configuration change (if any). Config changes are only allowed to + * be proposed if the leader's applied index is greater than this + * value. **/ - bool hasPendingConf; - - SSyncCluster cluster; - - ESyncRole state; + SyncIndex pendingConfigIndex; + /** + * an estimate of the size of the uncommitted tail of the Raft log. Used to + * prevent unbounded log growth. Only maintained by the leader. Reset on + * term changes. + **/ + uint32_t uncommittedSize; + /** * number of ticks since it reached last electionTimeout when it is leader * or candidate. @@ -96,24 +116,19 @@ struct SSyncRaft { // current tick count since start up uint32_t currentTick; - // election timeout tick(random in [3:6] tick) - uint16_t electionTimeoutTick; - - // heartbeat timeout tick(default: 1 tick) - uint16_t heartbeatTimeoutTick; - bool preVote; bool checkQuorum; - SSyncRaftIOMethods io; + int heartbeatTimeout; + int electionTimeout; - // union different state data - union { - RaftLeaderState leaderState; - RaftCandidateState candidateState; - }; - - SSyncRaftLog *log; + /** + * randomizedElectionTimeout is a random number between + * [electiontimeout, 2 * electiontimeout - 1]. It gets reset + * when raft changes its state to follower or candidate. 
+ **/ + int randomizedElectionTimeout; + bool disableProposalForwarding; SyncRaftStepFp stepFp; diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h index bab9932fb5..a44f5a7273 100644 --- a/source/libs/sync/inc/raft_log.h +++ b/source/libs/sync/inc/raft_log.h @@ -19,8 +19,18 @@ #include "sync.h" #include "sync_type.h" -struct SSyncRaftEntry { +typedef enum SyncEntryType { + SYNC_ENTRY_TYPE_LOG = 1, +}SyncEntryType; +struct SSyncRaftEntry { + SyncTerm term; + + SyncIndex index; + + SyncEntryType type; + + SSyncBuffer buffer; }; struct SSyncRaftLog { @@ -49,6 +59,8 @@ bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog); SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index); +int syncRaftLogAppend(SSyncRaftLog* pLog, SSyncRaftEntry *pEntries, int n); + int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, SSyncRaftEntry **ppEntries, int *n); diff --git a/source/libs/sync/inc/raft_progress.h b/source/libs/sync/inc/raft_progress.h index 5840468a5d..41d66d59d0 100644 --- a/source/libs/sync/inc/raft_progress.h +++ b/source/libs/sync/inc/raft_progress.h @@ -73,6 +73,8 @@ typedef enum RaftProgressState { * progresses of all followers, and sends entries to the follower based on its progress. **/ struct SSyncRaftProgress { + SyncNodeId id; + SyncIndex nextIndex; SyncIndex matchIndex; @@ -108,16 +110,18 @@ struct SSyncRaftProgress { * flow control sliding window **/ SSyncRaftInflights inflights; + + // IsLearner is true if this progress is tracked for a learner. + bool isLearner; }; -int syncRaftProgressCreate(SSyncRaft* pRaft); -//int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration); +void syncRaftInitProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress); /** * syncRaftProgressMaybeUpdate returns false if the given lastIndex index comes from i-th node's log. * Otherwise it updates the progress and returns true. 
**/ -bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex); +bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex); static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { progress->nextIndex = nextIndex + 1; @@ -127,7 +131,7 @@ static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* * syncRaftProgressMaybeDecrTo returns false if the given to index comes from an out of order message. * Otherwise it decreases the progress next index to min(rejected, last) and returns true. **/ -bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, +bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, SyncIndex rejected, SyncIndex lastIndex); /** @@ -166,20 +170,20 @@ static FORCE_INLINE bool syncRaftProgressUpdateSendTick(SSyncRaftProgress* progr return progress->lastSendTick = current; } -void syncRaftProgressFailure(SSyncRaft* pRaft, int i); +void syncRaftProgressFailure(SSyncRaftProgress* progress); -bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i); +bool syncRaftProgressNeedAbortSnapshot(SSyncRaftProgress* progress); /** - * return true if i-th node's log is up-todate + * return true if progress's log is up-todate **/ -bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i); +bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress); -void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i); +void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress); -void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i); +void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress); -void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex); +void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); /* inflights APIs */ int syncRaftInflightReset(SSyncRaftInflights* inflights); diff --git 
a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h new file mode 100644 index 0000000000..ffc134fec4 --- /dev/null +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H +#define _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H + +#include "sync_type.h" +#include "sync_raft_quorum_joint.h" +#include "raft_progress.h" + +struct SSyncRaftProgressTrackerConfig { + SSyncRaftQuorumJointConfig voters; + + /** AutoLeave is true if the configuration is joint and a transition to the + * incoming configuration should be carried out automatically by Raft when + * this is possible. If false, the configuration will be joint until the + * application initiates the transition manually. + **/ + bool autoLeave; + + /** + * Learners is a set of IDs corresponding to the learners active in the + * current configuration. + * + * Invariant: Learners and Voters does not intersect, i.e. if a peer is in + * either half of the joint config, it can't be a learner; if it is a + * learner it can't be in either half of the joint config. This invariant + * simplifies the implementation since it allows peers to have clarity about + * its current role without taking into account joint consensus. 
+ **/ + SyncNodeId learners[TSDB_MAX_REPLICA]; + + /** + * When we turn a voter into a learner during a joint consensus transition, + * we cannot add the learner directly when entering the joint state. This is + * because this would violate the invariant that the intersection of + * voters and learners is empty. For example, assume a Voter is removed and + * immediately re-added as a learner (or in other words, it is demoted): + * + * Initially, the configuration will be + * + * voters: {1 2 3} + * learners: {} + * + * and we want to demote 3. Entering the joint configuration, we naively get + * + * voters: {1 2} & {1 2 3} + * learners: {3} + * + * but this violates the invariant (3 is both voter and learner). Instead, + * we get + * + * voters: {1 2} & {1 2 3} + * learners: {} + * next_learners: {3} + * + * Where 3 is now still purely a voter, but we are remembering the intention + * to make it a learner upon transitioning into the final configuration: + * + * voters: {1 2} + * learners: {3} + * next_learners: {} + * + * Note that next_learners is not used while adding a learner that is not + * also a voter in the joint config. In this case, the learner is added + * right away when entering the joint configuration, so that it is caught up + * as soon as possible. 
+ **/ + SyncNodeId learnersNext[TSDB_MAX_REPLICA]; +}; + +struct SSyncRaftProgressTracker { + SSyncRaftProgressTrackerConfig config; + + SSyncRaftProgress progressMap[TSDB_MAX_REPLICA]; + + SyncRaftVoteRespType votes[TSDB_MAX_REPLICA]; + int maxInflight; +}; + +SSyncRaftProgressTracker* syncRaftOpenProgressTracker(); + +void syncRaftResetVotes(SSyncRaftProgressTracker*); + +typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg); +void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); + +#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h new file mode 100644 index 0000000000..4f7424db7e --- /dev/null +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H +#define _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H + +#include "taosdef.h" +#include "sync.h" + +/** + * JointConfig is a configuration of two groups of (possibly overlapping) + * majority configurations. Decisions require the support of both majorities. 
+ **/ +typedef struct SSyncRaftQuorumJointConfig { + SyncNodeId majorityConfig[2][TSDB_MAX_REPLICA]; +}SSyncRaftQuorumJointConfig; + +#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index 130243a72a..9faebe94b2 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -16,6 +16,9 @@ #ifndef _TD_LIBS_SYNC_TYPE_H #define _TD_LIBS_SYNC_TYPE_H +#include +#include "osMath.h" + #define SYNC_NON_NODE_ID -1 #define SYNC_NON_TERM 0 @@ -24,10 +27,16 @@ typedef uint32_t SyncTick; typedef struct SSyncRaft SSyncRaft; +typedef struct SSyncRaftProgress SSyncRaftProgress; +typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig; + +typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker; + typedef struct SSyncRaftLog SSyncRaftLog; typedef struct SSyncRaftEntry SSyncRaftEntry; +#if 0 #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif @@ -35,6 +44,7 @@ typedef struct SSyncRaftEntry SSyncRaftEntry; #ifndef MAX #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) #endif +#endif typedef enum { SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 39e7a80d0b..4a3654131c 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -17,6 +17,7 @@ #include "raft_configuration.h" #include "raft_log.h" #include "raft_replication.h" +#include "sync_raft_progress_tracker.h" #include "syncInt.h" #define RAFT_READ_LOG_MAX_NUM 100 @@ -35,6 +36,9 @@ static int triggerAll(SSyncRaft* pRaft); static void tickElection(SSyncRaft* pRaft); static void tickHeartbeat(SSyncRaft* pRaft); +static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); +static bool maybeCommit(SSyncRaft* pRaft); + static void abortLeaderTransfer(SSyncRaft* pRaft); static void resetRaft(SSyncRaft* pRaft, SyncTerm term); @@ -59,6 +63,12 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { logStore = &(pRaft->logStore); fsm = &(pRaft->fsm); + // init progress tracker + pRaft->tracker = syncRaftOpenProgressTracker(); + if (pRaft->tracker == NULL) { + return -1; + } + // open raft log if ((pRaft->log = syncRaftLogOpen()) == NULL) { return -1; @@ -88,7 +98,7 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } assert(initIndex == serverState.commitIndex); - pRaft->heartbeatTimeoutTick = 1; + //pRaft->heartbeatTimeoutTick = 1; syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); @@ -137,7 +147,7 @@ void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { convertClear(pRaft); - memset(pRaft->candidateState.votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA); + /** * Becoming a pre-candidate changes our step functions and state, * but doesn't change anything else. 
In particular it does not increase @@ -152,7 +162,6 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { void syncRaftBecomeCandidate(SSyncRaft* pRaft) { convertClear(pRaft); - memset(pRaft->candidateState.votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA); pRaft->candidateState.inPreVote = false; pRaft->stepFp = stepCandidate; @@ -176,14 +185,22 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) { if (nPendingConf > 1) { syncFatal("unexpected multiple uncommitted config entry"); } - if (nPendingConf == 1) { - pRaft->hasPendingConf = true; - } syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - // after become leader, send initial heartbeat - syncRaftTriggerHeartbeat(pRaft); + // after become leader, send a no-op log + SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry)); + if (entry == NULL) { + return; + } + *entry = (SSyncRaftEntry) { + .buffer = (SSyncBuffer) { + .data = NULL, + .len = 0, + } + }; + appendEntries(pRaft, entry, 1); + //syncRaftTriggerHeartbeat(pRaft); } void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { @@ -192,7 +209,7 @@ void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { // electionTimeoutTick in [3,6] tick - pRaft->electionTimeoutTick = taosRand() % 4 + 3; + pRaft->randomizedElectionTimeout = taosRand() % 4 + 3; } bool syncRaftIsPromotable(SSyncRaft* pRaft) { @@ -200,7 +217,7 @@ bool syncRaftIsPromotable(SSyncRaft* pRaft) { } bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { - return pRaft->electionElapsed >= pRaft->electionTimeoutTick; + return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout; } int syncRaftQuorum(SSyncRaft* pRaft) { @@ -208,6 +225,7 @@ int syncRaftQuorum(SSyncRaft* pRaft) { } int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept, int* rejectNum) { +/* if (accept) { syncInfo("[%d:%d] received (pre-vote %d) from %d at term 
%" PRId64 "", pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); @@ -230,6 +248,8 @@ int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool acc if (rejectNum) *rejectNum = rejected; return granted; +*/ + return 0; } /** @@ -375,6 +395,34 @@ static void tickHeartbeat(SSyncRaft* pRaft) { } +static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { + SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); + SyncTerm term = pRaft->term; + int i; + + for (i = 0; i < n; ++i) { + entries[i].term = term; + entries[i].index = lastIndex + 1 + i; + } + + syncRaftLogAppend(pRaft->log, entries, n); + + SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]); + syncRaftProgressMaybeUpdate(progress, lastIndex); + // Regardless of maybeCommit's return, our caller will call bcastAppend. + maybeCommit(pRaft); +} + +/** + * maybeCommit attempts to advance the commit index. Returns true if + * the commit index changed (in which case the caller should call + * r.bcastAppend). + **/ +static bool maybeCommit(SSyncRaft* pRaft) { + + return true; +} + /** * trigger I/O requests for newly appended log entries or heartbeats. 
**/ @@ -395,6 +443,10 @@ static void abortLeaderTransfer(SSyncRaft* pRaft) { pRaft->leadTransferee = SYNC_NON_NODE_ID; } +static void initProgress(SSyncRaftProgress* progress, void* arg) { + syncRaftInitProgress((SSyncRaft*)arg, progress); +} + static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { if (pRaft->term != term) { pRaft->term = term; @@ -410,5 +462,9 @@ static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { abortLeaderTransfer(pRaft); - pRaft->hasPendingConf = false; + syncRaftResetVotes(pRaft->tracker); + syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft); + + pRaft->pendingConfigIndex = 0; + pRaft->uncommittedSize = 0; } diff --git a/source/libs/sync/src/raft_handle_append_entries_message.c b/source/libs/sync/src/raft_handle_append_entries_message.c index d4d362848f..8c014a56bc 100644 --- a/source/libs/sync/src/raft_handle_append_entries_message.c +++ b/source/libs/sync/src/raft_handle_append_entries_message.c @@ -20,7 +20,7 @@ #include "raft_message.h" int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries); + const RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries); int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); @@ -33,7 +33,7 @@ int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMs return 0; } - RaftMsg_Append_Entries *appendResp = &(pMsg->appendResp); + RaftMsg_Append_Entries *appendResp = &(pRespMsg->appendResp); // ignore committed logs if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) { appendResp->index = pRaft->log->commitIndex; diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c index 2fab8ad5a9..709e319c3e 100644 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -36,7 +36,7 @@ int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* 
pMsg) { if (pRespMsg == NULL) { return 0; } - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] %s for %d" \ + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] %s for %d"\ "[logterm: %" PRId64 ", index: %" PRId64 "] at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, pRaft->voteFor, grant ? "grant" : "reject", diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c index a26650cbb7..0654dbea6b 100644 --- a/source/libs/sync/src/raft_log.c +++ b/source/libs/sync/src/raft_log.c @@ -47,6 +47,10 @@ SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index) { return SYNC_NON_TERM; } +int syncRaftLogAppend(SSyncRaftLog* pLog, SSyncRaftEntry *pEntries, int n) { + +} + int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, SSyncRaftEntry **ppEntries, int *n) { return 0; diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/raft_progress.c index 8133b670ff..6edc808698 100644 --- a/source/libs/sync/src/raft_progress.c +++ b/source/libs/sync/src/raft_progress.c @@ -40,9 +40,15 @@ int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configur } */ -bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); +void syncRaftInitProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) { + *progress = (SSyncRaftProgress) { + .matchIndex = progress->id == pRaft->selfId ? 
syncRaftLogLastIndex(pRaft->log) : 0, + .nextIndex = syncRaftLogLastIndex(pRaft->log) + 1, + //.inflights = + }; +} + +bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) { bool updated = false; if (progress->matchIndex < lastIndex) { @@ -57,11 +63,8 @@ bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex) { return updated; } -bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i, +bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, SyncIndex rejected, SyncIndex lastIndex) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - if (progress->state == PROGRESS_REPLICATE) { /** * the rejection must be stale if the progress has matched and "rejected" @@ -110,30 +113,19 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { } } -void syncRaftProgressFailure(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - +void syncRaftProgressFailure(SSyncRaftProgress* progress) { progress->pendingSnapshotIndex = 0; } -bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - +bool syncRaftProgressNeedAbortSnapshot(SSyncRaftProgress* progress) { return progress->state == PROGRESS_SNAPSHOT && progress->matchIndex >= progress->pendingSnapshotIndex; } -bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); +bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; } -void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* 
progress = &(pRaft->leaderState.progress[i]); +void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) { /** * If the original state is ProgressStateSnapshot, progress knows that * the pending snapshot has been sent to this peer successfully, then @@ -149,16 +141,12 @@ void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { } } -void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); +void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress) { resetProgressState(progress, PROGRESS_REPLICATE); progress->nextIndex = progress->matchIndex + 1; } -void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex) { - assert(i >= 0 && i < pRaft->leaderState.nProgress); - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); +void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) { resetProgressState(progress, PROGRESS_SNAPSHOT); progress->pendingSnapshotIndex = snapshotIndex; } diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index b6ff1fb329..473499b795 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -22,6 +22,7 @@ static int sendSnapshot(SSyncRaft* pRaft, int i); static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex index, SyncTerm term); int syncRaftReplicate(SSyncRaft* pRaft, int i) { +#if 0 assert(pRaft->state == TAOS_SYNC_ROLE_LEADER); assert(i >= 0 && i < pRaft->leaderState.nProgress); @@ -99,6 +100,8 @@ send_snapshot: prevTerm = syncRaftLogLastTerm(pRaft->log); return sendAppendEntries(pRaft, i, prevIndex, prevTerm); } +#endif + return 0; } static int sendSnapshot(SSyncRaft* pRaft, int i) { @@ -106,6 +109,7 @@ static int sendSnapshot(SSyncRaft* pRaft, int i) { } static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncTerm 
prevTerm) { +#if 0 SyncIndex nextIndex = prevIndex + 1; SSyncRaftEntry *entries; int nEntry; @@ -139,5 +143,6 @@ static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncT err_release_log: syncRaftLogRelease(pRaft->log, nextIndex, entries, nEntry); +#endif return 0; } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c new file mode 100644 index 0000000000..d349cbb9b2 --- /dev/null +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "sync_raft_progress_tracker.h" + +SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { + SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker)); + if (tracker == NULL) { + return NULL; + } + + return tracker; +} + +void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { + memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA); +} + +void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) { + int i; + for (i = 0; i < TSDB_MAX_REPLICA; ++i) { + SSyncRaftProgress* progress = &(tracker->progressMap[i]); + if (progress->id == SYNC_NON_NODE_ID) { + continue; + } + + visit(progress, arg); + } +} \ No newline at end of file From e17f573e0e9f18b02a5cbf367c5f454bb0b0d9b8 Mon Sep 17 00:00:00 2001 From: lichuang Date: Tue, 9 Nov 2021 10:53:08 +0800 Subject: [PATCH 15/16] [TD-10645][raft]add raft progress tracker --- source/libs/sync/inc/raft.h | 7 +- source/libs/sync/inc/raft_progress.h | 217 ---------------- source/libs/sync/inc/sync_raft_inflights.h | 77 ++++++ source/libs/sync/inc/sync_raft_progress.h | 235 ++++++++++++++++++ .../sync/inc/sync_raft_progress_tracker.h | 4 +- source/libs/sync/src/raft.c | 6 +- source/libs/sync/src/raft_replication.c | 2 +- source/libs/sync/src/sync_raft_inflights.c | 104 ++++++++ .../{raft_progress.c => sync_raft_progress.c} | 188 ++++++-------- .../sync/src/sync_raft_progress_tracker.c | 6 +- 10 files changed, 498 insertions(+), 348 deletions(-) delete mode 100644 source/libs/sync/inc/raft_progress.h create mode 100644 source/libs/sync/inc/sync_raft_inflights.h create mode 100644 source/libs/sync/inc/sync_raft_progress.h create mode 100644 source/libs/sync/src/sync_raft_inflights.c rename source/libs/sync/src/{raft_progress.c => sync_raft_progress.c} (59%) diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 795ea7cc99..c8bf63f81c 100644 --- 
a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -44,6 +44,7 @@ struct SSyncRaft { SSyncCluster cluster; + int selfIndex; SyncNodeId selfId; SyncGroupId selfGroupId; @@ -113,9 +114,6 @@ struct SSyncRaft { **/ uint16_t heartbeatElapsed; - // current tick count since start up - uint32_t currentTick; - bool preVote; bool checkQuorum; @@ -130,6 +128,9 @@ struct SSyncRaft { int randomizedElectionTimeout; bool disableProposalForwarding; + // current tick count since start up + uint32_t currentTick; + SyncRaftStepFp stepFp; SyncRaftTickFp tickFp; diff --git a/source/libs/sync/inc/raft_progress.h b/source/libs/sync/inc/raft_progress.h deleted file mode 100644 index 41d66d59d0..0000000000 --- a/source/libs/sync/inc/raft_progress.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_PROGRESS_H -#define TD_SYNC_RAFT_PROGRESS_H - -#include "sync_type.h" - -/** - * SSyncRaftInflights is a sliding window for the inflight messages. - * Thus inflight effectively limits both the number of inflight messages - * and the bandwidth each Progress can use. - * When inflights is full, no more message should be sent. - * When a leader sends out a message, the index of the last - * entry should be added to inflights. The index MUST be added - * into inflights in order. 
- * When a leader receives a reply, the previous inflights should - * be freed by calling syncRaftInflightFreeTo with the index of the last - * received entry. - **/ -typedef struct SSyncRaftInflights { - /* the starting index in the buffer */ - int start; - - /* number of inflights in the buffer */ - int count; - - /* the size of the buffer */ - int size; - - /** - * buffer contains the index of the last entry - * inside one message. - **/ - SyncIndex* buffer; -} SSyncRaftInflights; - -/** - * State defines how the leader should interact with the follower. - * - * When in PROGRESS_PROBE, leader sends at most one replication message - * per heartbeat interval. It also probes actual progress of the follower. - * - * When in PROGRESS_REPLICATE, leader optimistically increases next - * to the latest entry sent after sending replication message. This is - * an optimized state for fast replicating log entries to the follower. - * - * When in PROGRESS_SNAPSHOT, leader should have sent out snapshot - * before and stops sending any replication message. - * - * PROGRESS_PROBE is the initial state. - **/ -typedef enum RaftProgressState { - PROGRESS_PROBE = 0, - PROGRESS_REPLICATE, - PROGRESS_SNAPSHOT, -} RaftProgressState; - -/** - * Progress represents a follower’s progress in the view of the leader. Leader maintains - * progresses of all followers, and sends entries to the follower based on its progress. - **/ -struct SSyncRaftProgress { - SyncNodeId id; - - SyncIndex nextIndex; - - SyncIndex matchIndex; - - RaftProgressState state; - - /** - * paused is used in PROGRESS_PROBE. - * When paused is true, raft should pause sending replication message to this peer. - **/ - bool paused; - - // last send append message tick - uint32_t lastSendTick; - - /** - * pendingSnapshotIndex is used in PROGRESS_SNAPSHOT. - * If there is a pending snapshot, the pendingSnapshotIndex will be set to the - * index of the snapshot. 
If pendingSnapshotIndex is set, the replication process of - * this Progress will be paused. raft will not resend snapshot until the pending one - * is reported to be failed. - **/ - SyncIndex pendingSnapshotIndex; - - /** - * recentActive is true if the progress is recently active. Receiving any messages - * from the corresponding follower indicates the progress is active. - * RecentActive can be reset to false after an election timeout. - **/ - bool recentActive; - - /** - * flow control sliding window - **/ - SSyncRaftInflights inflights; - - // IsLearner is true if this progress is tracked for a learner. - bool isLearner; -}; - -void syncRaftInitProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress); - -/** - * syncRaftProgressMaybeUpdate returns false if the given lastIndex index comes from i-th node's log. - * Otherwise it updates the progress and returns true. - **/ -bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex); - -static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { - progress->nextIndex = nextIndex + 1; -} - -/** - * syncRaftProgressMaybeDecrTo returns false if the given to index comes from an out of order message. - * Otherwise it decreases the progress next index to min(rejected, last) and returns true. - **/ -bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, - SyncIndex rejected, SyncIndex lastIndex); - -/** - * syncRaftProgressIsPaused returns whether sending log entries to this node has been - * paused. A node may be paused because it has rejected recent - * MsgApps, is currently waiting for a snapshot, or has reached the - * MaxInflightMsgs limit. 
- **/ -bool syncRaftProgressIsPaused(SSyncRaftProgress* progress); - -static FORCE_INLINE void syncRaftProgressPause(SSyncRaftProgress* progress) { - progress->paused = true; -} - -static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) { - return progress->nextIndex; -} - -static FORCE_INLINE RaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_REPLICATE; -} - -static FORCE_INLINE RaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_SNAPSHOT; -} - -static FORCE_INLINE RaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_PROBE; -} - -static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progress) { - return progress->recentActive; -} - -static FORCE_INLINE bool syncRaftProgressUpdateSendTick(SSyncRaftProgress* progress, SyncTick current) { - return progress->lastSendTick = current; -} - -void syncRaftProgressFailure(SSyncRaftProgress* progress); - -bool syncRaftProgressNeedAbortSnapshot(SSyncRaftProgress* progress); - -/** - * return true if progress's log is up-todate - **/ -bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress); - -void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress); - -void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress); - -void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); - -/* inflights APIs */ -int syncRaftInflightReset(SSyncRaftInflights* inflights); -bool syncRaftInflightFull(SSyncRaftInflights* inflights); -void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); -void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex); -void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights); - -#if 0 - -void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); - - - -SyncIndex 
syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i); - -void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i); - -void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i); - -bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i); - -void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i); - - - -void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); - -#endif - -#endif /* TD_SYNC_RAFT_PROGRESS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_inflights.h b/source/libs/sync/inc/sync_raft_inflights.h new file mode 100644 index 0000000000..6d249c9274 --- /dev/null +++ b/source/libs/sync/inc/sync_raft_inflights.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef TD_SYNC_RAFT_INFLIGHTS_H +#define TD_SYNC_RAFT_INFLIGHTS_H + +#include "sync.h" + +/** + * SSyncRaftInflights limits the number of MsgApp (represented by the largest index + * contained within) sent to followers but not yet acknowledged by them. Callers + * use syncRaftInflightFull() to check whether more messages can be sent, + * call syncRaftInflightAdd() whenever they are sending a new append, + * and release "quota" via FreeLE() whenever an ack is received. 
+**/ +typedef struct SSyncRaftInflights { + /* the starting index in the buffer */ + int start; + + /* number of inflights in the buffer */ + int count; + + /* the size of the buffer */ + int size; + + /** + * buffer contains the index of the last entry + * inside one message. + **/ + SyncIndex* buffer; +} SSyncRaftInflights; + +SSyncRaftInflights* syncRaftOpenInflights(int size); +void syncRaftCloseInflights(SSyncRaftInflights*); + +static FORCE_INLINE void syncRaftInflightReset(SSyncRaftInflights* inflights) { + inflights->count = 0; + inflights->start = 0; +} + +static FORCE_INLINE bool syncRaftInflightFull(SSyncRaftInflights* inflights) { + return inflights->count == inflights->size; +} + +/** + * syncRaftInflightAdd notifies the Inflights that a new message with the given index is being + * dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd() + * to verify that there is room for one more message, + * and consecutive calls to add syncRaftInflightAdd() must provide a + * monotonic sequence of indexes. + **/ +void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); + +/** + * syncRaftInflightFreeLE frees the inflights smaller or equal to the given `to` flight. + **/ +void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex); + +/** + * syncRaftInflightFreeFirstOne releases the first inflight. + * This is a no-op if nothing is inflight. + **/ +void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights); + +#endif /* TD_SYNC_RAFT_INFLIGHTS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h new file mode 100644 index 0000000000..1f693219be --- /dev/null +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. 
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef TD_SYNC_RAFT_PROGRESS_H +#define TD_SYNC_RAFT_PROGRESS_H + +#include "sync_type.h" +#include "sync_raft_inflights.h" + +/** + * State defines how the leader should interact with the follower. + * + * When in PROGRESS_STATE_PROBE, leader sends at most one replication message + * per heartbeat interval. It also probes actual progress of the follower. + * + * When in PROGRESS_STATE_REPLICATE, leader optimistically increases next + * to the latest entry sent after sending replication message. This is + * an optimized state for fast replicating log entries to the follower. + * + * When in PROGRESS_STATE_SNAPSHOT, leader should have sent out snapshot + * before and stops sending any replication message. + * + * PROGRESS_STATE_PROBE is the initial state. + **/ +typedef enum RaftProgressState { + /** + * StateProbe indicates a follower whose last index isn't known. Such a + * follower is "probed" (i.e. an append sent periodically) to narrow down + * its last index. In the ideal (and common) case, only one round of probing + * is necessary as the follower will react with a hint. Followers that are + * probed over extended periods of time are often offline. + **/ + PROGRESS_STATE_PROBE = 0, + + /** + * StateReplicate is the state steady in which a follower eagerly receives + * log entries to append to its log. 
+ **/ + PROGRESS_STATE_REPLICATE, + + /** + * StateSnapshot indicates a follower that needs log entries not available + * from the leader's Raft log. Such a follower needs a full snapshot to + * return to StateReplicate. + **/ + PROGRESS_STATE_SNAPSHOT, +} RaftProgressState; + +/** + * Progress represents a follower’s progress in the view of the leader. Leader maintains + * progresses of all followers, and sends entries to the follower based on its progress. + **/ +struct SSyncRaftProgress { + SyncIndex nextIndex; + + SyncIndex matchIndex; + + /** + * State defines how the leader should interact with the follower. + * + * When in StateProbe, leader sends at most one replication message + * per heartbeat interval. It also probes actual progress of the follower. + * + * When in StateReplicate, leader optimistically increases next + * to the latest entry sent after sending replication message. This is + * an optimized state for fast replicating log entries to the follower. + * + * When in StateSnapshot, leader should have sent out snapshot + * before and stops sending any replication message. + **/ + RaftProgressState state; + + /** + * pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT. + * If there is a pending snapshot, the pendingSnapshotIndex will be set to the + * index of the snapshot. If pendingSnapshotIndex is set, the replication process of + * this Progress will be paused. raft will not resend snapshot until the pending one + * is reported to be failed. + **/ + SyncIndex pendingSnapshotIndex; + + /** + * recentActive is true if the progress is recently active. Receiving any messages + * from the corresponding follower indicates the progress is active. + * RecentActive can be reset to false after an election timeout. + **/ + bool recentActive; + + /** + * probeSent is used while this follower is in StateProbe. When probeSent is + * true, raft should pause sending replication message to this peer until + * probeSent is reset. 
See ProbeAcked() and IsPaused(). + **/ + bool probeSent; + + /** + * inflights is a sliding window for the inflight messages. + * Each inflight message contains one or more log entries. + * The max number of entries per message is defined in raft config as MaxSizePerMsg. + * Thus inflight effectively limits both the number of inflight messages + * and the bandwidth each Progress can use. + * When inflights is Full, no more message should be sent. + * When a leader sends out a message, the index of the last + * entry should be added to inflights. The index MUST be added + * into inflights in order. + * When a leader receives a reply, the previous inflights should + * be freed by calling inflights.FreeLE with the index of the last + * received entry. + **/ + SSyncRaftInflights* inflights; + + /** + * IsLearner is true if this progress is tracked for a learner. + **/ + bool isLearner; +}; + +void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress); + +/** + * syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or, + * optionally and if larger, the index of the pending snapshot. + **/ +void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress); + +/** + * syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1. + **/ +void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress); + +/** + * syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the + * index acked by it. The method returns false if the given n index comes from + * an outdated message. Otherwise it updates the progress and returns true. + **/ +bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex); + +/** + * syncRaftProgressOptimisticNextIndex signals that appends all the way up to and including index n + * are in-flight. As a result, Next is increased to n+1. 
+ **/ +static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { + progress->nextIndex = nextIndex + 1; +} + +/** + * syncRaftProgressMaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The + * arguments are the index of the append message rejected by the follower, and + * the hint that we want to decrease to. + * + * Rejections can happen spuriously as messages are sent out of order or + * duplicated. In such cases, the rejection pertains to an index that the + * Progress already knows were previously acknowledged, and false is returned + * without changing the Progress. + * + * If the rejection is genuine, Next is lowered sensibly, and the Progress is + * cleared for sending log entries. +**/ +bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, + SyncIndex rejected, SyncIndex matchHint); + +/** + * syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled. + * This is done when a node has rejected recent MsgApps, is currently waiting + * for a snapshot, or has reached the MaxInflightMsgs limit. In normal + * operation, this is false. A throttled node will be contacted less frequently + * until it has reached a state in which it's able to accept a steady stream of + * log entries again. 
+ **/ +bool syncRaftProgressIsPaused(SSyncRaftProgress* progress); + +static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) { + return progress->nextIndex; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_STATE_REPLICATE; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_STATE_SNAPSHOT; +} + +static FORCE_INLINE RaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) { + return progress->state == PROGRESS_STATE_PROBE; +} + +static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progress) { + return progress->recentActive; +} + +/** + * return true if progress's log is up-todate + **/ +bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress); + +void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); + + + +#if 0 + +void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); + + + +SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i); + +void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i); + +void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i); + +bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i); + +void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i); + + + +void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); + +#endif + +#endif /* TD_SYNC_RAFT_PROGRESS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index ffc134fec4..40d43895c8 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -18,7 +18,7 @@ #include "sync_type.h" #include "sync_raft_quorum_joint.h" -#include "raft_progress.h" +#include "sync_raft_progress.h" struct 
SSyncRaftProgressTrackerConfig { SSyncRaftQuorumJointConfig voters; @@ -94,7 +94,7 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker(); void syncRaftResetVotes(SSyncRaftProgressTracker*); -typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg); +typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg); void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); #endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 4a3654131c..b43a35c03e 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -102,6 +102,8 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); + pRaft->selfIndex = pRaft->cluster.selfIndex; + syncInfo("[%d:%d] restore vgid %d state: snapshot index success", pRaft->selfGroupId, pRaft->selfId, pInfo->vgId); return 0; @@ -443,8 +445,8 @@ static void abortLeaderTransfer(SSyncRaft* pRaft) { pRaft->leadTransferee = SYNC_NON_NODE_ID; } -static void initProgress(SSyncRaftProgress* progress, void* arg) { - syncRaftInitProgress((SSyncRaft*)arg, progress); +static void initProgress(int i, SSyncRaftProgress* progress, void* arg) { + syncRaftInitProgress(i, (SSyncRaft*)arg, progress); } static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index 473499b795..3c7216239a 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -15,7 +15,7 @@ #include "raft.h" #include "raft_log.h" -#include "raft_progress.h" +#include "sync_raft_progress.h" #include "raft_replication.h" static int sendSnapshot(SSyncRaft* pRaft, int i); diff --git a/source/libs/sync/src/sync_raft_inflights.c b/source/libs/sync/src/sync_raft_inflights.c new file mode 100644 index 0000000000..3d740b5a9e --- /dev/null 
+++ b/source/libs/sync/src/sync_raft_inflights.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "sync_raft_inflights.h" + +SSyncRaftInflights* syncRaftOpenInflights(int size) { + SSyncRaftInflights* inflights = (SSyncRaftInflights*)malloc(sizeof(SSyncRaftInflights)); + if (inflights == NULL) { + return NULL; + } + SyncIndex* buffer = (SyncIndex*)malloc(sizeof(SyncIndex) * size); + if (buffer == NULL) { + free(inflights); + return NULL; + } + *inflights = (SSyncRaftInflights) { + .buffer = buffer, + .count = 0, + .size = 0, + .start = 0, + }; + + return inflights; +} + +void syncRaftCloseInflights(SSyncRaftInflights* inflights) { + free(inflights->buffer); + free(inflights); +} + +/** + * syncRaftInflightAdd notifies the Inflights that a new message with the given index is being + * dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd() + * to verify that there is room for one more message, + * and consecutive calls to add syncRaftInflightAdd() must provide a + * monotonic sequence of indexes. + **/ +void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) { + assert(!syncRaftInflightFull(inflights)); + + int next = inflights->start + inflights->count; + int size = inflights->size; + /* is next wrapped around buffer? 
*/ + if (next >= size) { + next -= size; + } + + inflights->buffer[next] = inflightIndex; + inflights->count++; +} + +/** + * syncRaftInflightFreeLE frees every inflight whose index is less than or equal to `toIndex`. + **/ +void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) { + if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) { + /* out of the left side of the window */ + return; + } + + int i, idx; + for (i = 0, idx = inflights->start; i < inflights->count; i++) { + if (toIndex < inflights->buffer[idx]) { // found the first inflight larger than toIndex + break; + } + + // increase index and maybe rotate + int size = inflights->size; + idx++; + if (idx >= size) { + idx -= size; + } + } + + // free i inflights and set new start index + inflights->count -= i; + inflights->start = idx; + assert(inflights->count >= 0); + if (inflights->count == 0) { + // inflights is empty, reset the start index so that we don't grow the + // buffer unnecessarily. + inflights->start = 0; + } +} + +/** + * syncRaftInflightFreeFirstOne releases the first inflight. + * This is a no-op if nothing is inflight. 
+ **/ +void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { + syncRaftInflightFreeLE(inflights, inflights->buffer[inflights->start]); +} diff --git a/source/libs/sync/src/raft_progress.c b/source/libs/sync/src/sync_raft_progress.c similarity index 59% rename from source/libs/sync/src/raft_progress.c rename to source/libs/sync/src/sync_raft_progress.c index 6edc808698..ec98be7dfa 100644 --- a/source/libs/sync/src/raft_progress.c +++ b/source/libs/sync/src/sync_raft_progress.c @@ -15,57 +15,50 @@ #include "raft.h" #include "raft_log.h" -#include "raft_progress.h" +#include "sync_raft_progress.h" +#include "sync_raft_progress_tracker.h" #include "sync.h" #include "syncInt.h" static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state); +static void probeAcked(SSyncRaftProgress* progress); static void resumeProgress(SSyncRaftProgress* progress); -int syncRaftProgressCreate(SSyncRaft* pRaft) { - -/* - inflights->buffer = (SyncIndex*)malloc(sizeof(SyncIndex) * pRaft->maxInflightMsgs); - if (inflights->buffer == NULL) { - return RAFT_OOM; +void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) { + SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflight); + if (inflights == NULL) { + return; } - inflights->size = pRaft->maxInflightMsgs; -*/ -} - -/* -int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration) { - -} -*/ - -void syncRaftInitProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) { *progress = (SSyncRaftProgress) { - .matchIndex = progress->id == pRaft->selfId ? syncRaftLogLastIndex(pRaft->log) : 0, + .matchIndex = i == pRaft->selfIndex ? syncRaftLogLastIndex(pRaft->log) : 0, .nextIndex = syncRaftLogLastIndex(pRaft->log) + 1, - //.inflights = + .inflights = inflights, }; } +/** + * syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the + * index acked by it. 
The method returns false if the given n index comes from + * an outdated message. Otherwise it updates the progress and returns true. + **/ bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) { bool updated = false; if (progress->matchIndex < lastIndex) { progress->matchIndex = lastIndex; updated = true; - resumeProgress(progress); - } - if (progress->nextIndex < lastIndex + 1) { - progress->nextIndex = lastIndex + 1; + probeAcked(progress); } + progress->nextIndex = MAX(progress->nextIndex, lastIndex + 1); + return updated; } bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, - SyncIndex rejected, SyncIndex lastIndex) { - if (progress->state == PROGRESS_REPLICATE) { + SyncIndex rejected, SyncIndex matchHint) { + if (progress->state == PROGRESS_STATE_REPLICATE) { /** * the rejection must be stale if the progress has matched and "rejected" * is smaller than "match". @@ -77,143 +70,102 @@ bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, /* directly decrease next to match + 1 */ progress->nextIndex = progress->matchIndex + 1; - //syncRaftProgressBecomeProbe(raft, i); return true; } + /** + * The rejection must be stale if "rejected" does not match next - 1. This + * is because non-replicating followers are probed one entry at a time. + **/ if (rejected != progress->nextIndex - 1) { syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore" , rejected, progress->nextIndex); return false; } - progress->nextIndex = MIN(rejected, lastIndex + 1); - if (progress->nextIndex < 1) { - progress->nextIndex = 1; - } + progress->nextIndex = MAX(MIN(rejected, matchHint + 1), 1); - resumeProgress(progress); + progress->probeSent = false; return true; } -static void resumeProgress(SSyncRaftProgress* progress) { - progress->paused = false; -} - +/** + * syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled. 
+ * This is done when a node has rejected recent MsgApps, is currently waiting + * for a snapshot, or has reached the MaxInflightMsgs limit. In normal + * operation, this is false. A throttled node will be contacted less frequently + * until it has reached a state in which it's able to accept a steady stream of + * log entries again. + **/ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { switch (progress->state) { - case PROGRESS_PROBE: - return progress->paused; - case PROGRESS_REPLICATE: - return syncRaftInflightFull(&progress->inflights); - case PROGRESS_SNAPSHOT: + case PROGRESS_STATE_PROBE: + return progress->probeSent; + case PROGRESS_STATE_REPLICATE: + return syncRaftInflightFull(progress->inflights); + case PROGRESS_STATE_SNAPSHOT: return true; default: syncFatal("error sync state:%d", progress->state); } } -void syncRaftProgressFailure(SSyncRaftProgress* progress) { - progress->pendingSnapshotIndex = 0; -} - -bool syncRaftProgressNeedAbortSnapshot(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_SNAPSHOT && progress->matchIndex >= progress->pendingSnapshotIndex; -} - bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; } +/** + * syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or, + * optionally and if larger, the index of the pending snapshot. + **/ void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) { /** * If the original state is ProgressStateSnapshot, progress knows that * the pending snapshot has been sent to this peer successfully, then * probes from pendingSnapshot + 1. 
**/ - if (progress->state == PROGRESS_SNAPSHOT) { + if (progress->state == PROGRESS_STATE_SNAPSHOT) { SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; - resetProgressState(progress, PROGRESS_PROBE); + resetProgressState(progress, PROGRESS_STATE_PROBE); progress->nextIndex = MAX(progress->matchIndex + 1, pendingSnapshotIndex + 1); } else { - resetProgressState(progress, PROGRESS_PROBE); + resetProgressState(progress, PROGRESS_STATE_PROBE); progress->nextIndex = progress->matchIndex + 1; } } +/** + * syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1. + **/ void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress) { - resetProgressState(progress, PROGRESS_REPLICATE); + resetProgressState(progress, PROGRESS_STATE_REPLICATE); progress->nextIndex = progress->matchIndex + 1; } void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) { - resetProgressState(progress, PROGRESS_SNAPSHOT); + resetProgressState(progress, PROGRESS_STATE_SNAPSHOT); progress->pendingSnapshotIndex = snapshotIndex; } -int syncRaftInflightReset(SSyncRaftInflights* inflights) { - inflights->count = 0; - inflights->start = 0; - - return 0; -} - -bool syncRaftInflightFull(SSyncRaftInflights* inflights) { - return inflights->count == inflights->size; -} - -void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) { - assert(!syncRaftInflightFull(inflights)); - - int next = inflights->start + inflights->count; - int size = inflights->size; - /* is next wrapped around buffer? 
*/ - if (next >= size) { - next -= size; - } - - inflights->buffer[next] = inflightIndex; - inflights->count++; -} - -void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex) { - if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) { - return; - } - - int i, idx; - for (i = 0, idx = inflights->start; i < inflights->count; i++) { - if (toIndex < inflights->buffer[idx]) { - break; - } - - int size = inflights->size; - idx++; - if (idx >= size) { - idx -= size; - } - } - - inflights->count -= i; - inflights->start = idx; - assert(inflights->count >= 0); - if (inflights->count == 0) { - inflights->start = 0; - } -} - -void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { - syncRaftInflightFreeTo(inflights, inflights->buffer[inflights->start]); -} - +/** + * ResetState moves the Progress into the specified State, resetting ProbeSent, + * PendingSnapshot, and Inflights. + **/ static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) { - progress->paused = false; + progress->probeSent = false; progress->pendingSnapshotIndex = 0; progress->state = state; - syncRaftInflightReset(&(progress->inflights)); + syncRaftInflightReset(progress->inflights); } - +/** + * probeAcked is called when this peer has accepted an append. It resets + * ProbeSent to signal that additional append messages should be sent without + * further delay. 
+ **/ +static void probeAcked(SSyncRaftProgress* progress) { + progress->probeSent = false; +} #if 0 @@ -250,33 +202,33 @@ bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i) { void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i) { SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - resetProgressState(progress, PROGRESS_SNAPSHOT); + resetProgressState(progress, PROGRESS_STATE_SNAPSHOT); progress->pendingSnapshotIndex = raftLogSnapshotIndex(pRaft->log); } void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - if (progress->state == PROGRESS_SNAPSHOT) { + if (progress->state == PROGRESS_STATE_SNAPSHOT) { assert(progress->pendingSnapshotIndex > 0); SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; - resetProgressState(progress, PROGRESS_PROBE); + resetProgressState(progress, PROGRESS_STATE_PROBE); progress->nextIndex = max(progress->matchIndex + 1, pendingSnapshotIndex); } else { - resetProgressState(progress, PROGRESS_PROBE); + resetProgressState(progress, PROGRESS_STATE_PROBE); progress->nextIndex = progress->matchIndex + 1; } } void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) { - resetProgressState(pRaft->leaderState.progress, PROGRESS_REPLICATE); + resetProgressState(pRaft->leaderState.progress, PROGRESS_STATE_REPLICATE); pRaft->leaderState.progress->nextIndex = pRaft->leaderState.progress->matchIndex + 1; } void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) { SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); progress->pendingSnapshotIndex = 0; - progress->state = PROGRESS_PROBE; + progress->state = PROGRESS_STATE_PROBE; } RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) { diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index d349cbb9b2..7104794cbb 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ 
b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -32,10 +32,6 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp vi int i; for (i = 0; i < TSDB_MAX_REPLICA; ++i) { SSyncRaftProgress* progress = &(tracker->progressMap[i]); - if (progress->id == SYNC_NON_NODE_ID) { - continue; - } - - visit(progress, arg); + visit(i, progress, arg); } } \ No newline at end of file From de3164f16f6220d913fadd331d01413d2bb86413 Mon Sep 17 00:00:00 2001 From: lichuang Date: Tue, 9 Nov 2021 14:58:10 +0800 Subject: [PATCH 16/16] [TD-10645][raft]add raft progress tracker --- source/libs/sync/inc/raft.h | 19 +- source/libs/sync/inc/sync_raft_impl.h | 42 +++ source/libs/sync/inc/sync_raft_progress.h | 2 + .../sync/inc/sync_raft_progress_tracker.h | 17 +- source/libs/sync/inc/sync_raft_quorum.h | 40 +++ source/libs/sync/inc/sync_raft_quorum_joint.h | 12 +- .../libs/sync/inc/sync_raft_quorum_majority.h | 30 ++ source/libs/sync/inc/sync_type.h | 2 +- source/libs/sync/src/raft.c | 271 +--------------- source/libs/sync/src/raft_election.c | 9 +- .../sync/src/raft_handle_election_message.c | 6 +- .../sync/src/raft_handle_vote_resp_message.c | 19 +- source/libs/sync/src/sync_raft_impl.c | 306 ++++++++++++++++++ source/libs/sync/src/sync_raft_progress.c | 2 + .../sync/src/sync_raft_progress_tracker.c | 41 ++- source/libs/sync/src/sync_raft_quorum_joint.c | 41 +++ .../libs/sync/src/sync_raft_quorum_majority.c | 54 ++++ 17 files changed, 608 insertions(+), 305 deletions(-) create mode 100644 source/libs/sync/inc/sync_raft_impl.h create mode 100644 source/libs/sync/inc/sync_raft_quorum.h create mode 100644 source/libs/sync/inc/sync_raft_quorum_majority.h create mode 100644 source/libs/sync/src/sync_raft_impl.c create mode 100644 source/libs/sync/src/sync_raft_quorum_joint.c create mode 100644 source/libs/sync/src/sync_raft_quorum_majority.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index c8bf63f81c..14f587d58e 100644 --- 
a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -19,7 +19,8 @@ #include "sync.h" #include "sync_type.h" #include "raft_message.h" - +#include "sync_raft_impl.h" +#include "sync_raft_quorum.h" typedef struct RaftLeaderState { @@ -140,20 +141,4 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg); int32_t syncRaftTick(SSyncRaft* pRaft); -void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId); -void syncRaftBecomePreCandidate(SSyncRaft* pRaft); -void syncRaftBecomeCandidate(SSyncRaft* pRaft); -void syncRaftBecomeLeader(SSyncRaft* pRaft); - -void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType); - -void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); - -void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); -bool syncRaftIsPromotable(SSyncRaft* pRaft); -bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); -int syncRaftQuorum(SSyncRaft* pRaft); -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, - bool preVote, bool accept, int* rejectNum); - #endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_impl.h b/source/libs/sync/inc/sync_raft_impl.h new file mode 100644 index 0000000000..26af06866b --- /dev/null +++ b/source/libs/sync/inc/sync_raft_impl.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#ifndef _TD_LIBS_SYNC_RAFT_IMPL_H +#define _TD_LIBS_SYNC_RAFT_IMPL_H + +#include "sync.h" +#include "sync_type.h" +#include "raft_message.h" +#include "sync_raft_quorum.h" + +void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId); +void syncRaftBecomePreCandidate(SSyncRaft* pRaft); +void syncRaftBecomeCandidate(SSyncRaft* pRaft); +void syncRaftBecomeLeader(SSyncRaft* pRaft); + +void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType); + +void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); + +void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); +bool syncRaftIsPromotable(SSyncRaft* pRaft); +bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); +int syncRaftQuorum(SSyncRaft* pRaft); + +SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id, + bool preVote, bool accept, + int* rejectNum, int *granted); + +#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */ diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h index 1f693219be..fff0c13e31 100644 --- a/source/libs/sync/inc/sync_raft_progress.h +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -63,6 +63,8 @@ typedef enum RaftProgressState { * progresses of all followers, and sends entries to the follower based on its progress. **/ struct SSyncRaftProgress { + SyncNodeId id; + SyncIndex nextIndex; SyncIndex matchIndex; diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index 40d43895c8..887aeb2377 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -23,7 +23,8 @@ struct SSyncRaftProgressTrackerConfig { SSyncRaftQuorumJointConfig voters; - /** AutoLeave is true if the configuration is joint and a transition to the + /** + * autoLeave is true if the configuration is joint and a transition to the * incoming configuration should be carried out automatically by Raft when * this is possible. 
If false, the configuration will be joint until the * application initiates the transition manually. @@ -86,7 +87,7 @@ struct SSyncRaftProgressTracker { SSyncRaftProgress progressMap[TSDB_MAX_REPLICA]; - SyncRaftVoteRespType votes[TSDB_MAX_REPLICA]; + SyncRaftVoteResult votes[TSDB_MAX_REPLICA]; int maxInflight; }; @@ -97,4 +98,16 @@ void syncRaftResetVotes(SSyncRaftProgressTracker*); typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg); void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); +/** + * syncRaftRecordVote records that the node identified by i voted for this Raft + * instance if grant == true (and declined it otherwise). + **/ +void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant); + +/** + * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the + * election outcome is known. + **/ +SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted); + #endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum.h b/source/libs/sync/inc/sync_raft_quorum.h new file mode 100644 index 0000000000..42f65c9806 --- /dev/null +++ b/source/libs/sync/inc/sync_raft_quorum.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#ifndef TD_SYNC_RAFT_QUORUM_H +#define TD_SYNC_RAFT_QUORUM_H + +/** + * SSyncRaftVoteResult indicates the outcome of a vote. + **/ +typedef enum { + /** + * SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future + * votes, i.e. neither "yes" nor "no" has reached quorum yet. + **/ + SYNC_RAFT_VOTE_PENDING = 1, + + /** + * SYNC_RAFT_VOTE_LOST indicates that the quorum has voted "no". + **/ + SYNC_RAFT_VOTE_LOST = 2, + + /** + * SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes". + **/ + SYNC_RAFT_VOTE_WON = 3, +} SSyncRaftVoteResult; + +#endif /* TD_SYNC_RAFT_QUORUM_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 4f7424db7e..14c1f63754 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -18,13 +18,21 @@ #include "taosdef.h" #include "sync.h" +#include "sync_type.h" /** - * JointConfig is a configuration of two groups of (possibly overlapping) + * SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping) * majority configurations. Decisions require the support of both majorities. **/ typedef struct SSyncRaftQuorumJointConfig { - SyncNodeId majorityConfig[2][TSDB_MAX_REPLICA]; + SSyncCluster majorityConfig[2]; }SSyncRaftQuorumJointConfig; +/** + * syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns + * a result indicating whether the vote is pending, lost, or won. A joint quorum + * requires both majority quorums to vote in favor. 
+ **/ +SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes); + #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum_majority.h b/source/libs/sync/inc/sync_raft_quorum_majority.h new file mode 100644 index 0000000000..b1857ef056 --- /dev/null +++ b/source/libs/sync/inc/sync_raft_quorum_majority.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H +#define _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H + +#include "sync.h" +#include "sync_type.h" + +/** + * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns + * a result indicating whether the vote is pending (i.e. neither a quorum of + * yes/no has been reached), won (a quorum of yes has been reached), or lost (a + * quorum of no has been reached). 
+ **/ +SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes); + +#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index 9faebe94b2..525623b4cf 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -61,6 +61,6 @@ typedef enum { //reject the vote request SYNC_RAFT_VOTE_RESP_REJECT = 2, -} SyncRaftVoteRespType; +} SyncRaftVoteResult; #endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index b43a35c03e..20d24e3267 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -26,23 +26,6 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static int convertClear(SSyncRaft* pRaft); -static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); - -static int triggerAll(SSyncRaft* pRaft); - -static void tickElection(SSyncRaft* pRaft); -static void tickHeartbeat(SSyncRaft* pRaft); - -static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); -static bool maybeCommit(SSyncRaft* pRaft); - -static void abortLeaderTransfer(SSyncRaft* pRaft); - -static void resetRaft(SSyncRaft* pRaft, SyncTerm term); - int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { SSyncNode* pNode = pRaft->pNode; SSyncServerState serverState; @@ -136,124 +119,6 @@ int32_t syncRaftTick(SSyncRaft* pRaft) { return 0; } -void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) { - convertClear(pRaft); - - pRaft->stepFp = stepFollower; - resetRaft(pRaft, term); - pRaft->tickFp = tickElection; - 
pRaft->leaderId = leaderId; - pRaft->state = TAOS_SYNC_ROLE_FOLLOWER; - syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { - convertClear(pRaft); - - /** - * Becoming a pre-candidate changes our step functions and state, - * but doesn't change anything else. In particular it does not increase - * r.Term or change r.Vote. - **/ - pRaft->stepFp = stepCandidate; - pRaft->tickFp = tickElection; - pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; - pRaft->candidateState.inPreVote = true; - syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomeCandidate(SSyncRaft* pRaft) { - convertClear(pRaft); - - pRaft->candidateState.inPreVote = false; - pRaft->stepFp = stepCandidate; - // become candidate make term+1 - resetRaft(pRaft, pRaft->term + 1); - pRaft->tickFp = tickElection; - pRaft->voteFor = pRaft->selfId; - pRaft->state = TAOS_SYNC_ROLE_CANDIDATE; - syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomeLeader(SSyncRaft* pRaft) { - assert(pRaft->state != TAOS_SYNC_ROLE_FOLLOWER); - - pRaft->stepFp = stepLeader; - resetRaft(pRaft, pRaft->term); - pRaft->leaderId = pRaft->leaderId; - pRaft->state = TAOS_SYNC_ROLE_LEADER; - // TODO: check if there is pending config log - int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log); - if (nPendingConf > 1) { - syncFatal("unexpected multiple uncommitted config entry"); - } - - syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - - // after become leader, send a no-op log - SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry)); - if (entry == NULL) { - return; - } - *entry = (SSyncRaftEntry) { - .buffer = (SSyncBuffer) { - .data = NULL, - .len = 0, - } - }; - appendEntries(pRaft, entry, 1); - 
//syncRaftTriggerHeartbeat(pRaft); -} - -void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { - triggerAll(pRaft); -} - -void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { - // electionTimeoutTick in [3,6] tick - pRaft->randomizedElectionTimeout = taosRand() % 4 + 3; -} - -bool syncRaftIsPromotable(SSyncRaft* pRaft) { - return pRaft->selfId != SYNC_NON_NODE_ID; -} - -bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { - return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout; -} - -int syncRaftQuorum(SSyncRaft* pRaft) { - return pRaft->cluster.replica / 2 + 1; -} - -int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept, int* rejectNum) { -/* - if (accept) { - syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); - } else { - syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, id, pRaft->term); - } - - int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id); - assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0); - assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN); - - pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; - int granted = 0, rejected = 0; - int i; - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++; - else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++; - } - - if (rejectNum) *rejectNum = rejected; - return granted; -*/ - return 0; -} - /** * pre-handle message, return true means no need to continue * Handle the message term, which may result in our stepping down to a follower. 
@@ -335,138 +200,4 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) } return true; -} - -static int convertClear(SSyncRaft* pRaft) { - -} - -static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - - return 0; -} - -static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - /** - * Only handle vote responses corresponding to our candidacy (while in - * StateCandidate, we may get stale MsgPreVoteResp messages in this term from - * our pre-candidate state). - **/ - RaftMessageType msgType = pMsg->msgType; - - if (msgType == RAFT_MSG_INTERNAL_PROP) { - return 0; - } - - if (msgType == RAFT_MSG_VOTE_RESP) { - syncRaftHandleVoteRespMessage(pRaft, pMsg); - return 0; - } else if (msgType == RAFT_MSG_APPEND) { - syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from); - syncRaftHandleAppendEntriesMessage(pRaft, pMsg); - } - return 0; -} - -static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - convertClear(pRaft); - return 0; -} - -/** - * tickElection is run by followers and candidates per tick. 
- **/ -static void tickElection(SSyncRaft* pRaft) { - pRaft->electionElapsed += 1; - - if (!syncRaftIsPromotable(pRaft)) { - return; - } - - if (!syncRaftIsPastElectionTimeout(pRaft)) { - return; - } - - // election timeout - pRaft->electionElapsed = 0; - SSyncMessage msg; - syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId)); -} - -static void tickHeartbeat(SSyncRaft* pRaft) { - -} - -static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - SyncTerm term = pRaft->term; - int i; - - for (i = 0; i < n; ++i) { - entries[i].term = term; - entries[i].index = lastIndex + 1 + i; - } - - syncRaftLogAppend(pRaft->log, entries, n); - - SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]); - syncRaftProgressMaybeUpdate(progress, lastIndex); - // Regardless of maybeCommit's return, our caller will call bcastAppend. - maybeCommit(pRaft); -} - -/** - * maybeCommit attempts to advance the commit index. Returns true if - * the commit index changed (in which case the caller should call - * r.bcastAppend). - **/ -static bool maybeCommit(SSyncRaft* pRaft) { - - return true; -} - -/** - * trigger I/O requests for newly appended log entries or heartbeats. 
- **/ -static int triggerAll(SSyncRaft* pRaft) { - assert(pRaft->state == TAOS_SYNC_ROLE_LEADER); - int i; - - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (i == pRaft->cluster.selfIndex) { - continue; - } - - syncRaftReplicate(pRaft, i); - } -} - -static void abortLeaderTransfer(SSyncRaft* pRaft) { - pRaft->leadTransferee = SYNC_NON_NODE_ID; -} - -static void initProgress(int i, SSyncRaftProgress* progress, void* arg) { - syncRaftInitProgress(i, (SSyncRaft*)arg, progress); -} - -static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { - if (pRaft->term != term) { - pRaft->term = term; - pRaft->voteFor = SYNC_NON_NODE_ID; - } - - pRaft->leaderId = SYNC_NON_NODE_ID; - - pRaft->electionElapsed = 0; - pRaft->heartbeatElapsed = 0; - - syncRaftRandomizedElectionTimeout(pRaft); - - abortLeaderTransfer(pRaft); - - syncRaftResetVotes(pRaft->tracker); - syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft); - - pRaft->pendingConfigIndex = 0; - pRaft->uncommittedSize = 0; -} +} \ No newline at end of file diff --git a/source/libs/sync/src/raft_election.c b/source/libs/sync/src/raft_election.c index 4ffb8d0943..1ca3326810 100644 --- a/source/libs/sync/src/raft_election.c +++ b/source/libs/sync/src/raft_election.c @@ -23,6 +23,11 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { bool preVote; RaftMessageType voteMsgType; + if (syncRaftIsPromotable(pRaft)) { + syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId); + return 0; + } + if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { syncRaftBecomePreCandidate(pRaft); preVote = true; @@ -36,8 +41,8 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { } int quorum = syncRaftQuorum(pRaft); - int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, preVote, true, NULL); - if (quorum <= granted) { + SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL); + if (result == 
SYNC_RAFT_VOTE_WON) { /** * We won the election after voting for ourselves (which must mean that * this is a single-node cluster). Advance to the next state. diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c index 19471846ba..6ffa24ff30 100644 --- a/source/libs/sync/src/raft_handle_election_message.c +++ b/source/libs/sync/src/raft_handle_election_message.c @@ -20,10 +20,14 @@ int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (pRaft->state == TAOS_SYNC_ROLE_LEADER) { - syncDebug("%d ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfId); + syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId); return 0; } + if (!syncRaftIsPromotable(pRaft)) { + syncDebug("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId); + return 0; + } // if there is pending uncommitted config,cannot start election if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply", diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index 05464256af..b3a47aac7f 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -23,6 +23,8 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { int quorum; int voterIndex; + assert(pRaft->state == TAOS_SYNC_ROLE_CANDIDATE); + voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); if (voterIndex == -1) { syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from); @@ -34,24 +36,23 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { return 0; } - 
granted = syncRaftNumOfGranted(pRaft, pMsg->from, + SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from, pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION, - !pMsg->voteResp.rejected, &rejected); - quorum = syncRaftQuorum(pRaft); + !pMsg->voteResp.rejected, &rejected, &granted); syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections", pRaft->selfGroupId, pRaft->selfId, quorum, granted, rejected); - if (granted >= quorum) { - if (pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { + if (result == SYNC_RAFT_VOTE_WON) { + if (pRaft->candidateState.inPreVote) { syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } else { - syncRaftBecomeLeader(pRaft); - } + syncRaftBecomeLeader(pRaft); - return 0; - } else if (rejected == quorum) { + } + } else if (result == SYNC_RAFT_VOTE_LOST) { syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); } + return 0; } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c new file mode 100644 index 0000000000..b7353fd787 --- /dev/null +++ b/source/libs/sync/src/sync_raft_impl.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */
+
+#include "raft.h"
+#include "raft_configuration.h"
+#include "raft_log.h"
+#include "raft_replication.h"
+#include "sync_raft_progress_tracker.h"
+#include "syncInt.h"
+
+static int convertClear(SSyncRaft* pRaft);
+static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg);
+static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg);
+static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg);
+
+static int triggerAll(SSyncRaft* pRaft);
+
+static void tickElection(SSyncRaft* pRaft);
+static void tickHeartbeat(SSyncRaft* pRaft);
+
+static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
+static bool maybeCommit(SSyncRaft* pRaft);
+
+static void abortLeaderTransfer(SSyncRaft* pRaft);
+
+static void resetRaft(SSyncRaft* pRaft, SyncTerm term);
+
+void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) {
+  convertClear(pRaft);
+
+  pRaft->stepFp = stepFollower;
+  resetRaft(pRaft, term);
+  pRaft->tickFp = tickElection;
+  pRaft->leaderId = leaderId;  // must come after resetRaft(), which clears leaderId
+  pRaft->state = TAOS_SYNC_ROLE_FOLLOWER;
+  syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
+}
+
+void syncRaftBecomePreCandidate(SSyncRaft* pRaft) {
+  convertClear(pRaft);
+
+  /**
+   * Becoming a pre-candidate changes our step functions and state,
+   * but doesn't change anything else. In particular it does not increase
+   * r.Term or change r.Vote.
+   **/
+  pRaft->stepFp = stepCandidate;
+  pRaft->tickFp = tickElection;
+  pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
+  pRaft->candidateState.inPreVote = true;
+  syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
+}
+
+void syncRaftBecomeCandidate(SSyncRaft* pRaft) {
+  convertClear(pRaft);
+
+  pRaft->candidateState.inPreVote = false;
+  pRaft->stepFp = stepCandidate;
+  // becoming a candidate starts a new term (term + 1)
+  resetRaft(pRaft, pRaft->term + 1);
+  pRaft->tickFp = tickElection;
+  pRaft->voteFor = pRaft->selfId;  // set after resetRaft(), which clears voteFor on a term change
+  pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
+  syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
+}
+
+void syncRaftBecomeLeader(SSyncRaft* pRaft) {
+  assert(pRaft->state != TAOS_SYNC_ROLE_FOLLOWER);
+
+  pRaft->stepFp = stepLeader;
+  resetRaft(pRaft, pRaft->term);
+  pRaft->leaderId = pRaft->selfId;  // resetRaft() cleared leaderId; the leader's leader is itself
+  pRaft->state = TAOS_SYNC_ROLE_LEADER;
+  // TODO: check if there is pending config log
+  int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log);
+  if (nPendingConf > 1) {
+    syncFatal("unexpected multiple uncommitted config entry");
+  }
+
+  syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
+
+  // after becoming leader, append an empty (no-op) entry
+  SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry));
+  if (entry == NULL) {
+    return;  // NOTE(review): on allocation failure the no-op entry is silently skipped
+  }
+  *entry = (SSyncRaftEntry) {
+    .buffer = (SSyncBuffer) {
+      .data = NULL,
+      .len = 0,
+    }
+  };
+  appendEntries(pRaft, entry, 1);
+  //syncRaftTriggerHeartbeat(pRaft);
+}
+
+void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) {
+  triggerAll(pRaft);
+}
+
+void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) {
+  // electionTimeoutTick in [3,6] tick
+  pRaft->randomizedElectionTimeout = taosRand() % 4 + 3;
+}
+
+bool syncRaftIsPromotable(SSyncRaft* pRaft) {
+  return pRaft->selfId != SYNC_NON_NODE_ID;
+}
+
+bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) {
+  return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout;
+}
+
+int syncRaftQuorum(SSyncRaft* pRaft) {
+  return pRaft->cluster.replica / 2 + 1;
+}
+
+SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
+                                     bool preVote, bool grant,
+                                     int* rejected, int *granted) {
+  int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
+  if (voterIndex == -1) {
+    return SYNC_RAFT_VOTE_PENDING;  // no index for this id: nothing is recorded or tallied
+  }
+
+  if (grant) {
+    syncInfo("[%d:%d] received grant (pre-vote %d) from %d at term %" PRId64 "",
+      pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
+  } else {
+    syncInfo("[%d:%d] received rejection (pre-vote %d) from %d at term %" PRId64 "",
+      pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
+  }
+
+  syncRaftRecordVote(pRaft->tracker, voterIndex, grant);
+  return syncRaftTallyVotes(pRaft->tracker, rejected, granted);
+}
+/*
+  if (accept) {
+    syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "",
+      pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
+  } else {
+    syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "",
+      pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
+  }
+
+  int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
+  assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
+  assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
+
+  pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
+  int granted = 0, rejected = 0;
+  int i;
+  for (i = 0; i < pRaft->cluster.replica; ++i) {
+    if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++;
+    else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++;
+  }
+
+  if (rejectNum) *rejectNum = rejected;
+  return granted;
+*/
+
+static int convertClear(SSyncRaft* pRaft) {
+  return 0;  // placeholder: nothing is cleared yet, but an int function must return a value
+}
+
+static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
+
+  return 0;
+}
+
+static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
+  /**
+   * Only handle vote responses corresponding to our candidacy (while in
+   * StateCandidate, we may get stale MsgPreVoteResp messages in this term from
+   * our pre-candidate state).
+   **/
+  RaftMessageType msgType = pMsg->msgType;
+
+  if (msgType == RAFT_MSG_INTERNAL_PROP) {
+    return 0;
+  }
+
+  if (msgType == RAFT_MSG_VOTE_RESP) {
+    syncRaftHandleVoteRespMessage(pRaft, pMsg);
+    return 0;
+  } else if (msgType == RAFT_MSG_APPEND) {
+    syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from);
+    syncRaftHandleAppendEntriesMessage(pRaft, pMsg);
+  }
+  return 0;
+}
+
+static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
+  convertClear(pRaft);
+  return 0;
+}
+
+/**
+ * tickElection is run by followers and candidates per tick.
+ **/
+static void tickElection(SSyncRaft* pRaft) {
+  pRaft->electionElapsed += 1;
+
+  if (!syncRaftIsPromotable(pRaft)) {
+    return;
+  }
+
+  if (!syncRaftIsPastElectionTimeout(pRaft)) {
+    return;
+  }
+
+  // election timeout
+  pRaft->electionElapsed = 0;
+  SSyncMessage msg;
+  syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId));
+}
+
+static void tickHeartbeat(SSyncRaft* pRaft) {
+
+}
+
+static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
+  SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
+  SyncTerm term = pRaft->term;
+  int i;
+
+  for (i = 0; i < n; ++i) {
+    entries[i].term = term;
+    entries[i].index = lastIndex + 1 + i;
+  }
+
+  syncRaftLogAppend(pRaft->log, entries, n);
+
+  SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]);
+  syncRaftProgressMaybeUpdate(progress, lastIndex + n);  // index of the last entry stamped above
+  // Regardless of maybeCommit's return, our caller will call bcastAppend.
+  maybeCommit(pRaft);
+}
+
+/**
+ * maybeCommit attempts to advance the commit index. Returns true if
+ * the commit index changed (in which case the caller should call
+ * r.bcastAppend).
+ **/
+static bool maybeCommit(SSyncRaft* pRaft) {
+
+  return true;
+}
+
+/**
+ * trigger I/O requests for newly appended log entries or heartbeats.
+ **/
+static int triggerAll(SSyncRaft* pRaft) {
+  assert(pRaft->state == TAOS_SYNC_ROLE_LEADER);
+
+  for (int i = 0; i < pRaft->cluster.replica; ++i) {
+    if (i == pRaft->cluster.selfIndex) {
+      continue;
+    }
+
+    syncRaftReplicate(pRaft, i);
+  }
+  return 0;
+}
+
+static void abortLeaderTransfer(SSyncRaft* pRaft) {
+  pRaft->leadTransferee = SYNC_NON_NODE_ID;
+}
+
+static void initProgress(int i, SSyncRaftProgress* progress, void* arg) {
+  syncRaftInitProgress(i, (SSyncRaft*)arg, progress);
+}
+
+static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
+  if (pRaft->term != term) {
+    pRaft->term = term;
+    pRaft->voteFor = SYNC_NON_NODE_ID;
+  }
+
+  pRaft->leaderId = SYNC_NON_NODE_ID;
+
+  pRaft->electionElapsed = 0;
+  pRaft->heartbeatElapsed = 0;
+
+  syncRaftRandomizedElectionTimeout(pRaft);
+
+  abortLeaderTransfer(pRaft);
+
+  syncRaftResetVotes(pRaft->tracker);
+  syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft);
+
+  pRaft->pendingConfigIndex = 0;
+  pRaft->uncommittedSize = 0;
+}
\ No newline at end of file
diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c
index ec98be7dfa..437c083b4d 100644
--- a/source/libs/sync/src/sync_raft_progress.c
+++ b/source/libs/sync/src/sync_raft_progress.c
@@ -34,6 +34,8 @@ void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress)
     .matchIndex = i == pRaft->selfIndex ?
syncRaftLogLastIndex(pRaft->log) : 0,
     .nextIndex = syncRaftLogLastIndex(pRaft->log) + 1,
     .inflights = inflights,
+    .isLearner = false,
+    .state = PROGRESS_STATE_PROBE,
   };
 }
 
diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c
index 7104794cbb..43b68a4b08 100644
--- a/source/libs/sync/src/sync_raft_progress_tracker.c
+++ b/source/libs/sync/src/sync_raft_progress_tracker.c
@@ -25,7 +25,7 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
 }
 
 void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
-  memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA);
+  memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteResult) * TSDB_MAX_REPLICA);
 }
 
 void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
@@ -34,4 +34,43 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp vi
     SSyncRaftProgress* progress = &(tracker->progressMap[i]);
     visit(i, progress, arg);
   }
+}
+
+void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) {
+  // A slot that already holds an answer keeps it; repeated responses are ignored.
+  if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
+    tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT
+                              : SYNC_RAFT_VOTE_RESP_REJECT;
+  }
+}
+
+/**
+ * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
+ * election outcome is known.
+ **/
+SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
+  int nGranted = 0;
+  int nRejected = 0;
+  int slot;
+
+  for (slot = 0; slot < TSDB_MAX_REPLICA; ++slot) {
+    const SSyncRaftProgress* pProgress = &(tracker->progressMap[slot]);
+
+    // Skip unused slots and voters that have not answered yet.
+    if (pProgress->id == SYNC_NON_NODE_ID || tracker->votes[slot] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
+      continue;
+    }
+
+    if (tracker->votes[slot] == SYNC_RAFT_VOTE_RESP_GRANT) {
+      nGranted += 1;
+    } else {
+      nRejected += 1;
+    }
+  }
+
+  // Both counters are optional out-parameters.
+  if (rejected != NULL) *rejected = nRejected;
+  if (granted != NULL) *granted = nGranted;
+
+  return syncRaftVoteResult(&(tracker->config.voters), tracker->votes);
+}
\ No newline at end of file
diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c
new file mode 100644
index 0000000000..a0e6a6782a
--- /dev/null
+++ b/source/libs/sync/src/sync_raft_quorum_joint.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+#include "sync_raft_quorum_majority.h"
+#include "sync_raft_quorum_joint.h"
+#include "sync_raft_quorum.h"
+
+/**
+ * syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
+ * a result indicating whether the vote is pending, lost, or won. A joint quorum
+ * requires both majority quorums to vote in favor.
+ **/
+SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes) {
+  SyncRaftVoteResult first = syncRaftMajorityVoteResult(&(config->majorityConfig[0]), votes);
+  SyncRaftVoteResult second = syncRaftMajorityVoteResult(&(config->majorityConfig[1]), votes);
+  SyncRaftVoteResult outcome;
+
+  if (first == second) {
+    // Both majority configs reached the same verdict, so that verdict stands.
+    outcome = first;
+  } else if (first == SYNC_RAFT_VOTE_LOST || second == SYNC_RAFT_VOTE_LOST) {
+    // A loss in either config makes loss the only possible outcome.
+    outcome = SYNC_RAFT_VOTE_LOST;
+  } else {
+    // One side won and the other is still pending, so the whole vote is pending.
+    outcome = SYNC_RAFT_VOTE_PENDING;
+  }
+  return outcome;
+}
diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c
new file mode 100644
index 0000000000..ea543d7335
--- /dev/null
+++ b/source/libs/sync/src/sync_raft_quorum_majority.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+#include "sync_raft_quorum.h"
+#include "sync_raft_quorum_majority.h"
+
+/**
+ * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
+ * a result indicating whether the vote is pending (i.e. neither a quorum of
+ * yes/no has been reached), won (a quorum of yes has been reached), or lost (a
+ * quorum of no has been reached).
+ **/
+SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes) {
+  if (config->replica == 0) {  // an empty config is reported as won
+    return SYNC_RAFT_VOTE_WON;
+  }
+
+  int i, g, missing;
+  for (i = g = missing = 0; i < TSDB_MAX_REPLICA; ++i) {
+    if (config->nodeInfo[i].nodeId == SYNC_NON_NODE_ID) {
+      continue;
+    }
+
+    if (votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
+      missing += 1;
+    } else if (votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) {
+      g += 1;
+    }
+    // Rejections need no counter of their own: the vote is lost exactly when
+    // grants plus outstanding answers can no longer reach the quorum.
+  }
+
+  int quorum = config->replica / 2 + 1;
+  if (g >= quorum) {
+    return SYNC_RAFT_VOTE_WON;
+  }
+  if (g + missing >= quorum) {  // still winnable if every missing voter grants
+    return SYNC_RAFT_VOTE_PENDING;
+  }
+
+  return SYNC_RAFT_VOTE_LOST;
+}
\ No newline at end of file