Merge branch '3.0' of github.com:taosdata/TDengine into 3.0
This commit is contained in:
commit
c1c3164503
|
@ -109,38 +109,25 @@ typedef struct SSyncLogStore {
|
|||
SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore);
|
||||
} SSyncLogStore;
|
||||
|
||||
typedef struct SSyncServerState {
|
||||
SyncNodeId voteFor;
|
||||
SyncTerm term;
|
||||
SyncIndex commitIndex;
|
||||
} SSyncServerState;
|
||||
|
||||
typedef struct SSyncClusterConfig {
|
||||
// Log index number of current cluster config.
|
||||
SyncIndex index;
|
||||
|
||||
// Log index number of previous cluster config.
|
||||
SyncIndex prevIndex;
|
||||
|
||||
// current cluster
|
||||
const SSyncCluster* cluster;
|
||||
} SSyncClusterConfig;
|
||||
|
||||
typedef struct SStateManager {
|
||||
void* pData;
|
||||
|
||||
int32_t (*saveServerState)(struct SStateManager* stateMng, SSyncServerState* state);
|
||||
// save serialized server state data, buffer will be free by Sync
|
||||
int32_t (*saveServerState)(struct SStateManager* stateMng, const char* buffer, int n);
|
||||
|
||||
int32_t (*readServerState)(struct SStateManager* stateMng, SSyncServerState* state);
|
||||
// read serialized server state data, buffer will be free by Sync
|
||||
int32_t (*readServerState)(struct SStateManager* stateMng, char** ppBuffer, int* n);
|
||||
|
||||
void (*saveCluster)(struct SStateManager* stateMng, const SSyncClusterConfig* cluster);
|
||||
// save serialized cluster state data, buffer will be free by Sync
|
||||
void (*saveClusterState)(struct SStateManager* stateMng, const char* buffer, int n);
|
||||
|
||||
const SSyncClusterConfig* (*readCluster)(struct SStateManager* stateMng);
|
||||
// read serialized cluster state data, buffer will be free by Sync
|
||||
int32_t (*readClusterState)(struct SStateManager* stateMng, char** ppBuffer, int* n);
|
||||
} SStateManager;
|
||||
|
||||
typedef struct {
|
||||
SyncGroupId vgId;
|
||||
SyncIndex snapshotIndex;
|
||||
SyncIndex appliedIndex;
|
||||
SSyncCluster syncCfg;
|
||||
SSyncFSM fsm;
|
||||
SSyncLogStore logStore;
|
||||
|
|
|
@ -65,7 +65,8 @@ struct SSyncRaft {
|
|||
|
||||
SSyncRaftLog *log;
|
||||
|
||||
int maxMsgSize;
|
||||
uint64_t maxMsgSize;
|
||||
uint64_t maxUncommittedSize;
|
||||
SSyncRaftProgressTracker *tracker;
|
||||
|
||||
ESyncState state;
|
||||
|
|
|
@ -19,16 +19,16 @@
|
|||
#include "sync.h"
|
||||
#include "sync_type.h"
|
||||
|
||||
typedef enum SyncEntryType {
|
||||
typedef enum ESyncRaftEntryType {
|
||||
SYNC_ENTRY_TYPE_LOG = 1,
|
||||
}SyncEntryType;
|
||||
} ESyncRaftEntryType;
|
||||
|
||||
struct SSyncRaftEntry {
|
||||
SyncTerm term;
|
||||
|
||||
SyncIndex index;
|
||||
|
||||
SyncEntryType type;
|
||||
ESyncRaftEntryType type;
|
||||
|
||||
SSyncBuffer buffer;
|
||||
};
|
||||
|
@ -51,6 +51,8 @@ SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog);
|
|||
|
||||
SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog);
|
||||
|
||||
void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex);
|
||||
|
||||
bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term);
|
||||
|
||||
int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog);
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
* outter message start with RAFT_MSG_*, which communicate between cluster peers,
|
||||
* need to implement its decode/encode functions.
|
||||
**/
|
||||
typedef enum RaftMessageType {
|
||||
typedef enum ESyncRaftMessageType {
|
||||
// client propose a cmd
|
||||
RAFT_MSG_INTERNAL_PROP = 1,
|
||||
|
||||
|
@ -40,7 +40,7 @@ typedef enum RaftMessageType {
|
|||
|
||||
RAFT_MSG_APPEND = 5,
|
||||
RAFT_MSG_APPEND_RESP = 6,
|
||||
} RaftMessageType;
|
||||
} ESyncRaftMessageType;
|
||||
|
||||
typedef struct RaftMsgInternal_Prop {
|
||||
const SSyncBuffer *pBuf;
|
||||
|
@ -53,14 +53,14 @@ typedef struct RaftMsgInternal_Election {
|
|||
} RaftMsgInternal_Election;
|
||||
|
||||
typedef struct RaftMsg_Vote {
|
||||
SyncRaftElectionType cType;
|
||||
ESyncRaftElectionType cType;
|
||||
SyncIndex lastIndex;
|
||||
SyncTerm lastTerm;
|
||||
} RaftMsg_Vote;
|
||||
|
||||
typedef struct RaftMsg_VoteResp {
|
||||
bool rejected;
|
||||
SyncRaftElectionType cType;
|
||||
ESyncRaftElectionType cType;
|
||||
} RaftMsg_VoteResp;
|
||||
|
||||
typedef struct RaftMsg_Append_Entries {
|
||||
|
@ -85,7 +85,7 @@ typedef struct RaftMsg_Append_Resp {
|
|||
} RaftMsg_Append_Resp;
|
||||
|
||||
typedef struct SSyncMessage {
|
||||
RaftMessageType msgType;
|
||||
ESyncRaftMessageType msgType;
|
||||
SyncTerm term;
|
||||
SyncGroupId groupId;
|
||||
SyncNodeId from;
|
||||
|
@ -131,7 +131,7 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo
|
|||
}
|
||||
|
||||
static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from,
|
||||
SyncTerm term, SyncRaftElectionType cType,
|
||||
SyncTerm term, ESyncRaftElectionType cType,
|
||||
SyncIndex lastIndex, SyncTerm lastTerm) {
|
||||
SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage));
|
||||
if (pMsg == NULL) {
|
||||
|
@ -153,7 +153,7 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId
|
|||
}
|
||||
|
||||
static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from,
|
||||
SyncRaftElectionType cType, bool rejected) {
|
||||
ESyncRaftElectionType cType, bool rejected) {
|
||||
SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage));
|
||||
if (pMsg == NULL) {
|
||||
return NULL;
|
||||
|
@ -213,7 +213,7 @@ static FORCE_INLINE SSyncMessage* syncNewEmptyAppendRespMsg(SyncGroupId groupId,
|
|||
return pMsg;
|
||||
}
|
||||
|
||||
static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) {
|
||||
static FORCE_INLINE bool syncIsInternalMsg(ESyncRaftMessageType msgType) {
|
||||
return msgType == RAFT_MSG_INTERNAL_PROP ||
|
||||
msgType == RAFT_MSG_INTERNAL_ELECTION;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,11 @@
|
|||
#include "syncInt.h"
|
||||
#include "sync_type.h"
|
||||
|
||||
int syncRaftReplicate(SSyncRaft* pRaft, int i);
|
||||
// syncRaftReplicate sends an append RPC with new entries to the given peer,
|
||||
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
||||
// argument controls whether messages with no entries will be sent
|
||||
// ("empty" messages are useful to convey updated Commit indexes, but
|
||||
// are undesirable when we're sending multiple messages in a batch).
|
||||
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty);
|
||||
|
||||
#endif /* TD_SYNC_RAFT_REPLICATION_H */
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef TD_SYNC_RAFT_CONFIG_CHANGE_H
|
||||
#define TD_SYNC_RAFT_CONFIG_CHANGE_H
|
||||
|
||||
#include "sync_type.h"
|
||||
#include "sync_raft_proto.h"
|
||||
|
||||
/**
|
||||
* Changer facilitates configuration changes. It exposes methods to handle
|
||||
* simple and joint consensus while performing the proper validation that allows
|
||||
* refusing invalid configuration changes before they affect the active
|
||||
* configuration.
|
||||
**/
|
||||
struct SSyncRaftChanger {
|
||||
SSyncRaftProgressTracker* tracker;
|
||||
SyncIndex lastIndex;
|
||||
};
|
||||
|
||||
typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
|
||||
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
|
||||
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
|
||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
|
||||
#endif /* TD_SYNC_RAFT_CONFIG_CHANGE_H */
|
|
@ -26,7 +26,7 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft);
|
|||
void syncRaftBecomeCandidate(SSyncRaft* pRaft);
|
||||
void syncRaftBecomeLeader(SSyncRaft* pRaft);
|
||||
|
||||
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType);
|
||||
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType);
|
||||
|
||||
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
|
||||
|
||||
|
@ -35,8 +35,20 @@ bool syncRaftIsPromotable(SSyncRaft* pRaft);
|
|||
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft);
|
||||
int syncRaftQuorum(SSyncRaft* pRaft);
|
||||
|
||||
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||
bool syncRaftMaybeCommit(SSyncRaft* pRaft);
|
||||
|
||||
ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||
bool preVote, bool accept,
|
||||
int* rejectNum, int *granted);
|
||||
|
||||
static FORCE_INLINE bool syncRaftIsEmptyServerState(const SSyncServerState* serverState) {
|
||||
return serverState->commitIndex == 0 &&
|
||||
serverState->term == SYNC_NON_TERM &&
|
||||
serverState->voteFor == SYNC_NON_NODE_ID;
|
||||
}
|
||||
|
||||
void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState);
|
||||
|
||||
void syncRaftBroadcastAppend(SSyncRaft* pRaft);
|
||||
|
||||
#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
*
|
||||
* PROGRESS_STATE_PROBE is the initial state.
|
||||
**/
|
||||
typedef enum RaftProgressState {
|
||||
typedef enum ESyncRaftProgressState {
|
||||
/**
|
||||
* StateProbe indicates a follower whose last index isn't known. Such a
|
||||
* follower is "probed" (i.e. an append sent periodically) to narrow down
|
||||
|
@ -56,13 +56,22 @@ typedef enum RaftProgressState {
|
|||
* return to StateReplicate.
|
||||
**/
|
||||
PROGRESS_STATE_SNAPSHOT,
|
||||
} RaftProgressState;
|
||||
} ESyncRaftProgressState;
|
||||
|
||||
static const char* kProgressStateString[] = {
|
||||
"Probe",
|
||||
"Replicate",
|
||||
"Snapshot",
|
||||
};
|
||||
|
||||
/**
|
||||
* Progress represents a follower’s progress in the view of the leader. Leader maintains
|
||||
* progresses of all followers, and sends entries to the follower based on its progress.
|
||||
**/
|
||||
struct SSyncRaftProgress {
|
||||
// index in raft cluster config
|
||||
int selfIndex;
|
||||
|
||||
SyncNodeId id;
|
||||
|
||||
SyncIndex nextIndex;
|
||||
|
@ -82,7 +91,7 @@ struct SSyncRaftProgress {
|
|||
* When in StateSnapshot, leader should have sent out snapshot
|
||||
* before and stops sending any replication message.
|
||||
**/
|
||||
RaftProgressState state;
|
||||
ESyncRaftProgressState state;
|
||||
|
||||
/**
|
||||
* pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT.
|
||||
|
@ -129,6 +138,15 @@ struct SSyncRaftProgress {
|
|||
bool isLearner;
|
||||
};
|
||||
|
||||
struct SSyncRaftProgressMap {
|
||||
SSyncRaftProgress progress[TSDB_MAX_REPLICA];
|
||||
};
|
||||
|
||||
|
||||
static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) {
|
||||
return kProgressStateString[progress->state];
|
||||
}
|
||||
|
||||
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||||
|
||||
/**
|
||||
|
@ -187,15 +205,15 @@ static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progr
|
|||
return progress->nextIndex;
|
||||
}
|
||||
|
||||
static FORCE_INLINE RaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) {
|
||||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) {
|
||||
return progress->state == PROGRESS_STATE_REPLICATE;
|
||||
}
|
||||
|
||||
static FORCE_INLINE RaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) {
|
||||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) {
|
||||
return progress->state == PROGRESS_STATE_SNAPSHOT;
|
||||
}
|
||||
|
||||
static FORCE_INLINE RaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) {
|
||||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) {
|
||||
return progress->state == PROGRESS_STATE_PROBE;
|
||||
}
|
||||
|
||||
|
@ -203,6 +221,12 @@ static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progres
|
|||
return progress->recentActive;
|
||||
}
|
||||
|
||||
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
|
||||
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
|
||||
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
|
||||
/**
|
||||
* return true if progress's log is up-todate
|
||||
**/
|
||||
|
@ -210,7 +234,9 @@ bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
|||
|
||||
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex);
|
||||
|
||||
void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to);
|
||||
|
||||
void syncRaftProgressMapCopy(const SSyncRaftProgressMap* from, SSyncRaftProgressMap* to);
|
||||
|
||||
#if 0
|
||||
|
||||
|
|
|
@ -17,78 +17,74 @@
|
|||
#define _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H
|
||||
|
||||
#include "sync_type.h"
|
||||
#include "sync_raft_quorum.h"
|
||||
#include "sync_raft_quorum_joint.h"
|
||||
#include "sync_raft_progress.h"
|
||||
#include "sync_raft_proto.h"
|
||||
|
||||
struct SSyncRaftProgressTrackerConfig {
|
||||
SSyncRaftQuorumJointConfig voters;
|
||||
|
||||
/**
|
||||
* autoLeave is true if the configuration is joint and a transition to the
|
||||
* incoming configuration should be carried out automatically by Raft when
|
||||
* this is possible. If false, the configuration will be joint until the
|
||||
* application initiates the transition manually.
|
||||
**/
|
||||
// autoLeave is true if the configuration is joint and a transition to the
|
||||
// incoming configuration should be carried out automatically by Raft when
|
||||
// this is possible. If false, the configuration will be joint until the
|
||||
// application initiates the transition manually.
|
||||
bool autoLeave;
|
||||
|
||||
/**
|
||||
* Learners is a set of IDs corresponding to the learners active in the
|
||||
* current configuration.
|
||||
*
|
||||
* Invariant: Learners and Voters does not intersect, i.e. if a peer is in
|
||||
* either half of the joint config, it can't be a learner; if it is a
|
||||
* learner it can't be in either half of the joint config. This invariant
|
||||
* simplifies the implementation since it allows peers to have clarity about
|
||||
* its current role without taking into account joint consensus.
|
||||
**/
|
||||
SyncNodeId learners[TSDB_MAX_REPLICA];
|
||||
// Learners is a set of IDs corresponding to the learners active in the
|
||||
// current configuration.
|
||||
//
|
||||
// Invariant: Learners and Voters does not intersect, i.e. if a peer is in
|
||||
// either half of the joint config, it can't be a learner; if it is a
|
||||
// learner it can't be in either half of the joint config. This invariant
|
||||
// simplifies the implementation since it allows peers to have clarity about
|
||||
// its current role without taking into account joint consensus.
|
||||
SSyncRaftNodeMap learners;
|
||||
|
||||
/**
|
||||
* When we turn a voter into a learner during a joint consensus transition,
|
||||
* we cannot add the learner directly when entering the joint state. This is
|
||||
* because this would violate the invariant that the intersection of
|
||||
* voters and learners is empty. For example, assume a Voter is removed and
|
||||
* immediately re-added as a learner (or in other words, it is demoted):
|
||||
*
|
||||
* Initially, the configuration will be
|
||||
*
|
||||
* voters: {1 2 3}
|
||||
* learners: {}
|
||||
*
|
||||
* and we want to demote 3. Entering the joint configuration, we naively get
|
||||
*
|
||||
* voters: {1 2} & {1 2 3}
|
||||
* learners: {3}
|
||||
*
|
||||
* but this violates the invariant (3 is both voter and learner). Instead,
|
||||
* we get
|
||||
*
|
||||
* voters: {1 2} & {1 2 3}
|
||||
* learners: {}
|
||||
* next_learners: {3}
|
||||
*
|
||||
* Where 3 is now still purely a voter, but we are remembering the intention
|
||||
* to make it a learner upon transitioning into the final configuration:
|
||||
*
|
||||
* voters: {1 2}
|
||||
* learners: {3}
|
||||
* next_learners: {}
|
||||
*
|
||||
* Note that next_learners is not used while adding a learner that is not
|
||||
* also a voter in the joint config. In this case, the learner is added
|
||||
* right away when entering the joint configuration, so that it is caught up
|
||||
* as soon as possible.
|
||||
**/
|
||||
SyncNodeId learnersNext[TSDB_MAX_REPLICA];
|
||||
// When we turn a voter into a learner during a joint consensus transition,
|
||||
// we cannot add the learner directly when entering the joint state. This is
|
||||
// because this would violate the invariant that the intersection of
|
||||
// voters and learners is empty. For example, assume a Voter is removed and
|
||||
// immediately re-added as a learner (or in other words, it is demoted):
|
||||
//
|
||||
// Initially, the configuration will be
|
||||
//
|
||||
// voters: {1 2 3}
|
||||
// learners: {}
|
||||
//
|
||||
// and we want to demote 3. Entering the joint configuration, we naively get
|
||||
//
|
||||
// voters: {1 2} & {1 2 3}
|
||||
// learners: {3}
|
||||
//
|
||||
// but this violates the invariant (3 is both voter and learner). Instead,
|
||||
// we get
|
||||
//
|
||||
// voters: {1 2} & {1 2 3}
|
||||
// learners: {}
|
||||
// next_learners: {3}
|
||||
//
|
||||
// Where 3 is now still purely a voter, but we are remembering the intention
|
||||
// to make it a learner upon transitioning into the final configuration:
|
||||
//
|
||||
// voters: {1 2}
|
||||
// learners: {3}
|
||||
// next_learners: {}
|
||||
//
|
||||
// Note that next_learners is not used while adding a learner that is not
|
||||
// also a voter in the joint config. In this case, the learner is added
|
||||
// right away when entering the joint configuration, so that it is caught up
|
||||
// as soon as possible.
|
||||
SSyncRaftNodeMap learnersNext;
|
||||
};
|
||||
|
||||
struct SSyncRaftProgressTracker {
|
||||
SSyncRaftProgressTrackerConfig config;
|
||||
|
||||
SSyncRaftProgress progressMap[TSDB_MAX_REPLICA];
|
||||
SSyncRaftProgressMap progressMap;
|
||||
|
||||
SyncRaftVoteResult votes[TSDB_MAX_REPLICA];
|
||||
int maxInflight;
|
||||
ESyncRaftVoteType votes[TSDB_MAX_REPLICA];
|
||||
int maxInflightMsgs;
|
||||
};
|
||||
|
||||
SSyncRaftProgressTracker* syncRaftOpenProgressTracker();
|
||||
|
@ -104,10 +100,18 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, voi
|
|||
**/
|
||||
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant);
|
||||
|
||||
void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressTrackerConfig* result);
|
||||
|
||||
int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
|
||||
/**
|
||||
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
||||
* election outcome is known.
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
|
||||
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
|
||||
|
||||
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs);
|
||||
|
||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||
|
||||
#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef TD_SYNC_RAFT_PROTO_H
|
||||
#define TD_SYNC_RAFT_PROTO_H
|
||||
|
||||
#include "sync_type.h"
|
||||
|
||||
typedef enum ESyncRaftConfChangeType {
|
||||
SYNC_RAFT_Conf_AddNode = 0,
|
||||
SYNC_RAFT_Conf_RemoveNode = 1,
|
||||
SYNC_RAFT_Conf_UpdateNode = 2,
|
||||
SYNC_RAFT_Conf_AddLearnerNode = 3,
|
||||
} ESyncRaftConfChangeType;
|
||||
|
||||
// ConfChangeSingle is an individual configuration change operation. Multiple
|
||||
// such operations can be carried out atomically via a ConfChangeV2.
|
||||
typedef struct SSyncConfChangeSingle {
|
||||
ESyncRaftConfChangeType type;
|
||||
SyncNodeId nodeId;
|
||||
} SSyncConfChangeSingle;
|
||||
|
||||
typedef struct SSyncConfChangeSingleArray {
|
||||
int n;
|
||||
SSyncConfChangeSingle* changes;
|
||||
} SSyncConfChangeSingleArray;
|
||||
|
||||
typedef struct SSyncConfigState {
|
||||
// The voters in the incoming config. (If the configuration is not joint,
|
||||
// then the outgoing config is empty).
|
||||
SSyncRaftNodeMap voters;
|
||||
|
||||
// The learners in the incoming config.
|
||||
SSyncRaftNodeMap learners;
|
||||
|
||||
// The voters in the outgoing config.
|
||||
SSyncRaftNodeMap votersOutgoing;
|
||||
|
||||
// The nodes that will become learners when the outgoing config is removed.
|
||||
// These nodes are necessarily currently in nodes_joint (or they would have
|
||||
// been added to the incoming config right away).
|
||||
SSyncRaftNodeMap learnersNext;
|
||||
|
||||
// If set, the config is joint and Raft will automatically transition into
|
||||
// the final config (i.e. remove the outgoing config) when this is safe.
|
||||
bool autoLeave;
|
||||
} SSyncConfigState;
|
||||
|
||||
#endif /* TD_SYNC_RAFT_PROTO_H */
|
|
@ -17,11 +17,11 @@
|
|||
#define TD_SYNC_RAFT_QUORUM_H
|
||||
|
||||
/**
|
||||
* SSyncRaftVoteResult indicates the outcome of a vote.
|
||||
* ESyncRaftVoteResult indicates the outcome of a vote.
|
||||
**/
|
||||
typedef enum {
|
||||
/**
|
||||
* SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future
|
||||
* SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future
|
||||
* votes, i.e. neither "yes" or "no" has reached quorum yet.
|
||||
**/
|
||||
SYNC_RAFT_VOTE_PENDING = 1,
|
||||
|
@ -35,6 +35,6 @@ typedef enum {
|
|||
* SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes".
|
||||
**/
|
||||
SYNC_RAFT_VOTE_WON = 3,
|
||||
} SSyncRaftVoteResult;
|
||||
} ESyncRaftVoteResult;
|
||||
|
||||
#endif /* TD_SYNC_RAFT_QUORUM_H */
|
|
@ -25,14 +25,41 @@
|
|||
* majority configurations. Decisions require the support of both majorities.
|
||||
**/
|
||||
typedef struct SSyncRaftQuorumJointConfig {
|
||||
SSyncCluster majorityConfig[2];
|
||||
}SSyncRaftQuorumJointConfig;
|
||||
SSyncRaftNodeMap outgoing;
|
||||
SSyncRaftNodeMap incoming;
|
||||
} SSyncRaftQuorumJointConfig;
|
||||
|
||||
/**
|
||||
* syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||
* requires both majority quorums to vote in favor.
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes);
|
||||
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes);
|
||||
|
||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||
|
||||
static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||
return syncRaftIsInNodeMap(&config->outgoing, id);
|
||||
}
|
||||
|
||||
static FORCE_INLINE bool syncRaftJointConfigInIncoming(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||
return syncRaftIsInNodeMap(&config->incoming, id);
|
||||
}
|
||||
|
||||
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id);
|
||||
|
||||
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id);
|
||||
|
||||
static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigIncoming(const SSyncRaftQuorumJointConfig* config) {
|
||||
return &config->incoming;
|
||||
}
|
||||
|
||||
static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SSyncRaftQuorumJointConfig* config) {
|
||||
return &config->outgoing;
|
||||
}
|
||||
|
||||
static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) {
|
||||
memset(&config->outgoing, 0, sizeof(SSyncCluster));
|
||||
}
|
||||
|
||||
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
#include "sync.h"
|
||||
#include "sync_type.h"
|
||||
#include "sync_raft_quorum.h"
|
||||
|
||||
/**
|
||||
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||
|
@ -25,6 +26,6 @@
|
|||
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||
* quorum of no has been reached).
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes);
|
||||
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes);
|
||||
|
||||
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef TD_SYNC_RAFT_RESTORE_H
|
||||
#define TD_SYNC_RAFT_RESTORE_H
|
||||
|
||||
#include "sync_type.h"
|
||||
#include "sync_raft_proto.h"
|
||||
|
||||
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
|
||||
// runs a sequence of changes enacting the configuration described in the
|
||||
// ConfState.
|
||||
//
|
||||
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
|
||||
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
||||
// this can just take LastIndex and MaxInflight directly instead and cook up
|
||||
// the results from that alone.
|
||||
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs);
|
||||
|
||||
#endif /* TD_SYNC_RAFT_RESTORE_H */
|
|
@ -17,6 +17,7 @@
|
|||
#define _TD_LIBS_SYNC_TYPE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "sync.h"
|
||||
#include "osMath.h"
|
||||
|
||||
#define SYNC_NON_NODE_ID -1
|
||||
|
@ -28,10 +29,13 @@ typedef uint32_t SyncTick;
|
|||
typedef struct SSyncRaft SSyncRaft;
|
||||
|
||||
typedef struct SSyncRaftProgress SSyncRaftProgress;
|
||||
typedef struct SSyncRaftProgressMap SSyncRaftProgressMap;
|
||||
typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig;
|
||||
|
||||
typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker;
|
||||
|
||||
typedef struct SSyncRaftChanger SSyncRaftChanger;
|
||||
|
||||
typedef struct SSyncRaftLog SSyncRaftLog;
|
||||
|
||||
typedef struct SSyncRaftEntry SSyncRaftEntry;
|
||||
|
@ -46,11 +50,34 @@ typedef struct SSyncRaftEntry SSyncRaftEntry;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct SSyncServerState {
|
||||
SyncNodeId voteFor;
|
||||
SyncTerm term;
|
||||
SyncIndex commitIndex;
|
||||
} SSyncServerState;
|
||||
|
||||
typedef struct SSyncClusterConfig {
|
||||
// Log index number of current cluster config.
|
||||
SyncIndex index;
|
||||
|
||||
// Log index number of previous cluster config.
|
||||
SyncIndex prevIndex;
|
||||
|
||||
// current cluster
|
||||
const SSyncCluster* cluster;
|
||||
} SSyncClusterConfig;
|
||||
|
||||
typedef struct {
|
||||
int32_t replica;
|
||||
SyncNodeId nodeId[TSDB_MAX_REPLICA];
|
||||
} SSyncRaftNodeMap;
|
||||
|
||||
typedef enum {
|
||||
SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0,
|
||||
SYNC_RAFT_CAMPAIGN_ELECTION = 1,
|
||||
SYNC_RAFT_CAMPAIGN_TRANSFER = 2,
|
||||
} SyncRaftElectionType;
|
||||
} ESyncRaftElectionType;
|
||||
|
||||
typedef enum {
|
||||
// the init vote resp status
|
||||
|
@ -59,8 +86,8 @@ typedef enum {
|
|||
// grant the vote request
|
||||
SYNC_RAFT_VOTE_RESP_GRANT = 1,
|
||||
|
||||
//reject the vote request
|
||||
// reject the vote request
|
||||
SYNC_RAFT_VOTE_RESP_REJECT = 2,
|
||||
} SyncRaftVoteResult;
|
||||
} ESyncRaftVoteType;
|
||||
|
||||
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
||||
|
|
|
@ -16,12 +16,22 @@
|
|||
#include "raft.h"
|
||||
#include "raft_configuration.h"
|
||||
#include "raft_log.h"
|
||||
#include "sync_raft_restore.h"
|
||||
#include "raft_replication.h"
|
||||
#include "sync_raft_config_change.h"
|
||||
#include "sync_raft_progress_tracker.h"
|
||||
#include "syncInt.h"
|
||||
|
||||
#define RAFT_READ_LOG_MAX_NUM 100
|
||||
|
||||
static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n);
|
||||
static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n);
|
||||
|
||||
static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config,
|
||||
const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs);
|
||||
|
||||
static void abortLeaderTransfer(SSyncRaft* pRaft);
|
||||
|
||||
static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||
static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||
|
@ -29,12 +39,15 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
|||
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||
SSyncNode* pNode = pRaft->pNode;
|
||||
SSyncServerState serverState;
|
||||
SSyncConfigState confState;
|
||||
SStateManager* stateManager;
|
||||
SSyncLogStore* logStore;
|
||||
SSyncFSM* fsm;
|
||||
SyncIndex initIndex = pInfo->snapshotIndex;
|
||||
SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM];
|
||||
int nBuf, limit, i;
|
||||
char* buf;
|
||||
int n;
|
||||
SSyncRaftChanger changer;
|
||||
|
||||
memset(pRaft, 0, sizeof(SSyncRaft));
|
||||
|
||||
|
@ -57,36 +70,47 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
|||
return -1;
|
||||
}
|
||||
// read server state
|
||||
if (stateManager->readServerState(stateManager, &serverState) != 0) {
|
||||
if (stateManager->readServerState(stateManager, &buf, &n) != 0) {
|
||||
syncError("readServerState for vgid %d fail", pInfo->vgId);
|
||||
return -1;
|
||||
}
|
||||
assert(initIndex <= serverState.commitIndex);
|
||||
|
||||
// restore fsm state from snapshot index + 1 until commitIndex
|
||||
++initIndex;
|
||||
while (initIndex <= serverState.commitIndex) {
|
||||
limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex + 1);
|
||||
|
||||
if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) {
|
||||
return -1;
|
||||
}
|
||||
assert(limit == nBuf);
|
||||
|
||||
for (i = 0; i < limit; ++i) {
|
||||
fsm->applyLog(fsm, initIndex + i, &(buffer[i]), NULL);
|
||||
free(buffer[i].data);
|
||||
}
|
||||
initIndex += nBuf;
|
||||
if (deserializeServerStateFromBuffer(&serverState, buf, n) != 0) {
|
||||
syncError("deserializeServerStateFromBuffer for vgid %d fail", pInfo->vgId);
|
||||
return -1;
|
||||
}
|
||||
assert(initIndex == serverState.commitIndex);
|
||||
free(buf);
|
||||
//assert(initIndex <= serverState.commitIndex);
|
||||
|
||||
//pRaft->heartbeatTimeoutTick = 1;
|
||||
// read config state
|
||||
if (stateManager->readClusterState(stateManager, &buf, &n) != 0) {
|
||||
syncError("readClusterState for vgid %d fail", pInfo->vgId);
|
||||
return -1;
|
||||
}
|
||||
if (deserializeClusterStateFromBuffer(&confState, buf, n) != 0) {
|
||||
syncError("deserializeClusterStateFromBuffer for vgid %d fail", pInfo->vgId);
|
||||
return -1;
|
||||
}
|
||||
free(buf);
|
||||
|
||||
changer = (SSyncRaftChanger) {
|
||||
.tracker = pRaft->tracker,
|
||||
.lastIndex = syncRaftLogLastIndex(pRaft->log),
|
||||
};
|
||||
if (syncRaftRestoreConfig(&changer, &confState) < 0) {
|
||||
syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!syncRaftIsEmptyServerState(&serverState)) {
|
||||
syncRaftLoadState(pRaft, &serverState);
|
||||
}
|
||||
|
||||
if (pInfo->appliedIndex > 0) {
|
||||
syncRaftLogAppliedTo(pRaft->log, pInfo->appliedIndex);
|
||||
}
|
||||
|
||||
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
|
||||
|
||||
pRaft->selfIndex = pRaft->cluster.selfIndex;
|
||||
|
||||
syncInfo("[%d:%d] restore vgid %d state: snapshot index success",
|
||||
pRaft->selfGroupId, pRaft->selfId, pInfo->vgId);
|
||||
return 0;
|
||||
|
@ -101,7 +125,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
RaftMessageType msgType = pMsg->msgType;
|
||||
ESyncRaftMessageType msgType = pMsg->msgType;
|
||||
if (msgType == RAFT_MSG_INTERNAL_ELECTION) {
|
||||
syncRaftHandleElectionMessage(pRaft, pMsg);
|
||||
} else if (msgType == RAFT_MSG_VOTE) {
|
||||
|
@ -119,6 +143,85 @@ int32_t syncRaftTick(SSyncRaft* pRaft) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void visitProgressMaybeSendAppend(int i, SSyncRaftProgress* progress, void* arg) {
|
||||
syncRaftReplicate(arg, progress, false);
|
||||
}
|
||||
|
||||
// switchToConfig reconfigures this node to use the provided configuration. It
|
||||
// updates the in-memory state and, when necessary, carries out additional
|
||||
// actions such as reacting to the removal of nodes or changed quorum
|
||||
// requirements.
|
||||
//
|
||||
// The inputs usually result from restoring a ConfState or applying a ConfChange.
|
||||
static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config,
|
||||
const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs) {
|
||||
SyncNodeId selfId = pRaft->selfId;
|
||||
int i;
|
||||
bool exist;
|
||||
SSyncRaftProgress* progress = NULL;
|
||||
|
||||
syncRaftConfigState(pRaft->tracker, cs);
|
||||
i = syncRaftFindProgressIndexByNodeId(&pRaft->tracker->progressMap, selfId);
|
||||
exist = (i != -1);
|
||||
|
||||
// Update whether the node itself is a learner, resetting to false when the
|
||||
// node is removed.
|
||||
if (exist) {
|
||||
progress = &pRaft->tracker->progressMap.progress[i];
|
||||
pRaft->isLearner = progress->isLearner;
|
||||
} else {
|
||||
pRaft->isLearner = false;
|
||||
}
|
||||
|
||||
if ((!exist || pRaft->isLearner) && pRaft->state == TAOS_SYNC_STATE_LEADER) {
|
||||
// This node is leader and was removed or demoted. We prevent demotions
|
||||
// at the time writing but hypothetically we handle them the same way as
|
||||
// removing the leader: stepping down into the next Term.
|
||||
//
|
||||
// TODO(tbg): step down (for sanity) and ask follower with largest Match
|
||||
// to TimeoutNow (to avoid interruption). This might still drop some
|
||||
// proposals but it's better than nothing.
|
||||
//
|
||||
// TODO(tbg): test this branch. It is untested at the time of writing.
|
||||
return;
|
||||
}
|
||||
|
||||
// The remaining steps only make sense if this node is the leader and there
|
||||
// are other nodes.
|
||||
if (pRaft->state != TAOS_SYNC_STATE_LEADER || cs->voters.replica == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (syncRaftMaybeCommit(pRaft)) {
|
||||
// If the configuration change means that more entries are committed now,
|
||||
// broadcast/append to everyone in the updated config.
|
||||
syncRaftBroadcastAppend(pRaft);
|
||||
} else {
|
||||
// Otherwise, still probe the newly added replicas; there's no reason to
|
||||
// let them wait out a heartbeat interval (or the next incoming
|
||||
// proposal).
|
||||
syncRaftProgressVisit(pRaft->tracker, visitProgressMaybeSendAppend, pRaft);
|
||||
|
||||
// If the the leadTransferee was removed or demoted, abort the leadership transfer.
|
||||
SyncNodeId leadTransferee = pRaft->leadTransferee;
|
||||
if (leadTransferee != SYNC_NON_NODE_ID && !syncRaftIsInNodeMap(&pRaft->tracker->config.voters, leadTransferee)) {
|
||||
abortLeaderTransfer(pRaft);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void abortLeaderTransfer(SSyncRaft* pRaft) {
|
||||
pRaft->leadTransferee = SYNC_NON_NODE_ID;
|
||||
}
|
||||
|
||||
/**
|
||||
* pre-handle message, return true means no need to continue
|
||||
* Handle the message term, which may result in our stepping down to a follower.
|
||||
|
@ -140,7 +243,7 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|||
|
||||
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||
SyncNodeId leaderId = pMsg->from;
|
||||
RaftMessageType msgType = pMsg->msgType;
|
||||
ESyncRaftMessageType msgType = pMsg->msgType;
|
||||
|
||||
if (msgType == RAFT_MSG_VOTE) {
|
||||
// TODO
|
||||
|
|
|
@ -18,10 +18,10 @@
|
|||
#include "raft_log.h"
|
||||
#include "raft_message.h"
|
||||
|
||||
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) {
|
||||
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||
SyncTerm term;
|
||||
bool preVote;
|
||||
RaftMessageType voteMsgType;
|
||||
ESyncRaftMessageType voteMsgType;
|
||||
|
||||
if (syncRaftIsPromotable(pRaft)) {
|
||||
syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId);
|
||||
|
@ -41,7 +41,7 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) {
|
|||
}
|
||||
|
||||
int quorum = syncRaftQuorum(pRaft);
|
||||
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
|
||||
ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
|
||||
if (result == SYNC_RAFT_VOTE_WON) {
|
||||
/**
|
||||
* We won the election after voting for ourselves (which must mean that
|
||||
|
|
|
@ -33,7 +33,7 @@ int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMs
|
|||
return 0;
|
||||
}
|
||||
|
||||
RaftMsg_Append_Entries *appendResp = &(pRespMsg->appendResp);
|
||||
RaftMsg_Append_Resp *appendResp = &(pRespMsg->appendResp);
|
||||
// ignore committed logs
|
||||
if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) {
|
||||
appendResp->index = pRaft->log->commitIndex;
|
||||
|
|
|
@ -36,7 +36,7 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from,
|
||||
ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from,
|
||||
pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION,
|
||||
!pMsg->voteResp.rejected, &rejected, &granted);
|
||||
|
||||
|
|
|
@ -31,6 +31,10 @@ SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex) {
|
||||
|
||||
}
|
||||
|
||||
bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term) {
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -16,106 +16,62 @@
|
|||
#include "raft.h"
|
||||
#include "raft_log.h"
|
||||
#include "sync_raft_progress.h"
|
||||
#include "syncInt.h"
|
||||
#include "raft_replication.h"
|
||||
|
||||
static int sendSnapshot(SSyncRaft* pRaft, int i);
|
||||
static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex index, SyncTerm term);
|
||||
static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||||
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
||||
SyncIndex prevIndex, SyncTerm prevTerm,
|
||||
const SSyncRaftEntry *entries, int nEntry);
|
||||
|
||||
int syncRaftReplicate(SSyncRaft* pRaft, int i) {
|
||||
#if 0
|
||||
// syncRaftReplicate sends an append RPC with new entries to the given peer,
|
||||
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
||||
// argument controls whether messages with no entries will be sent
|
||||
// ("empty" messages are useful to convey updated Commit indexes, but
|
||||
// are undesirable when we're sending multiple messages in a batch).
|
||||
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) {
|
||||
assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
|
||||
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||
SyncNodeId nodeId = progress->id;
|
||||
|
||||
SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId;
|
||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||
if (syncRaftProgressIsPaused(progress)) {
|
||||
syncInfo("node %d paused", nodeId);
|
||||
return 0;
|
||||
syncInfo("node [%d:%d] paused", pRaft->selfGroupId, nodeId);
|
||||
return false;
|
||||
}
|
||||
|
||||
SyncIndex nextIndex = syncRaftProgressNextIndex(progress);
|
||||
SyncIndex snapshotIndex = syncRaftLogSnapshotIndex(pRaft->log);
|
||||
bool inSnapshot = syncRaftProgressInSnapshot(progress);
|
||||
SSyncRaftEntry *entries;
|
||||
int nEntry;
|
||||
SyncIndex prevIndex;
|
||||
SyncTerm prevTerm;
|
||||
|
||||
/**
|
||||
* From Section 3.5:
|
||||
*
|
||||
* When sending an AppendEntries RPC, the leader includes the index and
|
||||
* term of the entry in its log that immediately precedes the new
|
||||
* entries. If the follower does not find an entry in its log with the
|
||||
* same index and term, then it refuses the new entries. The consistency
|
||||
* check acts as an induction step: the initial empty state of the logs
|
||||
* satisfies the Log Matching Property, and the consistency check
|
||||
* preserves the Log Matching Property whenever logs are extended. As a
|
||||
* result, whenever AppendEntries returns successfully, the leader knows
|
||||
* that the follower's log is identical to its own log up through the new
|
||||
* entries (Log Matching Property in Figure 3.2).
|
||||
**/
|
||||
if (nextIndex == 1) {
|
||||
/**
|
||||
* We're including the very first entry, so prevIndex and prevTerm are
|
||||
* null. If the first entry is not available anymore, send the last
|
||||
* snapshot if we're not already sending one.
|
||||
**/
|
||||
if (snapshotIndex > 0 && !inSnapshot) {
|
||||
goto send_snapshot;
|
||||
}
|
||||
prevIndex = nextIndex - 1;
|
||||
prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex);
|
||||
int ret = syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry);
|
||||
|
||||
// otherwise send append entries from start
|
||||
prevIndex = 0;
|
||||
prevTerm = 0;
|
||||
} else {
|
||||
/**
|
||||
* Set prevIndex and prevTerm to the index and term of the entry at
|
||||
* nextIndex - 1.
|
||||
**/
|
||||
prevIndex = nextIndex - 1;
|
||||
prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex);
|
||||
/**
|
||||
* If the entry is not anymore in our log, send the last snapshot if we're
|
||||
* not doing so already.
|
||||
**/
|
||||
if (prevTerm == SYNC_NON_TERM && !inSnapshot) {
|
||||
goto send_snapshot;
|
||||
}
|
||||
if (nEntry == 0 && !sendIfEmpty) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Send empty AppendEntries RPC when installing a snaphot */
|
||||
if (inSnapshot) {
|
||||
prevIndex = syncRaftLogLastIndex(pRaft->log);
|
||||
prevTerm = syncRaftLogLastTerm(pRaft->log);
|
||||
if (ret != 0 || prevTerm == SYNC_NON_TERM) {
|
||||
return sendSnapshot(pRaft, progress);
|
||||
}
|
||||
|
||||
return sendAppendEntries(pRaft, i, prevIndex, prevTerm);
|
||||
|
||||
send_snapshot:
|
||||
if (syncRaftProgressRecentActive(progress)) {
|
||||
/* Only send a snapshot when we have heard from the server */
|
||||
return sendSnapshot(pRaft, i);
|
||||
} else {
|
||||
/* Send empty AppendEntries RPC when we haven't heard from the server */
|
||||
prevIndex = syncRaftLogLastIndex(pRaft->log);
|
||||
prevTerm = syncRaftLogLastTerm(pRaft->log);
|
||||
return sendAppendEntries(pRaft, i, prevIndex, prevTerm);
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
return sendAppendEntries(pRaft, progress, prevIndex, prevTerm, entries, nEntry);
|
||||
}
|
||||
|
||||
static int sendSnapshot(SSyncRaft* pRaft, int i) {
|
||||
return 0;
|
||||
static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
||||
if (!syncRaftProgressRecentActive(progress)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncTerm prevTerm) {
|
||||
#if 0
|
||||
SyncIndex nextIndex = prevIndex + 1;
|
||||
SSyncRaftEntry *entries;
|
||||
int nEntry;
|
||||
SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[i]);
|
||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||
syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry);
|
||||
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
||||
SyncIndex prevIndex, SyncTerm prevTerm,
|
||||
const SSyncRaftEntry *entries, int nEntry) {
|
||||
SyncIndex lastIndex;
|
||||
SyncTerm logTerm = prevTerm;
|
||||
SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[progress->selfIndex]);
|
||||
|
||||
SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term,
|
||||
prevIndex, prevTerm, pRaft->log->commitIndex,
|
||||
|
@ -125,24 +81,27 @@ static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncT
|
|||
goto err_release_log;
|
||||
}
|
||||
|
||||
pRaft->io.send(msg, pNode);
|
||||
|
||||
if (syncRaftProgressInReplicate(progress)) {
|
||||
SyncIndex lastIndex = nextIndex + nEntry;
|
||||
syncRaftProgressOptimisticNextIndex(progress, lastIndex);
|
||||
syncRaftInflightAdd(&progress->inflights, lastIndex);
|
||||
} else if (syncRaftProgressInProbe(progress)) {
|
||||
syncRaftProgressPause(progress);
|
||||
} else {
|
||||
|
||||
if (nEntry != 0) {
|
||||
switch (progress->state) {
|
||||
// optimistically increase the next when in StateReplicate
|
||||
case PROGRESS_STATE_REPLICATE:
|
||||
lastIndex = entries[nEntry - 1].index;
|
||||
syncRaftProgressOptimisticNextIndex(progress, lastIndex);
|
||||
syncRaftInflightAdd(&progress->inflights, lastIndex);
|
||||
break;
|
||||
case PROGRESS_STATE_PROBE:
|
||||
progress->probeSent = true;
|
||||
break;
|
||||
default:
|
||||
syncFatal("[%d:%d] is sending append in unhandled state %s",
|
||||
pRaft->selfGroupId, pRaft->selfId, syncRaftProgressStateString(progress));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
syncRaftProgressUpdateSendTick(progress, pRaft->currentTick);
|
||||
|
||||
return 0;
|
||||
pRaft->io.send(msg, pNode);
|
||||
return true;
|
||||
|
||||
err_release_log:
|
||||
syncRaftLogRelease(pRaft->log, nextIndex, entries, nEntry);
|
||||
#endif
|
||||
return 0;
|
||||
syncRaftLogRelease(pRaft->log, prevIndex + 1, entries, nEntry);
|
||||
return false;
|
||||
}
|
|
@ -0,0 +1,388 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "syncInt.h"
|
||||
#include "sync_raft_config_change.h"
|
||||
#include "sync_raft_progress.h"
|
||||
#include "sync_raft_progress_tracker.h"
|
||||
#include "sync_raft_quorum_joint.h"
|
||||
|
||||
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config);
|
||||
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css);
|
||||
|
||||
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r);
|
||||
|
||||
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner);
|
||||
|
||||
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id);
|
||||
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id);
|
||||
|
||||
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||
// syncRaftChangerSimpleConfig carries out a series of configuration changes that (in aggregate)
|
||||
// mutates the incoming majority config Voters[0] by at most one. This method
|
||||
// will return an error if that is not the case, if the resulting quorum is
|
||||
// zero, or if the configuration is in a joint state (i.e. if there is an
|
||||
// outgoing configuration).
|
||||
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||
int ret;
|
||||
|
||||
ret = checkAndCopy(changer, config, progressMap);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (hasJointConfig(config)) {
|
||||
syncError("can't apply simple config change in joint config");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = applyConfig(changer, config, progressMap, css);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
int n = symDiff(syncRaftJointConfigIncoming(&changer->tracker->config.voters),
|
||||
syncRaftJointConfigIncoming(&config->voters));
|
||||
if (n > 1) {
|
||||
syncError("more than one voter changed without entering joint config");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return checkAndReturn(config, progressMap);
|
||||
}
|
||||
|
||||
// EnterJoint verifies that the outgoing (=right) majority config of the joint
|
||||
// config is empty and initializes it with a copy of the incoming (=left)
|
||||
// majority config. That is, it transitions from
|
||||
//
|
||||
// (1 2 3)&&()
|
||||
// to
|
||||
// (1 2 3)&&(1 2 3).
|
||||
//
|
||||
// The supplied changes are then applied to the incoming majority config,
|
||||
// resulting in a joint configuration that in terms of the Raft thesis[1]
|
||||
// (Section 4.3) corresponds to `C_{new,old}`.
|
||||
//
|
||||
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
||||
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
|
||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||
int ret;
|
||||
|
||||
ret = checkAndCopy(changer, config, progressMap);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
if (hasJointConfig(config)) {
|
||||
syncError("config is already joint");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if(config->voters.incoming.replica == 0) {
|
||||
// We allow adding nodes to an empty config for convenience (testing and
|
||||
// bootstrap), but you can't enter a joint state.
|
||||
syncError("can't make a zero-voter config joint");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Clear the outgoing config.
|
||||
syncRaftJointConfigClearOutgoing(&config->voters);
|
||||
|
||||
// Copy incoming to outgoing.
|
||||
memcpy(&config->voters.outgoing, &config->voters.incoming, sizeof(SSyncCluster));
|
||||
|
||||
ret = applyConfig(changer, config, progressMap, css);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
config->autoLeave = autoLeave;
|
||||
return checkAndReturn(config, progressMap);
|
||||
}
|
||||
|
||||
// checkAndCopy copies the tracker's config and progress map (deeply enough for
|
||||
// the purposes of the Changer) and returns those copies. It returns an error
|
||||
// if checkInvariants does.
|
||||
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||
syncRaftCloneTrackerConfig(&changer->tracker->config, config);
|
||||
int i;
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
SSyncRaftProgress* progress = &(changer->tracker->progressMap.progress[i]);
|
||||
if (progress->id == SYNC_NON_NODE_ID) {
|
||||
continue;
|
||||
}
|
||||
syncRaftCopyProgress(progress, &(progressMap->progress[i]));
|
||||
}
|
||||
return checkAndReturn(config, progressMap);
|
||||
}
|
||||
|
||||
// checkAndReturn calls checkInvariants on the input and returns either the
|
||||
// resulting error or the input.
|
||||
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||
if (checkInvariants(config, progressMap) != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// checkInvariants makes sure that the config and progress are compatible with
|
||||
// each other. This is used to check both what the Changer is initialized with,
|
||||
// as well as what it returns.
|
||||
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||
int ret = syncRaftCheckProgress(config, progressMap);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
int i;
|
||||
// Any staged learner was staged because it could not be directly added due
|
||||
// to a conflicting voter in the outgoing config.
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (!syncRaftJointConfigInOutgoing(&config->voters, config->learnersNext.nodeId[i])) {
|
||||
return -1;
|
||||
}
|
||||
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && progressMap->progress[i].isLearner) {
|
||||
syncError("%d is in LearnersNext, but is already marked as learner", progressMap->progress[i].id);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
// Conversely Learners and Voters doesn't intersect at all.
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (syncRaftJointConfigInIncoming(&config->voters, config->learners.nodeId[i])) {
|
||||
syncError("%d is in Learners and voter.incoming", progressMap->progress[i].id);
|
||||
return -1;
|
||||
}
|
||||
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && !progressMap->progress[i].isLearner) {
|
||||
syncError("%d is in Learners, but is not marked as learner", progressMap->progress[i].id);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasJointConfig(config)) {
|
||||
// We enforce that empty maps are nil instead of zero.
|
||||
if (config->learnersNext.replica > 0) {
|
||||
syncError("cfg.LearnersNext must be nil when not joint");
|
||||
return -1;
|
||||
}
|
||||
if (config->autoLeave) {
|
||||
syncError("AutoLeave must be false when not joint");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) {
|
||||
return config->voters.outgoing.replica > 0;
|
||||
}
|
||||
|
||||
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < css->n; ++i) {
|
||||
const SSyncConfChangeSingle* cs = &(css->changes[i]);
|
||||
if (cs->nodeId == SYNC_NON_NODE_ID) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ESyncRaftConfChangeType type = cs->type;
|
||||
switch (type) {
|
||||
case SYNC_RAFT_Conf_AddNode:
|
||||
makeVoter(changer, config, progressMap, cs->nodeId);
|
||||
break;
|
||||
case SYNC_RAFT_Conf_AddLearnerNode:
|
||||
makeLearner(changer, config, progressMap, cs->nodeId);
|
||||
break;
|
||||
case SYNC_RAFT_Conf_RemoveNode:
|
||||
removeNodeId(changer, config, progressMap, cs->nodeId);
|
||||
break;
|
||||
case SYNC_RAFT_Conf_UpdateNode:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (config->voters.incoming.replica == 0) {
|
||||
syncError("removed all voters");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// symdiff returns the count of the symmetric difference between the sets of
|
||||
// uint64s, i.e. len( (l - r) \union (r - l)).
|
||||
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) {
|
||||
int n;
|
||||
int i;
|
||||
int j0, j1;
|
||||
const SSyncRaftNodeMap* pairs[2][2] = {
|
||||
{l, r}, // count elems in l but not in r
|
||||
{r, l}, // count elems in r but not in l
|
||||
};
|
||||
|
||||
for (n = 0, i = 0; i < 2; ++i) {
|
||||
const SSyncRaftNodeMap** pp = pairs[i];
|
||||
|
||||
const SSyncRaftNodeMap* p0 = pp[0];
|
||||
const SSyncRaftNodeMap* p1 = pp[1];
|
||||
for (j0 = 0; j0 < TSDB_MAX_REPLICA; ++j0) {
|
||||
SyncNodeId id = p0->nodeId[j0];
|
||||
if (id == SYNC_NON_NODE_ID) {
|
||||
continue;
|
||||
}
|
||||
for (j1 = 0; j1 < p1->replica; ++j1) {
|
||||
if (p1->nodeId[j1] != SYNC_NON_NODE_ID && p1->nodeId[j1] != id) {
|
||||
n+=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) {
|
||||
|
||||
}
|
||||
|
||||
// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
|
||||
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
||||
int i;
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (nodeMap->nodeId[i] == id) {
|
||||
nodeMap->replica -= 1;
|
||||
nodeMap->nodeId[i] = SYNC_NON_NODE_ID;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert(nodeMap->replica >= 0);
|
||||
}
|
||||
|
||||
// nilAwareAdd populates a map entry, creating the map if necessary.
|
||||
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
||||
int i, j;
|
||||
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (nodeMap->nodeId[i] == id) {
|
||||
return;
|
||||
}
|
||||
if (j == -1 && nodeMap->nodeId[i] == SYNC_NON_NODE_ID) {
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
|
||||
assert(j != -1);
|
||||
nodeMap->nodeId[j] = id;
|
||||
nodeMap->replica += 1;
|
||||
}
|
||||
|
||||
// makeVoter adds or promotes the given ID to be a voter in the incoming
|
||||
// majority config.
|
||||
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
||||
if (i == -1) {
|
||||
initProgress(changer, config, progressMap, id, false);
|
||||
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
||||
}
|
||||
|
||||
assert(i != -1);
|
||||
SSyncRaftProgress* progress = &(progressMap->progress[i]);
|
||||
|
||||
progress->isLearner = false;
|
||||
nilAwareDelete(&config->learners, id);
|
||||
nilAwareDelete(&config->learnersNext, id);
|
||||
syncRaftJointConfigAddToIncoming(&config->voters, id);
|
||||
}
|
||||
|
||||
// makeLearner makes the given ID a learner or stages it to be a learner once
|
||||
// an active joint configuration is exited.
|
||||
//
|
||||
// The former happens when the peer is not a part of the outgoing config, in
|
||||
// which case we either add a new learner or demote a voter in the incoming
|
||||
// config.
|
||||
//
|
||||
// The latter case occurs when the configuration is joint and the peer is a
|
||||
// voter in the outgoing config. In that case, we do not want to add the peer
|
||||
// as a learner because then we'd have to track a peer as a voter and learner
|
||||
// simultaneously. Instead, we add the learner to LearnersNext, so that it will
|
||||
// be added to Learners the moment the outgoing config is removed by
|
||||
// LeaveJoint().
|
||||
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
||||
if (i == -1) {
|
||||
initProgress(changer, config, progressMap, id, false);
|
||||
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
||||
}
|
||||
|
||||
assert(i != -1);
|
||||
SSyncRaftProgress* progress = &(progressMap->progress[i]);
|
||||
if (progress->isLearner) {
|
||||
return;
|
||||
}
|
||||
// Remove any existing voter in the incoming config...
|
||||
removeNodeId(changer, config, progressMap, id);
|
||||
|
||||
// ... but save the Progress.
|
||||
syncRaftAddToProgressMap(progressMap, id);
|
||||
|
||||
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
|
||||
// if the peer is still tracked as a voter in the outgoing config. It will
|
||||
// be turned into a learner in LeaveJoint().
|
||||
//
|
||||
// Otherwise, add a regular learner right away.
|
||||
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
|
||||
if (inOutgoing) {
|
||||
nilAwareAdd(&config->learnersNext, id);
|
||||
} else {
|
||||
nilAwareAdd(&config->learners, id);
|
||||
progress->isLearner = true;
|
||||
}
|
||||
}
|
||||
|
||||
// removeNodeId this peer as a voter or learner from the incoming config.
|
||||
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
||||
if (i == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
syncRaftJointConfigRemoveFromIncoming(&config->voters, id);
|
||||
nilAwareDelete(&config->learners, id);
|
||||
nilAwareDelete(&config->learnersNext, id);
|
||||
|
||||
// If the peer is still a voter in the outgoing config, keep the Progress.
|
||||
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
|
||||
if (!inOutgoing) {
|
||||
syncRaftRemoveFromProgressMap(progressMap, id);
|
||||
}
|
||||
}
|
|
@ -31,7 +31,6 @@ static void tickElection(SSyncRaft* pRaft);
|
|||
static void tickHeartbeat(SSyncRaft* pRaft);
|
||||
|
||||
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
|
||||
static bool maybeCommit(SSyncRaft* pRaft);
|
||||
|
||||
static void abortLeaderTransfer(SSyncRaft* pRaft);
|
||||
|
||||
|
@ -127,7 +126,7 @@ int syncRaftQuorum(SSyncRaft* pRaft) {
|
|||
return pRaft->cluster.replica / 2 + 1;
|
||||
}
|
||||
|
||||
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||
ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||
bool preVote, bool grant,
|
||||
int* rejected, int *granted) {
|
||||
int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
||||
|
@ -171,6 +170,34 @@ SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
|||
return granted;
|
||||
*/
|
||||
|
||||
void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState) {
|
||||
SyncIndex commitIndex = serverState->commitIndex;
|
||||
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||
|
||||
if (commitIndex < pRaft->log->commitIndex || commitIndex > lastIndex) {
|
||||
syncFatal("[%d:%d] state.commit %"PRId64" is out of range [%" PRId64 ",%" PRId64 "",
|
||||
pRaft->selfGroupId, pRaft->selfId, commitIndex, pRaft->log->commitIndex, lastIndex);
|
||||
return;
|
||||
}
|
||||
|
||||
pRaft->log->commitIndex = commitIndex;
|
||||
pRaft->term = serverState->term;
|
||||
pRaft->voteFor = serverState->voteFor;
|
||||
}
|
||||
|
||||
static void visitProgressSendAppend(int i, SSyncRaftProgress* progress, void* arg) {
|
||||
SSyncRaft* pRaft = (SSyncRaft*)arg;
|
||||
if (pRaft->selfId == progress->id) {
|
||||
return;
|
||||
}
|
||||
|
||||
syncRaftReplicate(arg, progress, true);
|
||||
}
|
||||
|
||||
void syncRaftBroadcastAppend(SSyncRaft* pRaft) {
|
||||
syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft);
|
||||
}
|
||||
|
||||
static int convertClear(SSyncRaft* pRaft) {
|
||||
|
||||
}
|
||||
|
@ -186,7 +213,7 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|||
* StateCandidate, we may get stale MsgPreVoteResp messages in this term from
|
||||
* our pre-candidate state).
|
||||
**/
|
||||
RaftMessageType msgType = pMsg->msgType;
|
||||
ESyncRaftMessageType msgType = pMsg->msgType;
|
||||
|
||||
if (msgType == RAFT_MSG_INTERNAL_PROP) {
|
||||
return 0;
|
||||
|
@ -243,18 +270,16 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
|||
|
||||
syncRaftLogAppend(pRaft->log, entries, n);
|
||||
|
||||
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]);
|
||||
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->cluster.selfIndex]);
|
||||
syncRaftProgressMaybeUpdate(progress, lastIndex);
|
||||
// Regardless of maybeCommit's return, our caller will call bcastAppend.
|
||||
maybeCommit(pRaft);
|
||||
// Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend.
|
||||
syncRaftMaybeCommit(pRaft);
|
||||
}
|
||||
|
||||
/**
|
||||
* maybeCommit attempts to advance the commit index. Returns true if
|
||||
* the commit index changed (in which case the caller should call
|
||||
* r.bcastAppend).
|
||||
**/
|
||||
static bool maybeCommit(SSyncRaft* pRaft) {
|
||||
// syncRaftMaybeCommit attempts to advance the commit index. Returns true if
|
||||
// the commit index changed (in which case the caller should call
|
||||
// r.bcastAppend).
|
||||
bool syncRaftMaybeCommit(SSyncRaft* pRaft) {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -263,6 +288,7 @@ static bool maybeCommit(SSyncRaft* pRaft) {
|
|||
* trigger I/O requests for newly appended log entries or heartbeats.
|
||||
**/
|
||||
static int triggerAll(SSyncRaft* pRaft) {
|
||||
#if 0
|
||||
assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
|
||||
int i;
|
||||
|
||||
|
@ -271,8 +297,10 @@ static int triggerAll(SSyncRaft* pRaft) {
|
|||
continue;
|
||||
}
|
||||
|
||||
syncRaftReplicate(pRaft, i);
|
||||
syncRaftReplicate(pRaft, pRaft->tracker->progressMap.progress[i], true);
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void abortLeaderTransfer(SSyncRaft* pRaft) {
|
||||
|
|
|
@ -20,13 +20,13 @@
|
|||
#include "sync.h"
|
||||
#include "syncInt.h"
|
||||
|
||||
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state);
|
||||
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state);
|
||||
static void probeAcked(SSyncRaftProgress* progress);
|
||||
|
||||
static void resumeProgress(SSyncRaftProgress* progress);
|
||||
|
||||
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
||||
SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflight);
|
||||
SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs);
|
||||
if (inflights == NULL) {
|
||||
return;
|
||||
}
|
||||
|
@ -112,6 +112,44 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) {
|
|||
}
|
||||
}
|
||||
|
||||
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i;
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (progressMap->progress[i].id == id) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i, j;
|
||||
|
||||
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (progressMap->progress[i].id == id) {
|
||||
return i;
|
||||
}
|
||||
if (j == -1 && progressMap->progress[i].id == SYNC_NON_NODE_ID) {
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
|
||||
assert(j != -1);
|
||||
|
||||
progressMap->progress[i].id = id;
|
||||
}
|
||||
|
||||
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (progressMap->progress[i].id == id) {
|
||||
progressMap->progress[i].id = SYNC_NON_NODE_ID;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
||||
return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex;
|
||||
}
|
||||
|
@ -149,11 +187,15 @@ void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snaps
|
|||
progress->pendingSnapshotIndex = snapshotIndex;
|
||||
}
|
||||
|
||||
void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* out) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* ResetState moves the Progress into the specified State, resetting ProbeSent,
|
||||
* PendingSnapshot, and Inflights.
|
||||
**/
|
||||
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) {
|
||||
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) {
|
||||
progress->probeSent = false;
|
||||
progress->pendingSnapshotIndex = 0;
|
||||
progress->state = state;
|
||||
|
@ -233,7 +275,7 @@ void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) {
|
|||
progress->state = PROGRESS_STATE_PROBE;
|
||||
}
|
||||
|
||||
RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) {
|
||||
ESyncRaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) {
|
||||
return pRaft->leaderState.progress[i].state;
|
||||
}
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
*/
|
||||
|
||||
#include "sync_raft_progress_tracker.h"
|
||||
#include "sync_raft_proto.h"
|
||||
|
||||
SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
|
||||
SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker));
|
||||
|
@ -25,13 +26,13 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
|
|||
}
|
||||
|
||||
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
|
||||
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteResult) * TSDB_MAX_REPLICA);
|
||||
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(ESyncRaftVoteType) * TSDB_MAX_REPLICA);
|
||||
}
|
||||
|
||||
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
|
||||
int i;
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
SSyncRaftProgress* progress = &(tracker->progressMap[i]);
|
||||
SSyncRaftProgress* progress = &(tracker->progressMap.progress[i]);
|
||||
visit(i, progress, arg);
|
||||
}
|
||||
}
|
||||
|
@ -44,17 +45,21 @@ void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) {
|
|||
tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
|
||||
}
|
||||
|
||||
void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
||||
* election outcome is known.
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
|
||||
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
|
||||
int i;
|
||||
SSyncRaftProgress* progress;
|
||||
int r, g;
|
||||
|
||||
for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
progress = &(tracker->progressMap[i]);
|
||||
progress = &(tracker->progressMap.progress[i]);
|
||||
if (progress->id == SYNC_NON_NODE_ID) {
|
||||
continue;
|
||||
}
|
||||
|
@ -74,3 +79,10 @@ SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* re
|
|||
if (granted) *granted = g;
|
||||
return syncRaftVoteResult(&(tracker->config.voters), tracker->votes);
|
||||
}
|
||||
|
||||
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) {
|
||||
memcpy(&cs->voters, &tracker->config.voters.incoming, sizeof(SSyncRaftNodeMap));
|
||||
memcpy(&cs->votersOutgoing, &tracker->config.voters.outgoing, sizeof(SSyncRaftNodeMap));
|
||||
memcpy(&cs->learners, &tracker->config.learners, sizeof(SSyncRaftNodeMap));
|
||||
memcpy(&cs->learnersNext, &tracker->config.learnersNext, sizeof(SSyncRaftNodeMap));
|
||||
}
|
|
@ -22,9 +22,9 @@
|
|||
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||
* requires both majority quorums to vote in favor.
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes) {
|
||||
SyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->majorityConfig[0]), votes);
|
||||
SyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->majorityConfig[1]), votes);
|
||||
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes) {
|
||||
ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votes);
|
||||
ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votes);
|
||||
|
||||
if (r1 == r2) {
|
||||
// If they agree, return the agreed state.
|
||||
|
@ -39,3 +39,47 @@ SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const
|
|||
// One side won, the other one is pending, so the whole outcome is.
|
||||
return SYNC_RAFT_VOTE_PENDING;
|
||||
}
|
||||
|
||||
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||
int i, min;
|
||||
|
||||
for (i = 0, min = -1; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (config->incoming.nodeId[i] == id) {
|
||||
return;
|
||||
}
|
||||
if (min == -1 && config->incoming.nodeId[i] == SYNC_NON_NODE_ID) {
|
||||
min = i;
|
||||
}
|
||||
}
|
||||
|
||||
assert(min != -1);
|
||||
config->incoming.nodeId[min] = id;
|
||||
config->incoming.replica += 1;
|
||||
}
|
||||
|
||||
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (config->incoming.nodeId[i] == id) {
|
||||
config->incoming.replica -= 1;
|
||||
config->incoming.nodeId[i] = SYNC_NON_NODE_ID;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert(config->incoming.replica >= 0);
|
||||
}
|
||||
|
||||
|
||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (nodeId == nodeMap->nodeId[i]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
|
@ -22,14 +22,14 @@
|
|||
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||
* quorum of no has been reached).
|
||||
**/
|
||||
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes) {
|
||||
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes) {
|
||||
if (config->replica == 0) {
|
||||
return SYNC_RAFT_VOTE_WON;
|
||||
}
|
||||
|
||||
int i, g, r, missing;
|
||||
for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||
if (config->nodeInfo[i].nodeId == SYNC_NON_NODE_ID) {
|
||||
if (config->nodeId[i] == SYNC_NON_NODE_ID) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "sync_raft_config_change.h"
|
||||
#include "sync_raft_restore.h"
|
||||
#include "sync_raft_progress_tracker.h"
|
||||
|
||||
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in);
|
||||
|
||||
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
|
||||
// runs a sequence of changes enacting the configuration described in the
|
||||
// ConfState.
|
||||
//
|
||||
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
|
||||
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
||||
// this can just take LastIndex and MaxInflight directly instead and cook up
|
||||
// the results from that alone.
|
||||
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) {
|
||||
SSyncConfChangeSingleArray outgoing;
|
||||
SSyncConfChangeSingleArray incoming;
|
||||
SSyncConfChangeSingleArray css;
|
||||
SSyncRaftProgressTracker* tracker = changer->tracker;
|
||||
SSyncRaftProgressTrackerConfig* config = &tracker->config;
|
||||
SSyncRaftProgressMap* progressMap = &tracker->progressMap;
|
||||
int i, ret;
|
||||
|
||||
ret = toConfChangeSingle(cs, &outgoing, &incoming);
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (outgoing.n == 0) {
|
||||
// No outgoing config, so just apply the incoming changes one by one.
|
||||
for (i = 0; i < incoming.n; ++i) {
|
||||
css = (SSyncConfChangeSingleArray) {
|
||||
.n = 1,
|
||||
.changes = &incoming.changes[i],
|
||||
};
|
||||
ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap);
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The ConfState describes a joint configuration.
|
||||
//
|
||||
// First, apply all of the changes of the outgoing config one by one, so
|
||||
// that it temporarily becomes the incoming active config. For example,
|
||||
// if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&().
|
||||
for (i = 0; i < outgoing.n; ++i) {
|
||||
css = (SSyncConfChangeSingleArray) {
|
||||
.n = 1,
|
||||
.changes = &outgoing.changes[i],
|
||||
};
|
||||
ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap);
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap);
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (incoming.n != 0) free(incoming.changes);
|
||||
if (outgoing.n != 0) free(outgoing.changes);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
|
||||
// first the config that will become the outgoing one, and then the incoming one, and
|
||||
// b) another slice that, when applied to the config resulted from 1), represents the
|
||||
// ConfState.
|
||||
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) {
|
||||
int i;
|
||||
|
||||
out->n = in->n = 0;
|
||||
|
||||
out->n = cs->votersOutgoing.replica;
|
||||
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n);
|
||||
if (out->changes == NULL) {
|
||||
out->n = 0;
|
||||
return -1;
|
||||
}
|
||||
in->n = cs->votersOutgoing.replica + cs->voters.replica + cs->learners.replica + cs->learnersNext.replica;
|
||||
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n);
|
||||
if (in->changes == NULL) {
|
||||
in->n = 0;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Example to follow along this code:
|
||||
// voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4)
|
||||
//
|
||||
// This means that before entering the joint config, the configuration
|
||||
// had voters (1 2 4 6) and perhaps some learners that are already gone.
|
||||
// The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6)
|
||||
// are no longer voters; however 4 is poised to become a learner upon leaving
|
||||
// the joint state.
|
||||
// We can't tell whether 5 was a learner before entering the joint config,
|
||||
// but it doesn't matter (we'll pretend that it wasn't).
|
||||
//
|
||||
// The code below will construct
|
||||
// outgoing = add 1; add 2; add 4; add 6
|
||||
// incoming = remove 1; remove 2; remove 4; remove 6
|
||||
// add 1; add 2; add 3;
|
||||
// add-learner 5;
|
||||
// add-learner 4;
|
||||
//
|
||||
// So, when starting with an empty config, after applying 'outgoing' we have
|
||||
//
|
||||
// quorum=(1 2 4 6)
|
||||
//
|
||||
// From which we enter a joint state via 'incoming'
|
||||
//
|
||||
// quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4)
|
||||
//
|
||||
// as desired.
|
||||
|
||||
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
|
||||
// If there are outgoing voters, first add them one by one so that the
|
||||
// (non-joint) config has them all.
|
||||
out->changes[i] = (SSyncConfChangeSingle) {
|
||||
.type = SYNC_RAFT_Conf_AddNode,
|
||||
.nodeId = cs->votersOutgoing.nodeId[i],
|
||||
};
|
||||
}
|
||||
|
||||
// We're done constructing the outgoing slice, now on to the incoming one
|
||||
// (which will apply on top of the config created by the outgoing slice).
|
||||
|
||||
// First, we'll remove all of the outgoing voters.
|
||||
int j = 0;
|
||||
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
|
||||
in->changes[j] = (SSyncConfChangeSingle) {
|
||||
.type = SYNC_RAFT_Conf_RemoveNode,
|
||||
.nodeId = cs->votersOutgoing.nodeId[i],
|
||||
};
|
||||
j += 1;
|
||||
}
|
||||
// Then we'll add the incoming voters and learners.
|
||||
for (i = 0; i < cs->voters.replica; ++i) {
|
||||
in->changes[j] = (SSyncConfChangeSingle) {
|
||||
.type = SYNC_RAFT_Conf_AddNode,
|
||||
.nodeId = cs->voters.nodeId[i],
|
||||
};
|
||||
j += 1;
|
||||
}
|
||||
for (i = 0; i < cs->learners.replica; ++i) {
|
||||
in->changes[j] = (SSyncConfChangeSingle) {
|
||||
.type = SYNC_RAFT_Conf_AddLearnerNode,
|
||||
.nodeId = cs->learners.nodeId[i],
|
||||
};
|
||||
j += 1;
|
||||
}
|
||||
// Same for LearnersNext; these are nodes we want to be learners but which
|
||||
// are currently voters in the outgoing config.
|
||||
for (i = 0; i < cs->learnersNext.replica; ++i) {
|
||||
in->changes[j] = (SSyncConfChangeSingle) {
|
||||
.type = SYNC_RAFT_Conf_AddLearnerNode,
|
||||
.nodeId = cs->learnersNext.nodeId[i],
|
||||
};
|
||||
j += 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue