Merge branch '3.0' of github.com:taosdata/TDengine into 3.0

This commit is contained in:
Hongze Cheng 2021-11-16 15:49:18 +08:00
commit c1c3164503
28 changed files with 1253 additions and 265 deletions

View File

@ -109,38 +109,25 @@ typedef struct SSyncLogStore {
SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore); SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore);
} SSyncLogStore; } SSyncLogStore;
typedef struct SSyncServerState {
SyncNodeId voteFor;
SyncTerm term;
SyncIndex commitIndex;
} SSyncServerState;
typedef struct SSyncClusterConfig {
// Log index number of current cluster config.
SyncIndex index;
// Log index number of previous cluster config.
SyncIndex prevIndex;
// current cluster
const SSyncCluster* cluster;
} SSyncClusterConfig;
typedef struct SStateManager { typedef struct SStateManager {
void* pData; void* pData;
int32_t (*saveServerState)(struct SStateManager* stateMng, SSyncServerState* state); // save serialized server state data, buffer will be free by Sync
int32_t (*saveServerState)(struct SStateManager* stateMng, const char* buffer, int n);
int32_t (*readServerState)(struct SStateManager* stateMng, SSyncServerState* state); // read serialized server state data, buffer will be free by Sync
int32_t (*readServerState)(struct SStateManager* stateMng, char** ppBuffer, int* n);
void (*saveCluster)(struct SStateManager* stateMng, const SSyncClusterConfig* cluster); // save serialized cluster state data, buffer will be free by Sync
void (*saveClusterState)(struct SStateManager* stateMng, const char* buffer, int n);
const SSyncClusterConfig* (*readCluster)(struct SStateManager* stateMng); // read serialized cluster state data, buffer will be free by Sync
int32_t (*readClusterState)(struct SStateManager* stateMng, char** ppBuffer, int* n);
} SStateManager; } SStateManager;
typedef struct { typedef struct {
SyncGroupId vgId; SyncGroupId vgId;
SyncIndex snapshotIndex; SyncIndex appliedIndex;
SSyncCluster syncCfg; SSyncCluster syncCfg;
SSyncFSM fsm; SSyncFSM fsm;
SSyncLogStore logStore; SSyncLogStore logStore;

View File

@ -65,7 +65,8 @@ struct SSyncRaft {
SSyncRaftLog *log; SSyncRaftLog *log;
int maxMsgSize; uint64_t maxMsgSize;
uint64_t maxUncommittedSize;
SSyncRaftProgressTracker *tracker; SSyncRaftProgressTracker *tracker;
ESyncState state; ESyncState state;

View File

@ -19,16 +19,16 @@
#include "sync.h" #include "sync.h"
#include "sync_type.h" #include "sync_type.h"
typedef enum SyncEntryType { typedef enum ESyncRaftEntryType {
SYNC_ENTRY_TYPE_LOG = 1, SYNC_ENTRY_TYPE_LOG = 1,
}SyncEntryType; } ESyncRaftEntryType;
struct SSyncRaftEntry { struct SSyncRaftEntry {
SyncTerm term; SyncTerm term;
SyncIndex index; SyncIndex index;
SyncEntryType type; ESyncRaftEntryType type;
SSyncBuffer buffer; SSyncBuffer buffer;
}; };
@ -51,6 +51,8 @@ SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog);
SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog); SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog);
void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex);
bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term); bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term);
int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog); int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog);

View File

@ -28,7 +28,7 @@
* outter message start with RAFT_MSG_*, which communicate between cluster peers, * outter message start with RAFT_MSG_*, which communicate between cluster peers,
* need to implement its decode/encode functions. * need to implement its decode/encode functions.
**/ **/
typedef enum RaftMessageType { typedef enum ESyncRaftMessageType {
// client propose a cmd // client propose a cmd
RAFT_MSG_INTERNAL_PROP = 1, RAFT_MSG_INTERNAL_PROP = 1,
@ -40,7 +40,7 @@ typedef enum RaftMessageType {
RAFT_MSG_APPEND = 5, RAFT_MSG_APPEND = 5,
RAFT_MSG_APPEND_RESP = 6, RAFT_MSG_APPEND_RESP = 6,
} RaftMessageType; } ESyncRaftMessageType;
typedef struct RaftMsgInternal_Prop { typedef struct RaftMsgInternal_Prop {
const SSyncBuffer *pBuf; const SSyncBuffer *pBuf;
@ -53,14 +53,14 @@ typedef struct RaftMsgInternal_Election {
} RaftMsgInternal_Election; } RaftMsgInternal_Election;
typedef struct RaftMsg_Vote { typedef struct RaftMsg_Vote {
SyncRaftElectionType cType; ESyncRaftElectionType cType;
SyncIndex lastIndex; SyncIndex lastIndex;
SyncTerm lastTerm; SyncTerm lastTerm;
} RaftMsg_Vote; } RaftMsg_Vote;
typedef struct RaftMsg_VoteResp { typedef struct RaftMsg_VoteResp {
bool rejected; bool rejected;
SyncRaftElectionType cType; ESyncRaftElectionType cType;
} RaftMsg_VoteResp; } RaftMsg_VoteResp;
typedef struct RaftMsg_Append_Entries { typedef struct RaftMsg_Append_Entries {
@ -85,7 +85,7 @@ typedef struct RaftMsg_Append_Resp {
} RaftMsg_Append_Resp; } RaftMsg_Append_Resp;
typedef struct SSyncMessage { typedef struct SSyncMessage {
RaftMessageType msgType; ESyncRaftMessageType msgType;
SyncTerm term; SyncTerm term;
SyncGroupId groupId; SyncGroupId groupId;
SyncNodeId from; SyncNodeId from;
@ -131,7 +131,7 @@ static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNo
} }
static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from,
SyncTerm term, SyncRaftElectionType cType, SyncTerm term, ESyncRaftElectionType cType,
SyncIndex lastIndex, SyncTerm lastTerm) { SyncIndex lastIndex, SyncTerm lastTerm) {
SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage));
if (pMsg == NULL) { if (pMsg == NULL) {
@ -153,7 +153,7 @@ static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId
} }
static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from, static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from,
SyncRaftElectionType cType, bool rejected) { ESyncRaftElectionType cType, bool rejected) {
SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage));
if (pMsg == NULL) { if (pMsg == NULL) {
return NULL; return NULL;
@ -213,7 +213,7 @@ static FORCE_INLINE SSyncMessage* syncNewEmptyAppendRespMsg(SyncGroupId groupId,
return pMsg; return pMsg;
} }
static FORCE_INLINE bool syncIsInternalMsg(RaftMessageType msgType) { static FORCE_INLINE bool syncIsInternalMsg(ESyncRaftMessageType msgType) {
return msgType == RAFT_MSG_INTERNAL_PROP || return msgType == RAFT_MSG_INTERNAL_PROP ||
msgType == RAFT_MSG_INTERNAL_ELECTION; msgType == RAFT_MSG_INTERNAL_ELECTION;
} }

View File

@ -20,6 +20,11 @@
#include "syncInt.h" #include "syncInt.h"
#include "sync_type.h" #include "sync_type.h"
int syncRaftReplicate(SSyncRaft* pRaft, int i); // syncRaftReplicate sends an append RPC with new entries to the given peer,
// if necessary. Returns true if a message was sent. The sendIfEmpty
// argument controls whether messages with no entries will be sent
// ("empty" messages are useful to convey updated Commit indexes, but
// are undesirable when we're sending multiple messages in a batch).
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty);
#endif /* TD_SYNC_RAFT_REPLICATION_H */ #endif /* TD_SYNC_RAFT_REPLICATION_H */

View File

@ -0,0 +1,42 @@
/*
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TD_SYNC_RAFT_CONFIG_CHANGE_H
#define TD_SYNC_RAFT_CONFIG_CHANGE_H
#include "sync_type.h"
#include "sync_raft_proto.h"
/**
* Changer facilitates configuration changes. It exposes methods to handle
* simple and joint consensus while performing the proper validation that allows
* refusing invalid configuration changes before they affect the active
* configuration.
**/
struct SSyncRaftChanger {
SSyncRaftProgressTracker* tracker;
SyncIndex lastIndex;
};
typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
#endif /* TD_SYNC_RAFT_CONFIG_CHANGE_H */

View File

@ -26,7 +26,7 @@ void syncRaftBecomePreCandidate(SSyncRaft* pRaft);
void syncRaftBecomeCandidate(SSyncRaft* pRaft); void syncRaftBecomeCandidate(SSyncRaft* pRaft);
void syncRaftBecomeLeader(SSyncRaft* pRaft); void syncRaftBecomeLeader(SSyncRaft* pRaft);
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType); void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType);
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
@ -35,8 +35,20 @@ bool syncRaftIsPromotable(SSyncRaft* pRaft);
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft);
int syncRaftQuorum(SSyncRaft* pRaft); int syncRaftQuorum(SSyncRaft* pRaft);
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id, bool syncRaftMaybeCommit(SSyncRaft* pRaft);
ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
bool preVote, bool accept, bool preVote, bool accept,
int* rejectNum, int *granted); int* rejectNum, int *granted);
static FORCE_INLINE bool syncRaftIsEmptyServerState(const SSyncServerState* serverState) {
return serverState->commitIndex == 0 &&
serverState->term == SYNC_NON_TERM &&
serverState->voteFor == SYNC_NON_NODE_ID;
}
void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState);
void syncRaftBroadcastAppend(SSyncRaft* pRaft);
#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */ #endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */

View File

@ -34,7 +34,7 @@
* *
* PROGRESS_STATE_PROBE is the initial state. * PROGRESS_STATE_PROBE is the initial state.
**/ **/
typedef enum RaftProgressState { typedef enum ESyncRaftProgressState {
/** /**
* StateProbe indicates a follower whose last index isn't known. Such a * StateProbe indicates a follower whose last index isn't known. Such a
* follower is "probed" (i.e. an append sent periodically) to narrow down * follower is "probed" (i.e. an append sent periodically) to narrow down
@ -56,13 +56,22 @@ typedef enum RaftProgressState {
* return to StateReplicate. * return to StateReplicate.
**/ **/
PROGRESS_STATE_SNAPSHOT, PROGRESS_STATE_SNAPSHOT,
} RaftProgressState; } ESyncRaftProgressState;
static const char* kProgressStateString[] = {
"Probe",
"Replicate",
"Snapshot",
};
/** /**
* Progress represents a followers progress in the view of the leader. Leader maintains * Progress represents a followers progress in the view of the leader. Leader maintains
* progresses of all followers, and sends entries to the follower based on its progress. * progresses of all followers, and sends entries to the follower based on its progress.
**/ **/
struct SSyncRaftProgress { struct SSyncRaftProgress {
// index in raft cluster config
int selfIndex;
SyncNodeId id; SyncNodeId id;
SyncIndex nextIndex; SyncIndex nextIndex;
@ -82,7 +91,7 @@ struct SSyncRaftProgress {
* When in StateSnapshot, leader should have sent out snapshot * When in StateSnapshot, leader should have sent out snapshot
* before and stops sending any replication message. * before and stops sending any replication message.
**/ **/
RaftProgressState state; ESyncRaftProgressState state;
/** /**
* pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT. * pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT.
@ -129,6 +138,15 @@ struct SSyncRaftProgress {
bool isLearner; bool isLearner;
}; };
struct SSyncRaftProgressMap {
SSyncRaftProgress progress[TSDB_MAX_REPLICA];
};
static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) {
return kProgressStateString[progress->state];
}
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress); void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress);
/** /**
@ -187,15 +205,15 @@ static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progr
return progress->nextIndex; return progress->nextIndex;
} }
static FORCE_INLINE RaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) { static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) {
return progress->state == PROGRESS_STATE_REPLICATE; return progress->state == PROGRESS_STATE_REPLICATE;
} }
static FORCE_INLINE RaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) { static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) {
return progress->state == PROGRESS_STATE_SNAPSHOT; return progress->state == PROGRESS_STATE_SNAPSHOT;
} }
static FORCE_INLINE RaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) { static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) {
return progress->state == PROGRESS_STATE_PROBE; return progress->state == PROGRESS_STATE_PROBE;
} }
@ -203,6 +221,12 @@ static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progres
return progress->recentActive; return progress->recentActive;
} }
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id);
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
/** /**
* return true if progress's log is up-todate * return true if progress's log is up-todate
**/ **/
@ -210,7 +234,9 @@ bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress);
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex);
void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to);
void syncRaftProgressMapCopy(const SSyncRaftProgressMap* from, SSyncRaftProgressMap* to);
#if 0 #if 0

View File

@ -17,78 +17,74 @@
#define _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H #define _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H
#include "sync_type.h" #include "sync_type.h"
#include "sync_raft_quorum.h"
#include "sync_raft_quorum_joint.h" #include "sync_raft_quorum_joint.h"
#include "sync_raft_progress.h" #include "sync_raft_progress.h"
#include "sync_raft_proto.h"
struct SSyncRaftProgressTrackerConfig { struct SSyncRaftProgressTrackerConfig {
SSyncRaftQuorumJointConfig voters; SSyncRaftQuorumJointConfig voters;
/** // autoLeave is true if the configuration is joint and a transition to the
* autoLeave is true if the configuration is joint and a transition to the // incoming configuration should be carried out automatically by Raft when
* incoming configuration should be carried out automatically by Raft when // this is possible. If false, the configuration will be joint until the
* this is possible. If false, the configuration will be joint until the // application initiates the transition manually.
* application initiates the transition manually.
**/
bool autoLeave; bool autoLeave;
/** // Learners is a set of IDs corresponding to the learners active in the
* Learners is a set of IDs corresponding to the learners active in the // current configuration.
* current configuration. //
* // Invariant: Learners and Voters does not intersect, i.e. if a peer is in
* Invariant: Learners and Voters does not intersect, i.e. if a peer is in // either half of the joint config, it can't be a learner; if it is a
* either half of the joint config, it can't be a learner; if it is a // learner it can't be in either half of the joint config. This invariant
* learner it can't be in either half of the joint config. This invariant // simplifies the implementation since it allows peers to have clarity about
* simplifies the implementation since it allows peers to have clarity about // its current role without taking into account joint consensus.
* its current role without taking into account joint consensus. SSyncRaftNodeMap learners;
**/
SyncNodeId learners[TSDB_MAX_REPLICA];
/** // When we turn a voter into a learner during a joint consensus transition,
* When we turn a voter into a learner during a joint consensus transition, // we cannot add the learner directly when entering the joint state. This is
* we cannot add the learner directly when entering the joint state. This is // because this would violate the invariant that the intersection of
* because this would violate the invariant that the intersection of // voters and learners is empty. For example, assume a Voter is removed and
* voters and learners is empty. For example, assume a Voter is removed and // immediately re-added as a learner (or in other words, it is demoted):
* immediately re-added as a learner (or in other words, it is demoted): //
* // Initially, the configuration will be
* Initially, the configuration will be //
* // voters: {1 2 3}
* voters: {1 2 3} // learners: {}
* learners: {} //
* // and we want to demote 3. Entering the joint configuration, we naively get
* and we want to demote 3. Entering the joint configuration, we naively get //
* // voters: {1 2} & {1 2 3}
* voters: {1 2} & {1 2 3} // learners: {3}
* learners: {3} //
* // but this violates the invariant (3 is both voter and learner). Instead,
* but this violates the invariant (3 is both voter and learner). Instead, // we get
* we get //
* // voters: {1 2} & {1 2 3}
* voters: {1 2} & {1 2 3} // learners: {}
* learners: {} // next_learners: {3}
* next_learners: {3} //
* // Where 3 is now still purely a voter, but we are remembering the intention
* Where 3 is now still purely a voter, but we are remembering the intention // to make it a learner upon transitioning into the final configuration:
* to make it a learner upon transitioning into the final configuration: //
* // voters: {1 2}
* voters: {1 2} // learners: {3}
* learners: {3} // next_learners: {}
* next_learners: {} //
* // Note that next_learners is not used while adding a learner that is not
* Note that next_learners is not used while adding a learner that is not // also a voter in the joint config. In this case, the learner is added
* also a voter in the joint config. In this case, the learner is added // right away when entering the joint configuration, so that it is caught up
* right away when entering the joint configuration, so that it is caught up // as soon as possible.
* as soon as possible. SSyncRaftNodeMap learnersNext;
**/
SyncNodeId learnersNext[TSDB_MAX_REPLICA];
}; };
struct SSyncRaftProgressTracker { struct SSyncRaftProgressTracker {
SSyncRaftProgressTrackerConfig config; SSyncRaftProgressTrackerConfig config;
SSyncRaftProgress progressMap[TSDB_MAX_REPLICA]; SSyncRaftProgressMap progressMap;
SyncRaftVoteResult votes[TSDB_MAX_REPLICA]; ESyncRaftVoteType votes[TSDB_MAX_REPLICA];
int maxInflight; int maxInflightMsgs;
}; };
SSyncRaftProgressTracker* syncRaftOpenProgressTracker(); SSyncRaftProgressTracker* syncRaftOpenProgressTracker();
@ -104,10 +100,18 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, voi
**/ **/
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant); void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant);
void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressTrackerConfig* result);
int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
/** /**
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
* election outcome is known. * election outcome is known.
**/ **/
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted); ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs);
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */ #endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */

View File

@ -0,0 +1,61 @@
/*
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TD_SYNC_RAFT_PROTO_H
#define TD_SYNC_RAFT_PROTO_H
#include "sync_type.h"
typedef enum ESyncRaftConfChangeType {
SYNC_RAFT_Conf_AddNode = 0,
SYNC_RAFT_Conf_RemoveNode = 1,
SYNC_RAFT_Conf_UpdateNode = 2,
SYNC_RAFT_Conf_AddLearnerNode = 3,
} ESyncRaftConfChangeType;
// ConfChangeSingle is an individual configuration change operation. Multiple
// such operations can be carried out atomically via a ConfChangeV2.
typedef struct SSyncConfChangeSingle {
ESyncRaftConfChangeType type;
SyncNodeId nodeId;
} SSyncConfChangeSingle;
typedef struct SSyncConfChangeSingleArray {
int n;
SSyncConfChangeSingle* changes;
} SSyncConfChangeSingleArray;
typedef struct SSyncConfigState {
// The voters in the incoming config. (If the configuration is not joint,
// then the outgoing config is empty).
SSyncRaftNodeMap voters;
// The learners in the incoming config.
SSyncRaftNodeMap learners;
// The voters in the outgoing config.
SSyncRaftNodeMap votersOutgoing;
// The nodes that will become learners when the outgoing config is removed.
// These nodes are necessarily currently in nodes_joint (or they would have
// been added to the incoming config right away).
SSyncRaftNodeMap learnersNext;
// If set, the config is joint and Raft will automatically transition into
// the final config (i.e. remove the outgoing config) when this is safe.
bool autoLeave;
} SSyncConfigState;
#endif /* TD_SYNC_RAFT_PROTO_H */

View File

@ -17,11 +17,11 @@
#define TD_SYNC_RAFT_QUORUM_H #define TD_SYNC_RAFT_QUORUM_H
/** /**
* SSyncRaftVoteResult indicates the outcome of a vote. * ESyncRaftVoteResult indicates the outcome of a vote.
**/ **/
typedef enum { typedef enum {
/** /**
* SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future * SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future
* votes, i.e. neither "yes" or "no" has reached quorum yet. * votes, i.e. neither "yes" or "no" has reached quorum yet.
**/ **/
SYNC_RAFT_VOTE_PENDING = 1, SYNC_RAFT_VOTE_PENDING = 1,
@ -35,6 +35,6 @@ typedef enum {
* SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes". * SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes".
**/ **/
SYNC_RAFT_VOTE_WON = 3, SYNC_RAFT_VOTE_WON = 3,
} SSyncRaftVoteResult; } ESyncRaftVoteResult;
#endif /* TD_SYNC_RAFT_QUORUM_H */ #endif /* TD_SYNC_RAFT_QUORUM_H */

View File

@ -25,14 +25,41 @@
* majority configurations. Decisions require the support of both majorities. * majority configurations. Decisions require the support of both majorities.
**/ **/
typedef struct SSyncRaftQuorumJointConfig { typedef struct SSyncRaftQuorumJointConfig {
SSyncCluster majorityConfig[2]; SSyncRaftNodeMap outgoing;
}SSyncRaftQuorumJointConfig; SSyncRaftNodeMap incoming;
} SSyncRaftQuorumJointConfig;
/** /**
* syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns * syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
* a result indicating whether the vote is pending, lost, or won. A joint quorum * a result indicating whether the vote is pending, lost, or won. A joint quorum
* requires both majority quorums to vote in favor. * requires both majority quorums to vote in favor.
**/ **/
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes); ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes);
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
return syncRaftIsInNodeMap(&config->outgoing, id);
}
static FORCE_INLINE bool syncRaftJointConfigInIncoming(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
return syncRaftIsInNodeMap(&config->incoming, id);
}
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id);
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id);
static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigIncoming(const SSyncRaftQuorumJointConfig* config) {
return &config->incoming;
}
static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SSyncRaftQuorumJointConfig* config) {
return &config->outgoing;
}
static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) {
memset(&config->outgoing, 0, sizeof(SSyncCluster));
}
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */

View File

@ -18,6 +18,7 @@
#include "sync.h" #include "sync.h"
#include "sync_type.h" #include "sync_type.h"
#include "sync_raft_quorum.h"
/** /**
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
@ -25,6 +26,6 @@
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a * yes/no has been reached), won (a quorum of yes has been reached), or lost (a
* quorum of no has been reached). * quorum of no has been reached).
**/ **/
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes); ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes);
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */ #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TD_SYNC_RAFT_RESTORE_H
#define TD_SYNC_RAFT_RESTORE_H
#include "sync_type.h"
#include "sync_raft_proto.h"
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
// runs a sequence of changes enacting the configuration described in the
// ConfState.
//
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
// this can just take LastIndex and MaxInflight directly instead and cook up
// the results from that alone.
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs);
#endif /* TD_SYNC_RAFT_RESTORE_H */

View File

@ -17,6 +17,7 @@
#define _TD_LIBS_SYNC_TYPE_H #define _TD_LIBS_SYNC_TYPE_H
#include <stdint.h> #include <stdint.h>
#include "sync.h"
#include "osMath.h" #include "osMath.h"
#define SYNC_NON_NODE_ID -1 #define SYNC_NON_NODE_ID -1
@ -28,10 +29,13 @@ typedef uint32_t SyncTick;
typedef struct SSyncRaft SSyncRaft; typedef struct SSyncRaft SSyncRaft;
typedef struct SSyncRaftProgress SSyncRaftProgress; typedef struct SSyncRaftProgress SSyncRaftProgress;
typedef struct SSyncRaftProgressMap SSyncRaftProgressMap;
typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig; typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig;
typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker; typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker;
typedef struct SSyncRaftChanger SSyncRaftChanger;
typedef struct SSyncRaftLog SSyncRaftLog; typedef struct SSyncRaftLog SSyncRaftLog;
typedef struct SSyncRaftEntry SSyncRaftEntry; typedef struct SSyncRaftEntry SSyncRaftEntry;
@ -46,11 +50,34 @@ typedef struct SSyncRaftEntry SSyncRaftEntry;
#endif #endif
#endif #endif
typedef struct SSyncServerState {
SyncNodeId voteFor;
SyncTerm term;
SyncIndex commitIndex;
} SSyncServerState;
typedef struct SSyncClusterConfig {
// Log index number of current cluster config.
SyncIndex index;
// Log index number of previous cluster config.
SyncIndex prevIndex;
// current cluster
const SSyncCluster* cluster;
} SSyncClusterConfig;
typedef struct {
int32_t replica;
SyncNodeId nodeId[TSDB_MAX_REPLICA];
} SSyncRaftNodeMap;
typedef enum { typedef enum {
SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0,
SYNC_RAFT_CAMPAIGN_ELECTION = 1, SYNC_RAFT_CAMPAIGN_ELECTION = 1,
SYNC_RAFT_CAMPAIGN_TRANSFER = 2, SYNC_RAFT_CAMPAIGN_TRANSFER = 2,
} SyncRaftElectionType; } ESyncRaftElectionType;
typedef enum { typedef enum {
// the init vote resp status // the init vote resp status
@ -59,8 +86,8 @@ typedef enum {
// grant the vote request // grant the vote request
SYNC_RAFT_VOTE_RESP_GRANT = 1, SYNC_RAFT_VOTE_RESP_GRANT = 1,
//reject the vote request // reject the vote request
SYNC_RAFT_VOTE_RESP_REJECT = 2, SYNC_RAFT_VOTE_RESP_REJECT = 2,
} SyncRaftVoteResult; } ESyncRaftVoteType;
#endif /* _TD_LIBS_SYNC_TYPE_H */ #endif /* _TD_LIBS_SYNC_TYPE_H */

View File

@ -16,12 +16,22 @@
#include "raft.h" #include "raft.h"
#include "raft_configuration.h" #include "raft_configuration.h"
#include "raft_log.h" #include "raft_log.h"
#include "sync_raft_restore.h"
#include "raft_replication.h" #include "raft_replication.h"
#include "sync_raft_config_change.h"
#include "sync_raft_progress_tracker.h" #include "sync_raft_progress_tracker.h"
#include "syncInt.h" #include "syncInt.h"
#define RAFT_READ_LOG_MAX_NUM 100 #define RAFT_READ_LOG_MAX_NUM 100
static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n);
static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n);
static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config,
const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs);
static void abortLeaderTransfer(SSyncRaft* pRaft);
static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
@ -29,13 +39,16 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
SSyncNode* pNode = pRaft->pNode; SSyncNode* pNode = pRaft->pNode;
SSyncServerState serverState; SSyncServerState serverState;
SSyncConfigState confState;
SStateManager* stateManager; SStateManager* stateManager;
SSyncLogStore* logStore; SSyncLogStore* logStore;
SSyncFSM* fsm; SSyncFSM* fsm;
SyncIndex initIndex = pInfo->snapshotIndex;
SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM]; SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM];
int nBuf, limit, i; int nBuf, limit, i;
char* buf;
int n;
SSyncRaftChanger changer;
memset(pRaft, 0, sizeof(SSyncRaft)); memset(pRaft, 0, sizeof(SSyncRaft));
memcpy(&pRaft->fsm, &pInfo->fsm, sizeof(SSyncFSM)); memcpy(&pRaft->fsm, &pInfo->fsm, sizeof(SSyncFSM));
@ -57,36 +70,47 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
return -1; return -1;
} }
// read server state // read server state
if (stateManager->readServerState(stateManager, &serverState) != 0) { if (stateManager->readServerState(stateManager, &buf, &n) != 0) {
syncError("readServerState for vgid %d fail", pInfo->vgId); syncError("readServerState for vgid %d fail", pInfo->vgId);
return -1; return -1;
} }
assert(initIndex <= serverState.commitIndex); if (deserializeServerStateFromBuffer(&serverState, buf, n) != 0) {
syncError("deserializeServerStateFromBuffer for vgid %d fail", pInfo->vgId);
// restore fsm state from snapshot index + 1 until commitIndex return -1;
++initIndex;
while (initIndex <= serverState.commitIndex) {
limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex + 1);
if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) {
return -1;
}
assert(limit == nBuf);
for (i = 0; i < limit; ++i) {
fsm->applyLog(fsm, initIndex + i, &(buffer[i]), NULL);
free(buffer[i].data);
}
initIndex += nBuf;
} }
assert(initIndex == serverState.commitIndex); free(buf);
//assert(initIndex <= serverState.commitIndex);
//pRaft->heartbeatTimeoutTick = 1; // read config state
if (stateManager->readClusterState(stateManager, &buf, &n) != 0) {
syncError("readClusterState for vgid %d fail", pInfo->vgId);
return -1;
}
if (deserializeClusterStateFromBuffer(&confState, buf, n) != 0) {
syncError("deserializeClusterStateFromBuffer for vgid %d fail", pInfo->vgId);
return -1;
}
free(buf);
changer = (SSyncRaftChanger) {
.tracker = pRaft->tracker,
.lastIndex = syncRaftLogLastIndex(pRaft->log),
};
if (syncRaftRestoreConfig(&changer, &confState) < 0) {
syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId);
return -1;
}
if (!syncRaftIsEmptyServerState(&serverState)) {
syncRaftLoadState(pRaft, &serverState);
}
if (pInfo->appliedIndex > 0) {
syncRaftLogAppliedTo(pRaft->log, pInfo->appliedIndex);
}
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
pRaft->selfIndex = pRaft->cluster.selfIndex;
syncInfo("[%d:%d] restore vgid %d state: snapshot index success", syncInfo("[%d:%d] restore vgid %d state: snapshot index success",
pRaft->selfGroupId, pRaft->selfId, pInfo->vgId); pRaft->selfGroupId, pRaft->selfId, pInfo->vgId);
return 0; return 0;
@ -101,7 +125,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
return 0; return 0;
} }
RaftMessageType msgType = pMsg->msgType; ESyncRaftMessageType msgType = pMsg->msgType;
if (msgType == RAFT_MSG_INTERNAL_ELECTION) { if (msgType == RAFT_MSG_INTERNAL_ELECTION) {
syncRaftHandleElectionMessage(pRaft, pMsg); syncRaftHandleElectionMessage(pRaft, pMsg);
} else if (msgType == RAFT_MSG_VOTE) { } else if (msgType == RAFT_MSG_VOTE) {
@ -119,6 +143,85 @@ int32_t syncRaftTick(SSyncRaft* pRaft) {
return 0; return 0;
} }
static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n) {
return 0;
}
static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n) {
return 0;
}
static void visitProgressMaybeSendAppend(int i, SSyncRaftProgress* progress, void* arg) {
syncRaftReplicate(arg, progress, false);
}
// switchToConfig reconfigures this node to use the provided configuration. It
// updates the in-memory state and, when necessary, carries out additional
// actions such as reacting to the removal of nodes or changed quorum
// requirements.
//
// The inputs usually result from restoring a ConfState or applying a ConfChange.
static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config,
const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs) {
SyncNodeId selfId = pRaft->selfId;
int i;
bool exist;
SSyncRaftProgress* progress = NULL;
syncRaftConfigState(pRaft->tracker, cs);
i = syncRaftFindProgressIndexByNodeId(&pRaft->tracker->progressMap, selfId);
exist = (i != -1);
// Update whether the node itself is a learner, resetting to false when the
// node is removed.
if (exist) {
progress = &pRaft->tracker->progressMap.progress[i];
pRaft->isLearner = progress->isLearner;
} else {
pRaft->isLearner = false;
}
if ((!exist || pRaft->isLearner) && pRaft->state == TAOS_SYNC_STATE_LEADER) {
// This node is leader and was removed or demoted. We prevent demotions
// at the time writing but hypothetically we handle them the same way as
// removing the leader: stepping down into the next Term.
//
// TODO(tbg): step down (for sanity) and ask follower with largest Match
// to TimeoutNow (to avoid interruption). This might still drop some
// proposals but it's better than nothing.
//
// TODO(tbg): test this branch. It is untested at the time of writing.
return;
}
// The remaining steps only make sense if this node is the leader and there
// are other nodes.
if (pRaft->state != TAOS_SYNC_STATE_LEADER || cs->voters.replica == 0) {
return;
}
if (syncRaftMaybeCommit(pRaft)) {
// If the configuration change means that more entries are committed now,
// broadcast/append to everyone in the updated config.
syncRaftBroadcastAppend(pRaft);
} else {
// Otherwise, still probe the newly added replicas; there's no reason to
// let them wait out a heartbeat interval (or the next incoming
// proposal).
syncRaftProgressVisit(pRaft->tracker, visitProgressMaybeSendAppend, pRaft);
// If the the leadTransferee was removed or demoted, abort the leadership transfer.
SyncNodeId leadTransferee = pRaft->leadTransferee;
if (leadTransferee != SYNC_NON_NODE_ID && !syncRaftIsInNodeMap(&pRaft->tracker->config.voters, leadTransferee)) {
abortLeaderTransfer(pRaft);
}
}
}
static void abortLeaderTransfer(SSyncRaft* pRaft) {
pRaft->leadTransferee = SYNC_NON_NODE_ID;
}
/** /**
* pre-handle message, return true means no need to continue * pre-handle message, return true means no need to continue
* Handle the message term, which may result in our stepping down to a follower. * Handle the message term, which may result in our stepping down to a follower.
@ -140,7 +243,7 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
SyncNodeId leaderId = pMsg->from; SyncNodeId leaderId = pMsg->from;
RaftMessageType msgType = pMsg->msgType; ESyncRaftMessageType msgType = pMsg->msgType;
if (msgType == RAFT_MSG_VOTE) { if (msgType == RAFT_MSG_VOTE) {
// TODO // TODO

View File

@ -18,10 +18,10 @@
#include "raft_log.h" #include "raft_log.h"
#include "raft_message.h" #include "raft_message.h"
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) { void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
SyncTerm term; SyncTerm term;
bool preVote; bool preVote;
RaftMessageType voteMsgType; ESyncRaftMessageType voteMsgType;
if (syncRaftIsPromotable(pRaft)) { if (syncRaftIsPromotable(pRaft)) {
syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId); syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId);
@ -41,7 +41,7 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) {
} }
int quorum = syncRaftQuorum(pRaft); int quorum = syncRaftQuorum(pRaft);
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL); ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
if (result == SYNC_RAFT_VOTE_WON) { if (result == SYNC_RAFT_VOTE_WON) {
/** /**
* We won the election after voting for ourselves (which must mean that * We won the election after voting for ourselves (which must mean that

View File

@ -33,7 +33,7 @@ int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMs
return 0; return 0;
} }
RaftMsg_Append_Entries *appendResp = &(pRespMsg->appendResp); RaftMsg_Append_Resp *appendResp = &(pRespMsg->appendResp);
// ignore committed logs // ignore committed logs
if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) { if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) {
appendResp->index = pRaft->log->commitIndex; appendResp->index = pRaft->log->commitIndex;

View File

@ -36,7 +36,7 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
return 0; return 0;
} }
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from, ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from,
pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION, pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION,
!pMsg->voteResp.rejected, &rejected, &granted); !pMsg->voteResp.rejected, &rejected, &granted);

View File

@ -31,6 +31,10 @@ SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) {
return 0; return 0;
} }
void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex) {
}
bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term) { bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term) {
return true; return true;
} }

View File

@ -16,106 +16,62 @@
#include "raft.h" #include "raft.h"
#include "raft_log.h" #include "raft_log.h"
#include "sync_raft_progress.h" #include "sync_raft_progress.h"
#include "syncInt.h"
#include "raft_replication.h" #include "raft_replication.h"
static int sendSnapshot(SSyncRaft* pRaft, int i); static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress);
static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex index, SyncTerm term); static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
SyncIndex prevIndex, SyncTerm prevTerm,
const SSyncRaftEntry *entries, int nEntry);
int syncRaftReplicate(SSyncRaft* pRaft, int i) { // syncRaftReplicate sends an append RPC with new entries to the given peer,
#if 0 // if necessary. Returns true if a message was sent. The sendIfEmpty
// argument controls whether messages with no entries will be sent
// ("empty" messages are useful to convey updated Commit indexes, but
// are undesirable when we're sending multiple messages in a batch).
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) {
assert(pRaft->state == TAOS_SYNC_STATE_LEADER); assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
assert(i >= 0 && i < pRaft->leaderState.nProgress); SyncNodeId nodeId = progress->id;
SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId;
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
if (syncRaftProgressIsPaused(progress)) { if (syncRaftProgressIsPaused(progress)) {
syncInfo("node %d paused", nodeId); syncInfo("node [%d:%d] paused", pRaft->selfGroupId, nodeId);
return 0; return false;
} }
SyncIndex nextIndex = syncRaftProgressNextIndex(progress); SyncIndex nextIndex = syncRaftProgressNextIndex(progress);
SyncIndex snapshotIndex = syncRaftLogSnapshotIndex(pRaft->log); SSyncRaftEntry *entries;
bool inSnapshot = syncRaftProgressInSnapshot(progress); int nEntry;
SyncIndex prevIndex; SyncIndex prevIndex;
SyncTerm prevTerm; SyncTerm prevTerm;
/** prevIndex = nextIndex - 1;
* From Section 3.5: prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex);
* int ret = syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry);
* When sending an AppendEntries RPC, the leader includes the index and
* term of the entry in its log that immediately precedes the new
* entries. If the follower does not find an entry in its log with the
* same index and term, then it refuses the new entries. The consistency
* check acts as an induction step: the initial empty state of the logs
* satisfies the Log Matching Property, and the consistency check
* preserves the Log Matching Property whenever logs are extended. As a
* result, whenever AppendEntries returns successfully, the leader knows
* that the follower's log is identical to its own log up through the new
* entries (Log Matching Property in Figure 3.2).
**/
if (nextIndex == 1) {
/**
* We're including the very first entry, so prevIndex and prevTerm are
* null. If the first entry is not available anymore, send the last
* snapshot if we're not already sending one.
**/
if (snapshotIndex > 0 && !inSnapshot) {
goto send_snapshot;
}
// otherwise send append entries from start if (nEntry == 0 && !sendIfEmpty) {
prevIndex = 0; return false;
prevTerm = 0;
} else {
/**
* Set prevIndex and prevTerm to the index and term of the entry at
* nextIndex - 1.
**/
prevIndex = nextIndex - 1;
prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex);
/**
* If the entry is not anymore in our log, send the last snapshot if we're
* not doing so already.
**/
if (prevTerm == SYNC_NON_TERM && !inSnapshot) {
goto send_snapshot;
}
} }
/* Send empty AppendEntries RPC when installing a snaphot */ if (ret != 0 || prevTerm == SYNC_NON_TERM) {
if (inSnapshot) { return sendSnapshot(pRaft, progress);
prevIndex = syncRaftLogLastIndex(pRaft->log);
prevTerm = syncRaftLogLastTerm(pRaft->log);
} }
return sendAppendEntries(pRaft, i, prevIndex, prevTerm); return sendAppendEntries(pRaft, progress, prevIndex, prevTerm, entries, nEntry);
send_snapshot:
if (syncRaftProgressRecentActive(progress)) {
/* Only send a snapshot when we have heard from the server */
return sendSnapshot(pRaft, i);
} else {
/* Send empty AppendEntries RPC when we haven't heard from the server */
prevIndex = syncRaftLogLastIndex(pRaft->log);
prevTerm = syncRaftLogLastTerm(pRaft->log);
return sendAppendEntries(pRaft, i, prevIndex, prevTerm);
}
#endif
return 0;
} }
static int sendSnapshot(SSyncRaft* pRaft, int i) { static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
return 0; if (!syncRaftProgressRecentActive(progress)) {
return false;
}
return true;
} }
static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncTerm prevTerm) { static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
#if 0 SyncIndex prevIndex, SyncTerm prevTerm,
SyncIndex nextIndex = prevIndex + 1; const SSyncRaftEntry *entries, int nEntry) {
SSyncRaftEntry *entries; SyncIndex lastIndex;
int nEntry; SyncTerm logTerm = prevTerm;
SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[i]); SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[progress->selfIndex]);
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry);
SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term, SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term,
prevIndex, prevTerm, pRaft->log->commitIndex, prevIndex, prevTerm, pRaft->log->commitIndex,
@ -125,24 +81,27 @@ static int sendAppendEntries(SSyncRaft* pRaft, int i, SyncIndex prevIndex, SyncT
goto err_release_log; goto err_release_log;
} }
pRaft->io.send(msg, pNode); if (nEntry != 0) {
switch (progress->state) {
if (syncRaftProgressInReplicate(progress)) { // optimistically increase the next when in StateReplicate
SyncIndex lastIndex = nextIndex + nEntry; case PROGRESS_STATE_REPLICATE:
syncRaftProgressOptimisticNextIndex(progress, lastIndex); lastIndex = entries[nEntry - 1].index;
syncRaftInflightAdd(&progress->inflights, lastIndex); syncRaftProgressOptimisticNextIndex(progress, lastIndex);
} else if (syncRaftProgressInProbe(progress)) { syncRaftInflightAdd(&progress->inflights, lastIndex);
syncRaftProgressPause(progress); break;
} else { case PROGRESS_STATE_PROBE:
progress->probeSent = true;
break;
default:
syncFatal("[%d:%d] is sending append in unhandled state %s",
pRaft->selfGroupId, pRaft->selfId, syncRaftProgressStateString(progress));
break;
}
} }
pRaft->io.send(msg, pNode);
syncRaftProgressUpdateSendTick(progress, pRaft->currentTick); return true;
return 0;
err_release_log: err_release_log:
syncRaftLogRelease(pRaft->log, nextIndex, entries, nEntry); syncRaftLogRelease(pRaft->log, prevIndex + 1, entries, nEntry);
#endif return false;
return 0; }
}

View File

@ -0,0 +1,388 @@
/*
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "syncInt.h"
#include "sync_raft_config_change.h"
#include "sync_raft_progress.h"
#include "sync_raft_progress_tracker.h"
#include "sync_raft_quorum_joint.h"
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config);
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css);
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r);
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner);
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id);
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id);
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id);
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id);
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id);
// syncRaftChangerSimpleConfig carries out a series of configuration changes that (in aggregate)
// mutates the incoming majority config Voters[0] by at most one. This method
// will return an error if that is not the case, if the resulting quorum is
// zero, or if the configuration is in a joint state (i.e. if there is an
// outgoing configuration).
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
int ret;
ret = checkAndCopy(changer, config, progressMap);
if (ret != 0) {
return ret;
}
if (hasJointConfig(config)) {
syncError("can't apply simple config change in joint config");
return -1;
}
ret = applyConfig(changer, config, progressMap, css);
if (ret != 0) {
return ret;
}
int n = symDiff(syncRaftJointConfigIncoming(&changer->tracker->config.voters),
syncRaftJointConfigIncoming(&config->voters));
if (n > 1) {
syncError("more than one voter changed without entering joint config");
return -1;
}
return checkAndReturn(config, progressMap);
}
// EnterJoint verifies that the outgoing (=right) majority config of the joint
// config is empty and initializes it with a copy of the incoming (=left)
// majority config. That is, it transitions from
//
// (1 2 3)&&()
// to
// (1 2 3)&&(1 2 3).
//
// The supplied changes are then applied to the incoming majority config,
// resulting in a joint configuration that in terms of the Raft thesis[1]
// (Section 4.3) corresponds to `C_{new,old}`.
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
int ret;
ret = checkAndCopy(changer, config, progressMap);
if (ret != 0) {
return ret;
}
if (hasJointConfig(config)) {
syncError("config is already joint");
return -1;
}
if(config->voters.incoming.replica == 0) {
// We allow adding nodes to an empty config for convenience (testing and
// bootstrap), but you can't enter a joint state.
syncError("can't make a zero-voter config joint");
return -1;
}
// Clear the outgoing config.
syncRaftJointConfigClearOutgoing(&config->voters);
// Copy incoming to outgoing.
memcpy(&config->voters.outgoing, &config->voters.incoming, sizeof(SSyncCluster));
ret = applyConfig(changer, config, progressMap, css);
if (ret != 0) {
return ret;
}
config->autoLeave = autoLeave;
return checkAndReturn(config, progressMap);
}
// checkAndCopy copies the tracker's config and progress map (deeply enough for
// the purposes of the Changer) and returns those copies. It returns an error
// if checkInvariants does.
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
syncRaftCloneTrackerConfig(&changer->tracker->config, config);
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
SSyncRaftProgress* progress = &(changer->tracker->progressMap.progress[i]);
if (progress->id == SYNC_NON_NODE_ID) {
continue;
}
syncRaftCopyProgress(progress, &(progressMap->progress[i]));
}
return checkAndReturn(config, progressMap);
}
// checkAndReturn calls checkInvariants on the input and returns either the
// resulting error or the input.
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
if (checkInvariants(config, progressMap) != 0) {
return -1;
}
return 0;
}
// checkInvariants makes sure that the config and progress are compatible with
// each other. This is used to check both what the Changer is initialized with,
// as well as what it returns.
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
int ret = syncRaftCheckProgress(config, progressMap);
if (ret != 0) {
return ret;
}
int i;
// Any staged learner was staged because it could not be directly added due
// to a conflicting voter in the outgoing config.
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (!syncRaftJointConfigInOutgoing(&config->voters, config->learnersNext.nodeId[i])) {
return -1;
}
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && progressMap->progress[i].isLearner) {
syncError("%d is in LearnersNext, but is already marked as learner", progressMap->progress[i].id);
return -1;
}
}
// Conversely Learners and Voters doesn't intersect at all.
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (syncRaftJointConfigInIncoming(&config->voters, config->learners.nodeId[i])) {
syncError("%d is in Learners and voter.incoming", progressMap->progress[i].id);
return -1;
}
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && !progressMap->progress[i].isLearner) {
syncError("%d is in Learners, but is not marked as learner", progressMap->progress[i].id);
return -1;
}
}
if (!hasJointConfig(config)) {
// We enforce that empty maps are nil instead of zero.
if (config->learnersNext.replica > 0) {
syncError("cfg.LearnersNext must be nil when not joint");
return -1;
}
if (config->autoLeave) {
syncError("AutoLeave must be false when not joint");
return -1;
}
}
return 0;
}
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) {
return config->voters.outgoing.replica > 0;
}
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) {
int i;
for (i = 0; i < css->n; ++i) {
const SSyncConfChangeSingle* cs = &(css->changes[i]);
if (cs->nodeId == SYNC_NON_NODE_ID) {
continue;
}
ESyncRaftConfChangeType type = cs->type;
switch (type) {
case SYNC_RAFT_Conf_AddNode:
makeVoter(changer, config, progressMap, cs->nodeId);
break;
case SYNC_RAFT_Conf_AddLearnerNode:
makeLearner(changer, config, progressMap, cs->nodeId);
break;
case SYNC_RAFT_Conf_RemoveNode:
removeNodeId(changer, config, progressMap, cs->nodeId);
break;
case SYNC_RAFT_Conf_UpdateNode:
break;
}
}
if (config->voters.incoming.replica == 0) {
syncError("removed all voters");
return -1;
}
return 0;
}
// symdiff returns the count of the symmetric difference between the sets of
// uint64s, i.e. len( (l - r) \union (r - l)).
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) {
int n;
int i;
int j0, j1;
const SSyncRaftNodeMap* pairs[2][2] = {
{l, r}, // count elems in l but not in r
{r, l}, // count elems in r but not in l
};
for (n = 0, i = 0; i < 2; ++i) {
const SSyncRaftNodeMap** pp = pairs[i];
const SSyncRaftNodeMap* p0 = pp[0];
const SSyncRaftNodeMap* p1 = pp[1];
for (j0 = 0; j0 < TSDB_MAX_REPLICA; ++j0) {
SyncNodeId id = p0->nodeId[j0];
if (id == SYNC_NON_NODE_ID) {
continue;
}
for (j1 = 0; j1 < p1->replica; ++j1) {
if (p1->nodeId[j1] != SYNC_NON_NODE_ID && p1->nodeId[j1] != id) {
n+=1;
}
}
}
}
return n;
}
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) {
}
// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (nodeMap->nodeId[i] == id) {
nodeMap->replica -= 1;
nodeMap->nodeId[i] = SYNC_NON_NODE_ID;
break;
}
}
assert(nodeMap->replica >= 0);
}
// nilAwareAdd populates a map entry, creating the map if necessary.
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
int i, j;
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
if (nodeMap->nodeId[i] == id) {
return;
}
if (j == -1 && nodeMap->nodeId[i] == SYNC_NON_NODE_ID) {
j = i;
}
}
assert(j != -1);
nodeMap->nodeId[j] = id;
nodeMap->replica += 1;
}
// makeVoter adds or promotes the given ID to be a voter in the incoming
// majority config.
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
if (i == -1) {
initProgress(changer, config, progressMap, id, false);
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
}
assert(i != -1);
SSyncRaftProgress* progress = &(progressMap->progress[i]);
progress->isLearner = false;
nilAwareDelete(&config->learners, id);
nilAwareDelete(&config->learnersNext, id);
syncRaftJointConfigAddToIncoming(&config->voters, id);
}
// makeLearner makes the given ID a learner or stages it to be a learner once
// an active joint configuration is exited.
//
// The former happens when the peer is not a part of the outgoing config, in
// which case we either add a new learner or demote a voter in the incoming
// config.
//
// The latter case occurs when the configuration is joint and the peer is a
// voter in the outgoing config. In that case, we do not want to add the peer
// as a learner because then we'd have to track a peer as a voter and learner
// simultaneously. Instead, we add the learner to LearnersNext, so that it will
// be added to Learners the moment the outgoing config is removed by
// LeaveJoint().
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
if (i == -1) {
initProgress(changer, config, progressMap, id, false);
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
}
assert(i != -1);
SSyncRaftProgress* progress = &(progressMap->progress[i]);
if (progress->isLearner) {
return;
}
// Remove any existing voter in the incoming config...
removeNodeId(changer, config, progressMap, id);
// ... but save the Progress.
syncRaftAddToProgressMap(progressMap, id);
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
// if the peer is still tracked as a voter in the outgoing config. It will
// be turned into a learner in LeaveJoint().
//
// Otherwise, add a regular learner right away.
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
if (inOutgoing) {
nilAwareAdd(&config->learnersNext, id);
} else {
nilAwareAdd(&config->learners, id);
progress->isLearner = true;
}
}
// removeNodeId this peer as a voter or learner from the incoming config.
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
if (i == -1) {
return;
}
syncRaftJointConfigRemoveFromIncoming(&config->voters, id);
nilAwareDelete(&config->learners, id);
nilAwareDelete(&config->learnersNext, id);
// If the peer is still a voter in the outgoing config, keep the Progress.
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
if (!inOutgoing) {
syncRaftRemoveFromProgressMap(progressMap, id);
}
}

View File

@ -31,7 +31,6 @@ static void tickElection(SSyncRaft* pRaft);
static void tickHeartbeat(SSyncRaft* pRaft); static void tickHeartbeat(SSyncRaft* pRaft);
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
static bool maybeCommit(SSyncRaft* pRaft);
static void abortLeaderTransfer(SSyncRaft* pRaft); static void abortLeaderTransfer(SSyncRaft* pRaft);
@ -127,7 +126,7 @@ int syncRaftQuorum(SSyncRaft* pRaft) {
return pRaft->cluster.replica / 2 + 1; return pRaft->cluster.replica / 2 + 1;
} }
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id, ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
bool preVote, bool grant, bool preVote, bool grant,
int* rejected, int *granted) { int* rejected, int *granted) {
int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id); int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
@ -171,6 +170,34 @@ SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
return granted; return granted;
*/ */
void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState) {
SyncIndex commitIndex = serverState->commitIndex;
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
if (commitIndex < pRaft->log->commitIndex || commitIndex > lastIndex) {
syncFatal("[%d:%d] state.commit %"PRId64" is out of range [%" PRId64 ",%" PRId64 "",
pRaft->selfGroupId, pRaft->selfId, commitIndex, pRaft->log->commitIndex, lastIndex);
return;
}
pRaft->log->commitIndex = commitIndex;
pRaft->term = serverState->term;
pRaft->voteFor = serverState->voteFor;
}
static void visitProgressSendAppend(int i, SSyncRaftProgress* progress, void* arg) {
SSyncRaft* pRaft = (SSyncRaft*)arg;
if (pRaft->selfId == progress->id) {
return;
}
syncRaftReplicate(arg, progress, true);
}
void syncRaftBroadcastAppend(SSyncRaft* pRaft) {
syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft);
}
static int convertClear(SSyncRaft* pRaft) { static int convertClear(SSyncRaft* pRaft) {
} }
@ -186,7 +213,7 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
* StateCandidate, we may get stale MsgPreVoteResp messages in this term from * StateCandidate, we may get stale MsgPreVoteResp messages in this term from
* our pre-candidate state). * our pre-candidate state).
**/ **/
RaftMessageType msgType = pMsg->msgType; ESyncRaftMessageType msgType = pMsg->msgType;
if (msgType == RAFT_MSG_INTERNAL_PROP) { if (msgType == RAFT_MSG_INTERNAL_PROP) {
return 0; return 0;
@ -243,18 +270,16 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
syncRaftLogAppend(pRaft->log, entries, n); syncRaftLogAppend(pRaft->log, entries, n);
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]); SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->cluster.selfIndex]);
syncRaftProgressMaybeUpdate(progress, lastIndex); syncRaftProgressMaybeUpdate(progress, lastIndex);
// Regardless of maybeCommit's return, our caller will call bcastAppend. // Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend.
maybeCommit(pRaft); syncRaftMaybeCommit(pRaft);
} }
/** // syncRaftMaybeCommit attempts to advance the commit index. Returns true if
* maybeCommit attempts to advance the commit index. Returns true if // the commit index changed (in which case the caller should call
* the commit index changed (in which case the caller should call // r.bcastAppend).
* r.bcastAppend). bool syncRaftMaybeCommit(SSyncRaft* pRaft) {
**/
static bool maybeCommit(SSyncRaft* pRaft) {
return true; return true;
} }
@ -263,6 +288,7 @@ static bool maybeCommit(SSyncRaft* pRaft) {
* trigger I/O requests for newly appended log entries or heartbeats. * trigger I/O requests for newly appended log entries or heartbeats.
**/ **/
static int triggerAll(SSyncRaft* pRaft) { static int triggerAll(SSyncRaft* pRaft) {
#if 0
assert(pRaft->state == TAOS_SYNC_STATE_LEADER); assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
int i; int i;
@ -271,8 +297,10 @@ static int triggerAll(SSyncRaft* pRaft) {
continue; continue;
} }
syncRaftReplicate(pRaft, i); syncRaftReplicate(pRaft, pRaft->tracker->progressMap.progress[i], true);
} }
#endif
return 0;
} }
static void abortLeaderTransfer(SSyncRaft* pRaft) { static void abortLeaderTransfer(SSyncRaft* pRaft) {

View File

@ -20,13 +20,13 @@
#include "sync.h" #include "sync.h"
#include "syncInt.h" #include "syncInt.h"
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state); static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state);
static void probeAcked(SSyncRaftProgress* progress); static void probeAcked(SSyncRaftProgress* progress);
static void resumeProgress(SSyncRaftProgress* progress); static void resumeProgress(SSyncRaftProgress* progress);
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) { void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) {
SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflight); SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs);
if (inflights == NULL) { if (inflights == NULL) {
return; return;
} }
@ -112,6 +112,44 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) {
} }
} }
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (progressMap->progress[i].id == id) {
return i;
}
}
return -1;
}
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i, j;
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
if (progressMap->progress[i].id == id) {
return i;
}
if (j == -1 && progressMap->progress[i].id == SYNC_NON_NODE_ID) {
j = i;
}
}
assert(j != -1);
progressMap->progress[i].id = id;
}
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (progressMap->progress[i].id == id) {
progressMap->progress[i].id = SYNC_NON_NODE_ID;
break;
}
}
}
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex;
} }
@ -149,11 +187,15 @@ void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snaps
progress->pendingSnapshotIndex = snapshotIndex; progress->pendingSnapshotIndex = snapshotIndex;
} }
void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* out) {
}
/** /**
* ResetState moves the Progress into the specified State, resetting ProbeSent, * ResetState moves the Progress into the specified State, resetting ProbeSent,
* PendingSnapshot, and Inflights. * PendingSnapshot, and Inflights.
**/ **/
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) { static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) {
progress->probeSent = false; progress->probeSent = false;
progress->pendingSnapshotIndex = 0; progress->pendingSnapshotIndex = 0;
progress->state = state; progress->state = state;
@ -233,7 +275,7 @@ void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) {
progress->state = PROGRESS_STATE_PROBE; progress->state = PROGRESS_STATE_PROBE;
} }
RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) { ESyncRaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) {
return pRaft->leaderState.progress[i].state; return pRaft->leaderState.progress[i].state;
} }

View File

@ -14,6 +14,7 @@
*/ */
#include "sync_raft_progress_tracker.h" #include "sync_raft_progress_tracker.h"
#include "sync_raft_proto.h"
SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker)); SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker));
@ -25,13 +26,13 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
} }
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteResult) * TSDB_MAX_REPLICA); memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(ESyncRaftVoteType) * TSDB_MAX_REPLICA);
} }
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) { void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
int i; int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) { for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
SSyncRaftProgress* progress = &(tracker->progressMap[i]); SSyncRaftProgress* progress = &(tracker->progressMap.progress[i]);
visit(i, progress, arg); visit(i, progress, arg);
} }
} }
@ -44,17 +45,21 @@ void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) {
tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
} }
void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) {
}
/** /**
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
* election outcome is known. * election outcome is known.
**/ **/
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) { ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
int i; int i;
SSyncRaftProgress* progress; SSyncRaftProgress* progress;
int r, g; int r, g;
for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) { for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) {
progress = &(tracker->progressMap[i]); progress = &(tracker->progressMap.progress[i]);
if (progress->id == SYNC_NON_NODE_ID) { if (progress->id == SYNC_NON_NODE_ID) {
continue; continue;
} }
@ -73,4 +78,11 @@ SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* re
if (rejected) *rejected = r; if (rejected) *rejected = r;
if (granted) *granted = g; if (granted) *granted = g;
return syncRaftVoteResult(&(tracker->config.voters), tracker->votes); return syncRaftVoteResult(&(tracker->config.voters), tracker->votes);
}
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) {
memcpy(&cs->voters, &tracker->config.voters.incoming, sizeof(SSyncRaftNodeMap));
memcpy(&cs->votersOutgoing, &tracker->config.voters.outgoing, sizeof(SSyncRaftNodeMap));
memcpy(&cs->learners, &tracker->config.learners, sizeof(SSyncRaftNodeMap));
memcpy(&cs->learnersNext, &tracker->config.learnersNext, sizeof(SSyncRaftNodeMap));
} }

View File

@ -22,9 +22,9 @@
* a result indicating whether the vote is pending, lost, or won. A joint quorum * a result indicating whether the vote is pending, lost, or won. A joint quorum
* requires both majority quorums to vote in favor. * requires both majority quorums to vote in favor.
**/ **/
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes) { ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes) {
SyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->majorityConfig[0]), votes); ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votes);
SyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->majorityConfig[1]), votes); ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votes);
if (r1 == r2) { if (r1 == r2) {
// If they agree, return the agreed state. // If they agree, return the agreed state.
@ -39,3 +39,47 @@ SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const
// One side won, the other one is pending, so the whole outcome is. // One side won, the other one is pending, so the whole outcome is.
return SYNC_RAFT_VOTE_PENDING; return SYNC_RAFT_VOTE_PENDING;
} }
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
int i, min;
for (i = 0, min = -1; i < TSDB_MAX_REPLICA; ++i) {
if (config->incoming.nodeId[i] == id) {
return;
}
if (min == -1 && config->incoming.nodeId[i] == SYNC_NON_NODE_ID) {
min = i;
}
}
assert(min != -1);
config->incoming.nodeId[min] = id;
config->incoming.replica += 1;
}
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (config->incoming.nodeId[i] == id) {
config->incoming.replica -= 1;
config->incoming.nodeId[i] = SYNC_NON_NODE_ID;
break;
}
}
assert(config->incoming.replica >= 0);
}
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
int i;
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
if (nodeId == nodeMap->nodeId[i]) {
return true;
}
}
return false;
}

View File

@ -22,14 +22,14 @@
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a * yes/no has been reached), won (a quorum of yes has been reached), or lost (a
* quorum of no has been reached). * quorum of no has been reached).
**/ **/
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes) { ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes) {
if (config->replica == 0) { if (config->replica == 0) {
return SYNC_RAFT_VOTE_WON; return SYNC_RAFT_VOTE_WON;
} }
int i, g, r, missing; int i, g, r, missing;
for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) { for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) {
if (config->nodeInfo[i].nodeId == SYNC_NON_NODE_ID) { if (config->nodeId[i] == SYNC_NON_NODE_ID) {
continue; continue;
} }

View File

@ -0,0 +1,181 @@
/*
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "sync_raft_config_change.h"
#include "sync_raft_restore.h"
#include "sync_raft_progress_tracker.h"
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in);
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
// runs a sequence of changes enacting the configuration described in the
// ConfState.
//
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
// this can just take LastIndex and MaxInflight directly instead and cook up
// the results from that alone.
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) {
SSyncConfChangeSingleArray outgoing;
SSyncConfChangeSingleArray incoming;
SSyncConfChangeSingleArray css;
SSyncRaftProgressTracker* tracker = changer->tracker;
SSyncRaftProgressTrackerConfig* config = &tracker->config;
SSyncRaftProgressMap* progressMap = &tracker->progressMap;
int i, ret;
ret = toConfChangeSingle(cs, &outgoing, &incoming);
if (ret != 0) {
goto out;
}
if (outgoing.n == 0) {
// No outgoing config, so just apply the incoming changes one by one.
for (i = 0; i < incoming.n; ++i) {
css = (SSyncConfChangeSingleArray) {
.n = 1,
.changes = &incoming.changes[i],
};
ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap);
if (ret != 0) {
goto out;
}
}
} else {
// The ConfState describes a joint configuration.
//
// First, apply all of the changes of the outgoing config one by one, so
// that it temporarily becomes the incoming active config. For example,
// if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&().
for (i = 0; i < outgoing.n; ++i) {
css = (SSyncConfChangeSingleArray) {
.n = 1,
.changes = &outgoing.changes[i],
};
ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap);
if (ret != 0) {
goto out;
}
}
ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap);
if (ret != 0) {
goto out;
}
}
out:
if (incoming.n != 0) free(incoming.changes);
if (outgoing.n != 0) free(outgoing.changes);
return ret;
}
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
// first the config that will become the outgoing one, and then the incoming one, and
// b) another slice that, when applied to the config resulted from 1), represents the
// ConfState.
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) {
int i;
out->n = in->n = 0;
out->n = cs->votersOutgoing.replica;
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n);
if (out->changes == NULL) {
out->n = 0;
return -1;
}
in->n = cs->votersOutgoing.replica + cs->voters.replica + cs->learners.replica + cs->learnersNext.replica;
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n);
if (in->changes == NULL) {
in->n = 0;
return -1;
}
// Example to follow along this code:
// voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4)
//
// This means that before entering the joint config, the configuration
// had voters (1 2 4 6) and perhaps some learners that are already gone.
// The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6)
// are no longer voters; however 4 is poised to become a learner upon leaving
// the joint state.
// We can't tell whether 5 was a learner before entering the joint config,
// but it doesn't matter (we'll pretend that it wasn't).
//
// The code below will construct
// outgoing = add 1; add 2; add 4; add 6
// incoming = remove 1; remove 2; remove 4; remove 6
// add 1; add 2; add 3;
// add-learner 5;
// add-learner 4;
//
// So, when starting with an empty config, after applying 'outgoing' we have
//
// quorum=(1 2 4 6)
//
// From which we enter a joint state via 'incoming'
//
// quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4)
//
// as desired.
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
// If there are outgoing voters, first add them one by one so that the
// (non-joint) config has them all.
out->changes[i] = (SSyncConfChangeSingle) {
.type = SYNC_RAFT_Conf_AddNode,
.nodeId = cs->votersOutgoing.nodeId[i],
};
}
// We're done constructing the outgoing slice, now on to the incoming one
// (which will apply on top of the config created by the outgoing slice).
// First, we'll remove all of the outgoing voters.
int j = 0;
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
in->changes[j] = (SSyncConfChangeSingle) {
.type = SYNC_RAFT_Conf_RemoveNode,
.nodeId = cs->votersOutgoing.nodeId[i],
};
j += 1;
}
// Then we'll add the incoming voters and learners.
for (i = 0; i < cs->voters.replica; ++i) {
in->changes[j] = (SSyncConfChangeSingle) {
.type = SYNC_RAFT_Conf_AddNode,
.nodeId = cs->voters.nodeId[i],
};
j += 1;
}
for (i = 0; i < cs->learners.replica; ++i) {
in->changes[j] = (SSyncConfChangeSingle) {
.type = SYNC_RAFT_Conf_AddLearnerNode,
.nodeId = cs->learners.nodeId[i],
};
j += 1;
}
// Same for LearnersNext; these are nodes we want to be learners but which
// are currently voters in the outgoing config.
for (i = 0; i < cs->learnersNext.replica; ++i) {
in->changes[j] = (SSyncConfChangeSingle) {
.type = SYNC_RAFT_Conf_AddLearnerNode,
.nodeId = cs->learnersNext.nodeId[i],
};
j += 1;
}
return 0;
}