[TD-10645][raft]<feature>add raft progress tracker
This commit is contained in:
parent
e17f573e0e
commit
de3164f16f
|
@ -19,7 +19,8 @@
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
#include "sync_raft_impl.h"
|
||||||
|
#include "sync_raft_quorum.h"
|
||||||
|
|
||||||
typedef struct RaftLeaderState {
|
typedef struct RaftLeaderState {
|
||||||
|
|
||||||
|
@ -140,20 +141,4 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo);
|
||||||
int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
int32_t syncRaftTick(SSyncRaft* pRaft);
|
int32_t syncRaftTick(SSyncRaft* pRaft);
|
||||||
|
|
||||||
void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId);
|
|
||||||
void syncRaftBecomePreCandidate(SSyncRaft* pRaft);
|
|
||||||
void syncRaftBecomeCandidate(SSyncRaft* pRaft);
|
|
||||||
void syncRaftBecomeLeader(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType);
|
|
||||||
|
|
||||||
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft);
|
|
||||||
bool syncRaftIsPromotable(SSyncRaft* pRaft);
|
|
||||||
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft);
|
|
||||||
int syncRaftQuorum(SSyncRaft* pRaft);
|
|
||||||
int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id,
|
|
||||||
bool preVote, bool accept, int* rejectNum);
|
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_H */
|
|
@ -0,0 +1,42 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _TD_LIBS_SYNC_RAFT_IMPL_H
|
||||||
|
#define _TD_LIBS_SYNC_RAFT_IMPL_H
|
||||||
|
|
||||||
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
|
#include "raft_message.h"
|
||||||
|
#include "sync_raft_quorum.h"
|
||||||
|
|
||||||
|
void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId);
|
||||||
|
void syncRaftBecomePreCandidate(SSyncRaft* pRaft);
|
||||||
|
void syncRaftBecomeCandidate(SSyncRaft* pRaft);
|
||||||
|
void syncRaftBecomeLeader(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType);
|
||||||
|
|
||||||
|
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft);
|
||||||
|
bool syncRaftIsPromotable(SSyncRaft* pRaft);
|
||||||
|
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft);
|
||||||
|
int syncRaftQuorum(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||||
|
bool preVote, bool accept,
|
||||||
|
int* rejectNum, int *granted);
|
||||||
|
|
||||||
|
#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */
|
|
@ -63,6 +63,8 @@ typedef enum RaftProgressState {
|
||||||
* progresses of all followers, and sends entries to the follower based on its progress.
|
* progresses of all followers, and sends entries to the follower based on its progress.
|
||||||
**/
|
**/
|
||||||
struct SSyncRaftProgress {
|
struct SSyncRaftProgress {
|
||||||
|
SyncNodeId id;
|
||||||
|
|
||||||
SyncIndex nextIndex;
|
SyncIndex nextIndex;
|
||||||
|
|
||||||
SyncIndex matchIndex;
|
SyncIndex matchIndex;
|
||||||
|
|
|
@ -23,7 +23,8 @@
|
||||||
struct SSyncRaftProgressTrackerConfig {
|
struct SSyncRaftProgressTrackerConfig {
|
||||||
SSyncRaftQuorumJointConfig voters;
|
SSyncRaftQuorumJointConfig voters;
|
||||||
|
|
||||||
/** AutoLeave is true if the configuration is joint and a transition to the
|
/**
|
||||||
|
* autoLeave is true if the configuration is joint and a transition to the
|
||||||
* incoming configuration should be carried out automatically by Raft when
|
* incoming configuration should be carried out automatically by Raft when
|
||||||
* this is possible. If false, the configuration will be joint until the
|
* this is possible. If false, the configuration will be joint until the
|
||||||
* application initiates the transition manually.
|
* application initiates the transition manually.
|
||||||
|
@ -86,7 +87,7 @@ struct SSyncRaftProgressTracker {
|
||||||
|
|
||||||
SSyncRaftProgress progressMap[TSDB_MAX_REPLICA];
|
SSyncRaftProgress progressMap[TSDB_MAX_REPLICA];
|
||||||
|
|
||||||
SyncRaftVoteRespType votes[TSDB_MAX_REPLICA];
|
SyncRaftVoteResult votes[TSDB_MAX_REPLICA];
|
||||||
int maxInflight;
|
int maxInflight;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -97,4 +98,16 @@ void syncRaftResetVotes(SSyncRaftProgressTracker*);
|
||||||
typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg);
|
typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg);
|
||||||
void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg);
|
void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftRecordVote records that the node with the given id voted for this Raft
|
||||||
|
* instance if v == true (and declined it otherwise).
|
||||||
|
**/
|
||||||
|
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
||||||
|
* election outcome is known.
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TD_SYNC_RAFT_QUORUM_H
|
||||||
|
#define TD_SYNC_RAFT_QUORUM_H
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SSyncRaftVoteResult indicates the outcome of a vote.
|
||||||
|
**/
|
||||||
|
typedef enum {
|
||||||
|
/**
|
||||||
|
* SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future
|
||||||
|
* votes, i.e. neither "yes" or "no" has reached quorum yet.
|
||||||
|
**/
|
||||||
|
SYNC_RAFT_VOTE_PENDING = 1,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SYNC_RAFT_VOTE_LOST indicates that the quorum has voted "no".
|
||||||
|
**/
|
||||||
|
SYNC_RAFT_VOTE_LOST = 2,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes".
|
||||||
|
**/
|
||||||
|
SYNC_RAFT_VOTE_WON = 3,
|
||||||
|
} SSyncRaftVoteResult;
|
||||||
|
|
||||||
|
#endif /* TD_SYNC_RAFT_QUORUM_H */
|
|
@ -18,13 +18,21 @@
|
||||||
|
|
||||||
#include "taosdef.h"
|
#include "taosdef.h"
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* JointConfig is a configuration of two groups of (possibly overlapping)
|
* SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping)
|
||||||
* majority configurations. Decisions require the support of both majorities.
|
* majority configurations. Decisions require the support of both majorities.
|
||||||
**/
|
**/
|
||||||
typedef struct SSyncRaftQuorumJointConfig {
|
typedef struct SSyncRaftQuorumJointConfig {
|
||||||
SyncNodeId majorityConfig[2][TSDB_MAX_REPLICA];
|
SSyncCluster majorityConfig[2];
|
||||||
}SSyncRaftQuorumJointConfig;
|
}SSyncRaftQuorumJointConfig;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||||
|
* requires both majority quorums to vote in favor.
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes);
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H
|
||||||
|
#define _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H
|
||||||
|
|
||||||
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
* a result indicating whether the vote is pending (i.e. neither a quorum of
|
||||||
|
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||||
|
* quorum of no has been reached).
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes);
|
||||||
|
|
||||||
|
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */
|
|
@ -61,6 +61,6 @@ typedef enum {
|
||||||
|
|
||||||
//reject the vote request
|
//reject the vote request
|
||||||
SYNC_RAFT_VOTE_RESP_REJECT = 2,
|
SYNC_RAFT_VOTE_RESP_REJECT = 2,
|
||||||
} SyncRaftVoteRespType;
|
} SyncRaftVoteResult;
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
||||||
|
|
|
@ -26,23 +26,6 @@ static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
|
||||||
static int convertClear(SSyncRaft* pRaft);
|
|
||||||
static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
|
||||||
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
|
||||||
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
|
||||||
|
|
||||||
static int triggerAll(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
static void tickElection(SSyncRaft* pRaft);
|
|
||||||
static void tickHeartbeat(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
|
|
||||||
static bool maybeCommit(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
static void abortLeaderTransfer(SSyncRaft* pRaft);
|
|
||||||
|
|
||||||
static void resetRaft(SSyncRaft* pRaft, SyncTerm term);
|
|
||||||
|
|
||||||
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
SSyncNode* pNode = pRaft->pNode;
|
SSyncNode* pNode = pRaft->pNode;
|
||||||
SSyncServerState serverState;
|
SSyncServerState serverState;
|
||||||
|
@ -136,124 +119,6 @@ int32_t syncRaftTick(SSyncRaft* pRaft) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) {
|
|
||||||
convertClear(pRaft);
|
|
||||||
|
|
||||||
pRaft->stepFp = stepFollower;
|
|
||||||
resetRaft(pRaft, term);
|
|
||||||
pRaft->tickFp = tickElection;
|
|
||||||
pRaft->leaderId = leaderId;
|
|
||||||
pRaft->state = TAOS_SYNC_ROLE_FOLLOWER;
|
|
||||||
syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftBecomePreCandidate(SSyncRaft* pRaft) {
|
|
||||||
convertClear(pRaft);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Becoming a pre-candidate changes our step functions and state,
|
|
||||||
* but doesn't change anything else. In particular it does not increase
|
|
||||||
* r.Term or change r.Vote.
|
|
||||||
**/
|
|
||||||
pRaft->stepFp = stepCandidate;
|
|
||||||
pRaft->tickFp = tickElection;
|
|
||||||
pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
|
|
||||||
pRaft->candidateState.inPreVote = true;
|
|
||||||
syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftBecomeCandidate(SSyncRaft* pRaft) {
|
|
||||||
convertClear(pRaft);
|
|
||||||
|
|
||||||
pRaft->candidateState.inPreVote = false;
|
|
||||||
pRaft->stepFp = stepCandidate;
|
|
||||||
// become candidate make term+1
|
|
||||||
resetRaft(pRaft, pRaft->term + 1);
|
|
||||||
pRaft->tickFp = tickElection;
|
|
||||||
pRaft->voteFor = pRaft->selfId;
|
|
||||||
pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
|
|
||||||
syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftBecomeLeader(SSyncRaft* pRaft) {
|
|
||||||
assert(pRaft->state != TAOS_SYNC_ROLE_FOLLOWER);
|
|
||||||
|
|
||||||
pRaft->stepFp = stepLeader;
|
|
||||||
resetRaft(pRaft, pRaft->term);
|
|
||||||
pRaft->leaderId = pRaft->leaderId;
|
|
||||||
pRaft->state = TAOS_SYNC_ROLE_LEADER;
|
|
||||||
// TODO: check if there is pending config log
|
|
||||||
int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log);
|
|
||||||
if (nPendingConf > 1) {
|
|
||||||
syncFatal("unexpected multiple uncommitted config entry");
|
|
||||||
}
|
|
||||||
|
|
||||||
syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
|
|
||||||
// after become leader, send a no-op log
|
|
||||||
SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry));
|
|
||||||
if (entry == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
*entry = (SSyncRaftEntry) {
|
|
||||||
.buffer = (SSyncBuffer) {
|
|
||||||
.data = NULL,
|
|
||||||
.len = 0,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
appendEntries(pRaft, entry, 1);
|
|
||||||
//syncRaftTriggerHeartbeat(pRaft);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) {
|
|
||||||
triggerAll(pRaft);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) {
|
|
||||||
// electionTimeoutTick in [3,6] tick
|
|
||||||
pRaft->randomizedElectionTimeout = taosRand() % 4 + 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool syncRaftIsPromotable(SSyncRaft* pRaft) {
|
|
||||||
return pRaft->selfId != SYNC_NON_NODE_ID;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) {
|
|
||||||
return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout;
|
|
||||||
}
|
|
||||||
|
|
||||||
int syncRaftQuorum(SSyncRaft* pRaft) {
|
|
||||||
return pRaft->cluster.replica / 2 + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int syncRaftNumOfGranted(SSyncRaft* pRaft, SyncNodeId id, bool preVote, bool accept, int* rejectNum) {
|
|
||||||
/*
|
|
||||||
if (accept) {
|
|
||||||
syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "",
|
|
||||||
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
|
||||||
} else {
|
|
||||||
syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "",
|
|
||||||
pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
|
|
||||||
}
|
|
||||||
|
|
||||||
int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
|
||||||
assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
|
|
||||||
assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
|
|
||||||
|
|
||||||
pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
|
|
||||||
int granted = 0, rejected = 0;
|
|
||||||
int i;
|
|
||||||
for (i = 0; i < pRaft->cluster.replica; ++i) {
|
|
||||||
if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++;
|
|
||||||
else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (rejectNum) *rejectNum = rejected;
|
|
||||||
return granted;
|
|
||||||
*/
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* pre-handle message, return true means no need to continue
|
* pre-handle message, return true means no need to continue
|
||||||
* Handle the message term, which may result in our stepping down to a follower.
|
* Handle the message term, which may result in our stepping down to a follower.
|
||||||
|
@ -336,137 +201,3 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg)
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int convertClear(SSyncRaft* pRaft) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|
||||||
/**
|
|
||||||
* Only handle vote responses corresponding to our candidacy (while in
|
|
||||||
* StateCandidate, we may get stale MsgPreVoteResp messages in this term from
|
|
||||||
* our pre-candidate state).
|
|
||||||
**/
|
|
||||||
RaftMessageType msgType = pMsg->msgType;
|
|
||||||
|
|
||||||
if (msgType == RAFT_MSG_INTERNAL_PROP) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (msgType == RAFT_MSG_VOTE_RESP) {
|
|
||||||
syncRaftHandleVoteRespMessage(pRaft, pMsg);
|
|
||||||
return 0;
|
|
||||||
} else if (msgType == RAFT_MSG_APPEND) {
|
|
||||||
syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from);
|
|
||||||
syncRaftHandleAppendEntriesMessage(pRaft, pMsg);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
|
||||||
convertClear(pRaft);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* tickElection is run by followers and candidates per tick.
|
|
||||||
**/
|
|
||||||
static void tickElection(SSyncRaft* pRaft) {
|
|
||||||
pRaft->electionElapsed += 1;
|
|
||||||
|
|
||||||
if (!syncRaftIsPromotable(pRaft)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!syncRaftIsPastElectionTimeout(pRaft)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// election timeout
|
|
||||||
pRaft->electionElapsed = 0;
|
|
||||||
SSyncMessage msg;
|
|
||||||
syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void tickHeartbeat(SSyncRaft* pRaft) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
|
||||||
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
|
||||||
SyncTerm term = pRaft->term;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < n; ++i) {
|
|
||||||
entries[i].term = term;
|
|
||||||
entries[i].index = lastIndex + 1 + i;
|
|
||||||
}
|
|
||||||
|
|
||||||
syncRaftLogAppend(pRaft->log, entries, n);
|
|
||||||
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]);
|
|
||||||
syncRaftProgressMaybeUpdate(progress, lastIndex);
|
|
||||||
// Regardless of maybeCommit's return, our caller will call bcastAppend.
|
|
||||||
maybeCommit(pRaft);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* maybeCommit attempts to advance the commit index. Returns true if
|
|
||||||
* the commit index changed (in which case the caller should call
|
|
||||||
* r.bcastAppend).
|
|
||||||
**/
|
|
||||||
static bool maybeCommit(SSyncRaft* pRaft) {
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* trigger I/O requests for newly appended log entries or heartbeats.
|
|
||||||
**/
|
|
||||||
static int triggerAll(SSyncRaft* pRaft) {
|
|
||||||
assert(pRaft->state == TAOS_SYNC_ROLE_LEADER);
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < pRaft->cluster.replica; ++i) {
|
|
||||||
if (i == pRaft->cluster.selfIndex) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
syncRaftReplicate(pRaft, i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void abortLeaderTransfer(SSyncRaft* pRaft) {
|
|
||||||
pRaft->leadTransferee = SYNC_NON_NODE_ID;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void initProgress(int i, SSyncRaftProgress* progress, void* arg) {
|
|
||||||
syncRaftInitProgress(i, (SSyncRaft*)arg, progress);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
|
|
||||||
if (pRaft->term != term) {
|
|
||||||
pRaft->term = term;
|
|
||||||
pRaft->voteFor = SYNC_NON_NODE_ID;
|
|
||||||
}
|
|
||||||
|
|
||||||
pRaft->leaderId = SYNC_NON_NODE_ID;
|
|
||||||
|
|
||||||
pRaft->electionElapsed = 0;
|
|
||||||
pRaft->heartbeatElapsed = 0;
|
|
||||||
|
|
||||||
syncRaftRandomizedElectionTimeout(pRaft);
|
|
||||||
|
|
||||||
abortLeaderTransfer(pRaft);
|
|
||||||
|
|
||||||
syncRaftResetVotes(pRaft->tracker);
|
|
||||||
syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft);
|
|
||||||
|
|
||||||
pRaft->pendingConfigIndex = 0;
|
|
||||||
pRaft->uncommittedSize = 0;
|
|
||||||
}
|
|
||||||
|
|
|
@ -23,6 +23,11 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) {
|
||||||
bool preVote;
|
bool preVote;
|
||||||
RaftMessageType voteMsgType;
|
RaftMessageType voteMsgType;
|
||||||
|
|
||||||
|
if (syncRaftIsPromotable(pRaft)) {
|
||||||
|
syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
||||||
syncRaftBecomePreCandidate(pRaft);
|
syncRaftBecomePreCandidate(pRaft);
|
||||||
preVote = true;
|
preVote = true;
|
||||||
|
@ -36,8 +41,8 @@ void syncRaftStartElection(SSyncRaft* pRaft, SyncRaftElectionType cType) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int quorum = syncRaftQuorum(pRaft);
|
int quorum = syncRaftQuorum(pRaft);
|
||||||
int granted = syncRaftNumOfGranted(pRaft, pRaft->selfId, preVote, true, NULL);
|
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
|
||||||
if (quorum <= granted) {
|
if (result == SYNC_RAFT_VOTE_WON) {
|
||||||
/**
|
/**
|
||||||
* We won the election after voting for ourselves (which must mean that
|
* We won the election after voting for ourselves (which must mean that
|
||||||
* this is a single-node cluster). Advance to the next state.
|
* this is a single-node cluster). Advance to the next state.
|
||||||
|
|
|
@ -20,10 +20,14 @@
|
||||||
|
|
||||||
int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
if (pRaft->state == TAOS_SYNC_ROLE_LEADER) {
|
if (pRaft->state == TAOS_SYNC_ROLE_LEADER) {
|
||||||
syncDebug("%d ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfId);
|
syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!syncRaftIsPromotable(pRaft)) {
|
||||||
|
syncDebug("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
// if there is pending uncommitted config,cannot start election
|
// if there is pending uncommitted config,cannot start election
|
||||||
if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) {
|
if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) {
|
||||||
syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply",
|
syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply",
|
||||||
|
|
|
@ -23,6 +23,8 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
int quorum;
|
int quorum;
|
||||||
int voterIndex;
|
int voterIndex;
|
||||||
|
|
||||||
|
assert(pRaft->state == TAOS_SYNC_ROLE_CANDIDATE);
|
||||||
|
|
||||||
voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
||||||
if (voterIndex == -1) {
|
if (voterIndex == -1) {
|
||||||
syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from);
|
syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from);
|
||||||
|
@ -34,24 +36,23 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
granted = syncRaftNumOfGranted(pRaft, pMsg->from,
|
SSyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from,
|
||||||
pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION,
|
pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION,
|
||||||
!pMsg->voteResp.rejected, &rejected);
|
!pMsg->voteResp.rejected, &rejected, &granted);
|
||||||
quorum = syncRaftQuorum(pRaft);
|
|
||||||
|
|
||||||
syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections",
|
syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections",
|
||||||
pRaft->selfGroupId, pRaft->selfId, quorum, granted, rejected);
|
pRaft->selfGroupId, pRaft->selfId, quorum, granted, rejected);
|
||||||
|
|
||||||
if (granted >= quorum) {
|
if (result == SYNC_RAFT_VOTE_WON) {
|
||||||
if (pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
if (pRaft->candidateState.inPreVote) {
|
||||||
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
||||||
} else {
|
} else {
|
||||||
syncRaftBecomeLeader(pRaft);
|
syncRaftBecomeLeader(pRaft);
|
||||||
|
|
||||||
|
}
|
||||||
|
} else if (result == SYNC_RAFT_VOTE_LOST) {
|
||||||
|
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
|
||||||
} else if (rejected == quorum) {
|
|
||||||
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
|
@ -0,0 +1,306 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "raft.h"
|
||||||
|
#include "raft_configuration.h"
|
||||||
|
#include "raft_log.h"
|
||||||
|
#include "raft_replication.h"
|
||||||
|
#include "sync_raft_progress_tracker.h"
|
||||||
|
#include "syncInt.h"
|
||||||
|
|
||||||
|
static int convertClear(SSyncRaft* pRaft);
|
||||||
|
static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
|
||||||
|
static int triggerAll(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
static void tickElection(SSyncRaft* pRaft);
|
||||||
|
static void tickHeartbeat(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
|
||||||
|
static bool maybeCommit(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
static void abortLeaderTransfer(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
static void resetRaft(SSyncRaft* pRaft, SyncTerm term);
|
||||||
|
|
||||||
|
void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) {
|
||||||
|
convertClear(pRaft);
|
||||||
|
|
||||||
|
pRaft->stepFp = stepFollower;
|
||||||
|
resetRaft(pRaft, term);
|
||||||
|
pRaft->tickFp = tickElection;
|
||||||
|
pRaft->leaderId = leaderId;
|
||||||
|
pRaft->state = TAOS_SYNC_ROLE_FOLLOWER;
|
||||||
|
syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftBecomePreCandidate(SSyncRaft* pRaft) {
|
||||||
|
convertClear(pRaft);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Becoming a pre-candidate changes our step functions and state,
|
||||||
|
* but doesn't change anything else. In particular it does not increase
|
||||||
|
* r.Term or change r.Vote.
|
||||||
|
**/
|
||||||
|
pRaft->stepFp = stepCandidate;
|
||||||
|
pRaft->tickFp = tickElection;
|
||||||
|
pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
|
||||||
|
pRaft->candidateState.inPreVote = true;
|
||||||
|
syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftBecomeCandidate(SSyncRaft* pRaft) {
|
||||||
|
convertClear(pRaft);
|
||||||
|
|
||||||
|
pRaft->candidateState.inPreVote = false;
|
||||||
|
pRaft->stepFp = stepCandidate;
|
||||||
|
// become candidate make term+1
|
||||||
|
resetRaft(pRaft, pRaft->term + 1);
|
||||||
|
pRaft->tickFp = tickElection;
|
||||||
|
pRaft->voteFor = pRaft->selfId;
|
||||||
|
pRaft->state = TAOS_SYNC_ROLE_CANDIDATE;
|
||||||
|
syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftBecomeLeader(SSyncRaft* pRaft) {
|
||||||
|
assert(pRaft->state != TAOS_SYNC_ROLE_FOLLOWER);
|
||||||
|
|
||||||
|
pRaft->stepFp = stepLeader;
|
||||||
|
resetRaft(pRaft, pRaft->term);
|
||||||
|
pRaft->leaderId = pRaft->leaderId;
|
||||||
|
pRaft->state = TAOS_SYNC_ROLE_LEADER;
|
||||||
|
// TODO: check if there is pending config log
|
||||||
|
int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log);
|
||||||
|
if (nPendingConf > 1) {
|
||||||
|
syncFatal("unexpected multiple uncommitted config entry");
|
||||||
|
}
|
||||||
|
|
||||||
|
syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
|
||||||
|
// after become leader, send a no-op log
|
||||||
|
SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry));
|
||||||
|
if (entry == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
*entry = (SSyncRaftEntry) {
|
||||||
|
.buffer = (SSyncBuffer) {
|
||||||
|
.data = NULL,
|
||||||
|
.len = 0,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
appendEntries(pRaft, entry, 1);
|
||||||
|
//syncRaftTriggerHeartbeat(pRaft);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) {
|
||||||
|
triggerAll(pRaft);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) {
|
||||||
|
// electionTimeoutTick in [3,6] tick
|
||||||
|
pRaft->randomizedElectionTimeout = taosRand() % 4 + 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIsPromotable(SSyncRaft* pRaft) {
|
||||||
|
return pRaft->selfId != SYNC_NON_NODE_ID;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) {
|
||||||
|
return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout;
|
||||||
|
}
|
||||||
|
|
||||||
|
int syncRaftQuorum(SSyncRaft* pRaft) {
|
||||||
|
return pRaft->cluster.replica / 2 + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
SSyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||||
|
bool preVote, bool grant,
|
||||||
|
int* rejected, int *granted) {
|
||||||
|
int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
||||||
|
if (voterIndex == -1) {
|
||||||
|
return SYNC_RAFT_VOTE_PENDING;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (grant) {
|
||||||
|
syncInfo("[%d:%d] received grant (pre-vote %d) from %d at term %" PRId64 "",
|
||||||
|
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
||||||
|
} else {
|
||||||
|
syncInfo("[%d:%d] received rejection (pre-vote %d) from %d at term %" PRId64 "",
|
||||||
|
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
||||||
|
}
|
||||||
|
|
||||||
|
syncRaftRecordVote(pRaft->tracker, voterIndex, grant);
|
||||||
|
return syncRaftTallyVotes(pRaft->tracker, rejected, granted);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
if (accept) {
|
||||||
|
syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "",
|
||||||
|
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
||||||
|
} else {
|
||||||
|
syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "",
|
||||||
|
pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
|
||||||
|
}
|
||||||
|
|
||||||
|
int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
||||||
|
assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
|
||||||
|
assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
|
||||||
|
|
||||||
|
pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
|
||||||
|
int granted = 0, rejected = 0;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < pRaft->cluster.replica; ++i) {
|
||||||
|
if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++;
|
||||||
|
else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rejectNum) *rejectNum = rejected;
|
||||||
|
return granted;
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int convertClear(SSyncRaft* pRaft) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
/**
|
||||||
|
* Only handle vote responses corresponding to our candidacy (while in
|
||||||
|
* StateCandidate, we may get stale MsgPreVoteResp messages in this term from
|
||||||
|
* our pre-candidate state).
|
||||||
|
**/
|
||||||
|
RaftMessageType msgType = pMsg->msgType;
|
||||||
|
|
||||||
|
if (msgType == RAFT_MSG_INTERNAL_PROP) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (msgType == RAFT_MSG_VOTE_RESP) {
|
||||||
|
syncRaftHandleVoteRespMessage(pRaft, pMsg);
|
||||||
|
return 0;
|
||||||
|
} else if (msgType == RAFT_MSG_APPEND) {
|
||||||
|
syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from);
|
||||||
|
syncRaftHandleAppendEntriesMessage(pRaft, pMsg);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
convertClear(pRaft);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* tickElection is run by followers and candidates per tick.
|
||||||
|
**/
|
||||||
|
static void tickElection(SSyncRaft* pRaft) {
|
||||||
|
pRaft->electionElapsed += 1;
|
||||||
|
|
||||||
|
if (!syncRaftIsPromotable(pRaft)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!syncRaftIsPastElectionTimeout(pRaft)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// election timeout
|
||||||
|
pRaft->electionElapsed = 0;
|
||||||
|
SSyncMessage msg;
|
||||||
|
syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void tickHeartbeat(SSyncRaft* pRaft) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
||||||
|
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||||
|
SyncTerm term = pRaft->term;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < n; ++i) {
|
||||||
|
entries[i].term = term;
|
||||||
|
entries[i].index = lastIndex + 1 + i;
|
||||||
|
}
|
||||||
|
|
||||||
|
syncRaftLogAppend(pRaft->log, entries, n);
|
||||||
|
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap[pRaft->cluster.selfIndex]);
|
||||||
|
syncRaftProgressMaybeUpdate(progress, lastIndex);
|
||||||
|
// Regardless of maybeCommit's return, our caller will call bcastAppend.
|
||||||
|
maybeCommit(pRaft);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* maybeCommit attempts to advance the commit index. Returns true if
|
||||||
|
* the commit index changed (in which case the caller should call
|
||||||
|
* r.bcastAppend).
|
||||||
|
**/
|
||||||
|
static bool maybeCommit(SSyncRaft* pRaft) {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* trigger I/O requests for newly appended log entries or heartbeats.
|
||||||
|
**/
|
||||||
|
static int triggerAll(SSyncRaft* pRaft) {
|
||||||
|
assert(pRaft->state == TAOS_SYNC_ROLE_LEADER);
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < pRaft->cluster.replica; ++i) {
|
||||||
|
if (i == pRaft->cluster.selfIndex) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
syncRaftReplicate(pRaft, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void abortLeaderTransfer(SSyncRaft* pRaft) {
|
||||||
|
pRaft->leadTransferee = SYNC_NON_NODE_ID;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void initProgress(int i, SSyncRaftProgress* progress, void* arg) {
|
||||||
|
syncRaftInitProgress(i, (SSyncRaft*)arg, progress);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
|
||||||
|
if (pRaft->term != term) {
|
||||||
|
pRaft->term = term;
|
||||||
|
pRaft->voteFor = SYNC_NON_NODE_ID;
|
||||||
|
}
|
||||||
|
|
||||||
|
pRaft->leaderId = SYNC_NON_NODE_ID;
|
||||||
|
|
||||||
|
pRaft->electionElapsed = 0;
|
||||||
|
pRaft->heartbeatElapsed = 0;
|
||||||
|
|
||||||
|
syncRaftRandomizedElectionTimeout(pRaft);
|
||||||
|
|
||||||
|
abortLeaderTransfer(pRaft);
|
||||||
|
|
||||||
|
syncRaftResetVotes(pRaft->tracker);
|
||||||
|
syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft);
|
||||||
|
|
||||||
|
pRaft->pendingConfigIndex = 0;
|
||||||
|
pRaft->uncommittedSize = 0;
|
||||||
|
}
|
|
@ -34,6 +34,8 @@ void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress)
|
||||||
.matchIndex = i == pRaft->selfIndex ? syncRaftLogLastIndex(pRaft->log) : 0,
|
.matchIndex = i == pRaft->selfIndex ? syncRaftLogLastIndex(pRaft->log) : 0,
|
||||||
.nextIndex = syncRaftLogLastIndex(pRaft->log) + 1,
|
.nextIndex = syncRaftLogLastIndex(pRaft->log) + 1,
|
||||||
.inflights = inflights,
|
.inflights = inflights,
|
||||||
|
.isLearner = false,
|
||||||
|
.state = PROGRESS_STATE_PROBE,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
|
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
|
||||||
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteRespType) * TSDB_MAX_REPLICA);
|
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(SyncRaftVoteResult) * TSDB_MAX_REPLICA);
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
|
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
|
||||||
|
@ -35,3 +35,42 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp vi
|
||||||
visit(i, progress, arg);
|
visit(i, progress, arg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) {
|
||||||
|
if (tracker->votes[i] != SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
||||||
|
* election outcome is known.
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
|
||||||
|
int i;
|
||||||
|
SSyncRaftProgress* progress;
|
||||||
|
int r, g;
|
||||||
|
|
||||||
|
for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||||
|
progress = &(tracker->progressMap[i]);
|
||||||
|
if (progress->id == SYNC_NON_NODE_ID) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) {
|
||||||
|
g++;
|
||||||
|
} else {
|
||||||
|
r++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rejected) *rejected = r;
|
||||||
|
if (granted) *granted = g;
|
||||||
|
return syncRaftVoteResult(&(tracker->config.voters), tracker->votes);
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "sync_raft_quorum_majority.h"
|
||||||
|
#include "sync_raft_quorum_joint.h"
|
||||||
|
#include "sync_raft_quorum.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||||
|
* requires both majority quorums to vote in favor.
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const SyncRaftVoteResult* votes) {
|
||||||
|
SyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->majorityConfig[0]), votes);
|
||||||
|
SyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->majorityConfig[1]), votes);
|
||||||
|
|
||||||
|
if (r1 == r2) {
|
||||||
|
// If they agree, return the agreed state.
|
||||||
|
return r1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (r1 == SYNC_RAFT_VOTE_LOST || r2 == SYNC_RAFT_VOTE_LOST) {
|
||||||
|
// If either config has lost, loss is the only possible outcome.
|
||||||
|
return SYNC_RAFT_VOTE_LOST;
|
||||||
|
}
|
||||||
|
|
||||||
|
// One side won, the other one is pending, so the whole outcome is.
|
||||||
|
return SYNC_RAFT_VOTE_PENDING;
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "sync_raft_quorum.h"
|
||||||
|
#include "sync_raft_quorum_majority.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
* a result indicating whether the vote is pending (i.e. neither a quorum of
|
||||||
|
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||||
|
* quorum of no has been reached).
|
||||||
|
**/
|
||||||
|
SyncRaftVoteResult syncRaftMajorityVoteResult(SSyncCluster* config, const SyncRaftVoteResult* votes) {
|
||||||
|
if (config->replica == 0) {
|
||||||
|
return SYNC_RAFT_VOTE_WON;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i, g, r, missing;
|
||||||
|
for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) {
|
||||||
|
if (config->nodeInfo[i].nodeId == SYNC_NON_NODE_ID) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
||||||
|
missing += 1;
|
||||||
|
} else if (votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) {
|
||||||
|
g +=1;
|
||||||
|
} else {
|
||||||
|
r += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int quorum = config->replica / 2 + 1;
|
||||||
|
if (g >= quorum) {
|
||||||
|
return SYNC_RAFT_VOTE_WON;
|
||||||
|
}
|
||||||
|
if (r + missing >= quorum) {
|
||||||
|
return SYNC_RAFT_VOTE_PENDING;
|
||||||
|
}
|
||||||
|
|
||||||
|
return SYNC_RAFT_VOTE_LOST;
|
||||||
|
}
|
Loading…
Reference in New Issue