249 lines
8.9 KiB
C
249 lines
8.9 KiB
C
/*
|
||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||
*
|
||
* This program is free software: you can use, redistribute, and/or modify
|
||
* it under the terms of the GNU Affero General Public License, version 3
|
||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||
*
|
||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||
*
|
||
* You should have received a copy of the GNU Affero General Public License
|
||
* along with this program. If not, see <http: *www.gnu.org/licenses/>.
|
||
*/
|
||
|
||
#ifndef TD_SYNC_RAFT_PROGRESS_H
|
||
#define TD_SYNC_RAFT_PROGRESS_H
|
||
|
||
#include "sync_type.h"
|
||
#include "sync_raft_inflights.h"
|
||
|
||
/**
|
||
* State defines how the leader should interact with the follower.
|
||
*
|
||
* When in PROGRESS_STATE_PROBE, leader sends at most one replication message
|
||
* per heartbeat interval. It also probes actual progress of the follower.
|
||
*
|
||
* When in PROGRESS_STATE_REPLICATE, leader optimistically increases next
|
||
* to the latest entry sent after sending replication message. This is
|
||
* an optimized state for fast replicating log entries to the follower.
|
||
*
|
||
* When in PROGRESS_STATE_SNAPSHOT, leader should have sent out snapshot
|
||
* before and stops sending any replication message.
|
||
*
|
||
* PROGRESS_STATE_PROBE is the initial state.
|
||
**/
|
||
typedef enum ESyncRaftProgressState {
|
||
/**
|
||
* StateProbe indicates a follower whose last index isn't known. Such a
|
||
* follower is "probed" (i.e. an append sent periodically) to narrow down
|
||
* its last index. In the ideal (and common) case, only one round of probing
|
||
* is necessary as the follower will react with a hint. Followers that are
|
||
* probed over extended periods of time are often offline.
|
||
**/
|
||
PROGRESS_STATE_PROBE = 0,
|
||
|
||
/**
|
||
* StateReplicate is the state steady in which a follower eagerly receives
|
||
* log entries to append to its log.
|
||
**/
|
||
PROGRESS_STATE_REPLICATE,
|
||
|
||
/**
|
||
* StateSnapshot indicates a follower that needs log entries not available
|
||
* from the leader's Raft log. Such a follower needs a full snapshot to
|
||
* return to StateReplicate.
|
||
**/
|
||
PROGRESS_STATE_SNAPSHOT,
|
||
} ESyncRaftProgressState;
|
||
|
||
/**
|
||
* Progress represents a follower’s progress in the view of the leader. Leader maintains
|
||
* progresses of all followers, and sends entries to the follower based on its progress.
|
||
**/
|
||
struct SSyncRaftProgress {
|
||
SyncNodeId id;
|
||
|
||
SyncIndex nextIndex;
|
||
|
||
SyncIndex matchIndex;
|
||
|
||
/**
|
||
* State defines how the leader should interact with the follower.
|
||
*
|
||
* When in StateProbe, leader sends at most one replication message
|
||
* per heartbeat interval. It also probes actual progress of the follower.
|
||
*
|
||
* When in StateReplicate, leader optimistically increases next
|
||
* to the latest entry sent after sending replication message. This is
|
||
* an optimized state for fast replicating log entries to the follower.
|
||
*
|
||
* When in StateSnapshot, leader should have sent out snapshot
|
||
* before and stops sending any replication message.
|
||
**/
|
||
ESyncRaftProgressState state;
|
||
|
||
/**
|
||
* pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT.
|
||
* If there is a pending snapshot, the pendingSnapshotIndex will be set to the
|
||
* index of the snapshot. If pendingSnapshotIndex is set, the replication process of
|
||
* this Progress will be paused. raft will not resend snapshot until the pending one
|
||
* is reported to be failed.
|
||
**/
|
||
SyncIndex pendingSnapshotIndex;
|
||
|
||
/**
|
||
* recentActive is true if the progress is recently active. Receiving any messages
|
||
* from the corresponding follower indicates the progress is active.
|
||
* RecentActive can be reset to false after an election timeout.
|
||
**/
|
||
bool recentActive;
|
||
|
||
/**
|
||
* probeSent is used while this follower is in StateProbe. When probeSent is
|
||
* true, raft should pause sending replication message to this peer until
|
||
* probeSent is reset. See ProbeAcked() and IsPaused().
|
||
**/
|
||
bool probeSent;
|
||
|
||
/**
|
||
* inflights is a sliding window for the inflight messages.
|
||
* Each inflight message contains one or more log entries.
|
||
* The max number of entries per message is defined in raft config as MaxSizePerMsg.
|
||
* Thus inflight effectively limits both the number of inflight messages
|
||
* and the bandwidth each Progress can use.
|
||
* When inflights is Full, no more message should be sent.
|
||
* When a leader sends out a message, the index of the last
|
||
* entry should be added to inflights. The index MUST be added
|
||
* into inflights in order.
|
||
* When a leader receives a reply, the previous inflights should
|
||
* be freed by calling inflights.FreeLE with the index of the last
|
||
* received entry.
|
||
**/
|
||
SSyncRaftInflights* inflights;
|
||
|
||
/**
|
||
* IsLearner is true if this progress is tracked for a learner.
|
||
**/
|
||
bool isLearner;
|
||
};
|
||
|
||
struct SSyncRaftProgressMap {
|
||
SSyncRaftProgress progress[TSDB_MAX_REPLICA];
|
||
};
|
||
|
||
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||
|
||
/**
|
||
* syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
||
* optionally and if larger, the index of the pending snapshot.
|
||
**/
|
||
void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress);
|
||
|
||
/**
|
||
* syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
|
||
**/
|
||
void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress);
|
||
|
||
/**
|
||
* syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
||
* index acked by it. The method returns false if the given n index comes from
|
||
* an outdated message. Otherwise it updates the progress and returns true.
|
||
**/
|
||
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex);
|
||
|
||
/**
|
||
* syncRaftProgressOptimisticNextIndex signals that appends all the way up to and including index n
|
||
* are in-flight. As a result, Next is increased to n+1.
|
||
**/
|
||
static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) {
|
||
progress->nextIndex = nextIndex + 1;
|
||
}
|
||
|
||
/**
|
||
* syncRaftProgressMaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
|
||
* arguments are the index of the append message rejected by the follower, and
|
||
* the hint that we want to decrease to.
|
||
*
|
||
* Rejections can happen spuriously as messages are sent out of order or
|
||
* duplicated. In such cases, the rejection pertains to an index that the
|
||
* Progress already knows were previously acknowledged, and false is returned
|
||
* without changing the Progress.
|
||
*
|
||
* If the rejection is genuine, Next is lowered sensibly, and the Progress is
|
||
* cleared for sending log entries.
|
||
**/
|
||
bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
||
SyncIndex rejected, SyncIndex matchHint);
|
||
|
||
/**
|
||
* syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled.
|
||
* This is done when a node has rejected recent MsgApps, is currently waiting
|
||
* for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
||
* operation, this is false. A throttled node will be contacted less frequently
|
||
* until it has reached a state in which it's able to accept a steady stream of
|
||
* log entries again.
|
||
**/
|
||
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress);
|
||
|
||
static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) {
|
||
return progress->nextIndex;
|
||
}
|
||
|
||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) {
|
||
return progress->state == PROGRESS_STATE_REPLICATE;
|
||
}
|
||
|
||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) {
|
||
return progress->state == PROGRESS_STATE_SNAPSHOT;
|
||
}
|
||
|
||
static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) {
|
||
return progress->state == PROGRESS_STATE_PROBE;
|
||
}
|
||
|
||
static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progress) {
|
||
return progress->recentActive;
|
||
}
|
||
|
||
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||
|
||
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||
|
||
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||
|
||
/**
|
||
* return true if progress's log is up-todate
|
||
**/
|
||
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||
|
||
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex);
|
||
|
||
void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to);
|
||
|
||
void syncRaftProgressMapCopy(const SSyncRaftProgressMap* from, SSyncRaftProgressMap* to);
|
||
|
||
#if 0
|
||
|
||
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i);
|
||
|
||
|
||
|
||
SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i);
|
||
|
||
void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i);
|
||
|
||
void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i);
|
||
|
||
bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i);
|
||
|
||
void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i);
|
||
|
||
|
||
|
||
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i);
|
||
|
||
#endif
|
||
|
||
#endif /* TD_SYNC_RAFT_PROGRESS_H */ |