[TD-10645][raft]<feature>add raft progress
This commit is contained in:
parent
fca35ceb29
commit
24a0966da6
|
@ -61,13 +61,13 @@ typedef struct {
|
||||||
typedef struct SSyncFSM {
|
typedef struct SSyncFSM {
|
||||||
void* pData;
|
void* pData;
|
||||||
|
|
||||||
// apply committed log, bufs will be free by raft module
|
// apply committed log, bufs will be free by sync module
|
||||||
int32_t (*applyLog)(struct SSyncFSM* fsm, SyncIndex index, const SSyncBuffer* buf, void* pData);
|
int32_t (*applyLog)(struct SSyncFSM* fsm, SyncIndex index, const SSyncBuffer* buf, void* pData);
|
||||||
|
|
||||||
// cluster commit callback
|
// cluster commit callback
|
||||||
int32_t (*onClusterChanged)(struct SSyncFSM* fsm, const SSyncCluster* cluster, void* pData);
|
int32_t (*onClusterChanged)(struct SSyncFSM* fsm, const SSyncCluster* cluster, void* pData);
|
||||||
|
|
||||||
// fsm return snapshot in ppBuf, bufs will be free by raft module
|
// fsm return snapshot in ppBuf, bufs will be free by sync module
|
||||||
// TODO: getSnapshot SHOULD be async?
|
// TODO: getSnapshot SHOULD be async?
|
||||||
int32_t (*getSnapshot)(struct SSyncFSM* fsm, SSyncBuffer** ppBuf, int32_t* objId, bool* isLast);
|
int32_t (*getSnapshot)(struct SSyncFSM* fsm, SSyncBuffer** ppBuf, int32_t* objId, bool* isLast);
|
||||||
|
|
||||||
|
@ -89,18 +89,24 @@ typedef struct SSyncLogStore {
|
||||||
// write log with given index
|
// write log with given index
|
||||||
int32_t (*logWrite)(struct SSyncLogStore* logStore, SyncIndex index, SSyncBuffer* pBuf);
|
int32_t (*logWrite)(struct SSyncLogStore* logStore, SyncIndex index, SSyncBuffer* pBuf);
|
||||||
|
|
||||||
// read log from given index with limit, return the actual num in nBuf
|
/**
|
||||||
|
* read log from given index(included) with limit, return the actual num in nBuf,
|
||||||
|
* pBuf will be free in sync module
|
||||||
|
**/
|
||||||
int32_t (*logRead)(struct SSyncLogStore* logStore, SyncIndex index, int limit,
|
int32_t (*logRead)(struct SSyncLogStore* logStore, SyncIndex index, int limit,
|
||||||
SSyncBuffer* pBuf, int* nBuf);
|
SSyncBuffer* pBuf, int* nBuf);
|
||||||
|
|
||||||
// mark log with given index has been commtted
|
// mark log with given index has been commtted
|
||||||
int32_t (*logCommit)(struct SSyncLogStore* logStore, SyncIndex index);
|
int32_t (*logCommit)(struct SSyncLogStore* logStore, SyncIndex index);
|
||||||
|
|
||||||
// prune log before given index
|
// prune log before given index(not included)
|
||||||
int32_t (*logPrune)(struct SSyncLogStore* logStore, SyncIndex index);
|
int32_t (*logPrune)(struct SSyncLogStore* logStore, SyncIndex index);
|
||||||
|
|
||||||
// rollback log after given index
|
// rollback log after given index(included)
|
||||||
int32_t (*logRollback)(struct SSyncLogStore* logStore, SyncIndex index);
|
int32_t (*logRollback)(struct SSyncLogStore* logStore, SyncIndex index);
|
||||||
|
|
||||||
|
// return last index of log
|
||||||
|
SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore);
|
||||||
} SSyncLogStore;
|
} SSyncLogStore;
|
||||||
|
|
||||||
typedef struct SSyncServerState {
|
typedef struct SSyncServerState {
|
||||||
|
|
|
@ -17,15 +17,46 @@
|
||||||
#define _TD_LIBS_SYNC_RAFT_H
|
#define _TD_LIBS_SYNC_RAFT_H
|
||||||
|
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
|
||||||
typedef struct SSyncRaft {
|
typedef struct SSyncRaftProgress SSyncRaftProgress;
|
||||||
|
|
||||||
|
typedef struct RaftLeaderState {
|
||||||
|
int nProgress;
|
||||||
|
SSyncRaftProgress* progress;
|
||||||
|
} RaftLeaderState;
|
||||||
|
|
||||||
|
typedef struct SSyncRaftIOMethods {
|
||||||
|
SyncTime (*time)(SSyncRaft*);
|
||||||
|
|
||||||
|
} SSyncRaftIOMethods;
|
||||||
|
|
||||||
|
struct SSyncRaft {
|
||||||
// owner sync node
|
// owner sync node
|
||||||
SSyncNode* pNode;
|
SSyncNode* pNode;
|
||||||
|
|
||||||
SSyncInfo info;
|
SSyncInfo info;
|
||||||
|
|
||||||
} SSyncRaft;
|
// election timeout tick(random in [3:6] tick)
|
||||||
|
uint16_t electionTick;
|
||||||
|
|
||||||
|
// heartbeat timeout tick(default: 1 tick)
|
||||||
|
uint16_t heartbeatTick;
|
||||||
|
|
||||||
|
int installSnapShotTimeoutMS;
|
||||||
|
|
||||||
|
//
|
||||||
|
int heartbeatTimeoutMS;
|
||||||
|
|
||||||
|
bool preVote;
|
||||||
|
|
||||||
|
SSyncRaftIOMethods io;
|
||||||
|
|
||||||
|
RaftLeaderState leaderState;
|
||||||
|
|
||||||
|
SSyncRaftUnstableLog *log;
|
||||||
|
};
|
||||||
|
|
||||||
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo);
|
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo);
|
||||||
int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg);
|
int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg);
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TD_SYNC_RAFT_PROGRESS_H
|
||||||
|
#define TD_SYNC_RAFT_PROGRESS_H
|
||||||
|
|
||||||
|
#include "sync_type.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SSyncRaftInflights is a sliding window for the inflight messages.
|
||||||
|
* Thus inflight effectively limits both the number of inflight messages
|
||||||
|
* and the bandwidth each Progress can use.
|
||||||
|
* When inflights is full, no more message should be sent.
|
||||||
|
* When a leader sends out a message, the index of the last
|
||||||
|
* entry should be added to inflights. The index MUST be added
|
||||||
|
* into inflights in order.
|
||||||
|
* When a leader receives a reply, the previous inflights should
|
||||||
|
* be freed by calling syncRaftInflightFreeTo with the index of the last
|
||||||
|
* received entry.
|
||||||
|
**/
|
||||||
|
typedef struct SSyncRaftInflights {
|
||||||
|
/* the starting index in the buffer */
|
||||||
|
int start;
|
||||||
|
|
||||||
|
/* number of inflights in the buffer */
|
||||||
|
int count;
|
||||||
|
|
||||||
|
/* the size of the buffer */
|
||||||
|
int size;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* buffer contains the index of the last entry
|
||||||
|
* inside one message.
|
||||||
|
**/
|
||||||
|
SyncIndex* buffer;
|
||||||
|
} SSyncRaftInflights;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State defines how the leader should interact with the follower.
|
||||||
|
*
|
||||||
|
* When in PROGRESS_PROBE, leader sends at most one replication message
|
||||||
|
* per heartbeat interval. It also probes actual progress of the follower.
|
||||||
|
*
|
||||||
|
* When in PROGRESS_REPLICATE, leader optimistically increases next
|
||||||
|
* to the latest entry sent after sending replication message. This is
|
||||||
|
* an optimized state for fast replicating log entries to the follower.
|
||||||
|
*
|
||||||
|
* When in PROGRESS_SNAPSHOT, leader should have sent out snapshot
|
||||||
|
* before and stops sending any replication message.
|
||||||
|
*
|
||||||
|
* PROGRESS_PROBE is the initial state.
|
||||||
|
**/
|
||||||
|
typedef enum RaftProgressState {
|
||||||
|
PROGRESS_PROBE = 0,
|
||||||
|
PROGRESS_REPLICATE,
|
||||||
|
PROGRESS_SNAPSHOT,
|
||||||
|
} RaftProgressState;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Progress represents a follower’s progress in the view of the leader. Leader maintains
|
||||||
|
* progresses of all followers, and sends entries to the follower based on its progress.
|
||||||
|
**/
|
||||||
|
struct SSyncRaftProgress {
|
||||||
|
SyncIndex nextIndex;
|
||||||
|
|
||||||
|
SyncIndex matchIndex;
|
||||||
|
|
||||||
|
RaftProgressState state;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* paused is used in PROGRESS_PROBE.
|
||||||
|
* When paused is true, raft should pause sending replication message to this peer.
|
||||||
|
**/
|
||||||
|
bool paused;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* pendingSnapshotIndex is used in PROGRESS_SNAPSHOT.
|
||||||
|
* If there is a pending snapshot, the pendingSnapshotIndex will be set to the
|
||||||
|
* index of the snapshot. If pendingSnapshotIndex is set, the replication process of
|
||||||
|
* this Progress will be paused. raft will not resend snapshot until the pending one
|
||||||
|
* is reported to be failed.
|
||||||
|
**/
|
||||||
|
SyncIndex pendingSnapshotIndex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* recentActive is true if the progress is recently active. Receiving any messages
|
||||||
|
* from the corresponding follower indicates the progress is active.
|
||||||
|
* RecentActive can be reset to false after an election timeout.
|
||||||
|
**/
|
||||||
|
bool recentActive;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flow control sliding window
|
||||||
|
**/
|
||||||
|
SSyncRaftInflights inflights;
|
||||||
|
};
|
||||||
|
|
||||||
|
int syncRaftProgressCreate(SSyncRaft* pRaft);
|
||||||
|
//int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftProgressMaybeUpdate returns false if the given lastIndex index comes from i-th node's log.
|
||||||
|
* Otherwise it updates the progress and returns true.
|
||||||
|
**/
|
||||||
|
bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex);
|
||||||
|
|
||||||
|
void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftProgressMaybeDecrTo returns false if the given to index comes from an out of order message.
|
||||||
|
* Otherwise it decreases the progress next index to min(rejected, last) and returns true.
|
||||||
|
**/
|
||||||
|
bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i,
|
||||||
|
SyncIndex rejected, SyncIndex lastIndex);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* syncRaftProgressIsPaused returns whether sending log entries to this node has been
|
||||||
|
* paused. A node may be paused because it has rejected recent
|
||||||
|
* MsgApps, is currently waiting for a snapshot, or has reached the
|
||||||
|
* MaxInflightMsgs limit.
|
||||||
|
**/
|
||||||
|
bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressFailure(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return true if i-th node's log is up-todate
|
||||||
|
**/
|
||||||
|
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex);
|
||||||
|
|
||||||
|
int syncRaftInflightReset(SSyncRaftInflights* inflights);
|
||||||
|
bool syncRaftInflightFull(SSyncRaftInflights* inflights);
|
||||||
|
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex);
|
||||||
|
void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex);
|
||||||
|
void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* TD_SYNC_RAFT_PROGRESS_H */
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TD_SYNC_RAFT_UNSTABLE_LOG_H
|
||||||
|
#define TD_SYNC_RAFT_UNSTABLE_LOG_H
|
||||||
|
|
||||||
|
#include "sync_type.h"
|
||||||
|
|
||||||
|
/* in-memory unstable raft log storage */
|
||||||
|
struct SSyncRaftUnstableLog {
|
||||||
|
#if 0
|
||||||
|
/* Circular buffer of log entries */
|
||||||
|
RaftEntry *entries;
|
||||||
|
|
||||||
|
/* size of Circular buffer */
|
||||||
|
int size;
|
||||||
|
|
||||||
|
/* Indexes of used slots [front, back) */
|
||||||
|
int front, back;
|
||||||
|
|
||||||
|
/* Index of first entry is offset + 1 */
|
||||||
|
SyncIndex offset;
|
||||||
|
|
||||||
|
/* meta data of snapshot */
|
||||||
|
SSyncRaftUnstableLog snapshot;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return index of last in memory log, return 0 if log is empty
|
||||||
|
**/
|
||||||
|
SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
void raftLogInit(RaftLog* pLog);
|
||||||
|
|
||||||
|
void raftLogClose(RaftLog* pLog);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When startup populating log entrues loaded from disk,
|
||||||
|
* init raft memory log with snapshot index,term and log start idnex.
|
||||||
|
**/
|
||||||
|
/*
|
||||||
|
void raftLogStart(RaftLog* pLog,
|
||||||
|
RaftSnapshotMeta snapshot,
|
||||||
|
SyncIndex startIndex);
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* Get the number of entries the log.
|
||||||
|
**/
|
||||||
|
int raftLogNumEntries(const RaftLog* pLog);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return last term of in memory log, return 0 if log is empty
|
||||||
|
**/
|
||||||
|
SSyncTerm raftLogLastTerm(RaftLog* pLog);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return term of log with the given index, return 0 if the term of index cannot be found
|
||||||
|
* , errCode will save the error code.
|
||||||
|
**/
|
||||||
|
SSyncTerm raftLogTermOf(RaftLog* pLog, SyncIndex index, RaftCode* errCode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the last index of the most recent snapshot. Return 0 if there are no *
|
||||||
|
* snapshots.
|
||||||
|
**/
|
||||||
|
SyncIndex raftLogSnapshotIndex(RaftLog* pLog);
|
||||||
|
|
||||||
|
/* Append a new entry to the log. */
|
||||||
|
int raftLogAppend(RaftLog* pLog,
|
||||||
|
SSyncTerm term,
|
||||||
|
const SSyncBuffer *buf);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* acquire log from given index onwards.
|
||||||
|
**/
|
||||||
|
/*
|
||||||
|
int raftLogAcquire(RaftLog* pLog,
|
||||||
|
SyncIndex index,
|
||||||
|
RaftEntry **ppEntries,
|
||||||
|
int *n);
|
||||||
|
|
||||||
|
void raftLogRelease(RaftLog* pLog,
|
||||||
|
SyncIndex index,
|
||||||
|
RaftEntry *pEntries,
|
||||||
|
int n);
|
||||||
|
*/
|
||||||
|
/* Delete all entries from the given index (included) onwards. */
|
||||||
|
void raftLogTruncate(RaftLog* pLog, SyncIndex index);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* when taking a new snapshot, the function will update the last snapshot information and delete
|
||||||
|
* all entries up last_index - trailing (included). If the log contains no entry
|
||||||
|
* a last_index - trailing, then no entry will be deleted.
|
||||||
|
**/
|
||||||
|
void raftLogSnapshot(RaftLog* pLog, SyncIndex index, SyncIndex trailing);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* TD_SYNC_RAFT_UNSTABLE_LOG_H */
|
|
@ -19,6 +19,7 @@
|
||||||
#include "thash.h"
|
#include "thash.h"
|
||||||
#include "os.h"
|
#include "os.h"
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "tlog.h"
|
#include "tlog.h"
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _TD_LIBS_SYNC_TYPE_H
|
||||||
|
#define _TD_LIBS_SYNC_TYPE_H
|
||||||
|
|
||||||
|
typedef int32_t SyncTime;
|
||||||
|
|
||||||
|
typedef struct SSyncRaftUnstableLog SSyncRaftUnstableLog;
|
||||||
|
|
||||||
|
typedef struct SSyncRaft SSyncRaft;
|
||||||
|
|
||||||
|
#ifndef MIN
|
||||||
|
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef MAX
|
||||||
|
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
|
@ -16,12 +16,10 @@
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
|
|
||||||
#ifndef MIN
|
|
||||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define RAFT_READ_LOG_MAX_NUM 100
|
#define RAFT_READ_LOG_MAX_NUM 100
|
||||||
|
|
||||||
|
static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term);
|
||||||
|
|
||||||
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
SSyncNode* pNode = pRaft->pNode;
|
SSyncNode* pNode = pRaft->pNode;
|
||||||
SSyncServerState serverState;
|
SSyncServerState serverState;
|
||||||
|
@ -44,10 +42,10 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
}
|
}
|
||||||
assert(initIndex <= serverState.commitIndex);
|
assert(initIndex <= serverState.commitIndex);
|
||||||
|
|
||||||
// restore fsm state from snapshot index + 1, until commitIndex
|
// restore fsm state from snapshot index + 1 until commitIndex
|
||||||
++initIndex;
|
++initIndex;
|
||||||
while (initIndex < serverState.commitIndex) {
|
while (initIndex <= serverState.commitIndex) {
|
||||||
limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex);
|
limit = MIN(RAFT_READ_LOG_MAX_NUM, serverState.commitIndex - initIndex + 1);
|
||||||
|
|
||||||
if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) {
|
if (logStore->logRead(logStore, initIndex, limit, buffer, &nBuf) != 0) {
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -62,7 +60,11 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
}
|
}
|
||||||
assert(initIndex == serverState.commitIndex);
|
assert(initIndex == serverState.commitIndex);
|
||||||
|
|
||||||
syncInfo("restore vgid %d state: snapshot index:", pInfo->vgId);
|
pRaft->heartbeatTick = 1;
|
||||||
|
|
||||||
|
syncRaftBecomeFollower(pRaft, 1);
|
||||||
|
|
||||||
|
syncInfo("restore vgid %d state: snapshot index success", pInfo->vgId);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,3 +76,8 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const RaftMessage* pMsg) {
|
||||||
int32_t syncRaftTick(SSyncRaft* pRaft) {
|
int32_t syncRaftTick(SSyncRaft* pRaft) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void syncRaftBecomeFollower(SSyncRaft* pRaft, SSyncTerm term) {
|
||||||
|
pRaft->electionTick = taosRand() % 3 + 3;
|
||||||
|
return;
|
||||||
|
}
|
|
@ -0,0 +1,317 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "raft.h"
|
||||||
|
#include "raft_unstable_log.h"
|
||||||
|
#include "raft_progress.h"
|
||||||
|
#include "sync.h"
|
||||||
|
#include "syncInt.h"
|
||||||
|
|
||||||
|
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state);
|
||||||
|
|
||||||
|
static void resumeProgress(SSyncRaftProgress* progress);
|
||||||
|
static void pauseProgress(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
|
int syncRaftProgressCreate(SSyncRaft* pRaft) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
inflights->buffer = (SyncIndex*)malloc(sizeof(SyncIndex) * pRaft->maxInflightMsgs);
|
||||||
|
if (inflights->buffer == NULL) {
|
||||||
|
return RAFT_OOM;
|
||||||
|
}
|
||||||
|
inflights->size = pRaft->maxInflightMsgs;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
int syncRaftProgressRecreate(SSyncRaft* pRaft, const RaftConfiguration* configuration) {
|
||||||
|
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
bool syncRaftProgressMaybeUpdate(SSyncRaft* pRaft, int i, SyncIndex lastIndex) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
bool updated = false;
|
||||||
|
|
||||||
|
if (progress->matchIndex < lastIndex) {
|
||||||
|
progress->matchIndex = lastIndex;
|
||||||
|
updated = true;
|
||||||
|
resumeProgress(progress);
|
||||||
|
}
|
||||||
|
if (progress->nextIndex < lastIndex + 1) {
|
||||||
|
progress->nextIndex = lastIndex + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return updated;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressOptimisticNextIndex(SSyncRaft* pRaft, int i, SyncIndex nextIndex) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
pRaft->leaderState.progress[i].nextIndex = nextIndex + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressMaybeDecrTo(SSyncRaft* pRaft, int i,
|
||||||
|
SyncIndex rejected, SyncIndex lastIndex) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
|
||||||
|
if (progress->state == PROGRESS_REPLICATE) {
|
||||||
|
/**
|
||||||
|
* the rejection must be stale if the progress has matched and "rejected"
|
||||||
|
* is smaller than "match".
|
||||||
|
**/
|
||||||
|
if (rejected <= progress->matchIndex) {
|
||||||
|
syncDebug("match index is up to date,ignore");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* directly decrease next to match + 1 */
|
||||||
|
progress->nextIndex = progress->matchIndex + 1;
|
||||||
|
//syncRaftProgressBecomeProbe(raft, i);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rejected != progress->nextIndex - 1) {
|
||||||
|
syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore"
|
||||||
|
, rejected, progress->nextIndex);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
progress->nextIndex = MIN(rejected, lastIndex + 1);
|
||||||
|
if (progress->nextIndex < 1) {
|
||||||
|
progress->nextIndex = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
resumeProgress(progress);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void resumeProgress(SSyncRaftProgress* progress) {
|
||||||
|
progress->paused = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void pauseProgress(SSyncRaftProgress* progress) {
|
||||||
|
progress->paused = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressIsPaused(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
|
||||||
|
switch (progress->state) {
|
||||||
|
case PROGRESS_PROBE:
|
||||||
|
return progress->paused;
|
||||||
|
case PROGRESS_REPLICATE:
|
||||||
|
return syncRaftInflightFull(&progress->inflights);
|
||||||
|
case PROGRESS_SNAPSHOT:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
syncFatal("error sync state:%d", progress->state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressFailure(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
|
||||||
|
progress->pendingSnapshotIndex = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressNeedAbortSnapshot(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
|
||||||
|
return progress->state == PROGRESS_SNAPSHOT && progress->matchIndex >= progress->pendingSnapshotIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
/**
|
||||||
|
* If the original state is ProgressStateSnapshot, progress knows that
|
||||||
|
* the pending snapshot has been sent to this peer successfully, then
|
||||||
|
* probes from pendingSnapshot + 1.
|
||||||
|
**/
|
||||||
|
if (progress->state == PROGRESS_SNAPSHOT) {
|
||||||
|
SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex;
|
||||||
|
resetProgressState(progress, PROGRESS_PROBE);
|
||||||
|
progress->nextIndex = MAX(progress->matchIndex + 1, pendingSnapshotIndex + 1);
|
||||||
|
} else {
|
||||||
|
resetProgressState(progress, PROGRESS_PROBE);
|
||||||
|
progress->nextIndex = progress->matchIndex + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
resetProgressState(progress, PROGRESS_REPLICATE);
|
||||||
|
progress->nextIndex = progress->matchIndex + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i, SyncIndex snapshotIndex) {
|
||||||
|
assert(i >= 0 && i < pRaft->leaderState.nProgress);
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
resetProgressState(progress, PROGRESS_SNAPSHOT);
|
||||||
|
progress->pendingSnapshotIndex = snapshotIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void resetProgressState(SSyncRaftProgress* progress, RaftProgressState state) {
|
||||||
|
progress->paused = false;
|
||||||
|
progress->pendingSnapshotIndex = 0;
|
||||||
|
progress->state = state;
|
||||||
|
syncRaftInflightReset(&(progress->inflights));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int syncRaftInflightReset(SSyncRaftInflights* inflights) {
|
||||||
|
inflights->count = 0;
|
||||||
|
inflights->start = 0;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftInflightFull(SSyncRaftInflights* inflights) {
|
||||||
|
return inflights->count == inflights->size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) {
|
||||||
|
assert(!syncRaftInflightFull(inflights));
|
||||||
|
|
||||||
|
int next = inflights->start + inflights->count;
|
||||||
|
int size = inflights->size;
|
||||||
|
/* is next wrapped around buffer? */
|
||||||
|
if (next >= size) {
|
||||||
|
next -= size;
|
||||||
|
}
|
||||||
|
|
||||||
|
inflights->buffer[next] = inflightIndex;
|
||||||
|
inflights->count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftInflightFreeTo(SSyncRaftInflights* inflights, SyncIndex toIndex) {
|
||||||
|
if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i, idx;
|
||||||
|
for (i = 0, idx = inflights->start; i < inflights->count; i++) {
|
||||||
|
if (toIndex < inflights->buffer[idx]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int size = inflights->size;
|
||||||
|
idx++;
|
||||||
|
if (idx >= size) {
|
||||||
|
idx -= size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inflights->count -= i;
|
||||||
|
inflights->start = idx;
|
||||||
|
assert(inflights->count >= 0);
|
||||||
|
if (inflights->count == 0) {
|
||||||
|
inflights->start = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) {
|
||||||
|
syncRaftInflightFreeTo(inflights, inflights->buffer[inflights->start]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i) {
|
||||||
|
return pRaft->leaderState.progress[i].nextIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i) {
|
||||||
|
return pRaft->leaderState.progress[i].matchIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i) {
|
||||||
|
pRaft->leaderState.progress[i].lastSend = pRaft->io.time(pRaft);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i) {
|
||||||
|
pRaft->leaderState.progress[i].lastSendSnapshot = pRaft->io.time(pRaft);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i) {
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
bool prev = progress->recentRecv;
|
||||||
|
progress->recentRecv = false;
|
||||||
|
return prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i) {
|
||||||
|
pRaft->leaderState.progress[i].recentRecv = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i) {
|
||||||
|
return pRaft->leaderState.progress[i].recentRecv;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i) {
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
resetProgressState(progress, PROGRESS_SNAPSHOT);
|
||||||
|
progress->pendingSnapshotIndex = raftLogSnapshotIndex(pRaft->log);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) {
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
|
||||||
|
if (progress->state == PROGRESS_SNAPSHOT) {
|
||||||
|
assert(progress->pendingSnapshotIndex > 0);
|
||||||
|
SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex;
|
||||||
|
resetProgressState(progress, PROGRESS_PROBE);
|
||||||
|
progress->nextIndex = max(progress->matchIndex + 1, pendingSnapshotIndex);
|
||||||
|
} else {
|
||||||
|
resetProgressState(progress, PROGRESS_PROBE);
|
||||||
|
progress->nextIndex = progress->matchIndex + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) {
|
||||||
|
resetProgressState(pRaft->leaderState.progress, PROGRESS_REPLICATE);
|
||||||
|
pRaft->leaderState.progress->nextIndex = pRaft->leaderState.progress->matchIndex + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) {
|
||||||
|
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
||||||
|
progress->pendingSnapshotIndex = 0;
|
||||||
|
progress->state = PROGRESS_PROBE;
|
||||||
|
}
|
||||||
|
|
||||||
|
RaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) {
|
||||||
|
return pRaft->leaderState.progress[i].state;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,21 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "sync.h"
|
||||||
|
#include "raft_unstable_log.h"
|
||||||
|
|
||||||
|
SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog) {
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue