418 lines
14 KiB
C
418 lines
14 KiB
C
/*
|
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
|
*
|
|
* This program is free software: you can use, redistribute, and/or modify
|
|
* it under the terms of the GNU Affero General Public License, version 3
|
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
|
*
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "syncAppendEntriesReply.h"
|
|
#include "syncCommit.h"
|
|
#include "syncIndexMgr.h"
|
|
#include "syncInt.h"
|
|
#include "syncRaftCfg.h"
|
|
#include "syncRaftLog.h"
|
|
#include "syncRaftStore.h"
|
|
#include "syncSnapshot.h"
|
|
#include "syncUtil.h"
|
|
#include "syncVoteMgr.h"
|
|
|
|
// TLA+ Spec
|
|
// HandleAppendEntriesResponse(i, j, m) ==
|
|
// /\ m.mterm = currentTerm[i]
|
|
// /\ \/ /\ m.msuccess \* successful
|
|
// /\ nextIndex' = [nextIndex EXCEPT ![i][j] = m.mmatchIndex + 1]
|
|
// /\ matchIndex' = [matchIndex EXCEPT ![i][j] = m.mmatchIndex]
|
|
// \/ /\ \lnot m.msuccess \* not successful
|
|
// /\ nextIndex' = [nextIndex EXCEPT ![i][j] =
|
|
// Max({nextIndex[i][j] - 1, 1})]
|
|
// /\ UNCHANGED <<matchIndex>>
|
|
// /\ Discard(m)
|
|
// /\ UNCHANGED <<serverVars, candidateVars, logVars, elections>>
|
|
//
|
|
int32_t syncNodeOnAppendEntriesReplyCb(SSyncNode* ths, SyncAppendEntriesReply* pMsg) {
|
|
int32_t ret = 0;
|
|
|
|
// if already drop replica, do not process
|
|
if (!syncNodeInRaftGroup(ths, &(pMsg->srcId)) && !ths->pRaftCfg->isStandBy) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "maybe replica already dropped");
|
|
return -1;
|
|
}
|
|
|
|
// drop stale response
|
|
if (pMsg->term < ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response");
|
|
return 0;
|
|
}
|
|
|
|
// no need this code, because if I receive reply.term, then I must have sent for that term.
|
|
// if (pMsg->term > ths->pRaftStore->currentTerm) {
|
|
// syncNodeUpdateTerm(ths, pMsg->term);
|
|
// }
|
|
|
|
if (pMsg->term > ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "error term");
|
|
return -1;
|
|
}
|
|
|
|
ASSERT(pMsg->term == ths->pRaftStore->currentTerm);
|
|
|
|
// update time
|
|
syncIndexMgrSetStartTime(ths->pNextIndex, &(pMsg->srcId), pMsg->startTime);
|
|
syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), taosGetTimestampMs());
|
|
|
|
SyncIndex beforeNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex beforeMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
|
|
if (pMsg->success) {
|
|
// nextIndex' = [nextIndex EXCEPT ![i][j] = m.mmatchIndex + 1]
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), pMsg->matchIndex + 1);
|
|
|
|
// matchIndex' = [matchIndex EXCEPT ![i][j] = m.mmatchIndex]
|
|
syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex);
|
|
|
|
// maybe commit
|
|
syncMaybeAdvanceCommitIndex(ths);
|
|
|
|
} else {
|
|
SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
|
|
// notice! int64, uint64
|
|
if (nextIndex > SYNC_INDEX_BEGIN) {
|
|
--nextIndex;
|
|
} else {
|
|
nextIndex = SYNC_INDEX_BEGIN;
|
|
}
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex);
|
|
}
|
|
|
|
SyncIndex afterNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex afterMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
do {
|
|
char logBuf[256];
|
|
snprintf(logBuf, sizeof(logBuf),
|
|
"before next:%" PRId64 ", match:%" PRId64 ", after next:%" PRId64 ", match:%" PRId64, beforeNextIndex,
|
|
beforeMatchIndex, afterNextIndex, afterMatchIndex);
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, logBuf);
|
|
} while (0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
// only start once
|
|
static void syncNodeStartSnapshotOnce(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, SyncTerm lastApplyTerm,
|
|
SyncAppendEntriesReply* pMsg) {
|
|
if (beginIndex > endIndex) {
|
|
do {
|
|
char logBuf[128];
|
|
snprintf(logBuf, sizeof(logBuf), "snapshot param error, start:%" PRId64 ", end:%" PRId64, beginIndex, endIndex);
|
|
syncNodeErrorLog(ths, logBuf);
|
|
} while (0);
|
|
|
|
return;
|
|
}
|
|
|
|
// get sender
|
|
SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(pMsg->srcId));
|
|
ASSERT(pSender != NULL);
|
|
|
|
if (snapshotSenderIsStart(pSender)) {
|
|
do {
|
|
char* eventLog = snapshotSender2SimpleStr(pSender, "snapshot sender already start");
|
|
syncNodeErrorLog(ths, eventLog);
|
|
taosMemoryFree(eventLog);
|
|
} while (0);
|
|
|
|
return;
|
|
}
|
|
|
|
SSnapshot snapshot = {
|
|
.data = NULL, .lastApplyIndex = endIndex, .lastApplyTerm = lastApplyTerm, .lastConfigIndex = SYNC_INDEX_INVALID};
|
|
void* pReader = NULL;
|
|
SSnapshotParam readerParam = {.start = beginIndex, .end = endIndex};
|
|
int32_t code = ths->pFsm->FpSnapshotStartRead(ths->pFsm, &readerParam, &pReader);
|
|
ASSERT(code == 0);
|
|
|
|
if (pMsg->privateTerm < pSender->privateTerm) {
|
|
ASSERT(pReader != NULL);
|
|
snapshotSenderStart(pSender, readerParam, snapshot, pReader);
|
|
|
|
} else {
|
|
if (pReader != NULL) {
|
|
ths->pFsm->FpSnapshotStopRead(ths->pFsm, pReader);
|
|
}
|
|
}
|
|
}
|
|
|
|
int32_t syncNodeOnAppendEntriesReplySnapshot2Cb(SSyncNode* ths, SyncAppendEntriesReply* pMsg) {
|
|
int32_t ret = 0;
|
|
|
|
// if already drop replica, do not process
|
|
if (!syncNodeInRaftGroup(ths, &(pMsg->srcId)) && !ths->pRaftCfg->isStandBy) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "maybe replica already dropped");
|
|
return -1;
|
|
}
|
|
|
|
// drop stale response
|
|
if (pMsg->term < ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response");
|
|
return 0;
|
|
}
|
|
|
|
// error term
|
|
if (pMsg->term > ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "error term");
|
|
return -1;
|
|
}
|
|
|
|
ASSERT(pMsg->term == ths->pRaftStore->currentTerm);
|
|
|
|
// update time
|
|
syncIndexMgrSetStartTime(ths->pNextIndex, &(pMsg->srcId), pMsg->startTime);
|
|
syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), taosGetTimestampMs());
|
|
|
|
SyncIndex beforeNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex beforeMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
|
|
if (pMsg->success) {
|
|
SyncIndex newNextIndex = pMsg->matchIndex + 1;
|
|
SyncIndex newMatchIndex = pMsg->matchIndex;
|
|
|
|
bool needStartSnapshot = false;
|
|
if (newMatchIndex >= SYNC_INDEX_BEGIN && !ths->pLogStore->syncLogExist(ths->pLogStore, newMatchIndex)) {
|
|
needStartSnapshot = true;
|
|
}
|
|
|
|
if (!needStartSnapshot) {
|
|
// update next-index, match-index
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), newNextIndex);
|
|
syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), newMatchIndex);
|
|
|
|
// maybe commit
|
|
if (ths->state == TAOS_SYNC_STATE_LEADER) {
|
|
syncMaybeAdvanceCommitIndex(ths);
|
|
}
|
|
|
|
} else {
|
|
// start snapshot <match+1, old snapshot.end>
|
|
SSnapshot oldSnapshot;
|
|
ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &oldSnapshot);
|
|
if (oldSnapshot.lastApplyIndex > newMatchIndex) {
|
|
syncNodeStartSnapshotOnce(ths, newMatchIndex + 1, oldSnapshot.lastApplyIndex, oldSnapshot.lastApplyTerm,
|
|
pMsg); // term maybe not ok?
|
|
}
|
|
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), oldSnapshot.lastApplyIndex + 1);
|
|
syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), newMatchIndex);
|
|
}
|
|
|
|
// event log, update next-index
|
|
do {
|
|
char host[64];
|
|
int16_t port;
|
|
syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
|
|
|
|
char logBuf[256];
|
|
snprintf(logBuf, sizeof(logBuf), "reset next-index:%" PRId64 ", match-index:%" PRId64 " for %s:%d", newNextIndex,
|
|
newMatchIndex, host, port);
|
|
syncNodeEventLog(ths, logBuf);
|
|
|
|
} while (0);
|
|
|
|
} else {
|
|
SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
|
|
if (nextIndex > SYNC_INDEX_BEGIN) {
|
|
--nextIndex;
|
|
|
|
// speed up
|
|
if (nextIndex > pMsg->matchIndex + 1) {
|
|
nextIndex = pMsg->matchIndex + 1;
|
|
}
|
|
|
|
bool needStartSnapshot = false;
|
|
if (nextIndex >= SYNC_INDEX_BEGIN && !ths->pLogStore->syncLogExist(ths->pLogStore, nextIndex)) {
|
|
needStartSnapshot = true;
|
|
}
|
|
if (nextIndex - 1 >= SYNC_INDEX_BEGIN && !ths->pLogStore->syncLogExist(ths->pLogStore, nextIndex - 1)) {
|
|
needStartSnapshot = true;
|
|
}
|
|
|
|
if (!needStartSnapshot) {
|
|
// do nothing
|
|
|
|
} else {
|
|
SSnapshot oldSnapshot;
|
|
ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &oldSnapshot);
|
|
SyncTerm newSnapshotTerm = oldSnapshot.lastApplyTerm;
|
|
|
|
SyncIndex endIndex;
|
|
if (ths->pLogStore->syncLogExist(ths->pLogStore, nextIndex + 1)) {
|
|
endIndex = nextIndex;
|
|
} else {
|
|
endIndex = oldSnapshot.lastApplyIndex;
|
|
}
|
|
syncNodeStartSnapshotOnce(ths, pMsg->matchIndex + 1, endIndex, newSnapshotTerm, pMsg);
|
|
|
|
// get sender
|
|
SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(pMsg->srcId));
|
|
ASSERT(pSender != NULL);
|
|
SyncIndex sentryIndex = pSender->snapshot.lastApplyIndex + 1;
|
|
|
|
// update nextIndex to sentryIndex
|
|
if (nextIndex <= sentryIndex) {
|
|
nextIndex = sentryIndex;
|
|
}
|
|
}
|
|
|
|
} else {
|
|
nextIndex = SYNC_INDEX_BEGIN;
|
|
}
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex);
|
|
|
|
SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
if (pMsg->matchIndex > oldMatchIndex) {
|
|
syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex);
|
|
}
|
|
|
|
// event log, update next-index
|
|
do {
|
|
char host[64];
|
|
int16_t port;
|
|
syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
|
|
|
|
SyncIndex newNextIndex = nextIndex;
|
|
SyncIndex newMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
char logBuf[256];
|
|
snprintf(logBuf, sizeof(logBuf), "reset2 next-index:%" PRId64 ", match-index:%" PRId64 " for %s:%d", newNextIndex,
|
|
newMatchIndex, host, port);
|
|
syncNodeEventLog(ths, logBuf);
|
|
|
|
} while (0);
|
|
}
|
|
|
|
SyncIndex afterNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex afterMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
do {
|
|
char logBuf[256];
|
|
snprintf(logBuf, sizeof(logBuf),
|
|
"before next:%" PRId64 ", match:%" PRId64 ", after next:%" PRId64 ", match:%" PRId64, beforeNextIndex,
|
|
beforeMatchIndex, afterNextIndex, afterMatchIndex);
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, logBuf);
|
|
} while (0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int32_t syncNodeOnAppendEntriesReplySnapshotCb(SSyncNode* ths, SyncAppendEntriesReply* pMsg) {
|
|
int32_t ret = 0;
|
|
|
|
// if already drop replica, do not process
|
|
if (!syncNodeInRaftGroup(ths, &(pMsg->srcId)) && !ths->pRaftCfg->isStandBy) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "maybe replica already dropped");
|
|
return -1;
|
|
}
|
|
|
|
// drop stale response
|
|
if (pMsg->term < ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response");
|
|
return 0;
|
|
}
|
|
|
|
// no need this code, because if I receive reply.term, then I must have sent for that term.
|
|
// if (pMsg->term > ths->pRaftStore->currentTerm) {
|
|
// syncNodeUpdateTerm(ths, pMsg->term);
|
|
// }
|
|
|
|
if (pMsg->term > ths->pRaftStore->currentTerm) {
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, "error term");
|
|
return -1;
|
|
}
|
|
|
|
ASSERT(pMsg->term == ths->pRaftStore->currentTerm);
|
|
|
|
// update time
|
|
syncIndexMgrSetStartTime(ths->pNextIndex, &(pMsg->srcId), pMsg->startTime);
|
|
syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), taosGetTimestampMs());
|
|
|
|
SyncIndex beforeNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex beforeMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
|
|
if (pMsg->success) {
|
|
// nextIndex' = [nextIndex EXCEPT ![i][j] = m.mmatchIndex + 1]
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), pMsg->matchIndex + 1);
|
|
|
|
// matchIndex' = [matchIndex EXCEPT ![i][j] = m.mmatchIndex]
|
|
syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex);
|
|
|
|
// maybe commit
|
|
if (ths->state == TAOS_SYNC_STATE_LEADER) {
|
|
syncMaybeAdvanceCommitIndex(ths);
|
|
}
|
|
|
|
} else {
|
|
SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
|
|
// notice! int64, uint64
|
|
if (nextIndex > SYNC_INDEX_BEGIN) {
|
|
--nextIndex;
|
|
|
|
// get sender
|
|
SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(pMsg->srcId));
|
|
ASSERT(pSender != NULL);
|
|
|
|
SSnapshot snapshot = {.data = NULL,
|
|
.lastApplyIndex = SYNC_INDEX_INVALID,
|
|
.lastApplyTerm = 0,
|
|
.lastConfigIndex = SYNC_INDEX_INVALID};
|
|
void* pReader = NULL;
|
|
ths->pFsm->FpGetSnapshot(ths->pFsm, &snapshot, NULL, &pReader);
|
|
if (snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN && nextIndex <= snapshot.lastApplyIndex + 1 &&
|
|
!snapshotSenderIsStart(pSender) && pMsg->privateTerm < pSender->privateTerm) {
|
|
// has snapshot
|
|
ASSERT(pReader != NULL);
|
|
SSnapshotParam readerParam = {.start = 0, .end = snapshot.lastApplyIndex};
|
|
snapshotSenderStart(pSender, readerParam, snapshot, pReader);
|
|
|
|
} else {
|
|
// no snapshot
|
|
if (pReader != NULL) {
|
|
ths->pFsm->FpSnapshotStopRead(ths->pFsm, pReader);
|
|
}
|
|
}
|
|
|
|
SyncIndex sentryIndex = pSender->snapshot.lastApplyIndex + 1;
|
|
|
|
// update nextIndex to sentryIndex
|
|
if (nextIndex <= sentryIndex) {
|
|
nextIndex = sentryIndex;
|
|
}
|
|
|
|
} else {
|
|
nextIndex = SYNC_INDEX_BEGIN;
|
|
}
|
|
|
|
syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex);
|
|
}
|
|
|
|
SyncIndex afterNextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
|
|
SyncIndex afterMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
|
|
do {
|
|
char logBuf[256];
|
|
snprintf(logBuf, sizeof(logBuf),
|
|
"before next:%" PRId64 ", match:%" PRId64 ", after next:%" PRId64 ", match:%" PRId64, beforeNextIndex,
|
|
beforeMatchIndex, afterNextIndex, afterMatchIndex);
|
|
syncLogRecvAppendEntriesReply(ths, pMsg, logBuf);
|
|
} while (0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, SyncAppendEntriesReply* pMsg) { return 0; } |