diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 7cc1a47404..2d7b15ebda 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -517,6 +517,7 @@ int32_t* taosGetErrno(); #define TSDB_CODE_SYN_STANDBY_NOT_READY TAOS_DEF_ERROR_CODE(0, 0x0912) #define TSDB_CODE_SYN_BATCH_ERROR TAOS_DEF_ERROR_CODE(0, 0x0913) #define TSDB_CODE_SYN_RESTORING TAOS_DEF_ERROR_CODE(0, 0x0914) +#define TSDB_CODE_SYN_INVALID_SNAPSHOT_MSG TAOS_DEF_ERROR_CODE(0, 0x0915) // internal #define TSDB_CODE_SYN_INTERNAL_ERROR TAOS_DEF_ERROR_CODE(0, 0x09FF) // tq diff --git a/source/dnode/vnode/src/vnd/vnodeCommit.c b/source/dnode/vnode/src/vnd/vnodeCommit.c index be977e7cbd..f78956a431 100644 --- a/source/dnode/vnode/src/vnd/vnodeCommit.c +++ b/source/dnode/vnode/src/vnd/vnodeCommit.c @@ -234,10 +234,11 @@ int vnodeAsyncCommit(SVnode *pVnode) { _exit: if (code) { - vError("vgId:%d, %s failed since %s, commit id:%" PRId64, TD_VID(pVnode), __func__, tstrerror(code), + vError("vgId:%d, vnode async commit failed since %s, commitId:%" PRId64, TD_VID(pVnode), tstrerror(code), pVnode->state.commitID); } else { - vDebug("vgId:%d, %s done", TD_VID(pVnode), __func__); + vInfo("vgId:%d, vnode async commit done, commitId:%" PRId64 " term:%" PRId64 " applied:%" PRId64, TD_VID(pVnode), + pVnode->state.commitID, pVnode->state.applyTerm, pVnode->state.applied); } return code; } @@ -256,7 +257,7 @@ static int vnodeCommitImpl(SCommitInfo *pInfo) { char dir[TSDB_FILENAME_LEN] = {0}; SVnode *pVnode = pInfo->pVnode; - vInfo("vgId:%d, start to commit, commit ID:%" PRId64 " version:%" PRId64 " term: %" PRId64, TD_VID(pVnode), + vInfo("vgId:%d, start to commit, commitId:%" PRId64 " version:%" PRId64 " term: %" PRId64, TD_VID(pVnode), pVnode->state.commitID, pVnode->state.applied, pVnode->state.applyTerm); // persist wal before starting diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index fcfacd1ca9..dbd06d6ec0 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -423,7 +423,7 @@ int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData) { ASSERT(pHdr->index == pWriter->index + 1); pWriter->index = pHdr->index; - vInfo("vgId:%d, vnode snapshot write data, index:%" PRId64 " type:%d nData:%d", TD_VID(pVnode), pHdr->index, + vInfo("vgId:%d, vnode snapshot write data, index:%" PRId64 " type:%d blockLen:%d", TD_VID(pVnode), pHdr->index, pHdr->type, nData); switch (pHdr->type) { diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 5caaae502f..0437703c92 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -465,9 +465,9 @@ static int32_t vnodeSnapshotStopWrite(const SSyncFSM *pFsm, void *pWriter, bool static int32_t vnodeSnapshotDoWrite(const SSyncFSM *pFsm, void *pWriter, void *pBuf, int32_t len) { SVnode *pVnode = pFsm->data; - vDebug("vgId:%d, continue write vnode snapshot, len:%d", pVnode->config.vgId, len); + vDebug("vgId:%d, continue write vnode snapshot, blockLen:%d", pVnode->config.vgId, len); int32_t code = vnodeSnapWrite(pWriter, pBuf, len); - vDebug("vgId:%d, continue write vnode snapshot finished, len:%d", pVnode->config.vgId, len); + vDebug("vgId:%d, continue write vnode snapshot finished, blockLen:%d", pVnode->config.vgId, len); return code; } diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 30324c1113..7a35f90165 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -294,7 +294,7 @@ int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) { } if (snapshotSenderIsStart(pSender)) { - sSError(pSender, "snapshot sender already start, ignore"); + sSInfo(pSender, "snapshot sender already start, ignore"); return 0; } @@ -523,7 +523,7 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap static int32_t snapshotReceiverGotData(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pMsg) { if (pMsg->seq != pReceiver->ack + 1) { sRError(pReceiver, "snapshot receiver invalid seq, ack:%d seq:%d", pReceiver->ack, pMsg->seq); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_INVALID_SNAPSHOT_MSG; return -1; } @@ -721,8 +721,12 @@ static int32_t syncNodeOnSnapshotTransfering(SSyncNode *pSyncNode, SyncSnapshotS timeNow = taosGetTimestampMs(); } + int32_t code = 0; if (snapshotReceiverGotData(pReceiver, pMsg) != 0) { - return -1; + code = terrno; + if (code >= SYNC_SNAPSHOT_SEQ_INVALID) { + code = TSDB_CODE_SYN_INTERNAL_ERROR; + } } // build msg @@ -740,7 +744,7 @@ static int32_t syncNodeOnSnapshotTransfering(SSyncNode *pSyncNode, SyncSnapshotS pRspMsg->lastTerm = pMsg->lastTerm; pRspMsg->startTime = pReceiver->startTime; pRspMsg->ack = pReceiver->ack; // receiver maybe already closed - pRspMsg->code = 0; + pRspMsg->code = code; pRspMsg->snapBeginIndex = pReceiver->snapshotParam.start; // send msg @@ -861,7 +865,7 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process force stop"); snapshotReceiverForceStop(pReceiver); } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq"); + syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq data"); syncNodeOnSnapshotTransfering(pSyncNode, pMsg); } else { // error log @@ -982,68 +986,85 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { } // state, term, seq/ack - if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) { - if (pMsg->term == pSyncNode->pRaftStore->currentTerm) { - // prepare , send begin msg - if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq pre-snapshot"); - syncNodeOnSnapshotReplyPre(pSyncNode, pMsg); - return 0; - } + if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { + sSError(pSender, "snapshot sender not leader"); + return -1; + } - if (pMsg->ack == SYNC_SNAPSHOT_SEQ_BEGIN) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq begin"); - if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { - return -1; - } + if (pMsg->term != pSyncNode->pRaftStore->currentTerm) { + sSError(pSender, "snapshot sender term not equal"); + return -1; + } - if (snapshotSend(pSender) != 0) { - return -1; - } - return 0; - } + if (pMsg->code != 0) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "receive error code"); + sSError(pSender, "snapshot sender receive error code:0x%x and stop sender", pMsg->code); + snapshotSenderStop(pSender, true); + SSyncLogReplMgr *pMgr = syncNodeGetLogReplMgr(pSyncNode, &pMsg->srcId); + if (pMgr) { + syncLogReplMgrReset(pMgr); + } - // receive ack is finish, close sender - if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq end"); - snapshotSenderStop(pSender, true); - SSyncLogReplMgr *pMgr = syncNodeGetLogReplMgr(pSyncNode, &pMsg->srcId); - if (pMgr) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "reset repl mgr"); - syncLogReplMgrReset(pMgr); - } - return 0; - } + return -1; + } - // send next msg - if (pMsg->ack == pSender->seq) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq"); - // update sender ack - if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { - return -1; - } - if (snapshotSend(pSender) != 0) { - return -1; - } + // prepare , send begin msg + if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq pre-snapshot"); + syncNodeOnSnapshotReplyPre(pSyncNode, pMsg); + return 0; + } - } else if (pMsg->ack == pSender->seq - 1) { - // maybe resend - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq and resend"); - snapshotReSend(pSender); - - } else { - // error log - sSError(pSender, "snapshot sender recv error ack:%d, my seq:%d", pMsg->ack, pSender->seq); - return -1; - } - } else { - // error log - sSError(pSender, "snapshot sender term not equal"); + if (pMsg->ack == SYNC_SNAPSHOT_SEQ_BEGIN) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq begin"); + if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { return -1; } + + if (snapshotSend(pSender) != 0) { + return -1; + } + return 0; + } + + // receive ack is finish, close sender + if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq end"); + snapshotSenderStop(pSender, true); + SSyncLogReplMgr *pMgr = syncNodeGetLogReplMgr(pSyncNode, &pMsg->srcId); + if (pMgr) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "reset repl mgr"); + syncLogReplMgrReset(pMgr); + } + return 0; + } + + // send next msg + if (pMsg->ack == pSender->seq) { + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq data"); + // update sender ack + if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { + return -1; + } + if (snapshotSend(pSender) != 0) { + return -1; + } + + } else if (pMsg->ack == pSender->seq - 1) { + // maybe resend + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq and resend"); + snapshotReSend(pSender); + } else { // error log - sSError(pSender, "snapshot sender not leader"); + syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "receive error ack"); + sSError(pSender, "snapshot sender receive error ack:%d, my seq:%d", pMsg->ack, pSender->seq); + snapshotSenderStop(pSender, true); + SSyncLogReplMgr *pMgr = syncNodeGetLogReplMgr(pSyncNode, &pMsg->srcId); + if (pMgr) { + syncLogReplMgrReset(pMgr); + } + return -1; } diff --git a/source/libs/sync/test/syncRaftLogTest2.cpp b/source/libs/sync/test/syncRaftLogTest2.cpp index a7752dcb8b..de9137fbe1 100644 --- a/source/libs/sync/test/syncRaftLogTest2.cpp +++ b/source/libs/sync/test/syncRaftLogTest2.cpp @@ -47,7 +47,7 @@ void init() { pSyncNode->pWal = pWal; pSyncNode->pFsm = (SSyncFSM*)taosMemoryMalloc(sizeof(SSyncFSM)); - pSyncNode->pFsm->FpGetSnapshotInfo = GetSnapshotCb; + // pSyncNode->pFsm->FpGetSnapshotInfo = GetSnapshotCb; } void cleanup() { diff --git a/source/libs/sync/test/syncRaftLogTest3.cpp b/source/libs/sync/test/syncRaftLogTest3.cpp index 31c06625aa..1eed08102f 100644 --- a/source/libs/sync/test/syncRaftLogTest3.cpp +++ b/source/libs/sync/test/syncRaftLogTest3.cpp @@ -47,7 +47,7 @@ void init() { pSyncNode->pWal = pWal; pSyncNode->pFsm = (SSyncFSM*)taosMemoryMalloc(sizeof(SSyncFSM)); - pSyncNode->pFsm->FpGetSnapshotInfo = GetSnapshotCb; + // pSyncNode->pFsm->FpGetSnapshotInfo = GetSnapshotCb; } void cleanup() { diff --git a/source/util/src/terror.c b/source/util/src/terror.c index d73c8661fc..e59b1daa05 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -405,6 +405,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_SYN_PROPOSE_NOT_READY, "Sync not ready for pr TAOS_DEFINE_ERROR(TSDB_CODE_SYN_STANDBY_NOT_READY, "Sync not ready for standby") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_BATCH_ERROR, "Sync batch error") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_RESTORING, "Sync is restoring") +TAOS_DEFINE_ERROR(TSDB_CODE_SYN_INVALID_SNAPSHOT_MSG, "Sync invalid snapshot msg") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_INTERNAL_ERROR, "Sync internal error") //tq