From 2c28cdcbb59c935db1b9ddf88c96f49d6776b910 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 12 Aug 2022 18:49:28 +0800 Subject: [PATCH 1/2] fix: deadlock of mnode if its state changed --- source/dnode/mnode/impl/src/mndMnode.c | 2 ++ source/dnode/mnode/impl/src/mndSync.c | 23 +++++++++++++++-------- source/dnode/vnode/src/vnd/vnodeSync.c | 7 +++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndMnode.c b/source/dnode/mnode/impl/src/mndMnode.c index 13655ac21f..4f07d9e014 100644 --- a/source/dnode/mnode/impl/src/mndMnode.c +++ b/source/dnode/mnode/impl/src/mndMnode.c @@ -742,7 +742,9 @@ static int32_t mndProcessAlterMnodeReq(SRpcMsg *pReq) { return code; } else { pMgmt->errCode = 0; + taosWLockLatch(&pMgmt->lock); pMgmt->transId = -1; + taosWUnLockLatch(&pMgmt->lock); tsem_wait(&pMgmt->syncSem); mInfo("alter mnode sync result:0x%x %s", pMgmt->errCode, tstrerror(pMgmt->errCode)); terrno = pMgmt->errCode; diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 03e5c2b3a2..e899f71052 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -60,22 +60,22 @@ void mndSyncCommitMsg(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbM sdbSetApplyInfo(pMnode->pSdb, cbMeta.index, cbMeta.term, cbMeta.lastConfigIndex); } - taosRLockLatch(&pMgmt->lock); + taosWLockLatch(&pMgmt->lock); if (transId <= 0) { - taosRUnLockLatch(&pMgmt->lock); + taosWUnLockLatch(&pMgmt->lock); mError("trans:%d, invalid commit msg", transId); } else if (transId == pMgmt->transId) { - taosRUnLockLatch(&pMgmt->lock); if (pMgmt->errCode != 0) { mError("trans:%d, failed to propose since %s", transId, tstrerror(pMgmt->errCode)); } pMgmt->transId = 0; + taosWUnLockLatch(&pMgmt->lock); tsem_post(&pMgmt->syncSem); } else { - taosRUnLockLatch(&pMgmt->lock); + taosWUnLockLatch(&pMgmt->lock); STrans *pTrans = mndAcquireTrans(pMnode, transId); if (pTrans != NULL) { - mDebug("trans:%d, execute in mnode which not leader", transId); + mInfo("trans:%d, execute in mnode which not leader", transId); mndTransExecute(pMnode, pTrans); mndReleaseTrans(pMnode, pTrans); // sdbWriteFile(pMnode->pSdb, SDB_WRITE_DELTA); @@ -275,9 +275,16 @@ int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) { pMgmt->errCode = 0; taosWLockLatch(&pMgmt->lock); - pMgmt->transId = transId; - taosWUnLockLatch(&pMgmt->lock); - mTrace("trans:%d, will be proposed", pMgmt->transId); + if (pMgmt->transId != 0) { + mInfo("trans:%d, can't be proposed since trans:%s alrady waiting for confirm", transId, pMgmt->transId); + taosWUnLockLatch(&pMgmt->lock); + terrno = TSDB_CODE_APP_NOT_READY; + return -1; + } else { + pMgmt->transId = transId; + mDebug("trans:%d, will be proposed", pMgmt->transId); + taosWUnLockLatch(&pMgmt->lock); + } const bool isWeak = false; int32_t code = syncPropose(pMgmt->sync, &req, isWeak); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index fdd930ebd8..c7b5ce052d 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -696,6 +696,13 @@ static void vnodeBecomeFollower(struct SSyncFSM *pFsm) { static void vnodeBecomeLeader(struct SSyncFSM *pFsm) { SVnode *pVnode = pFsm->data; vDebug("vgId:%d, become leader", pVnode->config.vgId); + + taosThreadMutexLock(&pVnode->lock); + if (pVnode->blocked) { + pVnode->blocked = false; + tsem_post(&pVnode->syncSem); + } + taosThreadMutexUnlock(&pVnode->lock); } static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) { From 65fa1e0164922d246a3cab87ce54a913d8dd79ee Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 12 Aug 2022 19:20:31 +0800 Subject: [PATCH 2/2] fix: deadlock of mnode if its state changed --- source/dnode/mnode/impl/src/mndSync.c | 20 +++++++++----------- source/dnode/vnode/src/vnd/vnodeSync.c | 13 +++++++------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index e899f71052..0a660571af 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -66,7 +66,9 @@ void mndSyncCommitMsg(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbM mError("trans:%d, invalid commit msg", transId); } else if (transId == pMgmt->transId) { if (pMgmt->errCode != 0) { - mError("trans:%d, failed to propose since %s", transId, tstrerror(pMgmt->errCode)); + mError("trans:%d, failed to propose since %s, post sem", transId, tstrerror(pMgmt->errCode)); + } else { + mInfo("trans:%d, is proposed and post sem", transId, tstrerror(pMgmt->errCode)); } pMgmt->transId = 0; taosWUnLockLatch(&pMgmt->lock); @@ -122,7 +124,10 @@ void mndReConfig(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SReConfigCbMeta cbM taosWLockLatch(&pMgmt->lock); if (pMgmt->transId == -1) { if (pMgmt->errCode != 0) { - mError("trans:-1, failed to propose sync reconfig since %s", tstrerror(pMgmt->errCode)); + mError("trans:-1, failed to propose sync reconfig since %s, post sem", tstrerror(pMgmt->errCode)); + } else { + mInfo("trans:-1, sync reconfig is proposed, saved:%d code:0x%x, index:%" PRId64 " term:%" PRId64 " post sem", + pMgmt->transId, cbMeta.code, cbMeta.index, cbMeta.term); } pMgmt->transId = 0; tsem_post(&pMgmt->syncSem); @@ -174,7 +179,7 @@ void mndLeaderTransfer(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cb static void mndBecomeFollower(struct SSyncFSM *pFsm) { SMnode *pMnode = pFsm->data; - mDebug("vgId:1, become follower"); + mDebug("vgId:1, become follower and post sem"); taosWLockLatch(&pMnode->syncMgmt.lock); if (pMnode->syncMgmt.transId != 0) { @@ -187,13 +192,6 @@ static void mndBecomeFollower(struct SSyncFSM *pFsm) { static void mndBecomeLeader(struct SSyncFSM *pFsm) { mDebug("vgId:1, become leader"); SMnode *pMnode = pFsm->data; - - taosWLockLatch(&pMnode->syncMgmt.lock); - if (pMnode->syncMgmt.transId != 0) { - pMnode->syncMgmt.transId = 0; - tsem_post(&pMnode->syncMgmt.syncSem); - } - taosWUnLockLatch(&pMnode->syncMgmt.lock); } SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) { @@ -276,7 +274,7 @@ int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) { pMgmt->errCode = 0; taosWLockLatch(&pMgmt->lock); if (pMgmt->transId != 0) { - mInfo("trans:%d, can't be proposed since trans:%s alrady waiting for confirm", transId, pMgmt->transId); + mError("trans:%d, can't be proposed since trans:%s alrady waiting for confirm", transId, pMgmt->transId); taosWUnLockLatch(&pMgmt->lock); terrno = TSDB_CODE_APP_NOT_READY; return -1; diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index c7b5ce052d..9703ed27ae 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -688,6 +688,7 @@ static void vnodeBecomeFollower(struct SSyncFSM *pFsm) { taosThreadMutexLock(&pVnode->lock); if (pVnode->blocked) { pVnode->blocked = false; + vDebug("vgId:%d, become follower and post block", pVnode->config.vgId); tsem_post(&pVnode->syncSem); } taosThreadMutexUnlock(&pVnode->lock); @@ -697,12 +698,12 @@ static void vnodeBecomeLeader(struct SSyncFSM *pFsm) { SVnode *pVnode = pFsm->data; vDebug("vgId:%d, become leader", pVnode->config.vgId); - taosThreadMutexLock(&pVnode->lock); - if (pVnode->blocked) { - pVnode->blocked = false; - tsem_post(&pVnode->syncSem); - } - taosThreadMutexUnlock(&pVnode->lock); + // taosThreadMutexLock(&pVnode->lock); + // if (pVnode->blocked) { + // pVnode->blocked = false; + // tsem_post(&pVnode->syncSem); + // } + // taosThreadMutexUnlock(&pVnode->lock); } static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) {