From 490caa967ba71b7af0a67675adadeceeb2cfeaa5 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Wed, 9 Jun 2021 20:31:11 +0800 Subject: [PATCH 1/3] [TD-4599]: fix false dnode offline --- src/balance/src/bnMain.c | 8 ++++---- src/balance/src/bnThread.c | 12 ++++++------ src/mnode/inc/mnodeDef.h | 3 +-- src/mnode/inc/mnodeDnode.h | 2 +- src/mnode/src/mnodeDnode.c | 12 ++++++------ 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/balance/src/bnMain.c b/src/balance/src/bnMain.c index 3055f77e81..25f316cb5e 100644 --- a/src/balance/src/bnMain.c +++ b/src/balance/src/bnMain.c @@ -405,7 +405,7 @@ void bnReset() { if (pDnode == NULL) break; // while master change, should reset dnode to offline - mInfo("dnode:%d set access:%d to 0", pDnode->dnodeId, pDnode->lastAccess); + mInfo("dnode:%d set access:%" PRId64 " to 0", pDnode->dnodeId, pDnode->lastAccess); pDnode->lastAccess = 0; if (pDnode->status != TAOS_DN_STATUS_DROPPING) { pDnode->status = TAOS_DN_STATUS_OFFLINE; @@ -499,7 +499,7 @@ static bool bnMontiorDropping() { if (dnodeIsMasterEp(pDnode->dnodeEp)) continue; if (mnodeGetDnodesNum() <= 1) continue; - mLInfo("dnode:%d, set to removing state for it offline:%d seconds", pDnode->dnodeId, + mLInfo("dnode:%d, set to removing state for it offline:%" PRId64 " seconds", pDnode->dnodeId, tsAccessSquence - pDnode->lastAccess); pDnode->status = TAOS_DN_STATUS_DROPPING; @@ -574,8 +574,8 @@ void bnCheckStatus() { if (pDnode->status != TAOS_DN_STATUS_DROPPING && pDnode->status != TAOS_DN_STATUS_OFFLINE) { pDnode->status = TAOS_DN_STATUS_OFFLINE; pDnode->offlineReason = TAOS_DN_OFF_STATUS_MSG_TIMEOUT; - mInfo("dnode:%d, set to offline state, access seq:%d last seq:%d laststat:%d", pDnode->dnodeId, tsAccessSquence, - pDnode->lastAccess, pDnode->status); + mInfo("dnode:%d, set to offline state, access seq:%" PRId64 " last seq:%" PRId64 " laststat:%d", pDnode->dnodeId, + tsAccessSquence, pDnode->lastAccess, pDnode->status); bnSetVgroupOffline(pDnode); bnStartTimer(3000); } diff --git a/src/balance/src/bnThread.c b/src/balance/src/bnThread.c index d07591ecd5..f39b82f2c6 100644 --- a/src/balance/src/bnThread.c +++ b/src/balance/src/bnThread.c @@ -101,13 +101,13 @@ static void bnProcessTimer(void *handle, void *tmrId) { if (!sdbIsMaster()) return; if (tsBnThread.stop) return; - tsBnThread.timer = NULL; - tsAccessSquence++; - - bnStartTimer(-1); - bnCheckStatus(); - if (handle == NULL) { + tsBnThread.timer = NULL; + ++tsAccessSquence; + + bnStartTimer(-1); + bnCheckStatus(); + if (tsAccessSquence % tsBalanceInterval == 0) { mDebug("balance function is scheduled by timer"); bnPostSignal(); diff --git a/src/mnode/inc/mnodeDef.h b/src/mnode/inc/mnodeDef.h index e052f34a33..c1f2ea7fd7 100644 --- a/src/mnode/inc/mnodeDef.h +++ b/src/mnode/inc/mnodeDef.h @@ -48,9 +48,8 @@ typedef struct SDnodeObj { int32_t dnodeId; int32_t openVnodes; int64_t createdTime; - int32_t resever0; // from dnode status msg, config information + int64_t lastAccess; int32_t customScore; // config by user - uint32_t lastAccess; uint16_t numOfCores; // from dnode status msg uint16_t dnodePort; char dnodeFqdn[TSDB_FQDN_LEN]; diff --git a/src/mnode/inc/mnodeDnode.h b/src/mnode/inc/mnodeDnode.h index fa1995254e..d357cd65b8 100644 --- a/src/mnode/inc/mnodeDnode.h +++ b/src/mnode/inc/mnodeDnode.h @@ -77,7 +77,7 @@ void * mnodeGetDnodeByEp(char *ep); void mnodeUpdateDnode(SDnodeObj *pDnode); int32_t mnodeDropDnode(SDnodeObj *pDnode, void *pMsg); -extern int32_t tsAccessSquence; +extern int64_t tsAccessSquence; #ifdef __cplusplus } diff --git a/src/mnode/src/mnodeDnode.c b/src/mnode/src/mnodeDnode.c index 8a5d24c474..fb775d92d8 100644 --- a/src/mnode/src/mnodeDnode.c +++ b/src/mnode/src/mnodeDnode.c @@ -39,8 +39,8 @@ #include "mnodePeer.h" #include "mnodeCluster.h" -int32_t tsAccessSquence = 0; -int64_t tsDnodeRid = -1; +int64_t tsAccessSquence = 0; +int64_t tsDnodeRid = -1; static void * tsDnodeSdb = NULL; static int32_t tsDnodeUpdateSize = 0; extern void * tsMnodeSdb; @@ -567,7 +567,7 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { mnodeGetClusterId()); return TSDB_CODE_MND_INVALID_CLUSTER_ID; } else { - mTrace("dnode:%d, status received, access times %d openVnodes:%d:%d", pDnode->dnodeId, pDnode->lastAccess, + mTrace("dnode:%d, status received, access times %" PRId64 " openVnodes:%d:%d", pDnode->dnodeId, pDnode->lastAccess, htons(pStatus->openVnodes), pDnode->openVnodes); } } @@ -629,9 +629,9 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { bnNotify(); } - if (!tsEnableBalance) { - int32_t numOfMnodes = mnodeGetMnodesNum(); - if (numOfMnodes < tsNumOfMnodes) bnNotify(); + int32_t numOfMnodes = mnodeGetMnodesNum(); + if (numOfMnodes < tsNumOfMnodes && numOfMnodes < mnodeGetOnlineDnodesNum()) { + bnNotify(); } if (openVnodes != pDnode->openVnodes) { From 935d41b1f77caff075e2c14d0055e1eb872315f4 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Thu, 10 Jun 2021 09:53:51 +0800 Subject: [PATCH 2/3] bnThread: dummy commit to make CI happy --- src/balance/src/bnThread.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/balance/src/bnThread.c b/src/balance/src/bnThread.c index f39b82f2c6..b5043c19bb 100644 --- a/src/balance/src/bnThread.c +++ b/src/balance/src/bnThread.c @@ -122,8 +122,7 @@ static void bnProcessTimer(void *handle, void *tmrId) { void bnStartTimer(int32_t mseconds) { if (tsBnThread.stop) return; - bool updateSoon = (mseconds != -1); - if (updateSoon) { + if (mseconds != -1) { mTrace("balance function will be called after %d ms", mseconds); taosTmrReset(bnProcessTimer, mseconds, (void *)(int64_t)mseconds, tsMnodeTmr, &tsBnThread.timer); } else { @@ -132,5 +131,5 @@ void bnStartTimer(int32_t mseconds) { } void bnNotify() { - bnStartTimer(500); + bnStartTimer(10); } From b0fb8f9dc6af18759eb23888e2ebdb8c5b88e1c1 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Thu, 10 Jun 2021 13:32:42 +0800 Subject: [PATCH 3/3] balance: remove unused pVgroup->lbTime checking --- src/balance/src/bnMain.c | 4 ---- src/balance/src/bnThread.c | 12 ++++++------ src/mnode/src/mnodeDnode.c | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/balance/src/bnMain.c b/src/balance/src/bnMain.c index 25f316cb5e..67741b1473 100644 --- a/src/balance/src/bnMain.c +++ b/src/balance/src/bnMain.c @@ -220,10 +220,6 @@ int32_t bnAllocVnodes(SVgObj *pVgroup) { } static bool bnCheckVgroupReady(SVgObj *pVgroup, SVnodeGid *pRmVnode) { - if (pVgroup->lbTime + 5 * tsStatusInterval > tsAccessSquence) { - return false; - } - int32_t rmVnodeVer = 0; for (int32_t i = 0; i < pVgroup->numOfVnodes; ++i) { SVnodeGid *pVnode = pVgroup->vnodeGid + i; diff --git a/src/balance/src/bnThread.c b/src/balance/src/bnThread.c index b5043c19bb..44cb24effa 100644 --- a/src/balance/src/bnThread.c +++ b/src/balance/src/bnThread.c @@ -101,12 +101,12 @@ static void bnProcessTimer(void *handle, void *tmrId) { if (!sdbIsMaster()) return; if (tsBnThread.stop) return; - if (handle == NULL) { - tsBnThread.timer = NULL; - ++tsAccessSquence; + tsBnThread.timer = NULL; + bnStartTimer(-1); + bnCheckStatus(); - bnStartTimer(-1); - bnCheckStatus(); + if (handle == NULL) { + ++tsAccessSquence; if (tsAccessSquence % tsBalanceInterval == 0) { mDebug("balance function is scheduled by timer"); @@ -131,5 +131,5 @@ void bnStartTimer(int32_t mseconds) { } void bnNotify() { - bnStartTimer(10); + bnStartTimer(500); } diff --git a/src/mnode/src/mnodeDnode.c b/src/mnode/src/mnodeDnode.c index fb775d92d8..70a63517ca 100644 --- a/src/mnode/src/mnodeDnode.c +++ b/src/mnode/src/mnodeDnode.c @@ -630,7 +630,7 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { } int32_t numOfMnodes = mnodeGetMnodesNum(); - if (numOfMnodes < tsNumOfMnodes && numOfMnodes < mnodeGetOnlineDnodesNum()) { + if (numOfMnodes < tsNumOfMnodes && numOfMnodes < mnodeGetOnlineDnodesNum() && !pDnode->isMgmt) { bnNotify(); }