From e28171454349fbcd770ab794d14b0d955b924d4a Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Mon, 24 Oct 2022 18:14:57 +0800 Subject: [PATCH 1/3] enh: redistribute vgroup --- source/dnode/mnode/impl/src/mndVgroup.c | 42 ++++++++++++++++++------- tests/script/jenkins/basic.txt | 2 +- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndVgroup.c b/source/dnode/mnode/impl/src/mndVgroup.c index 3f2c6ac2a8..2d9023c526 100644 --- a/source/dnode/mnode/impl/src/mndVgroup.c +++ b/source/dnode/mnode/impl/src/mndVgroup.c @@ -1185,9 +1185,21 @@ static int32_t mndAddIncVgroupReplicaToTrans(SMnode *pMnode, STrans *pTrans, SDb pGid->dnodeId = newDnodeId; pGid->syncState = TAOS_SYNC_STATE_ERROR; - if (mndAddCreateVnodeAction(pMnode, pTrans, pDb, pVgroup, pGid) != 0) return -1; - if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, -1) != 0) return -1; - if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + if (pVgroup->replica == 2) { + if (mndAddCreateVnodeAction(pMnode, pTrans, pDb, pVgroup, pGid) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[0].dnodeId) != 0) return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + } else if (pVgroup->replica == 4) { + if (mndAddCreateVnodeAction(pMnode, pTrans, pDb, pVgroup, pGid) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[0].dnodeId) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[1].dnodeId) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[2].dnodeId) != 0) return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + } else { + mError("vgId:%d, failed to add 1 vnode since invalid replica:%d", pVgroup->vgId, pVgroup->replica); + terrno = TSDB_CODE_MND_APP_ERROR; + return -1; + } return 0; } @@ -1212,9 +1224,21 @@ static int32_t mndAddDecVgroupReplicaFromTrans(SMnode *pMnode, STrans *pTrans, S memcpy(pGid, &pVgroup->vnodeGid[pVgroup->replica], sizeof(SVnodeGid)); memset(&pVgroup->vnodeGid[pVgroup->replica], 0, sizeof(SVnodeGid)); - if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, -1) != 0) return -1; - if (mndAddDropVnodeAction(pMnode, pTrans, pDb, pVgroup, &delGid, true) != 0) return -1; - if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + if (pVgroup->replica == 1) { + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[0].dnodeId) != 0) return -1; + if (mndAddDropVnodeAction(pMnode, pTrans, pDb, pVgroup, &delGid, true) != 0) return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + } else if (pVgroup->replica == 3) { + if (mndAddDropVnodeAction(pMnode, pTrans, pDb, pVgroup, &delGid, true) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[0].dnodeId) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[1].dnodeId) != 0) return -1; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pDb, pVgroup, pVgroup->vnodeGid[2].dnodeId) != 0) return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pDb, pVgroup) != 0) return -1; + } else { + mError("vgId:%d, failed to remove 1 vnode since invalid replica:%d", pVgroup->vgId, pVgroup->replica); + terrno = TSDB_CODE_MND_APP_ERROR; + return -1; + } return 0; } @@ -1334,9 +1358,6 @@ _OVER: } static int32_t mndProcessRedistributeVgroupMsg(SRpcMsg *pReq) { -#if 1 - return TSDB_CODE_OPS_NOT_SUPPORT; -#else SMnode *pMnode = pReq->info.node; SDnodeObj *pNew1 = NULL; SDnodeObj *pNew2 = NULL; @@ -1530,7 +1551,6 @@ _OVER: mndReleaseDb(pMnode, pDb); return code; -#endif } static int32_t mndCheckDnodeMemory(SMnode *pMnode, SDbObj *pOldDb, SDbObj *pNewDb, SVgObj *pOldVgroup, @@ -1868,7 +1888,7 @@ static int32_t mndProcessBalanceVgroupMsg(SRpcMsg *pReq) { SMnode *pMnode = pReq->info.node; int32_t code = -1; SArray *pArray = NULL; - void *pIter = NULL; + void *pIter = NULL; int64_t curMs = taosGetTimestampMs(); SBalanceVgroupReq req = {0}; diff --git a/tests/script/jenkins/basic.txt b/tests/script/jenkins/basic.txt index d5f19cc62f..50019cd837 100644 --- a/tests/script/jenkins/basic.txt +++ b/tests/script/jenkins/basic.txt @@ -50,7 +50,7 @@ # unsupport ./test.sh -f tsim/dnode/drop_dnode_has_multi_vnode_replica1.sim # unsupport ./test.sh -f tsim/dnode/drop_dnode_has_multi_vnode_replica3.sim ./test.sh -f tsim/dnode/offline_reason.sim -# unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica1.sim +./test.sh -f tsim/dnode/redistribute_vgroup_replica1.sim # unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_leader.sim # unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_follower.sim # unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v2.sim From b508c2511840fd5aaeed6a8b11df9631ff5c4e2e Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Tue, 25 Oct 2022 15:55:19 +0800 Subject: [PATCH 2/3] enh: redistribute vgroup --- source/dnode/mgmt/mgmt_vnode/src/vmWorker.c | 4 ++++ source/dnode/mnode/impl/src/mndVgroup.c | 10 +++++++++- source/util/src/tqueue.c | 2 ++ tests/script/jenkins/basic.txt | 8 ++++---- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c index 76d181761b..d5b1e18c7b 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c @@ -307,6 +307,10 @@ int32_t vmGetQueueSize(SVnodeMgmt *pMgmt, int32_t vgId, EQueueType qtype) { } vmReleaseVnode(pMgmt, pVnode); } + if (size < 0) { + dError("vgId:%d, can't get size from queue since %s, qtype:%d", vgId, terrstr(), qtype); + size = 0; + } return size; } diff --git a/source/dnode/mnode/impl/src/mndVgroup.c b/source/dnode/mnode/impl/src/mndVgroup.c index 2d9023c526..23397cedc6 100644 --- a/source/dnode/mnode/impl/src/mndVgroup.c +++ b/source/dnode/mnode/impl/src/mndVgroup.c @@ -260,6 +260,12 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg return NULL; } + mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d strict:%d", createReq.vgId, createReq.replica, + createReq.selfIndex, createReq.strict); + for (int32_t i = 0; i < createReq.replica; ++i) { + mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.replicas[i].fqdn, createReq.replicas[i].port); + } + int32_t contLen = tSerializeSCreateVnodeReq(NULL, 0, &createReq); if (contLen < 0) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -293,6 +299,7 @@ static void *mndBuildAlterVnodeConfigReq(SMnode *pMnode, SDbObj *pDb, SVgObj *pV alterReq.strict = pDb->cfg.strict; alterReq.cacheLast = pDb->cfg.cacheLast; + mInfo("vgId:%d, build alter vnode config req", pVgroup->vgId); int32_t contLen = tSerializeSAlterVnodeConfigReq(NULL, 0, &alterReq); if (contLen < 0) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -342,7 +349,7 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p } } alterReq.replica = pVgroup->replica; - mInfo("vgId:%d, start to alter vnode, replica:%d selfIndex:%d strict:%d", alterReq.vgId, alterReq.replica, + mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d strict:%d", alterReq.vgId, alterReq.replica, alterReq.selfIndex, alterReq.strict); for (int32_t i = 0; i < alterReq.replica; ++i) { mInfo("vgId:%d, replica:%d ep:%s:%u", alterReq.vgId, i, alterReq.replicas[i].fqdn, alterReq.replicas[i].port); @@ -377,6 +384,7 @@ void *mndBuildDropVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVgOb memcpy(dropReq.db, pDb->name, TSDB_DB_FNAME_LEN); dropReq.dbUid = pDb->uid; + mInfo("vgId:%d, build drop vnode req", dropReq.vgId); int32_t contLen = tSerializeSDropVnodeReq(NULL, 0, &dropReq); if (contLen < 0) { terrno = TSDB_CODE_OUT_OF_MEMORY; diff --git a/source/util/src/tqueue.c b/source/util/src/tqueue.c index f1f926c0b7..19b9b89cab 100644 --- a/source/util/src/tqueue.c +++ b/source/util/src/tqueue.c @@ -137,6 +137,8 @@ int32_t taosQueueItemSize(STaosQueue *queue) { taosThreadMutexLock(&queue->mutex); int32_t numOfItems = queue->numOfItems; taosThreadMutexUnlock(&queue->mutex); + + uTrace("queue:%p, numOfItems:%d memOfItems:%" PRId64, queue, queue->numOfItems, queue->memOfItems); return numOfItems; } diff --git a/tests/script/jenkins/basic.txt b/tests/script/jenkins/basic.txt index 50019cd837..e8b8bce5c2 100644 --- a/tests/script/jenkins/basic.txt +++ b/tests/script/jenkins/basic.txt @@ -51,10 +51,10 @@ # unsupport ./test.sh -f tsim/dnode/drop_dnode_has_multi_vnode_replica3.sim ./test.sh -f tsim/dnode/offline_reason.sim ./test.sh -f tsim/dnode/redistribute_vgroup_replica1.sim -# unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_leader.sim -# unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_follower.sim -# unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v2.sim -# unsupport ./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v3.sim +./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_leader.sim +./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v1_follower.sim +./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v2.sim +./test.sh -f tsim/dnode/redistribute_vgroup_replica3_v3.sim # unsupport ./test.sh -f tsim/dnode/vnode_clean.sim ./test.sh -f tsim/dnode/use_dropped_dnode.sim From eae23f51abf9e260eff348bb8a1d37f8ac463b7f Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Tue, 25 Oct 2022 16:23:30 +0800 Subject: [PATCH 3/3] fix: wait more times if database not ready --- source/dnode/mnode/impl/src/mndTrans.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 6f430ad3e4..6a606a1a7e 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -918,13 +918,19 @@ static void mndTransSendRpcRsp(SMnode *pMnode, STrans *pTrans) { sendRsp = true; } } else { - if (pTrans->stage == TRN_STAGE_REDO_ACTION && pTrans->failedTimes > 6) { + if (pTrans->stage == TRN_STAGE_REDO_ACTION && ((code == TSDB_CODE_APP_NOT_READY && pTrans->failedTimes > 60) || + (code != TSDB_CODE_APP_NOT_READY && pTrans->failedTimes > 6))) { if (code == 0) code = TSDB_CODE_MND_TRANS_UNKNOW_ERROR; sendRsp = true; } } - if (!sendRsp) return; + if (!sendRsp) { + return; + } else { + mInfo("trans:%d, send rsp, stage:%s failedTimes:%d code:0x%x", pTrans->id, mndTransStr(pTrans->stage), + pTrans->failedTimes, code); + } int32_t size = taosArrayGetSize(pTrans->pRpcArray); if (size <= 0) return;