From e8c11c58f0086377cb44497ff46432f176103867 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 3 Mar 2023 10:20:52 +0800 Subject: [PATCH 01/30] fix(tq): add some logs. --- source/client/src/clientMain.c | 10 +++++- source/client/src/clientTmq.c | 41 ++++++++++++++--------- source/common/src/tmsg.c | 2 +- source/dnode/mnode/impl/src/mndConsumer.c | 13 +++---- source/dnode/mnode/impl/src/mndTrans.c | 10 +++--- source/dnode/vnode/src/tq/tq.c | 9 +++-- 6 files changed, 56 insertions(+), 29 deletions(-) diff --git a/source/client/src/clientMain.c b/source/client/src/clientMain.c index 4ba51ce50d..5a3986257c 100644 --- a/source/client/src/clientMain.c +++ b/source/client/src/clientMain.c @@ -271,6 +271,8 @@ TAOS_ROW taos_fetch_row(TAOS_RES *res) { SReqResultInfo *pResultInfo; if (msg->resIter == -1) { pResultInfo = tmqGetNextResInfo(res, true); + tscDebug("consumer:0x%" PRIx64 ", vgId:%d, numOfRows:%" PRId64 ", total rows:%" PRId64, msg->rsp.head.consumerId, + msg->vgId, pResultInfo->numOfRows, pResultInfo->totalRows); } else { pResultInfo = tmqGetCurResInfo(res); } @@ -281,7 +283,13 @@ TAOS_ROW taos_fetch_row(TAOS_RES *res) { return pResultInfo->row; } else { pResultInfo = tmqGetNextResInfo(res, true); - if (pResultInfo == NULL) return NULL; + if (pResultInfo == NULL) { + return NULL; + } + + tscDebug("consumer:0x%" PRIx64 " vgId:%d, numOfRows:%" PRId64 ", total rows:%" PRId64, msg->rsp.head.consumerId, + msg->vgId, pResultInfo->numOfRows, pResultInfo->totalRows); + doSetOneRowPtr(pResultInfo); pResultInfo->current += 1; return pResultInfo->row; diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index d913b32642..40eccd57dd 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -192,6 +192,7 @@ typedef struct { SMqClientTopic* pTopic; int32_t vgId; tsem_t rspSem; + uint64_t requestId; // request id for debug purpose } SMqPollCbParam; typedef struct { @@ -1054,7 +1055,7 @@ tmq_t* tmq_consumer_new(tmq_conf_t* conf, char* errstr, int32_t errstrLen) { pTmq->hbLiveTimer = taosTmrStart(tmqSendHbReq, 1000, pRefId, tmqMgmt.timer); } - tscInfo("consumer:0x%" PRIx64 " is setup, consumer groupId %s", pTmq->consumerId, pTmq->groupId); + tscInfo("consumer:0x%" PRIx64 " is setup, groupId:%s", pTmq->consumerId, pTmq->groupId); return pTmq; FAIL: @@ -1224,6 +1225,7 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { atomic_store_8(&tmq->status, TMQ_CONSUMER_STATUS__RECOVER); goto CREATE_MSG_FAIL; } + if (code == TSDB_CODE_TQ_NO_COMMITTED_OFFSET) { SMqPollRspWrapper* pRspWrapper = taosAllocateQitem(sizeof(SMqPollRspWrapper), DEF_QITEM, 0); if (pRspWrapper == NULL) { @@ -1277,10 +1279,9 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { tDecoderClear(&decoder); memcpy(&pRspWrapper->dataRsp, pMsg->pData, sizeof(SMqRspHead)); - tscDebug("consumer:0x%" PRIx64 ", recv poll: vgId:%d, req offset %" PRId64 ", rsp offset %" PRId64 " type %d", + tscDebug("consumer:0x%" PRIx64 " recv poll rsp, vgId:%d, req offset:%" PRId64 ", rsp offset:%" PRId64 " type %d, reqId:0x%"PRIx64, tmq->consumerId, pVg->vgId, pRspWrapper->dataRsp.reqOffset.version, pRspWrapper->dataRsp.rspOffset.version, - rspType); - + rspType, pParam->requestId); } else if (rspType == TMQ_MSG_TYPE__POLL_META_RSP) { SDecoder decoder; tDecoderInit(&decoder, POINTER_SHIFT(pMsg->pData, sizeof(SMqRspHead)), pMsg->len - sizeof(SMqRspHead)); @@ -1298,7 +1299,7 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { taosMemoryFree(pMsg->pData); taosMemoryFree(pMsg->pEpSet); - tscDebug("consumer:0x%" PRIx64 ", put poll res into mqueue %p", tmq->consumerId, pRspWrapper); + tscDebug("consumer:0x%" PRIx64 ", put poll res into mqueue, total in queue:%d", tmq->consumerId, tmq->mqueue->numOfItems); taosWriteQitem(tmq->mqueue, pRspWrapper); tsem_post(&tmq->rspSem); @@ -1344,7 +1345,7 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, const SMqAskEpRsp* pRsp) { sprintf(vgKey, "%s:%d", pTopicCur->topicName, pVgCur->vgId); char buf[80]; tFormatOffset(buf, 80, &pVgCur->currentOffset); - tscDebug("consumer:0x%" PRIx64 ", epoch %d vgId:%d vgKey is %s, offset is %s", tmq->consumerId, epoch, + tscDebug("consumer:0x%" PRIx64 ", epoch:%d vgId:%d vgKey:%s, offset:%s", tmq->consumerId, epoch, pVgCur->vgId, vgKey, buf); taosHashPut(pHash, vgKey, strlen(vgKey), &pVgCur->currentOffset, sizeof(STqOffsetVal)); } @@ -1552,7 +1553,7 @@ int32_t tmqAskEp(tmq_t* tmq, bool async) { .handle = NULL, }; - sendInfo->requestId = tmq->consumerId; + sendInfo->requestId = generateRequestId(); sendInfo->requestObjRefId = 0; sendInfo->param = pParam; sendInfo->fp = tmqAskEpCb; @@ -1560,7 +1561,7 @@ int32_t tmqAskEp(tmq_t* tmq, bool async) { SEpSet epSet = getEpSet_s(&tmq->pTscObj->pAppInfo->mgmtEp); tscDebug("consumer:0x%" PRIx64 " ask ep from mnode, async:%d, reqId:0x%" PRIx64, tmq->consumerId, async, - tmq->consumerId); + sendInfo->requestId); int64_t transporterId = 0; asyncSendMsgToServer(tmq->pTscObj->pAppInfo->pTransporter, &epSet, &transporterId, sendInfo); @@ -1674,6 +1675,7 @@ static int32_t doTmqPollImpl(tmq_t* pTmq, SMqClientTopic* pTopic, SMqClientVg* p pParam->pVg = pVg; // pVg may be released,fix it pParam->pTopic = pTopic; pParam->vgId = pVg->vgId; + pParam->requestId = req.reqId; SMsgSendInfo* sendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); if (sendInfo == NULL) { @@ -1698,7 +1700,7 @@ static int32_t doTmqPollImpl(tmq_t* pTmq, SMqClientTopic* pTopic, SMqClientVg* p char offsetFormatBuf[80]; tFormatOffset(offsetFormatBuf, tListLen(offsetFormatBuf), &pVg->currentOffset); - tscDebug("consumer:0x%" PRIx64 " send poll to %s vgId:%d, epoch %d, req offset:%s, reqId:0x%" PRIx64, + tscDebug("consumer:0x%" PRIx64 " send poll to %s vgId:%d, epoch %d, req:%s, reqId:0x%" PRIx64, pTmq->consumerId, pTopic->topicName, pVg->vgId, pTmq->epoch, offsetFormatBuf, req.reqId); asyncSendMsgToServer(pTmq->pTscObj->pAppInfo->pTransporter, &pVg->epSet, &transporterId, sendInfo); @@ -1722,10 +1724,9 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t timeout) { int32_t vgStatus = atomic_val_compare_exchange_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE, TMQ_VG_STATUS__WAIT); if (vgStatus == TMQ_VG_STATUS__WAIT) { int32_t vgSkipCnt = atomic_add_fetch_32(&pVg->vgSkipCnt, 1); - tscDebug("consumer:0x%" PRIx64 " epoch %d wait poll-rsp, skip vgId:%d skip cnt %d", tmq->consumerId, tmq->epoch, + tscTrace("consumer:0x%" PRIx64 " epoch %d wait poll-rsp, skip vgId:%d skip cnt %d", tmq->consumerId, tmq->epoch, pVg->vgId, vgSkipCnt); continue; - /*if (vgSkipCnt < 10000) continue;*/ #if 0 if (skipCnt < 30000) { continue; @@ -1767,7 +1768,7 @@ int32_t tmqHandleNoPollRsp(tmq_t* tmq, SMqRspWrapper* rspWrapper, bool* pReset) } void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout, bool pollIfReset) { - tscDebug("consumer:0x%" PRIx64 " start to handle the rsp", tmq->consumerId); + tscDebug("consumer:0x%" PRIx64 " start to handle the rsp, total:%d", tmq->consumerId, tmq->qall->numOfItems); while (1) { SMqRspWrapper* rspWrapper = NULL; @@ -1785,25 +1786,33 @@ void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout, bool pollIfReset) { if (rspWrapper->tmqRspType == TMQ_MSG_TYPE__END_RSP) { taosFreeQitem(rspWrapper); terrno = TSDB_CODE_TQ_NO_COMMITTED_OFFSET; + tscError("consumer:0x%" PRIx64 " unexpected rsp from poll, code:%s", tmq->consumerId, tstrerror(terrno)); return NULL; } else if (rspWrapper->tmqRspType == TMQ_MSG_TYPE__POLL_RSP) { SMqPollRspWrapper* pollRspWrapper = (SMqPollRspWrapper*)rspWrapper; - tscDebug("consumer:0x%" PRIx64 " process poll rsp", tmq->consumerId); + /*atomic_sub_fetch_32(&tmq->readyRequest, 1);*/ int32_t consumerEpoch = atomic_load_32(&tmq->epoch); if (pollRspWrapper->dataRsp.head.epoch == consumerEpoch) { SMqClientVg* pVg = pollRspWrapper->vgHandle; - /*printf("vgId:%d, offset %" PRId64 " up to %" PRId64 "\n", pVg->vgId, pVg->currentOffset, - * rspMsg->msg.rspOffset);*/ pVg->currentOffset = pollRspWrapper->dataRsp.rspOffset; atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); + if (pollRspWrapper->dataRsp.blockNum == 0) { + tscDebug("consumer:0x%" PRIx64 " empty block received in poll rsp", tmq->consumerId); + taosFreeQitem(pollRspWrapper); rspWrapper = NULL; continue; } + // build rsp + char buf[80]; + tFormatOffset(buf, 80, &pVg->currentOffset); SMqRspObj* pRsp = tmqBuildRspFromWrapper(pollRspWrapper); + tscDebug("consumer:0x%" PRIx64 " process poll rsp, vgId:%d, offset:%s, blocks:%d", tmq->consumerId, + pVg->vgId, buf, pollRspWrapper->dataRsp.blockNum); + taosFreeQitem(pollRspWrapper); return pRsp; } else { @@ -1876,6 +1885,8 @@ void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout, bool pollIfReset) { } } } + + tscDebug("consumer:0x%" PRIx64 " handle the rsp completed", tmq->consumerId); } TAOS_RES* tmq_consumer_poll(tmq_t* tmq, int64_t timeout) { diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c index e180959d1e..b9b5f3f5dd 100644 --- a/source/common/src/tmsg.c +++ b/source/common/src/tmsg.c @@ -6570,7 +6570,7 @@ int32_t tFormatOffset(char *buf, int32_t maxLen, const STqOffsetVal *pVal) { } else if (pVal->type == TMQ_OFFSET__LOG) { snprintf(buf, maxLen, "offset(log) ver:%" PRId64, pVal->version); } else if (pVal->type == TMQ_OFFSET__SNAPSHOT_DATA || pVal->type == TMQ_OFFSET__SNAPSHOT_META) { - snprintf(buf, maxLen, "offset(ss data) uid:%" PRId64 ", ts:%" PRId64, pVal->uid, pVal->ts); + snprintf(buf, maxLen, "offset(snapshot) uid:%" PRId64 " ts:%" PRId64, pVal->uid, pVal->ts); } else { ASSERT(0); } diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 280f3b0ecc..1b86c28a67 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -29,7 +29,7 @@ #define MND_CONSUMER_LOST_HB_CNT 3 #define MND_CONSUMER_LOST_CLEAR_THRESHOLD 43200 -static int8_t mqRebInExecCnt = 0; +static int32_t mqRebInExecCnt = 0; static const char *mndConsumerStatusName(int status); @@ -76,15 +76,16 @@ int32_t mndInitConsumer(SMnode *pMnode) { void mndCleanupConsumer(SMnode *pMnode) {} bool mndRebTryStart() { - int8_t old = atomic_val_compare_exchange_8(&mqRebInExecCnt, 0, 1); + int32_t old = atomic_val_compare_exchange_32(&mqRebInExecCnt, 0, 1); + mInfo("tq timer, rebalance counter old val:%d", old); return old == 0; } -void mndRebEnd() { atomic_sub_fetch_8(&mqRebInExecCnt, 1); } +void mndRebEnd() { int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance end, rebalance counter:%d", val); } -void mndRebCntInc() { atomic_add_fetch_8(&mqRebInExecCnt, 1); } +void mndRebCntInc() { int32_t val = atomic_add_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance trans start, rebalance count:%d", val);} -void mndRebCntDec() { atomic_sub_fetch_8(&mqRebInExecCnt, 1); } +void mndRebCntDec() { int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance trans end, rebalance count:%d", val); } static int32_t mndProcessConsumerLostMsg(SRpcMsg *pMsg) { SMnode *pMnode = pMsg->info.node; @@ -334,7 +335,7 @@ static int32_t mndProcessMqTimerMsg(SRpcMsg *pMsg) { } else { taosHashCleanup(pRebMsg->rebSubHash); rpcFreeCont(pRebMsg); - mTrace("mq rebalance finished, no modification"); + mInfo("mq rebalance finished, no modification"); mndRebEnd(); } return 0; diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 55e9faf020..7721dda576 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -503,7 +503,8 @@ static TransCbFp mndTransGetCbFp(ETrnFunc ftype) { } static int32_t mndTransActionInsert(SSdb *pSdb, STrans *pTrans) { - mTrace("trans:%d, perform insert action, row:%p stage:%s", pTrans->id, pTrans, mndTransStr(pTrans->stage)); + mInfo("trans:%d, perform insert action, row:%p stage:%s, startFunc:%d, callfunc:1", pTrans->id, pTrans, mndTransStr(pTrans->stage), + pTrans->startFunc); if (pTrans->startFunc > 0) { TransCbFp fp = mndTransGetCbFp(pTrans->startFunc); @@ -546,8 +547,9 @@ static void mndTransDropData(STrans *pTrans) { } static int32_t mndTransActionDelete(SSdb *pSdb, STrans *pTrans, bool callFunc) { - mTrace("trans:%d, perform delete action, row:%p stage:%s callfunc:%d", pTrans->id, pTrans, mndTransStr(pTrans->stage), - callFunc); + mInfo("trans:%d, perform delete action, row:%p stage:%s callfunc:%d, stopFunc:%d", pTrans->id, pTrans, mndTransStr(pTrans->stage), + pTrans->stopFunc, callFunc); + if (pTrans->stopFunc > 0 && callFunc) { TransCbFp fp = mndTransGetCbFp(pTrans->stopFunc); if (fp) { @@ -572,7 +574,7 @@ static void mndTransUpdateActions(SArray *pOldArray, SArray *pNewArray) { } static int32_t mndTransActionUpdate(SSdb *pSdb, STrans *pOld, STrans *pNew) { - mTrace("trans:%d, perform update action, old row:%p stage:%s create:%" PRId64 ", new row:%p stage:%s create:%" PRId64, + mInfo("trans:%d, perform update action, old row:%p stage:%s create:%" PRId64 ", new row:%p stage:%s create:%" PRId64, pOld->id, pOld, mndTransStr(pOld->stage), pOld->createdTime, pNew, mndTransStr(pNew->stage), pNew->createdTime); diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index d07933c9f1..4d21a2e7f3 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -488,7 +488,7 @@ int32_t tqProcessPollReq(STQ* pTq, SRpcMsg* pMsg) { // 2. check rebalance if (pHandle->consumerId != consumerId) { - tqError("tmq poll: consumer:0x%" PRIx64 " vgId:%d, subkey %s, mismatch for saved handle consumer:0x%" PRIx64, + tqDebug("ERROR tmq poll: consumer:0x%" PRIx64 " vgId:%d, subkey %s, mismatch for saved handle consumer:0x%" PRIx64, consumerId, TD_VID(pTq->pVnode), req.subKey, pHandle->consumerId); terrno = TSDB_CODE_TMQ_CONSUMER_MISMATCH; return -1; @@ -822,6 +822,7 @@ int32_t tqProcessSubscribeReq(STQ* pTq, int64_t sversion, char* msg, int32_t msg pHandle = &tqHandle; /*taosInitRWLatch(&pExec->lock);*/ + uint64_t oldConsumerId = pHandle->consumerId; memcpy(pHandle->subKey, req.subKey, TSDB_SUBSCRIBE_KEY_LEN); pHandle->consumerId = req.newConsumerId; pHandle->epoch = -1; @@ -884,13 +885,16 @@ int32_t tqProcessSubscribeReq(STQ* pTq, int64_t sversion, char* msg, int32_t msg (SSnapContext**)(&handle.sContext)); pHandle->execHandle.task = qCreateQueueExecTaskInfo(NULL, &handle, NULL, NULL); } + taosHashPut(pTq->pHandle, req.subKey, strlen(req.subKey), pHandle, sizeof(STqHandle)); - tqDebug("try to persist handle %s consumer:0x%" PRIx64, req.subKey, pHandle->consumerId); + tqDebug("try to persist handle %s consumer:0x%" PRIx64" , old consumer:0x%"PRIx64, req.subKey, pHandle->consumerId, + oldConsumerId); if (tqMetaSaveHandle(pTq, req.subKey, pHandle) < 0) { return -1; } } else { // TODO handle qmsg and exec modification + tqInfo("update the consumer info, old consumer id:0x%"PRIx64", new Id:0x%"PRIx64, pHandle->consumerId, req.newConsumerId); atomic_store_32(&pHandle->epoch, -1); atomic_store_64(&pHandle->consumerId, req.newConsumerId); atomic_add_fetch_32(&pHandle->epoch, 1); @@ -898,6 +902,7 @@ int32_t tqProcessSubscribeReq(STQ* pTq, int64_t sversion, char* msg, int32_t msg if (pHandle->execHandle.subType == TOPIC_SUB_TYPE__COLUMN) { qStreamCloseTsdbReader(pHandle->execHandle.task); } + if (tqMetaSaveHandle(pTq, req.subKey, pHandle) < 0) { return -1; } From b26b1aa8fd459a5f9e858b3373852ed2d88750be Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 3 Mar 2023 10:30:23 +0800 Subject: [PATCH 02/30] fix: minor changes --- source/dnode/mnode/impl/src/mndTrans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 7721dda576..05368d3445 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -503,7 +503,7 @@ static TransCbFp mndTransGetCbFp(ETrnFunc ftype) { } static int32_t mndTransActionInsert(SSdb *pSdb, STrans *pTrans) { - mInfo("trans:%d, perform insert action, row:%p stage:%s, startFunc:%d, callfunc:1", pTrans->id, pTrans, mndTransStr(pTrans->stage), + mInfo("trans:%d, perform insert action, row:%p stage:%s, callfunc:1, startFunc:%d", pTrans->id, pTrans, mndTransStr(pTrans->stage), pTrans->startFunc); if (pTrans->startFunc > 0) { @@ -547,8 +547,8 @@ static void mndTransDropData(STrans *pTrans) { } static int32_t mndTransActionDelete(SSdb *pSdb, STrans *pTrans, bool callFunc) { - mInfo("trans:%d, perform delete action, row:%p stage:%s callfunc:%d, stopFunc:%d", pTrans->id, pTrans, mndTransStr(pTrans->stage), - pTrans->stopFunc, callFunc); + mInfo("trans:%d, perform delete action, row:%p stage:%s callfunc:%d, stopFunc:%d", pTrans->id, pTrans, + mndTransStr(pTrans->stage), callFunc, pTrans->stopFunc); if (pTrans->stopFunc > 0 && callFunc) { TransCbFp fp = mndTransGetCbFp(pTrans->stopFunc); From f30496d958e1bbe2ce35210cd0409776026b435a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 3 Mar 2023 10:30:45 +0800 Subject: [PATCH 03/30] refactor: update the consumerid --- source/dnode/mnode/impl/src/mndSubscribe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index 8544994c3e..bcddde52f7 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -427,10 +427,10 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR if (pIter == NULL) break; SMqConsumerEp *pConsumerEp = (SMqConsumerEp *)pIter; int32_t sz = taosArrayGetSize(pConsumerEp->vgs); - mInfo("sub:%s mq re-balance final cfg: consumer:0x%" PRId64 " has %d vg", sub, pConsumerEp->consumerId, sz); + mInfo("sub:%s mq re-balance final cfg: consumer:0x%" PRIx64 " has %d vg", sub, pConsumerEp->consumerId, sz); for (int32_t i = 0; i < sz; i++) { SMqVgEp *pVgEp = taosArrayGetP(pConsumerEp->vgs, i); - mInfo("sub:%s mq re-balance final cfg: vg %d to consumer:0x%" PRId64, sub, pVgEp->vgId, + mInfo("sub:%s mq re-balance final cfg: vg %d to consumer:0x%" PRIx64, sub, pVgEp->vgId, pConsumerEp->consumerId); } } From 8ffc397cf2682d98cef9b5bfae592d3472cda9b7 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 3 Mar 2023 10:35:13 +0800 Subject: [PATCH 04/30] fix: minor changes --- source/dnode/mnode/impl/src/mndSubscribe.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index 8544994c3e..fcd68d9001 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -224,7 +224,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR .pVgEp = pVgEp, }; taosHashPut(pHash, &pVgEp->vgId, sizeof(int32_t), &outputVg, sizeof(SMqRebOutputVg)); - mInfo("sub:%s mq re-balance remove vgId:%d from consumer:%" PRId64, sub, pVgEp->vgId, consumerId); + mInfo("sub:%s mq re-balance remove vgId:%d from consumer:%" PRIx64, sub, pVgEp->vgId, consumerId); } taosArrayDestroy(pConsumerEp->vgs); taosHashRemove(pOutput->pSub->consumerHash, &consumerId, sizeof(int64_t)); @@ -296,7 +296,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR .pVgEp = pVgEp, }; taosHashPut(pHash, &pVgEp->vgId, sizeof(int32_t), &outputVg, sizeof(SMqRebOutputVg)); - mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRId64 ",(first scan)", sub, pVgEp->vgId, + mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRIx64 ",(first scan)", sub, pVgEp->vgId, pConsumerEp->consumerId); } imbCnt++; @@ -311,7 +311,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR .pVgEp = pVgEp, }; taosHashPut(pHash, &pVgEp->vgId, sizeof(int32_t), &outputVg, sizeof(SMqRebOutputVg)); - mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRId64 ",(first scan)", sub, pVgEp->vgId, + mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRIx64 ",(first scan)", sub, pVgEp->vgId, pConsumerEp->consumerId); } } @@ -329,7 +329,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR newConsumerEp.vgs = taosArrayInit(0, sizeof(void *)); taosHashPut(pOutput->pSub->consumerHash, &consumerId, sizeof(int64_t), &newConsumerEp, sizeof(SMqConsumerEp)); taosArrayPush(pOutput->newConsumers, &consumerId); - mInfo("sub:%s mq rebalance add new consumer:%" PRId64, sub, consumerId); + mInfo("sub:%s mq rebalance add new consumer:%" PRIx64, sub, consumerId); } } @@ -357,7 +357,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR taosArrayPush(pConsumerEp->vgs, &pRebVg->pVgEp); pRebVg->newConsumerId = pConsumerEp->consumerId; taosArrayPush(pOutput->rebVgs, pRebVg); - mInfo("mq rebalance: add vgId:%d to consumer:%" PRId64 " (second scan) (not enough)", pRebVg->pVgEp->vgId, + mInfo("mq rebalance: add vgId:%d to consumer:%" PRIx64 " (second scan) (not enough)", pRebVg->pVgEp->vgId, pConsumerEp->consumerId); } } @@ -387,12 +387,12 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR taosArrayPush(pConsumerEp->vgs, &pRebVg->pVgEp); pRebVg->newConsumerId = pConsumerEp->consumerId; if (pRebVg->newConsumerId == pRebVg->oldConsumerId) { - mInfo("mq rebalance: skip vg %d for same consumer:%" PRId64 " (second scan)", pRebVg->pVgEp->vgId, + mInfo("mq rebalance: skip vg %d for same consumer:%" PRIx64 " (second scan)", pRebVg->pVgEp->vgId, pConsumerEp->consumerId); continue; } taosArrayPush(pOutput->rebVgs, pRebVg); - mInfo("mq rebalance: add vgId:%d to consumer:%" PRId64 " (second scan) (unassigned)", pRebVg->pVgEp->vgId, + mInfo("mq rebalance: add vgId:%d to consumer:%" PRIx64 " (second scan) (unassigned)", pRebVg->pVgEp->vgId, pConsumerEp->consumerId); } } else { @@ -427,10 +427,10 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR if (pIter == NULL) break; SMqConsumerEp *pConsumerEp = (SMqConsumerEp *)pIter; int32_t sz = taosArrayGetSize(pConsumerEp->vgs); - mInfo("sub:%s mq re-balance final cfg: consumer:0x%" PRId64 " has %d vg", sub, pConsumerEp->consumerId, sz); + mInfo("sub:%s mq re-balance final cfg: consumer:0x%" PRIx64 " has %d vg", sub, pConsumerEp->consumerId, sz); for (int32_t i = 0; i < sz; i++) { SMqVgEp *pVgEp = taosArrayGetP(pConsumerEp->vgs, i); - mInfo("sub:%s mq re-balance final cfg: vg %d to consumer:0x%" PRId64, sub, pVgEp->vgId, + mInfo("sub:%s mq re-balance final cfg: vg %d to consumer:0x%" PRIx64, sub, pVgEp->vgId, pConsumerEp->consumerId); } } @@ -1017,7 +1017,7 @@ int32_t mndRetrieveSubscribe(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBlock pColInfo = taosArrayGet(pBlock->pDataBlock, cols++); colDataSetVal(pColInfo, numOfRows, (const char *)&pConsumerEp->consumerId, false); - mDebug("mnd show subscriptions: topic %s, consumer %" PRId64 " cgroup %s vgid %d", varDataVal(topic), + mDebug("mnd show subscriptions: topic %s, consumer:%" PRIx64 " cgroup %s vgid %d", varDataVal(topic), pConsumerEp->consumerId, varDataVal(cgroup), pVgEp->vgId); // offset From dfeb3fb458a2dd0275251b79f99c17d8fbc15655 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 3 Mar 2023 13:45:04 +0800 Subject: [PATCH 05/30] refactor: update some logs. --- source/dnode/mnode/impl/src/mndConsumer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 1b86c28a67..2b5956f6a6 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -944,8 +944,9 @@ static int32_t mndConsumerActionUpdate(SSdb *pSdb, SMqConsumerObj *pOldConsumer, pOldConsumer->rebalanceTime = pNewConsumer->upTime; atomic_add_fetch_32(&pOldConsumer->epoch, 1); - mDebug("consumer:0x%" PRIx64 " state %s -> %s, new epoch:%d, reb-time:%" PRId64 ", current topics:%d", - pOldConsumer->consumerId, mndConsumerStatusName(status), mndConsumerStatusName(pOldConsumer->status), + mDebug("consumer:0x%" PRIx64 " state (%d)%s -> (%d)%s, new epoch:%d, reb-time:%" PRId64 ", current topics:%d", + pOldConsumer->consumerId, status, mndConsumerStatusName(status), pOldConsumer->status, + mndConsumerStatusName(pOldConsumer->status), pOldConsumer->epoch, pOldConsumer->rebalanceTime, (int)taosArrayGetSize(pOldConsumer->currentTopics)); } else if (pNewConsumer->updateType == CONSUMER_UPDATE__REMOVE) { /*A(taosArrayGetSize(pNewConsumer->rebNewTopics) == 0);*/ @@ -1003,8 +1004,9 @@ static int32_t mndConsumerActionUpdate(SSdb *pSdb, SMqConsumerObj *pOldConsumer, pOldConsumer->rebalanceTime = pNewConsumer->upTime; atomic_add_fetch_32(&pOldConsumer->epoch, 1); - mDebug("consumer:0x%" PRIx64 " state %s -> %s, new epoch:%d, reb-time:%" PRId64 ", current topics:%d", - pOldConsumer->consumerId, mndConsumerStatusName(status), mndConsumerStatusName(pOldConsumer->status), + mDebug("consumer:0x%" PRIx64 " state %d(%s) -> %d(%s), new epoch:%d, reb-time:%" PRId64 ", current topics:%d", + pOldConsumer->consumerId, status, mndConsumerStatusName(status), pOldConsumer->status, + mndConsumerStatusName(pOldConsumer->status), pOldConsumer->epoch, pOldConsumer->rebalanceTime, (int)taosArrayGetSize(pOldConsumer->currentTopics)); } From 6c68b36d7f1609cbf53d7b176448f364969bc834 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 3 Mar 2023 18:22:26 +0800 Subject: [PATCH 06/30] fix:change ask ep to 20s --- source/client/src/clientTmq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 40eccd57dd..826bbbd8a3 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1162,7 +1162,7 @@ int32_t tmq_subscribe(tmq_t* tmq, const tmq_list_t* topic_list) { int32_t retryCnt = 0; while (TSDB_CODE_MND_CONSUMER_NOT_READY == tmqAskEp(tmq, false)) { - if (retryCnt++ > 10) { + if (retryCnt++ > 40) { goto FAIL; } From 24bb33f24a11a8b55aa87f4d7f7a3f1841260aac Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 3 Mar 2023 18:34:38 +0800 Subject: [PATCH 07/30] refactor: add some logs. --- source/dnode/vnode/inc/vnode.h | 2 +- source/dnode/vnode/src/tq/tqRead.c | 8 ++++---- source/libs/executor/src/executor.c | 2 +- source/libs/executor/src/scanoperator.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/source/dnode/vnode/inc/vnode.h b/source/dnode/vnode/inc/vnode.h index acfaccafe2..1d14829891 100644 --- a/source/dnode/vnode/inc/vnode.h +++ b/source/dnode/vnode/inc/vnode.h @@ -264,7 +264,7 @@ int32_t tqReaderSetTbUidList(STqReader *pReader, const SArray *tbUidList); int32_t tqReaderAddTbUidList(STqReader *pReader, const SArray *tbUidList); int32_t tqReaderRemoveTbUidList(STqReader *pReader, const SArray *tbUidList); -int32_t tqSeekVer(STqReader *pReader, int64_t ver); +int32_t tqSeekVer(STqReader *pReader, int64_t ver, const char* id); int32_t tqNextBlock(STqReader *pReader, SFetchRet *ret); int32_t tqReaderSetSubmitReq2(STqReader *pReader, void *msgStr, int32_t msgLen, int64_t ver); diff --git a/source/dnode/vnode/src/tq/tqRead.c b/source/dnode/vnode/src/tq/tqRead.c index ee00cdc205..f4c96d073b 100644 --- a/source/dnode/vnode/src/tq/tqRead.c +++ b/source/dnode/vnode/src/tq/tqRead.c @@ -290,14 +290,14 @@ void tqCloseReader(STqReader* pReader) { taosMemoryFree(pReader); } -int32_t tqSeekVer(STqReader* pReader, int64_t ver) { +int32_t tqSeekVer(STqReader* pReader, int64_t ver, const char* id) { // todo set the correct vgId - tqDebug("tmq poll: vgId:%d wal seek to version:%"PRId64, 0, ver); + tqDebug("tmq poll: wal seek to version:%"PRId64" %s", ver, id); if (walReadSeekVer(pReader->pWalReader, ver) < 0) { - tqError("tmq poll: wal reader failed to seek to ver:%"PRId64, ver); + tqError("tmq poll: wal reader failed to seek to ver:%"PRId64" code:%s, %s", ver, tstrerror(terrno), id); return -1; } else { - tqDebug("tmq poll: wal reader seek to ver:%"PRId64, ver); + tqError("tmq poll: wal reader seek to ver:%"PRId64" %s", ver, id); return 0; } } diff --git a/source/libs/executor/src/executor.c b/source/libs/executor/src/executor.c index c2bf001c86..f24fe2d211 100644 --- a/source/libs/executor/src/executor.c +++ b/source/libs/executor/src/executor.c @@ -1068,7 +1068,7 @@ int32_t qStreamPrepareScan(qTaskInfo_t tinfo, STqOffsetVal* pOffset, int8_t subT tsdbReaderClose(pTSInfo->base.dataReader); pTSInfo->base.dataReader = NULL; // let's seek to the next version in wal file - if (tqSeekVer(pInfo->tqReader, pOffset->version + 1) < 0) { + if (tqSeekVer(pInfo->tqReader, pOffset->version + 1, pTaskInfo->id.str) < 0) { return -1; } } else if (pOffset->type == TMQ_OFFSET__SNAPSHOT_DATA) { diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index 7961d5518a..8576ac74c7 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1618,7 +1618,7 @@ static SSDataBlock* doQueueScan(SOperatorInfo* pOperator) { pTSInfo->base.dataReader = NULL; tqOffsetResetToLog(&pTaskInfo->streamInfo.prepareStatus, pTaskInfo->streamInfo.snapshotVer); qDebug("queue scan tsdb over, switch to wal ver %" PRId64 "", pTaskInfo->streamInfo.snapshotVer + 1); - if (tqSeekVer(pInfo->tqReader, pTaskInfo->streamInfo.snapshotVer + 1) < 0) { + if (tqSeekVer(pInfo->tqReader, pTaskInfo->streamInfo.snapshotVer + 1, pTaskInfo->id.str) < 0) { tqOffsetResetToLog(&pTaskInfo->streamInfo.lastStatus, pTaskInfo->streamInfo.snapshotVer); return NULL; } From 0b00922a43706bb5d40b7aee00cd90dd1beb98f5 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 3 Mar 2023 23:53:32 +0800 Subject: [PATCH 08/30] fix: should not set startFunc/stopFunc to 0. when the trans is not finished on taosd stop, it will cause the startFunc/stopFunc setting error --- source/dnode/mnode/impl/src/mndTrans.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 56fce0507f..ff8b4dd9ce 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -513,7 +513,7 @@ static int32_t mndTransActionInsert(SSdb *pSdb, STrans *pTrans) { if (fp) { (*fp)(pSdb->pMnode, pTrans->param, pTrans->paramLen); } - pTrans->startFunc = 0; + // pTrans->startFunc = 0; } return 0; @@ -557,7 +557,7 @@ static int32_t mndTransActionDelete(SSdb *pSdb, STrans *pTrans, bool callFunc) { if (fp) { (*fp)(pSdb->pMnode, pTrans->param, pTrans->paramLen); } - pTrans->stopFunc = 0; + // pTrans->stopFunc = 0; } mndTransDropData(pTrans); From f4c8444b8f92994a81816c3a90836f3425313ed4 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 3 Mar 2023 23:55:05 +0800 Subject: [PATCH 09/30] fix: format log --- source/dnode/mnode/impl/src/mndConsumer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 2b5956f6a6..803c4f81e7 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -81,11 +81,20 @@ bool mndRebTryStart() { return old == 0; } -void mndRebEnd() { int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance end, rebalance counter:%d", val); } +void mndRebEnd() { + int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); + mInfo("rebalance end, rebalance count:%d", val); +} -void mndRebCntInc() { int32_t val = atomic_add_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance trans start, rebalance count:%d", val);} +void mndRebCntInc() { + int32_t val = atomic_add_fetch_32(&mqRebInExecCnt, 1); + mInfo("rebalance trans start, rebalance count:%d", val); +} -void mndRebCntDec() { int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); mInfo("rebalance trans end, rebalance count:%d", val); } +void mndRebCntDec() { + int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); + mInfo("rebalance trans end, rebalance count:%d", val); +} static int32_t mndProcessConsumerLostMsg(SRpcMsg *pMsg) { SMnode *pMnode = pMsg->info.node; From 3645c52b251ce44c761515e1984782674236dd03 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 4 Mar 2023 00:34:53 +0800 Subject: [PATCH 10/30] refactor: update some logs. --- source/client/src/clientTmq.c | 7 +++---- source/dnode/mnode/impl/src/mndConsumer.c | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 826bbbd8a3..de338a4642 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1799,8 +1799,7 @@ void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout, bool pollIfReset) { atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); if (pollRspWrapper->dataRsp.blockNum == 0) { - tscDebug("consumer:0x%" PRIx64 " empty block received in poll rsp", tmq->consumerId); - + tscDebug("consumer:0x%" PRIx64 " empty block received, vgId:%d", tmq->consumerId, pVg->vgId); taosFreeQitem(pollRspWrapper); rspWrapper = NULL; continue; @@ -1913,11 +1912,11 @@ TAOS_RES* tmq_consumer_poll(tmq_t* tmq, int64_t timeout) { if (atomic_load_8(&tmq->status) == TMQ_CONSUMER_STATUS__RECOVER) { int32_t retryCnt = 0; while (TSDB_CODE_MND_CONSUMER_NOT_READY == tmqAskEp(tmq, false)) { - if (retryCnt++ > 10) { + if (retryCnt++ > 40) { return NULL; } - tscDebug("consumer:0x%" PRIx64 " not ready, retry:%d/10 in 500ms", tmq->consumerId, retryCnt); + tscDebug("consumer:0x%" PRIx64 " not ready, retry:%d/40 in 500ms", tmq->consumerId, retryCnt); taosMsleep(500); } } diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 803c4f81e7..50aa1bb8c7 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -621,7 +621,8 @@ int32_t mndProcessSubscribeReq(SRpcMsg *pMsg) { pExistedConsumer = mndAcquireConsumer(pMnode, consumerId); if (pExistedConsumer == NULL) { - mInfo("receive subscribe request from new consumer:0x%" PRIx64" cgroup:%s", consumerId, subscribe.cgroup); + mInfo("receive subscribe request from new consumer:0x%" PRIx64" cgroup:%s, numOfTopics:%d", consumerId, + subscribe.cgroup, (int32_t) taosArrayGetSize(pTopicList)); pConsumerNew = tNewSMqConsumerObj(consumerId, cgroup); tstrncpy(pConsumerNew->clientId, subscribe.clientId, 256); From 53a0ff0be381dcb5fd1578291871642dea36cb11 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 4 Mar 2023 16:19:45 +0800 Subject: [PATCH 11/30] refactor: add some logs. --- source/client/src/clientTmq.c | 40 ++++++++++++++-------- source/dnode/mnode/impl/src/mndConsumer.c | 2 +- source/dnode/mnode/impl/src/mndSubscribe.c | 4 +-- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index de338a4642..56d0148272 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1076,7 +1076,7 @@ int32_t tmq_subscribe(tmq_t* tmq, const tmq_list_t* topic_list) { SCMSubscribeReq req = {0}; int32_t code = 0; - tscDebug("consumer:0x%" PRIx64 " subscribe %d topics", tmq->consumerId, sz); + tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics", tmq->consumerId, tmq->groupId, sz); req.consumerId = tmq->consumerId; tstrncpy(req.clientId, tmq->clientId, 256); @@ -1213,31 +1213,38 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { return -1; } - int32_t epoch = pParam->epoch; - int32_t vgId = pParam->vgId; + int32_t epoch = pParam->epoch; + int32_t vgId = pParam->vgId; + uint64_t requestId = pParam->requestId; + taosMemoryFree(pParam); + if (code != 0) { - tscWarn("msg discard from vgId:%d, epoch %d, since %s", vgId, epoch, terrstr()); + tscWarn("consumer:0x%"PRIx64" msg from vgId:%d discarded, epoch %d, since %s, reqId:0x%"PRIx64, tmq->consumerId, vgId, + epoch, tstrerror(code), requestId); + if (pMsg->pData) taosMemoryFree(pMsg->pData); if (pMsg->pEpSet) taosMemoryFree(pMsg->pEpSet); + // in case of consumer mismatch, wait for 500ms and retry if (code == TSDB_CODE_TMQ_CONSUMER_MISMATCH) { + taosMsleep(500); atomic_store_8(&tmq->status, TMQ_CONSUMER_STATUS__RECOVER); - goto CREATE_MSG_FAIL; - } - - if (code == TSDB_CODE_TQ_NO_COMMITTED_OFFSET) { + } else if (code == TSDB_CODE_TQ_NO_COMMITTED_OFFSET) { SMqPollRspWrapper* pRspWrapper = taosAllocateQitem(sizeof(SMqPollRspWrapper), DEF_QITEM, 0); if (pRspWrapper == NULL) { - tscWarn("msg discard from vgId:%d, epoch %d since out of memory", vgId, epoch); + tscWarn("consumer:0x%" PRIx64 " msg from vgId:%d discarded, epoch %d since out of memory, reqId:0x%" PRIx64, + tmq->consumerId, vgId, epoch, requestId); goto CREATE_MSG_FAIL; } + pRspWrapper->tmqRspType = TMQ_MSG_TYPE__END_RSP; /*pRspWrapper->vgHandle = pVg;*/ /*pRspWrapper->topicHandle = pTopic;*/ taosWriteQitem(tmq->mqueue, pRspWrapper); tsem_post(&tmq->rspSem); } + goto CREATE_MSG_FAIL; } @@ -1245,8 +1252,9 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { int32_t tmqEpoch = atomic_load_32(&tmq->epoch); if (msgEpoch < tmqEpoch) { // do not write into queue since updating epoch reset - tscWarn("msg discard from vgId:%d since from earlier epoch, rsp epoch %d, current epoch %d", vgId, msgEpoch, - tmqEpoch); + tscWarn("consumer:0x%" PRIx64 " msg discard from vgId:%d since from earlier epoch, rsp epoch %d, current epoch %d, reqId:0x%"PRIx64, + tmq->consumerId, vgId, msgEpoch, tmqEpoch, requestId); + tsem_post(&tmq->rspSem); taosMemoryFree(pMsg->pData); taosMemoryFree(pMsg->pEpSet); @@ -1254,7 +1262,8 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { } if (msgEpoch != tmqEpoch) { - tscWarn("mismatch rsp from vgId:%d, epoch %d, current epoch %d", vgId, msgEpoch, tmqEpoch); + tscWarn("consumer:0x%"PRIx64" mismatch rsp from vgId:%d, epoch %d, current epoch %d, reqId:0x%"PRIx64, tmq->consumerId, vgId, + msgEpoch, tmqEpoch, requestId); } // handle meta rsp @@ -1264,7 +1273,7 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { if (pRspWrapper == NULL) { taosMemoryFree(pMsg->pData); taosMemoryFree(pMsg->pEpSet); - tscWarn("msg discard from vgId:%d, epoch %d since out of memory", vgId, epoch); + tscWarn("consumer:0x%"PRIx64" msg discard from vgId:%d, epoch %d since out of memory", tmq->consumerId, vgId, epoch); goto CREATE_MSG_FAIL; } @@ -1299,16 +1308,19 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { taosMemoryFree(pMsg->pData); taosMemoryFree(pMsg->pEpSet); - tscDebug("consumer:0x%" PRIx64 ", put poll res into mqueue, total in queue:%d", tmq->consumerId, tmq->mqueue->numOfItems); + tscDebug("consumer:0x%" PRIx64 ", put poll res into mqueue, total in queue:%d, reqId:0x%" PRIx64, tmq->consumerId, + tmq->mqueue->numOfItems, requestId); taosWriteQitem(tmq->mqueue, pRspWrapper); tsem_post(&tmq->rspSem); return 0; + CREATE_MSG_FAIL: if (epoch == tmq->epoch) { atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); } + tsem_post(&tmq->rspSem); return -1; } diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 50aa1bb8c7..90f5f8c839 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -26,7 +26,7 @@ #define MND_CONSUMER_VER_NUMBER 1 #define MND_CONSUMER_RESERVE_SIZE 64 -#define MND_CONSUMER_LOST_HB_CNT 3 +#define MND_CONSUMER_LOST_HB_CNT 6 #define MND_CONSUMER_LOST_CLEAR_THRESHOLD 43200 static int32_t mqRebInExecCnt = 0; diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index fcd68d9001..648014d97e 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -296,7 +296,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR .pVgEp = pVgEp, }; taosHashPut(pHash, &pVgEp->vgId, sizeof(int32_t), &outputVg, sizeof(SMqRebOutputVg)); - mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRIx64 ",(first scan)", sub, pVgEp->vgId, + mInfo("sub:%s mq rebalance remove vgId:%d from consumer:0x%" PRIx64 ",(first scan)", sub, pVgEp->vgId, pConsumerEp->consumerId); } imbCnt++; @@ -311,7 +311,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR .pVgEp = pVgEp, }; taosHashPut(pHash, &pVgEp->vgId, sizeof(int32_t), &outputVg, sizeof(SMqRebOutputVg)); - mInfo("sub:%s mq rebalance remove vgId:%d from consumer:%" PRIx64 ",(first scan)", sub, pVgEp->vgId, + mInfo("sub:%s mq rebalance remove vgId:%d from consumer:0x%" PRIx64 ",(first scan)", sub, pVgEp->vgId, pConsumerEp->consumerId); } } From c159fd07471c6a36c42bbcdfdef2bc2b84b99cc1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 4 Mar 2023 16:37:44 +0800 Subject: [PATCH 12/30] refactor: disable sleep temporarily. --- source/client/src/clientTmq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 56d0148272..7a31ab572c 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1228,7 +1228,7 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { // in case of consumer mismatch, wait for 500ms and retry if (code == TSDB_CODE_TMQ_CONSUMER_MISMATCH) { - taosMsleep(500); +// taosMsleep(500); atomic_store_8(&tmq->status, TMQ_CONSUMER_STATUS__RECOVER); } else if (code == TSDB_CODE_TQ_NO_COMMITTED_OFFSET) { SMqPollRspWrapper* pRspWrapper = taosAllocateQitem(sizeof(SMqPollRspWrapper), DEF_QITEM, 0); From 33f2d076052013ae0d5cab38b5acac5304c84eb1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 4 Mar 2023 16:38:50 +0800 Subject: [PATCH 13/30] refactor: sleep for a while. --- source/client/src/clientTmq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 7a31ab572c..d08cabd27e 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1918,6 +1918,7 @@ TAOS_RES* tmq_consumer_poll(tmq_t* tmq, int64_t timeout) { // in no topic status, delayed task also need to be processed if (atomic_load_8(&tmq->status) == TMQ_CONSUMER_STATUS__INIT) { tscDebug("consumer:0x%" PRIx64 " poll return since consumer is init", tmq->consumerId); + // sleep for a while return NULL; } From 6fe2a9b10dff26679ebc6833ecc0d97c15c5fb28 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 4 Mar 2023 18:09:40 +0800 Subject: [PATCH 14/30] add debug --- source/libs/transport/src/transCli.c | 60 +++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 274d153adc..a2516f6ff7 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -13,7 +13,6 @@ */ #include "transComm.h" -#include "tutil.h" typedef struct { int32_t numOfConn; @@ -121,6 +120,9 @@ typedef struct SCliThrd { SCliMsg* stopMsg; bool quit; + + int newConnCount; + SHashObj* msgCount; } SCliThrd; typedef struct SCliObj { @@ -423,6 +425,21 @@ void cliHandleResp(SCliConn* conn) { tDebug("%s conn %p ref by app", CONN_GET_INST_LABEL(conn), conn); } + // if (TMSG_INFO(pHead->msgType - 1) != 0) { + // char buf[128] = {0}; + // sprintf(buf, "%s", TMSG_INFO(pHead->msgType - 1)); + // int* count = taosHashGet(pThrd->msgCount, TMSG_INFO(pHead->msgType - 1), strlen(TMSG_INFO(pHead->msgType - 1))); + // if (NULL == 0) { + // int localCount = 1; + // taosHashPut(pThrd->msgCount, TMSG_INFO(pHead->msgType - 1), strlen(TMSG_INFO(pHead->msgType - 1)), &localCount, + // sizeof(localCount)); + // } else { + // int localCount = *count - 1; + // taosHashPut(pThrd->msgCount, TMSG_INFO(pHead->msgType - 1), strlen(TMSG_INFO(pHead->msgType - 1)), &localCount, + // sizeof(localCount)); + // } + // } + STraceId* trace = &transMsg.info.traceId; tGDebug("%s conn %p %s received from %s, local info:%s, len:%d, code str:%s", CONN_GET_INST_LABEL(conn), conn, TMSG_INFO(pHead->msgType), conn->dst, conn->src, pHead->msgLen, tstrerror(transMsg.code)); @@ -1098,6 +1115,19 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } + if (TMSG_INFO(pHead->msgType) != 0) { + char buf[128] = {0}; + sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); + int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); + if (NULL == 0) { + int localCount = 1; + taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + } else { + int localCount = *count + 1; + taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + } + } + tGDebug("%s conn %p %s is sent to %s, local info %s, len:%d", CONN_GET_INST_LABEL(pConn), pConn, TMSG_INFO(pHead->msgType), pConn->dst, pConn->src, msgLen); @@ -1173,6 +1203,7 @@ static void cliHandleBatchReq(SCliBatch* pBatch, SCliThrd* pThrd) { addr.sin_port = (uint16_t)htons(pList->port); tTrace("%s conn %p try to connect to %s", pTransInst->label, conn, pList->dst); + pThrd->newConnCount++; int32_t fd = taosCreateSocketWithTimeout(TRANS_CONN_TIMEOUT * 10); if (fd == -1) { tError("%s conn %p failed to create socket, reason:%s", transLabel(pTransInst), conn, @@ -1546,6 +1577,7 @@ void cliHandleReq(SCliMsg* pMsg, SCliThrd* pThrd) { addr.sin_port = (uint16_t)htons(port); tGTrace("%s conn %p try to connect to %s", pTransInst->label, conn, conn->ip); + pThrd->newConnCount++; int32_t fd = taosCreateSocketWithTimeout(TRANS_CONN_TIMEOUT * 4); if (fd == -1) { tGError("%s conn %p failed to create socket, reason:%s", transLabel(pTransInst), conn, @@ -1735,6 +1767,17 @@ static void cliAsyncCb(uv_async_t* handle) { QUEUE_MOVE(&item->qmsg, &wq); taosThreadMutexUnlock(&item->mtx); + void* pIter = taosHashIterate(pThrd->msgCount, NULL); + while (pIter != NULL) { + int* count = pIter; + size_t len = 0; + char* key = taosHashGetKey(pIter, &len); + tDebug("key: %s count: %d", key, *count); + + pIter = taosHashIterate(pThrd->msgCount, pIter); + } + tDebug("conn count: %d", pThrd->newConnCount); + int8_t supportBatch = pTransInst->supportBatch; if (supportBatch == 0) { cliNoBatchDealReq(&wq, pThrd); @@ -1969,6 +2012,9 @@ static SCliThrd* createThrdObj(void* trans) { pThrd->batchCache = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_NO_LOCK); pThrd->quit = false; + + pThrd->newConnCount = 0; + pThrd->msgCount = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_NO_LOCK); return pThrd; } static void destroyThrdObj(SCliThrd* pThrd) { @@ -2316,6 +2362,18 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } } + if ((pResp->msgType - 1) > 0) { + char buf[128] = {0}; + sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); + int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); + if (NULL == 0) { + int localCount = 0; + taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + } else { + int localCount = *count - 1; + taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + } + } if (pCtx->pSem != NULL) { tGTrace("%s conn %p(sync) handle resp", CONN_GET_INST_LABEL(pConn), pConn); if (pCtx->pRsp == NULL) { From 8f63de5a0ad4be615687544ea8cd5ca173d36b37 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 4 Mar 2023 18:30:53 +0800 Subject: [PATCH 15/30] add debug --- source/libs/transport/src/transCli.c | 66 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index a2516f6ff7..9ed7953e90 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1115,18 +1115,18 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } - if (TMSG_INFO(pHead->msgType) != 0) { - char buf[128] = {0}; - sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); - int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); - if (NULL == 0) { - int localCount = 1; - taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - } else { - int localCount = *count + 1; - taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - } - } + // if (pHead->msgType != 0) { + // char buf[128] = {0}; + // sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); + // int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); + // if (NULL == 0) { + // int localCount = 1; + // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + //} else { + // int localCount = *count + 1; + // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + //} + //} tGDebug("%s conn %p %s is sent to %s, local info %s, len:%d", CONN_GET_INST_LABEL(pConn), pConn, TMSG_INFO(pHead->msgType), pConn->dst, pConn->src, msgLen); @@ -1767,15 +1767,15 @@ static void cliAsyncCb(uv_async_t* handle) { QUEUE_MOVE(&item->qmsg, &wq); taosThreadMutexUnlock(&item->mtx); - void* pIter = taosHashIterate(pThrd->msgCount, NULL); - while (pIter != NULL) { - int* count = pIter; - size_t len = 0; - char* key = taosHashGetKey(pIter, &len); - tDebug("key: %s count: %d", key, *count); + // void* pIter = taosHashIterate(pThrd->msgCount, NULL); + // while (pIter != NULL) { + // int* count = pIter; + // size_t len = 0; + // char* key = taosHashGetKey(pIter, &len); + // tDebug("key: %s count: %d", key, *count); - pIter = taosHashIterate(pThrd->msgCount, pIter); - } + // pIter = taosHashIterate(pThrd->msgCount, pIter); + //} tDebug("conn count: %d", pThrd->newConnCount); int8_t supportBatch = pTransInst->supportBatch; @@ -2014,7 +2014,7 @@ static SCliThrd* createThrdObj(void* trans) { pThrd->quit = false; pThrd->newConnCount = 0; - pThrd->msgCount = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_NO_LOCK); + pThrd->msgCount = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_NO_LOCK); return pThrd; } static void destroyThrdObj(SCliThrd* pThrd) { @@ -2362,18 +2362,18 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } } - if ((pResp->msgType - 1) > 0) { - char buf[128] = {0}; - sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); - int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); - if (NULL == 0) { - int localCount = 0; - taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - } else { - int localCount = *count - 1; - taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - } - } + // if ((pResp->msgType - 1) > 0) { + // char buf[128] = {0}; + // sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); + // int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); + // if (NULL == 0) { + // int localCount = 0; + // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + //} else { + // int localCount = *count - 1; + // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); + //} + //} if (pCtx->pSem != NULL) { tGTrace("%s conn %p(sync) handle resp", CONN_GET_INST_LABEL(pConn), pConn); if (pCtx->pRsp == NULL) { From b4e8130633710f11e261d73959409c6e0c154e6b Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 4 Mar 2023 18:48:37 +0800 Subject: [PATCH 16/30] add debug --- source/libs/transport/src/transCli.c | 68 ++++++++++++++-------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 9ed7953e90..e7da5b1c69 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1115,18 +1115,18 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } - // if (pHead->msgType != 0) { - // char buf[128] = {0}; - // sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); - // int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); - // if (NULL == 0) { - // int localCount = 1; - // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - //} else { - // int localCount = *count + 1; - // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - //} - //} + if (pHead->msgType != 0) { + char buf[128] = {0}; + sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); + int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); + if (NULL == 0) { + int localCount = 1; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } else { + int localCount = *count + 1; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } + } tGDebug("%s conn %p %s is sent to %s, local info %s, len:%d", CONN_GET_INST_LABEL(pConn), pConn, TMSG_INFO(pHead->msgType), pConn->dst, pConn->src, msgLen); @@ -1767,16 +1767,18 @@ static void cliAsyncCb(uv_async_t* handle) { QUEUE_MOVE(&item->qmsg, &wq); taosThreadMutexUnlock(&item->mtx); - // void* pIter = taosHashIterate(pThrd->msgCount, NULL); - // while (pIter != NULL) { - // int* count = pIter; - // size_t len = 0; - // char* key = taosHashGetKey(pIter, &len); - // tDebug("key: %s count: %d", key, *count); + void* pIter = taosHashIterate(pThrd->msgCount, NULL); + while (pIter != NULL) { + int* count = pIter; + size_t len = 0; + char* key = taosHashGetKey(pIter, &len); + if (*count != 0) { + tDebug("key: %s count: %d", key, *count); + } - // pIter = taosHashIterate(pThrd->msgCount, pIter); - //} - tDebug("conn count: %d", pThrd->newConnCount); + pIter = taosHashIterate(pThrd->msgCount, pIter); + } + tDebug("all conn count: %d", pThrd->newConnCount); int8_t supportBatch = pTransInst->supportBatch; if (supportBatch == 0) { @@ -2362,18 +2364,18 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } } - // if ((pResp->msgType - 1) > 0) { - // char buf[128] = {0}; - // sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); - // int* count = taosHashGet(pThrd->msgCount, buf, strlen(buf)); - // if (NULL == 0) { - // int localCount = 0; - // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - //} else { - // int localCount = *count - 1; - // taosHashPut(pThrd->msgCount, buf, strlen(buf), &localCount, sizeof(localCount)); - //} - //} + if ((pResp->msgType - 1) > 0) { + char buf[128] = {0}; + sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); + int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); + if (NULL == 0) { + int localCount = 0; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } else { + int localCount = *count - 1; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } + } if (pCtx->pSem != NULL) { tGTrace("%s conn %p(sync) handle resp", CONN_GET_INST_LABEL(pConn), pConn); if (pCtx->pRsp == NULL) { From 566d233c1cbb14b8acc5fdea98f3f1edbebf75af Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 4 Mar 2023 19:28:03 +0800 Subject: [PATCH 17/30] add debug --- source/libs/transport/src/transCli.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index e7da5b1c69..0774492dbb 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1115,7 +1115,8 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } - if (pHead->msgType != 0) { + if ((pHead->msgType > TDMT_VND_TMQ_MSG && pHead->msgType < TDMT_VND_TMQ_MAX_MSG) || + (pHead->msgType > TDMT_MND_MSG && pHead->msgType < TDMT_MND_MAX_MSG)) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); @@ -2364,7 +2365,8 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } } - if ((pResp->msgType - 1) > 0) { + if ((pResp->msgType - 1 > TDMT_VND_TMQ_MSG && pResp->msgType - 1 < TDMT_VND_TMQ_MAX_MSG) || + (pResp->msgType - 1 > TDMT_MND_MSG && pResp->msgType - 1 < TDMT_MND_MAX_MSG)) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); From dfeed5a4951a530ce4cc40fc91c7f60833f531d5 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 4 Mar 2023 22:14:30 +0800 Subject: [PATCH 18/30] refactor: disable 0 topics; --- include/util/taoserror.h | 1 + source/client/src/clientTmq.c | 7 ++++++- source/util/src/terror.c | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 5106196ccd..61e181bc36 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -558,6 +558,7 @@ int32_t* taosGetErrno(); #define TSDB_CODE_TQ_GROUP_NOT_SET TAOS_DEF_ERROR_CODE(0, 0x0A0B) #define TSDB_CODE_TQ_TABLE_SCHEMA_NOT_FOUND TAOS_DEF_ERROR_CODE(0, 0x0A0C) #define TSDB_CODE_TQ_NO_COMMITTED_OFFSET TAOS_DEF_ERROR_CODE(0, 0x0A0D) +#define TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS TAOS_DEF_ERROR_CODE(0, 0x0A0E) // wal // #define TSDB_CODE_WAL_APP_ERROR TAOS_DEF_ERROR_CODE(0, 0x1000) // 2.x diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index d08cabd27e..00e29f5ecc 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1076,7 +1076,12 @@ int32_t tmq_subscribe(tmq_t* tmq, const tmq_list_t* topic_list) { SCMSubscribeReq req = {0}; int32_t code = 0; - tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics", tmq->consumerId, tmq->groupId, sz); + if (sz == 0) { + tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics, not allowed", tmq->consumerId, tmq->groupId, sz); + return TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS; + } else { + tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics", tmq->consumerId, tmq->groupId, sz); + } req.consumerId = tmq->consumerId; tstrncpy(req.clientId, tmq->clientId, 256); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index b22e2e924f..6e63776afe 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -435,7 +435,8 @@ TAOS_DEFINE_ERROR(TSDB_CODE_TQ_META_KEY_NOT_IN_TXN, "TQ meta key not in tx TAOS_DEFINE_ERROR(TSDB_CODE_TQ_META_KEY_DUP_IN_TXN, "TQ met key dup in txn") TAOS_DEFINE_ERROR(TSDB_CODE_TQ_GROUP_NOT_SET, "TQ group not exist") TAOS_DEFINE_ERROR(TSDB_CODE_TQ_TABLE_SCHEMA_NOT_FOUND, "TQ table schema not found") -TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_COMMITTED_OFFSET, "TQ no commited offset") +TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_COMMITTED_OFFSET, "TQ no committed offset") +TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS, "TQ no topics") // wal TAOS_DEFINE_ERROR(TSDB_CODE_WAL_FILE_CORRUPTED, "WAL file is corrupted") From bf8b36901a0be5602481e22f1c148e9ceee629ec Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 4 Mar 2023 23:37:50 +0800 Subject: [PATCH 19/30] add crash --- source/libs/transport/src/transCli.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 0774492dbb..7d648d69ac 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -877,7 +877,7 @@ static SCliConn* cliCreateConn(SCliThrd* pThrd) { static void cliDestroyConn(SCliConn* conn, bool clear) { SCliThrd* pThrd = conn->hostThrd; tTrace("%s conn %p remove from conn pool", CONN_GET_INST_LABEL(conn), conn); - + conn->broken = true; QUEUE_REMOVE(&conn->q); QUEUE_INIT(&conn->q); @@ -1262,7 +1262,7 @@ static void cliSendBatchCb(uv_write_t* req, int status) { } else { tDebug("%s conn %p succ to send batch msg, batch size:%d, msgLen:%d", CONN_GET_INST_LABEL(conn), conn, p->wLen, p->batchSize); - if (!uv_is_closing((uv_handle_t*)&conn->stream)) { + if (!uv_is_closing((uv_handle_t*)&conn->stream) && conn->broken == false) { if (nxtBatch != NULL) { conn->pBatch = nxtBatch; cliSendBatch(conn); From fbad444981627d6079210c4b17ae433f9d4c1c92 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 10:52:16 +0800 Subject: [PATCH 20/30] change parameter --- source/client/src/clientEnv.c | 4 ++-- source/common/src/tglobal.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index 79c5baf43d..e3e20ee85d 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -159,9 +159,9 @@ void *openTransporter(const char *user, const char *auth, int32_t numOfThread) { rpcInit.retryMaxInterval = tsRedirectMaxPeriod; rpcInit.retryMaxTimouet = tsMaxRetryWaitTime; - int32_t connLimitNum = tsNumOfRpcSessions / (tsNumOfRpcThreads * 5); + int32_t connLimitNum = tsNumOfRpcSessions / (tsNumOfRpcThreads * 3); connLimitNum = TMAX(connLimitNum, 10); - connLimitNum = TMIN(connLimitNum, 500); + connLimitNum = TMIN(connLimitNum, 1000); rpcInit.connLimitNum = connLimitNum; rpcInit.timeToGetConn = tsTimeToGetAvailableConn; diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index 4bb64e5fd6..0d03e1a11a 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -41,7 +41,7 @@ bool tsPrintAuth = false; // queue & threads int32_t tsNumOfRpcThreads = 1; -int32_t tsNumOfRpcSessions = 6000; +int32_t tsNumOfRpcSessions = 10000; int32_t tsTimeToGetAvailableConn = 500000; int32_t tsNumOfCommitThreads = 2; int32_t tsNumOfTaskQueueThreads = 4; From 167f4de9037241e57aea6b7243baed3b23cbb78b Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 11:06:55 +0800 Subject: [PATCH 21/30] fix debug info --- source/libs/transport/src/transCli.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 7d648d69ac..15fb20856c 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1116,7 +1116,8 @@ void cliSend(SCliConn* pConn) { } if ((pHead->msgType > TDMT_VND_TMQ_MSG && pHead->msgType < TDMT_VND_TMQ_MAX_MSG) || - (pHead->msgType > TDMT_MND_MSG && pHead->msgType < TDMT_MND_MAX_MSG)) { + (pHead->msgType > TDMT_MND_MSG && pHead->msgType < TDMT_MND_MAX_MSG) || pHead->msgType == TDMT_VND_SUBMIT || + pHead->msgType == TDMT_MND_HEARTBEAT) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); @@ -2366,7 +2367,8 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } if ((pResp->msgType - 1 > TDMT_VND_TMQ_MSG && pResp->msgType - 1 < TDMT_VND_TMQ_MAX_MSG) || - (pResp->msgType - 1 > TDMT_MND_MSG && pResp->msgType - 1 < TDMT_MND_MAX_MSG)) { + (pResp->msgType - 1 > TDMT_MND_MSG && pResp->msgType - 1 < TDMT_MND_MAX_MSG) || + pResp->msgType - 1 == TDMT_VND_SUBMIT_RSP || pResp->msgType - 1 == TDMT_MND_HEARTBEAT_RSP) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); From 7986e1fd49edd9eac9c9676a917bb469ad7040ae Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 11:08:11 +0800 Subject: [PATCH 22/30] fix debug info --- source/libs/transport/src/transCli.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 15fb20856c..99c9285018 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -692,6 +692,7 @@ static SCliConn* getConnFromPool2(SCliThrd* pThrd, char* key, SCliMsg** pMsg) { } list->numOfConn++; } + tTrace("%s numOfConn: %d, limit: %d", pTransInst->label, list->numOfConn, pTransInst->connLimitNum); return NULL; } From e6ec795b4d2c7ba10ed2ea2de1edf9bb74a2175e Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 11:36:01 +0800 Subject: [PATCH 23/30] fix debug info --- include/common/tmsg.h | 19 ++++++++++++++----- source/libs/transport/src/transCli.c | 8 ++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/include/common/tmsg.h b/include/common/tmsg.h index 41bc873a3b..9a2439f0cf 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -66,6 +66,15 @@ extern int32_t tMsgDict[]; typedef uint16_t tmsg_t; +static inline bool tmsgIsValid(tmsg_t type) { + if (type < TDMT_DND_MAX_MSG || type < TDMT_MND_MAX_MSG || type < TDMT_VND_MAX_MSG || type < TDMT_SCH_MAX_MSG || + type < TDMT_STREAM_MAX_MSG || type < TDMT_MON_MAX_MSG || type < TDMT_SYNC_MAX_MSG || type < TDMT_VND_STREAM_MSG || + type < TDMT_VND_TMQ_MSG || type < TDMT_VND_TMQ_MAX_MSG) { + return true; + } else { + return false; + } +} static inline bool vnodeIsMsgBlock(tmsg_t type) { return (type == TDMT_VND_CREATE_TABLE) || (type == TDMT_VND_ALTER_TABLE) || (type == TDMT_VND_DROP_TABLE) || (type == TDMT_VND_UPDATE_TAG_VAL) || (type == TDMT_VND_ALTER_CONFIRM); @@ -1911,10 +1920,10 @@ typedef struct { } SMqConsumerLostMsg, SMqConsumerRecoverMsg, SMqConsumerClearMsg; typedef struct { - int64_t consumerId; - char cgroup[TSDB_CGROUP_LEN]; - char clientId[256]; - SArray* topicNames; // SArray + int64_t consumerId; + char cgroup[TSDB_CGROUP_LEN]; + char clientId[256]; + SArray* topicNames; // SArray } SCMSubscribeReq; static FORCE_INLINE int32_t tSerializeSCMSubscribeReq(void** buf, const SCMSubscribeReq* pReq) { @@ -2691,7 +2700,7 @@ typedef struct { char subKey[TSDB_SUBSCRIBE_KEY_LEN]; int8_t subType; int8_t withMeta; - char* qmsg; //SubPlanToString + char* qmsg; // SubPlanToString int64_t suid; } SMqRebVgReq; diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 99c9285018..c4f0b63ffc 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1116,9 +1116,7 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } - if ((pHead->msgType > TDMT_VND_TMQ_MSG && pHead->msgType < TDMT_VND_TMQ_MAX_MSG) || - (pHead->msgType > TDMT_MND_MSG && pHead->msgType < TDMT_MND_MAX_MSG) || pHead->msgType == TDMT_VND_SUBMIT || - pHead->msgType == TDMT_MND_HEARTBEAT) { + if (tmsgIsValid(pHead->msgType)) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); @@ -2367,9 +2365,7 @@ int cliAppCb(SCliConn* pConn, STransMsg* pResp, SCliMsg* pMsg) { } } - if ((pResp->msgType - 1 > TDMT_VND_TMQ_MSG && pResp->msgType - 1 < TDMT_VND_TMQ_MAX_MSG) || - (pResp->msgType - 1 > TDMT_MND_MSG && pResp->msgType - 1 < TDMT_MND_MAX_MSG) || - pResp->msgType - 1 == TDMT_VND_SUBMIT_RSP || pResp->msgType - 1 == TDMT_MND_HEARTBEAT_RSP) { + if (tmsgIsValid(pResp->msgType - 1)) { char buf[128] = {0}; sprintf(buf, "%s", TMSG_INFO(pResp->msgType - 1)); int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); From ee0bf9c501afb8421152dd41e94c02e9da1d6723 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 12:42:56 +0800 Subject: [PATCH 24/30] fix debug info --- source/libs/transport/src/transCli.c | 36 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index c4f0b63ffc..b97597dc65 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1116,18 +1116,18 @@ void cliSend(SCliConn* pConn) { msgLen = (int32_t)ntohl((uint32_t)(pHead->msgLen)); } - if (tmsgIsValid(pHead->msgType)) { - char buf[128] = {0}; - sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); - int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); - if (NULL == 0) { - int localCount = 1; - taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); - } else { - int localCount = *count + 1; - taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); - } - } + // if (tmsgIsValid(pHead->msgType)) { + // char buf[128] = {0}; + // sprintf(buf, "%s", TMSG_INFO(pHead->msgType)); + // int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); + // if (NULL == 0) { + // int localCount = 1; + // taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + // } else { + // int localCount = *count + 1; + // taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + // } + // } tGDebug("%s conn %p %s is sent to %s, local info %s, len:%d", CONN_GET_INST_LABEL(pConn), pConn, TMSG_INFO(pHead->msgType), pConn->dst, pConn->src, msgLen); @@ -1523,6 +1523,18 @@ void cliHandleReq(SCliMsg* pMsg, SCliThrd* pThrd) { destroyCmsg(pMsg); return; } + if (tmsgIsValid(pMsg->msg.msgType)) { + char buf[128] = {0}; + sprintf(buf, "%s", TMSG_INFO(pMsg->msg.msgType)); + int* count = taosHashGet(pThrd->msgCount, buf, sizeof(buf)); + if (NULL == 0) { + int localCount = 1; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } else { + int localCount = *count + 1; + taosHashPut(pThrd->msgCount, buf, sizeof(buf), &localCount, sizeof(localCount)); + } + } char* fqdn = EPSET_GET_INUSE_IP(&pMsg->ctx->epSet); uint16_t port = EPSET_GET_INUSE_PORT(&pMsg->ctx->epSet); From 22a85734c69528e977663a2472b51e4b17cac079 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 5 Mar 2023 13:23:50 +0800 Subject: [PATCH 25/30] fix(mq): add more chek for balance couner to avoid the negative value emerges. --- include/util/taoserror.h | 1 - source/client/src/clientTmq.c | 7 +------ source/client/test/clientTests.cpp | 9 +++++++- source/dnode/mnode/impl/src/mndConsumer.c | 24 ++++++++++++++++------ source/dnode/mnode/impl/src/mndMain.c | 3 +++ source/dnode/mnode/impl/src/mndSubscribe.c | 6 +++--- source/dnode/vnode/src/tq/tqRead.c | 9 ++++---- source/libs/executor/src/scanoperator.c | 6 +++++- source/util/src/terror.c | 1 - 9 files changed, 42 insertions(+), 24 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 61e181bc36..5106196ccd 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -558,7 +558,6 @@ int32_t* taosGetErrno(); #define TSDB_CODE_TQ_GROUP_NOT_SET TAOS_DEF_ERROR_CODE(0, 0x0A0B) #define TSDB_CODE_TQ_TABLE_SCHEMA_NOT_FOUND TAOS_DEF_ERROR_CODE(0, 0x0A0C) #define TSDB_CODE_TQ_NO_COMMITTED_OFFSET TAOS_DEF_ERROR_CODE(0, 0x0A0D) -#define TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS TAOS_DEF_ERROR_CODE(0, 0x0A0E) // wal // #define TSDB_CODE_WAL_APP_ERROR TAOS_DEF_ERROR_CODE(0, 0x1000) // 2.x diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 00e29f5ecc..d08cabd27e 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1076,12 +1076,7 @@ int32_t tmq_subscribe(tmq_t* tmq, const tmq_list_t* topic_list) { SCMSubscribeReq req = {0}; int32_t code = 0; - if (sz == 0) { - tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics, not allowed", tmq->consumerId, tmq->groupId, sz); - return TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS; - } else { - tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics", tmq->consumerId, tmq->groupId, sz); - } + tscDebug("consumer:0x%" PRIx64 " cgroup:%s, subscribe %d topics", tmq->consumerId, tmq->groupId, sz); req.consumerId = tmq->consumerId; tstrncpy(req.clientId, tmq->clientId, 256); diff --git a/source/client/test/clientTests.cpp b/source/client/test/clientTests.cpp index 59c931d9aa..2f3d600019 100644 --- a/source/client/test/clientTests.cpp +++ b/source/client/test/clientTests.cpp @@ -925,7 +925,7 @@ TEST(clientCase, subscription_test) { // 创建订阅 topics 列表 tmq_list_t* topicList = tmq_list_new(); - tmq_list_append(topicList, "topic_t1"); +// tmq_list_append(topicList, "topic_t1"); // 启动订阅 tmq_subscribe(tmq, topicList); @@ -938,6 +938,8 @@ TEST(clientCase, subscription_test) { int32_t msgCnt = 0; int32_t timeout = 5000; + int32_t count = 0; + while (1) { TAOS_RES* pRes = tmq_consumer_poll(tmq, timeout); if (pRes) { @@ -952,6 +954,11 @@ TEST(clientCase, subscription_test) { printf("db: %s\n", dbName); printf("vgroup id: %d\n", vgroupId); + if (count ++ > 20) { + tmq_unsubscribe(tmq); + break; + } + while (1) { TAOS_ROW row = taos_fetch_row(pRes); if (row == NULL) break; diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 90f5f8c839..e52b046053 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -82,18 +82,29 @@ bool mndRebTryStart() { } void mndRebEnd() { - int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); - mInfo("rebalance end, rebalance count:%d", val); + mndRebCntDec(); } void mndRebCntInc() { int32_t val = atomic_add_fetch_32(&mqRebInExecCnt, 1); - mInfo("rebalance trans start, rebalance count:%d", val); + mInfo("rebalance trans start, rebalance counter:%d", val); } void mndRebCntDec() { - int32_t val = atomic_sub_fetch_32(&mqRebInExecCnt, 1); - mInfo("rebalance trans end, rebalance count:%d", val); + while (1) { + int32_t val = atomic_load_32(&mqRebInExecCnt); + if (val <= 0) { + mError("rebalance trans end, rebalance counter:%d should not be less equalled than 0, ignore counter desc", val); + break; + } + + int32_t newVal = val - 1; + int32_t oldVal = atomic_val_compare_exchange_32(&mqRebInExecCnt, val, newVal); + if (oldVal == val) { + mInfo("rebalance trans end, rebalance counter:%d", newVal); + break; + } + } } static int32_t mndProcessConsumerLostMsg(SRpcMsg *pMsg) { @@ -308,6 +319,7 @@ static int32_t mndProcessMqTimerMsg(SRpcMsg *pMsg) { taosRUnLockLatch(&pConsumer->lock); } else if (status == MQ_CONSUMER_STATUS__MODIFY) { taosRLockLatch(&pConsumer->lock); + int32_t newTopicNum = taosArrayGetSize(pConsumer->rebNewTopics); for (int32_t i = 0; i < newTopicNum; i++) { char key[TSDB_SUBSCRIBE_KEY_LEN]; @@ -700,6 +712,7 @@ int32_t mndProcessSubscribeReq(SRpcMsg *pMsg) { // no topics need to be rebalanced if (taosArrayGetSize(pConsumerNew->rebNewTopics) == 0 && taosArrayGetSize(pConsumerNew->rebRemovedTopics) == 0) { +// mInfo(); goto _over; } @@ -1057,7 +1070,6 @@ static int32_t mndRetrieveConsumer(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock * } taosRLockLatch(&pConsumer->lock); - mDebug("showing consumer:0x%" PRIx64, pConsumer->consumerId); int32_t topicSz = taosArrayGetSize(pConsumer->assignedTopics); diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index b09a4f63a7..d83b969e2d 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -706,6 +706,9 @@ int32_t mndProcessRpcMsg(SRpcMsg *pMsg) { } else if (code == 0) { mGTrace("msg:%p, successfully processed", pMsg); } else { + if (code == -1) { + code = terrno; + } mGError("msg:%p, failed to process since %s, app:%p type:%s", pMsg, tstrerror(code), pMsg->info.ahandle, TMSG_INFO(pMsg->msgType)); } diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index 648014d97e..86f6976398 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -263,7 +263,7 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR imbConsumerNum = totalVgNum % afterRebConsumerNum; } - mInfo("sub:%s mq re-balance %d consumers: at least %d vg each, %d consumer has more vg", sub, + mInfo("sub:%s mq re-balance %d consumers: at least %d vgs each, %d consumers has more vgs", sub, afterRebConsumerNum, minVgCnt, imbConsumerNum); // 4. first scan: remove consumer more than wanted, put to remove hash @@ -591,13 +591,13 @@ static int32_t mndProcessRebalanceReq(SRpcMsg *pMsg) { rebOutput.pSub = mndCreateSubscription(pMnode, pTopic, pRebInfo->key); if (rebOutput.pSub == NULL) { - mError("mq rebalance %s failed create sub since %s, abort", pRebInfo->key, terrstr()); + mError("mq rebalance %s failed create sub since %s, ignore", pRebInfo->key, terrstr()); taosRUnLockLatch(&pTopic->lock); mndReleaseTopic(pMnode, pTopic); continue; } - memcpy(rebOutput.pSub->dbName, pTopic->db, TSDB_DB_FNAME_LEN); + memcpy(rebOutput.pSub->dbName, pTopic->db, TSDB_DB_FNAME_LEN); taosRUnLockLatch(&pTopic->lock); mndReleaseTopic(pMnode, pTopic); diff --git a/source/dnode/vnode/src/tq/tqRead.c b/source/dnode/vnode/src/tq/tqRead.c index f4c96d073b..8712a93bff 100644 --- a/source/dnode/vnode/src/tq/tqRead.c +++ b/source/dnode/vnode/src/tq/tqRead.c @@ -297,7 +297,7 @@ int32_t tqSeekVer(STqReader* pReader, int64_t ver, const char* id) { tqError("tmq poll: wal reader failed to seek to ver:%"PRId64" code:%s, %s", ver, tstrerror(terrno), id); return -1; } else { - tqError("tmq poll: wal reader seek to ver:%"PRId64" %s", ver, id); + tqDebug("tmq poll: wal reader seek to ver:%"PRId64" %s", ver, id); return 0; } } @@ -308,13 +308,12 @@ int32_t tqNextBlock(STqReader* pReader, SFetchRet* ret) { while (1) { if (!fromProcessedMsg) { if (walNextValidMsg(pReader->pWalReader) < 0) { - pReader->ver = - pReader->pWalReader->curVersion - pReader->pWalReader->curStopped; -// pReader->pWalReader->curVersion - (pReader->pWalReader->curInvalid | pReader->pWalReader->curStopped); + pReader->ver = pReader->pWalReader->curVersion - pReader->pWalReader->curStopped; ret->offset.type = TMQ_OFFSET__LOG; + ret->offset.version = pReader->ver; ret->fetchType = FETCH_TYPE__NONE; - tqDebug("return offset %" PRId64 ", no more valid", ret->offset.version); + tqDebug("return offset %" PRId64 ", no more valid msg in wal", ret->offset.version); return -1; } diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index 8576ac74c7..ccc721996f 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1632,8 +1632,12 @@ static SSDataBlock* doQueueScan(SOperatorInfo* pOperator) { while (1) { SFetchRet ret = {0}; if (tqNextBlock(pInfo->tqReader, &ret) < 0) { - qError("failed to get next log block since %s", terrstr()); + // if the end is reached, terrno is 0 + if (terrno != 0) { + qError("failed to get next log block since %s", terrstr()); + } } + if (ret.fetchType == FETCH_TYPE__DATA) { blockDataCleanup(pInfo->pRes); setBlockIntoRes(pInfo, &ret.data, true); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 6e63776afe..33b562c8dd 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -436,7 +436,6 @@ TAOS_DEFINE_ERROR(TSDB_CODE_TQ_META_KEY_DUP_IN_TXN, "TQ met key dup in txn TAOS_DEFINE_ERROR(TSDB_CODE_TQ_GROUP_NOT_SET, "TQ group not exist") TAOS_DEFINE_ERROR(TSDB_CODE_TQ_TABLE_SCHEMA_NOT_FOUND, "TQ table schema not found") TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_COMMITTED_OFFSET, "TQ no committed offset") -TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_SUBSCRIBE_TOPICS, "TQ no topics") // wal TAOS_DEFINE_ERROR(TSDB_CODE_WAL_FILE_CORRUPTED, "WAL file is corrupted") From 42ba45e60d2aa7fa284ce5f2c5dd4c5718fbf0ab Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 13:25:00 +0800 Subject: [PATCH 26/30] fix debug info --- source/libs/transport/src/transCli.c | 1 - 1 file changed, 1 deletion(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index b97597dc65..79afe95342 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -804,7 +804,6 @@ static int32_t specifyConnRef(SCliConn* conn, bool update, int64_t handle) { static void cliAllocRecvBufferCb(uv_handle_t* handle, size_t suggested_size, uv_buf_t* buf) { SCliConn* conn = handle->data; SConnBuffer* pBuf = &conn->readBuf; - tTrace("%s conn %p alloc read buf", CONN_GET_INST_LABEL(conn), conn); transAllocBuffer(pBuf, buf); } static void cliRecvCb(uv_stream_t* handle, ssize_t nread, const uv_buf_t* buf) { From 6fafa8443e37332ce61a64c21e75b8eeef856a14 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 5 Mar 2023 15:32:00 +0800 Subject: [PATCH 27/30] fix(query): sleep a little bit when errors occur. --- source/client/src/clientTmq.c | 5 +++-- source/dnode/mnode/impl/src/mndConsumer.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index d08cabd27e..f6a2c5fdc1 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1228,8 +1228,9 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { // in case of consumer mismatch, wait for 500ms and retry if (code == TSDB_CODE_TMQ_CONSUMER_MISMATCH) { -// taosMsleep(500); + taosMsleep(500); atomic_store_8(&tmq->status, TMQ_CONSUMER_STATUS__RECOVER); + tscDebug("consumer:0x%" PRIx64" wait for the re-balance, wait for 500ms and set status to be RECOVER", tmq->consumerId); } else if (code == TSDB_CODE_TQ_NO_COMMITTED_OFFSET) { SMqPollRspWrapper* pRspWrapper = taosAllocateQitem(sizeof(SMqPollRspWrapper), DEF_QITEM, 0); if (pRspWrapper == NULL) { @@ -1918,7 +1919,7 @@ TAOS_RES* tmq_consumer_poll(tmq_t* tmq, int64_t timeout) { // in no topic status, delayed task also need to be processed if (atomic_load_8(&tmq->status) == TMQ_CONSUMER_STATUS__INIT) { tscDebug("consumer:0x%" PRIx64 " poll return since consumer is init", tmq->consumerId); - // sleep for a while + taosMsleep(500); // sleep for a while return NULL; } diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index e52b046053..24974a1973 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -637,7 +637,7 @@ int32_t mndProcessSubscribeReq(SRpcMsg *pMsg) { subscribe.cgroup, (int32_t) taosArrayGetSize(pTopicList)); pConsumerNew = tNewSMqConsumerObj(consumerId, cgroup); - tstrncpy(pConsumerNew->clientId, subscribe.clientId, 256); + tstrncpy(pConsumerNew->clientId, subscribe.clientId, tListLen(pConsumerNew->clientId)); // set the update type pConsumerNew->updateType = CONSUMER_UPDATE__MODIFY; From 3362b8a4caf3f3a446858010a9118b8351594f7d Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 5 Mar 2023 15:38:11 +0800 Subject: [PATCH 28/30] fix mem leak --- source/libs/transport/src/transCli.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 79afe95342..1016b50542 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -890,6 +890,7 @@ static void cliDestroyConn(SCliConn* conn, bool clear) { connList->list->numOfConn--; } conn->list = NULL; + pThrd->newConnCount--; transReleaseExHandle(transGetRefMgt(), conn->refId); transRemoveExHandle(transGetRefMgt(), conn->refId); @@ -2074,6 +2075,7 @@ static void destroyThrdObj(SCliThrd* pThrd) { pIter = (void**)taosHashIterate(pThrd->batchCache, pIter); } taosHashCleanup(pThrd->batchCache); + taosHashCleanup(pThrd->msgCount); taosMemoryFree(pThrd); } From da550029d893d53649d0b25b08b0ebf7dfa85d5b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 6 Mar 2023 11:52:52 +0800 Subject: [PATCH 29/30] fix(tmq): fix memory error and adjust some logs. --- source/client/src/clientTmq.c | 2 +- source/dnode/mnode/impl/src/mndConsumer.c | 5 ++--- source/dnode/mnode/impl/src/mndSubscribe.c | 8 +++++--- source/dnode/mnode/impl/src/mndTrans.c | 1 + tests/script/tsim/tmq/consume.sh | 10 +++++----- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index f6a2c5fdc1..9f24deff94 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -1291,7 +1291,7 @@ int32_t tmqPollCb(void* param, SDataBuf* pMsg, int32_t code) { tscDebug("consumer:0x%" PRIx64 " recv poll rsp, vgId:%d, req offset:%" PRId64 ", rsp offset:%" PRId64 " type %d, reqId:0x%"PRIx64, tmq->consumerId, pVg->vgId, pRspWrapper->dataRsp.reqOffset.version, pRspWrapper->dataRsp.rspOffset.version, - rspType, pParam->requestId); + rspType, requestId); } else if (rspType == TMQ_MSG_TYPE__POLL_META_RSP) { SDecoder decoder; tDecoderInit(&decoder, POINTER_SHIFT(pMsg->pData, sizeof(SMqRspHead)), pMsg->len - sizeof(SMqRspHead)); diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 24974a1973..f1ef83aca5 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -35,9 +35,9 @@ static const char *mndConsumerStatusName(int status); static int32_t mndConsumerActionInsert(SSdb *pSdb, SMqConsumerObj *pConsumer); static int32_t mndConsumerActionDelete(SSdb *pSdb, SMqConsumerObj *pConsumer); -static int32_t mndConsumerActionUpdate(SSdb *pSdb, SMqConsumerObj *pConsumer, SMqConsumerObj *pNewConsumer); +static int32_t mndConsumerActionUpdate(SSdb *pSdb, SMqConsumerObj *pOldConsumer, SMqConsumerObj *pNewConsumer); static int32_t mndProcessConsumerMetaMsg(SRpcMsg *pMsg); -static int32_t mndRetrieveConsumer(SRpcMsg *pMsg, SShowObj *pShow, SSDataBlock *pBlock, int32_t rows); +static int32_t mndRetrieveConsumer(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBlock, int32_t rows); static void mndCancelGetNextConsumer(SMnode *pMnode, void *pIter); static int32_t mndProcessSubscribeReq(SRpcMsg *pMsg); @@ -712,7 +712,6 @@ int32_t mndProcessSubscribeReq(SRpcMsg *pMsg) { // no topics need to be rebalanced if (taosArrayGetSize(pConsumerNew->rebNewTopics) == 0 && taosArrayGetSize(pConsumerNew->rebRemovedTopics) == 0) { -// mInfo(); goto _over; } diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index 86f6976398..21539a6313 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -444,7 +444,9 @@ static int32_t mndDoRebalance(SMnode *pMnode, const SMqRebInputObj *pInput, SMqR static int32_t mndPersistRebResult(SMnode *pMnode, SRpcMsg *pMsg, const SMqRebOutputObj *pOutput) { STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_ROLLBACK, TRN_CONFLICT_DB_INSIDE, pMsg, "tmq-reb"); - if (pTrans == NULL) return -1; + if (pTrans == NULL) { + return -1; + } mndTransSetDbName(pTrans, pOutput->pSub->dbName, NULL); if (mndTrancCheckConflict(pMnode, pTrans) != 0) { @@ -616,9 +618,9 @@ static int32_t mndProcessRebalanceReq(SRpcMsg *pMsg) { // if add more consumer to balanced subscribe, // possibly no vg is changed - + // when each topic is re-balanced, issue an trans to save the results in sdb. if (mndPersistRebResult(pMnode, pMsg, &rebOutput) < 0) { - mError("mq re-balance persist re-balance output error, possibly vnode splitted or dropped"); + mError("mq re-balance persist output error, possibly vnode splitted or dropped"); } taosArrayDestroy(pRebInfo->lostConsumers); diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index ff8b4dd9ce..39b4252618 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -875,6 +875,7 @@ int32_t mndTrancCheckConflict(SMnode *pMnode, STrans *pTrans) { } } + if (mndCheckTransConflict(pMnode, pTrans)) { terrno = TSDB_CODE_MND_TRANS_CONFLICT; mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); diff --git a/tests/script/tsim/tmq/consume.sh b/tests/script/tsim/tmq/consume.sh index 001ce6ae49..f0c9197a59 100755 --- a/tests/script/tsim/tmq/consume.sh +++ b/tests/script/tsim/tmq/consume.sh @@ -79,15 +79,15 @@ CFG_DIR=$PRG_DIR/cfg LOG_DIR=$PRG_DIR/log echo "------------------------------------------------------------------------" +echo "TOP_DIR: $TOP_DIR" echo "BUILD_DIR: $BUILD_DIR" echo "SIM_DIR : $SIM_DIR" echo "CFG_DIR : $CFG_DIR" - -echo "PROGRAM: $PROGRAM -echo "CFG_DIR: $CFG_DIR -echo "POLL_DELAY: $POLL_DELAY -echo "DB_NAME: $DB_NAME +echo "PROGRAM: $PROGRAM" +echo "CFG_DIR: $CFG_DIR" +echo "POLL_DELAY: $POLL_DELAY" +echo "DB_NAME: $DB_NAME" echo "------------------------------------------------------------------------" if [ "$EXEC_OPTON" = "start" ]; then From 86ef6522ae56e60bdf68dee29d87bebc5e10b813 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Mon, 6 Mar 2023 17:40:02 +0800 Subject: [PATCH 30/30] fix:wal curVersion error if switch to wal --- source/dnode/vnode/src/tq/tqRead.c | 7 ++++++- source/libs/executor/src/scanoperator.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tq/tqRead.c b/source/dnode/vnode/src/tq/tqRead.c index 8712a93bff..bf73cca925 100644 --- a/source/dnode/vnode/src/tq/tqRead.c +++ b/source/dnode/vnode/src/tq/tqRead.c @@ -308,7 +308,12 @@ int32_t tqNextBlock(STqReader* pReader, SFetchRet* ret) { while (1) { if (!fromProcessedMsg) { if (walNextValidMsg(pReader->pWalReader) < 0) { - pReader->ver = pReader->pWalReader->curVersion - pReader->pWalReader->curStopped; +// pReader->ver = pReader->pWalReader->curVersion - pReader->pWalReader->curStopped; + if(pReader->pWalReader->curInvalid == 0){ + pReader->ver = pReader->pWalReader->curVersion - pReader->pWalReader->curStopped; + }else{ + pReader->ver = walGetLastVer(pReader->pWalReader->pWal); + } ret->offset.type = TMQ_OFFSET__LOG; ret->offset.version = pReader->ver; diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index ccc721996f..7314e84c9b 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1609,6 +1609,8 @@ static SSDataBlock* doQueueScan(SOperatorInfo* pOperator) { if (pResult && pResult->info.rows > 0) { qDebug("queue scan tsdb return %d rows min:%" PRId64 " max:%" PRId64, pResult->info.rows, pResult->info.window.skey, pResult->info.window.ekey); + qDebug("queue scan tsdb return %d rows min:%" PRId64 " max:%" PRId64 " wal curVersion:%" PRId64, pResult->info.rows, + pResult->info.window.skey, pResult->info.window.ekey, pInfo->tqReader->pWalReader->curVersion); pTaskInfo->streamInfo.returned = 1; return pResult; } else {