From 8e2d121b3ad4ce4e5f73907f1543fef1e0557586 Mon Sep 17 00:00:00 2001 From: Liu Jicong Date: Sat, 2 Apr 2022 19:59:25 +0800 Subject: [PATCH 1/3] temp --- source/client/src/tmq.c | 55 ++++++++++++++-------- source/dnode/mnode/impl/src/mndSubscribe.c | 2 +- source/util/src/tlog.c | 8 ++-- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/source/client/src/tmq.c b/source/client/src/tmq.c index 426a62433b..81219e1c36 100644 --- a/source/client/src/tmq.c +++ b/source/client/src/tmq.c @@ -79,6 +79,7 @@ struct tmq_t { tmq_commit_cb* commit_cb; int32_t nextTopicIdx; int8_t epStatus; + int32_t epSkipCnt; int32_t waitingRequest; int32_t readyRequest; SArray* clientTopics; // SArray @@ -107,6 +108,7 @@ typedef struct { // connection info int32_t vgId; int32_t vgStatus; + int64_t skipCnt; SEpSet epSet; } SMqClientVg; @@ -313,6 +315,7 @@ tmq_t* tmq_consumer_new(void* conn, tmq_conf_t* conf, char* errstr, int32_t errs pTmq->waitingRequest = 0; pTmq->readyRequest = 0; pTmq->epStatus = 0; + pTmq->epSkipCnt = 0; // set conf strcpy(pTmq->clientId, conf->clientId); strcpy(pTmq->groupId, conf->groupId); @@ -348,6 +351,8 @@ tmq_t* tmq_consumer_new1(tmq_conf_t* conf, char* errstr, int32_t errstrLen) { pTmq->epoch = 0; pTmq->waitingRequest = 0; pTmq->readyRequest = 0; + pTmq->epStatus = 0; + pTmq->epSkipCnt = 0; // set conf strcpy(pTmq->clientId, conf->clientId); strcpy(pTmq->groupId, conf->groupId); @@ -842,7 +847,7 @@ int32_t tmqPollCb(void* param, const SDataBuf* pMsg, int32_t code) { int32_t tmqEpoch = atomic_load_32(&tmq->epoch); if (msgEpoch < tmqEpoch) { /*printf("discard rsp epoch %d, current epoch %d\n", msgEpoch, tmqEpoch);*/ - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ tscWarn("discard rsp epoch %d, current epoch %d", msgEpoch, tmqEpoch); return 0; } @@ -892,26 +897,26 @@ int32_t tmqPollCb(void* param, const SDataBuf* pMsg, int32_t code) { } #endif - tscError("tmq recv poll: vg %d, req offset %ld, rsp offset %ld", pParam->pVg->vgId, pRsp->msg.reqOffset, + tscDebug("consumer %ld recv poll: vg %d, req offset %ld, rsp offset %ld", tmq->consumerId, pParam->pVg->vgId, pRsp->msg.reqOffset, pRsp->msg.rspOffset); pRsp->vg = pParam->pVg; taosWriteQitem(tmq->mqueue, pRsp); atomic_add_fetch_32(&tmq->readyRequest, 1); - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ return 0; CREATE_MSG_FAIL: if (pParam->epoch == tmq->epoch) { atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); } - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ return code; } bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { /*printf("call update ep %d\n", epoch);*/ - tscDebug("tmq update ep epoch %d to epoch %d", tmq->epoch, epoch); + tscDebug("consumer %ld update ep epoch %d to epoch %d", tmq->consumerId, tmq->epoch, epoch); bool set = false; int32_t topicNumGet = taosArrayGetSize(pRsp->topics); char vgKey[TSDB_TOPIC_FNAME_LEN + 22]; @@ -942,7 +947,7 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { for (int32_t k = 0; k < vgNumCur; k++) { SMqClientVg* pVgCur = taosArrayGet(pTopicCur->vgs, k); sprintf(vgKey, "%s:%d", topic.topicName, pVgCur->vgId); - tscDebug("epoch %d vg %d build %s\n", epoch, pVgCur->vgId, vgKey); + tscDebug("consumer %ld epoch %d vg %d build %s\n", tmq->consumerId, epoch, pVgCur->vgId, vgKey); taosHashPut(pHash, vgKey, strlen(vgKey), &pVgCur->currentOffset, sizeof(int64_t)); } break; @@ -956,18 +961,19 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { sprintf(vgKey, "%s:%d", topic.topicName, pVgEp->vgId); int64_t* pOffset = taosHashGet(pHash, vgKey, strlen(vgKey)); int64_t offset = pVgEp->offset; - tscDebug("epoch %d vg %d offset og to %ld\n", epoch, pVgEp->vgId, offset); + tscDebug("consumer %ld epoch %d vg %d offset og to %ld\n", tmq->consumerId, epoch, pVgEp->vgId, offset); if (pOffset != NULL) { offset = *pOffset; - tscDebug("epoch %d vg %d found %s\n", epoch, pVgEp->vgId, vgKey); + tscDebug("consumer %ld epoch %d vg %d found %s\n", tmq->consumerId, epoch, pVgEp->vgId, vgKey); } - tscDebug("epoch %d vg %d offset set to %ld\n", epoch, pVgEp->vgId, offset); + tscDebug("consumer %ld epoch %d vg %d offset set to %ld\n", tmq->consumerId, epoch, pVgEp->vgId, offset); SMqClientVg clientVg = { .pollCnt = 0, .currentOffset = offset, .vgId = pVgEp->vgId, .epSet = pVgEp->epSet, .vgStatus = TMQ_VG_STATUS__IDLE, + .skipCnt = 0, }; taosArrayPush(topic.vgs, &clientVg); set = true; @@ -984,9 +990,8 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { int32_t tmqAskEpCb(void* param, const SDataBuf* pMsg, int32_t code) { SMqAskEpCbParam* pParam = (SMqAskEpCbParam*)param; tmq_t* tmq = pParam->tmq; - tscDebug("consumer %ld recv ep", tmq->consumerId); if (code != 0) { - tscError("get topic endpoint error, not ready, wait:%d\n", pParam->sync); + tscError("consumer %ld get topic endpoint error, not ready, wait:%d", tmq->consumerId, pParam->sync); goto END; } @@ -995,6 +1000,7 @@ int32_t tmqAskEpCb(void* param, const SDataBuf* pMsg, int32_t code) { // Epoch will only increase when received newer epoch ep msg SMqRspHead* head = pMsg->pData; int32_t epoch = atomic_load_32(&tmq->epoch); + tscDebug("consumer %ld recv ep, msg epoch %d, current epoch %d", tmq->consumerId, head->epoch, epoch); if (head->epoch <= epoch) { goto END; } @@ -1019,7 +1025,7 @@ int32_t tmqAskEpCb(void* param, const SDataBuf* pMsg, int32_t code) { tDecodeSMqCMGetSubEpRsp(POINTER_SHIFT(pMsg->pData, sizeof(SMqRspHead)), pRsp); taosWriteQitem(tmq->mqueue, pRsp); - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ } END: @@ -1033,9 +1039,11 @@ END: int32_t tmqAskEp(tmq_t* tmq, bool sync) { int8_t epStatus = atomic_val_compare_exchange_8(&tmq->epStatus, 0, 1); if (epStatus == 1) { - tscDebug("consumer %ld skip ask ep", tmq->consumerId); - return 0; + int32_t epSkipCnt = atomic_add_fetch_32(&tmq->epSkipCnt, 1); + tscDebug("consumer %ld skip ask ep cnt %d", tmq->consumerId, epSkipCnt); + if (epSkipCnt < 40) return 0; } + atomic_store_32(&tmq->epSkipCnt, 0); int32_t tlen = sizeof(SMqCMGetSubEpReq); SMqCMGetSubEpReq* req = taosMemoryMalloc(tlen); if (req == NULL) { @@ -1221,20 +1229,29 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t blockingTime) { SMqClientVg* pVg = taosArrayGet(pTopic->vgs, j); int32_t vgStatus = atomic_val_compare_exchange_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE, TMQ_VG_STATUS__WAIT); if (vgStatus != TMQ_VG_STATUS__IDLE) { - tscDebug("consumer %ld skip vg %d", tmq->consumerId, pVg->vgId); + int64_t skipCnt = atomic_add_fetch_64(&pVg->skipCnt, 1); + tscDebug("consumer %ld skip vg %d skip cnt %ld", tmq->consumerId, pVg->vgId, skipCnt); continue; +#if 0 + if (skipCnt < 30000) { + continue; + } else { + tscDebug("consumer %ld skip vg %d skip too much reset", tmq->consumerId, pVg->vgId); + } +#endif } + atomic_store_64(&pVg->skipCnt, 0); SMqPollReq* pReq = tmqBuildConsumeReqImpl(tmq, blockingTime, pTopic, pVg); if (pReq == NULL) { atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ return -1; } SMqPollCbParam* pParam = taosMemoryMalloc(sizeof(SMqPollCbParam)); if (pParam == NULL) { taosMemoryFree(pReq); atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ return -1; } pParam->tmq = tmq; @@ -1247,7 +1264,7 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t blockingTime) { taosMemoryFree(pReq); taosMemoryFree(pParam); atomic_store_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE); - tsem_post(&tmq->rspSem); + /*tsem_post(&tmq->rspSem);*/ return -1; } @@ -1379,7 +1396,7 @@ tmq_message_t* tmq_consumer_poll(tmq_t* tmq, int64_t blocking_time) { tmqAskEp(tmq, false); tmqPollImpl(tmq, blocking_time); - tsem_wait(&tmq->rspSem); + /*tsem_wait(&tmq->rspSem);*/ rspMsg = tmqHandleAllRsp(tmq, blocking_time, false); if (rspMsg) { diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index 211163ce35..ae9a4198a4 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -216,7 +216,7 @@ static int32_t mndProcessGetSubEpReq(SNodeMsg *pMsg) { // TODO int32_t hbStatus = atomic_load_32(&pConsumer->hbStatus); - mTrace("try to get sub ep, old val: %d", hbStatus); + mTrace("consumer %ld epoch(%d) try to get sub ep, server epoch %d, old val: %d", consumerId, epoch, pConsumer->epoch, hbStatus); atomic_store_32(&pConsumer->hbStatus, 0); /*SSdbRaw *pConsumerRaw = mndConsumerActionEncode(pConsumer);*/ /*sdbSetRawStatus(pConsumerRaw, SDB_STATUS_READY);*/ diff --git a/source/util/src/tlog.c b/source/util/src/tlog.c index ef15f44f8f..f6fec47196 100644 --- a/source/util/src/tlog.c +++ b/source/util/src/tlog.c @@ -214,6 +214,7 @@ static void *taosThreadToOpenNewFile(void *param) { tsLogObj.logHandle->pFile = pFile; tsLogObj.lines = 0; tsLogObj.openInProgress = 0; + taosSsleep(2); taosCloseLogByFd(pOldFile); uInfo(" new log file:%d is opened", tsLogObj.flag); @@ -347,12 +348,13 @@ static int32_t taosOpenLogFile(char *fn, int32_t maxLines, int32_t maxFileNum) { taosThreadMutexInit(&tsLogObj.logMutex, NULL); taosUmaskFile(0); - tsLogObj.logHandle->pFile = taosOpenFile(fileName, TD_FILE_CTEATE | TD_FILE_WRITE); + TdFilePtr pFile = taosOpenFile(fileName, TD_FILE_CTEATE | TD_FILE_WRITE); - if (tsLogObj.logHandle->pFile == NULL) { + if (pFile == NULL) { printf("\nfailed to open log file:%s, reason:%s\n", fileName, strerror(errno)); return -1; } + tsLogObj.logHandle->pFile = pFile; taosLockLogFile(tsLogObj.logHandle->pFile); // only an estimate for number of lines @@ -746,4 +748,4 @@ void taosSetAllDebugFlag(int32_t flag) { fsDebugFlag = flag; uInfo("all debug flag are set to %d", flag); -} \ No newline at end of file +} From e4404dc8e72bb8574ac83d61714e42ba029463d2 Mon Sep 17 00:00:00 2001 From: Liu Jicong Date: Sat, 2 Apr 2022 21:02:58 +0800 Subject: [PATCH 2/3] add more log --- source/client/src/tmq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/source/client/src/tmq.c b/source/client/src/tmq.c index 81219e1c36..6c0bdd4bab 100644 --- a/source/client/src/tmq.c +++ b/source/client/src/tmq.c @@ -140,6 +140,7 @@ typedef struct { tmq_t* tmq; SMqClientVg* pVg; int32_t epoch; + int32_t vgId; tsem_t rspSem; tmq_message_t** msg; int32_t sync; @@ -839,7 +840,7 @@ int32_t tmqPollCb(void* param, const SDataBuf* pMsg, int32_t code) { SMqClientVg* pVg = pParam->pVg; tmq_t* tmq = pParam->tmq; if (code != 0) { - tscWarn("msg discard, code:%x", code); + tscWarn("msg discard from vg %d, epoch %d, code:%x", pParam->vgId, pParam->epoch, code); goto CREATE_MSG_FAIL; } @@ -848,12 +849,12 @@ int32_t tmqPollCb(void* param, const SDataBuf* pMsg, int32_t code) { if (msgEpoch < tmqEpoch) { /*printf("discard rsp epoch %d, current epoch %d\n", msgEpoch, tmqEpoch);*/ /*tsem_post(&tmq->rspSem);*/ - tscWarn("discard rsp epoch %d, current epoch %d", msgEpoch, tmqEpoch); + tscWarn("discard rsp from vg %d, epoch %d, current epoch %d", pParam->vgId, msgEpoch, tmqEpoch); return 0; } if (msgEpoch != tmqEpoch) { - tscWarn("mismatch rsp epoch %d, current epoch %d", msgEpoch, tmqEpoch); + tscWarn("mismatch rsp from vg %d, epoch %d, current epoch %d", pParam->vgId, msgEpoch, tmqEpoch); } else { atomic_sub_fetch_32(&tmq->waitingRequest, 1); } @@ -1041,7 +1042,7 @@ int32_t tmqAskEp(tmq_t* tmq, bool sync) { if (epStatus == 1) { int32_t epSkipCnt = atomic_add_fetch_32(&tmq->epSkipCnt, 1); tscDebug("consumer %ld skip ask ep cnt %d", tmq->consumerId, epSkipCnt); - if (epSkipCnt < 40) return 0; + if (epSkipCnt < 5000) return 0; } atomic_store_32(&tmq->epSkipCnt, 0); int32_t tlen = sizeof(SMqCMGetSubEpReq); @@ -1256,6 +1257,7 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t blockingTime) { } pParam->tmq = tmq; pParam->pVg = pVg; + pParam->vgId = pVg->vgId; pParam->epoch = tmq->epoch; pParam->sync = 0; @@ -1282,7 +1284,7 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t blockingTime) { int64_t transporterId = 0; /*printf("send poll\n");*/ atomic_add_fetch_32(&tmq->waitingRequest, 1); - tscDebug("consumer %ld send poll: vg %d, req offset %ld", tmq->consumerId, pVg->vgId, pVg->currentOffset); + tscDebug("consumer %ld send poll: vg %d, epoch %d, req offset %ld", tmq->consumerId, pVg->vgId, tmq->epoch, pVg->currentOffset); /*printf("send vg %d %ld\n", pVg->vgId, pVg->currentOffset);*/ asyncSendMsgToServer(tmq->pTscObj->pAppInfo->pTransporter, &pVg->epSet, &transporterId, sendInfo); pVg->pollCnt++; From 798c3a9f825239103f572b5ca0e2bbe07a59b75f Mon Sep 17 00:00:00 2001 From: Liu Jicong Date: Sat, 2 Apr 2022 23:03:12 +0800 Subject: [PATCH 3/3] add more log --- source/client/src/tmq.c | 12 ++++++----- source/dnode/mnode/impl/src/mndConsumer.c | 4 +--- source/dnode/mnode/impl/src/mndScheduler.c | 3 +++ source/dnode/mnode/impl/src/mndSubscribe.c | 24 ++++++++++++++++------ 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/source/client/src/tmq.c b/source/client/src/tmq.c index 6c0bdd4bab..602ecdf6ab 100644 --- a/source/client/src/tmq.c +++ b/source/client/src/tmq.c @@ -917,10 +917,10 @@ CREATE_MSG_FAIL: bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { /*printf("call update ep %d\n", epoch);*/ - tscDebug("consumer %ld update ep epoch %d to epoch %d", tmq->consumerId, tmq->epoch, epoch); bool set = false; int32_t topicNumGet = taosArrayGetSize(pRsp->topics); char vgKey[TSDB_TOPIC_FNAME_LEN + 22]; + tscDebug("consumer %ld update ep epoch %d to epoch %d, topic num: %d", tmq->consumerId, tmq->epoch, epoch, topicNumGet); SArray* newTopics = taosArrayInit(topicNumGet, sizeof(SMqClientTopic)); if (newTopics == NULL) { return false; @@ -938,17 +938,19 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { taosHashClear(pHash); topic.topicName = strdup(pTopicEp->topic); + tscDebug("consumer %ld update topic: %s", tmq->consumerId, topic.topicName); int32_t topicNumCur = taosArrayGetSize(tmq->clientTopics); for (int32_t j = 0; j < topicNumCur; j++) { // find old topic SMqClientTopic* pTopicCur = taosArrayGet(tmq->clientTopics, j); if (pTopicCur->vgs && strcmp(pTopicCur->topicName, pTopicEp->topic) == 0) { int32_t vgNumCur = taosArrayGetSize(pTopicCur->vgs); + tscDebug("consumer %ld new vg num: %d", tmq->consumerId, vgNumCur); if (vgNumCur == 0) break; for (int32_t k = 0; k < vgNumCur; k++) { SMqClientVg* pVgCur = taosArrayGet(pTopicCur->vgs, k); sprintf(vgKey, "%s:%d", topic.topicName, pVgCur->vgId); - tscDebug("consumer %ld epoch %d vg %d build %s\n", tmq->consumerId, epoch, pVgCur->vgId, vgKey); + tscDebug("consumer %ld epoch %d vg %d build %s", tmq->consumerId, epoch, pVgCur->vgId, vgKey); taosHashPut(pHash, vgKey, strlen(vgKey), &pVgCur->currentOffset, sizeof(int64_t)); } break; @@ -962,10 +964,10 @@ bool tmqUpdateEp(tmq_t* tmq, int32_t epoch, SMqCMGetSubEpRsp* pRsp) { sprintf(vgKey, "%s:%d", topic.topicName, pVgEp->vgId); int64_t* pOffset = taosHashGet(pHash, vgKey, strlen(vgKey)); int64_t offset = pVgEp->offset; - tscDebug("consumer %ld epoch %d vg %d offset og to %ld\n", tmq->consumerId, epoch, pVgEp->vgId, offset); + tscDebug("consumer %ld epoch %d vg %d offset og to %ld", tmq->consumerId, epoch, pVgEp->vgId, offset); if (pOffset != NULL) { offset = *pOffset; - tscDebug("consumer %ld epoch %d vg %d found %s\n", tmq->consumerId, epoch, pVgEp->vgId, vgKey); + tscDebug("consumer %ld epoch %d vg %d found %s", tmq->consumerId, epoch, pVgEp->vgId, vgKey); } tscDebug("consumer %ld epoch %d vg %d offset set to %ld\n", tmq->consumerId, epoch, pVgEp->vgId, offset); SMqClientVg clientVg = { @@ -1231,7 +1233,7 @@ int32_t tmqPollImpl(tmq_t* tmq, int64_t blockingTime) { int32_t vgStatus = atomic_val_compare_exchange_32(&pVg->vgStatus, TMQ_VG_STATUS__IDLE, TMQ_VG_STATUS__WAIT); if (vgStatus != TMQ_VG_STATUS__IDLE) { int64_t skipCnt = atomic_add_fetch_64(&pVg->skipCnt, 1); - tscDebug("consumer %ld skip vg %d skip cnt %ld", tmq->consumerId, pVg->vgId, skipCnt); + tscDebug("consumer %ld epoch %d skip vg %d skip cnt %ld", tmq->consumerId, tmq->epoch, pVg->vgId, skipCnt); continue; #if 0 if (skipCnt < 30000) { diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index c4c93b7fc9..6080ec7710 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -160,10 +160,8 @@ static int32_t mndConsumerActionDelete(SSdb *pSdb, SMqConsumerObj *pConsumer) { static int32_t mndConsumerActionUpdate(SSdb *pSdb, SMqConsumerObj *pOldConsumer, SMqConsumerObj *pNewConsumer) { mTrace("consumer:%" PRId64 ", perform update action", pOldConsumer->consumerId); - pOldConsumer->epoch++; - - // TODO handle update /*taosWLockLatch(&pOldConsumer->lock);*/ + atomic_add_fetch_32(&pOldConsumer->epoch, 1); /*taosWUnLockLatch(&pOldConsumer->lock);*/ return 0; diff --git a/source/dnode/mnode/impl/src/mndScheduler.c b/source/dnode/mnode/impl/src/mndScheduler.c index 4562d9e5d3..df7946a0c1 100644 --- a/source/dnode/mnode/impl/src/mndScheduler.c +++ b/source/dnode/mnode/impl/src/mndScheduler.c @@ -471,6 +471,9 @@ int32_t mndSchedInitSubEp(SMnode* pMnode, const SMqTopicObj* pTopic, SMqSubscrib consumerEp.consumerId = -1; consumerEp.epSet = plan->execNode.epSet; consumerEp.vgId = plan->execNode.nodeId; + + mDebug("init subscribption %s, assign vg: %d", pSub->key, consumerEp.vgId); + int32_t msgLen; if (qSubPlanToString(plan, &consumerEp.qmsg, &msgLen) < 0) { sdbRelease(pSdb, pVgroup); diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index ae9a4198a4..57d98396ea 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -212,19 +212,24 @@ static int32_t mndProcessGetSubEpReq(SNodeMsg *pMsg) { terrno = TSDB_CODE_MND_CONSUMER_NOT_EXIST; return -1; } + //TODO add lock ASSERT(strcmp(pReq->cgroup, pConsumer->cgroup) == 0); + int32_t serverEpoch = pConsumer->epoch; // TODO int32_t hbStatus = atomic_load_32(&pConsumer->hbStatus); - mTrace("consumer %ld epoch(%d) try to get sub ep, server epoch %d, old val: %d", consumerId, epoch, pConsumer->epoch, hbStatus); + mDebug("consumer %ld epoch(%d) try to get sub ep, server epoch %d, old val: %d", consumerId, epoch, serverEpoch, hbStatus); atomic_store_32(&pConsumer->hbStatus, 0); /*SSdbRaw *pConsumerRaw = mndConsumerActionEncode(pConsumer);*/ /*sdbSetRawStatus(pConsumerRaw, SDB_STATUS_READY);*/ /*sdbWrite(pMnode->pSdb, pConsumerRaw);*/ strcpy(rsp.cgroup, pReq->cgroup); - if (epoch != pConsumer->epoch) { - mInfo("send new assignment to consumer, consumer epoch %d, server epoch %d", epoch, pConsumer->epoch); + if (epoch != serverEpoch) { + mInfo("send new assignment to consumer %ld, consumer epoch %d, server epoch %d", pConsumer->consumerId, epoch, serverEpoch); + mDebug("consumer %ld try r lock", consumerId); + taosRLockLatch(&pConsumer->lock); + mDebug("consumer %ld r locked", consumerId); SArray *pTopics = pConsumer->currentTopics; int32_t sz = taosArrayGetSize(pTopics); rsp.topics = taosArrayInit(sz, sizeof(SMqSubTopicEp)); @@ -238,7 +243,7 @@ static int32_t mndProcessGetSubEpReq(SNodeMsg *pMsg) { SMqSubConsumer *pSubConsumer = taosArrayGet(pSub->consumers, j); if (consumerId == pSubConsumer->consumerId) { int32_t vgsz = taosArrayGetSize(pSubConsumer->vgInfo); - mInfo("topic %s has %d vg", topicName, pConsumer->epoch); + mInfo("topic %s has %d vg", topicName, serverEpoch); SMqSubTopicEp topicEp; strcpy(topicEp.topic, topicName); topicEp.vgs = taosArrayInit(vgsz, sizeof(SMqSubVgEp)); @@ -264,6 +269,8 @@ static int32_t mndProcessGetSubEpReq(SNodeMsg *pMsg) { } mndReleaseSubscribe(pMnode, pSub); } + taosRUnLockLatch(&pConsumer->lock); + mDebug("consumer %ld r unlock", consumerId); } int32_t tlen = sizeof(SMqRspHead) + tEncodeSMqCMGetSubEpRsp(NULL, &rsp); void *buf = rpcMallocCont(tlen); @@ -272,7 +279,7 @@ static int32_t mndProcessGetSubEpReq(SNodeMsg *pMsg) { return -1; } ((SMqRspHead *)buf)->mqMsgType = TMQ_MSG_TYPE__EP_RSP; - ((SMqRspHead *)buf)->epoch = pConsumer->epoch; + ((SMqRspHead *)buf)->epoch = serverEpoch; ((SMqRspHead *)buf)->consumerId = pConsumer->consumerId; void *abuf = POINTER_SHIFT(buf, sizeof(SMqRspHead)); @@ -395,7 +402,7 @@ static int32_t mndProcessDoRebalanceMsg(SNodeMsg *pMsg) { SMqSubscribeObj *pSub = mndAcquireSubscribeByKey(pMnode, pRebSub->key); taosMemoryFreeClear(pRebSub->key); - mInfo("mq rebalance subscription: %s", pSub->key); + mInfo("mq rebalance subscription: %s, vgNum: %d, unassignedVg: %d", pSub->key, pSub->vgNum, (int32_t)taosArrayGetSize(pSub->unassignedVg)); // remove lost consumer for (int32_t i = 0; i < taosArrayGetSize(pRebSub->lostConsumers); i++) { @@ -442,6 +449,9 @@ static int32_t mndProcessDoRebalanceMsg(SNodeMsg *pMsg) { } SMqConsumerObj *pRebConsumer = mndAcquireConsumer(pMnode, pSubConsumer->consumerId); + mDebug("consumer %ld try w lock", pRebConsumer->consumerId); + taosWLockLatch(&pRebConsumer->lock); + mDebug("consumer %ld w locked", pRebConsumer->consumerId); int32_t status = atomic_load_32(&pRebConsumer->status); if (vgThisConsumerAfterRb != vgThisConsumerBeforeRb || (vgThisConsumerAfterRb != 0 && status != MQ_CONSUMER_STATUS__ACTIVE) || @@ -462,6 +472,8 @@ static int32_t mndProcessDoRebalanceMsg(SNodeMsg *pMsg) { sdbSetRawStatus(pConsumerRaw, SDB_STATUS_READY); mndTransAppendCommitlog(pTrans, pConsumerRaw); } + taosWUnLockLatch(&pRebConsumer->lock); + mDebug("consumer %ld w unlock", pRebConsumer->consumerId); mndReleaseConsumer(pMnode, pRebConsumer); }