From 9bd195d94d2aa6dcdc15cd4f06be19c9698d2043 Mon Sep 17 00:00:00 2001 From: dapan1121 <89396746@qq.com> Date: Sat, 4 Jun 2022 22:33:30 +0800 Subject: [PATCH] reschedule timeout task --- include/common/tmsg.h | 1 + include/libs/scheduler/scheduler.h | 2 + source/client/inc/clientInt.h | 10 -- source/client/src/clientEnv.c | 6 +- source/client/src/clientHb.c | 101 +++---------- source/client/src/clientImpl.c | 14 -- source/client/src/clientSml.c | 2 + source/libs/catalog/src/ctgDbg.c | 15 ++ source/libs/parser/src/parInsert.c | 2 + source/libs/qworker/inc/qwInt.h | 5 +- source/libs/qworker/src/qwUtil.c | 6 + source/libs/qworker/src/qworker.c | 21 ++- source/libs/scheduler/inc/schedulerInt.h | 4 +- source/libs/scheduler/src/schJob.c | 31 +--- source/libs/scheduler/src/schRemote.c | 86 +---------- source/libs/scheduler/src/schUtil.c | 177 ++++++++++++++++++++++- source/libs/scheduler/src/scheduler.c | 10 ++ 17 files changed, 268 insertions(+), 225 deletions(-) diff --git a/include/common/tmsg.h b/include/common/tmsg.h index ee858ad506..dfa641c83e 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -1982,6 +1982,7 @@ typedef struct { typedef struct { SClientHbKey connKey; + int64_t clusterId; SQueryHbReqBasic* query; SHashObj* info; // hash } SClientHbReq; diff --git a/include/libs/scheduler/scheduler.h b/include/libs/scheduler/scheduler.h index 5cb6dbd3bc..22706f0953 100644 --- a/include/libs/scheduler/scheduler.h +++ b/include/libs/scheduler/scheduler.h @@ -104,6 +104,8 @@ void schedulerAsyncFetchRows(int64_t job, schedulerFetchCallback fp, void* param int32_t schedulerGetTasksStatus(int64_t job, SArray *pSub); +void schedulerStopQueryHb(void *pTrans); + /** * Cancel query job diff --git a/source/client/inc/clientInt.h b/source/client/inc/clientInt.h index c7b831893e..0000366880 100644 --- a/source/client/inc/clientInt.h +++ b/source/client/inc/clientInt.h @@ -58,11 +58,6 @@ enum { typedef struct SAppInstInfo SAppInstInfo; -typedef struct { - void* param; - SClientHbReq* req; -} SHbConnInfo; - typedef struct { char* key; // statistics @@ -72,11 +67,8 @@ typedef struct { int64_t startTime; // ctl SRWLatch lock; // lock is used in serialization - // connection SAppInstInfo* pAppInstInfo; - // info SHashObj* activeInfo; // hash - SHashObj* connInfo; // hash } SAppHbMgr; typedef int32_t (*FHbRspHandle)(SAppHbMgr* pAppHbMgr, SClientHbRsp* pRsp); @@ -326,8 +318,6 @@ void appHbMgrCleanup(void); int hbRegisterConn(SAppHbMgr* pAppHbMgr, int64_t tscRefId, int64_t clusterId, int8_t connType); void hbDeregisterConn(SAppHbMgr* pAppHbMgr, SClientHbKey connKey); -int hbAddConnInfo(SAppHbMgr* pAppHbMgr, SClientHbKey connKey, void* key, void* value, int32_t keyLen, int32_t valueLen); - // --- mq void hbMgrInitMqHbRspHandle(); diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index 9afb989d27..336924f934 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -130,8 +130,12 @@ void destroyTscObj(void *pObj) { SClientHbKey connKey = {.tscRid = pTscObj->id, .connType = pTscObj->connType}; hbDeregisterConn(pTscObj->pAppInfo->pAppHbMgr, connKey); - atomic_sub_fetch_64(&pTscObj->pAppInfo->numOfConns, 1); + int64_t connNum = atomic_sub_fetch_64(&pTscObj->pAppInfo->numOfConns, 1); closeAllRequests(pTscObj->pRequests); + schedulerStopQueryHb(pTscObj->pAppInfo->pTransporter); + if (0 == connNum) { + closeTransporter(pTscObj); + } tscDebug("connObj 0x%" PRIx64 " destroyed, totalConn:%" PRId64, pTscObj->id, pTscObj->pAppInfo->numOfConns); taosThreadMutexDestroy(&pTscObj->mutex); taosMemoryFreeClear(pTscObj); diff --git a/source/client/src/clientHb.c b/source/client/src/clientHb.c index 09c3d269c7..a4c109ee17 100644 --- a/source/client/src/clientHb.c +++ b/source/client/src/clientHb.c @@ -129,9 +129,9 @@ static int32_t hbProcessStbInfoRsp(void *value, int32_t valueLen, struct SCatalo } static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &pRsp->connKey, sizeof(SClientHbKey)); - if (NULL == info) { - tscWarn("fail to get connInfo, may be dropped, refId:%" PRIx64 ", type:%d", pRsp->connKey.tscRid, + SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &pRsp->connKey, sizeof(SClientHbKey)); + if (NULL == pReq) { + tscWarn("pReq to get activeInfo, may be dropped, refId:%" PRIx64 ", type:%d", pRsp->connKey.tscRid, pRsp->connKey.connType); return TSDB_CODE_SUCCESS; } @@ -181,12 +181,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -199,12 +198,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -217,12 +215,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -547,13 +544,10 @@ SClientHbBatchReq *hbGatherAllInfo(SAppHbMgr *pAppHbMgr) { pOneReq = taosArrayPush(pBatchReq->reqs, pOneReq); - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &pOneReq->connKey, sizeof(SClientHbKey)); - if (info) { - code = (*clientHbMgr.reqHandle[pOneReq->connKey.connType])(&pOneReq->connKey, info->param, pOneReq); - if (code) { - pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); - continue; - } + code = (*clientHbMgr.reqHandle[pOneReq->connKey.connType])(&pOneReq->connKey, &pOneReq->clusterId, pOneReq); + if (code) { + pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); + continue; } //hbClearClientHbReq(pOneReq); @@ -569,23 +563,6 @@ SClientHbBatchReq *hbGatherAllInfo(SAppHbMgr *pAppHbMgr) { return pBatchReq; } -void hbClearReqInfo(SAppHbMgr *pAppHbMgr) { - void *pIter = taosHashIterate(pAppHbMgr->activeInfo, NULL); - while (pIter != NULL) { - SClientHbReq *pOneReq = pIter; - - tFreeReqKvHash(pOneReq->info); - taosHashClear(pOneReq->info); - - if (pOneReq->query) { - taosArrayDestroy(pOneReq->query->queryDesc); - taosMemoryFreeClear(pOneReq->query); - } - - pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); - } -} - void hbThreadFuncUnexpectedStopped(void) { atomic_store_8(&clientHbMgr.threadStop, 2); } @@ -715,14 +692,6 @@ SAppHbMgr *appHbMgrInit(SAppInstInfo *pAppInstInfo, char *key) { } taosHashSetFreeFp(pAppHbMgr->activeInfo, tFreeClientHbReq); - // init getInfoFunc - pAppHbMgr->connInfo = taosHashInit(64, hbKeyHashFunc, 1, HASH_ENTRY_LOCK); - - if (pAppHbMgr->connInfo == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - taosMemoryFree(pAppHbMgr); - return NULL; - } taosThreadMutexLock(&clientHbMgr.lock); taosArrayPush(clientHbMgr.appHbMgrs, &pAppHbMgr); @@ -745,15 +714,6 @@ void appHbMgrCleanup(void) { taosHashCleanup(pTarget->activeInfo); pTarget->activeInfo = NULL; - pIter = taosHashIterate(pTarget->connInfo, NULL); - while (pIter != NULL) { - SHbConnInfo *info = pIter; - taosMemoryFree(info->param); - pIter = taosHashIterate(pTarget->connInfo, pIter); - } - taosHashCleanup(pTarget->connInfo); - pTarget->connInfo = NULL; - taosMemoryFree(pTarget->key); taosMemoryFree(pTarget); } @@ -791,7 +751,7 @@ void hbMgrCleanUp() { clientHbMgr.appHbMgrs = NULL; } -int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, SHbConnInfo *info) { +int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, int64_t clusterId) { // init hash in activeinfo void *data = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); if (data != NULL) { @@ -799,17 +759,11 @@ int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, SHbConnInfo * } SClientHbReq hbReq = {0}; hbReq.connKey = connKey; + hbReq.clusterId = clusterId; //hbReq.info = taosHashInit(64, hbKeyHashFunc, 1, HASH_ENTRY_LOCK); taosHashPut(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey), &hbReq, sizeof(SClientHbReq)); - // init hash - if (info != NULL) { - SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); - info->req = pReq; - taosHashPut(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey), info, sizeof(SHbConnInfo)); - } - atomic_add_fetch_32(&pAppHbMgr->connKeyCnt, 1); return 0; } @@ -819,15 +773,10 @@ int hbRegisterConn(SAppHbMgr *pAppHbMgr, int64_t tscRefId, int64_t clusterId, in .tscRid = tscRefId, .connType = connType, }; - SHbConnInfo info = {0}; switch (connType) { case CONN_TYPE__QUERY: { - int64_t *pClusterId = taosMemoryMalloc(sizeof(int64_t)); - *pClusterId = clusterId; - - info.param = pClusterId; - return hbRegisterConnImpl(pAppHbMgr, connKey, &info); + return hbRegisterConnImpl(pAppHbMgr, connKey, clusterId); } case CONN_TYPE__TMQ: { return 0; @@ -844,26 +793,10 @@ void hbDeregisterConn(SAppHbMgr *pAppHbMgr, SClientHbKey connKey) { taosHashRemove(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); } - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey)); - if (info) { - taosMemoryFree(info->param); - taosHashRemove(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey)); - } - - if (NULL == pReq || NULL == info) { + if (NULL == pReq) { return; } atomic_sub_fetch_32(&pAppHbMgr->connKeyCnt, 1); } -int hbAddConnInfo(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, void *key, void *value, int32_t keyLen, - int32_t valueLen) { - // find req by connection id - SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); - ASSERT(pReq != NULL); - - taosHashPut(pReq->info, key, keyLen, value, valueLen); - - return 0; -} diff --git a/source/client/src/clientImpl.c b/source/client/src/clientImpl.c index 6694954fa2..4c68edaff0 100644 --- a/source/client/src/clientImpl.c +++ b/source/client/src/clientImpl.c @@ -741,20 +741,6 @@ SRequestObj* execQuery(STscObj* pTscObj, const char* sql, int sqlLen) { do { destroyRequest(pRequest); pRequest = launchQuery(pTscObj, sql, sqlLen); - if (*sql == 'y') { - SCatalog *pCatalog = NULL; - code = catalogGetHandle(pTscObj->pAppInfo->clusterId, &pCatalog); - SEpSet epset = getEpSet_s(&pTscObj->pAppInfo->mgmtEp); - ctgdLaunchAsyncCall(pCatalog, pTscObj->pAppInfo->pTransporter, &epset, pRequest->requestId, false); - break; - } else if (*sql == 'z') { - SCatalog *pCatalog = NULL; - code = catalogGetHandle(pTscObj->pAppInfo->clusterId, &pCatalog); - SEpSet epset = getEpSet_s(&pTscObj->pAppInfo->mgmtEp); - ctgdLaunchAsyncCall(pCatalog, pTscObj->pAppInfo->pTransporter, &epset, pRequest->requestId, false); - break; - } - if (pRequest == NULL || TSDB_CODE_SUCCESS == pRequest->code || !NEED_CLIENT_HANDLE_ERROR(pRequest->code)) { break; } diff --git a/source/client/src/clientSml.c b/source/client/src/clientSml.c index c559ac58f8..5643af746d 100644 --- a/source/client/src/clientSml.c +++ b/source/client/src/clientSml.c @@ -2332,6 +2332,8 @@ static int32_t isSchemalessDb(SSmlHandle* info){ smlBuildInvalidDataMsg(&info->msgBuf, "catalogGetDBCfg error, code:", tstrerror(code)); return code; } + taosArrayDestroy(pInfo.pRetensions); + if (!pInfo.schemaless){ info->pRequest->code = TSDB_CODE_SML_INVALID_DB_CONF; smlBuildInvalidDataMsg(&info->msgBuf, "can not insert into schemaless db:", dbFname); diff --git a/source/libs/catalog/src/ctgDbg.c b/source/libs/catalog/src/ctgDbg.c index cb18e45638..42baa530ce 100644 --- a/source/libs/catalog/src/ctgDbg.c +++ b/source/libs/catalog/src/ctgDbg.c @@ -132,6 +132,21 @@ void ctgdUserCallback(SMetaData* pResult, void* param, int32_t code) { } } + +/* +prepare SQL: +create database db1; +use db1; +create stable st1 (ts timestamp, f1 int) tags(t1 int); +create table tb1 using st1 tags(1); +insert into tb1 values (now, 1); +create qnode on dnode 1; +create user user1 pass "abc"; +create database db2; +grant write on db2.* to user1; +create function udf1 as '/tmp/libudf1.so' outputtype int; +create aggregate function udf2 as '/tmp/libudf2.so' outputtype int; +*/ int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps, uint64_t reqId, bool forceUpdate) { int32_t code = 0; SCatalogReq req = {0}; diff --git a/source/libs/parser/src/parInsert.c b/source/libs/parser/src/parInsert.c index 422c480397..01a6761b33 100644 --- a/source/libs/parser/src/parInsert.c +++ b/source/libs/parser/src/parInsert.c @@ -1424,6 +1424,8 @@ int32_t isNotSchemalessDb(SParseContext* pContext, char *dbName){ parserError("catalogGetDBCfg error, code:%s, dbFName:%s", tstrerror(code), dbFname); return code; } + taosArrayDestroy(pInfo.pRetensions); + if (pInfo.schemaless){ parserError("can not insert into schemaless db:%s", dbFname); return TSDB_CODE_SML_INVALID_DB_CONF; diff --git a/source/libs/qworker/inc/qwInt.h b/source/libs/qworker/inc/qwInt.h index 3804c114cb..082db6428f 100644 --- a/source/libs/qworker/inc/qwInt.h +++ b/source/libs/qworker/inc/qwInt.h @@ -34,6 +34,7 @@ extern "C" { #define QW_DEFAULT_SCH_TASK_NUMBER 10000 #define QW_DEFAULT_SHORT_RUN_TIMES 2 #define QW_DEFAULT_HEARTBEAT_MSEC 5000 +#define QW_SCH_TIMEOUT_MSEC 180000 enum { QW_PHASE_PRE_QUERY = 1, @@ -137,7 +138,7 @@ typedef struct SQWTaskCtx { } SQWTaskCtx; typedef struct SQWSchStatus { - int32_t lastAccessTs; // timestamp in second + int64_t hbBrokenTs; // timestamp in msecond SRWLatch hbConnLock; SRpcHandleInfo hbConnInfo; SQueryNodeEpId hbEpId; @@ -354,6 +355,8 @@ int32_t qwOpenRef(void); void qwSetHbParam(int64_t refId, SQWHbParam **pParam); int32_t qwUpdateTimeInQueue(SQWorker *mgmt, int64_t ts, EQueueType type); int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type); +void qwClearExpiredSch(SArray* pExpiredSch); +int32_t qwAcquireScheduler(SQWorker *mgmt, uint64_t sId, int32_t rwType, SQWSchStatus **sch); void qwDbgDumpMgmtInfo(SQWorker *mgmt); int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore); diff --git a/source/libs/qworker/src/qwUtil.c b/source/libs/qworker/src/qwUtil.c index 3d0204e355..8bfb80f061 100644 --- a/source/libs/qworker/src/qwUtil.c +++ b/source/libs/qworker/src/qwUtil.c @@ -535,3 +535,9 @@ int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type) { return -1; } + +void qwClearExpiredSch(SArray* pExpiredSch) { + +} + + diff --git a/source/libs/qworker/src/qworker.c b/source/libs/qworker/src/qworker.c index 8c0366fa0b..44a8fdf7f4 100644 --- a/source/libs/qworker/src/qworker.c +++ b/source/libs/qworker/src/qworker.c @@ -21,10 +21,12 @@ int32_t qwProcessHbLinkBroken(SQWorker *mgmt, SQWMsg *qwMsg, SSchedulerHbReq *re SSchedulerHbRsp rsp = {0}; SQWSchStatus *sch = NULL; - QW_ERR_RET(qwAcquireAddScheduler(mgmt, req->sId, QW_READ, &sch)); + QW_ERR_RET(qwAcquireScheduler(mgmt, req->sId, QW_READ, &sch)); QW_LOCK(QW_WRITE, &sch->hbConnLock); + sch->hbBrokenTs = taosGetTimestampMs(); + if (qwMsg->connInfo.handle == sch->hbConnInfo.handle) { tmsgReleaseHandle(&sch->hbConnInfo, TAOS_CONN_SERVER); sch->hbConnInfo.handle = NULL; @@ -794,6 +796,7 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { SQWSchStatus *sch = NULL; int32_t taskNum = 0; SQWHbInfo *rspList = NULL; + SArray *pExpiredSch = NULL; int32_t code = 0; qwDbgDumpMgmtInfo(mgmt); @@ -809,8 +812,11 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { } rspList = taosMemoryCalloc(schNum, sizeof(SQWHbInfo)); - if (NULL == rspList) { + pExpiredSch = taosArrayInit(schNum, sizeof(uint64_t)); + if (NULL == rspList || NULL == pExpiredSch) { QW_UNLOCK(QW_READ, &mgmt->schLock); + taosMemoryFree(rspList); + taosArrayDestroy(pExpiredSch); QW_ELOG("calloc %d SQWHbInfo failed", schNum); taosTmrReset(qwProcessHbTimerEvent, QW_DEFAULT_HEARTBEAT_MSEC, param, mgmt->timer, &mgmt->hbTimer); qwRelease(refId); @@ -820,6 +826,7 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { void *key = NULL; size_t keyLen = 0; int32_t i = 0; + int64_t currentMs = taosGetTimestampMs(); void *pIter = taosHashIterate(mgmt->schHash, NULL); while (pIter) { @@ -827,6 +834,11 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { if (NULL == sch->hbConnInfo.handle) { uint64_t *sId = taosHashGetKey(pIter, NULL); QW_TLOG("cancel send hb to sch %" PRIx64 " cause of no connection handle", *sId); + + if (sch->hbBrokenTs > 0 && ((currentMs - sch->hbBrokenTs) > QW_SCH_TIMEOUT_MSEC) && taosHashGetSize(sch->tasksHash) <= 0) { + taosArrayPush(pExpiredSch, sId); + } + pIter = taosHashIterate(mgmt->schHash, pIter); continue; } @@ -852,7 +864,12 @@ _return: tFreeSSchedulerHbRsp(&rspList[j].rsp); } + if (taosArrayGetSize(pExpiredSch) > 0) { + qwClearExpiredSch(pExpiredSch); + } + taosMemoryFreeClear(rspList); + taosArrayDestroy(pExpiredSch); taosTmrReset(qwProcessHbTimerEvent, QW_DEFAULT_HEARTBEAT_MSEC, param, mgmt->timer, &mgmt->hbTimer); qwRelease(refId); diff --git a/source/libs/scheduler/inc/schedulerInt.h b/source/libs/scheduler/inc/schedulerInt.h index c8fee1e532..d961780ae5 100644 --- a/source/libs/scheduler/inc/schedulerInt.h +++ b/source/libs/scheduler/inc/schedulerInt.h @@ -102,11 +102,11 @@ typedef struct SSchedulerMgmt { uint64_t taskId; // sequential taksId uint64_t sId; // schedulerId SSchedulerCfg cfg; - SRWLatch lock; bool exit; int32_t jobRef; int32_t jobNum; SSchStat stat; + SRWLatch hbLock; SHashObj *hbConnections; } SSchedulerMgmt; @@ -320,6 +320,8 @@ extern SSchedulerMgmt schMgmt; #define SCH_UNLOCK(type, _lock) (SCH_READ == (type) ? taosRUnLockLatch(_lock) : taosWUnLockLatch(_lock)) +void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask); +void schCleanClusterHb(void* pTrans); int32_t schLaunchTask(SSchJob *job, SSchTask *task); int32_t schBuildAndSendMsg(SSchJob *job, SSchTask *task, SQueryNodeAddr *addr, int32_t msgType); SSchJob *schAcquireJob(int64_t refId); diff --git a/source/libs/scheduler/src/schJob.c b/source/libs/scheduler/src/schJob.c index be7b930d79..ca90d2fe34 100644 --- a/source/libs/scheduler/src/schJob.c +++ b/source/libs/scheduler/src/schJob.c @@ -126,30 +126,6 @@ _return: SCH_RET(code); } -void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask) { - if (!pTask->registerdHb) { - return; - } - - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - - SEp* pEp = SCH_GET_CUR_EP(addr); - strcpy(epId.ep.fqdn, pEp->fqdn); - epId.ep.port = pEp->port; - - SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - SCH_TASK_ELOG("nodeId %d fqdn %s port %d not in hb connections", epId.nodeId, epId.ep.fqdn, epId.ep.port); - return; - } - - atomic_sub_fetch_64(&hb->taskNum, 1); - - pTask->registerdHb = false; -} void schFreeTask(SSchJob *pJob, SSchTask *pTask) { schDeregisterTaskHb(pJob, pTask); @@ -1484,9 +1460,10 @@ void schFreeJobImpl(void *job) { qDebug("QID:0x%" PRIx64 " job freed, refId:%" PRIx64 ", pointer:%p", queryId, refId, pJob); - atomic_sub_fetch_32(&schMgmt.jobNum, 1); - - schCloseJobRef(); + int32_t jobNum = atomic_sub_fetch_32(&schMgmt.jobNum, 1); + if (jobNum == 0) { + schCloseJobRef(); + } } int32_t schExecJobImpl(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, diff --git a/source/libs/scheduler/src/schRemote.c b/source/libs/scheduler/src/schRemote.c index 1389459604..bf51d8d631 100644 --- a/source/libs/scheduler/src/schRemote.c +++ b/source/libs/scheduler/src/schRemote.c @@ -648,31 +648,6 @@ _return: SCH_RET(code); } -int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { - int32_t code = 0; - SSchHbTrans hb = {0}; - - hb.trans.pTrans = pJob->pTrans; - - SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); - - code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); - if (code) { - schFreeRpcCtx(&hb.rpcCtx); - - if (HASH_NODE_EXIST(code)) { - *exist = true; - return TSDB_CODE_SUCCESS; - } - - qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(code); - } - - return TSDB_CODE_SUCCESS; -} - - int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId, SArray* taskAction) { SSchedulerHbReq req = {0}; int32_t code = 0; @@ -684,17 +659,20 @@ int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId, SArray* taskAction) { req.sId = schMgmt.sId; memcpy(&req.epId, nodeEpId, sizeof(SQueryNodeEpId)); + SCH_LOCK(SCH_READ, &schMgmt.hbLock); SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, nodeEpId, sizeof(SQueryNodeEpId)); if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + qError("hb connection no longer exist, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, nodeEpId->ep.port); - SCH_ERR_RET(code); + return TSDB_CODE_SUCCESS; } SCH_LOCK(SCH_WRITE, &hb->lock); code = schCloneHbRpcCtx(&hb->rpcCtx, &rpcCtx); memcpy(&trans, &hb->trans, sizeof(trans)); SCH_UNLOCK(SCH_WRITE, &hb->lock); + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); SCH_ERR_RET(code); @@ -764,60 +742,6 @@ _return: SCH_RET(code); } - -int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - - SEp* pEp = SCH_GET_CUR_EP(addr); - strcpy(epId.ep.fqdn, pEp->fqdn); - epId.ep.port = pEp->port; - - SSchHbTrans *hb = NULL; - while (true) { - hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - bool exist = false; - SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId, &exist)); - if (!exist) { - SCH_ERR_RET(schBuildAndSendHbMsg(&epId, NULL)); - } - - continue; - } - - break; - } - - atomic_add_fetch_64(&hb->taskNum, 1); - - pTask->registerdHb = true; - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { - int32_t code = 0; - SSchHbTrans *hb = NULL; - - hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); - } - - SCH_LOCK(SCH_WRITE, &hb->lock); - memcpy(&hb->trans, trans, sizeof(*trans)); - SCH_UNLOCK(SCH_WRITE, &hb->lock); - - qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, pTrans:%p, pHandle:%p", schMgmt.sId, - epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->pTrans, trans->pHandle); - - return TSDB_CODE_SUCCESS; -} - int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code) { SSchedulerHbRsp rsp = {0}; SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; diff --git a/source/libs/scheduler/src/schUtil.c b/source/libs/scheduler/src/schUtil.c index 81c95ea976..a91dc359f2 100644 --- a/source/libs/scheduler/src/schUtil.c +++ b/source/libs/scheduler/src/schUtil.c @@ -21,17 +21,187 @@ #include "tref.h" #include "trpc.h" + +void schCleanClusterHb(void* pTrans) { + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + + SSchHbTrans *hb = taosHashIterate(schMgmt.hbConnections, NULL); + while (hb) { + if (hb->trans.pTrans == pTrans) { + SQueryNodeEpId* pEpId = taosHashGetKey(hb, NULL); + rpcReleaseHandle(hb->trans.pHandle, TAOS_CONN_CLIENT); + taosHashRemove(schMgmt.hbConnections, pEpId, sizeof(SQueryNodeEpId)); + } + + hb = taosHashIterate(schMgmt.hbConnections, hb); + } + + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); +} + +int32_t schRemoveHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId) { + int32_t code = 0; + + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + SCH_TASK_ELOG("nodeId %d fqdn %s port %d not in hb connections", epId->nodeId, epId->ep.fqdn, epId->ep.port); + return TSDB_CODE_SUCCESS; + } + + int64_t taskNum = atomic_load_64(&hb->taskNum); + if (taskNum <= 0) { + rpcReleaseHandle(hb->trans.pHandle, TAOS_CONN_CLIENT); + taosHashRemove(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + } + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + + +int32_t schAddHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { + int32_t code = 0; + SSchHbTrans hb = {0}; + + hb.trans.pTrans = pJob->pTrans; + hb.taskNum = 1; + + SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); + + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); + if (code) { + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + schFreeRpcCtx(&hb.rpcCtx); + + if (HASH_NODE_EXIST(code)) { + *exist = true; + return TSDB_CODE_SUCCESS; + } + + qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(code); + } + + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + +int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *pEpId) { + SSchHbTrans *hb = NULL; + + while (true) { + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + hb = taosHashGet(schMgmt.hbConnections, pEpId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + bool exist = false; + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + SCH_ERR_RET(schAddHbConnection(pJob, pTask, pEpId, &exist)); + if (!exist) { + SCH_RET(schBuildAndSendHbMsg(pEpId, NULL)); + } + + continue; + } + + break; + } + + atomic_add_fetch_64(&hb->taskNum, 1); + + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + +void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask) { + if (!pTask->registerdHb) { + return; + } + + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + + SEp* pEp = SCH_GET_CUR_EP(addr); + strcpy(epId.ep.fqdn, pEp->fqdn); + epId.ep.port = pEp->port; + + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + SCH_TASK_WLOG("nodeId %d fqdn %s port %d not in hb connections", epId.nodeId, epId.ep.fqdn, epId.ep.port); + return; + } + + int64_t taskNum = atomic_sub_fetch_64(&hb->taskNum, 1); + if (0 == taskNum) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + schRemoveHbConnection(pJob, pTask, &epId); + } else { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + } + + pTask->registerdHb = false; +} + + + +int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + + SEp* pEp = SCH_GET_CUR_EP(addr); + strcpy(epId.ep.fqdn, pEp->fqdn); + epId.ep.port = pEp->port; + + SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId)); + + pTask->registerdHb = true; + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { + int32_t code = 0; + SSchHbTrans *hb = NULL; + + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); + } + + SCH_LOCK(SCH_WRITE, &hb->lock); + memcpy(&hb->trans, trans, sizeof(*trans)); + SCH_UNLOCK(SCH_WRITE, &hb->lock); + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + + qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, pTrans:%p, pHandle:%p", schMgmt.sId, + epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->pTrans, trans->pHandle); + + return TSDB_CODE_SUCCESS; +} + + void schCloseJobRef(void) { if (!atomic_load_8((int8_t *)&schMgmt.exit)) { return; } - SCH_LOCK(SCH_WRITE, &schMgmt.lock); - if (atomic_load_32(&schMgmt.jobNum) <= 0 && schMgmt.jobRef >= 0) { + if (schMgmt.jobRef >= 0) { taosCloseRef(schMgmt.jobRef); schMgmt.jobRef = -1; } - SCH_UNLOCK(SCH_WRITE, &schMgmt.lock); } uint64_t schGenTaskId(void) { return atomic_add_fetch_64(&schMgmt.taskId, 1); } @@ -88,4 +258,3 @@ void schFreeRpcCtx(SRpcCtx *pCtx) { (*pCtx->freeFunc)(pCtx->brokenVal.val); } - diff --git a/source/libs/scheduler/src/scheduler.c b/source/libs/scheduler/src/scheduler.c index 32e00c1b70..b4dc067f4c 100644 --- a/source/libs/scheduler/src/scheduler.c +++ b/source/libs/scheduler/src/scheduler.c @@ -182,6 +182,14 @@ int32_t scheduleCancelJob(int64_t job) { SCH_RET(code); } +void schedulerStopQueryHb(void *pTrans) { + if (NULL == pTrans) { + return; + } + + schCleanClusterHb(pTrans); +} + void schedulerFreeJob(int64_t job) { SSchJob *pJob = schAcquireJob(job); if (NULL == pJob) { @@ -220,6 +228,7 @@ void schedulerDestroy(void) { } } + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); if (schMgmt.hbConnections) { void *pIter = taosHashIterate(schMgmt.hbConnections, NULL); while (pIter != NULL) { @@ -230,4 +239,5 @@ void schedulerDestroy(void) { taosHashCleanup(schMgmt.hbConnections); schMgmt.hbConnections = NULL; } + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); }