diff --git a/include/libs/qcom/query.h b/include/libs/qcom/query.h index 7f7fe76139..64196aa64f 100644 --- a/include/libs/qcom/query.h +++ b/include/libs/qcom/query.h @@ -29,12 +29,13 @@ extern "C" { typedef enum { JOB_TASK_STATUS_NULL = 0, - JOB_TASK_STATUS_NOT_START = 1, - JOB_TASK_STATUS_EXECUTING, - JOB_TASK_STATUS_PARTIAL_SUCCEED, - JOB_TASK_STATUS_SUCCEED, - JOB_TASK_STATUS_FAILED, - JOB_TASK_STATUS_DROPPING, + JOB_TASK_STATUS_INIT, + JOB_TASK_STATUS_EXEC, + JOB_TASK_STATUS_PART_SUCC, + JOB_TASK_STATUS_SUCC, + JOB_TASK_STATUS_FAIL, + JOB_TASK_STATUS_DROP, + JOB_TASK_STATUS_MAX, } EJobTaskType; typedef enum { diff --git a/include/libs/scheduler/scheduler.h b/include/libs/scheduler/scheduler.h index 1c73b2c2c8..66e1f7ed3a 100644 --- a/include/libs/scheduler/scheduler.h +++ b/include/libs/scheduler/scheduler.h @@ -74,6 +74,7 @@ typedef void (*schedulerFetchFp)(void* pResult, void* param, int32_t code); typedef bool (*schedulerChkKillFp)(void* param); typedef struct SSchedulerReq { + bool syncReq; SRequestConnInfo *pConn; SArray *pNodeList; SQueryPlan *pDag; @@ -83,36 +84,17 @@ typedef struct SSchedulerReq { void* execParam; schedulerChkKillFp chkKillFp; void* chkKillParam; + SQueryResult* pQueryRes; } SSchedulerReq; int32_t schedulerInit(SSchedulerCfg *cfg); -/** - * Process the query job, generated according to the query physical plan. - * This is a synchronized API, and is also thread-safety. - * @param nodeList Qnode/Vnode address list, element is SQueryNodeAddr - * @return - */ -int32_t schedulerExecJob(SSchedulerReq *pReq, int64_t *pJob, SQueryResult *pRes); +int32_t schedulerExecJob(SSchedulerReq *pReq, int64_t *pJob); -/** - * Process the query job, generated according to the query physical plan. - * This is a asynchronized API, and is also thread-safety. - * @param pNodeList Qnode/Vnode address list, element is SQueryNodeAddr - * @return - */ - int32_t schedulerAsyncExecJob(SSchedulerReq *pReq, int64_t *pJob); - -/** - * Fetch query result from the remote query executor - * @param pJob - * @param data - * @return - */ int32_t schedulerFetchRows(int64_t job, void **data); -void schedulerAsyncFetchRows(int64_t job, schedulerFetchFp fp, void* param); +void schedulerFetchRowsA(int64_t job, schedulerFetchFp fp, void* param); int32_t schedulerGetTasksStatus(int64_t job, SArray *pSub); diff --git a/source/client/src/clientImpl.c b/source/client/src/clientImpl.c index 9047a9d27e..63b153b6fc 100644 --- a/source/client/src/clientImpl.c +++ b/source/client/src/clientImpl.c @@ -631,17 +631,21 @@ int32_t scheduleQuery(SRequestObj* pRequest, SQueryPlan* pDag, SArray* pNodeList SRequestConnInfo conn = {.pTrans = pRequest->pTscObj->pAppInfo->pTransporter, .requestId = pRequest->requestId, .requestObjRefId = pRequest->self}; - SSchedulerReq req = {.pConn = &conn, - .pNodeList = pNodeList, - .pDag = pDag, - .sql = pRequest->sqlstr, - .startTs = pRequest->metric.start, - .execFp = NULL, - .execParam = NULL, - .chkKillFp = chkRequestKilled, - .chkKillParam = (void*)pRequest->self}; + SSchedulerReq req = { + .syncReq = true, + .pConn = &conn, + .pNodeList = pNodeList, + .pDag = pDag, + .sql = pRequest->sqlstr, + .startTs = pRequest->metric.start, + .execFp = NULL, + .execParam = NULL, + .chkKillFp = chkRequestKilled, + .chkKillParam = (void*)pRequest->self + .pQueryRes = &res, + }; - int32_t code = schedulerExecJob(&req, &pRequest->body.queryJob, &res); + int32_t code = schedulerExecJob(&req, &pRequest->body.queryJob); pRequest->body.resInfo.execRes = res.res; if (code != TSDB_CODE_SUCCESS) { @@ -939,16 +943,20 @@ void launchAsyncQuery(SRequestObj* pRequest, SQuery* pQuery, SMetaData* pResultM SRequestConnInfo conn = { .pTrans = pAppInfo->pTransporter, .requestId = pRequest->requestId, .requestObjRefId = pRequest->self}; - SSchedulerReq req = {.pConn = &conn, - .pNodeList = pNodeList, - .pDag = pDag, - .sql = pRequest->sqlstr, - .startTs = pRequest->metric.start, - .execFp = schedulerExecCb, - .execParam = pRequest, - .chkKillFp = chkRequestKilled, - .chkKillParam = (void*)pRequest->self}; - code = schedulerAsyncExecJob(&req, &pRequest->body.queryJob); + SSchedulerReq req = { + .syncReq = false, + .pConn = &conn, + .pNodeList = pNodeList, + .pDag = pDag, + .sql = pRequest->sqlstr, + .startTs = pRequest->metric.start, + .execFp = schedulerExecCb, + .execParam = pRequest, + .chkKillFp = chkRequestKilled, + .chkKillParam = (void*)pRequest->self, + .pQueryRes = NULL, + }; + code = schedulerExecJob(&req, &pRequest->body.queryJob); taosArrayDestroy(pNodeList); } else { tscDebug("0x%" PRIx64 " plan not executed, code:%s 0x%" PRIx64, pRequest->self, tstrerror(code), diff --git a/source/client/src/clientMain.c b/source/client/src/clientMain.c index d824ef998f..f660c46d3c 100644 --- a/source/client/src/clientMain.c +++ b/source/client/src/clientMain.c @@ -863,7 +863,7 @@ void taos_fetch_rows_a(TAOS_RES *res, __taos_async_fn_t fp, void *param) { } } - schedulerAsyncFetchRows(pRequest->body.queryJob, fetchCallback, pRequest); + schedulerFetchRowsA(pRequest->body.queryJob, fetchCallback, pRequest); } void taos_fetch_raw_block_a(TAOS_RES *res, __taos_async_fn_t fp, void *param) { diff --git a/source/libs/qcom/src/queryUtil.c b/source/libs/qcom/src/queryUtil.c index a3a15869eb..1db13dd931 100644 --- a/source/libs/qcom/src/queryUtil.c +++ b/source/libs/qcom/src/queryUtil.c @@ -171,17 +171,17 @@ char* jobTaskStatusStr(int32_t status) { switch (status) { case JOB_TASK_STATUS_NULL: return "NULL"; - case JOB_TASK_STATUS_NOT_START: - return "NOT_START"; - case JOB_TASK_STATUS_EXECUTING: + case JOB_TASK_STATUS_INIT: + return "INIT"; + case JOB_TASK_STATUS_EXEC: return "EXECUTING"; - case JOB_TASK_STATUS_PARTIAL_SUCCEED: + case JOB_TASK_STATUS_PART_SUCC: return "PARTIAL_SUCCEED"; - case JOB_TASK_STATUS_SUCCEED: + case JOB_TASK_STATUS_SUCC: return "SUCCEED"; - case JOB_TASK_STATUS_FAILED: + case JOB_TASK_STATUS_FAIL: return "FAILED"; - case JOB_TASK_STATUS_DROPPING: + case JOB_TASK_STATUS_DROP: return "DROPPING"; default: break; diff --git a/source/libs/qworker/inc/qwInt.h b/source/libs/qworker/inc/qwInt.h index 6faffa13b3..eb10a2fdd6 100644 --- a/source/libs/qworker/inc/qwInt.h +++ b/source/libs/qworker/inc/qwInt.h @@ -226,8 +226,8 @@ typedef struct SQWorkerMgmt { #define QW_TASK_NOT_EXIST(code) (TSDB_CODE_QRY_SCH_NOT_EXIST == (code) || TSDB_CODE_QRY_TASK_NOT_EXIST == (code)) #define QW_TASK_ALREADY_EXIST(code) (TSDB_CODE_QRY_TASK_ALREADY_EXIST == (code)) #define QW_TASK_READY(status) \ - (status == JOB_TASK_STATUS_SUCCEED || status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_CANCELLED || \ - status == JOB_TASK_STATUS_PARTIAL_SUCCEED) + (status == JOB_TASK_STATUS_SUCC || status == JOB_TASK_STATUS_FAIL || status == JOB_TASK_STATUS_CANCELLED || \ + status == JOB_TASK_STATUS_PART_SUCC) #define QW_SET_QTID(id, qId, tId, eId) \ do { \ *(uint64_t *)(id) = (qId); \ diff --git a/source/libs/qworker/src/qwDbg.c b/source/libs/qworker/src/qwDbg.c index 68058334ab..dfe5a04d19 100644 --- a/source/libs/qworker/src/qwDbg.c +++ b/source/libs/qworker/src/qwDbg.c @@ -19,7 +19,7 @@ int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, int32_t code = 0; if (oriStatus == newStatus) { - if (newStatus == JOB_TASK_STATUS_EXECUTING || newStatus == JOB_TASK_STATUS_FAILED) { + if (newStatus == JOB_TASK_STATUS_EXEC || newStatus == JOB_TASK_STATUS_FAIL) { *ignore = true; return TSDB_CODE_SUCCESS; } @@ -29,47 +29,47 @@ int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, switch (oriStatus) { case JOB_TASK_STATUS_NULL: - if (newStatus != JOB_TASK_STATUS_EXECUTING && newStatus != JOB_TASK_STATUS_FAILED && - newStatus != JOB_TASK_STATUS_NOT_START) { + if (newStatus != JOB_TASK_STATUS_EXEC && newStatus != JOB_TASK_STATUS_FAIL && + newStatus != JOB_TASK_STATUS_INIT) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_NOT_START: - if (newStatus != JOB_TASK_STATUS_DROPPING && newStatus != JOB_TASK_STATUS_EXECUTING - && newStatus != JOB_TASK_STATUS_FAILED) { + case JOB_TASK_STATUS_INIT: + if (newStatus != JOB_TASK_STATUS_DROP && newStatus != JOB_TASK_STATUS_EXEC + && newStatus != JOB_TASK_STATUS_FAIL) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_EXECUTING: - if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED && newStatus != JOB_TASK_STATUS_SUCCEED && - newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_EXEC: + if (newStatus != JOB_TASK_STATUS_PART_SUCC && newStatus != JOB_TASK_STATUS_SUCC && + newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_DROP) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_PARTIAL_SUCCEED: - if (newStatus != JOB_TASK_STATUS_EXECUTING && newStatus != JOB_TASK_STATUS_SUCCEED && - newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_PART_SUCC: + if (newStatus != JOB_TASK_STATUS_EXEC && newStatus != JOB_TASK_STATUS_SUCC && + newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_DROP) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_SUCCEED: - if (newStatus != JOB_TASK_STATUS_DROPPING && newStatus != JOB_TASK_STATUS_FAILED) { + case JOB_TASK_STATUS_SUCC: + if (newStatus != JOB_TASK_STATUS_DROP && newStatus != JOB_TASK_STATUS_FAIL) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_FAILED: - if (newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_FAIL: + if (newStatus != JOB_TASK_STATUS_DROP) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_DROPPING: - if (newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + case JOB_TASK_STATUS_DROP: + if (newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_PART_SUCC) { QW_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; diff --git a/source/libs/qworker/src/qworker.c b/source/libs/qworker/src/qworker.c index 949b67249f..b8a2f911bc 100644 --- a/source/libs/qworker/src/qworker.c +++ b/source/libs/qworker/src/qworker.c @@ -206,7 +206,7 @@ int32_t qwGetQueryResFromSink(QW_FPARAMS_DEF, SQWTaskCtx *ctx, int32_t *dataLen, QW_TASK_DLOG_E("no data in sink and query end"); - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_SUCCEED); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_SUCC); QW_ERR_RET(qwMallocFetchRsp(len, &rsp)); *rspMsg = rsp; @@ -236,7 +236,7 @@ int32_t qwGetQueryResFromSink(QW_FPARAMS_DEF, SQWTaskCtx *ctx, int32_t *dataLen, if (DS_BUF_EMPTY == pOutput->bufStatus && pOutput->queryEnd) { QW_TASK_DLOG_E("task all data fetched, done"); - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_SUCCEED); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_SUCC); } return TSDB_CODE_SUCCESS; @@ -330,7 +330,7 @@ int32_t qwHandlePrePhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inpu break; } - QW_ERR_JRET(qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_EXECUTING)); + QW_ERR_JRET(qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_EXEC)); break; } case QW_PHASE_PRE_FETCH: { @@ -447,7 +447,7 @@ int32_t qwHandlePostPhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inp _return: if (TSDB_CODE_SUCCESS == code && QW_PHASE_POST_QUERY == phase) { - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_PARTIAL_SUCCEED); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_PART_SUCC); } if (rspConnection) { @@ -467,7 +467,7 @@ _return: } if (code) { - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_FAILED); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_FAIL); } QW_TASK_DLOG("end to handle event at phase %s, code:%x - %s", qwPhaseStr(phase), code, tstrerror(code)); @@ -499,7 +499,7 @@ int32_t qwPrerocessQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) { ctx->ctrlConnInfo = qwMsg->connInfo; - QW_ERR_JRET(qwAddTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_NOT_START)); + QW_ERR_JRET(qwAddTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_INIT)); _return: @@ -698,7 +698,7 @@ int32_t qwProcessFetch(QW_FPARAMS_DEF, SQWMsg *qwMsg) { if (QW_IS_QUERY_RUNNING(ctx)) { atomic_store_8((int8_t *)&ctx->queryContinue, 1); } else if (0 == atomic_load_8((int8_t *)&ctx->queryInQueue)) { - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_EXECUTING); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_EXEC); atomic_store_8((int8_t *)&ctx->queryInQueue, 1); @@ -749,7 +749,7 @@ int32_t qwProcessDrop(QW_FPARAMS_DEF, SQWMsg *qwMsg) { if (QW_IS_QUERY_RUNNING(ctx)) { QW_ERR_JRET(qwKillTaskHandle(QW_FPARAMS(), ctx)); - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_DROPPING); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_DROP); } else if (ctx->phase > 0) { QW_ERR_JRET(qwDropTask(QW_FPARAMS())); rsped = true; @@ -770,7 +770,7 @@ _return: QW_UPDATE_RSP_CODE(ctx, code); } - qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_FAILED); + qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_FAIL); } if (locked) { diff --git a/source/libs/scheduler/inc/schedulerInt.h b/source/libs/scheduler/inc/schInt.h similarity index 95% rename from source/libs/scheduler/inc/schedulerInt.h rename to source/libs/scheduler/inc/schInt.h index aaa8274ce8..ce4b9eea19 100644 --- a/source/libs/scheduler/inc/schedulerInt.h +++ b/source/libs/scheduler/inc/schInt.h @@ -54,6 +54,13 @@ typedef enum { SCH_OP_FETCH, } SCH_OP_TYPE; +typedef enum { + SCH_EVENT_ENTER_API = 1, + SCH_EVENT_LEAVE_API, + SCH_EVENT_MSG, + SCH_EVENT_DROP, +} SCH_EVENT_TYPE; + typedef struct SSchTrans { void *pTrans; void *pHandle; @@ -104,6 +111,22 @@ typedef struct SSchResInfo { void* userParam; } SSchResInfo; +typedef struct SSchEvent { + SCH_EVENT_TYPE event; + void* info; +} SSchEvent; + +typedef int32_t (*schStatusEnterFp)(void* pHandle, void* pParam); +typedef int32_t (*schStatusLeaveFp)(void* pHandle, void* pParam); +typedef int32_t (*schStatusEventFp)(void* pHandle, void* pParam, void* pEvent); + +typedef struct SSchStatusFps { + EJobTaskType status; + schStatusEnterFp enterFp; + schStatusLeaveFp leaveFp; + schStatusEventFp eventFp; +} SSchStatusFps; + typedef struct SSchedulerMgmt { uint64_t taskId; // sequential taksId uint64_t sId; // schedulerId @@ -200,7 +223,7 @@ typedef struct SSchJobAttr { typedef struct { int32_t op; - bool sync; + bool syncReq; } SSchOpStatus; typedef struct SSchJob { @@ -349,7 +372,7 @@ int32_t schDecTaskFlowQuota(SSchJob *pJob, SSchTask *pTask); int32_t schCheckIncTaskFlowQuota(SSchJob *pJob, SSchTask *pTask, bool *enough); int32_t schLaunchTasksInFlowCtrlList(SSchJob *pJob, SSchTask *pTask); int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask); -int32_t schFetchFromRemote(SSchJob *pJob); +int32_t schLaunchFetchTask(SSchJob *pJob); int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode); int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId, SArray* taskAction); int32_t schCloneSMsgSendInfo(void *src, void **dst); @@ -371,22 +394,21 @@ void schFreeRpcCtxVal(const void *arg); int32_t schMakeBrokenLinkVal(SSchJob *pJob, SSchTask *pTask, SRpcBrokenlinkVal *brokenVal, bool isHb); int32_t schAppendTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, int32_t execId); int32_t schExecStaticExplainJob(SSchedulerReq *pReq, int64_t *job, bool sync); -int32_t schExecJobImpl(SSchedulerReq *pReq, SSchJob *pJob, bool sync); int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus); int32_t schCancelJob(SSchJob *pJob); int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode); uint64_t schGenTaskId(void); void schCloseJobRef(void); -int32_t schExecJob(SSchedulerReq *pReq, int64_t *pJob, SQueryResult *pRes); int32_t schAsyncExecJob(SSchedulerReq *pReq, int64_t *pJob); -int32_t schFetchRows(SSchJob *pJob); -int32_t schAsyncFetchRows(SSchJob *pJob); +int32_t schJobFetchRows(SSchJob *pJob); +int32_t schJobFetchRowsA(SSchJob *pJob); int32_t schUpdateTaskHandle(SSchJob *pJob, SSchTask *pTask, bool dropExecNode, void *handle, int32_t execId); int32_t schProcessOnTaskStatusRsp(SQueryNodeEpId* pEpId, SArray* pStatusList); void schFreeSMsgSendInfo(SMsgSendInfo *msgSendInfo); char* schGetOpStr(SCH_OP_TYPE type); int32_t schBeginOperation(SSchJob *pJob, SCH_OP_TYPE type, bool sync); -int32_t schInitJob(SSchedulerReq *pReq, SSchJob **pSchJob); +int32_t schInitJob(SSchJob **pJob, SSchedulerReq *pReq); +int32_t schExecJob(SSchJob *pJob, SSchedulerReq *pReq); int32_t schSetJobQueryRes(SSchJob* pJob, SQueryResult* pRes); int32_t schUpdateTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask, SEpSet* pEpSet); int32_t schHandleRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32_t rspCode); diff --git a/source/libs/scheduler/src/schJob.c b/source/libs/scheduler/src/schJob.c index cb39ff3f20..e137b2b001 100644 --- a/source/libs/scheduler/src/schJob.c +++ b/source/libs/scheduler/src/schJob.c @@ -25,30 +25,13 @@ FORCE_INLINE SSchJob *schAcquireJob(int64_t refId) { qDebug("sch acquire jobId:0 FORCE_INLINE int32_t schReleaseJob(int64_t refId) { qDebug("sch release jobId:0x%"PRIx64, refId); return taosReleaseRef(schMgmt.jobRef, refId); } -int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel) { - pTask->plan = pPlan; - pTask->level = pLevel; - pTask->execId = -1; - pTask->maxExecTimes = SCH_TASK_MAX_EXEC_TIMES; - pTask->timeoutUsec = SCH_DEFAULT_TASK_TIMEOUT_USEC; - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - pTask->taskId = schGenTaskId(); - pTask->execNodes = taosHashInit(SCH_MAX_CANDIDATE_EP_NUM, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_NO_LOCK); - if (NULL == pTask->execNodes) { - SCH_TASK_ELOG("taosHashInit %d execNodes failed", SCH_MAX_CANDIDATE_EP_NUM); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schInitJob(SSchedulerReq *pReq, SSchJob **pSchJob) { +int32_t schInitJob(SSchJob **pSchJob, SSchedulerReq *pReq) { int32_t code = 0; int64_t refId = -1; SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); if (NULL == pJob) { qError("QID:0x%" PRIx64 " calloc %d failed", pReq->pDag->queryId, (int32_t)sizeof(SSchJob)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); } pJob->attr.explainMode = pReq->pDag->explainInfo.mode; @@ -59,6 +42,8 @@ int32_t schInitJob(SSchedulerReq *pReq, SSchJob **pSchJob) { pJob->chkKillParam = pReq->chkKillParam; pJob->userRes.execFp = pReq->execFp; pJob->userRes.userParam = pReq->execParam; + pJob->opStatus.op = SCH_OP_EXEC; + pJob->opStatus.syncReq = pReq->syncReq; if (pReq->pNodeList == NULL || taosArrayGetSize(pReq->pNodeList) <= 0) { qDebug("QID:0x%" PRIx64 " input exec nodeList is empty", pReq->pDag->queryId); @@ -105,43 +90,21 @@ int32_t schInitJob(SSchedulerReq *pReq, SSchJob **pSchJob) { SCH_JOB_DLOG("job refId:0x%" PRIx64" created", pJob->refId); - schUpdateJobStatus(pJob, JOB_TASK_STATUS_NOT_START); - *pSchJob = pJob; return TSDB_CODE_SUCCESS; _return: - if (refId < 0) { + if (NULL == pJob) { + qDestroyQueryPlan(pReq->pDag); + } else if (refId < 0) { schFreeJobImpl(pJob); } else { taosRemoveRef(schMgmt.jobRef, refId); } - SCH_RET(code); -} - - -void schFreeTask(SSchJob *pJob, SSchTask *pTask) { - schDeregisterTaskHb(pJob, pTask); - if (pTask->candidateAddrs) { - taosArrayDestroy(pTask->candidateAddrs); - } - - taosMemoryFreeClear(pTask->msg); - - if (pTask->children) { - taosArrayDestroy(pTask->children); - } - - if (pTask->parents) { - taosArrayDestroy(pTask->parents); - } - - if (pTask->execNodes) { - taosHashCleanup(pTask->execNodes); - } + SCH_RET(code); } @@ -188,8 +151,8 @@ FORCE_INLINE bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus) { return true; } - return (status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_DROPPING || - status == JOB_TASK_STATUS_SUCCEED); + return (status == JOB_TASK_STATUS_FAIL || status == JOB_TASK_STATUS_DROP || + status == JOB_TASK_STATUS_SUCC); } int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { @@ -201,7 +164,7 @@ int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { oriStatus = SCH_GET_JOB_STATUS(pJob); if (oriStatus == newStatus) { - if (newStatus == JOB_TASK_STATUS_DROPPING) { + if (newStatus == JOB_TASK_STATUS_DROP) { SCH_ERR_JRET(TSDB_CODE_SCH_JOB_IS_DROPPING); } @@ -210,39 +173,39 @@ int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { switch (oriStatus) { case JOB_TASK_STATUS_NULL: - if (newStatus != JOB_TASK_STATUS_NOT_START) { + if (newStatus != JOB_TASK_STATUS_INIT) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_NOT_START: - if (newStatus != JOB_TASK_STATUS_EXECUTING && newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_INIT: + if (newStatus != JOB_TASK_STATUS_EXEC && newStatus != JOB_TASK_STATUS_DROP) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_EXECUTING: - if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED && newStatus != JOB_TASK_STATUS_FAILED && - newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_EXEC: + if (newStatus != JOB_TASK_STATUS_PART_SUCC && newStatus != JOB_TASK_STATUS_FAIL && + newStatus != JOB_TASK_STATUS_DROP) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_PARTIAL_SUCCEED: - if (newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_SUCCEED && - newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_PART_SUCC: + if (newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_SUCC && + newStatus != JOB_TASK_STATUS_DROP) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_SUCCEED: - case JOB_TASK_STATUS_FAILED: - if (newStatus != JOB_TASK_STATUS_DROPPING) { + case JOB_TASK_STATUS_SUCC: + case JOB_TASK_STATUS_FAIL: + if (newStatus != JOB_TASK_STATUS_DROP) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } break; - case JOB_TASK_STATUS_DROPPING: + case JOB_TASK_STATUS_DROP: SCH_ERR_JRET(TSDB_CODE_QRY_JOB_FREED); break; @@ -297,11 +260,11 @@ int32_t schBeginOperation(SSchJob *pJob, SCH_OP_TYPE type, bool sync) { SCH_JOB_DLOG("job start %s operation", schGetOpStr(pJob->opStatus.op)); - pJob->opStatus.sync = sync; + pJob->opStatus.syncReq = sync; switch (type) { case SCH_OP_EXEC: - SCH_ERR_JRET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_EXECUTING)); + SCH_ERR_JRET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_EXEC)); break; case SCH_OP_FETCH: if (!SCH_JOB_NEED_FETCH(pJob)) { @@ -309,7 +272,7 @@ int32_t schBeginOperation(SSchJob *pJob, SCH_OP_TYPE type, bool sync) { SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); } - if (status != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + if (status != JOB_TASK_STATUS_PART_SUCC) { SCH_JOB_ELOG("job status error for fetch, status:%s", jobTaskStatusStr(status)); SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); } @@ -414,78 +377,8 @@ int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) { return TSDB_CODE_SUCCESS; } -int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) { - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - if (NULL == addr) { - SCH_TASK_ELOG("taosArrayGet candidate addr failed, idx:%d, size:%d", pTask->candidateIdx, - (int32_t)taosArrayGetSize(pTask->candidateAddrs)); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - pTask->succeedAddr = *addr; - - return TSDB_CODE_SUCCESS; -} - -int32_t schAppendTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, int32_t execId) { - SSchNodeInfo nodeInfo = {.addr = *addr, .handle = NULL}; - - if (taosHashPut(pTask->execNodes, &execId, sizeof(execId), &nodeInfo, sizeof(nodeInfo))) { - SCH_TASK_ELOG("taosHashPut nodeInfo to execNodes failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("task execNode added, execId:%d", execId); - - return TSDB_CODE_SUCCESS; -} - -int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_t execId) { - if (NULL == pTask->execNodes) { - return TSDB_CODE_SUCCESS; - } - - if (taosHashRemove(pTask->execNodes, &execId, sizeof(execId))) { - SCH_TASK_ELOG("fail to remove execId %d from execNodeList", execId); - } else { - SCH_TASK_DLOG("execId %d removed from execNodeList", execId); - } - - if (execId != pTask->execId) { // ignore it - SCH_TASK_DLOG("execId %d is not current execId %d", execId, pTask->execId); - SCH_RET(TSDB_CODE_SCH_IGNORE_ERROR); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_t execId) { - if (taosHashGetSize(pTask->execNodes) <= 0) { - return TSDB_CODE_SUCCESS; - } - - SSchNodeInfo *nodeInfo = taosHashGet(pTask->execNodes, &execId, sizeof(execId)); - nodeInfo->handle = handle; - - SCH_TASK_DLOG("handle updated to %p for execId %d", handle, execId); - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateTaskHandle(SSchJob *pJob, SSchTask *pTask, bool dropExecNode, void *handle, int32_t execId) { - if (dropExecNode) { - SCH_RET(schDropTaskExecNode(pJob, pTask, handle, execId)); - } - - SCH_SET_TASK_HANDLE(pTask, handle); - - schUpdateTaskExecNode(pJob, pTask, handle, execId); - - return TSDB_CODE_SUCCESS; -} - - -int32_t schRecordQueryDataSrc(SSchJob *pJob, SSchTask *pTask) { +int32_t schAppendJobDataSrc(SSchJob *pJob, SSchTask *pTask) { if (!SCH_IS_DATA_SRC_QRY_TASK(pTask)) { return TSDB_CODE_SUCCESS; } @@ -539,7 +432,7 @@ int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) { int32_t taskNum = 0; SSchLevel *pLevel = NULL; - level.status = JOB_TASK_STATUS_NOT_START; + level.status = JOB_TASK_STATUS_INIT; for (int32_t i = 0; i < levelNum; ++i) { if (NULL == taosArrayPush(pJob->levels, &level)) { @@ -584,7 +477,7 @@ int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) { SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); } - SCH_ERR_JRET(schRecordQueryDataSrc(pJob, pTask)); + SCH_ERR_JRET(schAppendJobDataSrc(pJob, pTask)); if (0 != taosHashPut(planToTask, &plan, POINTER_BYTES, &pTask, POINTER_BYTES)) { SCH_TASK_ELOG("taosHashPut to planToTaks failed, taskIdx:%d", n); @@ -613,273 +506,6 @@ _return: SCH_RET(code); } -int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) { - int32_t addNum = 0; - int32_t nodeNum = 0; - - if (pJob->nodeList) { - nodeNum = taosArrayGetSize(pJob->nodeList); - - for (int32_t i = 0; i < nodeNum && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { - SQueryNodeLoad *nload = taosArrayGet(pJob->nodeList, i); - SQueryNodeAddr *naddr = &nload->addr; - - if (NULL == taosArrayPush(pTask->candidateAddrs, naddr)) { - SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, addNum:%d, errno:%d", addNum, errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("set %dth candidate addr, id %d, fqdn:%s, port:%d", i, naddr->nodeId, SCH_GET_CUR_EP(naddr)->fqdn, SCH_GET_CUR_EP(naddr)->port); - - ++addNum; - } - } - - if (addNum <= 0) { - SCH_TASK_ELOG("no available execNode as candidates, nodeNum:%d", nodeNum); - SCH_ERR_RET(TSDB_CODE_TSC_NO_EXEC_NODE); - } - - return TSDB_CODE_SUCCESS; -} - - -int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) { - if (NULL != pTask->candidateAddrs) { - return TSDB_CODE_SUCCESS; - } - - pTask->candidateIdx = 0; - pTask->candidateAddrs = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SQueryNodeAddr)); - if (NULL == pTask->candidateAddrs) { - SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCH_MAX_CANDIDATE_EP_NUM); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - if (pTask->plan->execNode.epSet.numOfEps > 0) { - if (NULL == taosArrayPush(pTask->candidateAddrs, &pTask->plan->execNode)) { - SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("use execNode in plan as candidate addr, numOfEps:%d", pTask->plan->execNode.epSet.numOfEps); - - return TSDB_CODE_SUCCESS; - } - - SCH_ERR_RET(schSetAddrsFromNodeList(pJob, pTask)); - - /* - for (int32_t i = 0; i < job->dataSrcEps.numOfEps && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { - strncpy(epSet->fqdn[epSet->numOfEps], job->dataSrcEps.fqdn[i], sizeof(job->dataSrcEps.fqdn[i])); - epSet->port[epSet->numOfEps] = job->dataSrcEps.port[i]; - - ++epSet->numOfEps; - } - */ - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask, SEpSet* pEpSet) { - if (NULL == pTask->candidateAddrs || 1 != taosArrayGetSize(pTask->candidateAddrs)) { - SCH_TASK_ELOG("not able to update cndidate addr, addr num %d", (int32_t)(pTask->candidateAddrs ? taosArrayGetSize(pTask->candidateAddrs): 0)); - SCH_ERR_RET(TSDB_CODE_APP_ERROR); - } - - SQueryNodeAddr* pAddr = taosArrayGet(pTask->candidateAddrs, 0); - - SEp* pOld = &pAddr->epSet.eps[pAddr->epSet.inUse]; - SEp* pNew = &pEpSet->eps[pEpSet->inUse]; - - SCH_TASK_DLOG("update task ep from %s:%d to %s:%d", pOld->fqdn, pOld->port, pNew->fqdn, pNew->port); - - memcpy(&pAddr->epSet, pEpSet, sizeof(pAddr->epSet)); - - return TSDB_CODE_SUCCESS; -} - - -int32_t schRemoveTaskFromExecList(SSchJob *pJob, SSchTask *pTask) { - int32_t code = taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId)); - if (code) { - SCH_TASK_ELOG("task failed to rm from execTask list, code:%x", code); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - return TSDB_CODE_SUCCESS; -} - - -int32_t schPushTaskToExecList(SSchJob *pJob, SSchTask *pTask) { - int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - SCH_TASK_ELOG("task already in execTask list, code:%x", code); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("task added to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - - return TSDB_CODE_SUCCESS; -} - -/* -int32_t schMoveTaskToSuccList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } else { - SCH_TASK_DLOG("task removed from execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - } - - int32_t code = taosHashPut(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - SCH_TASK_ELOG("task already in succTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to succTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to succTask list, numOfTasks:%d", taosHashGetSize(pJob->succTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schMoveTaskToFailList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - *moved = false; - - if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } - - int32_t code = taosHashPut(pJob->failTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - - SCH_TASK_WLOG("task already in failTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to failTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to failTask list, numOfTasks:%d", taosHashGetSize(pJob->failTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } - - int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - - SCH_TASK_ELOG("task already in execTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - - return TSDB_CODE_SUCCESS; -} -*/ - -int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bool *needRetry) { - if (TSDB_CODE_SCH_TIMEOUT_ERROR == errCode) { - pTask->maxExecTimes++; - if (pTask->timeoutUsec < SCH_MAX_TASK_TIMEOUT_USEC) { - pTask->timeoutUsec *= 2; - if (pTask->timeoutUsec > SCH_MAX_TASK_TIMEOUT_USEC) { - pTask->timeoutUsec = SCH_MAX_TASK_TIMEOUT_USEC; - } - } - } - - if ((pTask->execId + 1) >= pTask->maxExecTimes) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since reach max try times, execId:%d", pTask->execId); - return TSDB_CODE_SUCCESS; - } - - if (!SCH_NEED_RETRY(pTask->lastMsgType, errCode)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry cause of errCode, errCode:%x - %s", errCode, tstrerror(errCode)); - return TSDB_CODE_SUCCESS; - } - - if (SCH_IS_DATA_SRC_TASK(pTask)) { - if ((pTask->execId + 1) >= SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since all ep tried, execId:%d, epNum:%d", pTask->execId, - SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)); - return TSDB_CODE_SUCCESS; - } - } else { - int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); - - if ((pTask->candidateIdx + 1) >= candidateNum && (TSDB_CODE_SCH_TIMEOUT_ERROR != errCode)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since all candiates tried, candidateIdx:%d, candidateNum:%d", - pTask->candidateIdx, candidateNum); - return TSDB_CODE_SUCCESS; - } - } - - *needRetry = true; - SCH_TASK_DLOG("task need the %dth retry, errCode:%x - %s", pTask->execId + 1, errCode, tstrerror(errCode)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) { - atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); - - SCH_ERR_RET(schRemoveTaskFromExecList(pJob, pTask)); - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - - if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { - SCH_ERR_RET(schLaunchTasksInFlowCtrlList(pJob, pTask)); - } - - schDeregisterTaskHb(pJob, pTask); - - if (SCH_IS_DATA_SRC_TASK(pTask)) { - SCH_SWITCH_EPSET(&pTask->plan->execNode); - } else { - int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); - if (++pTask->candidateIdx >= candidateNum) { - pTask->candidateIdx = 0; - } - } - - SCH_ERR_RET(schLaunchTask(pJob, pTask)); - - return TSDB_CODE_SUCCESS; -} int32_t schSetJobQueryRes(SSchJob* pJob, SQueryResult* pRes) { pRes->code = atomic_load_32(&pJob->errCode); @@ -893,7 +519,7 @@ int32_t schSetJobQueryRes(SSchJob* pJob, SQueryResult* pRes) { int32_t schSetJobFetchRes(SSchJob* pJob, void** pData) { int32_t code = 0; if (pJob->resData && ((SRetrieveTableRsp *)pJob->resData)->completed) { - SCH_ERR_RET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_SUCCEED)); + SCH_ERR_RET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_SUCC)); } while (true) { @@ -989,19 +615,19 @@ int32_t schProcessOnJobFailureImpl(SSchJob *pJob, int32_t status, int32_t errCod // Note: no more task error processing, handled in function internal int32_t schProcessOnJobFailure(SSchJob *pJob, int32_t errCode) { - SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAILED, errCode)); + SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAIL, errCode)); } // Note: no more error processing, handled in function internal int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode) { - SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROPPING, errCode)); + SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROP, errCode)); } // Note: no more task error processing, handled in function internal int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) { int32_t code = 0; - SCH_ERR_RET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_PARTIAL_SUCCEED)); + SCH_ERR_RET(schUpdateJobStatus(pJob, JOB_TASK_STATUS_PART_SUCC)); schPostJobRes(pJob, SCH_OP_EXEC); @@ -1016,65 +642,21 @@ void schProcessOnDataFetched(SSchJob *pJob) { schPostJobRes(pJob, SCH_OP_FETCH); } -// Note: no more task error processing, handled in function internal -int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode) { - int8_t status = 0; +int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp) { + SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed); - if (errCode == TSDB_CODE_SCH_TIMEOUT_ERROR) { - SCH_LOG_TASK_WAIT_TS(pTask); - } else { - SCH_LOG_TASK_END_TS(pTask); - } - - if (schJobNeedToStop(pJob, &status)) { - SCH_TASK_DLOG("task failed not processed cause of job status, job status:%s", jobTaskStatusStr(status)); - SCH_RET(atomic_load_32(&pJob->errCode)); - } + atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows)); + atomic_store_ptr(&pJob->resData, pRsp); - bool needRetry = false; - bool moved = false; - int32_t taskDone = 0; - int32_t code = 0; + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCC); - SCH_TASK_DLOG("taskOnFailure, code:%s", tstrerror(errCode)); + schProcessOnDataFetched(pJob); - SCH_ERR_JRET(schTaskCheckSetRetry(pJob, pTask, errCode, &needRetry)); - - if (!needRetry) { - SCH_TASK_ELOG("task failed and no more retry, code:%s", tstrerror(errCode)); - - if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING) { - SCH_TASK_ELOG("task not in executing list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_FAILED); - - if (SCH_IS_WAIT_ALL_JOB(pJob)) { - SCH_LOCK(SCH_WRITE, &pTask->level->lock); - pTask->level->taskFailed++; - taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; - SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); - - schUpdateJobErrCode(pJob, errCode); - - if (taskDone < pTask->level->taskNum) { - SCH_TASK_DLOG("need to wait other tasks, doneNum:%d, allNum:%d", taskDone, pTask->level->taskNum); - SCH_RET(errCode); - } - } - } else { - SCH_ERR_JRET(schHandleTaskRetry(pJob, pTask)); - - return TSDB_CODE_SUCCESS; - } - -_return: - - SCH_RET(schProcessOnJobFailure(pJob, errCode)); + return TSDB_CODE_SUCCESS; } -int32_t schLaunchNextLevelTasks(SSchJob *pJob, SSchTask *pTask) { + +int32_t schLaunchJobLowerLevel(SSchJob *pJob, SSchTask *pTask) { if (!SCH_IS_QUERY_JOB(pJob)) { return TSDB_CODE_SUCCESS; } @@ -1099,217 +681,6 @@ int32_t schLaunchNextLevelTasks(SSchJob *pJob, SSchTask *pTask) { return TSDB_CODE_SUCCESS; } - -// Note: no more task error processing, handled in function internal -int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { - bool moved = false; - int32_t code = 0; - - SCH_TASK_DLOG("taskOnSuccess, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - - SCH_LOG_TASK_END_TS(pTask); - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_PARTIAL_SUCCEED); - - SCH_ERR_JRET(schRecordTaskSucceedNode(pJob, pTask)); - - SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); - - int32_t parentNum = pTask->parents ? (int32_t)taosArrayGetSize(pTask->parents) : 0; - if (parentNum == 0) { - int32_t taskDone = 0; - if (SCH_IS_WAIT_ALL_JOB(pJob)) { - SCH_LOCK(SCH_WRITE, &pTask->level->lock); - pTask->level->taskSucceed++; - taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; - SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); - - if (taskDone < pTask->level->taskNum) { - SCH_TASK_DLOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum); - return TSDB_CODE_SUCCESS; - } else if (taskDone > pTask->level->taskNum) { - SCH_TASK_ELOG("taskDone number invalid, done:%d, total:%d", taskDone, pTask->level->taskNum); - } - - if (pTask->level->taskFailed > 0) { - SCH_RET(schProcessOnJobFailure(pJob, 0)); - } else { - SCH_RET(schProcessOnJobPartialSuccess(pJob)); - } - } else { - pJob->resNode = pTask->succeedAddr; - } - - pJob->fetchTask = pTask; - - SCH_RET(schProcessOnJobPartialSuccess(pJob)); - } - - /* - if (SCH_IS_DATA_SRC_TASK(task) && job->dataSrcEps.numOfEps < SCH_MAX_CANDIDATE_EP_NUM) { - strncpy(job->dataSrcEps.fqdn[job->dataSrcEps.numOfEps], task->execAddr.fqdn, sizeof(task->execAddr.fqdn)); - job->dataSrcEps.port[job->dataSrcEps.numOfEps] = task->execAddr.port; - - ++job->dataSrcEps.numOfEps; - } - */ - - for (int32_t i = 0; i < parentNum; ++i) { - SSchTask *parent = *(SSchTask **)taosArrayGet(pTask->parents, i); - int32_t readyNum = atomic_add_fetch_32(&parent->childReady, 1); - - SCH_LOCK(SCH_WRITE, &parent->lock); - SDownstreamSourceNode source = {.type = QUERY_NODE_DOWNSTREAM_SOURCE, - .taskId = pTask->taskId, - .schedId = schMgmt.sId, - .execId = pTask->execId, - .addr = pTask->succeedAddr}; - qSetSubplanExecutionNode(parent->plan, pTask->plan->id.groupId, &source); - SCH_UNLOCK(SCH_WRITE, &parent->lock); - - if (SCH_TASK_READY_FOR_LAUNCH(readyNum, parent)) { - SCH_TASK_DLOG("all %d children task done, start to launch parent task 0x%" PRIx64, readyNum, parent->taskId); - SCH_ERR_RET(schLaunchTask(pJob, parent)); - } - } - - SCH_ERR_RET(schLaunchNextLevelTasks(pJob, pTask)); - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnJobFailure(pJob, code)); -} - -// Note: no more error processing, handled in function internal -int32_t schFetchFromRemote(SSchJob *pJob) { - int32_t code = 0; - - void *resData = atomic_load_ptr(&pJob->resData); - if (resData) { - SCH_JOB_DLOG("res already fetched, res:%p", resData); - return TSDB_CODE_SUCCESS; - } - - SCH_ERR_JRET(schBuildAndSendMsg(pJob, pJob->fetchTask, &pJob->resNode, TDMT_SCH_FETCH)); - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnTaskFailure(pJob, pJob->fetchTask, code)); -} - -int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp) { - SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed); - - atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows)); - atomic_store_ptr(&pJob->resData, pRsp); - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); - - schProcessOnDataFetched(pJob); - - return TSDB_CODE_SUCCESS; -} - -void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) { - if (NULL == pTask->execNodes) { - SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - return; - } - - int32_t size = (int32_t)taosHashGetSize(pTask->execNodes); - - if (size <= 0) { - SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - return; - } - - SSchNodeInfo *nodeInfo = taosHashIterate(pTask->execNodes, NULL); - while (nodeInfo) { - SCH_SET_TASK_HANDLE(pTask, nodeInfo->handle); - - schBuildAndSendMsg(pJob, pTask, &nodeInfo->addr, TDMT_SCH_DROP_TASK); - - nodeInfo = taosHashIterate(pTask->execNodes, nodeInfo); - } - - SCH_TASK_DLOG("task has been dropped on %d exec nodes", size); -} - - -int32_t schRescheduleTask(SSchJob *pJob, SSchTask *pTask) { - if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { - return TSDB_CODE_SUCCESS; - } - - SCH_LOCK_TASK(pTask); - if (SCH_TASK_TIMEOUT(pTask) && JOB_TASK_STATUS_EXECUTING == pTask->status && - pJob->fetchTask != pTask && taosArrayGetSize(pTask->candidateAddrs) > 1) { - SCH_TASK_DLOG("task execId %d will be rescheduled now", pTask->execId); - schDropTaskOnExecNode(pJob, pTask); - taosHashClear(pTask->execNodes); - schProcessOnTaskFailure(pJob, pTask, TSDB_CODE_SCH_TIMEOUT_ERROR); - } - SCH_UNLOCK_TASK(pTask); - - return TSDB_CODE_SUCCESS; -} - -int32_t schProcessOnTaskStatusRsp(SQueryNodeEpId* pEpId, SArray* pStatusList) { - int32_t taskNum = (int32_t)taosArrayGetSize(pStatusList); - SSchTask *pTask = NULL; - - qDebug("%d task status in hb rsp from nodeId:%d, fqdn:%s, port:%d", taskNum, pEpId->nodeId, pEpId->ep.fqdn, pEpId->ep.port); - - for (int32_t i = 0; i < taskNum; ++i) { - STaskStatus *taskStatus = taosArrayGet(pStatusList, i); - - qDebug("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d task status in server: %s", - taskStatus->queryId, taskStatus->taskId, taskStatus->execId, jobTaskStatusStr(taskStatus->status)); - - SSchJob *pJob = schAcquireJob(taskStatus->refId); - if (NULL == pJob) { - qWarn("job not found, refId:0x%" PRIx64 ",QID:0x%" PRIx64 ",TID:0x%" PRIx64, taskStatus->refId, - taskStatus->queryId, taskStatus->taskId); - // TODO DROP TASK FROM SERVER!!!! - continue; - } - - pTask = NULL; - schGetTaskInJob(pJob, taskStatus->taskId, &pTask); - if (NULL == pTask) { - // TODO DROP TASK FROM SERVER!!!! - schReleaseJob(taskStatus->refId); - continue; - } - - if (taskStatus->execId != pTask->execId) { - // TODO DROP TASK FROM SERVER!!!! - SCH_TASK_DLOG("EID %d in hb rsp mis-match", taskStatus->execId); - schReleaseJob(taskStatus->refId); - continue; - } - - if (taskStatus->status == JOB_TASK_STATUS_FAILED) { - // RECORD AND HANDLE ERROR!!!! - schReleaseJob(taskStatus->refId); - continue; - } - - if (taskStatus->status == JOB_TASK_STATUS_NOT_START) { - schRescheduleTask(pJob, pTask); - } - - schReleaseJob(taskStatus->refId); - } - - return TSDB_CODE_SUCCESS; -} - - int32_t schSaveJobQueryRes(SSchJob *pJob, SQueryTableRsp *rsp) { if (rsp->tbFName[0]) { if (NULL == pJob->execRes.res) { @@ -1331,22 +702,6 @@ int32_t schSaveJobQueryRes(SSchJob *pJob, SQueryTableRsp *rsp) { return TSDB_CODE_SUCCESS; } -int32_t schGetTaskFromList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask) { - int32_t s = taosHashGetSize(pTaskList); - if (s <= 0) { - return TSDB_CODE_SUCCESS; - } - - SSchTask **task = taosHashGet(pTaskList, &taskId, sizeof(taskId)); - if (NULL == task || NULL == (*task)) { - return TSDB_CODE_SUCCESS; - } - - *pTask = *task; - - return TSDB_CODE_SUCCESS; -} - int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask) { schGetTaskFromList(pJob->taskList, taskId, pTask); if (NULL == *pTask) { @@ -1357,113 +712,20 @@ int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask) { return TSDB_CODE_SUCCESS; } -int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { - int8_t status = 0; - int32_t code = 0; - - atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); - pTask->execId++; - - SCH_TASK_DLOG("start to launch task's %dth exec", pTask->execId); - - SCH_LOG_TASK_START_TS(pTask); - - if (schJobNeedToStop(pJob, &status)) { - SCH_TASK_DLOG("no need to launch task cause of job status, job status:%s", jobTaskStatusStr(status)); - - SCH_RET(atomic_load_32(&pJob->errCode)); - } - - // NOTE: race condition: the task should be put into the hash table before send msg to server - if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING) { - SCH_ERR_RET(schPushTaskToExecList(pJob, pTask)); - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_EXECUTING); - } - - SSubplan *plan = pTask->plan; - - if (NULL == pTask->msg) { // TODO add more detailed reason for failure - code = qSubPlanToString(plan, &pTask->msg, &pTask->msgLen); - if (TSDB_CODE_SUCCESS != code) { - SCH_TASK_ELOG("failed to create physical plan, code:%s, msg:%p, len:%d", tstrerror(code), pTask->msg, - pTask->msgLen); - SCH_ERR_RET(code); - } else { - SCH_TASK_DLOGL("physical plan len:%d, %s", pTask->msgLen, pTask->msg); - } - } - - SCH_ERR_RET(schSetTaskCandidateAddrs(pJob, pTask)); - - if (SCH_IS_QUERY_JOB(pJob)) { - SCH_ERR_RET(schEnsureHbConnection(pJob, pTask)); - } - - SCH_ERR_RET(schBuildAndSendMsg(pJob, pTask, NULL, plan->msgType)); - - return TSDB_CODE_SUCCESS; -} - -// Note: no more error processing, handled in function internal -int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) { - bool enough = false; - int32_t code = 0; - - SCH_SET_TASK_HANDLE(pTask, NULL); - - if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { - SCH_ERR_JRET(schCheckIncTaskFlowQuota(pJob, pTask, &enough)); - - if (enough) { - SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); - } - } else { - SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); -} - -int32_t schLaunchLevelTasks(SSchJob *pJob, SSchLevel *level) { - for (int32_t i = 0; i < level->taskNum; ++i) { - SSchTask *pTask = taosArrayGet(level->subTasks, i); - - SCH_ERR_RET(schLaunchTask(pJob, pTask)); - } - - return TSDB_CODE_SUCCESS; -} int32_t schLaunchJob(SSchJob *pJob) { - SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx); - - SCH_ERR_RET(schChkJobNeedFlowCtrl(pJob, level)); - - SCH_ERR_RET(schLaunchLevelTasks(pJob, level)); + if (EXPLAIN_MODE_STATIC == pJob->attr.explainMode) { + SCH_ERR_RET(qExecStaticExplain(pJob->pDag, (SRetrieveTableRsp **)&pJob->resData)); + SCH_ERR_RET(schJobStatusEnter(&pJob, JOB_TASK_STATUS_PART_SUCC, NULL)); + } else { + SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx); + SCH_ERR_RET(schLaunchLevelTasks(pJob, level)); + } return TSDB_CODE_SUCCESS; } -void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) { - if (!SCH_IS_NEED_DROP_JOB(pJob)) { - return; - } - - void *pIter = taosHashIterate(list, NULL); - while (pIter) { - SSchTask *pTask = *(SSchTask **)pIter; - - schDropTaskOnExecNode(pJob, pTask); - - pIter = taosHashIterate(list, pIter); - } -} - void schDropJobAllTasks(SSchJob *pJob) { schDropTaskInHashList(pJob, pJob->execTasks); // schDropTaskInHashList(pJob, pJob->succTasks); @@ -1487,7 +749,7 @@ void schFreeJobImpl(void *job) { qDebug("QID:0x%" PRIx64 " begin to free sch job, refId:0x%" PRIx64 ", pointer:%p", queryId, refId, pJob); - if (pJob->status == JOB_TASK_STATUS_EXECUTING) { + if (pJob->status == JOB_TASK_STATUS_EXEC) { schCancelJob(pJob); } @@ -1535,88 +797,11 @@ void schFreeJobImpl(void *job) { qDebug("QID:0x%" PRIx64 " sch job freed, refId:0x%" PRIx64 ", pointer:%p", queryId, refId, pJob); } -int32_t schLaunchStaticExplainJob(SSchedulerReq *pReq, SSchJob *pJob, bool sync) { - qDebug("QID:0x%" PRIx64 " job started", pReq->pDag->queryId); - - int32_t code = 0; -/* - SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); - if (NULL == pJob) { - qError("QID:0x%" PRIx64 " calloc %d failed", pReq->pDag->queryId, (int32_t)sizeof(SSchJob)); - code = TSDB_CODE_QRY_OUT_OF_MEMORY; - pReq->fp(NULL, pReq->cbParam, code); - SCH_ERR_RET(code); - } - - pJob->sql = pReq->sql; - pJob->reqKilled = pReq->reqKilled; - pJob->pDag = pReq->pDag; - pJob->attr.queryJob = true; - pJob->attr.explainMode = pReq->pDag->explainInfo.mode; - pJob->queryId = pReq->pDag->queryId; - pJob->userRes.execFp = pReq->fp; - pJob->userRes.userParam = pReq->cbParam; - - schUpdateJobStatus(pJob, JOB_TASK_STATUS_NOT_START); - - code = schBeginOperation(pJob, SCH_OP_EXEC, sync); - if (code) { - pReq->fp(NULL, pReq->cbParam, code); - schFreeJobImpl(pJob); - SCH_ERR_RET(code); - } -*/ - - SCH_ERR_JRET(qExecStaticExplain(pReq->pDag, (SRetrieveTableRsp **)&pJob->resData)); - -/* - int64_t refId = taosAddRef(schMgmt.jobRef, pJob); - if (refId < 0) { - SCH_JOB_ELOG("taosAddRef job failed, error:%s", tstrerror(terrno)); - SCH_ERR_JRET(terrno); - } - - if (NULL == schAcquireJob(refId)) { - SCH_JOB_ELOG("schAcquireJob job failed, refId:0x%" PRIx64, refId); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - pJob->refId = refId; - - SCH_JOB_DLOG("job refId:0x%" PRIx64, pJob->refId); -*/ - - pJob->status = JOB_TASK_STATUS_PARTIAL_SUCCEED; - - SCH_JOB_DLOG("job exec done, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); - - if (!sync) { - schPostJobRes(pJob, SCH_OP_EXEC); - } else { - schEndOperation(pJob); - } - -// schReleaseJob(pJob->refId); - - SCH_RET(code); - -_return: - - schEndOperation(pJob); - if (!sync) { - pReq->execFp(NULL, pReq->execParam, code); - } - - schFreeJobImpl(pJob); - - SCH_RET(code); -} - -int32_t schFetchRows(SSchJob *pJob) { +int32_t schJobFetchRows(SSchJob *pJob) { int32_t code = 0; if (!(pJob->attr.explainMode == EXPLAIN_MODE_STATIC)) { - SCH_ERR_JRET(schFetchFromRemote(pJob)); + SCH_ERR_JRET(schLaunchFetchTask(pJob)); tsem_wait(&pJob->rspSem); } @@ -1629,7 +814,7 @@ _return: SCH_RET(code); } -int32_t schAsyncFetchRows(SSchJob *pJob) { +int32_t schJobFetchRowsA(SSchJob *pJob) { int32_t code = 0; if (pJob->attr.explainMode == EXPLAIN_MODE_STATIC) { @@ -1637,129 +822,55 @@ int32_t schAsyncFetchRows(SSchJob *pJob) { return TSDB_CODE_SUCCESS; } - SCH_ERR_RET(schFetchFromRemote(pJob)); + SCH_ERR_RET(schLaunchFetchTask(pJob)); return TSDB_CODE_SUCCESS; } - -int32_t schExecJobImpl(SSchedulerReq *pReq, SSchJob *pJob, bool sync) { - int32_t code = 0; - +int32_t schExecJob(SSchJob *pJob, SSchedulerReq *pReq) { + int32_t code = 0; qDebug("QID:0x%" PRIx64 " sch job refId 0x%"PRIx64 " started", pReq->pDag->queryId, pJob->refId); - SCH_ERR_JRET(schBeginOperation(pJob, SCH_OP_EXEC, sync)); - - if (EXPLAIN_MODE_STATIC == pReq->pDag->explainInfo.mode) { - code = schLaunchStaticExplainJob(pReq, pJob, sync); - } else { - code = schLaunchJob(pJob); - if (sync) { - SCH_JOB_DLOG("will wait for rsp now, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); - tsem_wait(&pJob->rspSem); - - schEndOperation(pJob); - } else if (code) { - schPostJobRes(pJob, SCH_OP_EXEC); - } + SCH_ERR_JRET(schLaunchJob(pJob)); + + if (pReq->syncReq) { + SCH_JOB_DLOG("sync wait for rsp now, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); + tsem_wait(&pJob->rspSem); } SCH_JOB_DLOG("job exec done, job status:%s, jobId:0x%" PRIx64, SCH_GET_JOB_STATUS_STR(pJob), pJob->refId); - SCH_RET(code); - -_return: - - if (!sync) { - pReq->execFp(NULL, pReq->execParam, code); - } - - SCH_RET(code); -} - -int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32_t rspCode) { - int32_t code = 0; - - if ((pTask->execId + 1) >= pTask->maxExecTimes) { - SCH_TASK_DLOG("task no more retry since reach max try times, execId:%d", pTask->execId); - schProcessOnJobFailure(pJob, rspCode); - return TSDB_CODE_SUCCESS; - } - - SCH_TASK_DLOG("task will be redirected now, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - - schDropTaskOnExecNode(pJob, pTask); - taosHashClear(pTask->execNodes); - SCH_ERR_JRET(schRemoveTaskFromExecList(pJob, pTask)); - schDeregisterTaskHb(pJob, pTask); - atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); - taosMemoryFreeClear(pTask->msg); - pTask->msgLen = 0; - pTask->lastMsgType = 0; - memset(&pTask->succeedAddr, 0, sizeof(pTask->succeedAddr)); - - if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { - if (pData) { - SCH_ERR_JRET(schUpdateTaskCandidateAddr(pJob, pTask, pData->pEpSet)); - } - - if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { - if (JOB_TASK_STATUS_EXECUTING == SCH_GET_TASK_STATUS(pTask)) { - SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); - } - } - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - - SCH_ERR_JRET(schLaunchTask(pJob, pTask)); - - return TSDB_CODE_SUCCESS; - } - - - // merge plan - - pTask->childReady = 0; - - qClearSubplanExecutionNode(pTask->plan); - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - - int32_t childrenNum = taosArrayGetSize(pTask->children); - for (int32_t i = 0; i < childrenNum; ++i) { - SSchTask* pChild = taosArrayGetP(pTask->children, i); - SCH_LOCK_TASK(pChild); - schDoTaskRedirect(pJob, pChild, NULL, rspCode); - SCH_UNLOCK_TASK(pChild); - } - return TSDB_CODE_SUCCESS; _return: - - code = schProcessOnTaskFailure(pJob, pTask, code); - - SCH_RET(code); + + SCH_RET(schProcessOnJobFailure(pJob, code)); } -int32_t schHandleRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32_t rspCode) { - int32_t code = 0; +int32_t schJobStatusEnter(SSchJob** job, int32_t status, void* param) { + SCH_ERR_RET(schUpdateJobStatus(*job, status)); - if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { - if (NULL == pData->pEpSet) { - SCH_TASK_ELOG("no epset updated while got error %s", tstrerror(rspCode)); - SCH_ERR_JRET(rspCode); + switch (status) { + case JOB_TASK_STATUS_INIT: + SCH_RET(schInitJob(job, param)); + case JOB_TASK_STATUS_EXEC: + SCH_RET(schExecJob(job, param)); + case JOB_TASK_STATUS_PART_SUCC: + default: { + SSchJob* pJob = *job; + SCH_JOB_ELOG("enter unknown job status %d", status); + SCH_RET(TSDB_CODE_SCH_STATUS_ERROR); } } - SCH_RET(schDoTaskRedirect(pJob, pTask, pData, rspCode)); + return TSDB_CODE_SUCCESS; +} -_return: - - schProcessOnTaskFailure(pJob, pTask, code); - - SCH_RET(code); +int32_t schJobStatusEvent() { + + schEndOperation(pJob); } + diff --git a/source/libs/scheduler/src/schRemote.c b/source/libs/scheduler/src/schRemote.c index 32f151f8af..479d3665a4 100644 --- a/source/libs/scheduler/src/schRemote.c +++ b/source/libs/scheduler/src/schRemote.c @@ -37,7 +37,7 @@ int32_t schValidateReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgTy TMSG_INFO(msgType)); } - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + if (taskStatus != JOB_TASK_STATUS_EXEC && taskStatus != JOB_TASK_STATUS_PART_SUCC) { SCH_TASK_DLOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), TMSG_INFO(msgType)); } @@ -51,7 +51,7 @@ int32_t schValidateReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgTy SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); } - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + if (taskStatus != JOB_TASK_STATUS_EXEC && taskStatus != JOB_TASK_STATUS_PART_SUCC) { SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), TMSG_INFO(msgType)); SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); @@ -76,7 +76,7 @@ int32_t schValidateReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgTy SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); } - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + if (taskStatus != JOB_TASK_STATUS_EXEC && taskStatus != JOB_TASK_STATUS_PART_SUCC) { SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), TMSG_INFO(msgType)); SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); @@ -308,7 +308,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch return TSDB_CODE_SUCCESS; } - SCH_ERR_JRET(schFetchFromRemote(pJob)); + SCH_ERR_JRET(schLaunchFetchTask(pJob)); taosMemoryFreeClear(msg); @@ -325,7 +325,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch atomic_add_fetch_32(&pJob->resNumOfRows, htonl(rsp->numOfRows)); if (rsp->completed) { - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCC); } SCH_TASK_DLOG("got fetch rsp, rows:%d, complete:%d", htonl(rsp->numOfRows), rsp->completed); diff --git a/source/libs/scheduler/src/schStatus.c b/source/libs/scheduler/src/schStatus.c new file mode 100644 index 0000000000..a8cac993cf --- /dev/null +++ b/source/libs/scheduler/src/schStatus.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "catalog.h" +#include "command.h" +#include "query.h" +#include "schInt.h" +#include "tmsg.h" +#include "tref.h" +#include "trpc.h" + +SSchStatusFps gSchJobFps[JOB_TASK_STATUS_MAX] = { + {JOB_TASK_STATUS_NULL, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_INIT, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_EXEC, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_PART_SUCC, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_SUCC, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_FAIL, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, + {JOB_TASK_STATUS_DROP, schJobStNullEnter, schJobStNullLeave, schJobStNullEvent}, +}; + +SSchStatusFps gSchTaskFps[JOB_TASK_STATUS_MAX] = { + {JOB_TASK_STATUS_NULL, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_INIT, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_EXEC, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_PART_SUCC, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_SUCC, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_FAIL, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, + {JOB_TASK_STATUS_DROP, schTaskStatusNullEnter, schTaskStatusNullLeave, schTaskStatusNullEvent}, +}; + + + + diff --git a/source/libs/scheduler/src/schTask.c b/source/libs/scheduler/src/schTask.c new file mode 100644 index 0000000000..ccbd1f4615 --- /dev/null +++ b/source/libs/scheduler/src/schTask.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "catalog.h" +#include "command.h" +#include "query.h" +#include "schedulerInt.h" +#include "tmsg.h" +#include "tref.h" +#include "trpc.h" + + + +void schFreeTask(SSchJob *pJob, SSchTask *pTask) { + schDeregisterTaskHb(pJob, pTask); + + if (pTask->candidateAddrs) { + taosArrayDestroy(pTask->candidateAddrs); + } + + taosMemoryFreeClear(pTask->msg); + + if (pTask->children) { + taosArrayDestroy(pTask->children); + } + + if (pTask->parents) { + taosArrayDestroy(pTask->parents); + } + + if (pTask->execNodes) { + taosHashCleanup(pTask->execNodes); + } +} + + +int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel) { + pTask->plan = pPlan; + pTask->level = pLevel; + pTask->execId = -1; + pTask->maxExecTimes = SCH_TASK_MAX_EXEC_TIMES; + pTask->timeoutUsec = SCH_DEFAULT_TASK_TIMEOUT_USEC; + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT); + pTask->taskId = schGenTaskId(); + pTask->execNodes = taosHashInit(SCH_MAX_CANDIDATE_EP_NUM, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_NO_LOCK); + if (NULL == pTask->execNodes) { + SCH_TASK_ELOG("taosHashInit %d execNodes failed", SCH_MAX_CANDIDATE_EP_NUM); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) { + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + if (NULL == addr) { + SCH_TASK_ELOG("taosArrayGet candidate addr failed, idx:%d, size:%d", pTask->candidateIdx, + (int32_t)taosArrayGetSize(pTask->candidateAddrs)); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + pTask->succeedAddr = *addr; + + return TSDB_CODE_SUCCESS; +} + +int32_t schAppendTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, int32_t execId) { + SSchNodeInfo nodeInfo = {.addr = *addr, .handle = NULL}; + + if (taosHashPut(pTask->execNodes, &execId, sizeof(execId), &nodeInfo, sizeof(nodeInfo))) { + SCH_TASK_ELOG("taosHashPut nodeInfo to execNodes failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("task execNode added, execId:%d", execId); + + return TSDB_CODE_SUCCESS; +} + +int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_t execId) { + if (NULL == pTask->execNodes) { + return TSDB_CODE_SUCCESS; + } + + if (taosHashRemove(pTask->execNodes, &execId, sizeof(execId))) { + SCH_TASK_ELOG("fail to remove execId %d from execNodeList", execId); + } else { + SCH_TASK_DLOG("execId %d removed from execNodeList", execId); + } + + if (execId != pTask->execId) { // ignore it + SCH_TASK_DLOG("execId %d is not current execId %d", execId, pTask->execId); + SCH_RET(TSDB_CODE_SCH_IGNORE_ERROR); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_t execId) { + if (taosHashGetSize(pTask->execNodes) <= 0) { + return TSDB_CODE_SUCCESS; + } + + SSchNodeInfo *nodeInfo = taosHashGet(pTask->execNodes, &execId, sizeof(execId)); + nodeInfo->handle = handle; + + SCH_TASK_DLOG("handle updated to %p for execId %d", handle, execId); + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateTaskHandle(SSchJob *pJob, SSchTask *pTask, bool dropExecNode, void *handle, int32_t execId) { + if (dropExecNode) { + SCH_RET(schDropTaskExecNode(pJob, pTask, handle, execId)); + } + + SCH_SET_TASK_HANDLE(pTask, handle); + + schUpdateTaskExecNode(pJob, pTask, handle, execId); + + return TSDB_CODE_SUCCESS; +} + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode) { + int8_t status = 0; + + if (errCode == TSDB_CODE_SCH_TIMEOUT_ERROR) { + SCH_LOG_TASK_WAIT_TS(pTask); + } else { + SCH_LOG_TASK_END_TS(pTask); + } + + if (schJobNeedToStop(pJob, &status)) { + SCH_TASK_DLOG("task failed not processed cause of job status, job status:%s", jobTaskStatusStr(status)); + SCH_RET(atomic_load_32(&pJob->errCode)); + } + + bool needRetry = false; + bool moved = false; + int32_t taskDone = 0; + int32_t code = 0; + + SCH_TASK_DLOG("taskOnFailure, code:%s", tstrerror(errCode)); + + SCH_ERR_JRET(schTaskCheckSetRetry(pJob, pTask, errCode, &needRetry)); + + if (!needRetry) { + SCH_TASK_ELOG("task failed and no more retry, code:%s", tstrerror(errCode)); + + if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXEC) { + SCH_TASK_ELOG("task not in executing list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_FAIL); + + if (SCH_IS_WAIT_ALL_JOB(pJob)) { + SCH_LOCK(SCH_WRITE, &pTask->level->lock); + pTask->level->taskFailed++; + taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; + SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); + + schUpdateJobErrCode(pJob, errCode); + + if (taskDone < pTask->level->taskNum) { + SCH_TASK_DLOG("need to wait other tasks, doneNum:%d, allNum:%d", taskDone, pTask->level->taskNum); + SCH_RET(errCode); + } + } + } else { + SCH_ERR_JRET(schHandleTaskRetry(pJob, pTask)); + + return TSDB_CODE_SUCCESS; + } + +_return: + + SCH_RET(schProcessOnJobFailure(pJob, errCode)); +} + + + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { + bool moved = false; + int32_t code = 0; + + SCH_TASK_DLOG("taskOnSuccess, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + + SCH_LOG_TASK_END_TS(pTask); + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_PART_SUCC); + + SCH_ERR_JRET(schRecordTaskSucceedNode(pJob, pTask)); + + SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); + + int32_t parentNum = pTask->parents ? (int32_t)taosArrayGetSize(pTask->parents) : 0; + if (parentNum == 0) { + int32_t taskDone = 0; + if (SCH_IS_WAIT_ALL_JOB(pJob)) { + SCH_LOCK(SCH_WRITE, &pTask->level->lock); + pTask->level->taskSucceed++; + taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; + SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); + + if (taskDone < pTask->level->taskNum) { + SCH_TASK_DLOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum); + return TSDB_CODE_SUCCESS; + } else if (taskDone > pTask->level->taskNum) { + SCH_TASK_ELOG("taskDone number invalid, done:%d, total:%d", taskDone, pTask->level->taskNum); + } + + if (pTask->level->taskFailed > 0) { + SCH_RET(schProcessOnJobFailure(pJob, 0)); + } else { + SCH_RET(schProcessOnJobPartialSuccess(pJob)); + } + } else { + pJob->resNode = pTask->succeedAddr; + } + + pJob->fetchTask = pTask; + + SCH_RET(schProcessOnJobPartialSuccess(pJob)); + } + + /* + if (SCH_IS_DATA_SRC_TASK(task) && job->dataSrcEps.numOfEps < SCH_MAX_CANDIDATE_EP_NUM) { + strncpy(job->dataSrcEps.fqdn[job->dataSrcEps.numOfEps], task->execAddr.fqdn, sizeof(task->execAddr.fqdn)); + job->dataSrcEps.port[job->dataSrcEps.numOfEps] = task->execAddr.port; + + ++job->dataSrcEps.numOfEps; + } + */ + + for (int32_t i = 0; i < parentNum; ++i) { + SSchTask *parent = *(SSchTask **)taosArrayGet(pTask->parents, i); + int32_t readyNum = atomic_add_fetch_32(&parent->childReady, 1); + + SCH_LOCK(SCH_WRITE, &parent->lock); + SDownstreamSourceNode source = {.type = QUERY_NODE_DOWNSTREAM_SOURCE, + .taskId = pTask->taskId, + .schedId = schMgmt.sId, + .execId = pTask->execId, + .addr = pTask->succeedAddr}; + qSetSubplanExecutionNode(parent->plan, pTask->plan->id.groupId, &source); + SCH_UNLOCK(SCH_WRITE, &parent->lock); + + if (SCH_TASK_READY_FOR_LAUNCH(readyNum, parent)) { + SCH_TASK_DLOG("all %d children task done, start to launch parent task 0x%" PRIx64, readyNum, parent->taskId); + SCH_ERR_RET(schLaunchTask(pJob, parent)); + } + } + + SCH_ERR_RET(schLaunchJobLowerLevel(pJob, pTask)); + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnJobFailure(pJob, code)); +} + +int32_t schRescheduleTask(SSchJob *pJob, SSchTask *pTask) { + if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { + return TSDB_CODE_SUCCESS; + } + + SCH_LOCK_TASK(pTask); + if (SCH_TASK_TIMEOUT(pTask) && JOB_TASK_STATUS_EXEC == pTask->status && + pJob->fetchTask != pTask && taosArrayGetSize(pTask->candidateAddrs) > 1) { + SCH_TASK_DLOG("task execId %d will be rescheduled now", pTask->execId); + schDropTaskOnExecNode(pJob, pTask); + taosHashClear(pTask->execNodes); + schProcessOnTaskFailure(pJob, pTask, TSDB_CODE_SCH_TIMEOUT_ERROR); + } + SCH_UNLOCK_TASK(pTask); + + return TSDB_CODE_SUCCESS; +} + +int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32_t rspCode) { + int32_t code = 0; + + if ((pTask->execId + 1) >= pTask->maxExecTimes) { + SCH_TASK_DLOG("task no more retry since reach max try times, execId:%d", pTask->execId); + schProcessOnJobFailure(pJob, rspCode); + return TSDB_CODE_SUCCESS; + } + + SCH_TASK_DLOG("task will be redirected now, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + + schDropTaskOnExecNode(pJob, pTask); + taosHashClear(pTask->execNodes); + SCH_ERR_JRET(schRemoveTaskFromExecList(pJob, pTask)); + schDeregisterTaskHb(pJob, pTask); + atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); + taosMemoryFreeClear(pTask->msg); + pTask->msgLen = 0; + pTask->lastMsgType = 0; + memset(&pTask->succeedAddr, 0, sizeof(pTask->succeedAddr)); + + if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { + if (pData) { + SCH_ERR_JRET(schUpdateTaskCandidateAddr(pJob, pTask, pData->pEpSet)); + } + + if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { + if (JOB_TASK_STATUS_EXEC == SCH_GET_TASK_STATUS(pTask)) { + SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); + } + } + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT); + + SCH_ERR_JRET(schLaunchTask(pJob, pTask)); + + return TSDB_CODE_SUCCESS; + } + + + // merge plan + + pTask->childReady = 0; + + qClearSubplanExecutionNode(pTask->plan); + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT); + + int32_t childrenNum = taosArrayGetSize(pTask->children); + for (int32_t i = 0; i < childrenNum; ++i) { + SSchTask* pChild = taosArrayGetP(pTask->children, i); + SCH_LOCK_TASK(pChild); + schDoTaskRedirect(pJob, pChild, NULL, rspCode); + SCH_UNLOCK_TASK(pChild); + } + + return TSDB_CODE_SUCCESS; + +_return: + + code = schProcessOnTaskFailure(pJob, pTask, code); + + SCH_RET(code); +} + +int32_t schHandleRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32_t rspCode) { + int32_t code = 0; + + if (SCH_IS_DATA_SRC_QRY_TASK(pTask)) { + if (NULL == pData->pEpSet) { + SCH_TASK_ELOG("no epset updated while got error %s", tstrerror(rspCode)); + SCH_ERR_JRET(rspCode); + } + } + + SCH_RET(schDoTaskRedirect(pJob, pTask, pData, rspCode)); + +_return: + + schProcessOnTaskFailure(pJob, pTask, code); + + SCH_RET(code); +} + +int32_t schPushTaskToExecList(SSchJob *pJob, SSchTask *pTask) { + int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + SCH_TASK_ELOG("task already in execTask list, code:%x", code); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("task added to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + + return TSDB_CODE_SUCCESS; +} + +/* +int32_t schMoveTaskToSuccList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } else { + SCH_TASK_DLOG("task removed from execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + } + + int32_t code = taosHashPut(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + SCH_TASK_ELOG("task already in succTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to succTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to succTask list, numOfTasks:%d", taosHashGetSize(pJob->succTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schMoveTaskToFailList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + *moved = false; + + if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } + + int32_t code = taosHashPut(pJob->failTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + + SCH_TASK_WLOG("task already in failTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to failTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to failTask list, numOfTasks:%d", taosHashGetSize(pJob->failTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } + + int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + + SCH_TASK_ELOG("task already in execTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + + return TSDB_CODE_SUCCESS; +} +*/ + +int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bool *needRetry) { + if (TSDB_CODE_SCH_TIMEOUT_ERROR == errCode) { + pTask->maxExecTimes++; + if (pTask->timeoutUsec < SCH_MAX_TASK_TIMEOUT_USEC) { + pTask->timeoutUsec *= 2; + if (pTask->timeoutUsec > SCH_MAX_TASK_TIMEOUT_USEC) { + pTask->timeoutUsec = SCH_MAX_TASK_TIMEOUT_USEC; + } + } + } + + if ((pTask->execId + 1) >= pTask->maxExecTimes) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since reach max try times, execId:%d", pTask->execId); + return TSDB_CODE_SUCCESS; + } + + if (!SCH_NEED_RETRY(pTask->lastMsgType, errCode)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry cause of errCode, errCode:%x - %s", errCode, tstrerror(errCode)); + return TSDB_CODE_SUCCESS; + } + + if (SCH_IS_DATA_SRC_TASK(pTask)) { + if ((pTask->execId + 1) >= SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since all ep tried, execId:%d, epNum:%d", pTask->execId, + SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)); + return TSDB_CODE_SUCCESS; + } + } else { + int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); + + if ((pTask->candidateIdx + 1) >= candidateNum && (TSDB_CODE_SCH_TIMEOUT_ERROR != errCode)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since all candiates tried, candidateIdx:%d, candidateNum:%d", + pTask->candidateIdx, candidateNum); + return TSDB_CODE_SUCCESS; + } + } + + *needRetry = true; + SCH_TASK_DLOG("task need the %dth retry, errCode:%x - %s", pTask->execId + 1, errCode, tstrerror(errCode)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) { + atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); + + SCH_ERR_RET(schRemoveTaskFromExecList(pJob, pTask)); + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT); + + if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { + SCH_ERR_RET(schLaunchTasksInFlowCtrlList(pJob, pTask)); + } + + schDeregisterTaskHb(pJob, pTask); + + if (SCH_IS_DATA_SRC_TASK(pTask)) { + SCH_SWITCH_EPSET(&pTask->plan->execNode); + } else { + int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); + if (++pTask->candidateIdx >= candidateNum) { + pTask->candidateIdx = 0; + } + } + + SCH_ERR_RET(schLaunchTask(pJob, pTask)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) { + int32_t addNum = 0; + int32_t nodeNum = 0; + + if (pJob->nodeList) { + nodeNum = taosArrayGetSize(pJob->nodeList); + + for (int32_t i = 0; i < nodeNum && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { + SQueryNodeLoad *nload = taosArrayGet(pJob->nodeList, i); + SQueryNodeAddr *naddr = &nload->addr; + + if (NULL == taosArrayPush(pTask->candidateAddrs, naddr)) { + SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, addNum:%d, errno:%d", addNum, errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("set %dth candidate addr, id %d, fqdn:%s, port:%d", i, naddr->nodeId, SCH_GET_CUR_EP(naddr)->fqdn, SCH_GET_CUR_EP(naddr)->port); + + ++addNum; + } + } + + if (addNum <= 0) { + SCH_TASK_ELOG("no available execNode as candidates, nodeNum:%d", nodeNum); + SCH_ERR_RET(TSDB_CODE_TSC_NO_EXEC_NODE); + } + + return TSDB_CODE_SUCCESS; +} + + +int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) { + if (NULL != pTask->candidateAddrs) { + return TSDB_CODE_SUCCESS; + } + + pTask->candidateIdx = 0; + pTask->candidateAddrs = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SQueryNodeAddr)); + if (NULL == pTask->candidateAddrs) { + SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCH_MAX_CANDIDATE_EP_NUM); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + if (pTask->plan->execNode.epSet.numOfEps > 0) { + if (NULL == taosArrayPush(pTask->candidateAddrs, &pTask->plan->execNode)) { + SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("use execNode in plan as candidate addr, numOfEps:%d", pTask->plan->execNode.epSet.numOfEps); + + return TSDB_CODE_SUCCESS; + } + + SCH_ERR_RET(schSetAddrsFromNodeList(pJob, pTask)); + + /* + for (int32_t i = 0; i < job->dataSrcEps.numOfEps && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { + strncpy(epSet->fqdn[epSet->numOfEps], job->dataSrcEps.fqdn[i], sizeof(job->dataSrcEps.fqdn[i])); + epSet->port[epSet->numOfEps] = job->dataSrcEps.port[i]; + + ++epSet->numOfEps; + } + */ + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask, SEpSet* pEpSet) { + if (NULL == pTask->candidateAddrs || 1 != taosArrayGetSize(pTask->candidateAddrs)) { + SCH_TASK_ELOG("not able to update cndidate addr, addr num %d", (int32_t)(pTask->candidateAddrs ? taosArrayGetSize(pTask->candidateAddrs): 0)); + SCH_ERR_RET(TSDB_CODE_APP_ERROR); + } + + SQueryNodeAddr* pAddr = taosArrayGet(pTask->candidateAddrs, 0); + + SEp* pOld = &pAddr->epSet.eps[pAddr->epSet.inUse]; + SEp* pNew = &pEpSet->eps[pEpSet->inUse]; + + SCH_TASK_DLOG("update task ep from %s:%d to %s:%d", pOld->fqdn, pOld->port, pNew->fqdn, pNew->port); + + memcpy(&pAddr->epSet, pEpSet, sizeof(pAddr->epSet)); + + return TSDB_CODE_SUCCESS; +} + + +int32_t schRemoveTaskFromExecList(SSchJob *pJob, SSchTask *pTask) { + int32_t code = taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId)); + if (code) { + SCH_TASK_ELOG("task failed to rm from execTask list, code:%x", code); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + return TSDB_CODE_SUCCESS; +} + +void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) { + if (NULL == pTask->execNodes) { + SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + return; + } + + int32_t size = (int32_t)taosHashGetSize(pTask->execNodes); + + if (size <= 0) { + SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + return; + } + + SSchNodeInfo *nodeInfo = taosHashIterate(pTask->execNodes, NULL); + while (nodeInfo) { + SCH_SET_TASK_HANDLE(pTask, nodeInfo->handle); + + schBuildAndSendMsg(pJob, pTask, &nodeInfo->addr, TDMT_SCH_DROP_TASK); + + nodeInfo = taosHashIterate(pTask->execNodes, nodeInfo); + } + + SCH_TASK_DLOG("task has been dropped on %d exec nodes", size); +} + + + +int32_t schProcessOnTaskStatusRsp(SQueryNodeEpId* pEpId, SArray* pStatusList) { + int32_t taskNum = (int32_t)taosArrayGetSize(pStatusList); + SSchTask *pTask = NULL; + + qDebug("%d task status in hb rsp from nodeId:%d, fqdn:%s, port:%d", taskNum, pEpId->nodeId, pEpId->ep.fqdn, pEpId->ep.port); + + for (int32_t i = 0; i < taskNum; ++i) { + STaskStatus *taskStatus = taosArrayGet(pStatusList, i); + + qDebug("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d task status in server: %s", + taskStatus->queryId, taskStatus->taskId, taskStatus->execId, jobTaskStatusStr(taskStatus->status)); + + SSchJob *pJob = schAcquireJob(taskStatus->refId); + if (NULL == pJob) { + qWarn("job not found, refId:0x%" PRIx64 ",QID:0x%" PRIx64 ",TID:0x%" PRIx64, taskStatus->refId, + taskStatus->queryId, taskStatus->taskId); + // TODO DROP TASK FROM SERVER!!!! + continue; + } + + pTask = NULL; + schGetTaskInJob(pJob, taskStatus->taskId, &pTask); + if (NULL == pTask) { + // TODO DROP TASK FROM SERVER!!!! + schReleaseJob(taskStatus->refId); + continue; + } + + if (taskStatus->execId != pTask->execId) { + // TODO DROP TASK FROM SERVER!!!! + SCH_TASK_DLOG("EID %d in hb rsp mis-match", taskStatus->execId); + schReleaseJob(taskStatus->refId); + continue; + } + + if (taskStatus->status == JOB_TASK_STATUS_FAIL) { + // RECORD AND HANDLE ERROR!!!! + schReleaseJob(taskStatus->refId); + continue; + } + + if (taskStatus->status == JOB_TASK_STATUS_INIT) { + schRescheduleTask(pJob, pTask); + } + + schReleaseJob(taskStatus->refId); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { + int8_t status = 0; + int32_t code = 0; + + atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); + pTask->execId++; + + SCH_TASK_DLOG("start to launch task's %dth exec", pTask->execId); + + SCH_LOG_TASK_START_TS(pTask); + + if (schJobNeedToStop(pJob, &status)) { + SCH_TASK_DLOG("no need to launch task cause of job status, job status:%s", jobTaskStatusStr(status)); + + SCH_RET(atomic_load_32(&pJob->errCode)); + } + + // NOTE: race condition: the task should be put into the hash table before send msg to server + if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXEC) { + SCH_ERR_RET(schPushTaskToExecList(pJob, pTask)); + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_EXEC); + } + + SSubplan *plan = pTask->plan; + + if (NULL == pTask->msg) { // TODO add more detailed reason for failure + code = qSubPlanToString(plan, &pTask->msg, &pTask->msgLen); + if (TSDB_CODE_SUCCESS != code) { + SCH_TASK_ELOG("failed to create physical plan, code:%s, msg:%p, len:%d", tstrerror(code), pTask->msg, + pTask->msgLen); + SCH_ERR_RET(code); + } else { + SCH_TASK_DLOGL("physical plan len:%d, %s", pTask->msgLen, pTask->msg); + } + } + + SCH_ERR_RET(schSetTaskCandidateAddrs(pJob, pTask)); + + if (SCH_IS_QUERY_JOB(pJob)) { + SCH_ERR_RET(schEnsureHbConnection(pJob, pTask)); + } + + SCH_ERR_RET(schBuildAndSendMsg(pJob, pTask, NULL, plan->msgType)); + + return TSDB_CODE_SUCCESS; +} + +// Note: no more error processing, handled in function internal +int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) { + bool enough = false; + int32_t code = 0; + + SCH_SET_TASK_HANDLE(pTask, NULL); + + if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { + SCH_ERR_JRET(schCheckIncTaskFlowQuota(pJob, pTask, &enough)); + + if (enough) { + SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); + } + } else { + SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); +} + +int32_t schLaunchLevelTasks(SSchJob *pJob, SSchLevel *level) { + SCH_ERR_RET(schChkJobNeedFlowCtrl(pJob, level)); + + for (int32_t i = 0; i < level->taskNum; ++i) { + SSchTask *pTask = taosArrayGet(level->subTasks, i); + + SCH_ERR_RET(schLaunchTask(pJob, pTask)); + } + + return TSDB_CODE_SUCCESS; +} + +void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) { + if (!SCH_IS_NEED_DROP_JOB(pJob)) { + return; + } + + void *pIter = taosHashIterate(list, NULL); + while (pIter) { + SSchTask *pTask = *(SSchTask **)pIter; + + schDropTaskOnExecNode(pJob, pTask); + + pIter = taosHashIterate(list, pIter); + } +} + + +// Note: no more error processing, handled in function internal +int32_t schLaunchFetchTask(SSchJob *pJob) { + int32_t code = 0; + + void *resData = atomic_load_ptr(&pJob->resData); + if (resData) { + SCH_JOB_DLOG("res already fetched, res:%p", resData); + return TSDB_CODE_SUCCESS; + } + + SCH_ERR_JRET(schBuildAndSendMsg(pJob, pJob->fetchTask, &pJob->resNode, TDMT_SCH_FETCH)); + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnTaskFailure(pJob, pJob->fetchTask, code)); +} + + diff --git a/source/libs/scheduler/src/schUtil.c b/source/libs/scheduler/src/schUtil.c index 73077cbf0f..f0ff12b56b 100644 --- a/source/libs/scheduler/src/schUtil.c +++ b/source/libs/scheduler/src/schUtil.c @@ -283,3 +283,20 @@ void schFreeSMsgSendInfo(SMsgSendInfo *msgSendInfo) { taosMemoryFree(msgSendInfo); } +int32_t schGetTaskFromList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask) { + int32_t s = taosHashGetSize(pTaskList); + if (s <= 0) { + return TSDB_CODE_SUCCESS; + } + + SSchTask **task = taosHashGet(pTaskList, &taskId, sizeof(taskId)); + if (NULL == task || NULL == (*task)) { + return TSDB_CODE_SUCCESS; + } + + *pTask = *task; + + return TSDB_CODE_SUCCESS; +} + + diff --git a/source/libs/scheduler/src/scheduler.c b/source/libs/scheduler/src/scheduler.c index e2389c2a75..cbc6a1c17a 100644 --- a/source/libs/scheduler/src/scheduler.c +++ b/source/libs/scheduler/src/scheduler.c @@ -67,49 +67,22 @@ int32_t schedulerInit(SSchedulerCfg *cfg) { return TSDB_CODE_SUCCESS; } -int32_t schedulerExecJob(SSchedulerReq *pReq, int64_t *pJobId, SQueryResult *pRes) { - qDebug("scheduler sync exec job start"); +int32_t schedulerExecJob(SSchedulerReq *pReq, int64_t *pJobId) { + qDebug("scheduler %s exec job start", pReq->syncReq ? "SYNC" : "ASYNC"); int32_t code = 0; SSchJob *pJob = NULL; - SCH_ERR_JRET(schInitJob(pReq, &pJob)); + + SCH_ERR_RET(schJobStatusEnter(&pJob, JOB_TASK_STATUS_INIT, pReq)); + + SCH_ERR_RET(schJobStatusEnter(&pJob, JOB_TASK_STATUS_EXEC, pReq)); *pJobId = pJob->refId; - - SCH_ERR_JRET(schExecJobImpl(pReq, pJob, true)); _return: - - if (code && NULL == pJob) { - qDestroyQueryPlan(pReq->pDag); - } - - if (pJob) { - schSetJobQueryRes(pJob, pRes); - schReleaseJob(pJob->refId); - } - - return code; -} - -int32_t schedulerAsyncExecJob(SSchedulerReq *pReq, int64_t *pJobId) { - qDebug("scheduler async exec job start"); - - int32_t code = 0; - SSchJob *pJob = NULL; - SCH_ERR_JRET(schInitJob(pReq, &pJob)); - - *pJobId = pJob->refId; - - SCH_ERR_JRET(schExecJobImpl(pReq, pJob, false)); - -_return: - - if (code && NULL == pJob) { - qDestroyQueryPlan(pReq->pDag); - } if (pJob) { + schSetJobQueryRes(pJob, pReq->pQueryRes); schReleaseJob(pJob->refId); } @@ -133,14 +106,14 @@ int32_t schedulerFetchRows(int64_t job, void **pData) { SCH_ERR_RET(schBeginOperation(pJob, SCH_OP_FETCH, true)); pJob->userRes.fetchRes = pData; - code = schFetchRows(pJob); + code = schJobFetchRows(pJob); schReleaseJob(job); SCH_RET(code); } -void schedulerAsyncFetchRows(int64_t job, schedulerFetchFp fp, void* param) { +void schedulerFetchRowsA(int64_t job, schedulerFetchFp fp, void* param) { qDebug("scheduler async fetch rows start"); int32_t code = 0; @@ -159,7 +132,7 @@ void schedulerAsyncFetchRows(int64_t job, schedulerFetchFp fp, void* param) { pJob->userRes.fetchFp = fp; pJob->userRes.userParam = param; - SCH_ERR_JRET(schAsyncFetchRows(pJob)); + SCH_ERR_JRET(schJobFetchRowsA(pJob)); _return: @@ -178,7 +151,7 @@ int32_t schedulerGetTasksStatus(int64_t job, SArray *pSub) { SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); } - if (pJob->status < JOB_TASK_STATUS_NOT_START || pJob->levelNum <= 0 || NULL == pJob->levels) { + if (pJob->status < JOB_TASK_STATUS_INIT || pJob->levelNum <= 0 || NULL == pJob->levels) { qDebug("job not initialized or not executable job, refId:0x%" PRIx64, job); SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); } diff --git a/source/libs/scheduler/test/schedulerTests.cpp b/source/libs/scheduler/test/schedulerTests.cpp index 7fe6cc22bf..245d8d362c 100644 --- a/source/libs/scheduler/test/schedulerTests.cpp +++ b/source/libs/scheduler/test/schedulerTests.cpp @@ -507,6 +507,7 @@ void* schtRunJobThread(void *aa) { SRequestConnInfo conn = {0}; conn.pTrans = mockPointer; SSchedulerReq req = {0}; + req.syncReq = false; req.pConn = &conn; req.pNodeList = qnodeList; req.pDag = &dag; @@ -514,7 +515,7 @@ void* schtRunJobThread(void *aa) { req.execFp = schtQueryCb; req.execParam = &queryDone; - code = schedulerAsyncExecJob(&req, &queryJobRefId); + code = schedulerExecJob(&req, &queryJobRefId); assert(code == 0); pJob = schAcquireJob(queryJobRefId); @@ -658,7 +659,7 @@ TEST(queryTest, normalCase) { SRequestConnInfo conn = {0}; conn.pTrans = mockPointer; - SSchedulerReq req = {0}; + SSchedulerReq req = {0}; req.pConn = &conn; req.pNodeList = qnodeList; req.pDag = &dag; @@ -666,7 +667,7 @@ TEST(queryTest, normalCase) { req.execFp = schtQueryCb; req.execParam = &queryDone; - code = schedulerAsyncExecJob(&req, &job); + code = schedulerExecJob(&req, &job); ASSERT_EQ(code, 0); @@ -769,7 +770,7 @@ TEST(queryTest, readyFirstCase) { req.sql = "select * from tb"; req.execFp = schtQueryCb; req.execParam = &queryDone; - code = schedulerAsyncExecJob(&req, &job); + code = schedulerExecJob(&req, &job); ASSERT_EQ(code, 0); @@ -877,7 +878,7 @@ TEST(queryTest, flowCtrlCase) { req.execFp = schtQueryCb; req.execParam = &queryDone; - code = schedulerAsyncExecJob(&req, &job); + code = schedulerExecJob(&req, &job); ASSERT_EQ(code, 0);