no refact

This commit is contained in:
Hongze Cheng 2022-01-24 02:31:19 +00:00
parent 6e358fd42f
commit af406ab31a
2 changed files with 298 additions and 281 deletions

View File

@ -47,7 +47,7 @@ option(
option( option(
BUILD_WITH_UV BUILD_WITH_UV
"If build with libuv" "If build with libuv"
ON OFF
) )
option( option(

View File

@ -13,14 +13,16 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#include "catalog.h"
#include "query.h"
#include "schedulerInt.h" #include "schedulerInt.h"
#include "tmsg.h" #include "tmsg.h"
#include "query.h"
#include "catalog.h"
static SSchedulerMgmt schMgmt = {0}; static SSchedulerMgmt schMgmt = {0};
uint64_t schGenTaskId(void) { return atomic_add_fetch_64(&schMgmt.taskId, 1); } uint64_t schGenTaskId(void) {
return atomic_add_fetch_64(&schMgmt.taskId, 1);
}
uint64_t schGenUUID(void) { uint64_t schGenUUID(void) {
static uint64_t hashId = 0; static uint64_t hashId = 0;
@ -44,6 +46,7 @@ uint64_t schGenUUID(void) {
return id; return id;
} }
int32_t schInitTask(SSchJob* pJob, SSchTask *pTask, SSubplan* pPlan, SSchLevel *pLevel) { int32_t schInitTask(SSchJob* pJob, SSchTask *pTask, SSubplan* pPlan, SSchLevel *pLevel) {
pTask->plan = pPlan; pTask->plan = pPlan;
pTask->level = pLevel; pTask->level = pLevel;
@ -78,6 +81,7 @@ void schFreeTask(SSchTask *pTask) {
} }
} }
int32_t schValidateTaskReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgType) { int32_t schValidateTaskReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgType) {
int32_t lastMsgType = atomic_load_32(&pTask->lastMsgType); int32_t lastMsgType = atomic_load_32(&pTask->lastMsgType);
@ -93,10 +97,8 @@ int32_t schValidateTaskReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t m
SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR);
} }
if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING && if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING && SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_PARTIAL_SUCCEED) {
SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_PARTIAL_SUCCEED) { SCH_TASK_ELOG("rsp msg conflicted with task status, status:%d, rspType:%d", SCH_GET_TASK_STATUS(pTask), msgType);
SCH_TASK_ELOG("rsp msg conflicted with task status, status:%d, rspType:%d", SCH_GET_TASK_STATUS(pTask),
msgType);
SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR);
} }
@ -110,6 +112,7 @@ int32_t schValidateTaskReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t m
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schCheckAndUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { int32_t schCheckAndUpdateJobStatus(SSchJob *pJob, int8_t newStatus) {
int32_t code = 0; int32_t code = 0;
@ -136,16 +139,19 @@ int32_t schCheckAndUpdateJobStatus(SSchJob *pJob, int8_t newStatus) {
break; break;
case JOB_TASK_STATUS_EXECUTING: case JOB_TASK_STATUS_EXECUTING:
if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED && newStatus != JOB_TASK_STATUS_FAILED && if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED
newStatus != JOB_TASK_STATUS_CANCELLING && newStatus != JOB_TASK_STATUS_CANCELLED && && newStatus != JOB_TASK_STATUS_FAILED
newStatus != JOB_TASK_STATUS_DROPPING) { && newStatus != JOB_TASK_STATUS_CANCELLING
&& newStatus != JOB_TASK_STATUS_CANCELLED
&& newStatus != JOB_TASK_STATUS_DROPPING) {
SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR);
} }
break; break;
case JOB_TASK_STATUS_PARTIAL_SUCCEED: case JOB_TASK_STATUS_PARTIAL_SUCCEED:
if (newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_SUCCEED && if (newStatus != JOB_TASK_STATUS_FAILED
newStatus != JOB_TASK_STATUS_DROPPING) { && newStatus != JOB_TASK_STATUS_SUCCEED
&& newStatus != JOB_TASK_STATUS_DROPPING) {
SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR);
} }
@ -185,6 +191,7 @@ _return:
SCH_ERR_RET(code); SCH_ERR_RET(code);
} }
int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) { int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) {
for (int32_t i = 0; i < pJob->levelNum; ++i) { for (int32_t i = 0; i < pJob->levelNum; ++i) {
SSchLevel *pLevel = taosArrayGet(pJob->levels, i); SSchLevel *pLevel = taosArrayGet(pJob->levels, i);
@ -267,6 +274,7 @@ int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schRecordTaskSucceedNode(SSchTask *pTask) { int32_t schRecordTaskSucceedNode(SSchTask *pTask) {
SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, atomic_load_8(&pTask->candidateIdx)); SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, atomic_load_8(&pTask->candidateIdx));
@ -277,6 +285,7 @@ int32_t schRecordTaskSucceedNode(SSchTask *pTask) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr) { int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr) {
if (NULL == taosArrayPush(pTask->execAddrs, addr)) { if (NULL == taosArrayPush(pTask->execAddrs, addr)) {
SCH_TASK_ELOG("taosArrayPush addr to execAddr list failed, errno:%d", errno); SCH_TASK_ELOG("taosArrayPush addr to execAddr list failed, errno:%d", errno);
@ -286,6 +295,7 @@ int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *ad
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schValidateAndBuildJob(SQueryDag *pDag, SSchJob *pJob) { int32_t schValidateAndBuildJob(SQueryDag *pDag, SSchJob *pJob) {
int32_t code = 0; int32_t code = 0;
pJob->queryId = pDag->queryId; pJob->queryId = pDag->queryId;
@ -301,10 +311,7 @@ int32_t schValidateAndBuildJob(SQueryDag *pDag, SSchJob *pJob) {
SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT);
} }
SHashObj *planToTask = taosHashInit( SHashObj *planToTask = taosHashInit(SCHEDULE_DEFAULT_TASK_NUMBER, taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT), false, HASH_NO_LOCK);
SCHEDULE_DEFAULT_TASK_NUMBER,
taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT), false,
HASH_NO_LOCK);
if (NULL == planToTask) { if (NULL == planToTask) {
SCH_JOB_ELOG("taosHashInit %d failed", SCHEDULE_DEFAULT_TASK_NUMBER); SCH_JOB_ELOG("taosHashInit %d failed", SCHEDULE_DEFAULT_TASK_NUMBER);
SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY);
@ -521,6 +528,7 @@ int32_t schMoveTaskToFailList(SSchJob *pJob, SSchTask *pTask, bool *moved) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) { int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) {
if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) { if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) {
SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%d", SCH_GET_TASK_STATUS(pTask)); SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%d", SCH_GET_TASK_STATUS(pTask));
@ -546,6 +554,7 @@ int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schTaskCheckAndSetRetry(SSchJob *job, SSchTask *task, int32_t errCode, bool *needRetry) { int32_t schTaskCheckAndSetRetry(SSchJob *job, SSchTask *task, int32_t errCode, bool *needRetry) {
// TODO set retry or not based on task type/errCode/retry times/job status/available eps... // TODO set retry or not based on task type/errCode/retry times/job status/available eps...
// TODO if needRetry, set task retry info // TODO if needRetry, set task retry info
@ -574,6 +583,8 @@ int32_t schProcessOnJobFailureImpl(SSchJob *pJob, int32_t status, int32_t errCod
assert(0); assert(0);
} }
// Note: no more error processing, handled in function internal // Note: no more error processing, handled in function internal
int32_t schProcessOnJobFailure(SSchJob *pJob, int32_t errCode) { int32_t schProcessOnJobFailure(SSchJob *pJob, int32_t errCode) {
SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAILED, errCode)); SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAILED, errCode));
@ -584,6 +595,7 @@ int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode) {
SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROPPING, errCode)); SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROPPING, errCode));
} }
// Note: no more error processing, handled in function internal // Note: no more error processing, handled in function internal
int32_t schFetchFromRemote(SSchJob *pJob) { int32_t schFetchFromRemote(SSchJob *pJob) {
int32_t code = 0; int32_t code = 0;
@ -614,6 +626,7 @@ _return:
return code; return code;
} }
// Note: no more error processing, handled in function internal // Note: no more error processing, handled in function internal
int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) { int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) {
int32_t code = 0; int32_t code = 0;
@ -692,6 +705,7 @@ _return:
SCH_ERR_RET(errCode); SCH_ERR_RET(errCode);
} }
// Note: no more error processing, handled in function internal // Note: no more error processing, handled in function internal
int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) {
bool moved = false; bool moved = false;
@ -720,7 +734,7 @@ int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) {
SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); SCH_UNLOCK(SCH_WRITE, &pTask->level->lock);
if (taskDone < pTask->level->taskNum) { if (taskDone < pTask->level->taskNum) {
SCH_TASK_ELOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum); SCH_TASK_DLOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum);
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} else if (taskDone > pTask->level->taskNum) { } else if (taskDone > pTask->level->taskNum) {
@ -782,8 +796,7 @@ _return:
SCH_ERR_RET(code); SCH_ERR_RET(code);
} }
int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, char *msg, int32_t msgSize, int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, char *msg, int32_t msgSize, int32_t rspCode) {
int32_t rspCode) {
int32_t code = 0; int32_t code = 0;
SCH_ERR_JRET(schValidateTaskReceivedMsgType(pJob, pTask, msgType)); SCH_ERR_JRET(schValidateTaskReceivedMsgType(pJob, pTask, msgType));
@ -837,7 +850,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch
SResReadyRsp *rsp = (SResReadyRsp *)msg; SResReadyRsp *rsp = (SResReadyRsp *)msg;
if (rspCode != TSDB_CODE_SUCCESS || NULL == msg || rsp->code != TSDB_CODE_SUCCESS) { if (rspCode != TSDB_CODE_SUCCESS || NULL == msg || rsp->code != TSDB_CODE_SUCCESS) {
SCH_ERR_RET(schProcessOnTaskFailure(pJob, pTask, rsp->code)); SCH_ERR_RET(schProcessOnTaskFailure(pJob, pTask, rspCode));
} }
SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask));
@ -857,12 +870,14 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch
SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR);
} }
SCH_ERR_JRET(schProcessOnDataFetched(pJob)); atomic_store_ptr(&pJob->res, rsp);
break; atomic_store_32(&pJob->resNumOfRows, rsp->numOfRows);
if (rsp->completed) {
SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED);
} }
SCH_ERR_JRET(schProcessOnDataFetched(pJob)); SCH_ERR_JRET(schProcessOnDataFetched(pJob));
break; break;
} }
case TDMT_VND_DROP_TASK: { case TDMT_VND_DROP_TASK: {
@ -884,6 +899,7 @@ _return:
SCH_RET(code); SCH_RET(code);
} }
int32_t schHandleCallback(void* param, const SDataBuf* pMsg, int32_t msgType, int32_t rspCode) { int32_t schHandleCallback(void* param, const SDataBuf* pMsg, int32_t msgType, int32_t rspCode) {
int32_t code = 0; int32_t code = 0;
SSchCallbackParam *pParam = (SSchCallbackParam *)param; SSchCallbackParam *pParam = (SSchCallbackParam *)param;
@ -980,8 +996,8 @@ int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schAsyncSendMsg(void *transport, SEpSet *epSet, uint64_t qId, uint64_t tId, int32_t msgType, void *msg,
uint32_t msgSize) { int32_t schAsyncSendMsg(void *transport, SEpSet* epSet, uint64_t qId, uint64_t tId, int32_t msgType, void *msg, uint32_t msgSize) {
int32_t code = 0; int32_t code = 0;
SMsgSendInfo* pMsgSendInfo = calloc(1, sizeof(SMsgSendInfo)); SMsgSendInfo* pMsgSendInfo = calloc(1, sizeof(SMsgSendInfo));
if (NULL == pMsgSendInfo) { if (NULL == pMsgSendInfo) {
@ -1166,10 +1182,11 @@ static FORCE_INLINE bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus) {
*pStatus = status; *pStatus = status;
} }
return (status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_CANCELLED || return (status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_CANCELLED
status == JOB_TASK_STATUS_CANCELLING || status == JOB_TASK_STATUS_DROPPING); || status == JOB_TASK_STATUS_CANCELLING || status == JOB_TASK_STATUS_DROPPING);
} }
// Note: no more error processing, handled in function internal // Note: no more error processing, handled in function internal
int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) { int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) {
int8_t status = 0; int8_t status = 0;
@ -1289,22 +1306,19 @@ int32_t schExecJobImpl(void *transport, SArray *nodeList, SQueryDag *pDag, struc
SCH_ERR_JRET(schValidateAndBuildJob(pDag, pJob)); SCH_ERR_JRET(schValidateAndBuildJob(pDag, pJob));
pJob->execTasks = pJob->execTasks = taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
if (NULL == pJob->execTasks) { if (NULL == pJob->execTasks) {
SCH_JOB_ELOG("taosHashInit %d execTasks failed", pDag->numOfSubplans); SCH_JOB_ELOG("taosHashInit %d execTasks failed", pDag->numOfSubplans);
SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY);
} }
pJob->succTasks = pJob->succTasks = taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
if (NULL == pJob->succTasks) { if (NULL == pJob->succTasks) {
SCH_JOB_ELOG("taosHashInit %d succTasks failed", pDag->numOfSubplans); SCH_JOB_ELOG("taosHashInit %d succTasks failed", pDag->numOfSubplans);
SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY);
} }
pJob->failTasks = pJob->failTasks = taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
if (NULL == pJob->failTasks) { if (NULL == pJob->failTasks) {
SCH_JOB_ELOG("taosHashInit %d failTasks failed", pDag->numOfSubplans); SCH_JOB_ELOG("taosHashInit %d failTasks failed", pDag->numOfSubplans);
SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY);
@ -1351,8 +1365,10 @@ int32_t schCancelJob(SSchJob *pJob) {
//TODO //TODO
//TODO MOVE ALL TASKS FROM EXEC LIST TO FAIL LIST //TODO MOVE ALL TASKS FROM EXEC LIST TO FAIL LIST
} }
int32_t schedulerInit(SSchedulerCfg *cfg) { int32_t schedulerInit(SSchedulerCfg *cfg) {
if (schMgmt.jobs) { if (schMgmt.jobs) {
qError("scheduler already initialized"); qError("scheduler already initialized");
@ -1369,8 +1385,7 @@ int32_t schedulerInit(SSchedulerCfg *cfg) {
schMgmt.cfg.maxJobNum = SCHEDULE_DEFAULT_JOB_NUMBER; schMgmt.cfg.maxJobNum = SCHEDULE_DEFAULT_JOB_NUMBER;
} }
schMgmt.jobs = schMgmt.jobs = taosHashInit(schMgmt.cfg.maxJobNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
taosHashInit(schMgmt.cfg.maxJobNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK);
if (NULL == schMgmt.jobs) { if (NULL == schMgmt.jobs) {
qError("init schduler jobs failed, num:%u", schMgmt.cfg.maxJobNum); qError("init schduler jobs failed, num:%u", schMgmt.cfg.maxJobNum);
SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY);
@ -1535,6 +1550,7 @@ _return:
SCH_RET(code); SCH_RET(code);
} }
int32_t scheduleFetchRows(SSchJob *pJob, void** pData) { int32_t scheduleFetchRows(SSchJob *pJob, void** pData) {
if (NULL == pJob || NULL == pData) { if (NULL == pJob || NULL == pData) {
SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT);
@ -1713,3 +1729,4 @@ void schedulerDestroy(void) {
schMgmt.jobs = NULL; schMgmt.jobs = NULL;
} }
} }