diff --git a/docs-en/01-index.md b/docs-en/01-index.md index 9574323fe6..f5b7f3e0f6 100644 --- a/docs-en/01-index.md +++ b/docs-en/01-index.md @@ -4,24 +4,24 @@ sidebar_label: Documentation Home slug: / --- -TDengine is a [high-performance](https://tdengine.com/fast), [scalable](https://tdengine.com/scalable) time series database with [SQL support](https://tdengine.com/sql-support). This document is the TDengine user manual. It introduces the basic concepts, installation, features, SQL, APIs, operation, maintenance, kernel design, etc. It’s written mainly for architects, developers and system administrators. +TDengine is a [high-performance](https://tdengine.com/fast), [scalable](https://tdengine.com/scalable) time series database with [SQL support](https://tdengine.com/sql-support). This document is the TDengine user manual. It introduces the basic, as well as novel concepts, in TDengine, and also talks in detail about installation, features, SQL, APIs, operation, maintenance, kernel design and other topics. It’s written mainly for architects, developers and system administrators. -To get a global view about TDengine, like feature list, benchmarks, and competitive advantages, please browse through section [Introduction](./intro). +To get an overview of TDengine, such as a feature list, benchmarks, and competitive advantages, please browse through the [Introduction](./intro) section. -TDengine makes full use of the characteristics of time series data, proposes the concepts of "one table for one data collection point" and "super table", and designs an innovative storage engine, which greatly improves the efficiency of data ingestion, querying and storage. To understand the new concepts and use TDengine in the right way, please read [“Concepts”](./concept) thoroughly. +TDengine greatly improves the efficiency of data ingestion, querying and storage by exploiting the characteristics of time series data, introducing the novel concepts of "one table for one data collection point" and "super table", and designing an innovative storage engine. To understand the new concepts in TDengine and make full use of the features and capabilities of TDengine, please read [“Concepts”](./concept) thoroughly. -If you are a developer, please read the [“Developer Guide”](./develop) carefully. This section introduces the database connection, data modeling, data ingestion, query, continuous query, cache, data subscription, user-defined function, etc. in detail. Sample code is provided for a variety of programming languages. In most cases, you can just copy and paste the sample code, make a few changes to accommodate your application, and it will work. +If you are a developer, please read the [“Developer Guide”](./develop) carefully. This section introduces the database connection, data modeling, data ingestion, query, continuous query, cache, data subscription, user-defined functions, and other functionality in detail. Sample code is provided for a variety of programming languages. In most cases, you can just copy and paste the sample code, make a few changes to accommodate your application, and it will work. -We live in the era of big data, and scale-up is unable to meet the growing business needs. Any modern data system must have the ability to scale out, and clustering has become an indispensable feature of big data systems. The TDengine team has not only developed the cluster feature, they also decided to open source this important feature. To learn how to deploy, manage and maintain a TDengine cluster please refer to ["Cluster"](./cluster). +We live in the era of big data, and scale-up is unable to meet the growing needs of business. Any modern data system must have the ability to scale out, and clustering has become an indispensable feature of big data systems. Not only did the TDengine team develop the cluster feature, but also decided to open source this important feature. To learn how to deploy, manage and maintain a TDengine cluster please refer to ["cluster"](./cluster). -TDengine uses SQL as its query language, which greatly reduces learning costs and migration costs. In addition to the standard SQL, TDengine has extensions to support time series data scenarios better, such as roll up, interpolation, time weighted average, etc. The ["SQL Reference"](./taos-sql) chapter describes the SQL syntax in detail, and lists the various supported commands and functions. +TDengine uses ubiquitious SQL as its query language, which greatly reduces learning costs and migration costs. In addition to the standard SQL, TDengine has extensions to better support time series data analysis. These extensions include functions such as roll up, interpolation and time weighted average, among many others. The ["SQL Reference"](./taos-sql) chapter describes the SQL syntax in detail, and lists the various supported commands and functions. -If you are a system administrator who cares about installation, upgrade, fault tolerance, disaster recovery, data import, data export, system configuration, how to monitor whether TDengine is running healthily, and how to improve system performance, please refer to the ["Administration"](./operation) thoroughly. +If you are a system administrator who cares about installation, upgrade, fault tolerance, disaster recovery, data import, data export, system configuration, how to monitor whether TDengine is running healthily, and how to improve system performance, please refer to, and thoroughly read the ["Administration"](./operation) section. -If you want to know more about TDengine tools, REST API, and connectors for various programming languages, please see the ["Reference"](./reference) chapter. +If you want to know more about TDengine tools, the REST API, and connectors for various programming languages, please see the ["Reference"](./reference) chapter. If you are very interested in the internal design of TDengine, please read the chapter ["Inside TDengine”](./tdinternal), which introduces the cluster design, data partitioning, sharding, writing, and reading processes in detail. If you want to study TDengine code or even contribute code, please read this chapter carefully. -TDengine is an open source database, you are welcome to be a part of TDengine. If you find any errors in the documentation, or the description is not clear, please click "Edit this page" at the bottom of each page to edit it directly. +TDengine is an open source database, and we would love for you to be a part of TDengine. If you find any errors in the documentation, or see parts where more clarity or elaboration is needed, please click "Edit this page" at the bottom of each page to edit it directly. Together, we make a difference. diff --git a/include/common/taosdef.h b/include/common/taosdef.h index 1d32c9825f..d39c7a1215 100644 --- a/include/common/taosdef.h +++ b/include/common/taosdef.h @@ -96,6 +96,7 @@ extern char *qtypeStr[]; #define TSDB_PORT_HTTP 11 #undef TD_DEBUG_PRINT_ROW +#undef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS #ifdef __cplusplus } diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index 455898585a..e8e931daa5 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -158,6 +158,7 @@ enum { TD_DEF_MSG_TYPE(TDMT_MND_DROP_INDEX, "mnode-drop-index", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_GET_DB_CFG, "mnode-get-db-cfg", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_GET_INDEX, "mnode-get-index", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_MND_APPLY_MSG, "mnode-apply-msg", NULL, NULL) // Requests handled by VNODE TD_NEW_MSG_SEG(TDMT_VND_MSG) diff --git a/include/dnode/mnode/mnode.h b/include/dnode/mnode/mnode.h index 28c470a443..f2c8c916c8 100644 --- a/include/dnode/mnode/mnode.h +++ b/include/dnode/mnode/mnode.h @@ -89,6 +89,7 @@ int32_t mndGetLoad(SMnode *pMnode, SMnodeLoad *pLoad); * @return int32_t 0 for success, -1 for failure. */ int32_t mndProcessMsg(SRpcMsg *pMsg); +int32_t mndProcessSyncMsg(SRpcMsg *pMsg); /** * @brief Generate machine code diff --git a/include/dnode/mnode/sdb/sdb.h b/include/dnode/mnode/sdb/sdb.h index 2abe0e5c73..94d41a7416 100644 --- a/include/dnode/mnode/sdb/sdb.h +++ b/include/dnode/mnode/sdb/sdb.h @@ -304,13 +304,16 @@ int32_t sdbGetMaxId(SSdb *pSdb, ESdbType type); int64_t sdbGetTableVer(SSdb *pSdb, ESdbType type); /** - * @brief Update the version of sdb + * @brief Update the index of sdb * * @param pSdb The sdb object. - * @param val The update value of the version. - * @return int32_t The current version of sdb + * @param index The update value of the apply index. + * @return int32_t The current index of sdb */ -int64_t sdbUpdateVer(SSdb *pSdb, int32_t val); +void sdbSetApplyIndex(SSdb *pSdb, int64_t index); +int64_t sdbGetApplyIndex(SSdb *pSdb); +void sdbSetApplyTerm(SSdb *pSdb, int64_t term); +int64_t sdbGetApplyTerm(SSdb *pSdb); SSdbRaw *sdbAllocRaw(ESdbType type, int8_t sver, int32_t dataLen); void sdbFreeRaw(SSdbRaw *pRaw); @@ -339,6 +342,7 @@ typedef struct SSdb { char *tmpDir; int64_t lastCommitVer; int64_t curVer; + int64_t curTerm; int64_t tableVer[SDB_MAX]; int64_t maxId[SDB_MAX]; EKeyType keyTypes[SDB_MAX]; diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 9b6593e4b5..1eed353f33 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -78,6 +78,8 @@ typedef struct SFsmCbMeta { int32_t code; ESyncState state; uint64_t seqNum; + SyncTerm term; + SyncTerm currentTerm; } SFsmCbMeta; typedef struct SSyncFSM { @@ -85,6 +87,7 @@ typedef struct SSyncFSM { void (*FpCommitCb)(struct SSyncFSM* pFsm, const SRpcMsg* pMsg, SFsmCbMeta cbMeta); void (*FpPreCommitCb)(struct SSyncFSM* pFsm, const SRpcMsg* pMsg, SFsmCbMeta cbMeta); void (*FpRollBackCb)(struct SSyncFSM* pFsm, const SRpcMsg* pMsg, SFsmCbMeta cbMeta); + void (*FpRestoreFinish)(struct SSyncFSM* pFsm); int32_t (*FpGetSnapshot)(struct SSyncFSM* pFsm, SSnapshot* pSnapshot); int32_t (*FpRestoreSnapshot)(struct SSyncFSM* pFsm, const SSnapshot* snapshot); } SSyncFSM; @@ -117,7 +120,6 @@ typedef struct SSyncLogStore { } SSyncLogStore; - typedef struct SSyncInfo { SyncGroupId vgId; SSyncCfg syncCfg; diff --git a/include/libs/transport/trpc.h b/include/libs/transport/trpc.h index fcb00ddf01..754a203471 100644 --- a/include/libs/transport/trpc.h +++ b/include/libs/transport/trpc.h @@ -28,7 +28,7 @@ extern "C" { #define TAOS_CONN_CLIENT 1 #define IsReq(pMsg) (pMsg->msgType & 1U) -extern int tsRpcHeadSize; +extern int32_t tsRpcHeadSize; typedef struct { uint32_t clientIp; @@ -69,10 +69,10 @@ typedef struct SRpcInit { char localFqdn[TSDB_FQDN_LEN]; uint16_t localPort; // local port char * label; // for debug purpose - int numOfThreads; // number of threads to handle connections - int sessions; // number of sessions allowed + int32_t numOfThreads; // number of threads to handle connections + int32_t sessions; // number of sessions allowed int8_t connType; // TAOS_CONN_UDP, TAOS_CONN_TCPC, TAOS_CONN_TCPS - int idleTime; // milliseconds, 0 means idle timer is disabled + int32_t idleTime; // milliseconds, 0 means idle timer is disabled // the following is for client app ecurity only char *user; // user name @@ -108,9 +108,9 @@ int32_t rpcInit(); void rpcCleanup(); void * rpcOpen(const SRpcInit *pRpc); void rpcClose(void *); -void * rpcMallocCont(int contLen); +void * rpcMallocCont(int32_t contLen); void rpcFreeCont(void *pCont); -void * rpcReallocCont(void *ptr, int contLen); +void * rpcReallocCont(void *ptr, int32_t contLen); // Because taosd supports multi-process mode // These functions should not be used on the server side @@ -121,10 +121,10 @@ void rpcRegisterBrokenLinkArg(SRpcMsg *msg); void rpcReleaseHandle(void *handle, int8_t type); // just release client conn to rpc instance, no close sock // These functions will not be called in the child process -void rpcSendRedirectRsp(void *pConn, const SEpSet *pEpSet); -void rpcSendRequestWithCtx(void *thandle, const SEpSet *pEpSet, SRpcMsg *pMsg, int64_t *rid, SRpcCtx *ctx); -int rpcGetConnInfo(void *thandle, SRpcConnInfo *pInfo); -void rpcSendRecv(void *shandle, SEpSet *pEpSet, SRpcMsg *pReq, SRpcMsg *pRsp); +void rpcSendRedirectRsp(void *pConn, const SEpSet *pEpSet); +void rpcSendRequestWithCtx(void *thandle, const SEpSet *pEpSet, SRpcMsg *pMsg, int64_t *rid, SRpcCtx *ctx); +int32_t rpcGetConnInfo(void *thandle, SRpcConnInfo *pInfo); +void rpcSendRecv(void *shandle, SEpSet *pEpSet, SRpcMsg *pReq, SRpcMsg *pRsp); #ifdef __cplusplus } diff --git a/include/util/taoserror.h b/include/util/taoserror.h index e47944c724..e318978339 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -253,6 +253,7 @@ int32_t* taosGetErrno(); #define TSDB_CODE_MND_TRANS_INVALID_STAGE TAOS_DEF_ERROR_CODE(0, 0x03D2) #define TSDB_CODE_MND_TRANS_CONFLICT TAOS_DEF_ERROR_CODE(0, 0x03D3) #define TSDB_CODE_MND_TRANS_UNKNOW_ERROR TAOS_DEF_ERROR_CODE(0, 0x03D4) +#define TSDB_CODE_MND_TRANS_CLOG_IS_NULL TAOS_DEF_ERROR_CODE(0, 0x03D5) // mnode-mq #define TSDB_CODE_MND_TOPIC_ALREADY_EXIST TAOS_DEF_ERROR_CODE(0, 0x03E0) diff --git a/include/util/tencode.h b/include/util/tencode.h index af38d694e2..cbacd59fa7 100644 --- a/include/util/tencode.h +++ b/include/util/tencode.h @@ -82,7 +82,7 @@ typedef struct { do { \ SEncoder coder = {0}; \ tEncoderInit(&coder, NULL, 0); \ - if ((E)(&coder, S) == 0) { \ + if ((E)(&coder, S) >= 0) { \ SIZE = coder.pos; \ RET = 0; \ } else { \ diff --git a/source/dnode/mgmt/mgmt_mnode/inc/mmInt.h b/source/dnode/mgmt/mgmt_mnode/inc/mmInt.h index 75e83d6547..030d4b309e 100644 --- a/source/dnode/mgmt/mgmt_mnode/inc/mmInt.h +++ b/source/dnode/mgmt/mgmt_mnode/inc/mmInt.h @@ -24,19 +24,22 @@ extern "C" { #endif typedef struct SMnodeMgmt { - SDnodeData *pData; - SMnode *pMnode; - SMsgCb msgCb; - const char *path; - const char *name; - SSingleWorker queryWorker; - SSingleWorker readWorker; - SSingleWorker writeWorker; - SSingleWorker syncWorker; - SSingleWorker monitorWorker; - SReplica replicas[TSDB_MAX_REPLICA]; - int8_t replica; - int8_t selfIndex; + SDnodeData *pData; + SMnode *pMnode; + SMsgCb msgCb; + const char *path; + const char *name; + SSingleWorker queryWorker; + SSingleWorker readWorker; + SSingleWorker writeWorker; + SSingleWorker syncWorker; + SSingleWorker monitorWorker; + SReplica replicas[TSDB_MAX_REPLICA]; + int8_t replica; + int8_t selfIndex; + bool stopped; + int32_t refCount; + TdThreadRwlock lock; } SMnodeMgmt; // mmFile.c @@ -45,6 +48,8 @@ int32_t mmWriteFile(SMnodeMgmt *pMgmt, SDCreateMnodeReq *pMsg, bool deployed); // mmInt.c int32_t mmAlter(SMnodeMgmt *pMgmt, SDAlterMnodeReq *pMsg); +int32_t mmAcquire(SMnodeMgmt *pMgmt); +void mmRelease(SMnodeMgmt *pMgmt); // mmHandle.c SArray *mmGetMsgHandles(); diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c index 2ce42d7a5f..a894a4962d 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c @@ -237,6 +237,16 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_VNODE_RSP, mmPutNodeMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_COMPACT_VNODE_RSP, mmPutNodeMsgToWriteQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_TIMEOUT, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_PING, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_PING_REPLY, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_CLIENT_REQUEST, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_CLIENT_REQUEST_REPLY, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_REQUEST_VOTE, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_REQUEST_VOTE_REPLY, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_APPEND_ENTRIES, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_VND_SYNC_APPEND_ENTRIES_REPLY, mmPutNodeMsgToSyncQueue, 1) == NULL) goto _OVER; + code = 0; _OVER: diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmInt.c b/source/dnode/mgmt/mgmt_mnode/src/mmInt.c index 4f7fd4a1c0..43113d05af 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmInt.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmInt.c @@ -110,6 +110,7 @@ static void mmClose(SMnodeMgmt *pMgmt) { if (pMgmt->pMnode != NULL) { mmStopWorker(pMgmt); mndClose(pMgmt->pMnode); + taosThreadRwlockDestroy(&pMgmt->lock); pMgmt->pMnode = NULL; } @@ -122,6 +123,11 @@ static int32_t mmOpen(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { return -1; } + if (syncInit() != 0) { + dError("failed to init sync since %s", terrstr()); + return -1; + } + SMnodeMgmt *pMgmt = taosMemoryCalloc(1, sizeof(SMnodeMgmt)); if (pMgmt == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -137,6 +143,7 @@ static int32_t mmOpen(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { pMgmt->msgCb.queueFps[WRITE_QUEUE] = (PutToQueueFp)mmPutRpcMsgToWriteQueue; pMgmt->msgCb.queueFps[SYNC_QUEUE] = (PutToQueueFp)mmPutRpcMsgToSyncQueue; pMgmt->msgCb.mgmt = pMgmt; + taosThreadRwlockInit(&pMgmt->lock, NULL); bool deployed = false; if (mmReadFile(pMgmt, &deployed) != 0) { @@ -206,3 +213,22 @@ SMgmtFunc mmGetMgmtFunc() { return mgmtFunc; } + +int32_t mmAcquire(SMnodeMgmt *pMgmt) { + int32_t code = 0; + + taosThreadRwlockRdlock(&pMgmt->lock); + if (pMgmt->stopped) { + code = -1; + } else { + atomic_add_fetch_32(&pMgmt->refCount, 1); + } + taosThreadRwlockUnlock(&pMgmt->lock); + return code; +} + +void mmRelease(SMnodeMgmt *pMgmt) { + taosThreadRwlockRdlock(&pMgmt->lock); + atomic_sub_fetch_32(&pMgmt->refCount, 1); + taosThreadRwlockUnlock(&pMgmt->lock); +} \ No newline at end of file diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c b/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c index 43ee7cd80a..45dec1153f 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmWorker.c @@ -56,6 +56,12 @@ static void mmProcessQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) { taosFreeQitem(pMsg); } +static void mmProcessSyncQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) { + SMnodeMgmt *pMgmt = pInfo->ahandle; + pMsg->info.node = pMgmt->pMnode; + mndProcessSyncMsg(pMsg); +} + static int32_t mmPutNodeMsgToWorker(SSingleWorker *pWorker, SRpcMsg *pMsg) { dTrace("msg:%p, put into worker %s, type:%s", pMsg, pWorker->name, TMSG_INFO(pMsg->msgType)); taosWriteQitem(pWorker->queue, pMsg); @@ -105,7 +111,10 @@ int32_t mmPutRpcMsgToReadQueue(SMnodeMgmt *pMgmt, SRpcMsg *pMsg) { } int32_t mmPutRpcMsgToSyncQueue(SMnodeMgmt *pMgmt, SRpcMsg *pMsg) { - return mmPutRpcMsgToWorker(&pMgmt->syncWorker, pMsg); + if (mmAcquire(pMgmt) != 0) return -1; + int32_t code = mmPutRpcMsgToWorker(&pMgmt->syncWorker, pMsg); + mmRelease(pMgmt); + return code; } int32_t mmStartWorker(SMnodeMgmt *pMgmt) { @@ -149,7 +158,7 @@ int32_t mmStartWorker(SMnodeMgmt *pMgmt) { .min = 1, .max = 1, .name = "mnode-sync", - .fp = (FItem)mmProcessQueue, + .fp = (FItem)mmProcessSyncQueue, .param = pMgmt, }; if (tSingleWorkerInit(&pMgmt->syncWorker, &sCfg) != 0) { @@ -174,6 +183,11 @@ int32_t mmStartWorker(SMnodeMgmt *pMgmt) { } void mmStopWorker(SMnodeMgmt *pMgmt) { + taosThreadRwlockWrlock(&pMgmt->lock); + pMgmt->stopped = 1; + taosThreadRwlockUnlock(&pMgmt->lock); + while (pMgmt->refCount > 0) taosMsleep(10); + tSingleWorkerCleanup(&pMgmt->monitorWorker); tSingleWorkerCleanup(&pMgmt->queryWorker); tSingleWorkerCleanup(&pMgmt->readWorker); diff --git a/source/dnode/mgmt/test/mnode/CMakeLists.txt b/source/dnode/mgmt/test/mnode/CMakeLists.txt index e83f5dbbec..788cf53976 100644 --- a/source/dnode/mgmt/test/mnode/CMakeLists.txt +++ b/source/dnode/mgmt/test/mnode/CMakeLists.txt @@ -4,7 +4,7 @@ target_link_libraries( dmnodeTest sut ) -add_test( - NAME dmnodeTest - COMMAND dmnodeTest -) +#add_test( +# NAME dmnodeTest +# COMMAND dmnodeTest +#) diff --git a/source/dnode/mnode/impl/inc/mndInt.h b/source/dnode/mnode/impl/inc/mndInt.h index 5258fa9e02..5a1653b937 100644 --- a/source/dnode/mnode/impl/inc/mndInt.h +++ b/source/dnode/mnode/impl/inc/mndInt.h @@ -19,6 +19,7 @@ #include "mndDef.h" #include "sdb.h" +#include "syncTools.h" #include "tcache.h" #include "tdatablock.h" #include "tglobal.h" @@ -31,12 +32,14 @@ extern "C" { #endif +// clang-format off #define mFatal(...) { if (mDebugFlag & DEBUG_FATAL) { taosPrintLog("MND FATAL ", DEBUG_FATAL, 255, __VA_ARGS__); }} #define mError(...) { if (mDebugFlag & DEBUG_ERROR) { taosPrintLog("MND ERROR ", DEBUG_ERROR, 255, __VA_ARGS__); }} #define mWarn(...) { if (mDebugFlag & DEBUG_WARN) { taosPrintLog("MND WARN ", DEBUG_WARN, 255, __VA_ARGS__); }} #define mInfo(...) { if (mDebugFlag & DEBUG_INFO) { taosPrintLog("MND ", DEBUG_INFO, 255, __VA_ARGS__); }} #define mDebug(...) { if (mDebugFlag & DEBUG_DEBUG) { taosPrintLog("MND ", DEBUG_DEBUG, mDebugFlag, __VA_ARGS__); }} #define mTrace(...) { if (mDebugFlag & DEBUG_TRACE) { taosPrintLog("MND ", DEBUG_TRACE, mDebugFlag, __VA_ARGS__); }} +// clang-format on #define SYSTABLE_SCH_TABLE_NAME_LEN ((TSDB_TABLE_NAME_LEN - 1) + VARSTR_HEADER_SIZE) #define SYSTABLE_SCH_DB_NAME_LEN ((TSDB_DB_NAME_LEN - 1) + VARSTR_HEADER_SIZE) @@ -72,10 +75,11 @@ typedef struct { } STelemMgmt; typedef struct { - int32_t errCode; - sem_t syncSem; SWal *pWal; - SSyncNode *pSyncNode; + int32_t errCode; + bool restored; + sem_t syncSem; + int64_t sync; ESyncState state; } SSyncMgmt; diff --git a/source/dnode/mnode/impl/inc/mndSync.h b/source/dnode/mnode/impl/inc/mndSync.h index fe557cdeac..356f215267 100644 --- a/source/dnode/mnode/impl/inc/mndSync.h +++ b/source/dnode/mnode/impl/inc/mndSync.h @@ -26,6 +26,8 @@ int32_t mndInitSync(SMnode *pMnode); void mndCleanupSync(SMnode *pMnode); bool mndIsMaster(SMnode *pMnode); int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw); +void mndSyncStart(SMnode *pMnode); +void mndSyncStop(SMnode *pMnode); #ifdef __cplusplus } diff --git a/source/dnode/mnode/impl/inc/mndTopic.h b/source/dnode/mnode/impl/inc/mndTopic.h index d7e6f9c87b..c5c4800e02 100644 --- a/source/dnode/mnode/impl/inc/mndTopic.h +++ b/source/dnode/mnode/impl/inc/mndTopic.h @@ -35,7 +35,7 @@ int32_t mndDropTopicByDB(SMnode *pMnode, STrans *pTrans, SDbObj *pDb); const char *mndTopicGetShowName(const char topic[TSDB_TOPIC_FNAME_LEN]); -int32_t mndSetTopicRedoLogs(SMnode *pMnode, STrans *pTrans, SMqTopicObj *pTopic); +int32_t mndSetTopicCommitLogs(SMnode *pMnode, STrans *pTrans, SMqTopicObj *pTopic); #ifdef __cplusplus } diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 8b2799833b..1bb003bab9 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -419,7 +419,7 @@ static int32_t mndProcessSubscribeReq(SRpcMsg *pMsg) { SMqTopicObj topicObj = {0}; memcpy(&topicObj, pTopic, sizeof(SMqTopicObj)); topicObj.refConsumerCnt = pTopic->refConsumerCnt + 1; - if (mndSetTopicRedoLogs(pMnode, pTrans, &topicObj) != 0) goto SUBSCRIBE_OVER; + if (mndSetTopicCommitLogs(pMnode, pTrans, &topicObj) != 0) goto SUBSCRIBE_OVER; mndReleaseTopic(pMnode, pTopic); } diff --git a/source/dnode/mnode/impl/src/mndDnode.c b/source/dnode/mnode/impl/src/mndDnode.c index 01ff08cef9..0cac7fd86b 100644 --- a/source/dnode/mnode/impl/src/mndDnode.c +++ b/source/dnode/mnode/impl/src/mndDnode.c @@ -448,13 +448,13 @@ static int32_t mndCreateDnode(SMnode *pMnode, SRpcMsg *pReq, SCreateDnodeReq *pC } mDebug("trans:%d, used to create dnode:%s", pTrans->id, dnodeObj.ep); - SSdbRaw *pRedoRaw = mndDnodeActionEncode(&dnodeObj); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndDnodeActionEncode(&dnodeObj); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -524,13 +524,13 @@ static int32_t mndDropDnode(SMnode *pMnode, SRpcMsg *pReq, SDnodeObj *pDnode) { } mDebug("trans:%d, used to drop dnode:%d", pTrans->id, pDnode->id); - SSdbRaw *pRedoRaw = mndDnodeActionEncode(pDnode); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndDnodeActionEncode(pDnode); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_DROPPED); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_DROPPED); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); diff --git a/source/dnode/mnode/impl/src/mndOffset.c b/source/dnode/mnode/impl/src/mndOffset.c index 6f42d66625..dca07f6a6d 100644 --- a/source/dnode/mnode/impl/src/mndOffset.c +++ b/source/dnode/mnode/impl/src/mndOffset.c @@ -153,6 +153,7 @@ int32_t mndCreateOffsets(STrans *pTrans, const char *cgroup, const char *topicNa return -1; } sdbSetRawStatus(pOffsetRaw, SDB_STATUS_READY); + // commit log or redo log? if (mndTransAppendRedolog(pTrans, pOffsetRaw) < 0) { return -1; } @@ -188,7 +189,7 @@ static int32_t mndProcessCommitOffsetReq(SRpcMsg *pMsg) { pOffsetObj->offset = pOffset->offset; SSdbRaw *pOffsetRaw = mndOffsetActionEncode(pOffsetObj); sdbSetRawStatus(pOffsetRaw, SDB_STATUS_READY); - mndTransAppendRedolog(pTrans, pOffsetRaw); + mndTransAppendCommitlog(pTrans, pOffsetRaw); if (create) { taosMemoryFree(pOffsetObj); } else { diff --git a/source/dnode/mnode/impl/src/mndStb.c b/source/dnode/mnode/impl/src/mndStb.c index f6043615ab..61f115e2ba 100644 --- a/source/dnode/mnode/impl/src/mndStb.c +++ b/source/dnode/mnode/impl/src/mndStb.c @@ -743,9 +743,7 @@ static int32_t mndCreateStb(SMnode *pMnode, SRpcMsg *pReq, SMCreateStbReq *pCrea mDebug("trans:%d, used to create stb:%s", pTrans->id, pCreate->name); - if (mndBuildStbFromReq(pMnode, &stbObj, pCreate, pDb) != 0) { - goto _OVER; - } + if (mndBuildStbFromReq(pMnode, &stbObj, pCreate, pDb) != 0) goto _OVER; if (mndAddStbToTrans(pMnode, pTrans, pDb, &stbObj) < 0) goto _OVER; diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 7b6383a470..9de6138689 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -279,13 +279,13 @@ int32_t mndAddStreamToTrans(SMnode *pMnode, SStreamObj *pStream, const char *ast } mDebug("trans:%d, used to create stream:%s", pTrans->id, pStream->name); - SSdbRaw *pRedoRaw = mndStreamActionEncode(pStream); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndStreamActionEncode(pStream); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY); return 0; } diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index c82472eec0..3713bd501a 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -479,7 +479,7 @@ static int32_t mndPersistRebResult(SMnode *pMnode, SRpcMsg *pMsg, const SMqRebOu SMqTopicObj topicObj = {0}; memcpy(&topicObj, pTopic, sizeof(SMqTopicObj)); topicObj.refConsumerCnt = pTopic->refConsumerCnt - consumerNum; - if (mndSetTopicRedoLogs(pMnode, pTrans, &topicObj) != 0) goto REB_FAIL; + if (mndSetTopicCommitLogs(pMnode, pTrans, &topicObj) != 0) goto REB_FAIL; } } diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 3dbe3241a7..e3f7b40526 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -17,10 +17,60 @@ #include "mndSync.h" #include "mndTrans.h" -static int32_t mndInitWal(SMnode *pMnode) { +int32_t mndSyncEqMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) { + int32_t code = tmsgPutToQueue(msgcb, SYNC_QUEUE, pMsg); + if (code != 0) { + rpcFreeCont(pMsg->pCont); + } + return code; +} + +int32_t mndSyncSendMsg(const SEpSet *pEpSet, SRpcMsg *pMsg) { return tmsgSendReq(pEpSet, pMsg); } + +void mndSyncCommitMsg(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbMeta) { + SMnode *pMnode = pFsm->data; + SSdb *pSdb = pMnode->pSdb; + SSyncMgmt *pMgmt = &pMnode->syncMgmt; + SSdbRaw *pRaw = pMsg->pCont; + + mTrace("ver:%" PRId64 ", apply raw:%p to sdb, role:%s", cbMeta.index, pRaw, syncStr(cbMeta.state)); + sdbWriteWithoutFree(pSdb, pRaw); + sdbSetApplyIndex(pSdb, cbMeta.index); + sdbSetApplyTerm(pSdb, cbMeta.term); + if (cbMeta.state == TAOS_SYNC_STATE_LEADER) { + tsem_post(&pMgmt->syncSem); + } +} + +int32_t mndSyncGetSnapshot(struct SSyncFSM *pFsm, SSnapshot *pSnapshot) { + SMnode *pMnode = pFsm->data; + pSnapshot->lastApplyIndex = sdbGetApplyIndex(pMnode->pSdb); + pSnapshot->lastApplyTerm = sdbGetApplyTerm(pMnode->pSdb); + return 0; +} + +void mndRestoreFinish(struct SSyncFSM *pFsm) { + SMnode *pMnode = pFsm->data; + mndTransPullup(pMnode); + pMnode->syncMgmt.restored = true; +} + +SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) { + SSyncFSM *pFsm = taosMemoryCalloc(1, sizeof(SSyncFSM)); + pFsm->data = pMnode; + pFsm->FpCommitCb = mndSyncCommitMsg; + pFsm->FpPreCommitCb = NULL; + pFsm->FpRollBackCb = NULL; + pFsm->FpGetSnapshot = mndSyncGetSnapshot; + pFsm->FpRestoreFinish = mndRestoreFinish; + pFsm->FpRestoreSnapshot = NULL; + return pFsm; +} + +int32_t mndInitSync(SMnode *pMnode) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; - char path[PATH_MAX] = {0}; + char path[PATH_MAX + 20] = {0}; snprintf(path, sizeof(path), "%s%swal", pMnode->path, TD_DIRSEP); SWalCfg cfg = { .vgId = 1, @@ -31,164 +81,88 @@ static int32_t mndInitWal(SMnode *pMnode) { .retentionSize = -1, .level = TAOS_WAL_FSYNC, }; + pMgmt->pWal = walOpen(path, &cfg); - if (pMgmt->pWal == NULL) return -1; - - return 0; -} - -static void mndCloseWal(SMnode *pMnode) { - SSyncMgmt *pMgmt = &pMnode->syncMgmt; - if (pMgmt->pWal != NULL) { - walClose(pMgmt->pWal); - pMgmt->pWal = NULL; - } -} - -static int32_t mndRestoreWal(SMnode *pMnode) { - SWal *pWal = pMnode->syncMgmt.pWal; - SSdb *pSdb = pMnode->pSdb; - int64_t lastSdbVer = sdbUpdateVer(pSdb, 0); - int32_t code = -1; - - SWalReadHandle *pHandle = walOpenReadHandle(pWal); - if (pHandle == NULL) return -1; - - int64_t first = walGetFirstVer(pWal); - int64_t last = walGetLastVer(pWal); - mDebug("start to restore wal, sdbver:%" PRId64 ", first:%" PRId64 " last:%" PRId64, lastSdbVer, first, last); - - first = TMAX(lastSdbVer + 1, first); - for (int64_t ver = first; ver >= 0 && ver <= last; ++ver) { - if (walReadWithHandle(pHandle, ver) < 0) { - mError("ver:%" PRId64 ", failed to read from wal since %s", ver, terrstr()); - goto _OVER; - } - - SWalHead *pHead = pHandle->pHead; - int64_t sdbVer = sdbUpdateVer(pSdb, 0); - if (sdbVer + 1 != ver) { - terrno = TSDB_CODE_SDB_INVALID_WAl_VER; - mError("ver:%" PRId64 ", failed to write to sdb, since inconsistent with sdbver:%" PRId64, ver, sdbVer); - goto _OVER; - } - - mTrace("ver:%" PRId64 ", will be restored, content:%p", ver, pHead->head.body); - if (sdbWriteWithoutFree(pSdb, (void *)pHead->head.body) < 0) { - mError("ver:%" PRId64 ", failed to write to sdb since %s", ver, terrstr()); - goto _OVER; - } - - sdbUpdateVer(pSdb, 1); - mDebug("ver:%" PRId64 ", is restored", ver); - } - - int64_t sdbVer = sdbUpdateVer(pSdb, 0); - mDebug("restore wal finished, sdbver:%" PRId64, sdbVer); - - mndTransPullup(pMnode); - sdbVer = sdbUpdateVer(pSdb, 0); - mDebug("pullup trans finished, sdbver:%" PRId64, sdbVer); - - if (sdbVer != lastSdbVer) { - mInfo("sdb restored from %" PRId64 " to %" PRId64 ", write file", lastSdbVer, sdbVer); - if (sdbWriteFile(pSdb) != 0) { - goto _OVER; - } - - if (walCommit(pWal, sdbVer) != 0) { - goto _OVER; - } - - if (walBeginSnapshot(pWal, sdbVer) < 0) { - goto _OVER; - } - - if (walEndSnapshot(pWal) < 0) { - goto _OVER; - } - } - - code = 0; - -_OVER: - walCloseReadHandle(pHandle); - return code; -} - -int32_t mndInitSync(SMnode *pMnode) { - SSyncMgmt *pMgmt = &pMnode->syncMgmt; - tsem_init(&pMgmt->syncSem, 0, 0); - - if (mndInitWal(pMnode) < 0) { + if (pMgmt->pWal == NULL) { mError("failed to open wal since %s", terrstr()); return -1; } - if (mndRestoreWal(pMnode) < 0) { - mError("failed to restore wal since %s", terrstr()); + SSyncInfo syncInfo = {.vgId = 1, .FpSendMsg = mndSyncSendMsg, .FpEqMsg = mndSyncEqMsg}; + snprintf(syncInfo.path, sizeof(syncInfo.path), "%s%ssync", pMnode->path, TD_DIRSEP); + syncInfo.pWal = pMgmt->pWal; + syncInfo.pFsm = mndSyncMakeFsm(pMnode); + + SSyncCfg *pCfg = &syncInfo.syncCfg; + pCfg->replicaNum = pMnode->replica; + pCfg->myIndex = pMnode->selfIndex; + for (int32_t i = 0; i < pMnode->replica; ++i) { + SNodeInfo *pNode = &pCfg->nodeInfo[i]; + tstrncpy(pNode->nodeFqdn, pMnode->replicas[i].fqdn, sizeof(pNode->nodeFqdn)); + pNode->nodePort = pMnode->replicas[i].port; + } + + tsem_init(&pMgmt->syncSem, 0, 0); + pMgmt->sync = syncOpen(&syncInfo); + if (pMgmt->sync <= 0) { + mError("failed to open sync since %s", terrstr()); return -1; } - if (pMnode->selfId == 1) { - pMgmt->state = TAOS_SYNC_STATE_LEADER; - } - pMgmt->pSyncNode = NULL; + mDebug("mnode sync is opened, id:%" PRId64, pMgmt->sync); return 0; } void mndCleanupSync(SMnode *pMnode) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; + syncStop(pMgmt->sync); + mDebug("sync:%" PRId64 " is stopped", pMgmt->sync); + tsem_destroy(&pMgmt->syncSem); - mndCloseWal(pMnode); -} + if (pMgmt->pWal != NULL) { + walClose(pMgmt->pWal); + } -static int32_t mndSyncApplyCb(struct SSyncFSM *fsm, SyncIndex index, const SSyncBuffer *buf, void *pData) { - SMnode *pMnode = pData; - SSyncMgmt *pMgmt = &pMnode->syncMgmt; - - pMgmt->errCode = 0; - tsem_post(&pMgmt->syncSem); - - return 0; + memset(pMgmt, 0, sizeof(SSyncMgmt)); } int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw) { - SWal *pWal = pMnode->syncMgmt.pWal; - SSdb *pSdb = pMnode->pSdb; - - int64_t ver = sdbUpdateVer(pSdb, 1); - if (walWrite(pWal, ver, 1, pRaw, sdbGetRawTotalSize(pRaw)) < 0) { - sdbUpdateVer(pSdb, -1); - mError("ver:%" PRId64 ", failed to write raw:%p to wal since %s", ver, pRaw, terrstr()); - return -1; - } - - mTrace("ver:%" PRId64 ", write to wal, raw:%p", ver, pRaw); - walCommit(pWal, ver); - walFsync(pWal, true); - -#if 1 - return 0; -#else - if (pMnode->replica == 1) return 0; - SSyncMgmt *pMgmt = &pMnode->syncMgmt; pMgmt->errCode = 0; - SSyncBuffer buf = {.data = pRaw, .len = sdbGetRawTotalSize(pRaw)}; + SRpcMsg rsp = {.code = TDMT_MND_APPLY_MSG, .contLen = sdbGetRawTotalSize(pRaw)}; + rsp.pCont = rpcMallocCont(rsp.contLen); + if (rsp.pCont == NULL) return -1; + memcpy(rsp.pCont, pRaw, rsp.contLen); - bool isWeak = false; - int32_t code = syncPropose(pMgmt->pSyncNode, &buf, pMnode, isWeak); + const bool isWeak = false; + int32_t code = syncPropose(pMgmt->sync, &rsp, isWeak); + if (code == 0) { + tsem_wait(&pMgmt->syncSem); + } else if (code == TAOS_SYNC_PROPOSE_NOT_LEADER) { + terrno = TSDB_CODE_APP_NOT_READY; + } else if (code == TAOS_SYNC_PROPOSE_OTHER_ERROR) { + terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + } else { + terrno = TSDB_CODE_APP_ERROR; + } if (code != 0) return code; - - tsem_wait(&pMgmt->syncSem); return pMgmt->errCode; -#endif } +void mndSyncStart(SMnode *pMnode) { + SSyncMgmt *pMgmt = &pMnode->syncMgmt; + syncSetMsgCb(pMgmt->sync, &pMnode->msgCb); + syncStart(pMgmt->sync); + mDebug("sync:%" PRId64 " is started", pMgmt->sync); +} + +void mndSyncStop(SMnode *pMnode) {} + bool mndIsMaster(SMnode *pMnode) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; - return pMgmt->state == TAOS_SYNC_STATE_LEADER; + pMgmt->state = syncGetMyRole(pMgmt->sync); + + return (pMgmt->state == TAOS_SYNC_STATE_LEADER) && (pMnode->syncMgmt.restored); } diff --git a/source/dnode/mnode/impl/src/mndTopic.c b/source/dnode/mnode/impl/src/mndTopic.c index b9b01a4391..ec3d30ff07 100644 --- a/source/dnode/mnode/impl/src/mndTopic.c +++ b/source/dnode/mnode/impl/src/mndTopic.c @@ -386,14 +386,14 @@ static int32_t mndCreateTopic(SMnode *pMnode, SRpcMsg *pReq, SCMCreateTopicReq * } mDebug("trans:%d, used to create topic:%s", pTrans->id, pCreate->name); - SSdbRaw *pRedoRaw = mndTopicActionEncode(&topicObj); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndTopicActionEncode(&topicObj); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); taosMemoryFreeClear(topicObj.physicalPlan); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -473,13 +473,13 @@ CREATE_TOPIC_OVER: } static int32_t mndDropTopic(SMnode *pMnode, STrans *pTrans, SRpcMsg *pReq, SMqTopicObj *pTopic) { - SSdbRaw *pRedoRaw = mndTopicActionEncode(pTopic); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndTopicActionEncode(pTopic); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_DROPPED); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_DROPPED); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -627,11 +627,11 @@ static int32_t mndRetrieveTopic(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBl return numOfRows; } -int32_t mndSetTopicRedoLogs(SMnode *pMnode, STrans *pTrans, SMqTopicObj *pTopic) { - SSdbRaw *pRedoRaw = mndTopicActionEncode(pTopic); - if (pRedoRaw == NULL) return -1; - if (mndTransAppendCommitlog(pTrans, pRedoRaw) != 0) return -1; - if (sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY) != 0) return -1; +int32_t mndSetTopicCommitLogs(SMnode *pMnode, STrans *pTrans, SMqTopicObj *pTopic) { + SSdbRaw *pCommitRaw = mndTopicActionEncode(pTopic); + if (pCommitRaw == NULL) return -1; + if (mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) return -1; + if (sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY) != 0) return -1; return 0; } diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 5d205197d1..d685c88500 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -682,13 +682,6 @@ static int32_t mndTransSync(SMnode *pMnode, STrans *pTrans) { } mDebug("trans:%d, sync finished", pTrans->id); - - code = sdbWrite(pMnode->pSdb, pRaw); - if (code != 0) { - mError("trans:%d, failed to write sdb since %s", pTrans->id, terrstr()); - return -1; - } - return 0; } @@ -768,6 +761,12 @@ int32_t mndTransPrepare(SMnode *pMnode, STrans *pTrans) { return -1; } + if (taosArrayGetSize(pTrans->commitLogs) <= 0) { + terrno = TSDB_CODE_MND_TRANS_CLOG_IS_NULL; + mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); + return -1; + } + mDebug("trans:%d, prepare transaction", pTrans->id); if (mndTransSync(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -1080,6 +1079,8 @@ static bool mndTransPerformRedoLogStage(SMnode *pMnode, STrans *pTrans) { } static bool mndTransPerformRedoActionStage(SMnode *pMnode, STrans *pTrans) { + if (!mndIsMaster(pMnode)) return false; + bool continueExec = true; int32_t code = mndTransExecuteRedoActions(pMnode, pTrans); @@ -1169,6 +1170,8 @@ static bool mndTransPerformUndoLogStage(SMnode *pMnode, STrans *pTrans) { } static bool mndTransPerformUndoActionStage(SMnode *pMnode, STrans *pTrans) { + if (!mndIsMaster(pMnode)) return false; + bool continueExec = true; int32_t code = mndTransExecuteUndoActions(pMnode, pTrans); @@ -1350,19 +1353,35 @@ _OVER: return code; } -void mndTransPullup(SMnode *pMnode) { - STrans *pTrans = NULL; - void *pIter = NULL; +static int32_t mndCompareTransId(int32_t *pTransId1, int32_t *pTransId2) { return *pTransId1 >= *pTransId2 ? 1 : 0; } +void mndTransPullup(SMnode *pMnode) { + SSdb *pSdb = pMnode->pSdb; + SArray *pArray = taosArrayInit(sdbGetSize(pSdb, SDB_TRANS), sizeof(int32_t)); + if (pArray == NULL) return; + + void *pIter = NULL; while (1) { + STrans *pTrans = NULL; pIter = sdbFetch(pMnode->pSdb, SDB_TRANS, pIter, (void **)&pTrans); if (pIter == NULL) break; + taosArrayPush(pArray, &pTrans->id); + sdbRelease(pSdb, pTrans); + } - mndTransExecute(pMnode, pTrans); - sdbRelease(pMnode->pSdb, pTrans); + taosArraySort(pArray, (__compar_fn_t)mndCompareTransId); + + for (int32_t i = 0; i < taosArrayGetSize(pArray); ++i) { + int32_t *pTransId = taosArrayGet(pArray, i); + STrans *pTrans = mndAcquireTrans(pMnode, *pTransId); + if (pTrans != NULL) { + mndTransExecute(pMnode, pTrans); + } + mndReleaseTrans(pMnode, pTrans); } sdbWriteFile(pMnode->pSdb); + taosArrayDestroy(pArray); } static int32_t mndRetrieveTrans(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBlock, int32_t rows) { diff --git a/source/dnode/mnode/impl/src/mndUser.c b/source/dnode/mnode/impl/src/mndUser.c index 88e646e765..5f2147a5fe 100644 --- a/source/dnode/mnode/impl/src/mndUser.c +++ b/source/dnode/mnode/impl/src/mndUser.c @@ -272,13 +272,13 @@ static int32_t mndCreateUser(SMnode *pMnode, char *acct, SCreateUserReq *pCreate } mDebug("trans:%d, used to create user:%s", pTrans->id, pCreate->user); - SSdbRaw *pRedoRaw = mndUserActionEncode(&userObj); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndUserActionEncode(&userObj); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to commit redo log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -352,13 +352,13 @@ static int32_t mndAlterUser(SMnode *pMnode, SUserObj *pOld, SUserObj *pNew, SRpc } mDebug("trans:%d, used to alter user:%s", pTrans->id, pOld->user); - SSdbRaw *pRedoRaw = mndUserActionEncode(pNew); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndUserActionEncode(pNew); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_READY); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_READY); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); @@ -559,13 +559,13 @@ static int32_t mndDropUser(SMnode *pMnode, SRpcMsg *pReq, SUserObj *pUser) { } mDebug("trans:%d, used to drop user:%s", pTrans->id, pUser->user); - SSdbRaw *pRedoRaw = mndUserActionEncode(pUser); - if (pRedoRaw == NULL || mndTransAppendRedolog(pTrans, pRedoRaw) != 0) { - mError("trans:%d, failed to append redo log since %s", pTrans->id, terrstr()); + SSdbRaw *pCommitRaw = mndUserActionEncode(pUser); + if (pCommitRaw == NULL || mndTransAppendCommitlog(pTrans, pCommitRaw) != 0) { + mError("trans:%d, failed to append commit log since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); return -1; } - sdbSetRawStatus(pRedoRaw, SDB_STATUS_DROPPED); + sdbSetRawStatus(pCommitRaw, SDB_STATUS_DROPPED); if (mndTransPrepare(pMnode, pTrans) != 0) { mError("trans:%d, failed to prepare since %s", pTrans->id, terrstr()); diff --git a/source/dnode/mnode/impl/src/mnode.c b/source/dnode/mnode/impl/src/mnode.c index 285ae030e1..5b8c1a3101 100644 --- a/source/dnode/mnode/impl/src/mnode.c +++ b/source/dnode/mnode/impl/src/mnode.c @@ -86,7 +86,6 @@ static void *mndThreadFp(void *param) { lastTime++; taosMsleep(100); if (pMnode->stopped) break; - if (!mndIsMaster(pMnode)) continue; if (lastTime % (tsTransPullupInterval * 10) == 0) { mndPullupTrans(pMnode); @@ -336,9 +335,107 @@ int32_t mndAlter(SMnode *pMnode, const SMnodeOpt *pOption) { return 0; } -int32_t mndStart(SMnode *pMnode) { return mndInitTimer(pMnode); } +int32_t mndStart(SMnode *pMnode) { + mndSyncStart(pMnode); + return mndInitTimer(pMnode); +} -void mndStop(SMnode *pMnode) { return mndCleanupTimer(pMnode); } +void mndStop(SMnode *pMnode) { + mndSyncStop(pMnode); + return mndCleanupTimer(pMnode); +} + +int32_t mndProcessSyncMsg(SRpcMsg *pMsg) { + SMnode *pMnode = pMsg->info.node; + void *ahandle = pMsg->info.ahandle; + int32_t ret = TAOS_SYNC_PROPOSE_OTHER_ERROR; + + if (syncEnvIsStart()) { + SSyncNode *pSyncNode = syncNodeAcquire(pMnode->syncMgmt.sync); + assert(pSyncNode != NULL); + + ESyncState state = syncGetMyRole(pMnode->syncMgmt.sync); + SyncTerm currentTerm = syncGetMyTerm(pMnode->syncMgmt.sync); + + SMsgHead *pHead = pMsg->pCont; + + char logBuf[512]; + char *syncNodeStr = sync2SimpleStr(pMnode->syncMgmt.sync); + snprintf(logBuf, sizeof(logBuf), "==vnodeProcessSyncReq== msgType:%d, syncNode: %s", pMsg->msgType, syncNodeStr); + syncRpcMsgLog2(logBuf, pMsg); + taosMemoryFree(syncNodeStr); + + SRpcMsg *pRpcMsg = pMsg; + + if (pRpcMsg->msgType == TDMT_VND_SYNC_TIMEOUT) { + SyncTimeout *pSyncMsg = syncTimeoutFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnTimeoutCb(pSyncNode, pSyncMsg); + syncTimeoutDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_PING) { + SyncPing *pSyncMsg = syncPingFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnPingCb(pSyncNode, pSyncMsg); + syncPingDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_PING_REPLY) { + SyncPingReply *pSyncMsg = syncPingReplyFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnPingReplyCb(pSyncNode, pSyncMsg); + syncPingReplyDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_CLIENT_REQUEST) { + SyncClientRequest *pSyncMsg = syncClientRequestFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnClientRequestCb(pSyncNode, pSyncMsg); + syncClientRequestDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_REQUEST_VOTE) { + SyncRequestVote *pSyncMsg = syncRequestVoteFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnRequestVoteCb(pSyncNode, pSyncMsg); + syncRequestVoteDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_REQUEST_VOTE_REPLY) { + SyncRequestVoteReply *pSyncMsg = syncRequestVoteReplyFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnRequestVoteReplyCb(pSyncNode, pSyncMsg); + syncRequestVoteReplyDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_APPEND_ENTRIES) { + SyncAppendEntries *pSyncMsg = syncAppendEntriesFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnAppendEntriesCb(pSyncNode, pSyncMsg); + syncAppendEntriesDestroy(pSyncMsg); + + } else if (pRpcMsg->msgType == TDMT_VND_SYNC_APPEND_ENTRIES_REPLY) { + SyncAppendEntriesReply *pSyncMsg = syncAppendEntriesReplyFromRpcMsg2(pRpcMsg); + assert(pSyncMsg != NULL); + + ret = syncNodeOnAppendEntriesReplyCb(pSyncNode, pSyncMsg); + syncAppendEntriesReplyDestroy(pSyncMsg); + + } else { + mError("==mndProcessSyncMsg== error msg type:%d", pRpcMsg->msgType); + ret = TAOS_SYNC_PROPOSE_OTHER_ERROR; + } + + syncNodeRelease(pSyncNode); + } else { + mError("==mndProcessSyncMsg== error syncEnv stop"); + ret = TAOS_SYNC_PROPOSE_OTHER_ERROR; + } + + return ret; +} int32_t mndProcessMsg(SRpcMsg *pMsg) { SMnode *pMnode = pMsg->info.node; @@ -346,7 +443,8 @@ int32_t mndProcessMsg(SRpcMsg *pMsg) { mTrace("msg:%p, will be processed, type:%s app:%p", pMsg, TMSG_INFO(pMsg->msgType), ahandle); if (IsReq(pMsg)) { - if (!mndIsMaster(pMnode)) { + if (!mndIsMaster(pMnode) && pMsg->msgType != TDMT_MND_TRANS_TIMER && pMsg->msgType != TDMT_MND_MQ_TIMER && + pMsg->msgType != TDMT_MND_TELEM_TIMER) { terrno = TSDB_CODE_APP_NOT_READY; mDebug("msg:%p, failed to process since %s, app:%p", pMsg, terrstr(), ahandle); return -1; diff --git a/source/dnode/mnode/impl/test/sdb/sdbTest.cpp b/source/dnode/mnode/impl/test/sdb/sdbTest.cpp index f67ca2fb91..df535c4456 100644 --- a/source/dnode/mnode/impl/test/sdb/sdbTest.cpp +++ b/source/dnode/mnode/impl/test/sdb/sdbTest.cpp @@ -493,9 +493,8 @@ TEST_F(MndTestSdb, 01_Write_Str) { ASSERT_EQ(sdbGetSize(pSdb, SDB_USER), 2); ASSERT_EQ(sdbGetMaxId(pSdb, SDB_USER), -1); ASSERT_EQ(sdbGetTableVer(pSdb, SDB_USER), 2 ); - ASSERT_EQ(sdbUpdateVer(pSdb, 0), -1); - ASSERT_EQ(sdbUpdateVer(pSdb, 1), 0); - ASSERT_EQ(sdbUpdateVer(pSdb, -1), -1); + sdbSetApplyIndex(pSdb, -1); + ASSERT_EQ(sdbGetApplyIndex(pSdb), -1); ASSERT_EQ(mnode.insertTimes, 2); ASSERT_EQ(mnode.deleteTimes, 0); @@ -537,9 +536,6 @@ TEST_F(MndTestSdb, 01_Write_Str) { ASSERT_EQ(sdbGetSize(pSdb, SDB_USER), 3); ASSERT_EQ(sdbGetTableVer(pSdb, SDB_USER), 4); - ASSERT_EQ(sdbUpdateVer(pSdb, 0), -1); - ASSERT_EQ(sdbUpdateVer(pSdb, 1), 0); - ASSERT_EQ(sdbUpdateVer(pSdb, -1), -1); ASSERT_EQ(mnode.insertTimes, 3); ASSERT_EQ(mnode.deleteTimes, 0); @@ -704,8 +700,9 @@ TEST_F(MndTestSdb, 01_Write_Str) { } // write version - ASSERT_EQ(sdbUpdateVer(pSdb, 1), 0); - ASSERT_EQ(sdbUpdateVer(pSdb, 1), 1); + sdbSetApplyIndex(pSdb, 0); + sdbSetApplyIndex(pSdb, 1); + ASSERT_EQ(sdbGetApplyIndex(pSdb), 1); ASSERT_EQ(sdbWriteFile(pSdb), 0); ASSERT_EQ(sdbWriteFile(pSdb), 0); @@ -775,7 +772,7 @@ TEST_F(MndTestSdb, 01_Read_Str) { ASSERT_EQ(sdbGetSize(pSdb, SDB_USER), 2); ASSERT_EQ(sdbGetMaxId(pSdb, SDB_USER), -1); ASSERT_EQ(sdbGetTableVer(pSdb, SDB_USER), 5); - ASSERT_EQ(sdbUpdateVer(pSdb, 0), 1); + ASSERT_EQ(sdbGetApplyIndex(pSdb), 1); ASSERT_EQ(mnode.insertTimes, 4); ASSERT_EQ(mnode.deleteTimes, 0); diff --git a/source/dnode/mnode/impl/test/trans/CMakeLists.txt b/source/dnode/mnode/impl/test/trans/CMakeLists.txt index 55fc3abbc2..22ff85563f 100644 --- a/source/dnode/mnode/impl/test/trans/CMakeLists.txt +++ b/source/dnode/mnode/impl/test/trans/CMakeLists.txt @@ -31,7 +31,7 @@ target_include_directories( PUBLIC "${TD_SOURCE_DIR}/include/dnode/mnode" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../inc" ) -add_test( - NAME transTest2 - COMMAND transTest2 -) +#add_test( +# NAME transTest2 +# COMMAND transTest2 +#) diff --git a/source/dnode/mnode/impl/test/trans/trans2.cpp b/source/dnode/mnode/impl/test/trans/trans2.cpp index ec972cf784..b78f1c7021 100644 --- a/source/dnode/mnode/impl/test/trans/trans2.cpp +++ b/source/dnode/mnode/impl/test/trans/trans2.cpp @@ -23,6 +23,11 @@ int32_t sendReq(const SEpSet *pEpSet, SRpcMsg *pMsg) { return -1; } +int32_t putToQueue(void *pMgmt, SRpcMsg *pMsg) { + terrno = TSDB_CODE_INVALID_PTR; + return -1; +} + class MndTestTrans2 : public ::testing::Test { protected: static void InitLog() { @@ -55,6 +60,9 @@ class MndTestTrans2 : public ::testing::Test { msgCb.reportStartupFp = reportStartup; msgCb.sendReqFp = sendReq; msgCb.sendRspFp = sendRsp; + msgCb.queueFps[SYNC_QUEUE] = putToQueue; + msgCb.queueFps[WRITE_QUEUE] = putToQueue; + msgCb.queueFps[READ_QUEUE] = putToQueue; msgCb.mgmt = (SMgmtWrapper *)(&msgCb); // hack tmsgSetDefault(&msgCb); @@ -77,6 +85,7 @@ class MndTestTrans2 : public ::testing::Test { static void SetUpTestSuite() { InitLog(); walInit(); + syncInit(); InitMnode(); } diff --git a/source/dnode/mnode/sdb/src/sdb.c b/source/dnode/mnode/sdb/src/sdb.c index 1f11a77e6c..7b90d8acb5 100644 --- a/source/dnode/mnode/sdb/src/sdb.c +++ b/source/dnode/mnode/sdb/src/sdb.c @@ -31,11 +31,9 @@ SSdb *sdbInit(SSdbOpt *pOption) { char path[PATH_MAX + 100] = {0}; snprintf(path, sizeof(path), "%s%sdata", pOption->path, TD_DIRSEP); pSdb->currDir = strdup(path); - snprintf(path, sizeof(path), "%s%ssync", pOption->path, TD_DIRSEP); - pSdb->syncDir = strdup(path); snprintf(path, sizeof(path), "%s%stmp", pOption->path, TD_DIRSEP); pSdb->tmpDir = strdup(path); - if (pSdb->currDir == NULL || pSdb->currDir == NULL || pSdb->currDir == NULL) { + if (pSdb->currDir == NULL || pSdb->tmpDir == NULL) { sdbCleanup(pSdb); terrno = TSDB_CODE_OUT_OF_MEMORY; mError("failed to init sdb since %s", terrstr()); @@ -55,6 +53,7 @@ SSdb *sdbInit(SSdbOpt *pOption) { } pSdb->curVer = -1; + pSdb->curTerm = -1; pSdb->lastCommitVer = -1; pSdb->pMnode = pOption->pMnode; mDebug("sdb init successfully"); @@ -149,12 +148,6 @@ static int32_t sdbCreateDir(SSdb *pSdb) { return -1; } - if (taosMkDir(pSdb->syncDir) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - mError("failed to create dir:%s since %s", pSdb->syncDir, terrstr()); - return -1; - } - if (taosMkDir(pSdb->tmpDir) != 0) { terrno = TAOS_SYSTEM_ERROR(errno); mError("failed to create dir:%s since %s", pSdb->tmpDir, terrstr()); @@ -164,4 +157,10 @@ static int32_t sdbCreateDir(SSdb *pSdb) { return 0; } -int64_t sdbUpdateVer(SSdb *pSdb, int32_t val) { return atomic_add_fetch_64(&pSdb->curVer, val); } \ No newline at end of file +void sdbSetApplyIndex(SSdb *pSdb, int64_t index) { pSdb->curVer = index; } + +int64_t sdbGetApplyIndex(SSdb *pSdb) { return pSdb->curVer; } + +void sdbSetApplyTerm(SSdb *pSdb, int64_t term) { pSdb->curTerm = term; } + +int64_t sdbGetApplyTerm(SSdb *pSdb) { return pSdb->curTerm; } diff --git a/source/dnode/mnode/sdb/src/sdbFile.c b/source/dnode/mnode/sdb/src/sdbFile.c index a391ea8d03..b000c208c8 100644 --- a/source/dnode/mnode/sdb/src/sdbFile.c +++ b/source/dnode/mnode/sdb/src/sdbFile.c @@ -65,6 +65,16 @@ static int32_t sdbReadFileHead(SSdb *pSdb, TdFilePtr pFile) { return -1; } + ret = taosReadFile(pFile, &pSdb->curTerm, sizeof(int64_t)); + if (ret < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + if (ret != sizeof(int64_t)) { + terrno = TSDB_CODE_FILE_CORRUPTED; + return -1; + } + for (int32_t i = 0; i < SDB_TABLE_SIZE; ++i) { int64_t maxId = 0; ret = taosReadFile(pFile, &maxId, sizeof(int64_t)); @@ -123,6 +133,11 @@ static int32_t sdbWriteFileHead(SSdb *pSdb, TdFilePtr pFile) { return -1; } + if (taosWriteFile(pFile, &pSdb->curTerm, sizeof(int64_t)) != sizeof(int64_t)) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + for (int32_t i = 0; i < SDB_TABLE_SIZE; ++i) { int64_t maxId = 0; if (i < SDB_MAX) { @@ -182,6 +197,7 @@ int32_t sdbReadFile(SSdb *pSdb) { if (sdbReadFileHead(pSdb, pFile) != 0) { mError("failed to read file:%s head since %s", file, terrstr()); pSdb->curVer = -1; + pSdb->curTerm = -1; taosMemoryFree(pRaw); taosCloseFile(&pFile); return -1; @@ -256,8 +272,8 @@ static int32_t sdbWriteFileImp(SSdb *pSdb) { char curfile[PATH_MAX] = {0}; snprintf(curfile, sizeof(curfile), "%s%ssdb.data", pSdb->currDir, TD_DIRSEP); - mDebug("start to write file:%s, current ver:%" PRId64 ", commit ver:%" PRId64, curfile, pSdb->curVer, - pSdb->lastCommitVer); + mDebug("start to write file:%s, current ver:%" PRId64 " term:%" PRId64 ", commit ver:%" PRId64, curfile, pSdb->curVer, + pSdb->curTerm, pSdb->lastCommitVer); TdFilePtr pFile = taosOpenFile(tmpfile, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); if (pFile == NULL) { @@ -350,7 +366,7 @@ static int32_t sdbWriteFileImp(SSdb *pSdb) { mError("failed to write file:%s since %s", curfile, tstrerror(code)); } else { pSdb->lastCommitVer = pSdb->curVer; - mDebug("write file:%s successfully, ver:%" PRId64, curfile, pSdb->lastCommitVer); + mDebug("write file:%s successfully, ver:%" PRId64 " term:%" PRId64, curfile, pSdb->lastCommitVer, pSdb->curTerm); } terrno = code; diff --git a/source/dnode/vnode/src/inc/tq.h b/source/dnode/vnode/src/inc/tq.h index f89df4a96f..ad3f8cc869 100644 --- a/source/dnode/vnode/src/inc/tq.h +++ b/source/dnode/vnode/src/inc/tq.h @@ -20,9 +20,9 @@ #include "executor.h" #include "os.h" -#include "tcache.h" #include "thash.h" #include "tmsg.h" +#include "tqueue.h" #include "trpc.h" #include "ttimer.h" #include "wal.h" @@ -86,6 +86,9 @@ typedef struct { qTaskInfo_t task[5]; } STqExec; +int32_t tEncodeSTqExec(SEncoder* pEncoder, const STqExec* pExec); +int32_t tDecodeSTqExec(SDecoder* pDecoder, STqExec* pExec); + struct STQ { char* path; SHashObj* pushMgr; // consumerId -> STqExec* @@ -93,7 +96,7 @@ struct STQ { SHashObj* pStreamTasks; SVnode* pVnode; SWal* pWal; - // TDB* pTdb; + TDB* pTdb; }; typedef struct { @@ -101,7 +104,7 @@ typedef struct { tmr_h timer; } STqMgmt; -static STqMgmt tqMgmt; +static STqMgmt tqMgmt = {0}; // init once int tqInit(); diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 6ca523b580..bd48ed9b4c 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -14,14 +14,37 @@ */ #include "tq.h" -#include "tqueue.h" int32_t tqInit() { - // + int8_t old; + while (1) { + old = atomic_val_compare_exchange_8(&tqMgmt.inited, 0, 2); + if (old != 2) break; + } + + if (old == 0) { + tqMgmt.timer = taosTmrInit(10000, 100, 10000, "TQ"); + if (tqMgmt.timer == NULL) { + atomic_store_8(&tqMgmt.inited, 0); + return -1; + } + atomic_store_8(&tqMgmt.inited, 1); + } return 0; } -void tqCleanUp() {} +void tqCleanUp() { + int8_t old; + while (1) { + old = atomic_val_compare_exchange_8(&tqMgmt.inited, 1, 2); + if (old != 2) break; + } + + if (old == 1) { + taosTmrCleanUp(tqMgmt.timer); + atomic_store_8(&tqMgmt.inited, 0); + } +} STQ* tqOpen(const char* path, SVnode* pVnode, SWal* pWal) { STQ* pTq = taosMemoryMalloc(sizeof(STQ)); @@ -32,9 +55,9 @@ STQ* tqOpen(const char* path, SVnode* pVnode, SWal* pWal) { pTq->path = strdup(path); pTq->pVnode = pVnode; pTq->pWal = pWal; - /*if (tdbOpen(path, 4096, 1, &pTq->pTdb) < 0) {*/ - /*ASSERT(0);*/ - /*}*/ + if (tdbOpen(path, 4096, 1, &pTq->pTdb) < 0) { + ASSERT(0); + } pTq->execs = taosHashInit(64, MurmurHash3_32, true, HASH_ENTRY_LOCK); @@ -51,11 +74,45 @@ void tqClose(STQ* pTq) { taosHashCleanup(pTq->execs); taosHashCleanup(pTq->pStreamTasks); taosHashCleanup(pTq->pushMgr); + tdbClose(pTq->pTdb); taosMemoryFree(pTq); } // TODO } +int32_t tEncodeSTqExec(SEncoder* pEncoder, const STqExec* pExec) { + if (tStartEncode(pEncoder) < 0) return -1; + if (tEncodeCStr(pEncoder, pExec->subKey) < 0) return -1; + if (tEncodeI64(pEncoder, pExec->consumerId) < 0) return -1; + if (tEncodeI32(pEncoder, pExec->epoch) < 0) return -1; + if (tEncodeI8(pEncoder, pExec->subType) < 0) return -1; + if (tEncodeI8(pEncoder, pExec->withTbName) < 0) return -1; + if (tEncodeI8(pEncoder, pExec->withSchema) < 0) return -1; + if (tEncodeI8(pEncoder, pExec->withTag) < 0) return -1; + if (pExec->subType == TOPIC_SUB_TYPE__TABLE) { + if (tEncodeCStr(pEncoder, pExec->qmsg) < 0) return -1; + // TODO encode modified exec + } + tEndEncode(pEncoder); + return pEncoder->pos; +} + +int32_t tDecodeSTqExec(SDecoder* pDecoder, STqExec* pExec) { + if (tStartDecode(pDecoder) < 0) return -1; + if (tDecodeCStrTo(pDecoder, pExec->subKey) < 0) return -1; + if (tDecodeI64(pDecoder, &pExec->consumerId) < 0) return -1; + if (tDecodeI32(pDecoder, &pExec->epoch) < 0) return -1; + if (tDecodeI8(pDecoder, &pExec->subType) < 0) return -1; + if (tDecodeI8(pDecoder, &pExec->withTbName) < 0) return -1; + if (tDecodeI8(pDecoder, &pExec->withSchema) < 0) return -1; + if (tDecodeI8(pDecoder, &pExec->withTag) < 0) return -1; + if (pExec->subType == TOPIC_SUB_TYPE__TABLE) { + if (tDecodeCStrAlloc(pDecoder, &pExec->qmsg) < 0) return -1; + // TODO decode modified exec + } + tEndDecode(pDecoder); + return 0; +} int32_t tqUpdateTbUidList(STQ* pTq, const SArray* tbUidList, bool isAdd) { void* pIter = NULL; while (1) { diff --git a/source/dnode/vnode/src/tsdb/tsdbReadImpl.c b/source/dnode/vnode/src/tsdb/tsdbReadImpl.c index 0773805684..f66037b16d 100644 --- a/source/dnode/vnode/src/tsdb/tsdbReadImpl.c +++ b/source/dnode/vnode/src/tsdb/tsdbReadImpl.c @@ -252,6 +252,45 @@ static FORCE_INLINE void tsdbSwapDataCols(SDataCols *pDest, SDataCols *pSrc) { pSrc->cols = pCols; } +static void printTsdbLoadBlkData(SReadH *readh, SDataCols *pDCols, SBlock *pBlock, const char *tag, int32_t ln) { + printf("%s:%d:%" PRIi64 " ================\n", tag, ln, taosGetSelfPthreadId()); + if (pBlock) { + SDFile *pHeadf = TSDB_READ_HEAD_FILE(readh); + printf("%s:%d:%" PRIi64 ":%p:%d %s\n", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + pHeadf->f.aname); + SDFile *pDFile = pBlock->last ? TSDB_READ_LAST_FILE(readh) : TSDB_READ_DATA_FILE(readh); + printf("%s:%d:%" PRIi64 ":%p:%d %s\n", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + pDFile->f.aname); + } + SDataCol *pDCol = pDCols->cols + 0; + if (TSKEY_MIN == *(int64_t *)pDCol->pData) { + ASSERT(0); + } + + int rows = pDCols->numOfRows; + for (int r = 0; r < rows; ++r) { + if (pBlock) { + printf("%s:%d:%" PRIi64 ":%p:%d rows[%d][%d] ", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + rows, r); + } else { + printf("%s:%d:%" PRIi64 ":%s rows[%d][%d] ", tag, ln, taosGetSelfPthreadId(), "=== merge === ", rows, r); + } + + int nDataCols = pDCols->numOfCols; + int j = 0; + SCellVal sVal = {0}; + while (j < nDataCols) { + SDataCol *pDataCol = pDCols->cols + j; + tdGetColDataOfRow(&sVal, pDataCol, r, pDCols->bitmapMode); + tdSCellValPrint(&sVal, pDataCol->type); + ++j; + } + printf("\n"); + } + + fflush(stdout); +} + int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo) { ASSERT(pBlock->numOfSubBlocks > 0); STsdbCfg *pCfg = REPO_CFG(pReadh->pRepo); @@ -266,15 +305,23 @@ int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo) { } } - if (tsdbLoadBlockDataImpl(pReadh, iBlock, pReadh->pDCols[0], TSDB_BITMODE_ONE_BIT) < 0) return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkData(pReadh, pReadh->pDCols[0], iBlock, __func__, __LINE__); +#endif for (int i = 1; i < pBlock->numOfSubBlocks; i++) { iBlock++; if (tsdbLoadBlockDataImpl(pReadh, iBlock, pReadh->pDCols[1], TSDB_BITMODE_DEFAULT) < 0) return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkData(pReadh, pReadh->pDCols[1], iBlock, __func__, __LINE__); +#endif // TODO: use the real maxVersion to replace the UINT64_MAX to support Multi-Version if (tdMergeDataCols(pReadh->pDCols[0], pReadh->pDCols[1], pReadh->pDCols[1]->numOfRows, NULL, TD_SUPPORT_UPDATE(update), TD_VER_MAX) < 0) return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkData(pReadh, pReadh->pDCols[0], iBlock, " === MERGE === ", __LINE__); +#endif } // if ((pBlock->numOfSubBlocks == 1) && (iBlock->hasDupKey)) { // TODO: use this line if (pBlock->numOfSubBlocks == 1) { @@ -286,6 +333,9 @@ int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo) { } tsdbSwapDataCols(pReadh->pDCols[0], pReadh->pDCols[1]); ASSERT(pReadh->pDCols[0]->bitmapMode != 0); +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkData(pReadh, pReadh->pDCols[0], iBlock, " === UPDATE FILTER === ", __LINE__); +#endif } ASSERT(pReadh->pDCols[0]->numOfRows <= pBlock->numOfRows); @@ -295,6 +345,53 @@ int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo) { return 0; } +static void printTsdbLoadBlkDataCols(SReadH *readh, SDataCols *pDCols, SBlock *pBlock, const int16_t *colIds, + int numOfColsIds, const char *tag, int32_t ln) { + printf("%s:%d:%" PRIi64 " ================\n", tag, ln, taosGetSelfPthreadId()); + if (pBlock) { + SDFile *pHeadf = TSDB_READ_HEAD_FILE(readh); + printf("%s:%d:%" PRIi64 ":%p:%d %s\n", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + pHeadf->f.aname); + SDFile *pDFile = pBlock->last ? TSDB_READ_LAST_FILE(readh) : TSDB_READ_DATA_FILE(readh); + printf("%s:%d:%" PRIi64 ":%p:%d %s\n", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + pDFile->f.aname); + } + + int rows = pDCols->numOfRows; + for (int r = 0; r < rows; ++r) { + if (pBlock) { + printf("%s:%d:%" PRIi64 ":%p:%d rows[%d][%d] ", tag, ln, taosGetSelfPthreadId(), pBlock, (int32_t)pBlock->len, + rows, r); + } else { + printf("%s:%d:%" PRIi64 ":%s rows[%d][%d] ", tag, ln, taosGetSelfPthreadId(), "=== merge === ", rows, r); + } + + int nDataCols = pDCols->numOfCols; + int j = 0, k = 0; + SCellVal sVal = {0}; + while (j < nDataCols) { + if (k >= numOfColsIds) break; + SDataCol *pDataCol = pDCols->cols + j; + int16_t colId1 = pDataCol->colId; + int16_t colId2 = *(colIds + k); + if (colId1 < colId2) { + ++j; + } else if (colId1 > colId2) { + ++k; // colId2 not exists in SDataCols + printf("NotExists "); + } else { + tdGetColDataOfRow(&sVal, pDataCol, r, pDCols->bitmapMode); + tdSCellValPrint(&sVal, pDataCol->type); + ++j; + ++k; + } + } + printf("\n"); + } + + fflush(stdout); +} + // TODO: filter by Multi-Version int tsdbLoadBlockDataCols(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo, const int16_t *colIds, int numOfColsIds, bool mergeBitmap) { @@ -310,14 +407,25 @@ int tsdbLoadBlockDataCols(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo, } } - if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[0], colIds, numOfColsIds, TSDB_BITMODE_ONE_BIT) < 0) return -1; + if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[0], colIds, numOfColsIds, TSDB_BITMODE_ONE_BIT) < 0) + return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkDataCols(pReadh, pReadh->pDCols[0], iBlock, colIds, numOfColsIds, __func__, __LINE__); +#endif for (int i = 1; i < pBlock->numOfSubBlocks; i++) { iBlock++; - if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[1], colIds, numOfColsIds, TSDB_BITMODE_DEFAULT) < 0) return -1; + if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[1], colIds, numOfColsIds, TSDB_BITMODE_DEFAULT) < 0) + return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkDataCols(pReadh, pReadh->pDCols[1], iBlock, colIds, numOfColsIds, __func__, __LINE__); +#endif // TODO: use the real maxVersion to replace the UINT64_MAX to support Multi-Version if (tdMergeDataCols(pReadh->pDCols[0], pReadh->pDCols[1], pReadh->pDCols[1]->numOfRows, NULL, TD_SUPPORT_UPDATE(update), TD_VER_MAX) < 0) return -1; +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkDataCols(pReadh, pReadh->pDCols[0], NULL, colIds, numOfColsIds, __func__, __LINE__); +#endif } // if ((pBlock->numOfSubBlocks == 1) && (iBlock->hasDupKey)) { // TODO: use this line if (pBlock->numOfSubBlocks == 1) { @@ -329,18 +437,23 @@ int tsdbLoadBlockDataCols(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo, } tsdbSwapDataCols(pReadh->pDCols[0], pReadh->pDCols[1]); ASSERT(pReadh->pDCols[0]->bitmapMode != 0); +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkDataCols(pReadh, pReadh->pDCols[0], NULL, colIds, numOfColsIds, + " === update filter === ", __LINE__); +#endif } if (mergeBitmap && !tdDataColsIsBitmapI(pReadh->pDCols[0])) { for (int i = 0; i < numOfColsIds; ++i) { SDataCol *pDataCol = pReadh->pDCols[0]->cols + i; if (pDataCol->len > 0 && pDataCol->bitmap) { - ASSERT(pDataCol->colId != PRIMARYKEY_TIMESTAMP_COL_ID); - ASSERT(pDataCol->pBitmap); tdMergeBitmap(pDataCol->pBitmap, pReadh->pDCols[0]->numOfRows, pDataCol->pBitmap); tdDataColsSetBitmapI(pReadh->pDCols[0]); } } +#ifdef TD_DEBUG_PRINT_TSDB_LOAD_DCOLS + printTsdbLoadBlkDataCols(pReadh, pReadh->pDCols[0], NULL, colIds, numOfColsIds, " === merge bitmap === ", __LINE__); +#endif } ASSERT(pReadh->pDCols[0]->numOfRows <= pBlock->numOfRows); @@ -551,9 +664,7 @@ static int tsdbLoadBlockDataImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDat tdResetDataCols(pDataCols); - if (tdIsBitmapModeI(bitmapMode)) { - tdDataColsSetBitmapI(pDataCols); - } + pDataCols->bitmapMode = bitmapMode; if (tsdbMakeRoom((void **)(&TSDB_READ_BUF(pReadh)), pBlock->len) < 0) return -1; @@ -740,9 +851,7 @@ static int tsdbLoadBlockDataColsImpl(SReadH *pReadh, SBlock *pBlock, SDataCols * tdResetDataCols(pDataCols); - if (tdIsBitmapModeI(bitmapMode)) { - tdDataColsSetBitmapI(pDataCols); - } + pDataCols->bitmapMode = bitmapMode; // If only load timestamp column, no need to load SBlockData part if (numOfColIds > 1 && tsdbLoadBlockOffset(pReadh, pBlock) < 0) return -1; diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 8659c41807..882ee912cd 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -56,7 +56,13 @@ void vnodeSyncStart(SVnode *pVnode) { void vnodeSyncClose(SVnode *pVnode) { syncStop(pVnode->sync); } -int32_t vnodeSyncEqMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) { return tmsgPutToQueue(msgcb, SYNC_QUEUE, pMsg); } +int32_t vnodeSyncEqMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) { + int32_t code = tmsgPutToQueue(msgcb, SYNC_QUEUE, pMsg); + if (code != 0) { + rpcFreeCont(pMsg->pCont); + } + return code; +} int32_t vnodeSyncSendMsg(const SEpSet *pEpSet, SRpcMsg *pMsg) { return tmsgSendReq(pEpSet, pMsg); } @@ -141,5 +147,6 @@ SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) { pFsm->FpPreCommitCb = vnodeSyncPreCommitMsg; pFsm->FpRollBackCb = vnodeSyncRollBackMsg; pFsm->FpGetSnapshot = vnodeSyncGetSnapshot; + pFsm->FpRestoreFinish = NULL; return pFsm; } \ No newline at end of file diff --git a/source/libs/index/CMakeLists.txt b/source/libs/index/CMakeLists.txt index c6d1cc0dbf..e55b004972 100644 --- a/source/libs/index/CMakeLists.txt +++ b/source/libs/index/CMakeLists.txt @@ -34,7 +34,7 @@ if (${BUILD_WITH_INVERTEDINDEX}) endif(${BUILD_WITH_INVERTEDINDEX}) -#if (${BUILD_TEST}) -# add_subdirectory(test) -#endif(${BUILD_TEST}) +if (${BUILD_TEST}) + add_subdirectory(test) +endif(${BUILD_TEST}) diff --git a/source/libs/index/inc/indexTfile.h b/source/libs/index/inc/indexTfile.h index 9712e4b30f..85ed397b0a 100644 --- a/source/libs/index/inc/indexTfile.h +++ b/source/libs/index/inc/indexTfile.h @@ -40,7 +40,7 @@ typedef struct TFileHeader { } TFileHeader; #pragma pack(pop) -#define TFILE_HEADER_SIZE (sizeof(TFileHeader)) +#define TFILE_HEADER_SIZE (sizeof(TFileHeader)) #define TFILE_HEADER_NO_FST (TFILE_HEADER_SIZE - sizeof(int32_t)) typedef struct TFileValue { diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index 2141e90bbd..6add788a89 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -29,7 +29,7 @@ #include "lucene++/Lucene_c.h" #endif -#define INDEX_NUM_OF_THREADS 4 +#define INDEX_NUM_OF_THREADS 1 #define INDEX_QUEUE_SIZE 200 #define INDEX_DATA_BOOL_NULL 0x02 @@ -117,7 +117,6 @@ int indexOpen(SIndexOpts* opts, const char* path, SIndex** index) { sIdx->path = tstrdup(path); taosThreadMutexInit(&sIdx->mtx, NULL); tsem_init(&sIdx->sem, 0, 0); - // taosThreadCondInit(&sIdx->finished, NULL); sIdx->refId = indexAddRef(sIdx); indexAcquireRef(sIdx->refId); @@ -143,13 +142,13 @@ void indexDestroy(void* handle) { return; } void indexClose(SIndex* sIdx) { - indexReleaseRef(sIdx->refId); bool ref = 0; if (sIdx->colObj != NULL) { void* iter = taosHashIterate(sIdx->colObj, NULL); while (iter) { IndexCache** pCache = iter; indexCacheForceToMerge((void*)(*pCache)); + indexInfo("%s wait to merge", (*pCache)->colName); indexWait((void*)(sIdx)); iter = taosHashIterate(sIdx->colObj, iter); indexCacheUnRef(*pCache); @@ -157,7 +156,7 @@ void indexClose(SIndex* sIdx) { taosHashCleanup(sIdx->colObj); sIdx->colObj = NULL; } - // taosMsleep(1000 * 5); + indexReleaseRef(sIdx->refId); indexRemoveRef(sIdx->refId); } int64_t indexAddRef(void* p) { @@ -554,8 +553,29 @@ void iterateValueDestroy(IterateValue* value, bool destroy) { taosMemoryFree(value->colVal); value->colVal = NULL; } + +static int64_t indexGetAvaialbleVer(SIndex* sIdx, IndexCache* cache) { + ICacheKey key = {.suid = cache->suid, .colName = cache->colName, .nColName = strlen(cache->colName)}; + int64_t ver = CACHE_VERSION(cache); + taosThreadMutexLock(&sIdx->mtx); + TFileReader* trd = tfileCacheGet(((IndexTFile*)sIdx->tindex)->cache, &key); + if (trd != NULL) { + if (ver < trd->header.version) { + ver = trd->header.version + 1; + } else { + ver += 1; + } + indexInfo("header: %d, ver: %" PRId64 "", trd->header.version, ver); + tfileReaderUnRef(trd); + } else { + indexInfo("not found reader base %p", trd); + } + taosThreadMutexUnlock(&sIdx->mtx); + return ver; +} static int indexGenTFile(SIndex* sIdx, IndexCache* cache, SArray* batch) { - int32_t version = CACHE_VERSION(cache); + int64_t version = indexGetAvaialbleVer(sIdx, cache); + indexInfo("file name version: %" PRId64 "", version); uint8_t colType = cache->type; TFileWriter* tw = tfileWriterOpen(sIdx->path, cache->suid, version, cache->colName, colType); @@ -575,6 +595,7 @@ static int indexGenTFile(SIndex* sIdx, IndexCache* cache, SArray* batch) { if (reader == NULL) { return -1; } + indexInfo("success to create tfile, reopen it, %s", reader->ctx->file.buf); TFileHeader* header = &reader->header; ICacheKey key = {.suid = cache->suid, .colName = header->colName, .nColName = strlen(header->colName)}; diff --git a/source/libs/index/src/indexCache.c b/source/libs/index/src/indexCache.c index 232eca9304..d704e3876e 100644 --- a/source/libs/index/src/indexCache.c +++ b/source/libs/index/src/indexCache.c @@ -335,6 +335,9 @@ IndexCache* indexCacheCreate(SIndex* idx, uint64_t suid, const char* colName, in taosThreadCondInit(&cache->finished, NULL); indexCacheRef(cache); + if (idx != NULL) { + indexAcquireRef(idx->refId); + } return cache; } void indexCacheDebug(IndexCache* cache) { @@ -426,13 +429,16 @@ void indexCacheDestroy(void* cache) { if (pCache == NULL) { return; } + indexMemUnRef(pCache->mem); indexMemUnRef(pCache->imm); taosMemoryFree(pCache->colName); taosThreadMutexDestroy(&pCache->mtx); taosThreadCondDestroy(&pCache->finished); - + if (pCache->index != NULL) { + indexReleaseRef(((SIndex*)pCache->index)->refId); + } taosMemoryFree(pCache); } diff --git a/source/libs/index/src/indexFstCountingWriter.c b/source/libs/index/src/indexFstCountingWriter.c index 1d4395aff6..8ba5173602 100644 --- a/source/libs/index/src/indexFstCountingWriter.c +++ b/source/libs/index/src/indexFstCountingWriter.c @@ -97,6 +97,7 @@ WriterCtx* writerCtxCreate(WriterType type, const char* path, bool readOnly, int int64_t file_size; taosStatFile(path, &file_size, NULL); ctx->file.size = (int)file_size; + } else { // ctx->file.pFile = open(path, O_RDONLY, S_IRWXU | S_IRWXG | S_IRWXO); ctx->file.pFile = taosOpenFile(path, TD_FILE_READ); diff --git a/source/libs/index/src/indexTfile.c b/source/libs/index/src/indexTfile.c index f5b3cbf227..3d85646bd2 100644 --- a/source/libs/index/src/indexTfile.c +++ b/source/libs/index/src/indexTfile.c @@ -1,6 +1,5 @@ /* * Copyright (c) 2019 TAOS Data, Inc. -p * * This program is free software: you can use, redistribute, and/or modify * it under the terms of the GNU Affero General Public License, version 3 * or later ("AGPL"), as published by the Free Software Foundation. @@ -152,10 +151,13 @@ TFileReader* tfileCacheGet(TFileCache* tcache, ICacheKey* key) { char buf[128] = {0}; int32_t sz = indexSerialCacheKey(key, buf); assert(sz < sizeof(buf)); + indexInfo("Try to get key: %s", buf); TFileReader** reader = taosHashGet(tcache->tableCache, buf, sz); - if (reader == NULL) { + if (reader == NULL || *reader == NULL) { + indexInfo("failed to get key: %s", buf); return NULL; } + indexInfo("Get key: %s file: %s", buf, (*reader)->ctx->file.buf); tfileReaderRef(*reader); return *reader; @@ -165,9 +167,10 @@ void tfileCachePut(TFileCache* tcache, ICacheKey* key, TFileReader* reader) { int32_t sz = indexSerialCacheKey(key, buf); // remove last version index reader TFileReader** p = taosHashGet(tcache->tableCache, buf, sz); - if (p != NULL) { + if (p != NULL && *p != NULL) { TFileReader* oldReader = *p; taosHashRemove(tcache->tableCache, buf, sz); + indexInfo("found %s, remove file %s", buf, oldReader->ctx->file.buf); oldReader->remove = true; tfileReaderUnRef(oldReader); } @@ -180,7 +183,6 @@ TFileReader* tfileReaderCreate(WriterCtx* ctx) { if (reader == NULL) { return NULL; } - reader->ctx = ctx; if (0 != tfileReaderVerify(reader)) { @@ -202,6 +204,7 @@ TFileReader* tfileReaderCreate(WriterCtx* ctx) { tfileReaderDestroy(reader); return NULL; } + reader->remove = false; return reader; } @@ -536,7 +539,7 @@ TFileReader* tfileReaderOpen(char* path, uint64_t suid, int32_t version, const c indexError("failed to open readonly file: %s, reason: %s", fullname, terrstr()); return NULL; } - indexInfo("open read file name:%s, file size: %d", wc->file.buf, wc->file.size); + indexTrace("open read file name:%s, file size: %d", wc->file.buf, wc->file.size); TFileReader* reader = tfileReaderCreate(wc); return reader; diff --git a/source/libs/index/test/indexTests.cc b/source/libs/index/test/indexTests.cc index 11a25a798f..f848cee86b 100644 --- a/source/libs/index/test/indexTests.cc +++ b/source/libs/index/test/indexTests.cc @@ -674,10 +674,13 @@ class IndexObj { // opt numOfWrite = 0; numOfRead = 0; - indexInit(); + // indexInit(); } - int Init(const std::string& dir) { - taosRemoveDir(dir.c_str()); + int Init(const std::string& dir, bool remove = true) { + if (remove) { + taosRemoveDir(dir.c_str()); + taosMkDir(dir.c_str()); + } taosMkDir(dir.c_str()); int ret = indexOpen(&opts, dir.c_str(), &idx); if (ret != 0) { @@ -838,8 +841,11 @@ class IndexEnv2 : public ::testing::Test { initLog(); index = new IndexObj(); } - virtual void TearDown() { delete index; } - IndexObj* index; + virtual void TearDown() { + // taosMsleep(500); + delete index; + } + IndexObj* index; }; TEST_F(IndexEnv2, testIndexOpen) { std::string path = TD_TMP_DIR_PATH "test"; @@ -951,6 +957,8 @@ static void single_write_and_search(IndexObj* idx) { target = idx->SearchOne("tag2", "Test"); } static void multi_write_and_search(IndexObj* idx) { + idx->PutOne("tag1", "Hello"); + idx->PutOne("tag2", "Test"); int target = idx->SearchOne("tag1", "Hello"); target = idx->SearchOne("tag2", "Test"); idx->WriteMultiMillonData("tag1", "hello world test", 100 * 100); @@ -992,16 +1000,16 @@ TEST_F(IndexEnv2, testIndex_MultiWrite_and_MultiRead) { } } -// TEST_F(IndexEnv2, testIndex_restart) { -// std::string path = TD_TMP_DIR_PATH "cache_and_tfile"; -// if (index->Init(path) != 0) { -// } -// index->SearchOneTarget("tag1", "Hello", 10); -// index->SearchOneTarget("tag2", "Test", 10); -//} +TEST_F(IndexEnv2, testIndex_restart) { + std::string path = TD_TMP_DIR_PATH "cache_and_tfile"; + if (index->Init(path, false) != 0) { + } + index->SearchOneTarget("tag1", "Hello", 10); + index->SearchOneTarget("tag2", "Test", 10); +} // TEST_F(IndexEnv2, testIndex_restart1) { // std::string path = TD_TMP_DIR_PATH "cache_and_tfile"; -// if (index->Init(path) != 0) { +// if (index->Init(path, false) != 0) { // } // index->ReadMultiMillonData("tag1", "coding"); // index->SearchOneTarget("tag1", "Hello", 10); @@ -1018,16 +1026,16 @@ TEST_F(IndexEnv2, testIndex_MultiWrite_and_MultiRead) { // std::cout << "reader sz: " << index->SearchOne("tag1", "Hello") << std::endl; // assert(3 == index->SearchOne("tag1", "Hello")); //} -// TEST_F(IndexEnv2, testIndexMultiTag) { -// std::string path = TD_TMP_DIR_PATH "multi_tag"; -// if (index->Init(path) != 0) { -// } -// int64_t st = taosGetTimestampUs(); -// int32_t num = 1000 * 10000; -// index->WriteMultiMillonData("tag1", "xxxxxxxxxxxxxxx", num); -// std::cout << "numOfRow: " << num << "\ttime cost:" << taosGetTimestampUs() - st << std::endl; -// // index->WriteMultiMillonData("tag2", "xxxxxxxxxxxxxxxxxxxxxxxxx", 100 * 10000); -//} +TEST_F(IndexEnv2, testIndexMultiTag) { + std::string path = TD_TMP_DIR_PATH "multi_tag"; + if (index->Init(path) != 0) { + } + int64_t st = taosGetTimestampUs(); + int32_t num = 100 * 100; + index->WriteMultiMillonData("tag1", "xxxxxxxxxxxxxxx", num); + std::cout << "numOfRow: " << num << "\ttime cost:" << taosGetTimestampUs() - st << std::endl; + // index->WriteMultiMillonData("tag2", "xxxxxxxxxxxxxxxxxxxxxxxxx", 100 * 10000); +} TEST_F(IndexEnv2, testLongComVal1) { std::string path = TD_TMP_DIR_PATH "long_colVal"; if (index->Init(path) != 0) { diff --git a/source/libs/scheduler/inc/schedulerInt.h b/source/libs/scheduler/inc/schedulerInt.h index 9e0878e118..ffac0f856d 100644 --- a/source/libs/scheduler/inc/schedulerInt.h +++ b/source/libs/scheduler/inc/schedulerInt.h @@ -264,7 +264,7 @@ int32_t schBuildAndSendMsg(SSchJob *job, SSchTask *task, SQueryNodeAddr *addr, i SSchJob *schAcquireJob(int64_t refId); int32_t schReleaseJob(int64_t refId); void schFreeFlowCtrl(SSchJob *pJob); -int32_t schCheckJobNeedFlowCtrl(SSchJob *pJob, SSchLevel *pLevel); +int32_t schChkJobNeedFlowCtrl(SSchJob *pJob, SSchLevel *pLevel); int32_t schDecTaskFlowQuota(SSchJob *pJob, SSchTask *pTask); int32_t schCheckIncTaskFlowQuota(SSchJob *pJob, SSchTask *pTask, bool *enough); int32_t schLaunchTasksInFlowCtrlList(SSchJob *pJob, SSchTask *pTask); @@ -275,6 +275,32 @@ int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId); int32_t schCloneSMsgSendInfo(void *src, void **dst); int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob); void schFreeJobImpl(void *job); +int32_t schMakeHbCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam); +int32_t schMakeHbRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx); +int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask); +int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans); +int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code); +void schFreeRpcCtx(SRpcCtx *pCtx); +int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp); +bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus); +int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask); +int32_t schSaveJobQueryRes(SSchJob *pJob, SResReadyRsp *rsp); +int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp); +void schProcessOnDataFetched(SSchJob *job); +int32_t schGetTaskFromTaskList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask); +int32_t schUpdateTaskExecNodeHandle(SSchTask *pTask, void *handle, int32_t rspCode); +void schFreeRpcCtxVal(const void *arg); +int32_t schMakeBrokenLinkVal(SSchJob *pJob, SSchTask *pTask, SRpcBrokenlinkVal *brokenVal, bool isHb); +int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, void *handle); +int32_t schExecStaticExplain(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, + bool syncSchedule); +int32_t schExecJobImpl(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, + int64_t startTs, bool sync); +int32_t schChkUpdateJobStatus(SSchJob *pJob, int8_t newStatus); +int32_t schCancelJob(SSchJob *pJob); +int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode); +uint64_t schGenTaskId(void); +void schCloseJobRef(void); #ifdef __cplusplus diff --git a/source/libs/scheduler/src/schFlowCtrl.c b/source/libs/scheduler/src/schFlowCtrl.c index fbcca403bb..85d205f5f2 100644 --- a/source/libs/scheduler/src/schFlowCtrl.c +++ b/source/libs/scheduler/src/schFlowCtrl.c @@ -40,7 +40,7 @@ void schFreeFlowCtrl(SSchJob *pJob) { pJob->flowCtrl = NULL; } -int32_t schCheckJobNeedFlowCtrl(SSchJob *pJob, SSchLevel *pLevel) { +int32_t schChkJobNeedFlowCtrl(SSchJob *pJob, SSchLevel *pLevel) { if (!SCH_IS_QUERY_JOB(pJob)) { SCH_JOB_DLOG("job no need flow ctrl, queryJob:%d", SCH_IS_QUERY_JOB(pJob)); return TSDB_CODE_SUCCESS; diff --git a/source/libs/scheduler/src/schJob.c b/source/libs/scheduler/src/schJob.c new file mode 100644 index 0000000000..14f4646397 --- /dev/null +++ b/source/libs/scheduler/src/schJob.c @@ -0,0 +1,1312 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "catalog.h" +#include "command.h" +#include "query.h" +#include "schedulerInt.h" +#include "tmsg.h" +#include "tref.h" +#include "trpc.h" + +FORCE_INLINE SSchJob *schAcquireJob(int64_t refId) { return (SSchJob *)taosAcquireRef(schMgmt.jobRef, refId); } + +FORCE_INLINE int32_t schReleaseJob(int64_t refId) { return taosReleaseRef(schMgmt.jobRef, refId); } + +int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel) { + pTask->plan = pPlan; + pTask->level = pLevel; + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); + pTask->taskId = schGenTaskId(); + pTask->execNodes = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SSchNodeInfo)); + if (NULL == pTask->execNodes) { + SCH_TASK_ELOG("taosArrayInit %d execNodes failed", SCH_MAX_CANDIDATE_EP_NUM); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schInitJob(SSchJob **pSchJob, SQueryPlan *pDag, void *transport, SArray *pNodeList, const char *sql, + int64_t startTs, bool syncSchedule) { + int32_t code = 0; + int64_t refId = -1; + SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); + if (NULL == pJob) { + qError("QID:%" PRIx64 " calloc %d failed", pDag->queryId, (int32_t)sizeof(SSchJob)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->attr.explainMode = pDag->explainInfo.mode; + pJob->attr.syncSchedule = syncSchedule; + pJob->transport = transport; + pJob->sql = sql; + + if (pNodeList != NULL) { + pJob->nodeList = taosArrayDup(pNodeList); + } + + SCH_ERR_JRET(schValidateAndBuildJob(pDag, pJob)); + + if (SCH_IS_EXPLAIN_JOB(pJob)) { + SCH_ERR_JRET(qExecExplainBegin(pDag, &pJob->explainCtx, startTs)); + } + + pJob->execTasks = + taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); + if (NULL == pJob->execTasks) { + SCH_JOB_ELOG("taosHashInit %d execTasks failed", pDag->numOfSubplans); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->succTasks = + taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); + if (NULL == pJob->succTasks) { + SCH_JOB_ELOG("taosHashInit %d succTasks failed", pDag->numOfSubplans); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->failTasks = + taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); + if (NULL == pJob->failTasks) { + SCH_JOB_ELOG("taosHashInit %d failTasks failed", pDag->numOfSubplans); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + tsem_init(&pJob->rspSem, 0, 0); + + refId = taosAddRef(schMgmt.jobRef, pJob); + if (refId < 0) { + SCH_JOB_ELOG("taosAddRef job failed, error:%s", tstrerror(terrno)); + SCH_ERR_JRET(terrno); + } + + atomic_add_fetch_32(&schMgmt.jobNum, 1); + + if (NULL == schAcquireJob(refId)) { + SCH_JOB_ELOG("schAcquireJob job failed, refId:%" PRIx64, refId); + SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); + } + + pJob->refId = refId; + + SCH_JOB_DLOG("job refId:%" PRIx64, pJob->refId); + + pJob->status = JOB_TASK_STATUS_NOT_START; + + *pSchJob = pJob; + + return TSDB_CODE_SUCCESS; + +_return: + + if (refId < 0) { + schFreeJobImpl(pJob); + } else { + taosRemoveRef(schMgmt.jobRef, refId); + } + SCH_RET(code); +} + +void schFreeTask(SSchTask *pTask) { + if (pTask->candidateAddrs) { + taosArrayDestroy(pTask->candidateAddrs); + } + + taosMemoryFreeClear(pTask->msg); + + if (pTask->children) { + taosArrayDestroy(pTask->children); + } + + if (pTask->parents) { + taosArrayDestroy(pTask->parents); + } + + if (pTask->execNodes) { + taosArrayDestroy(pTask->execNodes); + } +} + +FORCE_INLINE bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus) { + int8_t status = SCH_GET_JOB_STATUS(pJob); + if (pStatus) { + *pStatus = status; + } + + return (status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_CANCELLED || + status == JOB_TASK_STATUS_CANCELLING || status == JOB_TASK_STATUS_DROPPING || + status == JOB_TASK_STATUS_SUCCEED); +} + +int32_t schChkUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { + int32_t code = 0; + + int8_t oriStatus = 0; + + while (true) { + oriStatus = SCH_GET_JOB_STATUS(pJob); + + if (oriStatus == newStatus) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + switch (oriStatus) { + case JOB_TASK_STATUS_NULL: + if (newStatus != JOB_TASK_STATUS_NOT_START) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + break; + case JOB_TASK_STATUS_NOT_START: + if (newStatus != JOB_TASK_STATUS_EXECUTING) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + break; + case JOB_TASK_STATUS_EXECUTING: + if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED && newStatus != JOB_TASK_STATUS_FAILED && + newStatus != JOB_TASK_STATUS_CANCELLING && newStatus != JOB_TASK_STATUS_CANCELLED && + newStatus != JOB_TASK_STATUS_DROPPING) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + break; + case JOB_TASK_STATUS_PARTIAL_SUCCEED: + if (newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_SUCCEED && + newStatus != JOB_TASK_STATUS_DROPPING) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + break; + case JOB_TASK_STATUS_SUCCEED: + case JOB_TASK_STATUS_FAILED: + case JOB_TASK_STATUS_CANCELLING: + if (newStatus != JOB_TASK_STATUS_DROPPING) { + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + break; + case JOB_TASK_STATUS_CANCELLED: + case JOB_TASK_STATUS_DROPPING: + SCH_ERR_JRET(TSDB_CODE_QRY_JOB_FREED); + break; + + default: + SCH_JOB_ELOG("invalid job status:%s", jobTaskStatusStr(oriStatus)); + SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); + } + + if (oriStatus != atomic_val_compare_exchange_8(&pJob->status, oriStatus, newStatus)) { + continue; + } + + SCH_JOB_DLOG("job status updated from %s to %s", jobTaskStatusStr(oriStatus), jobTaskStatusStr(newStatus)); + + break; + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_JOB_ELOG("invalid job status update, from %s to %s", jobTaskStatusStr(oriStatus), jobTaskStatusStr(newStatus)); + SCH_ERR_RET(code); + return TSDB_CODE_SUCCESS; +} + +int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) { + for (int32_t i = 0; i < pJob->levelNum; ++i) { + SSchLevel *pLevel = taosArrayGet(pJob->levels, i); + + for (int32_t m = 0; m < pLevel->taskNum; ++m) { + SSchTask *pTask = taosArrayGet(pLevel->subTasks, m); + SSubplan *pPlan = pTask->plan; + int32_t childNum = pPlan->pChildren ? (int32_t)LIST_LENGTH(pPlan->pChildren) : 0; + int32_t parentNum = pPlan->pParents ? (int32_t)LIST_LENGTH(pPlan->pParents) : 0; + + if (childNum > 0) { + if (pJob->levelIdx == pLevel->level) { + SCH_JOB_ELOG("invalid query plan, lowest level, childNum:%d", childNum); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + pTask->children = taosArrayInit(childNum, POINTER_BYTES); + if (NULL == pTask->children) { + SCH_TASK_ELOG("taosArrayInit %d children failed", childNum); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + } + + for (int32_t n = 0; n < childNum; ++n) { + SSubplan *child = (SSubplan *)nodesListGetNode(pPlan->pChildren, n); + SSchTask **childTask = taosHashGet(planToTask, &child, POINTER_BYTES); + if (NULL == childTask || NULL == *childTask) { + SCH_TASK_ELOG("subplan children relationship error, level:%d, taskIdx:%d, childIdx:%d", i, m, n); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + if (NULL == taosArrayPush(pTask->children, childTask)) { + SCH_TASK_ELOG("taosArrayPush childTask failed, level:%d, taskIdx:%d, childIdx:%d", i, m, n); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("children info, the %d child TID %" PRIx64, n, (*childTask)->taskId); + } + + if (parentNum > 0) { + if (0 == pLevel->level) { + SCH_TASK_ELOG("invalid task info, level:0, parentNum:%d", parentNum); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + pTask->parents = taosArrayInit(parentNum, POINTER_BYTES); + if (NULL == pTask->parents) { + SCH_TASK_ELOG("taosArrayInit %d parents failed", parentNum); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + } else { + if (0 != pLevel->level) { + SCH_TASK_ELOG("invalid task info, level:%d, parentNum:%d", pLevel->level, parentNum); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + } + + for (int32_t n = 0; n < parentNum; ++n) { + SSubplan *parent = (SSubplan *)nodesListGetNode(pPlan->pParents, n); + SSchTask **parentTask = taosHashGet(planToTask, &parent, POINTER_BYTES); + if (NULL == parentTask || NULL == *parentTask) { + SCH_TASK_ELOG("subplan parent relationship error, level:%d, taskIdx:%d, childIdx:%d", i, m, n); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + if (NULL == taosArrayPush(pTask->parents, parentTask)) { + SCH_TASK_ELOG("taosArrayPush parentTask failed, level:%d, taskIdx:%d, childIdx:%d", i, m, n); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("parents info, the %d parent TID %" PRIx64, n, (*parentTask)->taskId); + } + + SCH_TASK_DLOG("level:%d, parentNum:%d, childNum:%d", i, parentNum, childNum); + } + } + + SSchLevel *pLevel = taosArrayGet(pJob->levels, 0); + if (SCH_IS_QUERY_JOB(pJob) && pLevel->taskNum > 1) { + SCH_JOB_ELOG("invalid query plan, level:0, taskNum:%d", pLevel->taskNum); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) { + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + if (NULL == addr) { + SCH_TASK_ELOG("taosArrayGet candidate addr failed, idx:%d, size:%d", pTask->candidateIdx, + (int32_t)taosArrayGetSize(pTask->candidateAddrs)); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + pTask->succeedAddr = *addr; + + return TSDB_CODE_SUCCESS; +} + +int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, void *handle) { + SSchNodeInfo nodeInfo = {.addr = *addr, .handle = handle}; + + if (NULL == taosArrayPush(pTask->execNodes, &nodeInfo)) { + SCH_TASK_ELOG("taosArrayPush nodeInfo to execNodes list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("task execNode recorded, handle:%p", handle); + + return TSDB_CODE_SUCCESS; +} + +int32_t schRecordQueryDataSrc(SSchJob *pJob, SSchTask *pTask) { + if (!SCH_IS_DATA_SRC_QRY_TASK(pTask)) { + return TSDB_CODE_SUCCESS; + } + + taosArrayPush(pJob->dataSrcTasks, &pTask); + + return TSDB_CODE_SUCCESS; +} + + +int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) { + int32_t code = 0; + pJob->queryId = pDag->queryId; + + if (pDag->numOfSubplans <= 0) { + SCH_JOB_ELOG("invalid subplan num:%d", pDag->numOfSubplans); + SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); + } + + pJob->dataSrcTasks = taosArrayInit(pDag->numOfSubplans, POINTER_BYTES); + if (NULL == pJob->dataSrcTasks) { + SCH_ERR_RET(TSDB_CODE_OUT_OF_MEMORY); + } + + int32_t levelNum = (int32_t)LIST_LENGTH(pDag->pSubplans); + if (levelNum <= 0) { + SCH_JOB_ELOG("invalid level num:%d", levelNum); + SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); + } + + SHashObj *planToTask = taosHashInit( + SCHEDULE_DEFAULT_MAX_TASK_NUM, + taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT), false, + HASH_NO_LOCK); + if (NULL == planToTask) { + SCH_JOB_ELOG("taosHashInit %d failed", SCHEDULE_DEFAULT_MAX_TASK_NUM); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->levels = taosArrayInit(levelNum, sizeof(SSchLevel)); + if (NULL == pJob->levels) { + SCH_JOB_ELOG("taosArrayInit %d failed", levelNum); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->levelNum = levelNum; + pJob->levelIdx = levelNum - 1; + + pJob->subPlans = pDag->pSubplans; + + SSchLevel level = {0}; + SNodeListNode *plans = NULL; + int32_t taskNum = 0; + SSchLevel *pLevel = NULL; + + level.status = JOB_TASK_STATUS_NOT_START; + + for (int32_t i = 0; i < levelNum; ++i) { + if (NULL == taosArrayPush(pJob->levels, &level)) { + SCH_JOB_ELOG("taosArrayPush level failed, level:%d", i); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pLevel = taosArrayGet(pJob->levels, i); + pLevel->level = i; + + plans = (SNodeListNode *)nodesListGetNode(pDag->pSubplans, i); + if (NULL == plans) { + SCH_JOB_ELOG("empty level plan, level:%d", i); + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + taskNum = (int32_t)LIST_LENGTH(plans->pNodeList); + if (taskNum <= 0) { + SCH_JOB_ELOG("invalid level plan number:%d, level:%d", taskNum, i); + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + pLevel->taskNum = taskNum; + + pLevel->subTasks = taosArrayInit(taskNum, sizeof(SSchTask)); + if (NULL == pLevel->subTasks) { + SCH_JOB_ELOG("taosArrayInit %d failed", taskNum); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + for (int32_t n = 0; n < taskNum; ++n) { + SSubplan *plan = (SSubplan *)nodesListGetNode(plans->pNodeList, n); + + SCH_SET_JOB_TYPE(pJob, plan->subplanType); + + SSchTask task = {0}; + SSchTask *pTask = &task; + + SCH_ERR_JRET(schInitTask(pJob, &task, plan, pLevel)); + + void *p = taosArrayPush(pLevel->subTasks, &task); + if (NULL == p) { + SCH_TASK_ELOG("taosArrayPush task to level failed, level:%d, taskIdx:%d", pLevel->level, n); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_ERR_JRET(schRecordQueryDataSrc(pJob, p)); + + if (0 != taosHashPut(planToTask, &plan, POINTER_BYTES, &p, POINTER_BYTES)) { + SCH_TASK_ELOG("taosHashPut to planToTaks failed, taskIdx:%d", n); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + ++pJob->taskNum; + } + + SCH_JOB_DLOG("level initialized, taskNum:%d", taskNum); + } + + SCH_ERR_JRET(schBuildTaskRalation(pJob, planToTask)); + +_return: + if (planToTask) { + taosHashCleanup(planToTask); + } + + SCH_RET(code); +} + +int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) { + if (NULL != pTask->candidateAddrs) { + return TSDB_CODE_SUCCESS; + } + + pTask->candidateIdx = 0; + pTask->candidateAddrs = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SQueryNodeAddr)); + if (NULL == pTask->candidateAddrs) { + SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCH_MAX_CANDIDATE_EP_NUM); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + if (pTask->plan->execNode.epSet.numOfEps > 0) { + if (NULL == taosArrayPush(pTask->candidateAddrs, &pTask->plan->execNode)) { + SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("use execNode from plan as candidate addr, numOfEps:%d", pTask->plan->execNode.epSet.numOfEps); + + return TSDB_CODE_SUCCESS; + } + + int32_t addNum = 0; + int32_t nodeNum = 0; + if (pJob->nodeList) { + nodeNum = taosArrayGetSize(pJob->nodeList); + + for (int32_t i = 0; i < nodeNum && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { + SQueryNodeAddr *naddr = taosArrayGet(pJob->nodeList, i); + + if (NULL == taosArrayPush(pTask->candidateAddrs, naddr)) { + SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, addNum:%d, errno:%d", addNum, errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + ++addNum; + } + } + + if (addNum <= 0) { + SCH_TASK_ELOG("no available execNode as candidates, nodeNum:%d", nodeNum); + SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); + } + + /* + for (int32_t i = 0; i < job->dataSrcEps.numOfEps && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { + strncpy(epSet->fqdn[epSet->numOfEps], job->dataSrcEps.fqdn[i], sizeof(job->dataSrcEps.fqdn[i])); + epSet->port[epSet->numOfEps] = job->dataSrcEps.port[i]; + + ++epSet->numOfEps; + } + */ + + return TSDB_CODE_SUCCESS; +} + +int32_t schRemoveTaskFromExecList(SSchJob *pJob, SSchTask *pTask) { + int32_t code = taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId)); + if (code) { + SCH_TASK_ELOG("task failed to rm from execTask list, code:%x", code); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + return TSDB_CODE_SUCCESS; +} + + +int32_t schPushTaskToExecList(SSchJob *pJob, SSchTask *pTask) { + int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + SCH_TASK_ELOG("task already in execTask list, code:%x", code); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_TASK_DLOG("task added to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schMoveTaskToSuccList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } else { + SCH_TASK_DLOG("task removed from execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + } + + int32_t code = taosHashPut(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + SCH_TASK_ELOG("task already in succTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to succTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to succTask list, numOfTasks:%d", taosHashGetSize(pJob->succTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schMoveTaskToFailList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + *moved = false; + + if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } + + int32_t code = taosHashPut(pJob->failTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + + SCH_TASK_WLOG("task already in failTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to failTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to failTask list, numOfTasks:%d", taosHashGetSize(pJob->failTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) { + if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) { + SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + } + + int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); + if (0 != code) { + if (HASH_NODE_EXIST(code)) { + *moved = true; + + SCH_TASK_ELOG("task already in execTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + *moved = true; + + SCH_TASK_DLOG("task moved to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bool *needRetry) { + int8_t status = 0; + ++pTask->tryTimes; + + if (schJobNeedToStop(pJob, &status)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry cause of job status, job status:%s", jobTaskStatusStr(status)); + return TSDB_CODE_SUCCESS; + } + + if (pTask->tryTimes >= REQUEST_MAX_TRY_TIMES) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since reach max try times, tryTimes:%d", pTask->tryTimes); + return TSDB_CODE_SUCCESS; + } + + if (!NEED_SCHEDULER_RETRY_ERROR(errCode)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry cause of errCode, errCode:%x - %s", errCode, tstrerror(errCode)); + return TSDB_CODE_SUCCESS; + } + + // TODO CHECK epList/condidateList + if (SCH_IS_DATA_SRC_TASK(pTask)) { + if (pTask->tryTimes >= SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since all ep tried, tryTimes:%d, epNum:%d", pTask->tryTimes, + SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)); + return TSDB_CODE_SUCCESS; + } + } else { + int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); + + if ((pTask->candidateIdx + 1) >= candidateNum) { + *needRetry = false; + SCH_TASK_DLOG("task no more retry since all candiates tried, candidateIdx:%d, candidateNum:%d", + pTask->candidateIdx, candidateNum); + return TSDB_CODE_SUCCESS; + } + } + + *needRetry = true; + SCH_TASK_DLOG("task need the %dth retry, errCode:%x - %s", pTask->tryTimes, errCode, tstrerror(errCode)); + + return TSDB_CODE_SUCCESS; +} + +int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) { + atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); + + SCH_ERR_RET(schRemoveTaskFromExecList(pJob, pTask)); + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); + + if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { + SCH_ERR_RET(schDecTaskFlowQuota(pJob, pTask)); + SCH_ERR_RET(schLaunchTasksInFlowCtrlList(pJob, pTask)); + } + + if (SCH_IS_DATA_SRC_TASK(pTask)) { + SCH_SWITCH_EPSET(&pTask->plan->execNode); + } else { + ++pTask->candidateIdx; + } + + SCH_ERR_RET(schLaunchTask(pJob, pTask)); + + return TSDB_CODE_SUCCESS; +} + +void schUpdateJobErrCode(SSchJob *pJob, int32_t errCode) { + if (TSDB_CODE_SUCCESS == errCode) { + return; + } + + int32_t origCode = atomic_load_32(&pJob->errCode); + if (TSDB_CODE_SUCCESS == origCode) { + if (origCode == atomic_val_compare_exchange_32(&pJob->errCode, origCode, errCode)) { + goto _return; + } + + origCode = atomic_load_32(&pJob->errCode); + } + + if (NEED_CLIENT_HANDLE_ERROR(origCode)) { + return; + } + + if (NEED_CLIENT_HANDLE_ERROR(errCode)) { + atomic_store_32(&pJob->errCode, errCode); + goto _return; + } + + return; + +_return: + + SCH_JOB_DLOG("job errCode updated to %x - %s", errCode, tstrerror(errCode)); +} + +int32_t schProcessOnJobFailureImpl(SSchJob *pJob, int32_t status, int32_t errCode) { + // if already FAILED, no more processing + SCH_ERR_RET(schChkUpdateJobStatus(pJob, status)); + + schUpdateJobErrCode(pJob, errCode); + + if (atomic_load_8(&pJob->userFetch) || pJob->attr.syncSchedule) { + tsem_post(&pJob->rspSem); + } + + int32_t code = atomic_load_32(&pJob->errCode); + + SCH_JOB_DLOG("job failed with error: %s", tstrerror(code)); + + SCH_RET(code); +} + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnJobFailure(SSchJob *pJob, int32_t errCode) { + SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAILED, errCode)); +} + +// Note: no more error processing, handled in function internal +int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode) { + SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROPPING, errCode)); +} + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) { + int32_t code = 0; + + SCH_ERR_RET(schChkUpdateJobStatus(pJob, JOB_TASK_STATUS_PARTIAL_SUCCEED)); + + if (pJob->attr.syncSchedule) { + tsem_post(&pJob->rspSem); + } + + if (atomic_load_8(&pJob->userFetch)) { + SCH_ERR_JRET(schFetchFromRemote(pJob)); + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnJobFailure(pJob, code)); +} + +void schProcessOnDataFetched(SSchJob *job) { + atomic_val_compare_exchange_32(&job->remoteFetch, 1, 0); + tsem_post(&job->rspSem); +} + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode) { + int8_t status = 0; + + if (schJobNeedToStop(pJob, &status)) { + SCH_TASK_DLOG("task failed not processed cause of job status, job status:%s", jobTaskStatusStr(status)); + SCH_RET(atomic_load_32(&pJob->errCode)); + } + + bool needRetry = false; + bool moved = false; + int32_t taskDone = 0; + int32_t code = 0; + + SCH_TASK_DLOG("taskOnFailure, code:%s", tstrerror(errCode)); + + SCH_ERR_JRET(schTaskCheckSetRetry(pJob, pTask, errCode, &needRetry)); + + if (!needRetry) { + SCH_TASK_ELOG("task failed and no more retry, code:%s", tstrerror(errCode)); + + if (SCH_GET_TASK_STATUS(pTask) == JOB_TASK_STATUS_EXECUTING) { + SCH_ERR_JRET(schMoveTaskToFailList(pJob, pTask, &moved)); + } else { + SCH_TASK_ELOG("task not in executing list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_FAILED); + + if (SCH_IS_WAIT_ALL_JOB(pJob)) { + SCH_LOCK(SCH_WRITE, &pTask->level->lock); + pTask->level->taskFailed++; + taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; + SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); + + schUpdateJobErrCode(pJob, errCode); + + if (taskDone < pTask->level->taskNum) { + SCH_TASK_DLOG("need to wait other tasks, doneNum:%d, allNum:%d", taskDone, pTask->level->taskNum); + SCH_RET(errCode); + } + } + } else { + SCH_ERR_JRET(schHandleTaskRetry(pJob, pTask)); + + return TSDB_CODE_SUCCESS; + } + +_return: + + SCH_RET(schProcessOnJobFailure(pJob, errCode)); +} + +int32_t schLaunchNextLevelTasks(SSchJob *pJob, SSchTask *pTask) { + if (!SCH_IS_QUERY_JOB(pJob)) { + return TSDB_CODE_SUCCESS; + } + + SSchLevel *pLevel = pTask->level; + int32_t doneNum = atomic_add_fetch_32(&pLevel->taskDoneNum, 1); + if (doneNum == pLevel->taskNum) { + pJob->levelIdx--; + + pLevel = taosArrayGet(pJob->levels, pJob->levelIdx); + for (int32_t i = 0; i < pLevel->taskNum; ++i) { + SSchTask *pTask = taosArrayGet(pLevel->subTasks, i); + + if (pTask->children && taosArrayGetSize(pTask->children) > 0) { + continue; + } + + SCH_ERR_RET(schLaunchTask(pJob, pTask)); + } + } + + return TSDB_CODE_SUCCESS; +} + + +// Note: no more task error processing, handled in function internal +int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { + bool moved = false; + int32_t code = 0; + + SCH_TASK_DLOG("taskOnSuccess, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + + SCH_ERR_JRET(schMoveTaskToSuccList(pJob, pTask, &moved)); + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_PARTIAL_SUCCEED); + + SCH_ERR_JRET(schRecordTaskSucceedNode(pJob, pTask)); + + SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); + + int32_t parentNum = pTask->parents ? (int32_t)taosArrayGetSize(pTask->parents) : 0; + if (parentNum == 0) { + int32_t taskDone = 0; + if (SCH_IS_WAIT_ALL_JOB(pJob)) { + SCH_LOCK(SCH_WRITE, &pTask->level->lock); + pTask->level->taskSucceed++; + taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; + SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); + + if (taskDone < pTask->level->taskNum) { + SCH_TASK_DLOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum); + return TSDB_CODE_SUCCESS; + } else if (taskDone > pTask->level->taskNum) { + SCH_TASK_ELOG("taskDone number invalid, done:%d, total:%d", taskDone, pTask->level->taskNum); + } + + if (pTask->level->taskFailed > 0) { + SCH_RET(schProcessOnJobFailure(pJob, 0)); + } else { + SCH_RET(schProcessOnJobPartialSuccess(pJob)); + } + } else { + pJob->resNode = pTask->succeedAddr; + } + + pJob->fetchTask = pTask; + + SCH_ERR_JRET(schMoveTaskToExecList(pJob, pTask, &moved)); + + SCH_RET(schProcessOnJobPartialSuccess(pJob)); + } + + /* + if (SCH_IS_DATA_SRC_TASK(task) && job->dataSrcEps.numOfEps < SCH_MAX_CANDIDATE_EP_NUM) { + strncpy(job->dataSrcEps.fqdn[job->dataSrcEps.numOfEps], task->execAddr.fqdn, sizeof(task->execAddr.fqdn)); + job->dataSrcEps.port[job->dataSrcEps.numOfEps] = task->execAddr.port; + + ++job->dataSrcEps.numOfEps; + } + */ + + for (int32_t i = 0; i < parentNum; ++i) { + SSchTask *par = *(SSchTask **)taosArrayGet(pTask->parents, i); + int32_t readyNum = atomic_add_fetch_32(&par->childReady, 1); + + SCH_LOCK(SCH_WRITE, &par->lock); + SDownstreamSourceNode source = {.type = QUERY_NODE_DOWNSTREAM_SOURCE, + .taskId = pTask->taskId, + .schedId = schMgmt.sId, + .addr = pTask->succeedAddr}; + qSetSubplanExecutionNode(par->plan, pTask->plan->id.groupId, &source); + SCH_UNLOCK(SCH_WRITE, &par->lock); + + if (SCH_TASK_READY_FOR_LAUNCH(readyNum, par)) { + SCH_ERR_RET(schLaunchTask(pJob, par)); + } + } + + SCH_ERR_RET(schLaunchNextLevelTasks(pJob, pTask)); + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnJobFailure(pJob, code)); +} + +// Note: no more error processing, handled in function internal +int32_t schFetchFromRemote(SSchJob *pJob) { + int32_t code = 0; + + if (atomic_val_compare_exchange_32(&pJob->remoteFetch, 0, 1) != 0) { + SCH_JOB_ELOG("prior fetching not finished, remoteFetch:%d", atomic_load_32(&pJob->remoteFetch)); + return TSDB_CODE_SUCCESS; + } + + void *resData = atomic_load_ptr(&pJob->resData); + if (resData) { + atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); + + SCH_JOB_DLOG("res already fetched, res:%p", resData); + return TSDB_CODE_SUCCESS; + } + + SCH_ERR_JRET(schBuildAndSendMsg(pJob, pJob->fetchTask, &pJob->resNode, TDMT_VND_FETCH)); + + return TSDB_CODE_SUCCESS; + +_return: + + atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); + + SCH_RET(schProcessOnTaskFailure(pJob, pJob->fetchTask, code)); +} + +int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp) { + SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed); + + atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows)); + atomic_store_ptr(&pJob->resData, pRsp); + + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); + + schProcessOnDataFetched(pJob); + + return TSDB_CODE_SUCCESS; +} + +int32_t schSaveJobQueryRes(SSchJob *pJob, SResReadyRsp *rsp) { + if (rsp->tbFName[0]) { + if (NULL == pJob->queryRes) { + pJob->queryRes = taosArrayInit(pJob->taskNum, sizeof(STbVerInfo)); + if (NULL == pJob->queryRes) { + SCH_ERR_RET(TSDB_CODE_OUT_OF_MEMORY); + } + } + + STbVerInfo tbInfo; + strcpy(tbInfo.tbFName, rsp->tbFName); + tbInfo.sversion = rsp->sversion; + tbInfo.tversion = rsp->tversion; + + taosArrayPush((SArray *)pJob->queryRes, &tbInfo); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schGetTaskFromTaskList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask) { + int32_t s = taosHashGetSize(pTaskList); + if (s <= 0) { + return TSDB_CODE_SUCCESS; + } + + SSchTask **task = taosHashGet(pTaskList, &taskId, sizeof(taskId)); + if (NULL == task || NULL == (*task)) { + return TSDB_CODE_SUCCESS; + } + + *pTask = *task; + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateTaskExecNodeHandle(SSchTask *pTask, void *handle, int32_t rspCode) { + if (rspCode || NULL == pTask->execNodes || taosArrayGetSize(pTask->execNodes) > 1 || + taosArrayGetSize(pTask->execNodes) <= 0) { + return TSDB_CODE_SUCCESS; + } + + SSchNodeInfo *nodeInfo = taosArrayGet(pTask->execNodes, 0); + nodeInfo->handle = handle; + + return TSDB_CODE_SUCCESS; +} + +int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { + int8_t status = 0; + int32_t code = 0; + + atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); + + if (schJobNeedToStop(pJob, &status)) { + SCH_TASK_DLOG("no need to launch task cause of job status, job status:%s", jobTaskStatusStr(status)); + + SCH_RET(atomic_load_32(&pJob->errCode)); + } + + // NOTE: race condition: the task should be put into the hash table before send msg to server + if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING) { + SCH_ERR_RET(schPushTaskToExecList(pJob, pTask)); + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_EXECUTING); + } + + SSubplan *plan = pTask->plan; + + if (NULL == pTask->msg) { // TODO add more detailed reason for failure + code = qSubPlanToString(plan, &pTask->msg, &pTask->msgLen); + if (TSDB_CODE_SUCCESS != code) { + SCH_TASK_ELOG("failed to create physical plan, code:%s, msg:%p, len:%d", tstrerror(code), pTask->msg, + pTask->msgLen); + SCH_ERR_RET(code); + } else { + SCH_TASK_DLOGL("physical plan len:%d, %s", pTask->msgLen, pTask->msg); + } + } + + SCH_ERR_RET(schSetTaskCandidateAddrs(pJob, pTask)); + + if (SCH_IS_QUERY_JOB(pJob)) { + SCH_ERR_RET(schEnsureHbConnection(pJob, pTask)); + } + + SCH_ERR_RET(schBuildAndSendMsg(pJob, pTask, NULL, plan->msgType)); + + return TSDB_CODE_SUCCESS; +} + +// Note: no more error processing, handled in function internal +int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) { + bool enough = false; + int32_t code = 0; + + SCH_SET_TASK_HANDLE(pTask, NULL); + + if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { + SCH_ERR_JRET(schCheckIncTaskFlowQuota(pJob, pTask, &enough)); + + if (enough) { + SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); + } + } else { + SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); +} + +int32_t schLaunchLevelTasks(SSchJob *pJob, SSchLevel *level) { + for (int32_t i = 0; i < level->taskNum; ++i) { + SSchTask *pTask = taosArrayGet(level->subTasks, i); + + SCH_ERR_RET(schLaunchTask(pJob, pTask)); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schLaunchJob(SSchJob *pJob) { + SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx); + + SCH_ERR_RET(schChkUpdateJobStatus(pJob, JOB_TASK_STATUS_EXECUTING)); + + SCH_ERR_RET(schChkJobNeedFlowCtrl(pJob, level)); + + SCH_ERR_RET(schLaunchLevelTasks(pJob, level)); + + return TSDB_CODE_SUCCESS; +} + +void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) { + if (NULL == pTask->execNodes) { + SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + return; + } + + int32_t size = (int32_t)taosArrayGetSize(pTask->execNodes); + + if (size <= 0) { + SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); + return; + } + + SSchNodeInfo *nodeInfo = NULL; + for (int32_t i = 0; i < size; ++i) { + nodeInfo = (SSchNodeInfo *)taosArrayGet(pTask->execNodes, i); + SCH_SET_TASK_HANDLE(pTask, nodeInfo->handle); + + schBuildAndSendMsg(pJob, pTask, &nodeInfo->addr, TDMT_VND_DROP_TASK); + } + + SCH_TASK_DLOG("task has %d exec address", size); +} + +void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) { + if (!SCH_IS_NEED_DROP_JOB(pJob)) { + return; + } + + void *pIter = taosHashIterate(list, NULL); + while (pIter) { + SSchTask *pTask = *(SSchTask **)pIter; + + schDropTaskOnExecNode(pJob, pTask); + + pIter = taosHashIterate(list, pIter); + } +} + +void schDropJobAllTasks(SSchJob *pJob) { + schDropTaskInHashList(pJob, pJob->execTasks); + schDropTaskInHashList(pJob, pJob->succTasks); + schDropTaskInHashList(pJob, pJob->failTasks); +} + +int32_t schCancelJob(SSchJob *pJob) { + // TODO + return TSDB_CODE_SUCCESS; + // TODO MOVE ALL TASKS FROM EXEC LIST TO FAIL LIST +} + +void schFreeJobImpl(void *job) { + if (NULL == job) { + return; + } + + SSchJob *pJob = job; + uint64_t queryId = pJob->queryId; + int64_t refId = pJob->refId; + + if (pJob->status == JOB_TASK_STATUS_EXECUTING) { + schCancelJob(pJob); + } + + schDropJobAllTasks(pJob); + + pJob->subPlans = NULL; // it is a reference to pDag->pSubplans + + int32_t numOfLevels = taosArrayGetSize(pJob->levels); + for (int32_t i = 0; i < numOfLevels; ++i) { + SSchLevel *pLevel = taosArrayGet(pJob->levels, i); + + int32_t numOfTasks = taosArrayGetSize(pLevel->subTasks); + for (int32_t j = 0; j < numOfTasks; ++j) { + SSchTask *pTask = taosArrayGet(pLevel->subTasks, j); + schFreeTask(pTask); + } + + taosArrayDestroy(pLevel->subTasks); + } + + schFreeFlowCtrl(pJob); + + taosHashCleanup(pJob->execTasks); + taosHashCleanup(pJob->failTasks); + taosHashCleanup(pJob->succTasks); + + taosArrayDestroy(pJob->levels); + taosArrayDestroy(pJob->nodeList); + taosArrayDestroy(pJob->dataSrcTasks); + + qExplainFreeCtx(pJob->explainCtx); + + if (SCH_IS_QUERY_JOB(pJob)) { + taosArrayDestroy((SArray *)pJob->queryRes); + } else { + tFreeSSubmitRsp((SSubmitRsp*)pJob->queryRes); + } + + taosMemoryFreeClear(pJob->resData); + taosMemoryFreeClear(pJob); + + qDebug("QID:0x%" PRIx64 " job freed, refId:%" PRIx64 ", pointer:%p", queryId, refId, pJob); + + atomic_sub_fetch_32(&schMgmt.jobNum, 1); + + schCloseJobRef(); +} + +int32_t schExecJobImpl(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, + int64_t startTs, bool sync) { + qDebug("QID:0x%" PRIx64 " job started", pDag->queryId); + + if (pNodeList == NULL || taosArrayGetSize(pNodeList) <= 0) { + qDebug("QID:0x%" PRIx64 " input exec nodeList is empty", pDag->queryId); + } + + int32_t code = 0; + SSchJob *pJob = NULL; + SCH_ERR_JRET(schInitJob(&pJob, pDag, transport, pNodeList, sql, startTs, sync)); + + SCH_ERR_JRET(schLaunchJob(pJob)); + + *job = pJob->refId; + + if (sync) { + SCH_JOB_DLOG("will wait for rsp now, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); + tsem_wait(&pJob->rspSem); + } + + SCH_JOB_DLOG("job exec done, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); + + schReleaseJob(pJob->refId); + + return TSDB_CODE_SUCCESS; + +_return: + + schFreeJobImpl(pJob); + SCH_RET(code); +} + +int32_t schExecStaticExplain(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, + bool syncSchedule) { + qDebug("QID:0x%" PRIx64 " job started", pDag->queryId); + + int32_t code = 0; + SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); + if (NULL == pJob) { + qError("QID:%" PRIx64 " calloc %d failed", pDag->queryId, (int32_t)sizeof(SSchJob)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pJob->sql = sql; + pJob->attr.queryJob = true; + pJob->attr.explainMode = pDag->explainInfo.mode; + pJob->queryId = pDag->queryId; + pJob->subPlans = pDag->pSubplans; + + SCH_ERR_JRET(qExecStaticExplain(pDag, (SRetrieveTableRsp **)&pJob->resData)); + + int64_t refId = taosAddRef(schMgmt.jobRef, pJob); + if (refId < 0) { + SCH_JOB_ELOG("taosAddRef job failed, error:%s", tstrerror(terrno)); + SCH_ERR_JRET(terrno); + } + + if (NULL == schAcquireJob(refId)) { + SCH_JOB_ELOG("schAcquireJob job failed, refId:%" PRIx64, refId); + SCH_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + pJob->refId = refId; + + SCH_JOB_DLOG("job refId:%" PRIx64, pJob->refId); + + pJob->status = JOB_TASK_STATUS_PARTIAL_SUCCEED; + *job = pJob->refId; + SCH_JOB_DLOG("job exec done, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); + + schReleaseJob(pJob->refId); + + return TSDB_CODE_SUCCESS; + +_return: + + schFreeJobImpl(pJob); + SCH_RET(code); +} + + diff --git a/source/libs/scheduler/src/schRemote.c b/source/libs/scheduler/src/schRemote.c new file mode 100644 index 0000000000..6d9f6b435f --- /dev/null +++ b/source/libs/scheduler/src/schRemote.c @@ -0,0 +1,1231 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "catalog.h" +#include "command.h" +#include "query.h" +#include "schedulerInt.h" +#include "tmsg.h" +#include "tref.h" +#include "trpc.h" + + +int32_t schValidateReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgType) { + int32_t lastMsgType = SCH_GET_TASK_LASTMSG_TYPE(pTask); + int32_t taskStatus = SCH_GET_TASK_STATUS(pTask); + int32_t reqMsgType = msgType - 1; + switch (msgType) { + case TDMT_SCH_LINK_BROKEN: + case TDMT_VND_EXPLAIN_RSP: + return TSDB_CODE_SUCCESS; + case TDMT_VND_QUERY_RSP: // query_rsp may be processed later than ready_rsp + if (lastMsgType != reqMsgType && -1 != lastMsgType && TDMT_VND_FETCH != lastMsgType) { + SCH_TASK_DLOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), + TMSG_INFO(msgType)); + } + + if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + SCH_TASK_DLOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), + TMSG_INFO(msgType)); + } + + SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); + return TSDB_CODE_SUCCESS; + case TDMT_VND_RES_READY_RSP: + reqMsgType = TDMT_VND_QUERY; + if (lastMsgType != reqMsgType && -1 != lastMsgType) { + SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", + (lastMsgType > 0 ? TMSG_INFO(lastMsgType) : "null"), TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), + TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); + return TSDB_CODE_SUCCESS; + case TDMT_VND_FETCH_RSP: + if (lastMsgType != reqMsgType && -1 != lastMsgType) { + SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), + TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), + TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); + return TSDB_CODE_SUCCESS; + case TDMT_VND_CREATE_TABLE_RSP: + case TDMT_VND_DROP_TABLE_RSP: + case TDMT_VND_ALTER_TABLE_RSP: + case TDMT_VND_SUBMIT_RSP: + break; + default: + SCH_TASK_ELOG("unknown rsp msg, type:%s, status:%s", TMSG_INFO(msgType), jobTaskStatusStr(taskStatus)); + SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); + } + + if (lastMsgType != reqMsgType) { + SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), + TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { + SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), + TMSG_INFO(msgType)); + SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); + + return TSDB_CODE_SUCCESS; +} + +// Note: no more task error processing, handled in function internal +int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, char *msg, int32_t msgSize, + int32_t rspCode) { + int32_t code = 0; + int8_t status = 0; + + if (schJobNeedToStop(pJob, &status)) { + SCH_TASK_ELOG("rsp not processed cause of job status, job status:%s, rspCode:0x%x", jobTaskStatusStr(status), + rspCode); + SCH_RET(atomic_load_32(&pJob->errCode)); + } + + SCH_ERR_JRET(schValidateReceivedMsgType(pJob, pTask, msgType)); + + switch (msgType) { + case TDMT_VND_CREATE_TABLE_RSP: { + SVCreateTbBatchRsp batchRsp = {0}; + if (msg) { + SDecoder coder = {0}; + tDecoderInit(&coder, msg, msgSize); + code = tDecodeSVCreateTbBatchRsp(&coder, &batchRsp); + if (TSDB_CODE_SUCCESS == code && batchRsp.nRsps > 0) { + for (int32_t i = 0; i < batchRsp.nRsps; ++i) { + SVCreateTbRsp *rsp = batchRsp.pRsps + i; + if (TSDB_CODE_SUCCESS != rsp->code) { + code = rsp->code; + tDecoderClear(&coder); + SCH_ERR_JRET(code); + } + } + } + tDecoderClear(&coder); + SCH_ERR_JRET(code); + } + + SCH_ERR_JRET(rspCode); + SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); + break; + } + case TDMT_VND_DROP_TABLE_RSP: { + SVDropTbBatchRsp batchRsp = {0}; + if (msg) { + SDecoder coder = {0}; + tDecoderInit(&coder, msg, msgSize); + code = tDecodeSVDropTbBatchRsp(&coder, &batchRsp); + if (TSDB_CODE_SUCCESS == code && batchRsp.nRsps > 0) { + for (int32_t i = 0; i < batchRsp.nRsps; ++i) { + SVDropTbRsp *rsp = batchRsp.pRsps + i; + if (TSDB_CODE_SUCCESS != rsp->code) { + code = rsp->code; + tDecoderClear(&coder); + SCH_ERR_JRET(code); + } + } + } + tDecoderClear(&coder); + SCH_ERR_JRET(code); + } + + SCH_ERR_JRET(rspCode); + SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); + break; + } + case TDMT_VND_ALTER_TABLE_RSP: { + SVAlterTbRsp rsp = {0}; + if (msg) { + SDecoder coder = {0}; + tDecoderInit(&coder, msg, msgSize); + code = tDecodeSVAlterTbRsp(&coder, &rsp); + tDecoderClear(&coder); + SCH_ERR_JRET(code); + SCH_ERR_JRET(rsp.code); + } + + SCH_ERR_JRET(rspCode); + + if (NULL == msg) { + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); + break; + } + case TDMT_VND_SUBMIT_RSP: { + SCH_ERR_JRET(rspCode); + + if (msg) { + SDecoder coder = {0}; + SSubmitRsp *rsp = taosMemoryMalloc(sizeof(*rsp)); + tDecoderInit(&coder, msg, msgSize); + code = tDecodeSSubmitRsp(&coder, rsp); + if (code) { + SCH_TASK_ELOG("decode submitRsp failed, code:%d", code); + tFreeSSubmitRsp(rsp); + SCH_ERR_JRET(code); + } + + if (rsp->nBlocks > 0) { + for (int32_t i = 0; i < rsp->nBlocks; ++i) { + SSubmitBlkRsp *blk = rsp->pBlocks + i; + if (TSDB_CODE_SUCCESS != blk->code) { + code = blk->code; + tFreeSSubmitRsp(rsp); + SCH_ERR_JRET(code); + } + } + } + + atomic_add_fetch_32(&pJob->resNumOfRows, rsp->affectedRows); + SCH_TASK_DLOG("submit succeed, affectedRows:%d", rsp->affectedRows); + + SCH_LOCK(SCH_WRITE, &pJob->resLock); + if (pJob->queryRes) { + SSubmitRsp *sum = pJob->queryRes; + sum->affectedRows += rsp->affectedRows; + sum->nBlocks += rsp->nBlocks; + sum->pBlocks = taosMemoryRealloc(sum->pBlocks, sum->nBlocks * sizeof(*sum->pBlocks)); + memcpy(sum->pBlocks + sum->nBlocks - rsp->nBlocks, rsp->pBlocks, rsp->nBlocks * sizeof(*sum->pBlocks)); + taosMemoryFree(rsp->pBlocks); + taosMemoryFree(rsp); + } else { + pJob->queryRes = rsp; + } + SCH_UNLOCK(SCH_WRITE, &pJob->resLock); + } + + SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); + + break; + } + case TDMT_VND_QUERY_RSP: { + SQueryTableRsp rsp = {0}; + if (msg) { + SCH_ERR_JRET(tDeserializeSQueryTableRsp(msg, msgSize, &rsp)); + SCH_ERR_JRET(rsp.code); + } + + SCH_ERR_JRET(rspCode); + + if (NULL == msg) { + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + // SCH_ERR_JRET(schBuildAndSendMsg(pJob, pTask, NULL, TDMT_VND_RES_READY)); + + break; + } + case TDMT_VND_RES_READY_RSP: { + SResReadyRsp *rsp = (SResReadyRsp *)msg; + + SCH_ERR_JRET(rspCode); + if (NULL == msg) { + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + SCH_ERR_JRET(rsp->code); + + SCH_ERR_JRET(schSaveJobQueryRes(pJob, rsp)); + + SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); + + break; + } + case TDMT_VND_EXPLAIN_RSP: { + SCH_ERR_JRET(rspCode); + if (NULL == msg) { + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + if (!SCH_IS_EXPLAIN_JOB(pJob)) { + SCH_TASK_ELOG("invalid msg received for none explain query, msg type:%s", TMSG_INFO(msgType)); + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + if (pJob->resData) { + SCH_TASK_ELOG("explain result is already generated, res:%p", pJob->resData); + SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); + } + + SExplainRsp rsp = {0}; + if (tDeserializeSExplainRsp(msg, msgSize, &rsp)) { + taosMemoryFree(rsp.subplanInfo); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SRetrieveTableRsp *pRsp = NULL; + SCH_ERR_JRET(qExplainUpdateExecInfo(pJob->explainCtx, &rsp, pTask->plan->id.groupId, &pRsp)); + + if (pRsp) { + SCH_ERR_JRET(schProcessOnExplainDone(pJob, pTask, pRsp)); + } + break; + } + case TDMT_VND_FETCH_RSP: { + SRetrieveTableRsp *rsp = (SRetrieveTableRsp *)msg; + + SCH_ERR_JRET(rspCode); + if (NULL == msg) { + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + if (SCH_IS_EXPLAIN_JOB(pJob)) { + if (rsp->completed) { + SRetrieveTableRsp *pRsp = NULL; + SCH_ERR_JRET(qExecExplainEnd(pJob->explainCtx, &pRsp)); + if (pRsp) { + SCH_ERR_JRET(schProcessOnExplainDone(pJob, pTask, pRsp)); + } + + return TSDB_CODE_SUCCESS; + } + + atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); + + SCH_ERR_JRET(schFetchFromRemote(pJob)); + + return TSDB_CODE_SUCCESS; + } + + if (pJob->resData) { + SCH_TASK_ELOG("got fetch rsp while res already exists, res:%p", pJob->resData); + taosMemoryFreeClear(rsp); + SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); + } + + atomic_store_ptr(&pJob->resData, rsp); + atomic_add_fetch_32(&pJob->resNumOfRows, htonl(rsp->numOfRows)); + + if (rsp->completed) { + SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); + } + + SCH_TASK_DLOG("got fetch rsp, rows:%d, complete:%d", htonl(rsp->numOfRows), rsp->completed); + + schProcessOnDataFetched(pJob); + break; + } + case TDMT_VND_DROP_TASK_RSP: { + // SHOULD NEVER REACH HERE + SCH_TASK_ELOG("invalid status to handle drop task rsp, refId:%" PRIx64, pJob->refId); + SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); + break; + } + case TDMT_SCH_LINK_BROKEN: + SCH_TASK_ELOG("link broken received, error:%x - %s", rspCode, tstrerror(rspCode)); + SCH_ERR_JRET(rspCode); + break; + default: + SCH_TASK_ELOG("unknown rsp msg, type:%d, status:%s", msgType, SCH_GET_TASK_STATUS_STR(pTask)); + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); +} + + +int32_t schHandleCallback(void *param, const SDataBuf *pMsg, int32_t msgType, int32_t rspCode) { + int32_t code = 0; + SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; + SSchTask *pTask = NULL; + + SSchJob *pJob = schAcquireJob(pParam->refId); + if (NULL == pJob) { + qWarn("QID:0x%" PRIx64 ",TID:0x%" PRIx64 "taosAcquireRef job failed, may be dropped, refId:%" PRIx64, + pParam->queryId, pParam->taskId, pParam->refId); + SCH_ERR_JRET(TSDB_CODE_QRY_JOB_FREED); + } + + schGetTaskFromTaskList(pJob->execTasks, pParam->taskId, &pTask); + if (NULL == pTask) { + if (TDMT_VND_EXPLAIN_RSP == msgType) { + schGetTaskFromTaskList(pJob->succTasks, pParam->taskId, &pTask); + } else { + SCH_JOB_ELOG("task not found in execTask list, refId:%" PRIx64 ", taskId:%" PRIx64, pParam->refId, + pParam->taskId); + SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + } + + if (NULL == pTask) { + SCH_JOB_ELOG("task not found in execList & succList, refId:%" PRIx64 ", taskId:%" PRIx64, pParam->refId, + pParam->taskId); + SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); + } + + SCH_TASK_DLOG("rsp msg received, type:%s, handle:%p, code:%s", TMSG_INFO(msgType), pMsg->handle, tstrerror(rspCode)); + + SCH_SET_TASK_HANDLE(pTask, pMsg->handle); + schUpdateTaskExecNodeHandle(pTask, pMsg->handle, rspCode); + + SCH_ERR_JRET(schHandleResponseMsg(pJob, pTask, msgType, pMsg->pData, pMsg->len, rspCode)); + +_return: + if (pJob) { + schReleaseJob(pParam->refId); + } + + taosMemoryFreeClear(param); + SCH_RET(code); +} + +int32_t schHandleSubmitCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_SUBMIT_RSP, code); +} + +int32_t schHandleCreateTbCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_CREATE_TABLE_RSP, code); +} + +int32_t schHandleDropTbCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_DROP_TABLE_RSP, code); +} + +int32_t schHandleAlterTbCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_ALTER_TABLE_RSP, code); +} + +int32_t schHandleQueryCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_QUERY_RSP, code); +} + +int32_t schHandleFetchCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_FETCH_RSP, code); +} + +int32_t schHandleReadyCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_RES_READY_RSP, code); +} + +int32_t schHandleExplainCallback(void *param, const SDataBuf *pMsg, int32_t code) { + return schHandleCallback(param, pMsg, TDMT_VND_EXPLAIN_RSP, code); +} + +int32_t schHandleDropCallback(void *param, const SDataBuf *pMsg, int32_t code) { + SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; + qDebug("QID:%" PRIx64 ",TID:%" PRIx64 " drop task rsp received, code:%x", pParam->queryId, pParam->taskId, code); + return TSDB_CODE_SUCCESS; +} + +int32_t schHandleLinkBrokenCallback(void *param, const SDataBuf *pMsg, int32_t code) { + SSchCallbackParamHeader *head = (SSchCallbackParamHeader *)param; + rpcReleaseHandle(pMsg->handle, TAOS_CONN_CLIENT); + + qDebug("handle %p is broken", pMsg->handle); + + if (head->isHbParam) { + SSchHbCallbackParam *hbParam = (SSchHbCallbackParam *)param; + SSchTrans trans = {.transInst = hbParam->transport, .transHandle = NULL}; + SCH_ERR_RET(schUpdateHbConnection(&hbParam->nodeEpId, &trans)); + + SCH_ERR_RET(schBuildAndSendHbMsg(&hbParam->nodeEpId)); + } else { + SCH_ERR_RET(schHandleCallback(param, pMsg, TDMT_SCH_LINK_BROKEN, code)); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schGenerateCallBackInfo(SSchJob *pJob, SSchTask *pTask, int32_t msgType, SMsgSendInfo **pMsgSendInfo) { + int32_t code = 0; + SMsgSendInfo *msgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); + if (NULL == msgSendInfo) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); + if (NULL == param) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchTaskCallbackParam)); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + __async_send_cb_fn_t fp = NULL; + SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); + + param->queryId = pJob->queryId; + param->refId = pJob->refId; + param->taskId = SCH_TASK_ID(pTask); + param->transport = pJob->transport; + + msgSendInfo->param = param; + msgSendInfo->fp = fp; + + *pMsgSendInfo = msgSendInfo; + + return TSDB_CODE_SUCCESS; + +_return: + + taosMemoryFree(param); + taosMemoryFree(msgSendInfo); + + SCH_RET(code); +} + + +int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp) { + switch (msgType) { + case TDMT_VND_CREATE_TABLE: + *fp = schHandleCreateTbCallback; + break; + case TDMT_VND_DROP_TABLE: + *fp = schHandleDropTbCallback; + break; + case TDMT_VND_ALTER_TABLE: + *fp = schHandleAlterTbCallback; + break; + case TDMT_VND_SUBMIT: + *fp = schHandleSubmitCallback; + break; + case TDMT_VND_QUERY: + *fp = schHandleQueryCallback; + break; + case TDMT_VND_RES_READY: + *fp = schHandleReadyCallback; + break; + case TDMT_VND_EXPLAIN: + *fp = schHandleExplainCallback; + break; + case TDMT_VND_FETCH: + *fp = schHandleFetchCallback; + break; + case TDMT_VND_DROP_TASK: + *fp = schHandleDropCallback; + break; + case TDMT_VND_QUERY_HEARTBEAT: + *fp = schHandleHbCallback; + break; + case TDMT_SCH_LINK_BROKEN: + *fp = schHandleLinkBrokenCallback; + break; + default: + qError("unknown msg type for callback, msgType:%d", msgType); + SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); + } + + return TSDB_CODE_SUCCESS; +} + + +int32_t schMakeHbCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam) { + SSchHbCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchHbCallbackParam)); + if (NULL == param) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchHbCallbackParam)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + param->head.isHbParam = true; + + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + + param->nodeEpId.nodeId = addr->nodeId; + memcpy(¶m->nodeEpId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); + param->transport = pJob->transport; + + *pParam = param; + + return TSDB_CODE_SUCCESS; +} + +int32_t schCloneHbRpcCtx(SRpcCtx *pSrc, SRpcCtx *pDst) { + int32_t code = 0; + memcpy(&pDst->brokenVal, &pSrc->brokenVal, sizeof(pSrc->brokenVal)); + pDst->brokenVal.val = NULL; + + SCH_ERR_RET(schCloneSMsgSendInfo(pSrc->brokenVal.val, &pDst->brokenVal.val)); + + pDst->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); + if (NULL == pDst->args) { + qError("taosHashInit %d RpcCtx failed", 1); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SRpcCtxVal dst = {0}; + void *pIter = taosHashIterate(pSrc->args, NULL); + while (pIter) { + SRpcCtxVal *pVal = (SRpcCtxVal *)pIter; + int32_t *msgType = taosHashGetKey(pIter, NULL); + + dst = *pVal; + dst.val = NULL; + + SCH_ERR_JRET(schCloneSMsgSendInfo(pVal->val, &dst.val)); + + if (taosHashPut(pDst->args, msgType, sizeof(*msgType), &dst, sizeof(dst))) { + qError("taosHashPut msg %d to rpcCtx failed", *msgType); + (*dst.freeFunc)(dst.val); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pIter = taosHashIterate(pSrc->args, pIter); + } + + return TSDB_CODE_SUCCESS; + +_return: + + schFreeRpcCtx(pDst); + SCH_RET(code); +} + + +int32_t schMakeHbRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx) { + int32_t code = 0; + SSchHbCallbackParam *param = NULL; + SMsgSendInfo *pMsgSendInfo = NULL; + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + memcpy(&epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); + + pCtx->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); + if (NULL == pCtx->args) { + SCH_TASK_ELOG("taosHashInit %d RpcCtx failed", 1); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); + if (NULL == pMsgSendInfo) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + param = taosMemoryCalloc(1, sizeof(SSchHbCallbackParam)); + if (NULL == param) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchHbCallbackParam)); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + int32_t msgType = TDMT_VND_QUERY_HEARTBEAT_RSP; + __async_send_cb_fn_t fp = NULL; + SCH_ERR_JRET(schGetCallbackFp(TDMT_VND_QUERY_HEARTBEAT, &fp)); + + param->nodeEpId = epId; + param->transport = pJob->transport; + + pMsgSendInfo->param = param; + pMsgSendInfo->fp = fp; + + SRpcCtxVal ctxVal = {.val = pMsgSendInfo, .clone = schCloneSMsgSendInfo, .freeFunc = schFreeRpcCtxVal}; + if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { + SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_ERR_JRET(schMakeBrokenLinkVal(pJob, pTask, &pCtx->brokenVal, true)); + + return TSDB_CODE_SUCCESS; + +_return: + + taosHashCleanup(pCtx->args); + taosMemoryFreeClear(param); + taosMemoryFreeClear(pMsgSendInfo); + + SCH_RET(code); +} + +int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { + int32_t code = 0; + SSchHbTrans hb = {0}; + + hb.trans.transInst = pJob->transport; + + SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); + + code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); + if (code) { + schFreeRpcCtx(&hb.rpcCtx); + + if (HASH_NODE_EXIST(code)) { + *exist = true; + return TSDB_CODE_SUCCESS; + } + + qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(code); + } + + return TSDB_CODE_SUCCESS; +} + + +int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId) { + SSchedulerHbReq req = {0}; + int32_t code = 0; + SRpcCtx rpcCtx = {0}; + SSchTrans trans = {0}; + int32_t msgType = TDMT_VND_QUERY_HEARTBEAT; + + req.header.vgId = nodeEpId->nodeId; + req.sId = schMgmt.sId; + memcpy(&req.epId, nodeEpId, sizeof(SQueryNodeEpId)); + + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, nodeEpId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, + nodeEpId->ep.port); + SCH_ERR_RET(code); + } + + SCH_LOCK(SCH_WRITE, &hb->lock); + code = schCloneHbRpcCtx(&hb->rpcCtx, &rpcCtx); + memcpy(&trans, &hb->trans, sizeof(trans)); + SCH_UNLOCK(SCH_WRITE, &hb->lock); + + SCH_ERR_RET(code); + + int32_t msgSize = tSerializeSSchedulerHbReq(NULL, 0, &req); + if (msgSize < 0) { + qError("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + void *msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + qError("calloc hb req %d failed", msgSize); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + if (tSerializeSSchedulerHbReq(msg, msgSize, &req) < 0) { + qError("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SMsgSendInfo *pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); + if (NULL == pMsgSendInfo) { + qError("calloc SMsgSendInfo failed"); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); + if (NULL == param) { + qError("calloc SSchTaskCallbackParam failed"); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + __async_send_cb_fn_t fp = NULL; + SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); + + param->transport = trans.transInst; + + pMsgSendInfo->param = param; + pMsgSendInfo->msgInfo.pData = msg; + pMsgSendInfo->msgInfo.len = msgSize; + pMsgSendInfo->msgInfo.handle = trans.transHandle; + pMsgSendInfo->msgType = msgType; + pMsgSendInfo->fp = fp; + + int64_t transporterId = 0; + SEpSet epSet = {.inUse = 0, .numOfEps = 1}; + memcpy(&epSet.eps[0], &nodeEpId->ep, sizeof(nodeEpId->ep)); + + qDebug("start to send hb msg, instance:%p, handle:%p, fqdn:%s, port:%d", trans.transInst, trans.transHandle, + nodeEpId->ep.fqdn, nodeEpId->ep.port); + + code = asyncSendMsgToServerExt(trans.transInst, &epSet, &transporterId, pMsgSendInfo, true, &rpcCtx); + if (code) { + qError("fail to send hb msg, instance:%p, handle:%p, fqdn:%s, port:%d, error:%x - %s", trans.transInst, + trans.transHandle, nodeEpId->ep.fqdn, nodeEpId->ep.port, code, tstrerror(code)); + SCH_ERR_JRET(code); + } + + qDebug("hb msg sent"); + return TSDB_CODE_SUCCESS; + +_return: + + taosMemoryFreeClear(msg); + taosMemoryFreeClear(param); + taosMemoryFreeClear(pMsgSendInfo); + schFreeRpcCtx(&rpcCtx); + SCH_RET(code); +} + + +int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + memcpy(&epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); + + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + bool exist = false; + SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId, &exist)); + if (!exist) { + SCH_ERR_RET(schBuildAndSendHbMsg(&epId)); + } + } + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { + int32_t code = 0; + SSchHbTrans *hb = NULL; + + hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); + } + + SCH_LOCK(SCH_WRITE, &hb->lock); + memcpy(&hb->trans, trans, sizeof(*trans)); + SCH_UNLOCK(SCH_WRITE, &hb->lock); + + qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, instance:%p, handle:%p", schMgmt.sId, + epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->transInst, trans->transHandle); + + return TSDB_CODE_SUCCESS; +} + +int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code) { + SSchedulerHbRsp rsp = {0}; + SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; + + if (code) { + qError("hb rsp error:%s", tstrerror(code)); + SCH_ERR_JRET(code); + } + + if (tDeserializeSSchedulerHbRsp(pMsg->pData, pMsg->len, &rsp)) { + qError("invalid hb rsp msg, size:%d", pMsg->len); + SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); + } + + SSchTrans trans = {0}; + trans.transInst = pParam->transport; + trans.transHandle = pMsg->handle; + + SCH_ERR_JRET(schUpdateHbConnection(&rsp.epId, &trans)); + + int32_t taskNum = (int32_t)taosArrayGetSize(rsp.taskStatus); + qDebug("%d task status in hb rsp, nodeId:%d, fqdn:%s, port:%d", taskNum, rsp.epId.nodeId, rsp.epId.ep.fqdn, + rsp.epId.ep.port); + + for (int32_t i = 0; i < taskNum; ++i) { + STaskStatus *taskStatus = taosArrayGet(rsp.taskStatus, i); + + SSchJob *pJob = schAcquireJob(taskStatus->refId); + if (NULL == pJob) { + qWarn("job not found, refId:0x%" PRIx64 ",QID:0x%" PRIx64 ",TID:0x%" PRIx64, taskStatus->refId, + taskStatus->queryId, taskStatus->taskId); + // TODO DROP TASK FROM SERVER!!!! + continue; + } + + // TODO + + SCH_JOB_DLOG("TID:0x%" PRIx64 " task status in server: %s", taskStatus->taskId, + jobTaskStatusStr(taskStatus->status)); + + schReleaseJob(taskStatus->refId); + } + +_return: + + tFreeSSchedulerHbRsp(&rsp); + taosMemoryFree(param); + + SCH_RET(code); +} + +int32_t schMakeCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam) { + SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); + if (NULL == param) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchTaskCallbackParam)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + param->queryId = pJob->queryId; + param->refId = pJob->refId; + param->taskId = SCH_TASK_ID(pTask); + param->transport = pJob->transport; + + *pParam = param; + + return TSDB_CODE_SUCCESS; +} + +int32_t schMakeBrokenLinkVal(SSchJob *pJob, SSchTask *pTask, SRpcBrokenlinkVal *brokenVal, bool isHb) { + int32_t code = 0; + SMsgSendInfo *pMsgSendInfo = NULL; + + pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); + if (NULL == pMsgSendInfo) { + SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + if (isHb) { + SCH_ERR_JRET(schMakeHbCallbackParam(pJob, pTask, &pMsgSendInfo->param)); + } else { + SCH_ERR_JRET(schMakeCallbackParam(pJob, pTask, &pMsgSendInfo->param)); + } + + int32_t msgType = TDMT_SCH_LINK_BROKEN; + __async_send_cb_fn_t fp = NULL; + SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); + + pMsgSendInfo->fp = fp; + + brokenVal->msgType = msgType; + brokenVal->val = pMsgSendInfo; + brokenVal->clone = schCloneSMsgSendInfo; + brokenVal->freeFunc = schFreeRpcCtxVal; + + return TSDB_CODE_SUCCESS; + +_return: + + taosMemoryFreeClear(pMsgSendInfo->param); + taosMemoryFreeClear(pMsgSendInfo); + + SCH_RET(code); +} + +int32_t schMakeQueryRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx) { + int32_t code = 0; + SMsgSendInfo *pReadyMsgSendInfo = NULL; + SMsgSendInfo *pExplainMsgSendInfo = NULL; + + pCtx->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); + if (NULL == pCtx->args) { + SCH_TASK_ELOG("taosHashInit %d RpcCtx failed", 1); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_ERR_JRET(schGenerateCallBackInfo(pJob, pTask, TDMT_VND_RES_READY, &pReadyMsgSendInfo)); + SCH_ERR_JRET(schGenerateCallBackInfo(pJob, pTask, TDMT_VND_EXPLAIN, &pExplainMsgSendInfo)); + + int32_t msgType = TDMT_VND_RES_READY_RSP; + SRpcCtxVal ctxVal = {.val = pReadyMsgSendInfo, .clone = schCloneSMsgSendInfo, .freeFunc = schFreeRpcCtxVal}; + if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { + SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + msgType = TDMT_VND_EXPLAIN_RSP; + ctxVal.val = pExplainMsgSendInfo; + if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { + SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SCH_ERR_JRET(schMakeBrokenLinkVal(pJob, pTask, &pCtx->brokenVal, false)); + + return TSDB_CODE_SUCCESS; + +_return: + + taosHashCleanup(pCtx->args); + + if (pReadyMsgSendInfo) { + taosMemoryFreeClear(pReadyMsgSendInfo->param); + taosMemoryFreeClear(pReadyMsgSendInfo); + } + + if (pExplainMsgSendInfo) { + taosMemoryFreeClear(pExplainMsgSendInfo->param); + taosMemoryFreeClear(pExplainMsgSendInfo); + } + + SCH_RET(code); +} + +int32_t schCloneCallbackParam(SSchCallbackParamHeader *pSrc, SSchCallbackParamHeader **pDst) { + if (pSrc->isHbParam) { + SSchHbCallbackParam *dst = taosMemoryMalloc(sizeof(SSchHbCallbackParam)); + if (NULL == dst) { + qError("malloc SSchHbCallbackParam failed"); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + memcpy(dst, pSrc, sizeof(*dst)); + *pDst = (SSchCallbackParamHeader *)dst; + + return TSDB_CODE_SUCCESS; + } + + SSchTaskCallbackParam *dst = taosMemoryMalloc(sizeof(SSchTaskCallbackParam)); + if (NULL == dst) { + qError("malloc SSchTaskCallbackParam failed"); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + memcpy(dst, pSrc, sizeof(*dst)); + *pDst = (SSchCallbackParamHeader *)dst; + + return TSDB_CODE_SUCCESS; +} + +int32_t schCloneSMsgSendInfo(void *src, void **dst) { + SMsgSendInfo *pSrc = src; + int32_t code = 0; + SMsgSendInfo *pDst = taosMemoryMalloc(sizeof(*pSrc)); + if (NULL == pDst) { + qError("malloc SMsgSendInfo for rpcCtx failed, len:%d", (int32_t)sizeof(*pSrc)); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + memcpy(pDst, pSrc, sizeof(*pSrc)); + pDst->param = NULL; + + SCH_ERR_JRET(schCloneCallbackParam(pSrc->param, (SSchCallbackParamHeader **)&pDst->param)); + + *dst = pDst; + + return TSDB_CODE_SUCCESS; + +_return: + + taosMemoryFreeClear(pDst); + SCH_RET(code); +} + + +int32_t schAsyncSendMsg(SSchJob *pJob, SSchTask *pTask, void *transport, SEpSet *epSet, int32_t msgType, void *msg, + uint32_t msgSize, bool persistHandle, SRpcCtx *ctx) { + int32_t code = 0; + + SSchTrans *trans = (SSchTrans *)transport; + + SMsgSendInfo *pMsgSendInfo = NULL; + SCH_ERR_JRET(schGenerateCallBackInfo(pJob, pTask, msgType, &pMsgSendInfo)); + + pMsgSendInfo->msgInfo.pData = msg; + pMsgSendInfo->msgInfo.len = msgSize; + pMsgSendInfo->msgInfo.handle = trans->transHandle; + pMsgSendInfo->msgType = msgType; + + qDebug("start to send %s msg to node[%d,%s,%d], refId:%" PRIx64 "instance:%p, handle:%p", TMSG_INFO(msgType), + ntohl(((SMsgHead *)msg)->vgId), epSet->eps[epSet->inUse].fqdn, epSet->eps[epSet->inUse].port, pJob->refId, + trans->transInst, trans->transHandle); + + int64_t transporterId = 0; + code = asyncSendMsgToServerExt(trans->transInst, epSet, &transporterId, pMsgSendInfo, persistHandle, ctx); + if (code) { + SCH_ERR_JRET(code); + } + + SCH_TASK_DLOG("req msg sent, refId:%" PRIx64 ", type:%d, %s", pJob->refId, msgType, TMSG_INFO(msgType)); + return TSDB_CODE_SUCCESS; + +_return: + + if (pMsgSendInfo) { + taosMemoryFreeClear(pMsgSendInfo->param); + taosMemoryFreeClear(pMsgSendInfo); + } + + SCH_RET(code); +} + +int32_t schBuildAndSendMsg(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, int32_t msgType) { + uint32_t msgSize = 0; + void *msg = NULL; + int32_t code = 0; + bool isCandidateAddr = false; + bool persistHandle = false; + SRpcCtx rpcCtx = {0}; + + if (NULL == addr) { + addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + isCandidateAddr = true; + } + + SEpSet epSet = addr->epSet; + + switch (msgType) { + case TDMT_VND_CREATE_TABLE: + case TDMT_VND_DROP_TABLE: + case TDMT_VND_ALTER_TABLE: + case TDMT_VND_SUBMIT: { + msgSize = pTask->msgLen; + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_TASK_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + memcpy(msg, pTask->msg, msgSize); + break; + } + + case TDMT_VND_QUERY: { + SCH_ERR_RET(schMakeQueryRpcCtx(pJob, pTask, &rpcCtx)); + + uint32_t len = strlen(pJob->sql); + msgSize = sizeof(SSubQueryMsg) + pTask->msgLen + len; + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_TASK_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SSubQueryMsg *pMsg = msg; + pMsg->header.vgId = htonl(addr->nodeId); + pMsg->sId = htobe64(schMgmt.sId); + pMsg->queryId = htobe64(pJob->queryId); + pMsg->taskId = htobe64(pTask->taskId); + pMsg->refId = htobe64(pJob->refId); + pMsg->taskType = TASK_TYPE_TEMP; + pMsg->explain = SCH_IS_EXPLAIN_JOB(pJob); + pMsg->phyLen = htonl(pTask->msgLen); + pMsg->sqlLen = htonl(len); + + memcpy(pMsg->msg, pJob->sql, len); + memcpy(pMsg->msg + len, pTask->msg, pTask->msgLen); + + persistHandle = true; + break; + } + + case TDMT_VND_RES_READY: { + msgSize = sizeof(SResReadyReq); + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_TASK_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SResReadyReq *pMsg = msg; + + pMsg->header.vgId = htonl(addr->nodeId); + + pMsg->sId = htobe64(schMgmt.sId); + pMsg->queryId = htobe64(pJob->queryId); + pMsg->taskId = htobe64(pTask->taskId); + break; + } + case TDMT_VND_FETCH: { + msgSize = sizeof(SResFetchReq); + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_TASK_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + SResFetchReq *pMsg = msg; + + pMsg->header.vgId = htonl(addr->nodeId); + + pMsg->sId = htobe64(schMgmt.sId); + pMsg->queryId = htobe64(pJob->queryId); + pMsg->taskId = htobe64(pTask->taskId); + + break; + } + case TDMT_VND_DROP_TASK: { + msgSize = sizeof(STaskDropReq); + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_TASK_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + STaskDropReq *pMsg = msg; + + pMsg->header.vgId = htonl(addr->nodeId); + + pMsg->sId = htobe64(schMgmt.sId); + pMsg->queryId = htobe64(pJob->queryId); + pMsg->taskId = htobe64(pTask->taskId); + pMsg->refId = htobe64(pJob->refId); + break; + } + case TDMT_VND_QUERY_HEARTBEAT: { + SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &rpcCtx)); + + SSchedulerHbReq req = {0}; + req.sId = schMgmt.sId; + req.header.vgId = addr->nodeId; + req.epId.nodeId = addr->nodeId; + memcpy(&req.epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); + + msgSize = tSerializeSSchedulerHbReq(NULL, 0, &req); + if (msgSize < 0) { + SCH_JOB_ELOG("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + msg = taosMemoryCalloc(1, msgSize); + if (NULL == msg) { + SCH_JOB_ELOG("calloc %d failed", msgSize); + SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + if (tSerializeSSchedulerHbReq(msg, msgSize, &req) < 0) { + SCH_JOB_ELOG("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); + SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); + } + + persistHandle = true; + break; + } + default: + SCH_TASK_ELOG("unknown msg type to send, msgType:%d", msgType); + SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); + break; + } + + SCH_SET_TASK_LASTMSG_TYPE(pTask, msgType); + + SSchTrans trans = {.transInst = pJob->transport, .transHandle = SCH_GET_TASK_HANDLE(pTask)}; + SCH_ERR_JRET(schAsyncSendMsg(pJob, pTask, &trans, &epSet, msgType, msg, msgSize, persistHandle, + (rpcCtx.args ? &rpcCtx : NULL))); + + if (msgType == TDMT_VND_QUERY) { + SCH_ERR_RET(schRecordTaskExecNode(pJob, pTask, addr, trans.transHandle)); + } + + return TSDB_CODE_SUCCESS; + +_return: + + SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); + schFreeRpcCtx(&rpcCtx); + + taosMemoryFreeClear(msg); + SCH_RET(code); +} + + + diff --git a/source/libs/scheduler/src/schUtil.c b/source/libs/scheduler/src/schUtil.c new file mode 100644 index 0000000000..57a86ba125 --- /dev/null +++ b/source/libs/scheduler/src/schUtil.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "catalog.h" +#include "command.h" +#include "query.h" +#include "schedulerInt.h" +#include "tmsg.h" +#include "tref.h" +#include "trpc.h" + +void schCloseJobRef(void) { + if (!atomic_load_8((int8_t *)&schMgmt.exit)) { + return; + } + + SCH_LOCK(SCH_WRITE, &schMgmt.lock); + if (atomic_load_32(&schMgmt.jobNum) <= 0 && schMgmt.jobRef >= 0) { + taosCloseRef(schMgmt.jobRef); + schMgmt.jobRef = -1; + } + SCH_UNLOCK(SCH_WRITE, &schMgmt.lock); +} + +uint64_t schGenTaskId(void) { return atomic_add_fetch_64(&schMgmt.taskId, 1); } + +uint64_t schGenUUID(void) { + static uint64_t hashId = 0; + static int32_t requestSerialId = 0; + + if (hashId == 0) { + char uid[64]; + int32_t code = taosGetSystemUUID(uid, tListLen(uid)); + if (code != TSDB_CODE_SUCCESS) { + qError("Failed to get the system uid, reason:%s", tstrerror(TAOS_SYSTEM_ERROR(errno))); + } else { + hashId = MurmurHash3_32(uid, strlen(uid)); + } + } + + int64_t ts = taosGetTimestampMs(); + uint64_t pid = taosGetPId(); + int32_t val = atomic_add_fetch_32(&requestSerialId, 1); + + uint64_t id = ((hashId & 0x0FFF) << 52) | ((pid & 0x0FFF) << 40) | ((ts & 0xFFFFFF) << 16) | (val & 0xFFFF); + return id; +} + + +void schFreeRpcCtxVal(const void *arg) { + if (NULL == arg) { + return; + } + + SMsgSendInfo *pMsgSendInfo = (SMsgSendInfo *)arg; + taosMemoryFreeClear(pMsgSendInfo->param); + taosMemoryFreeClear(pMsgSendInfo); +} + +void schFreeRpcCtx(SRpcCtx *pCtx) { + if (NULL == pCtx) { + return; + } + void *pIter = taosHashIterate(pCtx->args, NULL); + while (pIter) { + SRpcCtxVal *ctxVal = (SRpcCtxVal *)pIter; + + (*ctxVal->freeFunc)(ctxVal->val); + + pIter = taosHashIterate(pCtx->args, pIter); + } + + taosHashCleanup(pCtx->args); + + if (pCtx->brokenVal.freeFunc) { + (*pCtx->brokenVal.freeFunc)(pCtx->brokenVal.val); + } +} + + diff --git a/source/libs/scheduler/src/scheduler.c b/source/libs/scheduler/src/scheduler.c index 7a75c00e6e..bd2c7e5b49 100644 --- a/source/libs/scheduler/src/scheduler.c +++ b/source/libs/scheduler/src/scheduler.c @@ -25,2563 +25,6 @@ SSchedulerMgmt schMgmt = { .jobRef = -1, }; -FORCE_INLINE SSchJob *schAcquireJob(int64_t refId) { return (SSchJob *)taosAcquireRef(schMgmt.jobRef, refId); } - -FORCE_INLINE int32_t schReleaseJob(int64_t refId) { return taosReleaseRef(schMgmt.jobRef, refId); } - -uint64_t schGenTaskId(void) { return atomic_add_fetch_64(&schMgmt.taskId, 1); } - -#if 0 -uint64_t schGenUUID(void) { - static uint64_t hashId = 0; - static int32_t requestSerialId = 0; - - if (hashId == 0) { - char uid[64]; - int32_t code = taosGetSystemUUID(uid, tListLen(uid)); - if (code != TSDB_CODE_SUCCESS) { - qError("Failed to get the system uid, reason:%s", tstrerror(TAOS_SYSTEM_ERROR(errno))); - } else { - hashId = MurmurHash3_32(uid, strlen(uid)); - } - } - - int64_t ts = taosGetTimestampMs(); - uint64_t pid = taosGetPId(); - int32_t val = atomic_add_fetch_32(&requestSerialId, 1); - - uint64_t id = ((hashId & 0x0FFF) << 52) | ((pid & 0x0FFF) << 40) | ((ts & 0xFFFFFF) << 16) | (val & 0xFFFF); - return id; -} -#endif - -int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel) { - pTask->plan = pPlan; - pTask->level = pLevel; - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - pTask->taskId = schGenTaskId(); - pTask->execNodes = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SSchNodeInfo)); - if (NULL == pTask->execNodes) { - SCH_TASK_ELOG("taosArrayInit %d execNodes failed", SCH_MAX_CANDIDATE_EP_NUM); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schInitJob(SSchJob **pSchJob, SQueryPlan *pDag, void *transport, SArray *pNodeList, const char *sql, - int64_t startTs, bool syncSchedule) { - int32_t code = 0; - int64_t refId = -1; - SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); - if (NULL == pJob) { - qError("QID:%" PRIx64 " calloc %d failed", pDag->queryId, (int32_t)sizeof(SSchJob)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->attr.explainMode = pDag->explainInfo.mode; - pJob->attr.syncSchedule = syncSchedule; - pJob->transport = transport; - pJob->sql = sql; - - if (pNodeList != NULL) { - pJob->nodeList = taosArrayDup(pNodeList); - } - - SCH_ERR_JRET(schValidateAndBuildJob(pDag, pJob)); - - if (SCH_IS_EXPLAIN_JOB(pJob)) { - SCH_ERR_JRET(qExecExplainBegin(pDag, &pJob->explainCtx, startTs)); - } - - pJob->execTasks = - taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); - if (NULL == pJob->execTasks) { - SCH_JOB_ELOG("taosHashInit %d execTasks failed", pDag->numOfSubplans); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->succTasks = - taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); - if (NULL == pJob->succTasks) { - SCH_JOB_ELOG("taosHashInit %d succTasks failed", pDag->numOfSubplans); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->failTasks = - taosHashInit(pDag->numOfSubplans, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), false, HASH_ENTRY_LOCK); - if (NULL == pJob->failTasks) { - SCH_JOB_ELOG("taosHashInit %d failTasks failed", pDag->numOfSubplans); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - tsem_init(&pJob->rspSem, 0, 0); - - refId = taosAddRef(schMgmt.jobRef, pJob); - if (refId < 0) { - SCH_JOB_ELOG("taosAddRef job failed, error:%s", tstrerror(terrno)); - SCH_ERR_JRET(terrno); - } - - atomic_add_fetch_32(&schMgmt.jobNum, 1); - - if (NULL == schAcquireJob(refId)) { - SCH_JOB_ELOG("schAcquireJob job failed, refId:%" PRIx64, refId); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - pJob->refId = refId; - - SCH_JOB_DLOG("job refId:%" PRIx64, pJob->refId); - - pJob->status = JOB_TASK_STATUS_NOT_START; - - *pSchJob = pJob; - - return TSDB_CODE_SUCCESS; - -_return: - - if (refId < 0) { - schFreeJobImpl(pJob); - } else { - taosRemoveRef(schMgmt.jobRef, refId); - } - SCH_RET(code); -} - -void schFreeRpcCtx(SRpcCtx *pCtx) { - if (NULL == pCtx) { - return; - } - void *pIter = taosHashIterate(pCtx->args, NULL); - while (pIter) { - SRpcCtxVal *ctxVal = (SRpcCtxVal *)pIter; - - (*ctxVal->freeFunc)(ctxVal->val); - - pIter = taosHashIterate(pCtx->args, pIter); - } - - taosHashCleanup(pCtx->args); - - if (pCtx->brokenVal.freeFunc) { - (*pCtx->brokenVal.freeFunc)(pCtx->brokenVal.val); - } -} - -void schFreeTask(SSchTask *pTask) { - if (pTask->candidateAddrs) { - taosArrayDestroy(pTask->candidateAddrs); - } - - taosMemoryFreeClear(pTask->msg); - - if (pTask->children) { - taosArrayDestroy(pTask->children); - } - - if (pTask->parents) { - taosArrayDestroy(pTask->parents); - } - - if (pTask->execNodes) { - taosArrayDestroy(pTask->execNodes); - } -} - -static FORCE_INLINE bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus) { - int8_t status = SCH_GET_JOB_STATUS(pJob); - if (pStatus) { - *pStatus = status; - } - - return (status == JOB_TASK_STATUS_FAILED || status == JOB_TASK_STATUS_CANCELLED || - status == JOB_TASK_STATUS_CANCELLING || status == JOB_TASK_STATUS_DROPPING || - status == JOB_TASK_STATUS_SUCCEED); -} - -int32_t schValidateTaskReceivedMsgType(SSchJob *pJob, SSchTask *pTask, int32_t msgType) { - int32_t lastMsgType = SCH_GET_TASK_LASTMSG_TYPE(pTask); - int32_t taskStatus = SCH_GET_TASK_STATUS(pTask); - int32_t reqMsgType = msgType - 1; - switch (msgType) { - case TDMT_SCH_LINK_BROKEN: - case TDMT_VND_EXPLAIN_RSP: - return TSDB_CODE_SUCCESS; - case TDMT_VND_QUERY_RSP: // query_rsp may be processed later than ready_rsp - if (lastMsgType != reqMsgType && -1 != lastMsgType && TDMT_VND_FETCH != lastMsgType) { - SCH_TASK_DLOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), - TMSG_INFO(msgType)); - } - - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { - SCH_TASK_DLOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), - TMSG_INFO(msgType)); - } - - SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); - return TSDB_CODE_SUCCESS; - case TDMT_VND_RES_READY_RSP: - reqMsgType = TDMT_VND_QUERY; - if (lastMsgType != reqMsgType && -1 != lastMsgType) { - SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", - (lastMsgType > 0 ? TMSG_INFO(lastMsgType) : "null"), TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { - SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), - TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); - return TSDB_CODE_SUCCESS; - case TDMT_VND_FETCH_RSP: - if (lastMsgType != reqMsgType && -1 != lastMsgType) { - SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), - TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { - SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), - TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); - return TSDB_CODE_SUCCESS; - case TDMT_VND_CREATE_TABLE_RSP: - case TDMT_VND_DROP_TABLE_RSP: - case TDMT_VND_ALTER_TABLE_RSP: - case TDMT_VND_SUBMIT_RSP: - break; - default: - SCH_TASK_ELOG("unknown rsp msg, type:%s, status:%s", TMSG_INFO(msgType), jobTaskStatusStr(taskStatus)); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - if (lastMsgType != reqMsgType) { - SCH_TASK_ELOG("rsp msg type mis-match, last sent msgType:%s, rspType:%s", TMSG_INFO(lastMsgType), - TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - if (taskStatus != JOB_TASK_STATUS_EXECUTING && taskStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED) { - SCH_TASK_ELOG("rsp msg conflicted with task status, status:%s, rspType:%s", jobTaskStatusStr(taskStatus), - TMSG_INFO(msgType)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); - - return TSDB_CODE_SUCCESS; -} - -int32_t schCheckAndUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { - int32_t code = 0; - - int8_t oriStatus = 0; - - while (true) { - oriStatus = SCH_GET_JOB_STATUS(pJob); - - if (oriStatus == newStatus) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - switch (oriStatus) { - case JOB_TASK_STATUS_NULL: - if (newStatus != JOB_TASK_STATUS_NOT_START) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - break; - case JOB_TASK_STATUS_NOT_START: - if (newStatus != JOB_TASK_STATUS_EXECUTING) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - break; - case JOB_TASK_STATUS_EXECUTING: - if (newStatus != JOB_TASK_STATUS_PARTIAL_SUCCEED && newStatus != JOB_TASK_STATUS_FAILED && - newStatus != JOB_TASK_STATUS_CANCELLING && newStatus != JOB_TASK_STATUS_CANCELLED && - newStatus != JOB_TASK_STATUS_DROPPING) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - break; - case JOB_TASK_STATUS_PARTIAL_SUCCEED: - if (newStatus != JOB_TASK_STATUS_FAILED && newStatus != JOB_TASK_STATUS_SUCCEED && - newStatus != JOB_TASK_STATUS_DROPPING) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - break; - case JOB_TASK_STATUS_SUCCEED: - case JOB_TASK_STATUS_FAILED: - case JOB_TASK_STATUS_CANCELLING: - if (newStatus != JOB_TASK_STATUS_DROPPING) { - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - break; - case JOB_TASK_STATUS_CANCELLED: - case JOB_TASK_STATUS_DROPPING: - SCH_ERR_JRET(TSDB_CODE_QRY_JOB_FREED); - break; - - default: - SCH_JOB_ELOG("invalid job status:%s", jobTaskStatusStr(oriStatus)); - SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); - } - - if (oriStatus != atomic_val_compare_exchange_8(&pJob->status, oriStatus, newStatus)) { - continue; - } - - SCH_JOB_DLOG("job status updated from %s to %s", jobTaskStatusStr(oriStatus), jobTaskStatusStr(newStatus)); - - break; - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_JOB_ELOG("invalid job status update, from %s to %s", jobTaskStatusStr(oriStatus), jobTaskStatusStr(newStatus)); - SCH_ERR_RET(code); - return TSDB_CODE_SUCCESS; -} - -int32_t schBuildTaskRalation(SSchJob *pJob, SHashObj *planToTask) { - for (int32_t i = 0; i < pJob->levelNum; ++i) { - SSchLevel *pLevel = taosArrayGet(pJob->levels, i); - - for (int32_t m = 0; m < pLevel->taskNum; ++m) { - SSchTask *pTask = taosArrayGet(pLevel->subTasks, m); - SSubplan *pPlan = pTask->plan; - int32_t childNum = pPlan->pChildren ? (int32_t)LIST_LENGTH(pPlan->pChildren) : 0; - int32_t parentNum = pPlan->pParents ? (int32_t)LIST_LENGTH(pPlan->pParents) : 0; - - if (childNum > 0) { - if (pJob->levelIdx == pLevel->level) { - SCH_JOB_ELOG("invalid query plan, lowest level, childNum:%d", childNum); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - pTask->children = taosArrayInit(childNum, POINTER_BYTES); - if (NULL == pTask->children) { - SCH_TASK_ELOG("taosArrayInit %d children failed", childNum); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - } - - for (int32_t n = 0; n < childNum; ++n) { - SSubplan *child = (SSubplan *)nodesListGetNode(pPlan->pChildren, n); - SSchTask **childTask = taosHashGet(planToTask, &child, POINTER_BYTES); - if (NULL == childTask || NULL == *childTask) { - SCH_TASK_ELOG("subplan children relationship error, level:%d, taskIdx:%d, childIdx:%d", i, m, n); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - if (NULL == taosArrayPush(pTask->children, childTask)) { - SCH_TASK_ELOG("taosArrayPush childTask failed, level:%d, taskIdx:%d, childIdx:%d", i, m, n); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("children info, the %d child TID %" PRIx64, n, (*childTask)->taskId); - } - - if (parentNum > 0) { - if (0 == pLevel->level) { - SCH_TASK_ELOG("invalid task info, level:0, parentNum:%d", parentNum); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - pTask->parents = taosArrayInit(parentNum, POINTER_BYTES); - if (NULL == pTask->parents) { - SCH_TASK_ELOG("taosArrayInit %d parents failed", parentNum); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - } else { - if (0 != pLevel->level) { - SCH_TASK_ELOG("invalid task info, level:%d, parentNum:%d", pLevel->level, parentNum); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - } - - for (int32_t n = 0; n < parentNum; ++n) { - SSubplan *parent = (SSubplan *)nodesListGetNode(pPlan->pParents, n); - SSchTask **parentTask = taosHashGet(planToTask, &parent, POINTER_BYTES); - if (NULL == parentTask || NULL == *parentTask) { - SCH_TASK_ELOG("subplan parent relationship error, level:%d, taskIdx:%d, childIdx:%d", i, m, n); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - if (NULL == taosArrayPush(pTask->parents, parentTask)) { - SCH_TASK_ELOG("taosArrayPush parentTask failed, level:%d, taskIdx:%d, childIdx:%d", i, m, n); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("parents info, the %d parent TID %" PRIx64, n, (*parentTask)->taskId); - } - - SCH_TASK_DLOG("level:%d, parentNum:%d, childNum:%d", i, parentNum, childNum); - } - } - - SSchLevel *pLevel = taosArrayGet(pJob->levels, 0); - if (SCH_IS_QUERY_JOB(pJob) && pLevel->taskNum > 1) { - SCH_JOB_ELOG("invalid query plan, level:0, taskNum:%d", pLevel->taskNum); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) { - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - if (NULL == addr) { - SCH_TASK_ELOG("taosArrayGet candidate addr failed, idx:%d, size:%d", pTask->candidateIdx, - (int32_t)taosArrayGetSize(pTask->candidateAddrs)); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - pTask->succeedAddr = *addr; - - return TSDB_CODE_SUCCESS; -} - -int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, void *handle) { - SSchNodeInfo nodeInfo = {.addr = *addr, .handle = handle}; - - if (NULL == taosArrayPush(pTask->execNodes, &nodeInfo)) { - SCH_TASK_ELOG("taosArrayPush nodeInfo to execNodes list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("task execNode recorded, handle:%p", handle); - - return TSDB_CODE_SUCCESS; -} - -int32_t schRecordQueryDataSrc(SSchJob *pJob, SSchTask *pTask) { - if (!SCH_IS_DATA_SRC_QRY_TASK(pTask)) { - return TSDB_CODE_SUCCESS; - } - - taosArrayPush(pJob->dataSrcTasks, &pTask); - - return TSDB_CODE_SUCCESS; -} - - -int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) { - int32_t code = 0; - pJob->queryId = pDag->queryId; - - if (pDag->numOfSubplans <= 0) { - SCH_JOB_ELOG("invalid subplan num:%d", pDag->numOfSubplans); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - pJob->dataSrcTasks = taosArrayInit(pDag->numOfSubplans, POINTER_BYTES); - if (NULL == pJob->dataSrcTasks) { - SCH_ERR_RET(TSDB_CODE_OUT_OF_MEMORY); - } - - int32_t levelNum = (int32_t)LIST_LENGTH(pDag->pSubplans); - if (levelNum <= 0) { - SCH_JOB_ELOG("invalid level num:%d", levelNum); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - SHashObj *planToTask = taosHashInit( - SCHEDULE_DEFAULT_MAX_TASK_NUM, - taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT), false, - HASH_NO_LOCK); - if (NULL == planToTask) { - SCH_JOB_ELOG("taosHashInit %d failed", SCHEDULE_DEFAULT_MAX_TASK_NUM); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->levels = taosArrayInit(levelNum, sizeof(SSchLevel)); - if (NULL == pJob->levels) { - SCH_JOB_ELOG("taosArrayInit %d failed", levelNum); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->levelNum = levelNum; - pJob->levelIdx = levelNum - 1; - - pJob->subPlans = pDag->pSubplans; - - SSchLevel level = {0}; - SNodeListNode *plans = NULL; - int32_t taskNum = 0; - SSchLevel *pLevel = NULL; - - level.status = JOB_TASK_STATUS_NOT_START; - - for (int32_t i = 0; i < levelNum; ++i) { - if (NULL == taosArrayPush(pJob->levels, &level)) { - SCH_JOB_ELOG("taosArrayPush level failed, level:%d", i); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pLevel = taosArrayGet(pJob->levels, i); - pLevel->level = i; - - plans = (SNodeListNode *)nodesListGetNode(pDag->pSubplans, i); - if (NULL == plans) { - SCH_JOB_ELOG("empty level plan, level:%d", i); - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - taskNum = (int32_t)LIST_LENGTH(plans->pNodeList); - if (taskNum <= 0) { - SCH_JOB_ELOG("invalid level plan number:%d, level:%d", taskNum, i); - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - pLevel->taskNum = taskNum; - - pLevel->subTasks = taosArrayInit(taskNum, sizeof(SSchTask)); - if (NULL == pLevel->subTasks) { - SCH_JOB_ELOG("taosArrayInit %d failed", taskNum); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - for (int32_t n = 0; n < taskNum; ++n) { - SSubplan *plan = (SSubplan *)nodesListGetNode(plans->pNodeList, n); - - SCH_SET_JOB_TYPE(pJob, plan->subplanType); - - SSchTask task = {0}; - SSchTask *pTask = &task; - - SCH_ERR_JRET(schInitTask(pJob, &task, plan, pLevel)); - - void *p = taosArrayPush(pLevel->subTasks, &task); - if (NULL == p) { - SCH_TASK_ELOG("taosArrayPush task to level failed, level:%d, taskIdx:%d", pLevel->level, n); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_ERR_JRET(schRecordQueryDataSrc(pJob, p)); - - if (0 != taosHashPut(planToTask, &plan, POINTER_BYTES, &p, POINTER_BYTES)) { - SCH_TASK_ELOG("taosHashPut to planToTaks failed, taskIdx:%d", n); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - ++pJob->taskNum; - } - - SCH_JOB_DLOG("level initialized, taskNum:%d", taskNum); - } - - SCH_ERR_JRET(schBuildTaskRalation(pJob, planToTask)); - -_return: - if (planToTask) { - taosHashCleanup(planToTask); - } - - SCH_RET(code); -} - -int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) { - if (NULL != pTask->candidateAddrs) { - return TSDB_CODE_SUCCESS; - } - - pTask->candidateIdx = 0; - pTask->candidateAddrs = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SQueryNodeAddr)); - if (NULL == pTask->candidateAddrs) { - SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCH_MAX_CANDIDATE_EP_NUM); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - if (pTask->plan->execNode.epSet.numOfEps > 0) { - if (NULL == taosArrayPush(pTask->candidateAddrs, &pTask->plan->execNode)) { - SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("use execNode from plan as candidate addr, numOfEps:%d", pTask->plan->execNode.epSet.numOfEps); - - return TSDB_CODE_SUCCESS; - } - - int32_t addNum = 0; - int32_t nodeNum = 0; - if (pJob->nodeList) { - nodeNum = taosArrayGetSize(pJob->nodeList); - - for (int32_t i = 0; i < nodeNum && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { - SQueryNodeAddr *naddr = taosArrayGet(pJob->nodeList, i); - - if (NULL == taosArrayPush(pTask->candidateAddrs, naddr)) { - SCH_TASK_ELOG("taosArrayPush execNode to candidate addrs failed, addNum:%d, errno:%d", addNum, errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - ++addNum; - } - } - - if (addNum <= 0) { - SCH_TASK_ELOG("no available execNode as candidates, nodeNum:%d", nodeNum); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - /* - for (int32_t i = 0; i < job->dataSrcEps.numOfEps && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { - strncpy(epSet->fqdn[epSet->numOfEps], job->dataSrcEps.fqdn[i], sizeof(job->dataSrcEps.fqdn[i])); - epSet->port[epSet->numOfEps] = job->dataSrcEps.port[i]; - - ++epSet->numOfEps; - } - */ - - return TSDB_CODE_SUCCESS; -} - -int32_t schRemoveTaskFromExecList(SSchJob *pJob, SSchTask *pTask) { - int32_t code = taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId)); - if (code) { - SCH_TASK_ELOG("task failed to rm from execTask list, code:%x", code); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - return TSDB_CODE_SUCCESS; -} - - -int32_t schPushTaskToExecList(SSchJob *pJob, SSchTask *pTask) { - int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - SCH_TASK_ELOG("task already in execTask list, code:%x", code); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_TASK_DLOG("task added to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schMoveTaskToSuccList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } else { - SCH_TASK_DLOG("task removed from execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - } - - int32_t code = taosHashPut(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - SCH_TASK_ELOG("task already in succTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to succTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to succTask list, numOfTasks:%d", taosHashGetSize(pJob->succTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schMoveTaskToFailList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - *moved = false; - - if (0 != taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from execTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } - - int32_t code = taosHashPut(pJob->failTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - - SCH_TASK_WLOG("task already in failTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to failTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to failTask list, numOfTasks:%d", taosHashGetSize(pJob->failTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) { - if (0 != taosHashRemove(pJob->succTasks, &pTask->taskId, sizeof(pTask->taskId))) { - SCH_TASK_WLOG("remove task from succTask list failed, may not exist, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - } - - int32_t code = taosHashPut(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId), &pTask, POINTER_BYTES); - if (0 != code) { - if (HASH_NODE_EXIST(code)) { - *moved = true; - - SCH_TASK_ELOG("task already in execTask list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_TASK_ELOG("taosHashPut task to execTask list failed, errno:%d", errno); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - *moved = true; - - SCH_TASK_DLOG("task moved to execTask list, numOfTasks:%d", taosHashGetSize(pJob->execTasks)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bool *needRetry) { - int8_t status = 0; - ++pTask->tryTimes; - - if (schJobNeedToStop(pJob, &status)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry cause of job status, job status:%s", jobTaskStatusStr(status)); - return TSDB_CODE_SUCCESS; - } - - if (pTask->tryTimes >= REQUEST_MAX_TRY_TIMES) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since reach max try times, tryTimes:%d", pTask->tryTimes); - return TSDB_CODE_SUCCESS; - } - - if (!NEED_SCHEDULER_RETRY_ERROR(errCode)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry cause of errCode, errCode:%x - %s", errCode, tstrerror(errCode)); - return TSDB_CODE_SUCCESS; - } - - // TODO CHECK epList/condidateList - if (SCH_IS_DATA_SRC_TASK(pTask)) { - if (pTask->tryTimes >= SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since all ep tried, tryTimes:%d, epNum:%d", pTask->tryTimes, - SCH_TASK_NUM_OF_EPS(&pTask->plan->execNode)); - return TSDB_CODE_SUCCESS; - } - } else { - int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs); - - if ((pTask->candidateIdx + 1) >= candidateNum) { - *needRetry = false; - SCH_TASK_DLOG("task no more retry since all candiates tried, candidateIdx:%d, candidateNum:%d", - pTask->candidateIdx, candidateNum); - return TSDB_CODE_SUCCESS; - } - } - - *needRetry = true; - SCH_TASK_DLOG("task need the %dth retry, errCode:%x - %s", pTask->tryTimes, errCode, tstrerror(errCode)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) { - atomic_sub_fetch_32(&pTask->level->taskLaunchedNum, 1); - - SCH_ERR_RET(schRemoveTaskFromExecList(pJob, pTask)); - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_NOT_START); - - if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { - SCH_ERR_RET(schDecTaskFlowQuota(pJob, pTask)); - SCH_ERR_RET(schLaunchTasksInFlowCtrlList(pJob, pTask)); - } - - if (SCH_IS_DATA_SRC_TASK(pTask)) { - SCH_SWITCH_EPSET(&pTask->plan->execNode); - } else { - ++pTask->candidateIdx; - } - - SCH_ERR_RET(schLaunchTask(pJob, pTask)); - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { - int32_t code = 0; - SSchHbTrans *hb = NULL; - - hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); - } - - SCH_LOCK(SCH_WRITE, &hb->lock); - memcpy(&hb->trans, trans, sizeof(*trans)); - SCH_UNLOCK(SCH_WRITE, &hb->lock); - - qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, instance:%p, handle:%p", schMgmt.sId, - epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->transInst, trans->transHandle); - - return TSDB_CODE_SUCCESS; -} - -void schUpdateJobErrCode(SSchJob *pJob, int32_t errCode) { - if (TSDB_CODE_SUCCESS == errCode) { - return; - } - - int32_t origCode = atomic_load_32(&pJob->errCode); - if (TSDB_CODE_SUCCESS == origCode) { - if (origCode == atomic_val_compare_exchange_32(&pJob->errCode, origCode, errCode)) { - goto _return; - } - - origCode = atomic_load_32(&pJob->errCode); - } - - if (NEED_CLIENT_HANDLE_ERROR(origCode)) { - return; - } - - if (NEED_CLIENT_HANDLE_ERROR(errCode)) { - atomic_store_32(&pJob->errCode, errCode); - goto _return; - } - - return; - -_return: - - SCH_JOB_DLOG("job errCode updated to %x - %s", errCode, tstrerror(errCode)); -} - -int32_t schProcessOnJobFailureImpl(SSchJob *pJob, int32_t status, int32_t errCode) { - // if already FAILED, no more processing - SCH_ERR_RET(schCheckAndUpdateJobStatus(pJob, status)); - - schUpdateJobErrCode(pJob, errCode); - - if (atomic_load_8(&pJob->userFetch) || pJob->attr.syncSchedule) { - tsem_post(&pJob->rspSem); - } - - int32_t code = atomic_load_32(&pJob->errCode); - - SCH_JOB_DLOG("job failed with error: %s", tstrerror(code)); - - SCH_RET(code); -} - -// Note: no more task error processing, handled in function internal -int32_t schProcessOnJobFailure(SSchJob *pJob, int32_t errCode) { - SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_FAILED, errCode)); -} - -// Note: no more error processing, handled in function internal -int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode) { - SCH_RET(schProcessOnJobFailureImpl(pJob, JOB_TASK_STATUS_DROPPING, errCode)); -} - -// Note: no more task error processing, handled in function internal -int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) { - int32_t code = 0; - - SCH_ERR_RET(schCheckAndUpdateJobStatus(pJob, JOB_TASK_STATUS_PARTIAL_SUCCEED)); - - if (pJob->attr.syncSchedule) { - tsem_post(&pJob->rspSem); - } - - if (atomic_load_8(&pJob->userFetch)) { - SCH_ERR_JRET(schFetchFromRemote(pJob)); - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnJobFailure(pJob, code)); -} - -void schProcessOnDataFetched(SSchJob *job) { - atomic_val_compare_exchange_32(&job->remoteFetch, 1, 0); - tsem_post(&job->rspSem); -} - -// Note: no more task error processing, handled in function internal -int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode) { - int8_t status = 0; - - if (schJobNeedToStop(pJob, &status)) { - SCH_TASK_DLOG("task failed not processed cause of job status, job status:%s", jobTaskStatusStr(status)); - SCH_RET(atomic_load_32(&pJob->errCode)); - } - - bool needRetry = false; - bool moved = false; - int32_t taskDone = 0; - int32_t code = 0; - - SCH_TASK_DLOG("taskOnFailure, code:%s", tstrerror(errCode)); - - SCH_ERR_JRET(schTaskCheckSetRetry(pJob, pTask, errCode, &needRetry)); - - if (!needRetry) { - SCH_TASK_ELOG("task failed and no more retry, code:%s", tstrerror(errCode)); - - if (SCH_GET_TASK_STATUS(pTask) == JOB_TASK_STATUS_EXECUTING) { - SCH_ERR_JRET(schMoveTaskToFailList(pJob, pTask, &moved)); - } else { - SCH_TASK_ELOG("task not in executing list, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_FAILED); - - if (SCH_IS_WAIT_ALL_JOB(pJob)) { - SCH_LOCK(SCH_WRITE, &pTask->level->lock); - pTask->level->taskFailed++; - taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; - SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); - - schUpdateJobErrCode(pJob, errCode); - - if (taskDone < pTask->level->taskNum) { - SCH_TASK_DLOG("need to wait other tasks, doneNum:%d, allNum:%d", taskDone, pTask->level->taskNum); - SCH_RET(errCode); - } - } - } else { - SCH_ERR_JRET(schHandleTaskRetry(pJob, pTask)); - - return TSDB_CODE_SUCCESS; - } - -_return: - - SCH_RET(schProcessOnJobFailure(pJob, errCode)); -} - -int32_t schLaunchNextLevelTasks(SSchJob *pJob, SSchTask *pTask) { - if (!SCH_IS_QUERY_JOB(pJob)) { - return TSDB_CODE_SUCCESS; - } - - SSchLevel *pLevel = pTask->level; - int32_t doneNum = atomic_add_fetch_32(&pLevel->taskDoneNum, 1); - if (doneNum == pLevel->taskNum) { - pJob->levelIdx--; - - pLevel = taosArrayGet(pJob->levels, pJob->levelIdx); - for (int32_t i = 0; i < pLevel->taskNum; ++i) { - SSchTask *pTask = taosArrayGet(pLevel->subTasks, i); - - if (pTask->children && taosArrayGetSize(pTask->children) > 0) { - continue; - } - - SCH_ERR_RET(schLaunchTask(pJob, pTask)); - } - } - - return TSDB_CODE_SUCCESS; -} - - -// Note: no more task error processing, handled in function internal -int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { - bool moved = false; - int32_t code = 0; - - SCH_TASK_DLOG("taskOnSuccess, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - - SCH_ERR_JRET(schMoveTaskToSuccList(pJob, pTask, &moved)); - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_PARTIAL_SUCCEED); - - SCH_ERR_JRET(schRecordTaskSucceedNode(pJob, pTask)); - - SCH_ERR_JRET(schLaunchTasksInFlowCtrlList(pJob, pTask)); - - int32_t parentNum = pTask->parents ? (int32_t)taosArrayGetSize(pTask->parents) : 0; - if (parentNum == 0) { - int32_t taskDone = 0; - if (SCH_IS_WAIT_ALL_JOB(pJob)) { - SCH_LOCK(SCH_WRITE, &pTask->level->lock); - pTask->level->taskSucceed++; - taskDone = pTask->level->taskSucceed + pTask->level->taskFailed; - SCH_UNLOCK(SCH_WRITE, &pTask->level->lock); - - if (taskDone < pTask->level->taskNum) { - SCH_TASK_DLOG("wait all tasks, done:%d, all:%d", taskDone, pTask->level->taskNum); - return TSDB_CODE_SUCCESS; - } else if (taskDone > pTask->level->taskNum) { - SCH_TASK_ELOG("taskDone number invalid, done:%d, total:%d", taskDone, pTask->level->taskNum); - } - - if (pTask->level->taskFailed > 0) { - SCH_RET(schProcessOnJobFailure(pJob, 0)); - } else { - SCH_RET(schProcessOnJobPartialSuccess(pJob)); - } - } else { - pJob->resNode = pTask->succeedAddr; - } - - pJob->fetchTask = pTask; - - SCH_ERR_JRET(schMoveTaskToExecList(pJob, pTask, &moved)); - - SCH_RET(schProcessOnJobPartialSuccess(pJob)); - } - - /* - if (SCH_IS_DATA_SRC_TASK(task) && job->dataSrcEps.numOfEps < SCH_MAX_CANDIDATE_EP_NUM) { - strncpy(job->dataSrcEps.fqdn[job->dataSrcEps.numOfEps], task->execAddr.fqdn, sizeof(task->execAddr.fqdn)); - job->dataSrcEps.port[job->dataSrcEps.numOfEps] = task->execAddr.port; - - ++job->dataSrcEps.numOfEps; - } - */ - - for (int32_t i = 0; i < parentNum; ++i) { - SSchTask *par = *(SSchTask **)taosArrayGet(pTask->parents, i); - int32_t readyNum = atomic_add_fetch_32(&par->childReady, 1); - - SCH_LOCK(SCH_WRITE, &par->lock); - SDownstreamSourceNode source = {.type = QUERY_NODE_DOWNSTREAM_SOURCE, - .taskId = pTask->taskId, - .schedId = schMgmt.sId, - .addr = pTask->succeedAddr}; - qSetSubplanExecutionNode(par->plan, pTask->plan->id.groupId, &source); - SCH_UNLOCK(SCH_WRITE, &par->lock); - - if (SCH_TASK_READY_FOR_LAUNCH(readyNum, par)) { - SCH_ERR_RET(schLaunchTask(pJob, par)); - } - } - - SCH_ERR_RET(schLaunchNextLevelTasks(pJob, pTask)); - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnJobFailure(pJob, code)); -} - -// Note: no more error processing, handled in function internal -int32_t schFetchFromRemote(SSchJob *pJob) { - int32_t code = 0; - - if (atomic_val_compare_exchange_32(&pJob->remoteFetch, 0, 1) != 0) { - SCH_JOB_ELOG("prior fetching not finished, remoteFetch:%d", atomic_load_32(&pJob->remoteFetch)); - return TSDB_CODE_SUCCESS; - } - - void *resData = atomic_load_ptr(&pJob->resData); - if (resData) { - atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); - - SCH_JOB_DLOG("res already fetched, res:%p", resData); - return TSDB_CODE_SUCCESS; - } - - SCH_ERR_JRET(schBuildAndSendMsg(pJob, pJob->fetchTask, &pJob->resNode, TDMT_VND_FETCH)); - - return TSDB_CODE_SUCCESS; - -_return: - - atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); - - SCH_RET(schProcessOnTaskFailure(pJob, pJob->fetchTask, code)); -} - -int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp) { - SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed); - - atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows)); - atomic_store_ptr(&pJob->resData, pRsp); - - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); - - schProcessOnDataFetched(pJob); - - return TSDB_CODE_SUCCESS; -} - -int32_t schSaveJobQueryRes(SSchJob *pJob, SResReadyRsp *rsp) { - if (rsp->tbFName[0]) { - if (NULL == pJob->queryRes) { - pJob->queryRes = taosArrayInit(pJob->taskNum, sizeof(STbVerInfo)); - if (NULL == pJob->queryRes) { - SCH_ERR_RET(TSDB_CODE_OUT_OF_MEMORY); - } - } - - STbVerInfo tbInfo; - strcpy(tbInfo.tbFName, rsp->tbFName); - tbInfo.sversion = rsp->sversion; - tbInfo.tversion = rsp->tversion; - - taosArrayPush((SArray *)pJob->queryRes, &tbInfo); - } - - return TSDB_CODE_SUCCESS; -} - - -// Note: no more task error processing, handled in function internal -int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, char *msg, int32_t msgSize, - int32_t rspCode) { - int32_t code = 0; - int8_t status = 0; - - if (schJobNeedToStop(pJob, &status)) { - SCH_TASK_ELOG("rsp not processed cause of job status, job status:%s, rspCode:0x%x", jobTaskStatusStr(status), - rspCode); - SCH_RET(atomic_load_32(&pJob->errCode)); - } - - SCH_ERR_JRET(schValidateTaskReceivedMsgType(pJob, pTask, msgType)); - - switch (msgType) { - case TDMT_VND_CREATE_TABLE_RSP: { - SVCreateTbBatchRsp batchRsp = {0}; - if (msg) { - SDecoder coder = {0}; - tDecoderInit(&coder, msg, msgSize); - code = tDecodeSVCreateTbBatchRsp(&coder, &batchRsp); - if (TSDB_CODE_SUCCESS == code && batchRsp.nRsps > 0) { - for (int32_t i = 0; i < batchRsp.nRsps; ++i) { - SVCreateTbRsp *rsp = batchRsp.pRsps + i; - if (TSDB_CODE_SUCCESS != rsp->code) { - code = rsp->code; - tDecoderClear(&coder); - SCH_ERR_JRET(code); - } - } - } - tDecoderClear(&coder); - SCH_ERR_JRET(code); - } - - SCH_ERR_JRET(rspCode); - SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); - break; - } - case TDMT_VND_DROP_TABLE_RSP: { - SVDropTbBatchRsp batchRsp = {0}; - if (msg) { - SDecoder coder = {0}; - tDecoderInit(&coder, msg, msgSize); - code = tDecodeSVDropTbBatchRsp(&coder, &batchRsp); - if (TSDB_CODE_SUCCESS == code && batchRsp.nRsps > 0) { - for (int32_t i = 0; i < batchRsp.nRsps; ++i) { - SVDropTbRsp *rsp = batchRsp.pRsps + i; - if (TSDB_CODE_SUCCESS != rsp->code) { - code = rsp->code; - tDecoderClear(&coder); - SCH_ERR_JRET(code); - } - } - } - tDecoderClear(&coder); - SCH_ERR_JRET(code); - } - - SCH_ERR_JRET(rspCode); - SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); - break; - } - case TDMT_VND_ALTER_TABLE_RSP: { - SVAlterTbRsp rsp = {0}; - if (msg) { - SDecoder coder = {0}; - tDecoderInit(&coder, msg, msgSize); - code = tDecodeSVAlterTbRsp(&coder, &rsp); - tDecoderClear(&coder); - SCH_ERR_JRET(code); - SCH_ERR_JRET(rsp.code); - } - - SCH_ERR_JRET(rspCode); - - if (NULL == msg) { - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); - break; - } - case TDMT_VND_SUBMIT_RSP: { - SCH_ERR_JRET(rspCode); - - if (msg) { - SDecoder coder = {0}; - SSubmitRsp *rsp = taosMemoryMalloc(sizeof(*rsp)); - tDecoderInit(&coder, msg, msgSize); - code = tDecodeSSubmitRsp(&coder, rsp); - if (code) { - SCH_TASK_ELOG("decode submitRsp failed, code:%d", code); - tFreeSSubmitRsp(rsp); - SCH_ERR_JRET(code); - } - - if (rsp->nBlocks > 0) { - for (int32_t i = 0; i < rsp->nBlocks; ++i) { - SSubmitBlkRsp *blk = rsp->pBlocks + i; - if (TSDB_CODE_SUCCESS != blk->code) { - code = blk->code; - tFreeSSubmitRsp(rsp); - SCH_ERR_JRET(code); - } - } - } - - atomic_add_fetch_32(&pJob->resNumOfRows, rsp->affectedRows); - SCH_TASK_DLOG("submit succeed, affectedRows:%d", rsp->affectedRows); - - SCH_LOCK(SCH_WRITE, &pJob->resLock); - if (pJob->queryRes) { - SSubmitRsp *sum = pJob->queryRes; - sum->affectedRows += rsp->affectedRows; - sum->nBlocks += rsp->nBlocks; - sum->pBlocks = taosMemoryRealloc(sum->pBlocks, sum->nBlocks * sizeof(*sum->pBlocks)); - memcpy(sum->pBlocks + sum->nBlocks - rsp->nBlocks, rsp->pBlocks, rsp->nBlocks * sizeof(*sum->pBlocks)); - taosMemoryFree(rsp->pBlocks); - taosMemoryFree(rsp); - } else { - pJob->queryRes = rsp; - } - SCH_UNLOCK(SCH_WRITE, &pJob->resLock); - } - - SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); - - break; - } - case TDMT_VND_QUERY_RSP: { - SQueryTableRsp rsp = {0}; - if (msg) { - SCH_ERR_JRET(tDeserializeSQueryTableRsp(msg, msgSize, &rsp)); - SCH_ERR_JRET(rsp.code); - } - - SCH_ERR_JRET(rspCode); - - if (NULL == msg) { - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - // SCH_ERR_JRET(schBuildAndSendMsg(pJob, pTask, NULL, TDMT_VND_RES_READY)); - - break; - } - case TDMT_VND_RES_READY_RSP: { - SResReadyRsp *rsp = (SResReadyRsp *)msg; - - SCH_ERR_JRET(rspCode); - if (NULL == msg) { - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - SCH_ERR_JRET(rsp->code); - - SCH_ERR_JRET(schSaveJobQueryRes(pJob, rsp)); - - SCH_ERR_RET(schProcessOnTaskSuccess(pJob, pTask)); - - break; - } - case TDMT_VND_EXPLAIN_RSP: { - SCH_ERR_JRET(rspCode); - if (NULL == msg) { - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - if (!SCH_IS_EXPLAIN_JOB(pJob)) { - SCH_TASK_ELOG("invalid msg received for none explain query, msg type:%s", TMSG_INFO(msgType)); - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - if (pJob->resData) { - SCH_TASK_ELOG("explain result is already generated, res:%p", pJob->resData); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - SExplainRsp rsp = {0}; - if (tDeserializeSExplainRsp(msg, msgSize, &rsp)) { - taosMemoryFree(rsp.subplanInfo); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SRetrieveTableRsp *pRsp = NULL; - SCH_ERR_JRET(qExplainUpdateExecInfo(pJob->explainCtx, &rsp, pTask->plan->id.groupId, &pRsp)); - - if (pRsp) { - SCH_ERR_JRET(schProcessOnExplainDone(pJob, pTask, pRsp)); - } - break; - } - case TDMT_VND_FETCH_RSP: { - SRetrieveTableRsp *rsp = (SRetrieveTableRsp *)msg; - - SCH_ERR_JRET(rspCode); - if (NULL == msg) { - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - if (SCH_IS_EXPLAIN_JOB(pJob)) { - if (rsp->completed) { - SRetrieveTableRsp *pRsp = NULL; - SCH_ERR_JRET(qExecExplainEnd(pJob->explainCtx, &pRsp)); - if (pRsp) { - SCH_ERR_JRET(schProcessOnExplainDone(pJob, pTask, pRsp)); - } - - return TSDB_CODE_SUCCESS; - } - - atomic_val_compare_exchange_32(&pJob->remoteFetch, 1, 0); - - SCH_ERR_JRET(schFetchFromRemote(pJob)); - - return TSDB_CODE_SUCCESS; - } - - if (pJob->resData) { - SCH_TASK_ELOG("got fetch rsp while res already exists, res:%p", pJob->resData); - taosMemoryFreeClear(rsp); - SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); - } - - atomic_store_ptr(&pJob->resData, rsp); - atomic_add_fetch_32(&pJob->resNumOfRows, htonl(rsp->numOfRows)); - - if (rsp->completed) { - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCCEED); - } - - SCH_TASK_DLOG("got fetch rsp, rows:%d, complete:%d", htonl(rsp->numOfRows), rsp->completed); - - schProcessOnDataFetched(pJob); - break; - } - case TDMT_VND_DROP_TASK_RSP: { - // SHOULD NEVER REACH HERE - SCH_TASK_ELOG("invalid status to handle drop task rsp, refId:%" PRIx64, pJob->refId); - SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); - break; - } - case TDMT_SCH_LINK_BROKEN: - SCH_TASK_ELOG("link broken received, error:%x - %s", rspCode, tstrerror(rspCode)); - SCH_ERR_JRET(rspCode); - break; - default: - SCH_TASK_ELOG("unknown rsp msg, type:%d, status:%s", msgType, SCH_GET_TASK_STATUS_STR(pTask)); - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); -} - -int32_t schGetTaskFromTaskList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask) { - int32_t s = taosHashGetSize(pTaskList); - if (s <= 0) { - return TSDB_CODE_SUCCESS; - } - - SSchTask **task = taosHashGet(pTaskList, &taskId, sizeof(taskId)); - if (NULL == task || NULL == (*task)) { - return TSDB_CODE_SUCCESS; - } - - *pTask = *task; - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateTaskExecNodeHandle(SSchTask *pTask, void *handle, int32_t rspCode) { - if (rspCode || NULL == pTask->execNodes || taosArrayGetSize(pTask->execNodes) > 1 || - taosArrayGetSize(pTask->execNodes) <= 0) { - return TSDB_CODE_SUCCESS; - } - - SSchNodeInfo *nodeInfo = taosArrayGet(pTask->execNodes, 0); - nodeInfo->handle = handle; - - return TSDB_CODE_SUCCESS; -} - -int32_t schHandleCallback(void *param, const SDataBuf *pMsg, int32_t msgType, int32_t rspCode) { - int32_t code = 0; - SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; - SSchTask *pTask = NULL; - - SSchJob *pJob = schAcquireJob(pParam->refId); - if (NULL == pJob) { - qWarn("QID:0x%" PRIx64 ",TID:0x%" PRIx64 "taosAcquireRef job failed, may be dropped, refId:%" PRIx64, - pParam->queryId, pParam->taskId, pParam->refId); - SCH_ERR_JRET(TSDB_CODE_QRY_JOB_FREED); - } - - schGetTaskFromTaskList(pJob->execTasks, pParam->taskId, &pTask); - if (NULL == pTask) { - if (TDMT_VND_EXPLAIN_RSP == msgType) { - schGetTaskFromTaskList(pJob->succTasks, pParam->taskId, &pTask); - } else { - SCH_JOB_ELOG("task not found in execTask list, refId:%" PRIx64 ", taskId:%" PRIx64, pParam->refId, - pParam->taskId); - SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - } - - if (NULL == pTask) { - SCH_JOB_ELOG("task not found in execList & succList, refId:%" PRIx64 ", taskId:%" PRIx64, pParam->refId, - pParam->taskId); - SCH_ERR_JRET(TSDB_CODE_SCH_INTERNAL_ERROR); - } - - SCH_TASK_DLOG("rsp msg received, type:%s, handle:%p, code:%s", TMSG_INFO(msgType), pMsg->handle, tstrerror(rspCode)); - - SCH_SET_TASK_HANDLE(pTask, pMsg->handle); - schUpdateTaskExecNodeHandle(pTask, pMsg->handle, rspCode); - SCH_ERR_JRET(schHandleResponseMsg(pJob, pTask, msgType, pMsg->pData, pMsg->len, rspCode)); - -_return: - if (pJob) { - schReleaseJob(pParam->refId); - } - - taosMemoryFreeClear(param); - SCH_RET(code); -} - -int32_t schHandleSubmitCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_SUBMIT_RSP, code); -} - -int32_t schHandleCreateTableCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_CREATE_TABLE_RSP, code); -} - -int32_t schHandleDropTableCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_DROP_TABLE_RSP, code); -} - -int32_t schHandleAlterTableCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_ALTER_TABLE_RSP, code); -} - -int32_t schHandleQueryCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_QUERY_RSP, code); -} - -int32_t schHandleFetchCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_FETCH_RSP, code); -} - -int32_t schHandleReadyCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_RES_READY_RSP, code); -} - -int32_t schHandleExplainCallback(void *param, const SDataBuf *pMsg, int32_t code) { - return schHandleCallback(param, pMsg, TDMT_VND_EXPLAIN_RSP, code); -} - -int32_t schHandleDropCallback(void *param, const SDataBuf *pMsg, int32_t code) { - SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; - qDebug("QID:%" PRIx64 ",TID:%" PRIx64 " drop task rsp received, code:%x", pParam->queryId, pParam->taskId, code); - return TSDB_CODE_SUCCESS; -} - -int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code) { - SSchedulerHbRsp rsp = {0}; - SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; - - if (code) { - qError("hb rsp error:%s", tstrerror(code)); - SCH_ERR_JRET(code); - } - - if (tDeserializeSSchedulerHbRsp(pMsg->pData, pMsg->len, &rsp)) { - qError("invalid hb rsp msg, size:%d", pMsg->len); - SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); - } - - SSchTrans trans = {0}; - trans.transInst = pParam->transport; - trans.transHandle = pMsg->handle; - - SCH_ERR_JRET(schUpdateHbConnection(&rsp.epId, &trans)); - - int32_t taskNum = (int32_t)taosArrayGetSize(rsp.taskStatus); - qDebug("%d task status in hb rsp, nodeId:%d, fqdn:%s, port:%d", taskNum, rsp.epId.nodeId, rsp.epId.ep.fqdn, - rsp.epId.ep.port); - - for (int32_t i = 0; i < taskNum; ++i) { - STaskStatus *taskStatus = taosArrayGet(rsp.taskStatus, i); - - SSchJob *pJob = schAcquireJob(taskStatus->refId); - if (NULL == pJob) { - qWarn("job not found, refId:0x%" PRIx64 ",QID:0x%" PRIx64 ",TID:0x%" PRIx64, taskStatus->refId, - taskStatus->queryId, taskStatus->taskId); - // TODO DROP TASK FROM SERVER!!!! - continue; - } - - // TODO - - SCH_JOB_DLOG("TID:0x%" PRIx64 " task status in server: %s", taskStatus->taskId, - jobTaskStatusStr(taskStatus->status)); - - schReleaseJob(taskStatus->refId); - } - -_return: - - tFreeSSchedulerHbRsp(&rsp); - taosMemoryFree(param); - - SCH_RET(code); -} - -int32_t schHandleLinkBrokenCallback(void *param, const SDataBuf *pMsg, int32_t code) { - SSchCallbackParamHeader *head = (SSchCallbackParamHeader *)param; - rpcReleaseHandle(pMsg->handle, TAOS_CONN_CLIENT); - - qDebug("handle %p is broken", pMsg->handle); - - if (head->isHbParam) { - SSchHbCallbackParam *hbParam = (SSchHbCallbackParam *)param; - SSchTrans trans = {.transInst = hbParam->transport, .transHandle = NULL}; - SCH_ERR_RET(schUpdateHbConnection(&hbParam->nodeEpId, &trans)); - - SCH_ERR_RET(schBuildAndSendHbMsg(&hbParam->nodeEpId)); - } else { - SCH_ERR_RET(schHandleCallback(param, pMsg, TDMT_SCH_LINK_BROKEN, code)); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp) { - switch (msgType) { - case TDMT_VND_CREATE_TABLE: - *fp = schHandleCreateTableCallback; - break; - case TDMT_VND_DROP_TABLE: - *fp = schHandleDropTableCallback; - break; - case TDMT_VND_ALTER_TABLE: - *fp = schHandleAlterTableCallback; - break; - case TDMT_VND_SUBMIT: - *fp = schHandleSubmitCallback; - break; - case TDMT_VND_QUERY: - *fp = schHandleQueryCallback; - break; - case TDMT_VND_RES_READY: - *fp = schHandleReadyCallback; - break; - case TDMT_VND_EXPLAIN: - *fp = schHandleExplainCallback; - break; - case TDMT_VND_FETCH: - *fp = schHandleFetchCallback; - break; - case TDMT_VND_DROP_TASK: - *fp = schHandleDropCallback; - break; - case TDMT_VND_QUERY_HEARTBEAT: - *fp = schHandleHbCallback; - break; - case TDMT_SCH_LINK_BROKEN: - *fp = schHandleLinkBrokenCallback; - break; - default: - qError("unknown msg type for callback, msgType:%d", msgType); - SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schGenerateTaskCallBackAHandle(SSchJob *pJob, SSchTask *pTask, int32_t msgType, SMsgSendInfo **pMsgSendInfo) { - int32_t code = 0; - SMsgSendInfo *msgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); - if (NULL == msgSendInfo) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); - if (NULL == param) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchTaskCallbackParam)); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - __async_send_cb_fn_t fp = NULL; - SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); - - param->queryId = pJob->queryId; - param->refId = pJob->refId; - param->taskId = SCH_TASK_ID(pTask); - param->transport = pJob->transport; - - msgSendInfo->param = param; - msgSendInfo->fp = fp; - - *pMsgSendInfo = msgSendInfo; - - return TSDB_CODE_SUCCESS; - -_return: - - taosMemoryFree(param); - taosMemoryFree(msgSendInfo); - - SCH_RET(code); -} - -void schFreeRpcCtxVal(const void *arg) { - if (NULL == arg) { - return; - } - - SMsgSendInfo *pMsgSendInfo = (SMsgSendInfo *)arg; - taosMemoryFreeClear(pMsgSendInfo->param); - taosMemoryFreeClear(pMsgSendInfo); -} - -int32_t schMakeTaskCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam) { - SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); - if (NULL == param) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchTaskCallbackParam)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - param->queryId = pJob->queryId; - param->refId = pJob->refId; - param->taskId = SCH_TASK_ID(pTask); - param->transport = pJob->transport; - - *pParam = param; - - return TSDB_CODE_SUCCESS; -} - -int32_t schMakeHbCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam) { - SSchHbCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchHbCallbackParam)); - if (NULL == param) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchHbCallbackParam)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - param->head.isHbParam = true; - - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - - param->nodeEpId.nodeId = addr->nodeId; - memcpy(¶m->nodeEpId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); - param->transport = pJob->transport; - - *pParam = param; - - return TSDB_CODE_SUCCESS; -} - -int32_t schMakeBrokenLinkVal(SSchJob *pJob, SSchTask *pTask, SRpcBrokenlinkVal *brokenVal, bool isHb) { - int32_t code = 0; - SMsgSendInfo *pMsgSendInfo = NULL; - - pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); - if (NULL == pMsgSendInfo) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - if (isHb) { - SCH_ERR_JRET(schMakeHbCallbackParam(pJob, pTask, &pMsgSendInfo->param)); - } else { - SCH_ERR_JRET(schMakeTaskCallbackParam(pJob, pTask, &pMsgSendInfo->param)); - } - - int32_t msgType = TDMT_SCH_LINK_BROKEN; - __async_send_cb_fn_t fp = NULL; - SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); - - pMsgSendInfo->fp = fp; - - brokenVal->msgType = msgType; - brokenVal->val = pMsgSendInfo; - brokenVal->clone = schCloneSMsgSendInfo; - brokenVal->freeFunc = schFreeRpcCtxVal; - - return TSDB_CODE_SUCCESS; - -_return: - - taosMemoryFreeClear(pMsgSendInfo->param); - taosMemoryFreeClear(pMsgSendInfo); - - SCH_RET(code); -} - -int32_t schMakeQueryRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx) { - int32_t code = 0; - SMsgSendInfo *pReadyMsgSendInfo = NULL; - SMsgSendInfo *pExplainMsgSendInfo = NULL; - - pCtx->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); - if (NULL == pCtx->args) { - SCH_TASK_ELOG("taosHashInit %d RpcCtx failed", 1); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_ERR_JRET(schGenerateTaskCallBackAHandle(pJob, pTask, TDMT_VND_RES_READY, &pReadyMsgSendInfo)); - SCH_ERR_JRET(schGenerateTaskCallBackAHandle(pJob, pTask, TDMT_VND_EXPLAIN, &pExplainMsgSendInfo)); - - int32_t msgType = TDMT_VND_RES_READY_RSP; - SRpcCtxVal ctxVal = {.val = pReadyMsgSendInfo, .clone = schCloneSMsgSendInfo, .freeFunc = schFreeRpcCtxVal}; - if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { - SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - msgType = TDMT_VND_EXPLAIN_RSP; - ctxVal.val = pExplainMsgSendInfo; - if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { - SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_ERR_JRET(schMakeBrokenLinkVal(pJob, pTask, &pCtx->brokenVal, false)); - - return TSDB_CODE_SUCCESS; - -_return: - - taosHashCleanup(pCtx->args); - - if (pReadyMsgSendInfo) { - taosMemoryFreeClear(pReadyMsgSendInfo->param); - taosMemoryFreeClear(pReadyMsgSendInfo); - } - - if (pExplainMsgSendInfo) { - taosMemoryFreeClear(pExplainMsgSendInfo->param); - taosMemoryFreeClear(pExplainMsgSendInfo); - } - - SCH_RET(code); -} - -int32_t schMakeHbRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx) { - int32_t code = 0; - SSchHbCallbackParam *param = NULL; - SMsgSendInfo *pMsgSendInfo = NULL; - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - memcpy(&epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); - - pCtx->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); - if (NULL == pCtx->args) { - SCH_TASK_ELOG("taosHashInit %d RpcCtx failed", 1); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); - if (NULL == pMsgSendInfo) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SMsgSendInfo)); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - param = taosMemoryCalloc(1, sizeof(SSchHbCallbackParam)); - if (NULL == param) { - SCH_TASK_ELOG("calloc %d failed", (int32_t)sizeof(SSchHbCallbackParam)); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - int32_t msgType = TDMT_VND_QUERY_HEARTBEAT_RSP; - __async_send_cb_fn_t fp = NULL; - SCH_ERR_JRET(schGetCallbackFp(TDMT_VND_QUERY_HEARTBEAT, &fp)); - - param->nodeEpId = epId; - param->transport = pJob->transport; - - pMsgSendInfo->param = param; - pMsgSendInfo->fp = fp; - - SRpcCtxVal ctxVal = {.val = pMsgSendInfo, .clone = schCloneSMsgSendInfo, .freeFunc = schFreeRpcCtxVal}; - if (taosHashPut(pCtx->args, &msgType, sizeof(msgType), &ctxVal, sizeof(ctxVal))) { - SCH_TASK_ELOG("taosHashPut msg %d to rpcCtx failed", msgType); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SCH_ERR_JRET(schMakeBrokenLinkVal(pJob, pTask, &pCtx->brokenVal, true)); - - return TSDB_CODE_SUCCESS; - -_return: - - taosHashCleanup(pCtx->args); - taosMemoryFreeClear(param); - taosMemoryFreeClear(pMsgSendInfo); - - SCH_RET(code); -} - -int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { - int32_t code = 0; - SSchHbTrans hb = {0}; - - hb.trans.transInst = pJob->transport; - - SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); - - code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); - if (code) { - schFreeRpcCtx(&hb.rpcCtx); - - if (HASH_NODE_EXIST(code)) { - *exist = true; - return TSDB_CODE_SUCCESS; - } - - qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(code); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schCloneCallbackParam(SSchCallbackParamHeader *pSrc, SSchCallbackParamHeader **pDst) { - if (pSrc->isHbParam) { - SSchHbCallbackParam *dst = taosMemoryMalloc(sizeof(SSchHbCallbackParam)); - if (NULL == dst) { - qError("malloc SSchHbCallbackParam failed"); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - memcpy(dst, pSrc, sizeof(*dst)); - *pDst = (SSchCallbackParamHeader *)dst; - - return TSDB_CODE_SUCCESS; - } - - SSchTaskCallbackParam *dst = taosMemoryMalloc(sizeof(SSchTaskCallbackParam)); - if (NULL == dst) { - qError("malloc SSchTaskCallbackParam failed"); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - memcpy(dst, pSrc, sizeof(*dst)); - *pDst = (SSchCallbackParamHeader *)dst; - - return TSDB_CODE_SUCCESS; -} - -int32_t schCloneSMsgSendInfo(void *src, void **dst) { - SMsgSendInfo *pSrc = src; - int32_t code = 0; - SMsgSendInfo *pDst = taosMemoryMalloc(sizeof(*pSrc)); - if (NULL == pDst) { - qError("malloc SMsgSendInfo for rpcCtx failed, len:%d", (int32_t)sizeof(*pSrc)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - memcpy(pDst, pSrc, sizeof(*pSrc)); - pDst->param = NULL; - - SCH_ERR_JRET(schCloneCallbackParam(pSrc->param, (SSchCallbackParamHeader **)&pDst->param)); - - *dst = pDst; - - return TSDB_CODE_SUCCESS; - -_return: - - taosMemoryFreeClear(pDst); - SCH_RET(code); -} - -int32_t schCloneHbRpcCtx(SRpcCtx *pSrc, SRpcCtx *pDst) { - int32_t code = 0; - memcpy(&pDst->brokenVal, &pSrc->brokenVal, sizeof(pSrc->brokenVal)); - pDst->brokenVal.val = NULL; - - SCH_ERR_RET(schCloneSMsgSendInfo(pSrc->brokenVal.val, &pDst->brokenVal.val)); - - pDst->args = taosHashInit(1, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); - if (NULL == pDst->args) { - qError("taosHashInit %d RpcCtx failed", 1); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SRpcCtxVal dst = {0}; - void *pIter = taosHashIterate(pSrc->args, NULL); - while (pIter) { - SRpcCtxVal *pVal = (SRpcCtxVal *)pIter; - int32_t *msgType = taosHashGetKey(pIter, NULL); - - dst = *pVal; - dst.val = NULL; - - SCH_ERR_JRET(schCloneSMsgSendInfo(pVal->val, &dst.val)); - - if (taosHashPut(pDst->args, msgType, sizeof(*msgType), &dst, sizeof(dst))) { - qError("taosHashPut msg %d to rpcCtx failed", *msgType); - (*dst.freeFunc)(dst.val); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pIter = taosHashIterate(pSrc->args, pIter); - } - - return TSDB_CODE_SUCCESS; - -_return: - - schFreeRpcCtx(pDst); - SCH_RET(code); -} - -int32_t schAsyncSendMsg(SSchJob *pJob, SSchTask *pTask, void *transport, SEpSet *epSet, int32_t msgType, void *msg, - uint32_t msgSize, bool persistHandle, SRpcCtx *ctx) { - int32_t code = 0; - - SSchTrans *trans = (SSchTrans *)transport; - - SMsgSendInfo *pMsgSendInfo = NULL; - SCH_ERR_JRET(schGenerateTaskCallBackAHandle(pJob, pTask, msgType, &pMsgSendInfo)); - - pMsgSendInfo->msgInfo.pData = msg; - pMsgSendInfo->msgInfo.len = msgSize; - pMsgSendInfo->msgInfo.handle = trans->transHandle; - pMsgSendInfo->msgType = msgType; - - qDebug("start to send %s msg to node[%d,%s,%d], refId:%" PRIx64 "instance:%p, handle:%p", TMSG_INFO(msgType), - ntohl(((SMsgHead *)msg)->vgId), epSet->eps[epSet->inUse].fqdn, epSet->eps[epSet->inUse].port, pJob->refId, - trans->transInst, trans->transHandle); - - int64_t transporterId = 0; - code = asyncSendMsgToServerExt(trans->transInst, epSet, &transporterId, pMsgSendInfo, persistHandle, ctx); - if (code) { - SCH_ERR_JRET(code); - } - - SCH_TASK_DLOG("req msg sent, refId:%" PRIx64 ", type:%d, %s", pJob->refId, msgType, TMSG_INFO(msgType)); - return TSDB_CODE_SUCCESS; - -_return: - - if (pMsgSendInfo) { - taosMemoryFreeClear(pMsgSendInfo->param); - taosMemoryFreeClear(pMsgSendInfo); - } - - SCH_RET(code); -} - -int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId) { - SSchedulerHbReq req = {0}; - int32_t code = 0; - SRpcCtx rpcCtx = {0}; - SSchTrans trans = {0}; - int32_t msgType = TDMT_VND_QUERY_HEARTBEAT; - - req.header.vgId = nodeEpId->nodeId; - req.sId = schMgmt.sId; - memcpy(&req.epId, nodeEpId, sizeof(SQueryNodeEpId)); - - SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, nodeEpId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, - nodeEpId->ep.port); - SCH_ERR_RET(code); - } - - SCH_LOCK(SCH_WRITE, &hb->lock); - code = schCloneHbRpcCtx(&hb->rpcCtx, &rpcCtx); - memcpy(&trans, &hb->trans, sizeof(trans)); - SCH_UNLOCK(SCH_WRITE, &hb->lock); - - SCH_ERR_RET(code); - - int32_t msgSize = tSerializeSSchedulerHbReq(NULL, 0, &req); - if (msgSize < 0) { - qError("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - void *msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - qError("calloc hb req %d failed", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - if (tSerializeSSchedulerHbReq(msg, msgSize, &req) < 0) { - qError("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SMsgSendInfo *pMsgSendInfo = taosMemoryCalloc(1, sizeof(SMsgSendInfo)); - if (NULL == pMsgSendInfo) { - qError("calloc SMsgSendInfo failed"); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SSchTaskCallbackParam *param = taosMemoryCalloc(1, sizeof(SSchTaskCallbackParam)); - if (NULL == param) { - qError("calloc SSchTaskCallbackParam failed"); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - __async_send_cb_fn_t fp = NULL; - SCH_ERR_JRET(schGetCallbackFp(msgType, &fp)); - - param->transport = trans.transInst; - - pMsgSendInfo->param = param; - pMsgSendInfo->msgInfo.pData = msg; - pMsgSendInfo->msgInfo.len = msgSize; - pMsgSendInfo->msgInfo.handle = trans.transHandle; - pMsgSendInfo->msgType = msgType; - pMsgSendInfo->fp = fp; - - int64_t transporterId = 0; - SEpSet epSet = {.inUse = 0, .numOfEps = 1}; - memcpy(&epSet.eps[0], &nodeEpId->ep, sizeof(nodeEpId->ep)); - - qDebug("start to send hb msg, instance:%p, handle:%p, fqdn:%s, port:%d", trans.transInst, trans.transHandle, - nodeEpId->ep.fqdn, nodeEpId->ep.port); - - code = asyncSendMsgToServerExt(trans.transInst, &epSet, &transporterId, pMsgSendInfo, true, &rpcCtx); - if (code) { - qError("fail to send hb msg, instance:%p, handle:%p, fqdn:%s, port:%d, error:%x - %s", trans.transInst, - trans.transHandle, nodeEpId->ep.fqdn, nodeEpId->ep.port, code, tstrerror(code)); - SCH_ERR_JRET(code); - } - - qDebug("hb msg sent"); - return TSDB_CODE_SUCCESS; - -_return: - - taosMemoryFreeClear(msg); - taosMemoryFreeClear(param); - taosMemoryFreeClear(pMsgSendInfo); - schFreeRpcCtx(&rpcCtx); - SCH_RET(code); -} - -int32_t schBuildAndSendMsg(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, int32_t msgType) { - uint32_t msgSize = 0; - void *msg = NULL; - int32_t code = 0; - bool isCandidateAddr = false; - bool persistHandle = false; - SRpcCtx rpcCtx = {0}; - - if (NULL == addr) { - addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - isCandidateAddr = true; - } - - SEpSet epSet = addr->epSet; - - switch (msgType) { - case TDMT_VND_CREATE_TABLE: - case TDMT_VND_DROP_TABLE: - case TDMT_VND_ALTER_TABLE: - case TDMT_VND_SUBMIT: { - msgSize = pTask->msgLen; - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_TASK_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - memcpy(msg, pTask->msg, msgSize); - break; - } - - case TDMT_VND_QUERY: { - SCH_ERR_RET(schMakeQueryRpcCtx(pJob, pTask, &rpcCtx)); - - uint32_t len = strlen(pJob->sql); - msgSize = sizeof(SSubQueryMsg) + pTask->msgLen + len; - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_TASK_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SSubQueryMsg *pMsg = msg; - pMsg->header.vgId = htonl(addr->nodeId); - pMsg->sId = htobe64(schMgmt.sId); - pMsg->queryId = htobe64(pJob->queryId); - pMsg->taskId = htobe64(pTask->taskId); - pMsg->refId = htobe64(pJob->refId); - pMsg->taskType = TASK_TYPE_TEMP; - pMsg->explain = SCH_IS_EXPLAIN_JOB(pJob); - pMsg->phyLen = htonl(pTask->msgLen); - pMsg->sqlLen = htonl(len); - - memcpy(pMsg->msg, pJob->sql, len); - memcpy(pMsg->msg + len, pTask->msg, pTask->msgLen); - - persistHandle = true; - break; - } - - case TDMT_VND_RES_READY: { - msgSize = sizeof(SResReadyReq); - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_TASK_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SResReadyReq *pMsg = msg; - - pMsg->header.vgId = htonl(addr->nodeId); - - pMsg->sId = htobe64(schMgmt.sId); - pMsg->queryId = htobe64(pJob->queryId); - pMsg->taskId = htobe64(pTask->taskId); - break; - } - case TDMT_VND_FETCH: { - msgSize = sizeof(SResFetchReq); - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_TASK_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SResFetchReq *pMsg = msg; - - pMsg->header.vgId = htonl(addr->nodeId); - - pMsg->sId = htobe64(schMgmt.sId); - pMsg->queryId = htobe64(pJob->queryId); - pMsg->taskId = htobe64(pTask->taskId); - - break; - } - case TDMT_VND_DROP_TASK: { - msgSize = sizeof(STaskDropReq); - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_TASK_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - STaskDropReq *pMsg = msg; - - pMsg->header.vgId = htonl(addr->nodeId); - - pMsg->sId = htobe64(schMgmt.sId); - pMsg->queryId = htobe64(pJob->queryId); - pMsg->taskId = htobe64(pTask->taskId); - pMsg->refId = htobe64(pJob->refId); - break; - } - case TDMT_VND_QUERY_HEARTBEAT: { - SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &rpcCtx)); - - SSchedulerHbReq req = {0}; - req.sId = schMgmt.sId; - req.header.vgId = addr->nodeId; - req.epId.nodeId = addr->nodeId; - memcpy(&req.epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); - - msgSize = tSerializeSSchedulerHbReq(NULL, 0, &req); - if (msgSize < 0) { - SCH_JOB_ELOG("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - msg = taosMemoryCalloc(1, msgSize); - if (NULL == msg) { - SCH_JOB_ELOG("calloc %d failed", msgSize); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - if (tSerializeSSchedulerHbReq(msg, msgSize, &req) < 0) { - SCH_JOB_ELOG("tSerializeSSchedulerHbReq hbReq failed, size:%d", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - persistHandle = true; - break; - } - default: - SCH_TASK_ELOG("unknown msg type to send, msgType:%d", msgType); - SCH_ERR_RET(TSDB_CODE_SCH_INTERNAL_ERROR); - break; - } - - SCH_SET_TASK_LASTMSG_TYPE(pTask, msgType); - - SSchTrans trans = {.transInst = pJob->transport, .transHandle = SCH_GET_TASK_HANDLE(pTask)}; - SCH_ERR_JRET(schAsyncSendMsg(pJob, pTask, &trans, &epSet, msgType, msg, msgSize, persistHandle, - (rpcCtx.args ? &rpcCtx : NULL))); - - if (msgType == TDMT_VND_QUERY) { - SCH_ERR_RET(schRecordTaskExecNode(pJob, pTask, addr, trans.transHandle)); - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_SET_TASK_LASTMSG_TYPE(pTask, -1); - schFreeRpcCtx(&rpcCtx); - - taosMemoryFreeClear(msg); - SCH_RET(code); -} - -int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - memcpy(&epId.ep, SCH_GET_CUR_EP(addr), sizeof(SEp)); - -#if 1 - SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - bool exist = false; - SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId, &exist)); - if (!exist) { - SCH_ERR_RET(schBuildAndSendHbMsg(&epId)); - } - } -#endif - - return TSDB_CODE_SUCCESS; -} - -int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { - int8_t status = 0; - int32_t code = 0; - - atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); - - if (schJobNeedToStop(pJob, &status)) { - SCH_TASK_DLOG("no need to launch task cause of job status, job status:%s", jobTaskStatusStr(status)); - - SCH_RET(atomic_load_32(&pJob->errCode)); - } - - // NOTE: race condition: the task should be put into the hash table before send msg to server - if (SCH_GET_TASK_STATUS(pTask) != JOB_TASK_STATUS_EXECUTING) { - SCH_ERR_RET(schPushTaskToExecList(pJob, pTask)); - SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_EXECUTING); - } - - SSubplan *plan = pTask->plan; - - if (NULL == pTask->msg) { // TODO add more detailed reason for failure - code = qSubPlanToString(plan, &pTask->msg, &pTask->msgLen); - if (TSDB_CODE_SUCCESS != code) { - SCH_TASK_ELOG("failed to create physical plan, code:%s, msg:%p, len:%d", tstrerror(code), pTask->msg, - pTask->msgLen); - SCH_ERR_RET(code); - } else { - SCH_TASK_DLOGL("physical plan len:%d, %s", pTask->msgLen, pTask->msg); - } - } - - SCH_ERR_RET(schSetTaskCandidateAddrs(pJob, pTask)); - - if (SCH_IS_QUERY_JOB(pJob)) { - SCH_ERR_RET(schEnsureHbConnection(pJob, pTask)); - } - - SCH_ERR_RET(schBuildAndSendMsg(pJob, pTask, NULL, plan->msgType)); - - return TSDB_CODE_SUCCESS; -} - -// Note: no more error processing, handled in function internal -int32_t schLaunchTask(SSchJob *pJob, SSchTask *pTask) { - bool enough = false; - int32_t code = 0; - - SCH_SET_TASK_HANDLE(pTask, NULL); - - if (SCH_TASK_NEED_FLOW_CTRL(pJob, pTask)) { - SCH_ERR_JRET(schCheckIncTaskFlowQuota(pJob, pTask, &enough)); - - if (enough) { - SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); - } - } else { - SCH_ERR_JRET(schLaunchTaskImpl(pJob, pTask)); - } - - return TSDB_CODE_SUCCESS; - -_return: - - SCH_RET(schProcessOnTaskFailure(pJob, pTask, code)); -} - -int32_t schLaunchLevelTasks(SSchJob *pJob, SSchLevel *level) { - for (int32_t i = 0; i < level->taskNum; ++i) { - SSchTask *pTask = taosArrayGet(level->subTasks, i); - - SCH_ERR_RET(schLaunchTask(pJob, pTask)); - } - - return TSDB_CODE_SUCCESS; -} - -int32_t schLaunchJob(SSchJob *pJob) { - SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx); - - SCH_ERR_RET(schCheckAndUpdateJobStatus(pJob, JOB_TASK_STATUS_EXECUTING)); - - SCH_ERR_RET(schCheckJobNeedFlowCtrl(pJob, level)); - - SCH_ERR_RET(schLaunchLevelTasks(pJob, level)); - - return TSDB_CODE_SUCCESS; -} - -void schDropTaskOnExecutedNode(SSchJob *pJob, SSchTask *pTask) { - if (NULL == pTask->execNodes) { - SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - return; - } - - int32_t size = (int32_t)taosArrayGetSize(pTask->execNodes); - - if (size <= 0) { - SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask)); - return; - } - - SSchNodeInfo *nodeInfo = NULL; - for (int32_t i = 0; i < size; ++i) { - nodeInfo = (SSchNodeInfo *)taosArrayGet(pTask->execNodes, i); - SCH_SET_TASK_HANDLE(pTask, nodeInfo->handle); - - schBuildAndSendMsg(pJob, pTask, &nodeInfo->addr, TDMT_VND_DROP_TASK); - } - - SCH_TASK_DLOG("task has %d exec address", size); -} - -void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) { - if (!SCH_IS_NEED_DROP_JOB(pJob)) { - return; - } - - void *pIter = taosHashIterate(list, NULL); - while (pIter) { - SSchTask *pTask = *(SSchTask **)pIter; - - schDropTaskOnExecutedNode(pJob, pTask); - - pIter = taosHashIterate(list, pIter); - } -} - -void schDropJobAllTasks(SSchJob *pJob) { - schDropTaskInHashList(pJob, pJob->execTasks); - schDropTaskInHashList(pJob, pJob->succTasks); - schDropTaskInHashList(pJob, pJob->failTasks); -} - -int32_t schCancelJob(SSchJob *pJob) { - // TODO - return TSDB_CODE_SUCCESS; - // TODO MOVE ALL TASKS FROM EXEC LIST TO FAIL LIST -} - -void schCloseJobRef(void) { - if (!atomic_load_8((int8_t *)&schMgmt.exit)) { - return; - } - - SCH_LOCK(SCH_WRITE, &schMgmt.lock); - if (atomic_load_32(&schMgmt.jobNum) <= 0 && schMgmt.jobRef >= 0) { - taosCloseRef(schMgmt.jobRef); - schMgmt.jobRef = -1; - } - SCH_UNLOCK(SCH_WRITE, &schMgmt.lock); -} - -void schFreeJobImpl(void *job) { - if (NULL == job) { - return; - } - - SSchJob *pJob = job; - uint64_t queryId = pJob->queryId; - int64_t refId = pJob->refId; - - if (pJob->status == JOB_TASK_STATUS_EXECUTING) { - schCancelJob(pJob); - } - - schDropJobAllTasks(pJob); - - pJob->subPlans = NULL; // it is a reference to pDag->pSubplans - - int32_t numOfLevels = taosArrayGetSize(pJob->levels); - for (int32_t i = 0; i < numOfLevels; ++i) { - SSchLevel *pLevel = taosArrayGet(pJob->levels, i); - - int32_t numOfTasks = taosArrayGetSize(pLevel->subTasks); - for (int32_t j = 0; j < numOfTasks; ++j) { - SSchTask *pTask = taosArrayGet(pLevel->subTasks, j); - schFreeTask(pTask); - } - - taosArrayDestroy(pLevel->subTasks); - } - - schFreeFlowCtrl(pJob); - - taosHashCleanup(pJob->execTasks); - taosHashCleanup(pJob->failTasks); - taosHashCleanup(pJob->succTasks); - - taosArrayDestroy(pJob->levels); - taosArrayDestroy(pJob->nodeList); - taosArrayDestroy(pJob->dataSrcTasks); - - qExplainFreeCtx(pJob->explainCtx); - - if (SCH_IS_QUERY_JOB(pJob)) { - taosArrayDestroy((SArray *)pJob->queryRes); - } else { - tFreeSSubmitRsp((SSubmitRsp*)pJob->queryRes); - } - - taosMemoryFreeClear(pJob->resData); - taosMemoryFreeClear(pJob); - - qDebug("QID:0x%" PRIx64 " job freed, refId:%" PRIx64 ", pointer:%p", queryId, refId, pJob); - - atomic_sub_fetch_32(&schMgmt.jobNum, 1); - - schCloseJobRef(); -} - -static int32_t schExecJobImpl(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, - int64_t startTs, bool syncSchedule) { - qDebug("QID:0x%" PRIx64 " job started", pDag->queryId); - - if (pNodeList == NULL || taosArrayGetSize(pNodeList) <= 0) { - qDebug("QID:0x%" PRIx64 " input exec nodeList is empty", pDag->queryId); - } - - int32_t code = 0; - SSchJob *pJob = NULL; - SCH_ERR_JRET(schInitJob(&pJob, pDag, transport, pNodeList, sql, startTs, syncSchedule)); - - SCH_ERR_JRET(schLaunchJob(pJob)); - - *job = pJob->refId; - - if (syncSchedule) { - SCH_JOB_DLOG("will wait for rsp now, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); - tsem_wait(&pJob->rspSem); - } - - SCH_JOB_DLOG("job exec done, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); - - schReleaseJob(pJob->refId); - - return TSDB_CODE_SUCCESS; - -_return: - - schFreeJobImpl(pJob); - SCH_RET(code); -} - -int32_t schExecStaticExplain(void *transport, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, - bool syncSchedule) { - qDebug("QID:0x%" PRIx64 " job started", pDag->queryId); - - int32_t code = 0; - SSchJob *pJob = taosMemoryCalloc(1, sizeof(SSchJob)); - if (NULL == pJob) { - qError("QID:%" PRIx64 " calloc %d failed", pDag->queryId, (int32_t)sizeof(SSchJob)); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - pJob->sql = sql; - pJob->attr.queryJob = true; - pJob->attr.explainMode = pDag->explainInfo.mode; - pJob->queryId = pDag->queryId; - pJob->subPlans = pDag->pSubplans; - - SCH_ERR_JRET(qExecStaticExplain(pDag, (SRetrieveTableRsp **)&pJob->resData)); - - int64_t refId = taosAddRef(schMgmt.jobRef, pJob); - if (refId < 0) { - SCH_JOB_ELOG("taosAddRef job failed, error:%s", tstrerror(terrno)); - SCH_ERR_JRET(terrno); - } - - if (NULL == schAcquireJob(refId)) { - SCH_JOB_ELOG("schAcquireJob job failed, refId:%" PRIx64, refId); - SCH_RET(TSDB_CODE_SCH_STATUS_ERROR); - } - - pJob->refId = refId; - - SCH_JOB_DLOG("job refId:%" PRIx64, pJob->refId); - - pJob->status = JOB_TASK_STATUS_PARTIAL_SUCCEED; - *job = pJob->refId; - SCH_JOB_DLOG("job exec done, job status:%s", SCH_GET_JOB_STATUS_STR(pJob)); - - schReleaseJob(pJob->refId); - - return TSDB_CODE_SUCCESS; - -_return: - - schFreeJobImpl(pJob); - SCH_RET(code); -} - int32_t schedulerInit(SSchedulerCfg *cfg) { if (schMgmt.jobRef >= 0) { qError("scheduler already initialized"); @@ -2670,129 +113,6 @@ int32_t schedulerAsyncExecJob(void *transport, SArray *pNodeList, SQueryPlan *pD return TSDB_CODE_SUCCESS; } -#if 0 -int32_t schedulerConvertDagToTaskList(SQueryPlan* pDag, SArray **pTasks) { - if (NULL == pDag || pDag->numOfSubplans <= 0 || LIST_LENGTH(pDag->pSubplans) == 0) { - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - int32_t levelNum = LIST_LENGTH(pDag->pSubplans); - if (1 != levelNum) { - qError("invalid level num: %d", levelNum); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - SNodeListNode *plans = (SNodeListNode*)nodesListGetNode(pDag->pSubplans, 0); - int32_t taskNum = LIST_LENGTH(plans->pNodeList); - if (taskNum <= 0) { - qError("invalid task num: %d", taskNum); - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - SArray *info = taosArrayInit(taskNum, sizeof(STaskInfo)); - if (NULL == info) { - qError("taosArrayInit %d taskInfo failed", taskNum); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - STaskInfo tInfo = {0}; - char *msg = NULL; - int32_t msgLen = 0; - int32_t code = 0; - - for (int32_t i = 0; i < taskNum; ++i) { - SSubplan *plan = (SSubplan*)nodesListGetNode(plans->pNodeList, i); - tInfo.addr = plan->execNode; - - code = qSubPlanToString(plan, &msg, &msgLen); - if (TSDB_CODE_SUCCESS != code) { - qError("subplanToString error, code:%x, msg:%p, len:%d", code, msg, msgLen); - SCH_ERR_JRET(code); - } - - int32_t msgSize = sizeof(SSubQueryMsg) + msgLen; - if (NULL == msg) { - qError("calloc %d failed", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - SSubQueryMsg* pMsg = taosMemoryCalloc(1, msgSize); - - pMsg->header.vgId = tInfo.addr.nodeId; - - pMsg->sId = schMgmt.sId; - pMsg->queryId = plan->id.queryId; - pMsg->taskId = schGenUUID(); - pMsg->taskType = TASK_TYPE_PERSISTENT; - pMsg->phyLen = msgLen; - pMsg->sqlLen = 0; - memcpy(pMsg->msg, msg, msgLen); - /*memcpy(pMsg->msg, ((SSubQueryMsg*)msg)->msg, msgLen);*/ - - tInfo.msg = pMsg; - - if (NULL == taosArrayPush(info, &tInfo)) { - qError("taosArrayPush failed, idx:%d", i); - taosMemoryFree(msg); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - } - - *pTasks = info; - info = NULL; - -_return: - schedulerFreeTaskList(info); - SCH_RET(code); -} - -int32_t schedulerCopyTask(STaskInfo *src, SArray **dst, int32_t copyNum) { - if (NULL == src || NULL == dst || copyNum <= 0) { - SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); - } - - int32_t code = 0; - - *dst = taosArrayInit(copyNum, sizeof(STaskInfo)); - if (NULL == *dst) { - qError("taosArrayInit %d taskInfo failed", copyNum); - SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - int32_t msgSize = src->msg->phyLen + sizeof(*src->msg); - STaskInfo info = {0}; - - info.addr = src->addr; - - for (int32_t i = 0; i < copyNum; ++i) { - info.msg = taosMemoryMalloc(msgSize); - if (NULL == info.msg) { - qError("malloc %d failed", msgSize); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - - memcpy(info.msg, src->msg, msgSize); - - info.msg->taskId = schGenUUID(); - - if (NULL == taosArrayPush(*dst, &info)) { - qError("taosArrayPush failed, idx:%d", i); - taosMemoryFree(info.msg); - SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); - } - } - - return TSDB_CODE_SUCCESS; - -_return: - - schedulerFreeTaskList(*dst); - *dst = NULL; - - SCH_RET(code); -} -#endif - int32_t schedulerFetchRows(int64_t job, void **pData) { if (NULL == pData) { SCH_ERR_RET(TSDB_CODE_QRY_INVALID_INPUT); @@ -2848,7 +168,7 @@ int32_t schedulerFetchRows(int64_t job, void **pData) { } if (pJob->resData && ((SRetrieveTableRsp *)pJob->resData)->completed) { - SCH_ERR_JRET(schCheckAndUpdateJobStatus(pJob, JOB_TASK_STATUS_SUCCEED)); + SCH_ERR_JRET(schChkUpdateJobStatus(pJob, JOB_TASK_STATUS_SUCCEED)); } while (true) { @@ -2942,20 +262,6 @@ void schedulerFreeJob(int64_t job) { schReleaseJob(job); } -void schedulerFreeTaskList(SArray *taskList) { - if (NULL == taskList) { - return; - } - - int32_t taskNum = taosArrayGetSize(taskList); - for (int32_t i = 0; i < taskNum; ++i) { - STaskInfo *info = taosArrayGet(taskList, i); - taosMemoryFreeClear(info->msg); - } - - taosArrayDestroy(taskList); -} - void schedulerDestroy(void) { atomic_store_8((int8_t *)&schMgmt.exit, 1); diff --git a/source/libs/sync/inc/syncIO.h b/source/libs/sync/inc/syncIO.h index f65a317694..b69c087b5f 100644 --- a/source/libs/sync/inc/syncIO.h +++ b/source/libs/sync/inc/syncIO.h @@ -36,8 +36,8 @@ typedef struct SSyncIO { STaosQueue *pMsgQ; STaosQset * pQset; TdThread consumerTid; - void *serverRpc; - void *clientRpc; + void * serverRpc; + void * clientRpc; SEpSet myAddr; SMsgCb msgcb; diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 768e1c1cf1..9246041b81 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -147,6 +147,11 @@ typedef struct SSyncNode { // tools SSyncRespMgr* pSyncRespMgr; + // restore state + bool restoreFinish; + //sem_t restoreSem; + SSnapshot* pSnapshot; + } SSyncNode; // open/close -------------- diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 1a5d418e75..fa735e71c0 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -324,7 +324,6 @@ int32_t syncNodeOnAppendEntriesCb(SSyncNode* ths, SyncAppendEntries* pMsg) { SRpcMsg rpcMsg; syncEntry2OriginalRpc(pEntry, &rpcMsg); - // if (ths->pFsm->FpCommitCb != NULL && pEntry->originalRpcType != TDMT_VND_SYNC_NOOP) { if (ths->pFsm->FpCommitCb != NULL && syncUtilUserCommit(pEntry->originalRpcType)) { SFsmCbMeta cbMeta; cbMeta.index = pEntry->index; @@ -332,7 +331,18 @@ int32_t syncNodeOnAppendEntriesCb(SSyncNode* ths, SyncAppendEntries* pMsg) { cbMeta.code = 0; cbMeta.state = ths->state; cbMeta.seqNum = pEntry->seqNum; + cbMeta.term = pEntry->term; + cbMeta.currentTerm = ths->pRaftStore->currentTerm; ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, cbMeta); + + bool needExecute = true; + if (ths->pSnapshot != NULL && cbMeta.index <= ths->pSnapshot->lastApplyIndex) { + needExecute = false; + } + + if (needExecute) { + ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, cbMeta); + } } // config change @@ -349,6 +359,22 @@ int32_t syncNodeOnAppendEntriesCb(SSyncNode* ths, SyncAppendEntries* pMsg) { } } + // restore finish + if (pEntry->index == ths->pLogStore->getLastIndex(ths->pLogStore)) { + if (ths->restoreFinish == false) { + if (ths->pFsm->FpRestoreFinish != NULL) { + ths->pFsm->FpRestoreFinish(ths->pFsm); + } + ths->restoreFinish = true; + sInfo("==syncNodeOnAppendEntriesCb== restoreFinish set true %p vgId:%d", ths, ths->vgId); + + /* + tsem_post(&ths->restoreSem); + sInfo("==syncNodeOnAppendEntriesCb== RestoreFinish tsem_post %p", ths); + */ + } + } + rpcFreeCont(rpcMsg.pCont); syncEntryDestory(pEntry); } diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index 0f17cf267e..18c6f8930a 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -102,7 +102,6 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { SRpcMsg rpcMsg; syncEntry2OriginalRpc(pEntry, &rpcMsg); - // if (pSyncNode->pFsm->FpCommitCb != NULL && pEntry->originalRpcType != TDMT_VND_SYNC_NOOP) { if (pSyncNode->pFsm->FpCommitCb != NULL && syncUtilUserCommit(pEntry->originalRpcType)) { SFsmCbMeta cbMeta; cbMeta.index = pEntry->index; @@ -110,7 +109,17 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { cbMeta.code = 0; cbMeta.state = pSyncNode->state; cbMeta.seqNum = pEntry->seqNum; - pSyncNode->pFsm->FpCommitCb(pSyncNode->pFsm, &rpcMsg, cbMeta); + cbMeta.term = pEntry->term; + cbMeta.currentTerm = pSyncNode->pRaftStore->currentTerm; + + bool needExecute = true; + if (pSyncNode->pSnapshot != NULL && cbMeta.index <= pSyncNode->pSnapshot->lastApplyIndex) { + needExecute = false; + } + + if (needExecute) { + pSyncNode->pFsm->FpCommitCb(pSyncNode->pFsm, &rpcMsg, cbMeta); + } } // config change @@ -127,6 +136,22 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { } } + // restore finish + if (pEntry->index == pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore)) { + if (pSyncNode->restoreFinish == false) { + if (pSyncNode->pFsm->FpRestoreFinish != NULL) { + pSyncNode->pFsm->FpRestoreFinish(pSyncNode->pFsm); + } + pSyncNode->restoreFinish = true; + sInfo("==syncMaybeAdvanceCommitIndex== restoreFinish set true %p vgId:%d", pSyncNode, pSyncNode->vgId); + + /* + tsem_post(&pSyncNode->restoreSem); + sInfo("==syncMaybeAdvanceCommitIndex== RestoreFinish tsem_post %p", pSyncNode); + */ + } + } + rpcFreeCont(rpcMsg.pCont); syncEntryDestory(pEntry); } @@ -162,4 +187,4 @@ bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) { } } return false; -} \ No newline at end of file +} diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index d9ff60bbe2..821ed364e8 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -13,7 +13,6 @@ * along with this program. If not, see . */ -#include #include "sync.h" #include "syncAppendEntries.h" #include "syncAppendEntriesReply.h" @@ -55,14 +54,17 @@ static void syncFreeNode(void* param); // --------------------------------- int32_t syncInit() { - int32_t ret; - tsNodeRefId = taosOpenRef(200, syncFreeNode); - if (tsNodeRefId < 0) { - sError("failed to init node ref"); - syncCleanUp(); - ret = -1; - } else { - ret = syncEnvStart(); + int32_t ret = 0; + + if (!syncEnvIsStart()) { + tsNodeRefId = taosOpenRef(200, syncFreeNode); + if (tsNodeRefId < 0) { + sError("failed to init node ref"); + syncCleanUp(); + ret = -1; + } else { + ret = syncEnvStart(); + } } return ret; @@ -240,7 +242,7 @@ int32_t syncGetAndDelRespRpc(int64_t rid, uint64_t index, SRpcMsg* msg) { return ret; } -void syncSetMsgCb(int64_t rid, const SMsgCb *msgcb) { +void syncSetMsgCb(int64_t rid, const SMsgCb* msgcb) { SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid); if (pSyncNode == NULL) { sTrace("syncSetQ get pSyncNode is NULL, rid:%ld", rid); @@ -490,6 +492,15 @@ SSyncNode* syncNodeOpen(const SSyncInfo* pSyncInfo) { pSyncNode->pSyncRespMgr = syncRespMgrCreate(NULL, 0); assert(pSyncNode->pSyncRespMgr != NULL); + // restore state + pSyncNode->restoreFinish = false; + pSyncNode->pSnapshot = NULL; + if (pSyncNode->pFsm->FpGetSnapshot != NULL) { + pSyncNode->pSnapshot = taosMemoryMalloc(sizeof(SSnapshot)); + pSyncNode->pFsm->FpGetSnapshot(pSyncNode->pFsm, pSyncNode->pSnapshot); + } + //tsem_init(&(pSyncNode->restoreSem), 0, 0); + // start in syncNodeStart // start raft // syncNodeBecomeFollower(pSyncNode); @@ -509,6 +520,20 @@ void syncNodeStart(SSyncNode* pSyncNode) { // use this now syncNodeAppendNoop(pSyncNode); syncMaybeAdvanceCommitIndex(pSyncNode); // maybe only one replica + + /* + sInfo("==syncNodeStart== RestoreFinish begin 1 replica tsem_wait %p", pSyncNode); + tsem_wait(&pSyncNode->restoreSem); + sInfo("==syncNodeStart== RestoreFinish end 1 replica tsem_wait %p", pSyncNode); + */ + + /* + while (pSyncNode->restoreFinish != true) { + taosMsleep(10); + } + */ + + sInfo("==syncNodeStart== restoreFinish ok 1 replica %p vgId:%d", pSyncNode, pSyncNode->vgId); return; } @@ -518,6 +543,19 @@ void syncNodeStart(SSyncNode* pSyncNode) { int32_t ret = 0; // ret = syncNodeStartPingTimer(pSyncNode); assert(ret == 0); + + /* + sInfo("==syncNodeStart== RestoreFinish begin multi replica tsem_wait %p", pSyncNode); + tsem_wait(&pSyncNode->restoreSem); + sInfo("==syncNodeStart== RestoreFinish end multi replica tsem_wait %p", pSyncNode); + */ + + /* + while (pSyncNode->restoreFinish != true) { + taosMsleep(10); + } + */ + sInfo("==syncNodeStart== restoreFinish ok multi replica %p vgId:%d", pSyncNode, pSyncNode->vgId); } void syncNodeStartStandBy(SSyncNode* pSyncNode) { @@ -554,6 +592,12 @@ void syncNodeClose(SSyncNode* pSyncNode) { taosMemoryFree(pSyncNode->pFsm); } + if (pSyncNode->pSnapshot != NULL) { + taosMemoryFree(pSyncNode->pSnapshot); + } + + //tsem_destroy(&pSyncNode->restoreSem); + // free memory in syncFreeNode // taosMemoryFree(pSyncNode); } diff --git a/source/libs/sync/test/syncConfigChangeTest.cpp b/source/libs/sync/test/syncConfigChangeTest.cpp index cff692239a..0850ef6343 100644 --- a/source/libs/sync/test/syncConfigChangeTest.cpp +++ b/source/libs/sync/test/syncConfigChangeTest.cpp @@ -73,12 +73,17 @@ int32_t GetSnapshotCb(struct SSyncFSM* pFsm, SSnapshot* pSnapshot) { return 0; } +void FpRestoreFinishCb(struct SSyncFSM* pFsm) { + sTrace("==callback== ==FpRestoreFinishCb=="); +} + SSyncFSM* createFsm() { SSyncFSM* pFsm = (SSyncFSM*)taosMemoryMalloc(sizeof(SSyncFSM)); pFsm->FpCommitCb = CommitCb; pFsm->FpPreCommitCb = PreCommitCb; pFsm->FpRollBackCb = RollBackCb; pFsm->FpGetSnapshot = GetSnapshotCb; + pFsm->FpRestoreFinish = FpRestoreFinishCb; return pFsm; } diff --git a/source/libs/sync/test/syncSnapshotTest.cpp b/source/libs/sync/test/syncSnapshotTest.cpp index 62bda5b22e..8ccd698907 100644 --- a/source/libs/sync/test/syncSnapshotTest.cpp +++ b/source/libs/sync/test/syncSnapshotTest.cpp @@ -160,6 +160,8 @@ SyncClientRequest *step1(const SRpcMsg *pMsg) { } int main(int argc, char **argv) { + sprintf(tsTempDir, "%s", "."); + // taosInitLog((char *)"syncTest.log", 100000, 10); tsAsyncLog = 0; sDebugFlag = 143 + 64; diff --git a/source/libs/transport/inc/transComm.h b/source/libs/transport/inc/transComm.h index e71e19edce..30f799f39e 100644 --- a/source/libs/transport/inc/transComm.h +++ b/source/libs/transport/inc/transComm.h @@ -94,7 +94,7 @@ typedef void* queue[2]; /* Return the structure holding the given element. */ #define QUEUE_DATA(e, type, field) ((type*)((void*)((char*)(e)-offsetof(type, field)))) -#define TRANS_RETRY_COUNT_LIMIT 20 // retry count limit +#define TRANS_RETRY_COUNT_LIMIT 100 // retry count limit #define TRANS_RETRY_INTERVAL 15 // ms retry interval #define TRANS_CONN_TIMEOUT 3 // connect timeout diff --git a/source/libs/transport/src/trans.c b/source/libs/transport/src/trans.c index 5627dbfbf5..9e71c87fa5 100644 --- a/source/libs/transport/src/trans.c +++ b/source/libs/transport/src/trans.c @@ -17,7 +17,7 @@ #include "transComm.h" -void* (*taosInitHandle[])(uint32_t ip, uint32_t port, char* label, int numOfThreads, void* fp, void* shandle) = { +void* (*taosInitHandle[])(uint32_t ip, uint32_t port, char* label, int32_t numOfThreads, void* fp, void* shandle) = { transInitServer, transInitClient}; void (*taosCloseHandle[])(void* arg) = {transCloseServer, transCloseClient}; @@ -77,37 +77,38 @@ void rpcClose(void* arg) { taosMemoryFree(pRpc); return; } -void* rpcMallocCont(int contLen) { - int size = contLen + TRANS_MSG_OVERHEAD; - char* start = (char*)taosMemoryCalloc(1, (size_t)size); +void* rpcMallocCont(int32_t contLen) { + int32_t size = contLen + TRANS_MSG_OVERHEAD; + char* start = taosMemoryCalloc(1, size); if (start == NULL) { tError("failed to malloc msg, size:%d", size); + terrno = TSDB_CODE_OUT_OF_MEMORY; return NULL; } else { tTrace("malloc mem:%p size:%d", start, size); } + return start + sizeof(STransMsgHead); } -void rpcFreeCont(void* cont) { - // impl - if (cont == NULL) { - return; - } +void rpcFreeCont(void* cont) { + if (cont == NULL) return; taosMemoryFree((char*)cont - TRANS_MSG_OVERHEAD); tTrace("free mem: %p", (char*)cont - TRANS_MSG_OVERHEAD); } -void* rpcReallocCont(void* ptr, int contLen) { - if (ptr == NULL) { - return rpcMallocCont(contLen); - } - char* st = (char*)ptr - TRANS_MSG_OVERHEAD; - int sz = contLen + TRANS_MSG_OVERHEAD; + +void* rpcReallocCont(void* ptr, int32_t contLen) { + if (ptr == NULL) return rpcMallocCont(contLen); + + char* st = (char*)ptr - TRANS_MSG_OVERHEAD; + int32_t sz = contLen + TRANS_MSG_OVERHEAD; st = taosMemoryRealloc(st, sz); if (st == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; return NULL; } + return st + TRANS_MSG_OVERHEAD; } @@ -116,8 +117,8 @@ void rpcSendRedirectRsp(void* thandle, const SEpSet* pEpSet) { assert(0); } -int rpcReportProgress(void* pConn, char* pCont, int contLen) { return -1; } -void rpcCancelRequest(int64_t rid) { return; } +int32_t rpcReportProgress(void* pConn, char* pCont, int32_t contLen) { return -1; } +void rpcCancelRequest(int64_t rid) { return; } void rpcSendRequest(void* shandle, const SEpSet* pEpSet, SRpcMsg* pMsg, int64_t* pRid) { transSendRequest(shandle, pEpSet, pMsg, NULL); @@ -129,8 +130,8 @@ void rpcSendRecv(void* shandle, SEpSet* pEpSet, SRpcMsg* pMsg, SRpcMsg* pRsp) { transSendRecv(shandle, pEpSet, pMsg, pRsp); } -void rpcSendResponse(const SRpcMsg* pMsg) { transSendResponse(pMsg); } -int rpcGetConnInfo(void* thandle, SRpcConnInfo* pInfo) { return transGetConnInfo((void*)thandle, pInfo); } +void rpcSendResponse(const SRpcMsg* pMsg) { transSendResponse(pMsg); } +int32_t rpcGetConnInfo(void* thandle, SRpcConnInfo* pInfo) { return transGetConnInfo((void*)thandle, pInfo); } void rpcRefHandle(void* handle, int8_t type) { assert(type == TAOS_CONN_SERVER || type == TAOS_CONN_CLIENT); diff --git a/source/libs/wal/src/walMgmt.c b/source/libs/wal/src/walMgmt.c index ada1f599f2..71cd6de73f 100644 --- a/source/libs/wal/src/walMgmt.c +++ b/source/libs/wal/src/walMgmt.c @@ -14,17 +14,17 @@ */ #define _DEFAULT_SOURCE -#include "tcompare.h" #include "os.h" #include "taoserror.h" +#include "tcompare.h" #include "tref.h" #include "walInt.h" typedef struct { - int8_t stop; - int8_t inited; - uint32_t seq; - int32_t refSetId; + int8_t stop; + int8_t inited; + uint32_t seq; + int32_t refSetId; TdThread thread; } SWalMgmt; @@ -36,30 +36,42 @@ static void walFreeObj(void *pWal); int64_t walGetSeq() { return (int64_t)atomic_load_32(&tsWal.seq); } int32_t walInit() { - int8_t old = atomic_val_compare_exchange_8(&tsWal.inited, 0, 1); - if (old == 1) return 0; - - tsWal.refSetId = taosOpenRef(TSDB_MIN_VNODES, walFreeObj); - - int32_t code = walCreateThread(); - if (code != 0) { - wError("failed to init wal module since %s", tstrerror(code)); - atomic_store_8(&tsWal.inited, 0); - return code; + int8_t old; + while (1) { + old = atomic_val_compare_exchange_8(&tsWal.inited, 0, 2); + if (old != 2) break; + } + + if (old == 0) { + tsWal.refSetId = taosOpenRef(TSDB_MIN_VNODES, walFreeObj); + + int32_t code = walCreateThread(); + if (code != 0) { + wError("failed to init wal module since %s", tstrerror(code)); + atomic_store_8(&tsWal.inited, 0); + return code; + } + + wInfo("wal module is initialized, rsetId:%d", tsWal.refSetId); + atomic_store_8(&tsWal.inited, 1); } - wInfo("wal module is initialized, rsetId:%d", tsWal.refSetId); return 0; } void walCleanUp() { - int8_t old = atomic_val_compare_exchange_8(&tsWal.inited, 1, 0); - if (old == 0) { - return; + int8_t old; + while (1) { + old = atomic_val_compare_exchange_8(&tsWal.inited, 1, 2); + if (old != 2) break; + } + + if (old == 1) { + walStopThread(); + taosCloseRef(tsWal.refSetId); + wInfo("wal module is cleaned up"); + atomic_store_8(&tsWal.inited, 0); } - walStopThread(); - taosCloseRef(tsWal.refSetId); - wInfo("wal module is cleaned up"); } SWal *walOpen(const char *path, SWalCfg *pCfg) { @@ -126,7 +138,6 @@ SWal *walOpen(const char *path, SWalCfg *pCfg) { } if (walCheckAndRepairIdx(pWal) < 0) { - } wDebug("vgId:%d, wal:%p is opened, level:%d fsyncPeriod:%d", pWal->cfg.vgId, pWal, pWal->cfg.level, diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 3890a55ff1..7c4f0fa2dd 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -259,6 +259,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_MND_TRANS_NOT_EXIST, "Transaction not exist TAOS_DEFINE_ERROR(TSDB_CODE_MND_TRANS_INVALID_STAGE, "Invalid stage to kill") TAOS_DEFINE_ERROR(TSDB_CODE_MND_TRANS_CONFLICT, "Conflict transaction not completed") TAOS_DEFINE_ERROR(TSDB_CODE_MND_TRANS_UNKNOW_ERROR, "Unknown transaction error") +TAOS_DEFINE_ERROR(TSDB_CODE_MND_TRANS_CLOG_IS_NULL, "Transaction commitlog is null") // mnode-mq TAOS_DEFINE_ERROR(TSDB_CODE_MND_TOPIC_ALREADY_EXIST, "Topic already exists") diff --git a/tests/test/c/sdbDump.c b/tests/test/c/sdbDump.c index 2bc60f777c..1d3eba7cde 100644 --- a/tests/test/c/sdbDump.c +++ b/tests/test/c/sdbDump.c @@ -262,7 +262,7 @@ void dumpCluster(SSdb *pSdb, SJson *json) { } void dumpTrans(SSdb *pSdb, SJson *json) { - void *pIter = NULL; + void *pIter = NULL; SJson *items = tjsonCreateObject(); tjsonAddItemToObject(json, "transactions", items); @@ -294,6 +294,7 @@ void dumpTrans(SSdb *pSdb, SJson *json) { void dumpHeader(SSdb *pSdb, SJson *json) { tjsonAddIntegerToObject(json, "sver", 1); tjsonAddStringToObject(json, "curVer", i642str(pSdb->curVer)); + tjsonAddStringToObject(json, "curTerm", i642str(pSdb->curTerm)); SJson *maxIdsJson = tjsonCreateObject(); tjsonAddItemToObject(json, "maxIds", maxIdsJson);