From 5765bac268b29dcdc7db339bd8a5488bb480b444 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Wed, 26 Jan 2022 16:10:56 +0800 Subject: [PATCH 01/12] define BUILD_WITH_UV_TRANS --- source/libs/transport/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/libs/transport/CMakeLists.txt b/source/libs/transport/CMakeLists.txt index 5c214b75a1..a2e82201bf 100644 --- a/source/libs/transport/CMakeLists.txt +++ b/source/libs/transport/CMakeLists.txt @@ -14,17 +14,18 @@ target_link_libraries( PUBLIC common ) if (${BUILD_WITH_UV_TRANS}) +if (${BUILD_WITH_UV}) target_include_directories( transport PUBLIC "${CMAKE_SOURCE_DIR}/contrib/libuv/include" ) - -#LINK_DIRECTORIES("${CMAKE_SOURCE_DIR}/debug/contrib/libuv") + target_link_libraries( transport PUBLIC uv_a ) add_definitions(-DUSE_UV) +endif(${BUILD_WITH_UV}) endif(${BUILD_WITH_UV_TRANS}) if (${BUILD_TEST}) From 1b98943dd20dc0d8a98018a27eb56f8c6584ec81 Mon Sep 17 00:00:00 2001 From: dapan1121 Date: Mon, 7 Feb 2022 19:01:26 +0800 Subject: [PATCH 02/12] feature/qnode --- include/libs/catalog/catalog.h | 2 +- include/libs/qcom/query.h | 1 + include/util/taoserror.h | 1 + source/client/src/clientHb.c | 3 +- source/libs/catalog/inc/catalogInt.h | 13 +- source/libs/catalog/src/catalog.c | 610 ++++++++++++++-------- source/libs/catalog/test/catalogTests.cpp | 5 +- source/libs/qcom/src/querymsg.c | 2 +- source/util/src/terror.c | 1 + 9 files changed, 399 insertions(+), 239 deletions(-) diff --git a/include/libs/catalog/catalog.h b/include/libs/catalog/catalog.h index c291ebd8fd..fef951b010 100644 --- a/include/libs/catalog/catalog.h +++ b/include/libs/catalog/catalog.h @@ -99,7 +99,7 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, */ int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pTransporter, const SEpSet* pMgmtEps, const char* pDBName, bool forceUpdate, SArray** pVgroupList); -int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* 
dbName, SDBVgroupInfo* dbInfo); +int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbName, uint64_t dbId, SDBVgroupInfo* dbInfo); int32_t catalogRemoveDB(struct SCatalog* pCatalog, const char* dbName, uint64_t dbId); diff --git a/include/libs/qcom/query.h b/include/libs/qcom/query.h index 2e4093590d..808437cb7e 100644 --- a/include/libs/qcom/query.h +++ b/include/libs/qcom/query.h @@ -89,6 +89,7 @@ typedef struct SDBVgroupInfo { typedef struct SUseDbOutput { char db[TSDB_DB_FNAME_LEN]; + uint64_t dbId; SDBVgroupInfo *dbVgroup; } SUseDbOutput; diff --git a/include/util/taoserror.h b/include/util/taoserror.h index b5740b0118..6237de36ff 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -450,3 +450,4 @@ int32_t* taosGetErrno(); #endif #endif /*_TD_COMMON_TAOS_ERROR_H_*/ + \ No newline at end of file diff --git a/source/client/src/clientHb.c b/source/client/src/clientHb.c index d265ffaa94..e80e87f03b 100644 --- a/source/client/src/clientHb.c +++ b/source/client/src/clientHb.c @@ -44,7 +44,6 @@ static int32_t hbProcessDBInfoRsp(void *value, int32_t valueLen, struct SCatalog code = catalogRemoveDB(pCatalog, rsp->db, rsp->uid); } else { SDBVgroupInfo vgInfo = {0}; - vgInfo.dbId = rsp->uid; vgInfo.vgVersion = rsp->vgVersion; vgInfo.hashMethod = rsp->hashMethod; vgInfo.vgHash = taosHashInit(rsp->vgNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); @@ -69,7 +68,7 @@ static int32_t hbProcessDBInfoRsp(void *value, int32_t valueLen, struct SCatalog } } - code = catalogUpdateDBVgroup(pCatalog, rsp->db, &vgInfo); + code = catalogUpdateDBVgroup(pCatalog, rsp->db, rsp->uid, &vgInfo); if (code) { taosHashCleanup(vgInfo.vgHash); } diff --git a/source/libs/catalog/inc/catalogInt.h b/source/libs/catalog/inc/catalogInt.h index 9c041d76c7..1c4a19530d 100644 --- a/source/libs/catalog/inc/catalogInt.h +++ b/source/libs/catalog/inc/catalogInt.h @@ -48,18 +48,22 @@ enum { }; typedef struct SCtgDebug { - int32_t lockDebug; + 
bool lockDebug; + bool cacheDebug; + uint32_t showCachePeriodSec; } SCtgDebug; typedef struct SCtgTbMetaCache { SRWLatch stbLock; - SHashObj *cache; //key:tbname, value:STableMeta + SRWLatch metaLock; // RC between cache destroy and all other operations + SHashObj *metaCache; //key:tbname, value:STableMeta SHashObj *stbCache; //key:suid, value:STableMeta* } SCtgTbMetaCache; typedef struct SCtgDBCache { SRWLatch vgLock; + uint64_t dbId; int8_t deleted; SDBVgroupInfo *vgInfo; SCtgTbMetaCache tbCache; @@ -136,7 +140,8 @@ typedef uint32_t (*tableNameHashFp)(const char *, uint32_t); #define CTG_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; } return _code; } while (0) #define CTG_ERR_JRET(c) do { code = c; if (code != TSDB_CODE_SUCCESS) { terrno = code; goto _return; } } while (0) -#define CTG_LOCK_DEBUG(...) do { if (gCTGDebug.lockDebug) { qDebug(__VA_ARGS__); } } while (0) +#define CTG_LOCK_DEBUG(...) do { if (gCTGDebug.lockDebug) { ctgDebug(__VA_ARGS__); } } while (0) +#define CTG_CACHE_DEBUG(...) 
do { if (gCTGDebug.cacheDebug) { ctgDebug(__VA_ARGS__); } } while (0) #define TD_RWLATCH_WRITE_FLAG_COPY 0x40000000 @@ -173,6 +178,8 @@ typedef uint32_t (*tableNameHashFp)(const char *, uint32_t); } while (0) + + #ifdef __cplusplus } #endif diff --git a/source/libs/catalog/src/catalog.c b/source/libs/catalog/src/catalog.c index 02773fe533..3776969d90 100644 --- a/source/libs/catalog/src/catalog.c +++ b/source/libs/catalog/src/catalog.c @@ -22,6 +22,99 @@ SCatalogMgmt ctgMgmt = {0}; SCtgDebug gCTGDebug = {0}; +void ctgShowDBCache(SHashObj *dbHash) { + if (NULL == dbHash) { + return; + } + + int32_t i = 0; + SCtgDBCache *dbCache = NULL; + void *pIter = taosHashIterate(dbHash, NULL); + while (pIter) { + char *dbFName = NULL; + size_t len = 0; + + dbCache = (SCtgDBCache *)pIter; + + taosHashGetKey(dbCache, &dbFName, &len); + + CTG_CACHE_DEBUG("** %dth db [%.*s] **", i, len, dbFName); + + pIter = taosHashIterate(dbHash, pIter); + } +} + +void ctgShowClusterCache(struct SCatalog* pCatalog) { + if (NULL == pCatalog) { + return; + } + + CTG_CACHE_DEBUG("## cluster %"PRIx64" cache Info ##", pCatalog->clusterId); + CTG_CACHE_DEBUG("db cache number:%d", pCatalog->dbCache ? 
taosHashGetSize(pCatalog->dbCache) : 0); + ctgShowDBCache(pCatalog->dbCache); + +} + +int32_t ctgInitDBCache(struct SCatalog* pCatalog) { + if (NULL == pCatalog->dbCache) { + SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxDBCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (NULL == cache) { + ctgError("taosHashInit %d failed", CTG_DEFAULT_CACHE_DB_NUMBER); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + + if (NULL != atomic_val_compare_exchange_ptr(&pCatalog->dbCache, NULL, cache)) { + taosHashCleanup(cache); + } + } + + return TSDB_CODE_SUCCESS; +} + + +int32_t ctgInitTbMetaCache(struct SCatalog* pCatalog, SCtgDBCache *dbCache) { + if (NULL == dbCache->tbCache.metaCache) { + if (dbCache->deleted) { + ctgInfo("db is dropping, dbId:%"PRIx64, dbCache->dbId); + CTG_ERR_RET(TSDB_CODE_CTG_DB_DROPPED); + } + + SHashObj *metaCache = taosHashInit(ctgMgmt.cfg.maxTblCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (NULL == metaCache) { + ctgError("taosHashInit failed, num:%d", ctgMgmt.cfg.maxTblCacheNum); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + + if (NULL != atomic_val_compare_exchange_ptr(&dbCache->tbCache.metaCache, NULL, metaCache)) { + taosHashCleanup(metaCache); + } + } + + return TSDB_CODE_SUCCESS; +} + +int32_t ctgInitStbCache(struct SCatalog* pCatalog, SCtgDBCache *dbCache) { + if (NULL == dbCache->tbCache.stbCache) { + if (dbCache->deleted) { + ctgInfo("db is dropping, dbId:%"PRIx64, dbCache->dbId); + CTG_ERR_RET(TSDB_CODE_CTG_DB_DROPPED); + } + + SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxTblCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), true, HASH_ENTRY_LOCK); + if (NULL == cache) { + ctgError("taosHashInit failed, num:%d", ctgMgmt.cfg.maxTblCacheNum); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + + if (NULL != atomic_val_compare_exchange_ptr(&dbCache->tbCache.stbCache, NULL, cache)) { + taosHashCleanup(cache); + } + } + + return TSDB_CODE_SUCCESS; +} + + void 
ctgFreeMetaRent(SCtgRentMgmt *mgmt) { if (NULL == mgmt->slots) { @@ -40,18 +133,20 @@ void ctgFreeMetaRent(SCtgRentMgmt *mgmt) { } -void ctgFreeTableMetaCache(SCtgTbMetaCache *table) { - CTG_LOCK(CTG_WRITE, &table->stbLock); - if (table->stbCache) { - taosHashCleanup(table->stbCache); - table->stbCache = NULL; +void ctgFreeTableMetaCache(SCtgTbMetaCache *cache) { + CTG_LOCK(CTG_WRITE, &cache->stbLock); + if (cache->stbCache) { + taosHashCleanup(cache->stbCache); + cache->stbCache = NULL; } - CTG_UNLOCK(CTG_WRITE, &table->stbLock); + CTG_UNLOCK(CTG_WRITE, &cache->stbLock); - if (table->cache) { - taosHashCleanup(table->cache); - table->cache = NULL; + CTG_LOCK(CTG_WRITE, &cache->metaLock); + if (cache->metaCache) { + taosHashCleanup(cache->metaCache); + cache->metaCache = NULL; } + CTG_UNLOCK(CTG_WRITE, &cache->metaLock); } void ctgFreeDbCache(SCtgDBCache *dbCache) { @@ -61,9 +156,8 @@ void ctgFreeDbCache(SCtgDBCache *dbCache) { atomic_store_8(&dbCache->deleted, 1); - SDBVgroupInfo *dbInfo = NULL; + CTG_LOCK(CTG_WRITE, &dbCache->vgLock); if (dbCache->vgInfo) { - CTG_LOCK(CTG_WRITE, &dbCache->vgLock); if (dbCache->vgInfo->vgHash) { taosHashCleanup(dbCache->vgInfo->vgHash); @@ -71,8 +165,8 @@ void ctgFreeDbCache(SCtgDBCache *dbCache) { } tfree(dbCache->vgInfo); - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); } + CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); ctgFreeTableMetaCache(&dbCache->tbCache); } @@ -97,22 +191,21 @@ void ctgFreeHandle(struct SCatalog* pCatalog) { free(pCatalog); } - -int32_t ctgGetDBVgroupFromCache(struct SCatalog* pCatalog, const char *dbName, SCtgDBCache **dbCache, bool *inCache) { +int32_t ctgGetDBVgroupFromCache(struct SCatalog* pCatalog, const char *dbFName, SCtgDBCache **dbCache, bool *inCache) { if (NULL == pCatalog->dbCache) { *inCache = false; - ctgWarn("empty db cache, dbName:%s", dbName); + ctgWarn("empty db cache, dbFName:%s", dbFName); return TSDB_CODE_SUCCESS; } SCtgDBCache *cache = NULL; while (true) { - cache = 
taosHashAcquire(pCatalog->dbCache, dbName, strlen(dbName)); + cache = taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); if (NULL == cache) { *inCache = false; - ctgWarn("not in db vgroup cache, dbName:%s", dbName); + ctgWarn("not in db vgroup cache, dbFName:%s", dbFName); return TSDB_CODE_SUCCESS; } @@ -120,7 +213,7 @@ int32_t ctgGetDBVgroupFromCache(struct SCatalog* pCatalog, const char *dbName, S if (NULL == cache->vgInfo) { CTG_UNLOCK(CTG_READ, &cache->vgLock); taosHashRelease(pCatalog->dbCache, cache); - ctgWarn("db cache vgInfo is NULL, dbName:%s", dbName); + ctgWarn("db cache vgInfo is NULL, dbFName:%s", dbFName); continue; } @@ -131,7 +224,7 @@ int32_t ctgGetDBVgroupFromCache(struct SCatalog* pCatalog, const char *dbName, S *dbCache = cache; *inCache = true; - ctgDebug("Got db vgroup from cache, dbName:%s", dbName); + ctgDebug("Got db vgroup from cache, dbFName:%s", dbFName); return TSDB_CODE_SUCCESS; } @@ -189,7 +282,10 @@ int32_t ctgIsTableMetaExistInCache(struct SCatalog* pCatalog, char *dbFName, cha size_t sz = 0; - STableMeta *tbMeta = taosHashGet(dbCache->tbCache.cache, tbName, strlen(tbName)); + CTG_LOCK(CTG_READ, &dbCache->tbCache.metaLock); + STableMeta *tbMeta = taosHashGet(dbCache->tbCache.metaCache, tbName, strlen(tbName)); + CTG_UNLOCK(CTG_READ, &dbCache->tbCache.metaLock); + if (NULL == tbMeta) { taosHashRelease(pCatalog->dbCache, dbCache); @@ -227,15 +323,18 @@ int32_t ctgGetTableMetaFromCache(struct SCatalog* pCatalog, const SName* pTableN return TSDB_CODE_SUCCESS; } - if (NULL == dbCache->tbCache.cache) { + if (NULL == dbCache->tbCache.metaCache) { *exist = 0; taosHashRelease(pCatalog->dbCache, dbCache); ctgWarn("empty tbmeta cache, dbFName:%s, tbName:%s", db, pTableName->tname); return TSDB_CODE_SUCCESS; } - size_t sz = 0; - STableMeta *tbMeta = taosHashGetCloneExt(dbCache->tbCache.cache, pTableName->tname, strlen(pTableName->tname), NULL, (void **)pTableMeta, &sz); + size_t sz = 0; + CTG_LOCK(CTG_READ, 
&dbCache->tbCache.metaLock); + STableMeta *tbMeta = taosHashGetCloneExt(dbCache->tbCache.metaCache, pTableName->tname, strlen(pTableName->tname), NULL, (void **)pTableMeta, &sz); + CTG_UNLOCK(CTG_READ, &dbCache->tbCache.metaLock); + if (NULL == *pTableMeta) { *exist = 0; taosHashRelease(pCatalog->dbCache, dbCache); @@ -308,7 +407,10 @@ int32_t ctgGetTableTypeFromCache(struct SCatalog* pCatalog, const SName* pTableN return TSDB_CODE_SUCCESS; } - STableMeta *pTableMeta = (STableMeta *)taosHashAcquire(dbCache->tbCache.cache, pTableName->tname, strlen(pTableName->tname)); + CTG_LOCK(CTG_READ, &dbCache->tbCache.metaLock); + STableMeta *pTableMeta = (STableMeta *)taosHashAcquire(dbCache->tbCache.metaCache, pTableName->tname, strlen(pTableName->tname)); + CTG_UNLOCK(CTG_READ, &dbCache->tbCache.metaLock); + if (NULL == pTableMeta) { ctgWarn("tbmeta not in cache, dbFName:%s, tbName:%s", dbName, pTableName->tname); taosHashRelease(pCatalog->dbCache, dbCache); @@ -318,7 +420,7 @@ int32_t ctgGetTableTypeFromCache(struct SCatalog* pCatalog, const SName* pTableN *tbType = atomic_load_8(&pTableMeta->tableType); - taosHashRelease(dbCache->tbCache.cache, dbCache); + taosHashRelease(dbCache->tbCache.metaCache, dbCache); taosHashRelease(pCatalog->dbCache, dbCache); ctgDebug("Got tbtype from cache, dbFName:%s, tbName:%s, type:%d", dbName, pTableName->tname, *tbType); @@ -741,6 +843,98 @@ int32_t ctgMetaRentGet(SCtgRentMgmt *mgmt, void **res, uint32_t *num, int32_t si return TSDB_CODE_SUCCESS; } +int32_t ctgAddDBCache(struct SCatalog *pCatalog, char *dbFName, SCtgDBCache *dbCache) { + int32_t code = 0; + if (taosHashPut(pCatalog->dbCache, dbFName, strlen(dbFName), dbCache, sizeof(SCtgDBCache))) { + ctgError("taosHashPut db to cache failed, db:%s", dbFName); + CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); + } + + SDbVgVersion vgVersion = {.dbId = dbCache->dbId, .vgVersion = dbCache->vgInfo ? 
dbCache->vgInfo->vgVersion : -1}; + strncpy(vgVersion.dbFName, dbFName, sizeof(vgVersion.dbFName)); + + ctgDebug("dbCache added, dbFName:%s, vgVersion:%d, dbId:%"PRIx64, dbFName, vgVersion.vgVersion, dbCache->dbId); + + CTG_ERR_JRET(ctgMetaRentAdd(&pCatalog->dbRent, &vgVersion, vgVersion.dbId, sizeof(SDbVgVersion))); + + return TSDB_CODE_SUCCESS; + +_return: + + ctgFreeDbCache(dbCache); + + CTG_RET(code); +} + + +int32_t ctgUpdateTbMetaImpl(struct SCatalog *pCatalog, SCtgTbMetaCache *tbCache, char *dbFName, char *tbName, STableMeta *meta, int32_t metaSize) { + CTG_LOCK(CTG_READ, &tbCache->metaLock); + if (taosHashPut(tbCache->metaCache, tbName, strlen(tbName), meta, metaSize) != 0) { + CTG_UNLOCK(CTG_READ, &tbCache->metaLock); + ctgError("taosHashPut tbmeta to cache failed, dbFName:%s, tbName:%s, tbType:%d", dbFName, tbName, meta->tableType); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + CTG_UNLOCK(CTG_READ, &tbCache->metaLock); + + ctgDebug("tbmeta updated to cache, dbFName:%s, tbName:%s, tbType:%d", dbFName, tbName, meta->tableType); + + return TSDB_CODE_SUCCESS; +} + +int32_t ctgUpdateStbMetaImpl(struct SCatalog *pCatalog, SCtgTbMetaCache *tbCache, char *dbFName, char *tbName, STableMeta *meta, int32_t metaSize) { + bool newAdded = false; + int32_t code = 0; + SSTableMetaVersion metaRent = {.suid = meta->suid, .sversion = meta->sversion, .tversion = meta->tversion}; + strcpy(metaRent.dbFName, dbFName); + strcpy(metaRent.stbName, tbName); + + CTG_LOCK(CTG_WRITE, &tbCache->stbLock); + + CTG_LOCK(CTG_READ, &tbCache->metaLock); + STableMeta *orig = taosHashAcquire(tbCache->metaCache, tbName, strlen(tbName)); + if (orig) { + if (orig->suid != meta->suid) { + if (taosHashRemove(tbCache->stbCache, &orig->suid, sizeof(orig->suid))) { + ctgError("stb not exist in stbCache, db:%s, stb:%s, suid:%"PRIx64, dbFName, tbName, orig->suid); + } + + ctgMetaRentRemove(&pCatalog->stbRent, orig->suid, ctgSTableVersionCompare); + } + + taosHashRelease(tbCache->metaCache, orig); + } + 
CTG_UNLOCK(CTG_READ, &tbCache->metaLock); + + CTG_ERR_JRET(ctgUpdateTbMetaImpl(pCatalog, tbCache, dbFName, tbName, meta, metaSize)); + + CTG_LOCK(CTG_READ, &tbCache->metaLock); + STableMeta *tbMeta = taosHashGet(tbCache->metaCache, tbName, strlen(tbName)); + if (taosHashPutExt(tbCache->stbCache, &meta->suid, sizeof(meta->suid), &tbMeta, POINTER_BYTES, &newAdded) != 0) { + CTG_UNLOCK(CTG_READ, &tbCache->metaLock); + CTG_UNLOCK(CTG_WRITE, &tbCache->stbLock); + ctgError("taosHashPutExt stable to stable cache failed, suid:%"PRIx64, meta->suid); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + CTG_UNLOCK(CTG_READ, &tbCache->metaLock); + + CTG_UNLOCK(CTG_WRITE, &tbCache->stbLock); + + ctgDebug("update stable to cache, suid:%"PRIx64, meta->suid); + + if (newAdded) { + CTG_ERR_RET(ctgMetaRentAdd(&pCatalog->stbRent, &metaRent, metaRent.suid, sizeof(SSTableMetaVersion))); + } else { + CTG_ERR_RET(ctgMetaRentUpdate(&pCatalog->stbRent, &metaRent, metaRent.suid, sizeof(SSTableMetaVersion), ctgSTableVersionCompare)); + } + + return TSDB_CODE_SUCCESS; + +_return: + + CTG_UNLOCK(CTG_WRITE, &tbCache->stbLock); + + CTG_RET(code); +} int32_t ctgUpdateTableMetaCache(struct SCatalog *pCatalog, STableMetaOutput *output) { @@ -752,63 +946,15 @@ int32_t ctgUpdateTableMetaCache(struct SCatalog *pCatalog, STableMetaOutput *out CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); } - if (NULL == pCatalog->dbCache) { - SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxDBCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); - if (NULL == cache) { - ctgError("taosHashInit %d failed", CTG_DEFAULT_CACHE_DB_NUMBER); - CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); - } + CTG_ERR_RET(ctgInitDBCache(pCatalog)); - if (NULL != atomic_val_compare_exchange_ptr(&pCatalog->dbCache, NULL, cache)) { - taosHashCleanup(cache); - } - } + CTG_ERR_JRET(ctgAcquireDBCache(pCatalog, output->dbFName, output->dbId, &dbCache)); - while (true) { - dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, 
output->dbFName, strlen(output->dbFName)); - if (dbCache) { - break; - } - - SCtgDBCache newDbCache = {0}; - - if (taosHashPut(pCatalog->dbCache, output->dbFName, strlen(output->dbFName), &newDbCache, sizeof(newDbCache))) { - ctgError("taosHashPut db to cache failed, db:%s", output->dbFName); - CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); - } - } - - if (NULL == dbCache->tbCache.cache) { - SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxTblCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); - if (NULL == cache) { - ctgError("taosHashInit failed, num:%d", ctgMgmt.cfg.maxTblCacheNum); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - - if (NULL != atomic_val_compare_exchange_ptr(&dbCache->tbCache.cache, NULL, cache)) { - taosHashCleanup(cache); - } - } - - if (NULL == dbCache->tbCache.stbCache) { - SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxTblCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT), true, HASH_ENTRY_LOCK); - if (NULL == cache) { - ctgError("taosHashInit failed, num:%d", ctgMgmt.cfg.maxTblCacheNum); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - - if (NULL != atomic_val_compare_exchange_ptr(&dbCache->tbCache.stbCache, NULL, cache)) { - taosHashCleanup(cache); - } - } + CTG_ERR_JRET(ctgInitTbMetaCache(pCatalog, dbCache)); + CTG_ERR_JRET(ctgInitStbCache(pCatalog, dbCache)); if (CTG_IS_META_CTABLE(output->metaType) || CTG_IS_META_BOTH(output->metaType)) { - if (taosHashPut(dbCache->tbCache.cache, output->ctbName, strlen(output->ctbName), &output->ctbMeta, sizeof(output->ctbMeta)) != 0) { - ctgError("taosHashPut ctbmeta to cache failed, ctbName:%s", output->ctbName); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - - ctgDebug("ctbmeta updated to cache, ctbName:%s", output->ctbName); + CTG_ERR_JRET(ctgUpdateTbMetaImpl(pCatalog, &dbCache->tbCache, output->ctbName, (STableMeta *)&output->ctbMeta, sizeof(output->ctbMeta))); } if (CTG_IS_META_CTABLE(output->metaType)) { @@ -823,42 +969,11 @@ int32_t ctgUpdateTableMetaCache(struct 
SCatalog *pCatalog, STableMetaOutput *out int32_t tbSize = sizeof(*output->tbMeta) + sizeof(SSchema) * (output->tbMeta->tableInfo.numOfColumns + output->tbMeta->tableInfo.numOfTags); if (TSDB_SUPER_TABLE == output->tbMeta->tableType) { - bool newAdded = false; - SSTableMetaVersion metaRent = {.suid = output->tbMeta->suid, .sversion = output->tbMeta->sversion, .tversion = output->tbMeta->tversion}; - strcpy(metaRent.dbFName, output->dbFName); - strcpy(metaRent.stbName, output->tbName); - - CTG_LOCK(CTG_WRITE, &dbCache->tbCache.stbLock); - if (taosHashPut(dbCache->tbCache.cache, output->tbName, strlen(output->tbName), output->tbMeta, tbSize) != 0) { - CTG_UNLOCK(CTG_WRITE, &dbCache->tbCache.stbLock); - ctgError("taosHashPut tablemeta to cache failed, dbFName:%s, tbName:%s", output->dbFName, output->tbName); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - - STableMeta *tbMeta = taosHashGet(dbCache->tbCache.cache, output->tbName, strlen(output->tbName)); - if (taosHashPutExt(dbCache->tbCache.stbCache, &output->tbMeta->suid, sizeof(output->tbMeta->suid), &tbMeta, POINTER_BYTES, &newAdded) != 0) { - CTG_UNLOCK(CTG_WRITE, &dbCache->tbCache.stbLock); - ctgError("taosHashPutExt stable to stable cache failed, suid:%"PRIx64, output->tbMeta->suid); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - CTG_UNLOCK(CTG_WRITE, &dbCache->tbCache.stbLock); - - ctgDebug("update stable to cache, suid:%"PRIx64, output->tbMeta->suid); - - if (newAdded) { - CTG_ERR_JRET(ctgMetaRentAdd(&pCatalog->stbRent, &metaRent, metaRent.suid, sizeof(SSTableMetaVersion))); - } else { - CTG_ERR_JRET(ctgMetaRentUpdate(&pCatalog->stbRent, &metaRent, metaRent.suid, sizeof(SSTableMetaVersion), ctgSTableVersionCompare)); - } + CTG_ERR_JRET(ctgUpdateStbMetaImpl(pCatalog, &dbCache->tbCache, output->dbFName, output->tbName, output->tbMeta, tbSize)); } else { - if (taosHashPut(dbCache->tbCache.cache, output->tbName, strlen(output->tbName), output->tbMeta, tbSize) != 0) { - ctgError("taosHashPut tablemeta to cache 
failed, dbFName:%s, tbName:%s", output->dbFName, output->tbName); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } + CTG_ERR_JRET(ctgUpdateTbMetaImpl(pCatalog, &dbCache->tbCache, output->dbFName, output->tbName, output->tbMeta, tbSize)); } - ctgDebug("update tablemeta to cache, dbFName:%s, tbName:%s", output->dbFName, output->tbName); - _return: if (dbCache) { @@ -868,30 +983,30 @@ _return: CTG_RET(code); } -int32_t ctgGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbName, bool forceUpdate, SCtgDBCache** dbCache) { +int32_t ctgGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbFName, bool forceUpdate, SCtgDBCache** dbCache) { bool inCache = false; if (!forceUpdate) { - CTG_ERR_RET(ctgGetDBVgroupFromCache(pCatalog, dbName, dbCache, &inCache)); + CTG_ERR_RET(ctgGetDBVgroupFromCache(pCatalog, dbFName, dbCache, &inCache)); if (inCache) { return TSDB_CODE_SUCCESS; } - ctgDebug("failed to get DB vgroupInfo from cache, dbName:%s, load it from mnode, update:%d", dbName, forceUpdate); + ctgDebug("failed to get DB vgroupInfo from cache, dbName:%s, load it from mnode, update:%d", dbFName, forceUpdate); } SUseDbOutput DbOut = {0}; SBuildUseDBInput input = {0}; - tstrncpy(input.db, dbName, tListLen(input.db)); + tstrncpy(input.db, dbFName, tListLen(input.db)); input.vgVersion = CTG_DEFAULT_INVALID_VERSION; while (true) { CTG_ERR_RET(ctgGetDBVgroupFromMnode(pCatalog, pRpc, pMgmtEps, &input, &DbOut)); - CTG_ERR_RET(catalogUpdateDBVgroup(pCatalog, dbName, DbOut.dbVgroup)); - CTG_ERR_RET(ctgGetDBVgroupFromCache(pCatalog, dbName, dbCache, &inCache)); + CTG_ERR_RET(catalogUpdateDBVgroup(pCatalog, dbFName, DbOut.dbVgroup)); + CTG_ERR_RET(ctgGetDBVgroupFromCache(pCatalog, dbFName, dbCache, &inCache)); if (!inCache) { - ctgWarn("can't get db vgroup from cache, will retry, db:%s", dbName); + ctgWarn("can't get db vgroup from cache, will retry, db:%s", dbFName); continue; } @@ -901,58 +1016,90 @@ int32_t 
ctgGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgm return TSDB_CODE_SUCCESS; } +void ctgRemoveAndFreeTableMeta(struct SCatalog* pCatalog, SCtgTbMetaCache *cache) { + CTG_LOCK(CTG_WRITE, &cache->stbLock); + if (cache->stbCache) { + void *pIter = taosHashIterate(cache->stbCache, NULL); + while (pIter) { + uint64_t suid = 0; + taosHashGetKey(pIter, &suid, NULL); -int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, const char* dbName, uint64_t dbId, bool *removed) { - *removed = false; + CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->stbRent, suid, ctgSTableVersionCompare)); + ctgDebug("stb removed from rent, suid:%"PRIx64, suid); + + pIter = taosHashIterate(cache->stbCache, pIter); + } + } + CTG_UNLOCK(CTG_WRITE, &cache->stbLock); - SCtgDBCache *dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbName, strlen(dbName)); - if (NULL == dbCache) { - ctgInfo("db not exist in dbCache, may be removed, db:%s", dbName); - return TSDB_CODE_SUCCESS; - } - - CTG_LOCK(CTG_WRITE, &dbCache->vgLock); - - if (NULL == dbCache->vgInfo) { - ctgInfo("db vgInfo not in dbCache, may be removed, db:%s, dbId:%"PRIx64, dbName, dbId); - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - taosHashRelease(pCatalog->dbCache, dbCache); - return TSDB_CODE_SUCCESS; - } - - if (dbCache->vgInfo->dbId != dbId) { - ctgInfo("db id already updated, db:%s, dbId:%"PRIx64 ", targetId:%"PRIx64, dbName, dbCache->vgInfo->dbId, dbId); - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - taosHashRelease(pCatalog->dbCache, dbCache); - return TSDB_CODE_SUCCESS; - } - - if (dbCache->vgInfo->vgHash) { - ctgInfo("cleanup db vgInfo, db:%s, dbId:%"PRIx64, dbName, dbId); - taosHashCleanup(dbCache->vgInfo->vgHash); - tfree(dbCache->vgInfo); - } + ctgFreeTableMetaCache(cache); +} - if (taosHashRemove(pCatalog->dbCache, dbName, strlen(dbName))) { - ctgError("taosHashRemove from dbCache failed, db:%s", dbName); - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - taosHashRelease(pCatalog->dbCache, dbCache); +int32_t 
ctgValidateAndRemoveDb(struct SCatalog* pCatalog, SCtgDBCache *dbCache, const char* dbFName) { + if (taosHashRemove(pCatalog->dbCache, dbFName, strlen(dbFName))) { + ctgError("taosHashRemove from dbCache failed, dbFName:%s", dbFName); CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); } - dbCache->deleted = true; + atomic_store_8(&dbCache->deleted, 1); + CTG_LOCK(CTG_WRITE, &dbCache->vgLock); + if (dbCache->vgInfo) { + ctgInfo("cleanup db vgInfo, dbFName:%s, dbId:%"PRIx64, dbFName, dbCache->dbId); + + if (dbCache->vgInfo->vgHash) { + taosHashCleanup(dbCache->vgInfo->vgHash); + } + + tfree(dbCache->vgInfo); + } CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - ctgFreeTableMetaCache(&dbCache->tbCache); + ctgRemoveAndFreeTableMeta(pCatalog, &dbCache->tbCache); - taosHashRelease(pCatalog->dbCache, dbCache); + ctgInfo("db removed from cache, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); - *removed = true; + CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->dbRent, dbCache->dbId, ctgDbVgVersionCompare)); + + ctgDebug("db removed from rent, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); return TSDB_CODE_SUCCESS; } + +int32_t ctgAcquireDBCache(struct SCatalog* pCatalog, const char *dbFName, uint64_t dbId, SCtgDBCache **pCache) { + int32_t code = 0; + SCtgDBCache *dbCache = NULL; + + while (true) { + dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); + if (dbCache) { + if (dbCache->dbId == dbId) { + *pCache = dbCache; + return TSDB_CODE_SUCCESS; + } + + CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); + taosHashRelease(pCatalog->dbCache, dbCache); + dbCache = NULL; + } + + SCtgDBCache newDBCache = {0}; + newDBCache.dbId = dbId; + + CTG_ERR_JRET(ctgAddDBCache(pCatalog, dbFName, &newDBCache)); + } + +_return: + + if (dbCache) { + taosHashRelease(pCatalog->dbCache, dbCache); + } + + CTG_RET(code); +} + + int32_t ctgValidateAndRemoveStbMeta(struct SCatalog* pCatalog, const char* dbName, const char* stbName, uint64_t suid, bool *removed) { 
*removed = false; @@ -970,12 +1117,16 @@ int32_t ctgValidateAndRemoveStbMeta(struct SCatalog* pCatalog, const char* dbNam return TSDB_CODE_SUCCESS; } - if (taosHashRemove(dbCache->tbCache.cache, stbName, strlen(stbName))) { + CTG_LOCK(CTG_READ, &dbCache->tbCache.metaLock); + if (taosHashRemove(dbCache->tbCache.metaCache, stbName, strlen(stbName))) { + CTG_UNLOCK(CTG_READ, &dbCache->tbCache.metaLock); CTG_UNLOCK(CTG_WRITE, &dbCache->tbCache.stbLock); taosHashRelease(pCatalog->dbCache, dbCache); ctgError("stb not exist in cache, db:%s, stb:%s, suid:%"PRIx64, dbName, stbName, suid); CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); - } + } + CTG_UNLOCK(CTG_READ, &dbCache->tbCache.metaLock); + CTG_UNLOCK(CTG_WRITE, &dbCache->tbCache.stbLock); taosHashRelease(pCatalog->dbCache, dbCache); @@ -1255,7 +1406,7 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, return TSDB_CODE_SUCCESS; } -int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbName, bool forceUpdate, SArray** vgroupList) { +int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbFName, bool forceUpdate, SArray** vgroupList) { if (NULL == pCatalog || NULL == dbName || NULL == pRpc || NULL == pMgmtEps || NULL == vgroupList) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } @@ -1265,7 +1416,7 @@ int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* int32_t code = 0; SArray *vgList = NULL; - CTG_ERR_JRET(ctgGetDBVgroup(pCatalog, pRpc, pMgmtEps, dbName, forceUpdate, &dbCache)); + CTG_ERR_JRET(ctgGetDBVgroup(pCatalog, pRpc, pMgmtEps, dbFName, forceUpdate, &dbCache)); int32_t vgNum = (int32_t)taosHashGetSize(dbCache->vgInfo->vgHash); vgList = taosArrayInit(vgNum, sizeof(SVgroupInfo)); @@ -1307,89 +1458,64 @@ _return: } -int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbName, SDBVgroupInfo* dbInfo) { +int32_t catalogUpdateDBVgroup(struct SCatalog* 
pCatalog, const char* dbFName, uint64_t dbId, SDBVgroupInfo* dbInfo) { int32_t code = 0; - if (NULL == pCatalog || NULL == dbName || NULL == dbInfo) { + if (NULL == pCatalog || NULL == dbFName || NULL == dbInfo) { CTG_ERR_JRET(TSDB_CODE_CTG_INVALID_INPUT); } if (NULL == dbInfo->vgHash || dbInfo->vgVersion < 0 || taosHashGetSize(dbInfo->vgHash) <= 0) { - ctgError("invalid db vgInfo, dbName:%s, vgHash:%p, vgVersion:%d", dbName, dbInfo->vgHash, dbInfo->vgVersion); + ctgError("invalid db vgInfo, dbFName:%s, vgHash:%p, vgVersion:%d", dbFName, dbInfo->vgHash, dbInfo->vgVersion); CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); } - if (NULL == pCatalog->dbCache) { - SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxDBCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); - if (NULL == cache) { - ctgError("taosHashInit %d failed", CTG_DEFAULT_CACHE_DB_NUMBER); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); - } - - if (NULL != atomic_val_compare_exchange_ptr(&pCatalog->dbCache, NULL, cache)) { - taosHashCleanup(cache); - } - } + CTG_ERR_JRET(ctgInitDBCache(pCatalog)); bool newAdded = false; - SDbVgVersion vgVersion = {.dbId = dbInfo->dbId, .vgVersion = dbInfo->vgVersion}; - - SCtgDBCache *dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbName, strlen(dbName)); - if (dbCache) { - CTG_LOCK(CTG_WRITE, &dbCache->vgLock); - - if (NULL == dbCache->vgInfo) { - newAdded = true; - - dbCache->vgInfo = dbInfo; - } else { - if (dbCache->vgInfo->dbId != dbInfo->dbId) { - ctgMetaRentRemove(&pCatalog->dbRent, dbCache->vgInfo->dbId, ctgDbVgVersionCompare); - newAdded = true; - } else if (dbInfo->vgVersion <= dbCache->vgInfo->vgVersion) { - ctgInfo("db vgVersion is old, db:%s, vgVersion:%d, current:%d", dbName, dbInfo->vgVersion, dbCache->vgInfo->vgVersion); - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - taosHashRelease(pCatalog->dbCache, dbCache); - - goto _return; - } - - if (dbCache->vgInfo->vgHash) { - ctgInfo("cleanup db vgHash, db:%s", dbName); - 
taosHashCleanup(dbCache->vgInfo->vgHash); - dbCache->vgInfo->vgHash = NULL; - } - - tfree(dbCache->vgInfo); - dbCache->vgInfo = dbInfo; - } + SDbVgVersion vgVersion = {.dbId = dbId, .vgVersion = dbInfo->vgVersion}; + SCtgDBCache *dbCache = NULL; + CTG_ERR_JRET(ctgAcquireDBCache(pCatalog, dbFName, dbId, &dbCache)); + + CTG_LOCK(CTG_WRITE, &dbCache->vgLock); + if (dbCache->deleted) { + ctgInfo("db is dropping, dbFName:%s, dbId:%"PRIx64, dbFName, dbInfo->dbId); CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); taosHashRelease(pCatalog->dbCache, dbCache); + CTG_ERR_JRET(TSDB_CODE_CTG_DB_DROPPED); + } + + if (NULL == dbCache->vgInfo) { + dbCache->vgInfo = dbInfo; } else { - SCtgDBCache newDBCache = {0}; - newDBCache.vgInfo = dbInfo; + if (dbInfo->vgVersion <= dbCache->vgInfo->vgVersion) { + ctgInfo("db vgVersion is old, dbFName:%s, vgVersion:%d, current:%d", dbFName, dbInfo->vgVersion, dbCache->vgInfo->vgVersion); + CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); + taosHashRelease(pCatalog->dbCache, dbCache); + + goto _return; + } - if (taosHashPut(pCatalog->dbCache, dbName, strlen(dbName), &newDBCache, sizeof(newDBCache)) != 0) { - ctgError("taosHashPut db & db vgroup to cache failed, db:%s", dbName); - CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); + if (dbCache->vgInfo->vgHash) { + ctgInfo("cleanup db vgHash, dbFName:%s", dbFName); + taosHashCleanup(dbCache->vgInfo->vgHash); + dbCache->vgInfo->vgHash = NULL; } - newAdded = true; + tfree(dbCache->vgInfo); + dbCache->vgInfo = dbInfo; } dbInfo = NULL; - strncpy(vgVersion.dbFName, dbName, sizeof(vgVersion.dbFName)); - - if (newAdded) { - CTG_ERR_JRET(ctgMetaRentAdd(&pCatalog->dbRent, &vgVersion, vgVersion.dbId, sizeof(SDbVgVersion))); - } else { - CTG_ERR_JRET(ctgMetaRentUpdate(&pCatalog->dbRent, &vgVersion, vgVersion.dbId, sizeof(SDbVgVersion), ctgDbVgVersionCompare)); - } - - ctgDebug("dbName:%s vgroup updated, vgVersion:%d", dbName, vgVersion.vgVersion); + CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); + taosHashRelease(pCatalog->dbCache, 
dbCache); + strncpy(vgVersion.dbFName, dbFName, sizeof(vgVersion.dbFName)); + CTG_ERR_JRET(ctgMetaRentUpdate(&pCatalog->dbRent, &vgVersion, vgVersion.dbId, sizeof(SDbVgVersion), ctgDbVgVersionCompare)); + + ctgDebug("dbCache updated, dbFName:%s, vgVersion:%d, dbId:%"PRIx64, dbFName, vgVersion.vgVersion, vgVersion.dbId); _return: @@ -1403,29 +1529,34 @@ _return: } -int32_t catalogRemoveDB(struct SCatalog* pCatalog, const char* dbName, uint64_t dbId) { +int32_t catalogRemoveDB(struct SCatalog* pCatalog, const char* dbFName, uint64_t dbId) { int32_t code = 0; - bool removed = false; - if (NULL == pCatalog || NULL == dbName) { + if (NULL == pCatalog || NULL == dbFName) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } if (NULL == pCatalog->dbCache) { return TSDB_CODE_SUCCESS; } - - CTG_ERR_RET(ctgValidateAndRemoveDb(pCatalog, dbName, dbId, &removed)); - if (!removed) { + + SCtgDBCache *dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); + if (NULL == dbCache) { + ctgInfo("db not exist in dbCache, may be removed, dbFName:%s", dbFName); + return TSDB_CODE_SUCCESS; + } + + if (dbCache->dbId != dbId) { + ctgInfo("db id already updated, dbFName:%s, dbId:%"PRIx64 ", targetId:%"PRIx64, dbFName, dbCache->dbId, dbId); - return TSDB_CODE_SUCCESS; + goto _return; } - ctgInfo("db removed from cache, db:%s, uid:%"PRIx64, dbName, dbId); + CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); - CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->dbRent, dbId, ctgDbVgVersionCompare)); - - ctgDebug("db removed from rent, db:%s, uid:%"PRIx64, dbName, dbId); +_return: + taosHashRelease(pCatalog->dbCache, dbCache); + CTG_RET(code); } @@ -1464,6 +1595,27 @@ int32_t catalogGetSTableMeta(struct SCatalog* pCatalog, void * pTransporter, con return ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, false, pTableMeta, 1); } +int32_t catalogUpdateSTableMeta(struct SCatalog* pCatalog, STableMetaRsp *rspMsg) { + STableMetaOutput output = {0}; + int32_t code = 0; + +
strcpy(output.dbFName, rspMsg->dbFName); + strcpy(output.tbName, rspMsg->tbName); + + SET_META_TYPE_TABLE(output.metaType); + + CTG_ERR_RET(queryCreateTableMetaFromMsg(rspMsg, true, &output.tbMeta)); + + CTG_ERR_JRET(ctgUpdateTableMetaCache(pCatalog, &output)); + +_return: + + tfree(output.tbMeta); + + CTG_RET(code); +} + + int32_t catalogRenewTableMeta(struct SCatalog* pCatalog, void *pTransporter, const SEpSet* pMgmtEps, const SName* pTableName, int32_t isSTable) { if (NULL == pCatalog || NULL == pTransporter || NULL == pMgmtEps || NULL == pTableName) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); diff --git a/source/libs/catalog/test/catalogTests.cpp b/source/libs/catalog/test/catalogTests.cpp index 751fa72347..5b58794651 100644 --- a/source/libs/catalog/test/catalogTests.cpp +++ b/source/libs/catalog/test/catalogTests.cpp @@ -185,7 +185,6 @@ void ctgTestBuildDBVgroup(SDBVgroupInfo **pdbVgroup) { ctgTestCurrentVgVersion = dbVgroup->vgVersion; dbVgroup->hashMethod = 0; - dbVgroup->dbId = ctgTestDbId; dbVgroup->vgHash = taosHashInit(ctgTestVgNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); vgNum = ctgTestGetVgNumFromVgVersion(dbVgroup->vgVersion); @@ -600,7 +599,7 @@ void *ctgTestSetDbVgroupThread(void *param) { while (!ctgTestStop) { ctgTestBuildDBVgroup(&dbVgroup); - code = catalogUpdateDBVgroup(pCtg, ctgTestDbname, dbVgroup); + code = catalogUpdateDBVgroup(pCtg, ctgTestDbname, ctgTestDbId, dbVgroup); if (code) { assert(0); } @@ -1109,7 +1108,7 @@ TEST(dbVgroup, getSetDbVgroupCase) { taosArrayDestroy(vgList); ctgTestBuildDBVgroup(&dbVgroup); - code = catalogUpdateDBVgroup(pCtg, ctgTestDbname, dbVgroup); + code = catalogUpdateDBVgroup(pCtg, ctgTestDbname, ctgTestDbId, dbVgroup); ASSERT_EQ(code, 0); code = catalogGetTableHashVgroup(pCtg, mockPointer, (const SEpSet *)mockPointer, &n, &vgInfo); diff --git a/source/libs/qcom/src/querymsg.c b/source/libs/qcom/src/querymsg.c index e7b3d08bc5..55099d9972 100644 --- 
a/source/libs/qcom/src/querymsg.c +++ b/source/libs/qcom/src/querymsg.c @@ -119,9 +119,9 @@ int32_t queryProcessUseDBRsp(void* output, char *msg, int32_t msgSize) { return TSDB_CODE_TSC_OUT_OF_MEMORY; } + pOut->dbId = pRsp->uid; pOut->dbVgroup->vgVersion = pRsp->vgVersion; pOut->dbVgroup->hashMethod = pRsp->hashMethod; - pOut->dbVgroup->dbId = pRsp->uid; pOut->dbVgroup->vgHash = taosHashInit(pRsp->vgNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); if (NULL == pOut->dbVgroup->vgHash) { qError("taosHashInit %d failed", pRsp->vgNum); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index ee5bea0ab7..55d40ff98c 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -418,6 +418,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_CTG_INVALID_INPUT, "invalid catalog input TAOS_DEFINE_ERROR(TSDB_CODE_CTG_NOT_READY, "catalog is not ready") TAOS_DEFINE_ERROR(TSDB_CODE_CTG_MEM_ERROR, "catalog memory error") TAOS_DEFINE_ERROR(TSDB_CODE_CTG_SYS_ERROR, "catalog system error") +TAOS_DEFINE_ERROR(TSDB_CODE_CTG_DB_DROPPED, "Database is dropped") //scheduler TAOS_DEFINE_ERROR(TSDB_CODE_SCH_STATUS_ERROR, "scheduler status error") From d2f1e633a3e4db85d7d1b768fc74e1b05313bc0d Mon Sep 17 00:00:00 2001 From: Xiaoyu Wang Date: Mon, 7 Feb 2022 16:09:35 -0500 Subject: [PATCH 03/12] TD-13338 SELECT statement translate code --- include/nodes/nodes.h | 16 +- include/util/taoserror.h | 1 + source/libs/parser/inc/astCreateContext.h | 1 - source/libs/parser/inc/astCreateFuncs.h | 18 +- source/libs/parser/src/astCreateFuncs.c | 9 +- source/libs/parser/src/parserImpl.c | 196 ++++++++++++---------- source/libs/parser/test/newParserTest.cpp | 183 +++++++++++++++++++- source/nodes/src/nodesTraverseFuncs.c | 2 +- source/nodes/src/nodesUtilFuncs.c | 67 +++++++- 9 files changed, 377 insertions(+), 116 deletions(-) diff --git a/include/nodes/nodes.h b/include/nodes/nodes.h index ccb135aa0d..73082825ef 100644 --- a/include/nodes/nodes.h +++ b/include/nodes/nodes.h 
@@ -54,6 +54,9 @@ typedef enum ENodeType { QUERY_NODE_NODE_LIST, QUERY_NODE_FILL, + // only for parser + QUERY_NODE_TARGET_EXPR, + QUERY_NODE_SET_OPERATOR, QUERY_NODE_SELECT_STMT, QUERY_NODE_SHOW_STMT @@ -78,11 +81,6 @@ typedef struct SNodeList { SListCell* pTail; } SNodeList; -typedef struct SNameStr { - int32_t len; - char* pName; -} SNameStr; - typedef struct SDataType { uint8_t type; uint8_t precision; @@ -114,7 +112,7 @@ typedef struct SColumnNode { } SColumnNode; typedef struct SValueNode { - SExprNode type; // QUERY_NODE_VALUE + SExprNode node; // QUERY_NODE_VALUE char* literal; } SValueNode; @@ -146,7 +144,7 @@ typedef enum EOperatorType { } EOperatorType; typedef struct SOperatorNode { - SExprNode type; // QUERY_NODE_OPERATOR + SExprNode node; // QUERY_NODE_OPERATOR EOperatorType opType; SNode* pLeft; SNode* pRight; @@ -332,6 +330,10 @@ void nodesCloneNode(const SNode* pNode); int32_t nodesNodeToString(const SNode* pNode, char** pStr, int32_t* pLen); int32_t nodesStringToNode(const char* pStr, SNode** pNode); +bool nodesIsArithmeticOp(const SOperatorNode* pOp); +bool nodesIsComparisonOp(const SOperatorNode* pOp); +bool nodesIsJsonOp(const SOperatorNode* pOp); + bool nodesIsTimeorderQuery(const SNode* pQuery); bool nodesIsTimelineQuery(const SNode* pQuery); diff --git a/include/util/taoserror.h b/include/util/taoserror.h index b5740b0118..2343fc5a5a 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -444,6 +444,7 @@ int32_t* taosGetErrno(); #define TSDB_CODE_PARSER_INVALID_COLUMN TAOS_DEF_ERROR_CODE(0, 0x2601) //invalid column name #define TSDB_CODE_PARSER_TABLE_NOT_EXIST TAOS_DEF_ERROR_CODE(0, 0x2602) //table not exist #define TSDB_CODE_PARSER_AMBIGUOUS_COLUMN TAOS_DEF_ERROR_CODE(0, 0x2603) //ambiguous column +#define TSDB_CODE_PARSER_WRONG_VALUE_TYPE TAOS_DEF_ERROR_CODE(0, 0x2604) //wrong value type #ifdef __cplusplus } diff --git a/source/libs/parser/inc/astCreateContext.h b/source/libs/parser/inc/astCreateContext.h index 
5458500a82..a0bac9ea7b 100644 --- a/source/libs/parser/inc/astCreateContext.h +++ b/source/libs/parser/inc/astCreateContext.h @@ -28,7 +28,6 @@ typedef struct SAstCreateContext { bool notSupport; bool valid; SNode* pRootNode; - SHashObj* pResourceHash; } SAstCreateContext; int32_t createAstCreateContext(SParseContext* pQueryCxt, SAstCreateContext* pCxt); diff --git a/source/libs/parser/inc/astCreateFuncs.h b/source/libs/parser/inc/astCreateFuncs.h index 15f0792d5c..7cd7e1932d 100644 --- a/source/libs/parser/inc/astCreateFuncs.h +++ b/source/libs/parser/inc/astCreateFuncs.h @@ -13,11 +13,6 @@ * along with this program. If not, see . */ -#include "nodes.h" -#include "nodesShowStmts.h" -#include "astCreateContext.h" -#include "ttoken.h" - #ifndef _TD_AST_CREATE_FUNCS_H_ #define _TD_AST_CREATE_FUNCS_H_ @@ -25,15 +20,26 @@ extern "C" { #endif +#include "nodes.h" +#include "nodesShowStmts.h" +#include "astCreateContext.h" +#include "ttoken.h" + extern SToken nil_token; +typedef struct STargetExprNode { + ENodeType nodeType; + char* p; + uint32_t n; + SNode* pNode; +} STargetExprNode; + SNodeList* createNodeList(SAstCreateContext* pCxt, SNode* pNode); SNodeList* addNodeToList(SAstCreateContext* pCxt, SNodeList* pList, SNode* pNode); SNode* createColumnNode(SAstCreateContext* pCxt, const SToken* pTableAlias, const SToken* pColumnName); SNode* createValueNode(SAstCreateContext* pCxt, int32_t dataType, const SToken* pLiteral); SNode* createDurationValueNode(SAstCreateContext* pCxt, const SToken* pLiteral); -SNode* addMinusSign(SAstCreateContext* pCxt, SNode* pNode); SNode* setProjectionAlias(SAstCreateContext* pCxt, SNode* pNode, const SToken* pAlias); SNode* createLogicConditionNode(SAstCreateContext* pCxt, ELogicConditionType type, SNode* pParam1, SNode* pParam2); SNode* createOperatorNode(SAstCreateContext* pCxt, EOperatorType type, SNode* pLeft, SNode* pRight); diff --git a/source/libs/parser/src/astCreateFuncs.c b/source/libs/parser/src/astCreateFuncs.c index 
6091961ed5..e8b8b42f74 100644 --- a/source/libs/parser/src/astCreateFuncs.c +++ b/source/libs/parser/src/astCreateFuncs.c @@ -76,7 +76,10 @@ SNode* createColumnNode(SAstCreateContext* pCxt, const SToken* pTableAlias, cons SNode* createValueNode(SAstCreateContext* pCxt, int32_t dataType, const SToken* pLiteral) { SValueNode* val = (SValueNode*)nodesMakeNode(QUERY_NODE_VALUE); CHECK_OUT_OF_MEM(val); - // todo + val->literal = strndup(pLiteral->z, pLiteral->n); + CHECK_OUT_OF_MEM(val->literal); + val->node.resType.type = dataType; + val->node.resType.bytes = tDataTypes[dataType].bytes; return (SNode*)val; } @@ -87,10 +90,6 @@ SNode* createDurationValueNode(SAstCreateContext* pCxt, const SToken* pLiteral) return (SNode*)val; } -SNode* addMinusSign(SAstCreateContext* pCxt, SNode* pNode) { - // todo -} - SNode* createLogicConditionNode(SAstCreateContext* pCxt, ELogicConditionType type, SNode* pParam1, SNode* pParam2) { SLogicConditionNode* cond = (SLogicConditionNode*)nodesMakeNode(QUERY_NODE_LOGIC_CONDITION); CHECK_OUT_OF_MEM(cond); diff --git a/source/libs/parser/src/parserImpl.c b/source/libs/parser/src/parserImpl.c index 7182bcfedf..1682a1cb9d 100644 --- a/source/libs/parser/src/parserImpl.c +++ b/source/libs/parser/src/parserImpl.c @@ -28,82 +28,11 @@ extern void NewParseFree(void*, FFree); extern void NewParseTrace(FILE*, char*); static uint32_t toNewTokenId(uint32_t tokenId) { -// #define 1 -// #define NEW_TK_AND 2 -// #define NEW_TK_UNION 3 -// #define NEW_TK_ALL 4 -// #define NEW_TK_MINUS 5 -// #define NEW_TK_EXCEPT 6 -// #define NEW_TK_INTERSECT 7 -// #define NEW_TK_NK_PLUS 8 -// #define NEW_TK_NK_MINUS 9 -// #define NEW_TK_NK_STAR 10 -// #define NEW_TK_NK_SLASH 11 -// #define NEW_TK_NK_REM 12 -// #define NEW_TK_SHOW 13 -// #define NEW_TK_DATABASES 14 -// #define NEW_TK_NK_INTEGER 15 -// #define NEW_TK_NK_FLOAT 16 -// #define NEW_TK_NK_STRING 17 -// #define NEW_TK_NK_BOOL 18 -// #define NEW_TK_TIMESTAMP 19 -// #define NEW_TK_NK_VARIABLE 20 -//
#define NEW_TK_NK_COMMA 21 -// #define NEW_TK_NK_ID 22 -// #define NEW_TK_NK_LP 23 -// #define NEW_TK_NK_RP 24 -// #define NEW_TK_NK_DOT 25 -// #define NEW_TK_BETWEEN 26 -// #define NEW_TK_NOT 27 -// #define NEW_TK_IS 28 -// #define NEW_TK_NULL 29 -// #define NEW_TK_NK_LT 30 -// #define NEW_TK_NK_GT 31 -// #define NEW_TK_NK_LE 32 -// #define NEW_TK_NK_GE 33 -// #define NEW_TK_NK_NE 34 -// #define 35 -// #define NEW_TK_LIKE 36 -// #define NEW_TK_MATCH 37 -// #define NEW_TK_NMATCH 38 -// #define NEW_TK_IN 39 -// #define NEW_TK_FROM 40 -// #define NEW_TK_AS 41 -// #define NEW_TK_JOIN 42 -// #define NEW_TK_ON 43 -// #define NEW_TK_INNER 44 -// #define NEW_TK_SELECT 45 -// #define NEW_TK_DISTINCT 46 -// #define 47 -// #define NEW_TK_PARTITION 48 -// #define NEW_TK_BY 49 -// #define NEW_TK_SESSION 50 -// #define NEW_TK_STATE_WINDOW 51 -// #define NEW_TK_INTERVAL 52 -// #define NEW_TK_SLIDING 53 -// #define NEW_TK_FILL 54 -// #define NEW_TK_VALUE 55 -// #define NEW_TK_NONE 56 -// #define NEW_TK_PREV 57 -// #define NEW_TK_LINEAR 58 -// #define NEW_TK_NEXT 59 -// #define NEW_TK_GROUP 60 -// #define NEW_TK_HAVING 61 -// #define NEW_TK_ORDER 62 -// #define NEW_TK_SLIMIT 63 -// #define NEW_TK_SOFFSET 64 -// #define NEW_TK_LIMIT 65 -// #define NEW_TK_OFFSET 66 -// #define NEW_TK_NK_LR 67 -// #define NEW_TK_ASC 68 -// #define NEW_TK_DESC 69 -// #define NEW_TK_NULLS 70 -// #define NEW_TK_FIRST 71 -// #define NEW_TK_LAST 72 - switch (tokenId) { case TK_OR: return NEW_TK_OR; + case TK_AND: + return NEW_TK_AND; case TK_UNION: return NEW_TK_UNION; case TK_ALL: @@ -116,22 +45,62 @@ static uint32_t toNewTokenId(uint32_t tokenId) { return NEW_TK_NK_STAR; case TK_SLASH: return NEW_TK_NK_SLASH; + case TK_REM: + return NEW_TK_NK_REM; case TK_SHOW: return NEW_TK_SHOW; case TK_DATABASES: return NEW_TK_DATABASES; + case TK_INTEGER: + return NEW_TK_NK_INTEGER; + case TK_FLOAT: + return NEW_TK_NK_FLOAT; + case TK_STRING: + return NEW_TK_NK_STRING; + case TK_BOOL: + return NEW_TK_NK_BOOL; + case 
TK_TIMESTAMP: + return NEW_TK_TIMESTAMP; + case TK_VARIABLE: + return NEW_TK_NK_VARIABLE; + case TK_COMMA: + return NEW_TK_NK_COMMA; case TK_ID: return NEW_TK_NK_ID; case TK_LP: return NEW_TK_NK_LP; case TK_RP: return NEW_TK_NK_RP; - case TK_COMMA: - return NEW_TK_NK_COMMA; case TK_DOT: return NEW_TK_NK_DOT; + case TK_BETWEEN: + return NEW_TK_BETWEEN; + case TK_NOT: + return NEW_TK_NOT; + case TK_IS: + return NEW_TK_IS; + case TK_NULL: + return NEW_TK_NULL; + case TK_LT: + return NEW_TK_NK_LT; + case TK_GT: + return NEW_TK_NK_GT; + case TK_LE: + return NEW_TK_NK_LE; + case TK_GE: + return NEW_TK_NK_GE; + case TK_NE: + return NEW_TK_NK_NE; case TK_EQ: return NEW_TK_NK_EQ; + case TK_LIKE: + return NEW_TK_LIKE; + case TK_MATCH: + return NEW_TK_MATCH; + case TK_NMATCH: + return NEW_TK_NMATCH; + case TK_IN: + return NEW_TK_IN; case TK_SELECT: return NEW_TK_SELECT; case TK_DISTINCT: @@ -142,6 +111,38 @@ static uint32_t toNewTokenId(uint32_t tokenId) { return NEW_TK_AS; case TK_FROM: return NEW_TK_FROM; + case TK_JOIN: + return NEW_TK_JOIN; + // case TK_ON: + // return NEW_TK_ON; + // case TK_INNER: + // return NEW_TK_INNER; + // case TK_PARTITION: + // return NEW_TK_PARTITION; + case TK_SESSION: + return NEW_TK_SESSION; + case TK_STATE_WINDOW: + return NEW_TK_STATE_WINDOW; + case TK_INTERVAL: + return NEW_TK_INTERVAL; + case TK_SLIDING: + return NEW_TK_SLIDING; + case TK_FILL: + return NEW_TK_FILL; + // case TK_VALUE: + // return NEW_TK_VALUE; + case TK_NONE: + return NEW_TK_NONE; + case TK_PREV: + return NEW_TK_PREV; + case TK_LINEAR: + return NEW_TK_LINEAR; + // case TK_NEXT: + // return NEW_TK_NEXT; + case TK_GROUP: + return NEW_TK_GROUP; + case TK_HAVING: + return NEW_TK_HAVING; case TK_ORDER: return NEW_TK_ORDER; case TK_BY: @@ -150,6 +151,14 @@ static uint32_t toNewTokenId(uint32_t tokenId) { return NEW_TK_ASC; case TK_DESC: return NEW_TK_DESC; + case TK_SLIMIT: + return NEW_TK_SLIMIT; + case TK_SOFFSET: + return NEW_TK_SOFFSET; + case TK_LIMIT: + return 
NEW_TK_LIMIT; + case TK_OFFSET: + return NEW_TK_OFFSET; case TK_SPACE: break; default: @@ -224,14 +233,6 @@ abort_parse: return cxt.valid ? TSDB_CODE_SUCCESS : TSDB_CODE_FAILED; } -// typedef struct SNamespace { -// int16_t level; // todo for correlated subquery -// char dbName[TSDB_DB_NAME_LEN]; -// char tableAlias[TSDB_TABLE_NAME_LEN]; -// SHashObj* pColHash; // key is colname, value is index of STableMeta.schema -// STableMeta* pMeta; -// } SNamespace; - typedef enum ESqlClause { SQL_CLAUSE_FROM = 1, SQL_CLAUSE_WHERE @@ -256,6 +257,8 @@ static char* getSyntaxErrFormat(int32_t errCode) { return "Table does not exist : %s"; case TSDB_CODE_PARSER_AMBIGUOUS_COLUMN: return "Column ambiguously defined : %s"; + case TSDB_CODE_PARSER_WRONG_VALUE_TYPE: + return "Invalid value type : %s"; default: return "Unknown error"; } @@ -322,7 +325,8 @@ static void setColumnInfoBySchema(const STableNode* pTable, const SSchema* pColS strcpy(pCol->node.aliasName, pColSchema->name); } pCol->colId = pColSchema->colId; - pCol->colType = pColSchema->type; + // pCol->colType = pColSchema->type; + pCol->node.resType.type = pColSchema->type; pCol->node.resType.bytes = pColSchema->bytes; } @@ -431,6 +435,30 @@ static bool translateValue(STranslateContext* pCxt, SValueNode* pVal) { } static bool translateOperator(STranslateContext* pCxt, SOperatorNode* pOp) { + SDataType ldt = ((SExprNode*)(pOp->pLeft))->resType; + SDataType rdt = ((SExprNode*)(pOp->pRight))->resType; + if (nodesIsArithmeticOp(pOp)) { + if (TSDB_DATA_TYPE_JSON == ldt.type || TSDB_DATA_TYPE_BLOB == ldt.type || + TSDB_DATA_TYPE_JSON == rdt.type || TSDB_DATA_TYPE_BLOB == rdt.type) { + generateSyntaxErrMsg(pCxt, TSDB_CODE_PARSER_WRONG_VALUE_TYPE, ((SExprNode*)(pOp->pRight))->aliasName); + return false; + } + pOp->node.resType.type = TSDB_DATA_TYPE_DOUBLE; + pOp->node.resType.bytes = tDataTypes[TSDB_DATA_TYPE_DOUBLE].bytes; + return true; + } else if (nodesIsComparisonOp(pOp)) { + if (TSDB_DATA_TYPE_JSON == ldt.type || 
TSDB_DATA_TYPE_BLOB == ldt.type || + TSDB_DATA_TYPE_JSON == rdt.type || TSDB_DATA_TYPE_BLOB == rdt.type) { + generateSyntaxErrMsg(pCxt, TSDB_CODE_PARSER_WRONG_VALUE_TYPE, ((SExprNode*)(pOp->pRight))->aliasName); + return false; + } + pOp->node.resType.type = TSDB_DATA_TYPE_BOOL; + pOp->node.resType.bytes = tDataTypes[TSDB_DATA_TYPE_BOOL].bytes; + return true; + } else { + // todo json operator + return true; + } return true; } diff --git a/source/libs/parser/test/newParserTest.cpp b/source/libs/parser/test/newParserTest.cpp index 973a6aff1e..16fd9f26d5 100644 --- a/source/libs/parser/test/newParserTest.cpp +++ b/source/libs/parser/test/newParserTest.cpp @@ -55,10 +55,13 @@ protected: return (TSDB_CODE_SUCCESS != translateCode); } if (NULL != query_.pRoot && QUERY_NODE_SELECT_STMT == nodeType(query_.pRoot)) { - string sql; - selectToSql(query_.pRoot, sql); cout << "input sql : [" << cxt_.pSql << "]" << endl; - cout << "output sql : [" << sql << "]" << endl; + // string sql; + // selectToSql(query_.pRoot, sql); + // cout << "output sql : [" << sql << "]" << endl; + string str; + selectToStr(query_.pRoot, str); + cout << "translate str : \n" << str << endl; } return (TSDB_CODE_SUCCESS == translateCode); } @@ -67,6 +70,162 @@ private: static const int max_err_len = 1024; static const int max_sql_len = 1024 * 1024; + string dataTypeToStr(const SDataType& dt) { + switch (dt.type) { + case TSDB_DATA_TYPE_NULL: + return "NULL"; + case TSDB_DATA_TYPE_BOOL: + return "BOOL"; + case TSDB_DATA_TYPE_TINYINT: + return "TINYINT"; + case TSDB_DATA_TYPE_SMALLINT: + return "SMALLINT"; + case TSDB_DATA_TYPE_INT: + return "INT"; + case TSDB_DATA_TYPE_BIGINT: + return "BIGINT"; + case TSDB_DATA_TYPE_FLOAT: + return "FLOAT"; + case TSDB_DATA_TYPE_DOUBLE: + return "DOUBLE"; + case TSDB_DATA_TYPE_BINARY: + return "BINARY(" + to_string(dt.bytes) + ")"; + case TSDB_DATA_TYPE_TIMESTAMP: + return "TIMESTAMP"; + case TSDB_DATA_TYPE_NCHAR: + return "NCHAR(" + to_string(dt.bytes) + ")"; + case
TSDB_DATA_TYPE_UTINYINT: + return "UTINYINT"; + case TSDB_DATA_TYPE_USMALLINT: + return "USMALLINT"; + case TSDB_DATA_TYPE_UINT: + return "UINT"; + case TSDB_DATA_TYPE_UBIGINT: + return "UBIGINT"; + case TSDB_DATA_TYPE_VARCHAR: + return "VARCHAR(" + to_string(dt.bytes) + ")"; + case TSDB_DATA_TYPE_VARBINARY: + return "VARBINARY(" + to_string(dt.bytes) + ")"; + case TSDB_DATA_TYPE_JSON: + return "JSON"; + case TSDB_DATA_TYPE_DECIMAL: + return "DECIMAL(" + to_string(dt.precision) + ", " + to_string(dt.scale) + ")"; + case TSDB_DATA_TYPE_BLOB: + return "BLOB"; + default: + break; + } + return "Unknown Data Type " + to_string(dt.type); + } + + void nodeToStr(const SNode* node, string& str, bool isProject) { + if (nullptr == node) { + return; + } + + switch (nodeType(node)) { + case QUERY_NODE_COLUMN: { + SColumnNode* pCol = (SColumnNode*)node; + if ('\0' != pCol->dbName[0]) { + str.append(pCol->dbName); + str.append("."); + } + if ('\0' != pCol->tableAlias[0]) { + str.append(pCol->tableAlias); + str.append("."); + } + str.append(pCol->colName); + str.append(" [" + dataTypeToStr(pCol->node.resType) + "]"); + if (isProject) { + str.append(" AS " + string(pCol->node.aliasName)); + } + break; + } + case QUERY_NODE_VALUE: { + SValueNode* pVal = (SValueNode*)node; + str.append(pVal->literal); + str.append(" [" + dataTypeToStr(pVal->node.resType) + "]"); + if (isProject) { + str.append(" AS " + string(pVal->node.aliasName)); + } + break; + } + case QUERY_NODE_OPERATOR: { + SOperatorNode* pOp = (SOperatorNode*)node; + nodeToStr(pOp->pLeft, str, false); + str.append(opTypeToStr(pOp->opType)); + nodeToStr(pOp->pRight, str, false); + str.append(" [" + dataTypeToStr(pOp->node.resType) + "]"); + if (isProject) { + str.append(" AS " + string(pOp->node.aliasName)); + } + break; + } + default: + break; + } + } + + void nodeListToStr(const SNodeList* nodelist, const string& prefix, string& str, bool isProject = false) { + SNode* node = nullptr; + FOREACH(node, nodelist) { + 
str.append(prefix); + nodeToStr(node, str, isProject); + str.append("\n"); + } + } + + void tableToStr(const SNode* node, const string& prefix, string& str) { + const STableNode* table = (const STableNode*)node; + switch (nodeType(node)) { + case QUERY_NODE_REAL_TABLE: { + SRealTableNode* realTable = (SRealTableNode*)table; + str.append(prefix); + if ('\0' != realTable->table.dbName[0]) { + str.append(realTable->table.dbName); + str.append("."); + } + str.append(realTable->table.tableName); + str.append(string(" ") + realTable->table.tableAlias); + break; + } + case QUERY_NODE_TEMP_TABLE: { + STempTableNode* tempTable = (STempTableNode*)table; + str.append(prefix + "(\n"); + selectToStr(tempTable->pSubquery, str, prefix + "\t"); + str.append("\n"); + str.append(prefix + ") "); + str.append(tempTable->table.tableAlias); + break; + } + case QUERY_NODE_JOIN_TABLE: { + SJoinTableNode* joinTable = (SJoinTableNode*)table; + tableToStr(joinTable->pLeft, prefix, str); + str.append("\n" + prefix + "JOIN\n"); + tableToStr(joinTable->pRight, prefix, str); + if (nullptr != joinTable->pOnCond) { + str.append("\n" + prefix + "\tON "); + nodeToStr(joinTable->pOnCond, str, false); + } + break; + } + default: + break; + } + } + + void selectToStr(const SNode* node, string& str, const string& prefix = "") { + SSelectStmt* select = (SSelectStmt*)node; + str.append(prefix + "SELECT "); + if (select->isDistinct) { + str.append("DISTINCT"); + } + str.append("\n"); + nodeListToStr(select->pProjectionList, prefix + "\t", str, true); + str.append("\n" + prefix + "FROM\n"); + tableToStr(select->pFromTable, prefix + "\t", str); + } + void selectToSql(const SNode* node, string& sql) { SSelectStmt* select = (SSelectStmt*)node; sql.append("SELECT "); @@ -123,7 +282,7 @@ private: } } - string opTypeToSql(EOperatorType type) { + string opTypeToStr(EOperatorType type) { switch (type) { case OP_TYPE_ADD: return " + "; @@ -177,7 +336,7 @@ private: case QUERY_NODE_OPERATOR: { SOperatorNode* pOp = 
(SOperatorNode*)node; nodeToSql(pOp->pLeft, sql); - sql.append(opTypeToSql(pOp->opType)); + sql.append(opTypeToStr(pOp->opType)); nodeToSql(pOp->pRight, sql); break; } @@ -213,8 +372,7 @@ private: SQuery query_; }; -// SELECT * FROM t1 -TEST_F(NewParserTest, selectStar) { +TEST_F(NewParserTest, selectSimple) { setDatabase("root", "test"); bind("SELECT * FROM t1"); @@ -233,7 +391,14 @@ TEST_F(NewParserTest, selectStar) { ASSERT_TRUE(run()); } -TEST_F(NewParserTest, syntaxError) { +TEST_F(NewParserTest, selectExpression) { + setDatabase("root", "test"); + + bind("SELECT c1 + 10, c2 FROM t1"); + ASSERT_TRUE(run()); +} + +TEST_F(NewParserTest, selectSyntaxError) { setDatabase("root", "test"); bind("SELECTT * FROM t1"); @@ -249,7 +414,7 @@ TEST_F(NewParserTest, syntaxError) { ASSERT_TRUE(run(TSDB_CODE_FAILED)); } -TEST_F(NewParserTest, semanticError) { +TEST_F(NewParserTest, selectSemanticError) { setDatabase("root", "test"); bind("SELECT * FROM t10"); diff --git a/source/nodes/src/nodesTraverseFuncs.c b/source/nodes/src/nodesTraverseFuncs.c index 444ff7cbcf..0702254b5f 100644 --- a/source/nodes/src/nodesTraverseFuncs.c +++ b/source/nodes/src/nodesTraverseFuncs.c @@ -109,7 +109,7 @@ void nodesWalkNodePostOrder(SNode* pNode, FQueryNodeWalker walker, void* pContex } void nodesWalkListPostOrder(SNodeList* pList, FQueryNodeWalker walker, void* pContext) { - (void)walkList(pList, TRAVERSAL_PREORDER, walker, pContext); + (void)walkList(pList, TRAVERSAL_POSTORDER, walker, pContext); } bool nodesWalkStmt(SNode* pNode, FQueryNodeWalker walker, void* pContext) { diff --git a/source/nodes/src/nodesUtilFuncs.c b/source/nodes/src/nodesUtilFuncs.c index bf4c4a83cc..af6cec755d 100644 --- a/source/nodes/src/nodesUtilFuncs.c +++ b/source/nodes/src/nodesUtilFuncs.c @@ -70,8 +70,19 @@ SNode* nodesMakeNode(ENodeType type) { return NULL; } -void nodesDestroyNode(SNode* pNode) { +static bool destroyNode(SNode* pNode, void* pContext) { + switch (nodeType(pNode)) { + case QUERY_NODE_VALUE: + 
tfree(((SValueNode*)pNode)->literal); + break; + default: + break; + } + tfree(pNode); +} +void nodesDestroyNode(SNode* pNode) { + nodesWalkNodePostOrder(pNode, destroyNode, NULL); } SNodeList* nodesMakeList() { @@ -103,13 +114,63 @@ SNodeList* nodesListAppend(SNodeList* pList, SNode* pNode) { } void nodesDestroyList(SNodeList* pList) { + SNode* node; + FOREACH(node, pList) { + nodesDestroyNode(node); + } + tfree(pList); +} +bool nodesIsArithmeticOp(const SOperatorNode* pOp) { + switch (pOp->opType) { + case OP_TYPE_ADD: + case OP_TYPE_SUB: + case OP_TYPE_MULTI: + case OP_TYPE_DIV: + case OP_TYPE_MOD: + return true; + default: + break; + } + return false; +} + +bool nodesIsComparisonOp(const SOperatorNode* pOp) { + switch (pOp->opType) { + case OP_TYPE_GREATER_THAN: + case OP_TYPE_GREATER_EQUAL: + case OP_TYPE_LOWER_THAN: + case OP_TYPE_LOWER_EQUAL: + case OP_TYPE_EQUAL: + case OP_TYPE_NOT_EQUAL: + case OP_TYPE_IN: + case OP_TYPE_NOT_IN: + case OP_TYPE_LIKE: + case OP_TYPE_NOT_LIKE: + case OP_TYPE_MATCH: + case OP_TYPE_NMATCH: + return true; + default: + break; + } + return false; +} + +bool nodesIsJsonOp(const SOperatorNode* pOp) { + switch (pOp->opType) { + case OP_TYPE_JSON_GET_VALUE: + case OP_TYPE_JSON_CONTAINS: + return true; + default: + break; + } + return false; } bool nodesIsTimeorderQuery(const SNode* pQuery) { - + return false; } bool nodesIsTimelineQuery(const SNode* pQuery) { - + return false; } \ No newline at end of file From b9eb9d87a2ff2961dab5fa595f58b1ea0d61ba15 Mon Sep 17 00:00:00 2001 From: Xiaoyu Wang Date: Mon, 7 Feb 2022 22:56:41 -0500 Subject: [PATCH 04/12] TD-13338 SELECT statement translate code --- include/nodes/nodes.h | 9 +- source/libs/parser/inc/astCreateFuncs.h | 10 +- source/libs/parser/inc/new_sql.y | 103 +++++++++++------ source/libs/parser/src/astCreateFuncs.c | 41 ++++++- source/libs/parser/src/new_sql.c | 141 +++++++++++++++++------- source/nodes/src/nodesTraverseFuncs.c | 29 +++++ source/nodes/src/nodesUtilFuncs.c | 8 +- 7 
files changed, 257 insertions(+), 84 deletions(-) diff --git a/include/nodes/nodes.h b/include/nodes/nodes.h index 73082825ef..ae29445865 100644 --- a/include/nodes/nodes.h +++ b/include/nodes/nodes.h @@ -55,7 +55,7 @@ typedef enum ENodeType { QUERY_NODE_FILL, // only for parser - QUERY_NODE_TARGET_EXPR, + QUERY_NODE_RAW_EXPR, QUERY_NODE_SET_OPERATOR, QUERY_NODE_SELECT_STMT, @@ -81,6 +81,13 @@ typedef struct SNodeList { SListCell* pTail; } SNodeList; +typedef struct SRawExprNode { + ENodeType nodeType; + char* p; + uint32_t n; + SNode* pNode; +} SRawExprNode; + typedef struct SDataType { uint8_t type; uint8_t precision; diff --git a/source/libs/parser/inc/astCreateFuncs.h b/source/libs/parser/inc/astCreateFuncs.h index 7cd7e1932d..82315a5ba4 100644 --- a/source/libs/parser/inc/astCreateFuncs.h +++ b/source/libs/parser/inc/astCreateFuncs.h @@ -27,12 +27,10 @@ extern "C" { extern SToken nil_token; -typedef struct STargetExprNode { - ENodeType nodeType; - char* p; - uint32_t n; - SNode* pNode; -} STargetExprNode; +SNode* createRawExprNode(SAstCreateContext* pCxt, const SToken* pToken, SNode* pNode); +SNode* createRawExprNodeExt(SAstCreateContext* pCxt, const SToken* pStart, const SToken* pEnd, SNode* pNode); +SNode* releaseRawExprNode(SAstCreateContext* pCxt, SNode* pNode); +SToken getTokenFromRawExprNode(SAstCreateContext* pCxt, SNode* pNode); SNodeList* createNodeList(SAstCreateContext* pCxt, SNode* pNode); SNodeList* addNodeToList(SAstCreateContext* pCxt, SNodeList* pList, SNode* pNode); diff --git a/source/libs/parser/inc/new_sql.y b/source/libs/parser/inc/new_sql.y index 6616b80d17..72207e2bf4 100644 --- a/source/libs/parser/inc/new_sql.y +++ b/source/libs/parser/inc/new_sql.y @@ -67,19 +67,19 @@ cmd ::= SHOW DATABASES. cmd ::= query_expression(A). { PARSER_TRACE; pCxt->pRootNode = A; } /************************************************ literal *************************************************************/ -literal(A) ::= NK_INTEGER(B). 
{ PARSER_TRACE; A = createValueNode(pCxt, TSDB_DATA_TYPE_BIGINT, &B); } -literal(A) ::= NK_FLOAT(B). { PARSER_TRACE; A = createValueNode(pCxt, TSDB_DATA_TYPE_DOUBLE, &B); } -literal(A) ::= NK_STRING(B). { PARSER_TRACE; A = createValueNode(pCxt, TSDB_DATA_TYPE_BINARY, &B); } -literal(A) ::= NK_BOOL(B). { PARSER_TRACE; A = createValueNode(pCxt, TSDB_DATA_TYPE_BOOL, &B); } -literal(A) ::= TIMESTAMP NK_STRING(B). { PARSER_TRACE; A = createValueNode(pCxt, TSDB_DATA_TYPE_TIMESTAMP, &B); } +literal(A) ::= NK_INTEGER(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createValueNode(pCxt, TSDB_DATA_TYPE_BIGINT, &B)); } +literal(A) ::= NK_FLOAT(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createValueNode(pCxt, TSDB_DATA_TYPE_DOUBLE, &B)); } +literal(A) ::= NK_STRING(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createValueNode(pCxt, TSDB_DATA_TYPE_BINARY, &B)); } +literal(A) ::= NK_BOOL(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createValueNode(pCxt, TSDB_DATA_TYPE_BOOL, &B)); } +literal(A) ::= TIMESTAMP(B) NK_STRING(C). { PARSER_TRACE; A = createRawExprNodeExt(pCxt, &B, &C, createValueNode(pCxt, TSDB_DATA_TYPE_TIMESTAMP, &C)); } literal(A) ::= duration_literal(B). { PARSER_TRACE; A = B; } -duration_literal(A) ::= NK_VARIABLE(B). { PARSER_TRACE; A = createDurationValueNode(pCxt, &B); } +duration_literal(A) ::= NK_VARIABLE(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createDurationValueNode(pCxt, &B)); } %type literal_list { SNodeList* } %destructor literal_list { PARSER_DESTRUCTOR_TRACE; nodesDestroyList($$); } -literal_list(A) ::= literal(B). { PARSER_TRACE; A = createNodeList(pCxt, B); } -literal_list(A) ::= literal_list(B) NK_COMMA literal(C). { PARSER_TRACE; A = addNodeToList(pCxt, B, C); } +literal_list(A) ::= literal(B). { PARSER_TRACE; A = createNodeList(pCxt, releaseRawExprNode(pCxt, B)); } +literal_list(A) ::= literal_list(B) NK_COMMA literal(C). 
{ PARSER_TRACE; A = addNodeToList(pCxt, B, releaseRawExprNode(pCxt, C)); } /************************************************ names and identifiers ***********************************************/ %type db_name { SToken } @@ -111,37 +111,70 @@ expression(A) ::= literal(B). //expression(A) ::= NK_QUESTION(B). { PARSER_TRACE; A = B; } //expression(A) ::= pseudo_column(B). { PARSER_TRACE; A = B; } expression(A) ::= column_reference(B). { PARSER_TRACE; A = B; } -expression(A) ::= function_name(B) NK_LP expression_list(C) NK_RP. { PARSER_TRACE; A = createFunctionNode(pCxt, &B, C); } +expression(A) ::= function_name(B) NK_LP expression_list(C) NK_RP(D). { PARSER_TRACE; A = createRawExprNodeExt(pCxt, &B, &D, createFunctionNode(pCxt, &B, C)); } //expression(A) ::= cast_expression(B). { PARSER_TRACE; A = B; } //expression(A) ::= case_expression(B). { PARSER_TRACE; A = B; } expression(A) ::= subquery(B). { PARSER_TRACE; A = B; } -expression(A) ::= NK_LP expression(B) NK_RP. { PARSER_TRACE; A = B; } -expression(A) ::= NK_PLUS expression(B). { PARSER_TRACE; A = B; } -expression(A) ::= NK_MINUS expression(B). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_SUB, B, NULL); } -expression(A) ::= expression(B) NK_PLUS expression(C). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_ADD, B, C); } -expression(A) ::= expression(B) NK_MINUS expression(C). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_SUB, B, C); } -expression(A) ::= expression(B) NK_STAR expression(C). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_MULTI, B, C); } -expression(A) ::= expression(B) NK_SLASH expression(C). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_DIV, B, C); } -expression(A) ::= expression(B) NK_REM expression(C). { PARSER_TRACE; A = createOperatorNode(pCxt, OP_TYPE_MOD, B, C); } +expression(A) ::= NK_LP(B) expression(C) NK_RP(D). { PARSER_TRACE; A = createRawExprNodeExt(pCxt, &B, &D, releaseRawExprNode(pCxt, C)); } +expression(A) ::= NK_PLUS(B) expression(C). 
{ + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &B, &t, releaseRawExprNode(pCxt, C)); + } +expression(A) ::= NK_MINUS(B) expression(C). { + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &B, &t, createOperatorNode(pCxt, OP_TYPE_SUB, releaseRawExprNode(pCxt, C), NULL)); + } +expression(A) ::= expression(B) NK_PLUS expression(C). { + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, B); + SToken e = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_ADD, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C))); + } +expression(A) ::= expression(B) NK_MINUS expression(C). { + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, B); + SToken e = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_SUB, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C))); + } +expression(A) ::= expression(B) NK_STAR expression(C). { + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, B); + SToken e = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_MULTI, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C))); + } +expression(A) ::= expression(B) NK_SLASH expression(C). { + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, B); + SToken e = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_DIV, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C))); + } +expression(A) ::= expression(B) NK_REM expression(C). 
{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, B); + SToken e = getTokenFromRawExprNode(pCxt, C); + A = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_MOD, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C))); + } %type expression_list { SNodeList* } %destructor expression_list { PARSER_DESTRUCTOR_TRACE; nodesDestroyList($$); } -expression_list(A) ::= expression(B). { PARSER_TRACE; A = createNodeList(pCxt, B); } -expression_list(A) ::= expression_list(B) NK_COMMA expression(C). { PARSER_TRACE; A = addNodeToList(pCxt, B, C); } +expression_list(A) ::= expression(B). { PARSER_TRACE; A = createNodeList(pCxt, releaseRawExprNode(pCxt, B)); } +expression_list(A) ::= expression_list(B) NK_COMMA expression(C). { PARSER_TRACE; A = addNodeToList(pCxt, B, releaseRawExprNode(pCxt, C)); } -column_reference(A) ::= column_name(B). { PARSER_TRACE; A = createColumnNode(pCxt, NULL, &B); } -column_reference(A) ::= table_name(B) NK_DOT column_name(C). { PARSER_TRACE; A = createColumnNode(pCxt, &B, &C); } +column_reference(A) ::= column_name(B). { PARSER_TRACE; A = createRawExprNode(pCxt, &B, createColumnNode(pCxt, NULL, &B)); } +column_reference(A) ::= table_name(B) NK_DOT column_name(C). { PARSER_TRACE; A = createRawExprNodeExt(pCxt, &B, &C, createColumnNode(pCxt, &B, &C)); } //pseudo_column(A) ::= NK_NOW. { PARSER_TRACE; A = createFunctionNode(pCxt, NULL, NULL); } /************************************************ predicate ***********************************************************/ -predicate(A) ::= expression(B) compare_op(C) expression(D). { PARSER_TRACE; A = createOperatorNode(pCxt, C, B, D); } +predicate(A) ::= expression(B) compare_op(C) expression(D). { PARSER_TRACE; A = createOperatorNode(pCxt, C, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, D)); } //predicate(A) ::= expression(B) compare_op sub_type expression(B). -predicate(A) ::= expression(B) BETWEEN expression(C) AND expression(D). 
{ PARSER_TRACE; A = createBetweenAnd(pCxt, B, C, D); } -predicate(A) ::= expression(B) NOT BETWEEN expression(C) AND expression(D). { PARSER_TRACE; A = createNotBetweenAnd(pCxt, C, B, D); } -predicate(A) ::= expression(B) IS NULL. { PARSER_TRACE; A = createIsNullCondNode(pCxt, B, true); } -predicate(A) ::= expression(B) IS NOT NULL. { PARSER_TRACE; A = createIsNullCondNode(pCxt, B, false); } -predicate(A) ::= expression(B) in_op(C) in_predicate_value(D). { PARSER_TRACE; A = createOperatorNode(pCxt, C, B, D); } +predicate(A) ::= expression(B) BETWEEN expression(C) AND expression(D). { PARSER_TRACE; A = createBetweenAnd(pCxt, releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, C), releaseRawExprNode(pCxt, D)); } +predicate(A) ::= expression(B) NOT BETWEEN expression(C) AND expression(D). { PARSER_TRACE; A = createNotBetweenAnd(pCxt, releaseRawExprNode(pCxt, C), releaseRawExprNode(pCxt, B), releaseRawExprNode(pCxt, D)); } +predicate(A) ::= expression(B) IS NULL. { PARSER_TRACE; A = createIsNullCondNode(pCxt, releaseRawExprNode(pCxt, B), true); } +predicate(A) ::= expression(B) IS NOT NULL. { PARSER_TRACE; A = createIsNullCondNode(pCxt, releaseRawExprNode(pCxt, B), false); } +predicate(A) ::= expression(B) in_op(C) in_predicate_value(D). { PARSER_TRACE; A = createOperatorNode(pCxt, C, releaseRawExprNode(pCxt, B), D); } %type compare_op { EOperatorType } %destructor compare_op { PARSER_DESTRUCTOR_TRACE; } @@ -236,9 +269,13 @@ select_list(A) ::= select_sublist(B). select_sublist(A) ::= select_item(B). { PARSER_TRACE; A = createNodeList(pCxt, B); } select_sublist(A) ::= select_sublist(B) NK_COMMA select_item(C). { PARSER_TRACE; A = addNodeToList(pCxt, B, C); } -select_item(A) ::= expression(B). { PARSER_TRACE; A = B; } -select_item(A) ::= expression(B) column_alias(C). { PARSER_TRACE; A = setProjectionAlias(pCxt, B, &C); } -select_item(A) ::= expression(B) AS column_alias(C). { PARSER_TRACE; A = setProjectionAlias(pCxt, B, &C); } +select_item(A) ::= expression(B). 
{ + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, B); + A = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, B), &t); + } +select_item(A) ::= expression(B) column_alias(C). { PARSER_TRACE; A = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, B), &C); } +select_item(A) ::= expression(B) AS column_alias(C). { PARSER_TRACE; A = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, B), &C); } select_item(A) ::= table_name(B) NK_DOT NK_STAR(C). { PARSER_TRACE; A = createColumnNode(pCxt, &B, &C); } where_clause_opt(A) ::= . { PARSER_TRACE; A = NULL; } @@ -251,8 +288,8 @@ partition_by_clause_opt(A) ::= PARTITION BY expression_list(B). twindow_clause_opt(A) ::= . { PARSER_TRACE; A = NULL; } twindow_clause_opt(A) ::= - SESSION NK_LP column_reference(B) NK_COMMA NK_INTEGER(C) NK_RP. { PARSER_TRACE; A = createSessionWindowNode(pCxt, B, &C); } -twindow_clause_opt(A) ::= STATE_WINDOW NK_LP column_reference(B) NK_RP. { PARSER_TRACE; A = createStateWindowNode(pCxt, B); } + SESSION NK_LP column_reference(B) NK_COMMA NK_INTEGER(C) NK_RP. { PARSER_TRACE; A = createSessionWindowNode(pCxt, releaseRawExprNode(pCxt, B), &C); } +twindow_clause_opt(A) ::= STATE_WINDOW NK_LP column_reference(B) NK_RP. { PARSER_TRACE; A = createStateWindowNode(pCxt, releaseRawExprNode(pCxt, B)); } twindow_clause_opt(A) ::= INTERVAL NK_LP duration_literal(B) NK_RP sliding_opt(C) fill_opt(D). { PARSER_TRACE; A = createIntervalWindowNode(pCxt, B, NULL, C, D); } twindow_clause_opt(A) ::= @@ -330,7 +367,7 @@ sort_specification_list(A) ::= sort_specification_list(B) NK_COMMA sort_specification(C). { PARSER_TRACE; A = addNodeToList(pCxt, B, C); } sort_specification(A) ::= - expression(B) ordering_specification_opt(C) null_ordering_opt(D). { PARSER_TRACE; A = createOrderByExprNode(pCxt, B, C, D); } + expression(B) ordering_specification_opt(C) null_ordering_opt(D). 
{ PARSER_TRACE; A = createOrderByExprNode(pCxt, releaseRawExprNode(pCxt, B), C, D); } %type ordering_specification_opt EOrder %destructor ordering_specification_opt { PARSER_DESTRUCTOR_TRACE; } diff --git a/source/libs/parser/src/astCreateFuncs.c b/source/libs/parser/src/astCreateFuncs.c index e8b8b42f74..5aaa40e0e4 100644 --- a/source/libs/parser/src/astCreateFuncs.c +++ b/source/libs/parser/src/astCreateFuncs.c @@ -24,6 +24,14 @@ } \ } while (0) +#define CHECK_RAW_EXPR_NODE(node) \ + do { \ + if (NULL == (node) || QUERY_NODE_RAW_EXPR != nodeType(node)) { \ + pCxt->valid = false; \ + return NULL; \ + } \ + } while (0) + SToken nil_token = { .type = TK_NIL, .n = 0, .z = NULL }; static bool checkDbName(SAstCreateContext* pCxt, const SToken* pDbName) { @@ -50,6 +58,37 @@ static bool checkColumnName(SAstCreateContext* pCxt, const SToken* pColumnName) return pCxt->valid; } +SNode* createRawExprNode(SAstCreateContext* pCxt, const SToken* pToken, SNode* pNode) { + SRawExprNode* target = (SRawExprNode*)nodesMakeNode(QUERY_NODE_RAW_EXPR); + CHECK_OUT_OF_MEM(target); + target->p = pToken->z; + target->n = pToken->n; + target->pNode = pNode; + return (SNode*)target; +} + +SNode* createRawExprNodeExt(SAstCreateContext* pCxt, const SToken* pStart, const SToken* pEnd, SNode* pNode) { + SRawExprNode* target = (SRawExprNode*)nodesMakeNode(QUERY_NODE_RAW_EXPR); + CHECK_OUT_OF_MEM(target); + target->p = pStart->z; + target->n = (pEnd->z + pEnd->n) - pStart->z; + target->pNode = pNode; + return (SNode*)target; +} + +SNode* releaseRawExprNode(SAstCreateContext* pCxt, SNode* pNode) { + CHECK_RAW_EXPR_NODE(pNode); + SNode* tmp = ((SRawExprNode*)pNode)->pNode; + tfree(pNode); + return tmp; +} + +SToken getTokenFromRawExprNode(SAstCreateContext* pCxt, SNode* pNode) { + SRawExprNode* target = (SRawExprNode*)pNode; + SToken t = { .type = 0, .z = target->p, .n = target->n}; + return t; +} + SNodeList* createNodeList(SAstCreateContext* pCxt, SNode* pNode) { SNodeList* list = nodesMakeList(); 
CHECK_OUT_OF_MEM(list); @@ -86,7 +125,7 @@ SNode* createValueNode(SAstCreateContext* pCxt, int32_t dataType, const SToken* SNode* createDurationValueNode(SAstCreateContext* pCxt, const SToken* pLiteral) { SValueNode* val = (SValueNode*)nodesMakeNode(QUERY_NODE_VALUE); CHECK_OUT_OF_MEM(val); - // todo + // todo : calc, for example, 10s return (SNode*)val; } diff --git a/source/libs/parser/src/new_sql.c b/source/libs/parser/src/new_sql.c index 8ce36a6425..85f73e5eb8 100644 --- a/source/libs/parser/src/new_sql.c +++ b/source/libs/parser/src/new_sql.c @@ -1517,23 +1517,24 @@ static YYACTIONTYPE yy_reduce( { PARSER_TRACE; pCxt->pRootNode = yymsp[0].minor.yy168; } break; case 2: /* literal ::= NK_INTEGER */ -{ PARSER_TRACE; yylhsminor.yy168 = createValueNode(pCxt, TSDB_DATA_TYPE_BIGINT, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy0, createValueNode(pCxt, TSDB_DATA_TYPE_BIGINT, &yymsp[0].minor.yy0)); } yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 3: /* literal ::= NK_FLOAT */ -{ PARSER_TRACE; yylhsminor.yy168 = createValueNode(pCxt, TSDB_DATA_TYPE_DOUBLE, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy0, createValueNode(pCxt, TSDB_DATA_TYPE_DOUBLE, &yymsp[0].minor.yy0)); } yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 4: /* literal ::= NK_STRING */ -{ PARSER_TRACE; yylhsminor.yy168 = createValueNode(pCxt, TSDB_DATA_TYPE_BINARY, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy0, createValueNode(pCxt, TSDB_DATA_TYPE_BINARY, &yymsp[0].minor.yy0)); } yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 5: /* literal ::= NK_BOOL */ -{ PARSER_TRACE; yylhsminor.yy168 = createValueNode(pCxt, TSDB_DATA_TYPE_BOOL, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy0, createValueNode(pCxt, TSDB_DATA_TYPE_BOOL, &yymsp[0].minor.yy0)); } 
yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 6: /* literal ::= TIMESTAMP NK_STRING */ -{ PARSER_TRACE; yymsp[-1].minor.yy168 = createValueNode(pCxt, TSDB_DATA_TYPE_TIMESTAMP, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-1].minor.yy0, &yymsp[0].minor.yy0, createValueNode(pCxt, TSDB_DATA_TYPE_TIMESTAMP, &yymsp[0].minor.yy0)); } + yymsp[-1].minor.yy168 = yylhsminor.yy168; break; case 7: /* literal ::= duration_literal */ case 17: /* expression ::= literal */ yytestcase(yyruleno==17); @@ -1545,7 +1546,6 @@ static YYACTIONTYPE yy_reduce( case 61: /* table_reference ::= table_primary */ yytestcase(yyruleno==61); case 62: /* table_reference ::= joined_table */ yytestcase(yyruleno==62); case 66: /* table_primary ::= parenthesized_joined_table */ yytestcase(yyruleno==66); - case 82: /* select_item ::= expression */ yytestcase(yyruleno==82); case 110: /* query_expression_body ::= query_primary */ yytestcase(yyruleno==110); case 112: /* query_primary ::= query_specification */ yytestcase(yyruleno==112); case 124: /* search_condition ::= boolean_value_expression */ yytestcase(yyruleno==124); @@ -1553,21 +1553,17 @@ static YYACTIONTYPE yy_reduce( yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 8: /* duration_literal ::= NK_VARIABLE */ -{ PARSER_TRACE; yylhsminor.yy168 = createDurationValueNode(pCxt, &yymsp[0].minor.yy0); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy0, createDurationValueNode(pCxt, &yymsp[0].minor.yy0)); } yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 9: /* literal_list ::= literal */ case 29: /* expression_list ::= expression */ yytestcase(yyruleno==29); - case 80: /* select_sublist ::= select_item */ yytestcase(yyruleno==80); - case 125: /* sort_specification_list ::= sort_specification */ yytestcase(yyruleno==125); -{ PARSER_TRACE; yylhsminor.yy192 = createNodeList(pCxt, yymsp[0].minor.yy168); } +{ PARSER_TRACE; yylhsminor.yy192 = createNodeList(pCxt, 
releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); } yymsp[0].minor.yy192 = yylhsminor.yy192; break; case 10: /* literal_list ::= literal_list NK_COMMA literal */ case 30: /* expression_list ::= expression_list NK_COMMA expression */ yytestcase(yyruleno==30); - case 81: /* select_sublist ::= select_sublist NK_COMMA select_item */ yytestcase(yyruleno==81); - case 126: /* sort_specification_list ::= sort_specification_list NK_COMMA sort_specification */ yytestcase(yyruleno==126); -{ PARSER_TRACE; yylhsminor.yy192 = addNodeToList(pCxt, yymsp[-2].minor.yy192, yymsp[0].minor.yy168); } +{ PARSER_TRACE; yylhsminor.yy192 = addNodeToList(pCxt, yymsp[-2].minor.yy192, releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); } yymsp[-2].minor.yy192 = yylhsminor.yy192; break; case 11: /* db_name ::= NK_ID */ @@ -1580,74 +1576,106 @@ static YYACTIONTYPE yy_reduce( yymsp[0].minor.yy241 = yylhsminor.yy241; break; case 19: /* expression ::= function_name NK_LP expression_list NK_RP */ -{ PARSER_TRACE; yylhsminor.yy168 = createFunctionNode(pCxt, &yymsp[-3].minor.yy241, yymsp[-1].minor.yy192); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-3].minor.yy241, &yymsp[0].minor.yy0, createFunctionNode(pCxt, &yymsp[-3].minor.yy241, yymsp[-1].minor.yy192)); } yymsp[-3].minor.yy168 = yylhsminor.yy168; break; case 21: /* expression ::= NK_LP expression NK_RP */ - case 57: /* boolean_primary ::= NK_LP boolean_value_expression NK_RP */ yytestcase(yyruleno==57); - case 70: /* parenthesized_joined_table ::= NK_LP joined_table NK_RP */ yytestcase(yyruleno==70); - case 71: /* parenthesized_joined_table ::= NK_LP parenthesized_joined_table NK_RP */ yytestcase(yyruleno==71); - case 123: /* subquery ::= NK_LP query_expression NK_RP */ yytestcase(yyruleno==123); -{ PARSER_TRACE; yymsp[-2].minor.yy168 = yymsp[-1].minor.yy168; } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-2].minor.yy0, &yymsp[0].minor.yy0, releaseRawExprNode(pCxt, yymsp[-1].minor.yy168)); } + 
yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 22: /* expression ::= NK_PLUS expression */ - case 58: /* from_clause ::= FROM table_reference_list */ yytestcase(yyruleno==58); - case 87: /* where_clause_opt ::= WHERE search_condition */ yytestcase(yyruleno==87); - case 108: /* having_clause_opt ::= HAVING search_condition */ yytestcase(yyruleno==108); -{ PARSER_TRACE; yymsp[-1].minor.yy168 = yymsp[0].minor.yy168; } +{ + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-1].minor.yy0, &t, releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); + } + yymsp[-1].minor.yy168 = yylhsminor.yy168; break; case 23: /* expression ::= NK_MINUS expression */ -{ PARSER_TRACE; yymsp[-1].minor.yy168 = createOperatorNode(pCxt, OP_TYPE_SUB, yymsp[0].minor.yy168, NULL); } +{ + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-1].minor.yy0, &t, createOperatorNode(pCxt, OP_TYPE_SUB, releaseRawExprNode(pCxt, yymsp[0].minor.yy168), NULL)); + } + yymsp[-1].minor.yy168 = yylhsminor.yy168; break; case 24: /* expression ::= expression NK_PLUS expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, OP_TYPE_ADD, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, yymsp[-2].minor.yy168); + SToken e = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_ADD, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168))); + } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 25: /* expression ::= expression NK_MINUS expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, OP_TYPE_SUB, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, 
yymsp[-2].minor.yy168); + SToken e = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_SUB, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168))); + } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 26: /* expression ::= expression NK_STAR expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, OP_TYPE_MULTI, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, yymsp[-2].minor.yy168); + SToken e = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_MULTI, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168))); + } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 27: /* expression ::= expression NK_SLASH expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, OP_TYPE_DIV, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, yymsp[-2].minor.yy168); + SToken e = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_DIV, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168))); + } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 28: /* expression ::= expression NK_REM expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, OP_TYPE_MOD, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ + PARSER_TRACE; + SToken s = getTokenFromRawExprNode(pCxt, yymsp[-2].minor.yy168); + SToken e = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = createRawExprNodeExt(pCxt, &s, &e, createOperatorNode(pCxt, OP_TYPE_MOD, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), 
releaseRawExprNode(pCxt, yymsp[0].minor.yy168))); + } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 31: /* column_reference ::= column_name */ -{ PARSER_TRACE; yylhsminor.yy168 = createColumnNode(pCxt, NULL, &yymsp[0].minor.yy241); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNode(pCxt, &yymsp[0].minor.yy241, createColumnNode(pCxt, NULL, &yymsp[0].minor.yy241)); } yymsp[0].minor.yy168 = yylhsminor.yy168; break; case 32: /* column_reference ::= table_name NK_DOT column_name */ -{ PARSER_TRACE; yylhsminor.yy168 = createColumnNode(pCxt, &yymsp[-2].minor.yy241, &yymsp[0].minor.yy241); } +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-2].minor.yy241, &yymsp[0].minor.yy241, createColumnNode(pCxt, &yymsp[-2].minor.yy241, &yymsp[0].minor.yy241)); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 33: /* predicate ::= expression compare_op expression */ - case 38: /* predicate ::= expression in_op in_predicate_value */ yytestcase(yyruleno==38); -{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, yymsp[-1].minor.yy228, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, yymsp[-1].minor.yy228, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 34: /* predicate ::= expression BETWEEN expression AND expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createBetweenAnd(pCxt, yymsp[-4].minor.yy168, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } +{ PARSER_TRACE; yylhsminor.yy168 = createBetweenAnd(pCxt, releaseRawExprNode(pCxt, yymsp[-4].minor.yy168), releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); } yymsp[-4].minor.yy168 = yylhsminor.yy168; break; case 35: /* predicate ::= expression NOT BETWEEN expression AND expression */ -{ PARSER_TRACE; yylhsminor.yy168 = createNotBetweenAnd(pCxt, yymsp[-2].minor.yy168, 
yymsp[-5].minor.yy168, yymsp[0].minor.yy168); } +{ PARSER_TRACE; yylhsminor.yy168 = createNotBetweenAnd(pCxt, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), releaseRawExprNode(pCxt, yymsp[-5].minor.yy168), releaseRawExprNode(pCxt, yymsp[0].minor.yy168)); } yymsp[-5].minor.yy168 = yylhsminor.yy168; break; case 36: /* predicate ::= expression IS NULL */ -{ PARSER_TRACE; yylhsminor.yy168 = createIsNullCondNode(pCxt, yymsp[-2].minor.yy168, true); } +{ PARSER_TRACE; yylhsminor.yy168 = createIsNullCondNode(pCxt, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), true); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 37: /* predicate ::= expression IS NOT NULL */ -{ PARSER_TRACE; yylhsminor.yy168 = createIsNullCondNode(pCxt, yymsp[-3].minor.yy168, false); } +{ PARSER_TRACE; yylhsminor.yy168 = createIsNullCondNode(pCxt, releaseRawExprNode(pCxt, yymsp[-3].minor.yy168), false); } yymsp[-3].minor.yy168 = yylhsminor.yy168; break; + case 38: /* predicate ::= expression in_op in_predicate_value */ +{ PARSER_TRACE; yylhsminor.yy168 = createOperatorNode(pCxt, yymsp[-1].minor.yy228, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), yymsp[0].minor.yy168); } + yymsp[-2].minor.yy168 = yylhsminor.yy168; + break; case 39: /* compare_op ::= NK_LT */ { PARSER_TRACE; yymsp[0].minor.yy228 = OP_TYPE_LOWER_THAN; } break; @@ -1698,6 +1726,17 @@ static YYACTIONTYPE yy_reduce( { PARSER_TRACE; yylhsminor.yy168 = createLogicConditionNode(pCxt, LOGIC_COND_TYPE_AND, yymsp[-2].minor.yy168, yymsp[0].minor.yy168); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; + case 57: /* boolean_primary ::= NK_LP boolean_value_expression NK_RP */ + case 70: /* parenthesized_joined_table ::= NK_LP joined_table NK_RP */ yytestcase(yyruleno==70); + case 71: /* parenthesized_joined_table ::= NK_LP parenthesized_joined_table NK_RP */ yytestcase(yyruleno==71); + case 123: /* subquery ::= NK_LP query_expression NK_RP */ yytestcase(yyruleno==123); +{ PARSER_TRACE; yymsp[-2].minor.yy168 = yymsp[-1].minor.yy168; } 
+ break; + case 58: /* from_clause ::= FROM table_reference_list */ + case 87: /* where_clause_opt ::= WHERE search_condition */ yytestcase(yyruleno==87); + case 108: /* having_clause_opt ::= HAVING search_condition */ yytestcase(yyruleno==108); +{ PARSER_TRACE; yymsp[-1].minor.yy168 = yymsp[0].minor.yy168; } + break; case 60: /* table_reference_list ::= table_reference_list NK_COMMA table_reference */ { PARSER_TRACE; yylhsminor.yy168 = createJoinTableNode(pCxt, JOIN_TYPE_INNER, yymsp[-2].minor.yy168, yymsp[0].minor.yy168, NULL); } yymsp[-2].minor.yy168 = yylhsminor.yy168; @@ -1758,12 +1797,30 @@ static YYACTIONTYPE yy_reduce( { PARSER_TRACE; yylhsminor.yy192 = yymsp[0].minor.yy192; } yymsp[0].minor.yy192 = yylhsminor.yy192; break; + case 80: /* select_sublist ::= select_item */ + case 125: /* sort_specification_list ::= sort_specification */ yytestcase(yyruleno==125); +{ PARSER_TRACE; yylhsminor.yy192 = createNodeList(pCxt, yymsp[0].minor.yy168); } + yymsp[0].minor.yy192 = yylhsminor.yy192; + break; + case 81: /* select_sublist ::= select_sublist NK_COMMA select_item */ + case 126: /* sort_specification_list ::= sort_specification_list NK_COMMA sort_specification */ yytestcase(yyruleno==126); +{ PARSER_TRACE; yylhsminor.yy192 = addNodeToList(pCxt, yymsp[-2].minor.yy192, yymsp[0].minor.yy168); } + yymsp[-2].minor.yy192 = yylhsminor.yy192; + break; + case 82: /* select_item ::= expression */ +{ + PARSER_TRACE; + SToken t = getTokenFromRawExprNode(pCxt, yymsp[0].minor.yy168); + yylhsminor.yy168 = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, yymsp[0].minor.yy168), &t); + } + yymsp[0].minor.yy168 = yylhsminor.yy168; + break; case 83: /* select_item ::= expression column_alias */ -{ PARSER_TRACE; yylhsminor.yy168 = setProjectionAlias(pCxt, yymsp[-1].minor.yy168, &yymsp[0].minor.yy241); } +{ PARSER_TRACE; yylhsminor.yy168 = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, yymsp[-1].minor.yy168), &yymsp[0].minor.yy241); } yymsp[-1].minor.yy168 = yylhsminor.yy168; 
break; case 84: /* select_item ::= expression AS column_alias */ -{ PARSER_TRACE; yylhsminor.yy168 = setProjectionAlias(pCxt, yymsp[-2].minor.yy168, &yymsp[0].minor.yy241); } +{ PARSER_TRACE; yylhsminor.yy168 = setProjectionAlias(pCxt, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), &yymsp[0].minor.yy241); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 85: /* select_item ::= table_name NK_DOT NK_STAR */ @@ -1790,10 +1847,10 @@ static YYACTIONTYPE yy_reduce( { PARSER_TRACE; yymsp[-2].minor.yy192 = yymsp[0].minor.yy192; } break; case 91: /* twindow_clause_opt ::= SESSION NK_LP column_reference NK_COMMA NK_INTEGER NK_RP */ -{ PARSER_TRACE; yymsp[-5].minor.yy168 = createSessionWindowNode(pCxt, yymsp[-3].minor.yy168, &yymsp[-1].minor.yy0); } +{ PARSER_TRACE; yymsp[-5].minor.yy168 = createSessionWindowNode(pCxt, releaseRawExprNode(pCxt, yymsp[-3].minor.yy168), &yymsp[-1].minor.yy0); } break; case 92: /* twindow_clause_opt ::= STATE_WINDOW NK_LP column_reference NK_RP */ -{ PARSER_TRACE; yymsp[-3].minor.yy168 = createStateWindowNode(pCxt, yymsp[-1].minor.yy168); } +{ PARSER_TRACE; yymsp[-3].minor.yy168 = createStateWindowNode(pCxt, releaseRawExprNode(pCxt, yymsp[-1].minor.yy168)); } break; case 93: /* twindow_clause_opt ::= INTERVAL NK_LP duration_literal NK_RP sliding_opt fill_opt */ { PARSER_TRACE; yymsp[-5].minor.yy168 = createIntervalWindowNode(pCxt, yymsp[-3].minor.yy168, NULL, yymsp[-1].minor.yy168, yymsp[0].minor.yy168); } @@ -1851,7 +1908,7 @@ static YYACTIONTYPE yy_reduce( { PARSER_TRACE; yymsp[-3].minor.yy168 = createLimitNode(pCxt, &yymsp[0].minor.yy0, &yymsp[-2].minor.yy0); } break; case 127: /* sort_specification ::= expression ordering_specification_opt null_ordering_opt */ -{ PARSER_TRACE; yylhsminor.yy168 = createOrderByExprNode(pCxt, yymsp[-2].minor.yy168, yymsp[-1].minor.yy10, yymsp[0].minor.yy177); } +{ PARSER_TRACE; yylhsminor.yy168 = createOrderByExprNode(pCxt, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), yymsp[-1].minor.yy10, 
yymsp[0].minor.yy177); } yymsp[-2].minor.yy168 = yylhsminor.yy168; break; case 128: /* ordering_specification_opt ::= */ diff --git a/source/nodes/src/nodesTraverseFuncs.c b/source/nodes/src/nodesTraverseFuncs.c index 0702254b5f..2280d0b922 100644 --- a/source/nodes/src/nodesTraverseFuncs.c +++ b/source/nodes/src/nodesTraverseFuncs.c @@ -75,6 +75,35 @@ static bool walkNode(SNode* pNode, ETraversalOrder order, FQueryNodeWalker walke case QUERY_NODE_ORDER_BY_EXPR: res = walkNode(((SOrderByExprNode*)pNode)->pExpr, order, walker, pContext); break; + case QUERY_NODE_STATE_WINDOW: + res = walkNode(((SStateWindowNode*)pNode)->pCol, order, walker, pContext); + break; + case QUERY_NODE_SESSION_WINDOW: + res = walkNode(((SSessionWindowNode*)pNode)->pCol, order, walker, pContext); + break; + case QUERY_NODE_INTERVAL_WINDOW: { + SIntervalWindowNode* pInterval = (SIntervalWindowNode*)pNode; + res = walkNode(pInterval->pInterval, order, walker, pContext); + if (res) { + res = walkNode(pInterval->pOffset, order, walker, pContext); + } + if (res) { + res = walkNode(pInterval->pSliding, order, walker, pContext); + } + if (res) { + res = walkNode(pInterval->pFill, order, walker, pContext); + } + break; + } + case QUERY_NODE_NODE_LIST: + res = walkList(((SNodeListNode*)pNode)->pNodeList, order, walker, pContext); + break; + case QUERY_NODE_FILL: + res = walkNode(((SFillNode*)pNode)->pValues, order, walker, pContext); + break; + case QUERY_NODE_RAW_EXPR: + res = walkNode(((SRawExprNode*)pNode)->pNode, order, walker, pContext); + break; default: break; } diff --git a/source/nodes/src/nodesUtilFuncs.c b/source/nodes/src/nodesUtilFuncs.c index af6cec755d..5acb9fdf7c 100644 --- a/source/nodes/src/nodesUtilFuncs.c +++ b/source/nodes/src/nodesUtilFuncs.c @@ -58,6 +58,12 @@ SNode* nodesMakeNode(ENodeType type) { return makeNode(type, sizeof(SSessionWindowNode)); case QUERY_NODE_INTERVAL_WINDOW: return makeNode(type, sizeof(SIntervalWindowNode)); + case QUERY_NODE_NODE_LIST: + return 
makeNode(type, sizeof(SNodeListNode)); + case QUERY_NODE_FILL: + return makeNode(type, sizeof(SFillNode)); + case QUERY_NODE_RAW_EXPR: + return makeNode(type, sizeof(SRawExprNode)); case QUERY_NODE_SET_OPERATOR: return makeNode(type, sizeof(SSetOperator)); case QUERY_NODE_SELECT_STMT: @@ -74,7 +80,7 @@ static bool destroyNode(SNode* pNode, void* pContext) { switch (nodeType(pNode)) { case QUERY_NODE_VALUE: tfree(((SValueNode*)pNode)->literal); - break; + break; default: break; } From f6c1218b38fe0a50942d609aec3e52bf57aa4b35 Mon Sep 17 00:00:00 2001 From: Xiaoyu Wang Date: Mon, 7 Feb 2022 23:09:53 -0500 Subject: [PATCH 05/12] TD-13338 SELECT statement translate code --- source/libs/parser/inc/new_sql.y | 4 ++-- source/libs/parser/src/new_sql.c | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/source/libs/parser/inc/new_sql.y b/source/libs/parser/inc/new_sql.y index 72207e2bf4..d12e76000a 100644 --- a/source/libs/parser/inc/new_sql.y +++ b/source/libs/parser/inc/new_sql.y @@ -219,7 +219,7 @@ table_reference(A) ::= joined_table(B). table_primary(A) ::= table_name(B) alias_opt(C). { PARSER_TRACE; A = createRealTableNode(pCxt, NULL, &B, &C); } table_primary(A) ::= db_name(B) NK_DOT table_name(C) alias_opt(D). { PARSER_TRACE; A = createRealTableNode(pCxt, &B, &C, &D); } -table_primary(A) ::= subquery(B) alias_opt(C). { PARSER_TRACE; A = createTempTableNode(pCxt, B, &C); } +table_primary(A) ::= subquery(B) alias_opt(C). { PARSER_TRACE; A = createTempTableNode(pCxt, releaseRawExprNode(pCxt, B), &C); } table_primary(A) ::= parenthesized_joined_table(B). { PARSER_TRACE; A = B; } %type alias_opt { SToken } @@ -354,7 +354,7 @@ limit_clause_opt(A) ::= LIMIT NK_INTEGER(B) OFFSET NK_INTEGER(C). limit_clause_opt(A) ::= LIMIT NK_INTEGER(C) NK_COMMA NK_INTEGER(B). 
{ PARSER_TRACE; A = createLimitNode(pCxt, &B, &C); } /************************************************ subquery ************************************************************/ -subquery(A) ::= NK_LP query_expression(B) NK_RP. { PARSER_TRACE; A = B; } +subquery(A) ::= NK_LP(B) query_expression(C) NK_RP(D). { PARSER_TRACE; A = createRawExprNodeExt(pCxt, &B, &D, C); } /************************************************ search_condition ****************************************************/ search_condition(A) ::= boolean_value_expression(B). { PARSER_TRACE; A = B; } diff --git a/source/libs/parser/src/new_sql.c b/source/libs/parser/src/new_sql.c index 85f73e5eb8..cb2b277df8 100644 --- a/source/libs/parser/src/new_sql.c +++ b/source/libs/parser/src/new_sql.c @@ -1729,7 +1729,6 @@ static YYACTIONTYPE yy_reduce( case 57: /* boolean_primary ::= NK_LP boolean_value_expression NK_RP */ case 70: /* parenthesized_joined_table ::= NK_LP joined_table NK_RP */ yytestcase(yyruleno==70); case 71: /* parenthesized_joined_table ::= NK_LP parenthesized_joined_table NK_RP */ yytestcase(yyruleno==71); - case 123: /* subquery ::= NK_LP query_expression NK_RP */ yytestcase(yyruleno==123); { PARSER_TRACE; yymsp[-2].minor.yy168 = yymsp[-1].minor.yy168; } break; case 58: /* from_clause ::= FROM table_reference_list */ @@ -1750,7 +1749,7 @@ static YYACTIONTYPE yy_reduce( yymsp[-3].minor.yy168 = yylhsminor.yy168; break; case 65: /* table_primary ::= subquery alias_opt */ -{ PARSER_TRACE; yylhsminor.yy168 = createTempTableNode(pCxt, yymsp[-1].minor.yy168, &yymsp[0].minor.yy241); } +{ PARSER_TRACE; yylhsminor.yy168 = createTempTableNode(pCxt, releaseRawExprNode(pCxt, yymsp[-1].minor.yy168), &yymsp[0].minor.yy241); } yymsp[-1].minor.yy168 = yylhsminor.yy168; break; case 67: /* alias_opt ::= */ @@ -1907,6 +1906,10 @@ static YYACTIONTYPE yy_reduce( case 122: /* limit_clause_opt ::= LIMIT NK_INTEGER NK_COMMA NK_INTEGER */ yytestcase(yyruleno==122); { PARSER_TRACE; yymsp[-3].minor.yy168 = 
createLimitNode(pCxt, &yymsp[0].minor.yy0, &yymsp[-2].minor.yy0); } break; + case 123: /* subquery ::= NK_LP query_expression NK_RP */ +{ PARSER_TRACE; yylhsminor.yy168 = createRawExprNodeExt(pCxt, &yymsp[-2].minor.yy0, &yymsp[0].minor.yy0, yymsp[-1].minor.yy168); } + yymsp[-2].minor.yy168 = yylhsminor.yy168; + break; case 127: /* sort_specification ::= expression ordering_specification_opt null_ordering_opt */ { PARSER_TRACE; yylhsminor.yy168 = createOrderByExprNode(pCxt, releaseRawExprNode(pCxt, yymsp[-2].minor.yy168), yymsp[-1].minor.yy10, yymsp[0].minor.yy177); } yymsp[-2].minor.yy168 = yylhsminor.yy168; From 33b3a978a5a9f799863f9c86541a02ac0a9efaee Mon Sep 17 00:00:00 2001 From: dapan1121 Date: Tue, 8 Feb 2022 16:53:00 +0800 Subject: [PATCH 06/12] feature/qnode --- include/libs/catalog/catalog.h | 11 + include/libs/qcom/query.h | 4 +- source/client/src/clientHb.c | 55 ++--- source/libs/catalog/inc/catalogInt.h | 4 +- source/libs/catalog/src/catalog.c | 280 ++++++++++++++-------- source/libs/catalog/test/catalogTests.cpp | 158 ++++++++++++ 6 files changed, 376 insertions(+), 136 deletions(-) diff --git a/include/libs/catalog/catalog.h b/include/libs/catalog/catalog.h index fef951b010..ab1298785a 100644 --- a/include/libs/catalog/catalog.h +++ b/include/libs/catalog/catalog.h @@ -32,6 +32,15 @@ extern "C" { struct SCatalog; +enum { + CTG_DBG_DB_NUM = 1, + CTG_DBG_META_NUM, + CTG_DBG_STB_NUM, + CTG_DBG_DB_RENT_NUM, + CTG_DBG_STB_RENT_NUM, +}; + + typedef struct SCatalogReq { SArray *pTableName; // element is SNAME SArray *pUdf; // udf name @@ -127,6 +136,8 @@ int32_t catalogGetTableMeta(struct SCatalog* pCatalog, void * pTransporter, cons */ int32_t catalogGetSTableMeta(struct SCatalog* pCatalog, void * pTransporter, const SEpSet* pMgmtEps, const SName* pTableName, STableMeta** pTableMeta); +int32_t catalogUpdateSTableMeta(struct SCatalog* pCatalog, STableMetaRsp *rspMsg); + /** * Force renew a table's local cached meta data. 
diff --git a/include/libs/qcom/query.h b/include/libs/qcom/query.h index 808437cb7e..549f36a898 100644 --- a/include/libs/qcom/query.h +++ b/include/libs/qcom/query.h @@ -81,7 +81,6 @@ typedef struct STableMeta { } STableMeta; typedef struct SDBVgroupInfo { - uint64_t dbId; int32_t vgVersion; int8_t hashMethod; SHashObj *vgHash; //key:vgId, value:SVgroupInfo @@ -103,6 +102,7 @@ enum { typedef struct STableMetaOutput { int32_t metaType; + uint64_t dbId; char dbFName[TSDB_DB_FNAME_LEN]; char ctbName[TSDB_TABLE_NAME_LEN]; char tbName[TSDB_TABLE_NAME_LEN]; @@ -160,6 +160,8 @@ void initQueryModuleMsgHandle(); const SSchema* tGetTbnameColumnSchema(); bool tIsValidSchema(struct SSchema* pSchema, int32_t numOfCols, int32_t numOfTags); +int32_t queryCreateTableMetaFromMsg(STableMetaRsp* msg, bool isSuperTable, STableMeta **pMeta); + extern int32_t (*queryBuildMsg[TDMT_MAX])(void* input, char **msg, int32_t msgSize, int32_t *msgLen); extern int32_t (*queryProcessMsgRsp[TDMT_MAX])(void* output, char *msg, int32_t msgSize); diff --git a/source/client/src/clientHb.c b/source/client/src/clientHb.c index e80e87f03b..3e1af765b0 100644 --- a/source/client/src/clientHb.c +++ b/source/client/src/clientHb.c @@ -100,50 +100,33 @@ static int32_t hbProcessStbInfoRsp(void *value, int32_t valueLen, struct SCatalo tscDebug("hb remove stb, db:%s, stb:%s", rsp->dbFName, rsp->stbName); - code = catalogRemoveSTableMeta(pCatalog, rsp->dbFName, rsp->stbName, rsp->suid); + catalogRemoveSTableMeta(pCatalog, rsp->dbFName, rsp->stbName, rsp->suid); } else { + tscDebug("hb update stb, db:%s, stb:%s", rsp->dbFName, rsp->stbName); + rsp->numOfTags = ntohl(rsp->numOfTags); + rsp->sversion = ntohl(rsp->sversion); + rsp->tversion = ntohl(rsp->tversion); + rsp->tuid = be64toh(rsp->tuid); + rsp->vgId = ntohl(rsp->vgId); + + SSchema* pSchema = rsp->pSchema; schemaNum = rsp->numOfColumns + rsp->numOfTags; -/* - rsp->vgNum = ntohl(rsp->vgNum); - rsp->uid = be64toh(rsp->uid); - SDBVgroupInfo vgInfo = {0}; - 
vgInfo.dbId = rsp->uid; - vgInfo.vgVersion = rsp->vgVersion; - vgInfo.hashMethod = rsp->hashMethod; - vgInfo.vgHash = taosHashInit(rsp->vgNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); - if (NULL == vgInfo.vgHash) { - tscError("hash init[%d] failed", rsp->vgNum); - return TSDB_CODE_TSC_OUT_OF_MEMORY; + for (int i = 0; i < schemaNum; ++i) { + pSchema->bytes = ntohl(pSchema->bytes); + pSchema->colId = ntohl(pSchema->colId); + + pSchema++; } - for (int32_t i = 0; i < rsp->vgNum; ++i) { - rsp->vgroupInfo[i].vgId = ntohl(rsp->vgroupInfo[i].vgId); - rsp->vgroupInfo[i].hashBegin = ntohl(rsp->vgroupInfo[i].hashBegin); - rsp->vgroupInfo[i].hashEnd = ntohl(rsp->vgroupInfo[i].hashEnd); + if (rsp->pSchema[0].colId != PRIMARYKEY_TIMESTAMP_COL_ID) { + tscError("invalid colId[%d] for the first column in table meta rsp msg", rsp->pSchema[0].colId); + return TSDB_CODE_TSC_INVALID_VALUE; + } - for (int32_t n = 0; n < rsp->vgroupInfo[i].epset.numOfEps; ++n) { - rsp->vgroupInfo[i].epset.eps[n].port = ntohs(rsp->vgroupInfo[i].epset.eps[n].port); - } - - if (0 != taosHashPut(vgInfo.vgHash, &rsp->vgroupInfo[i].vgId, sizeof(rsp->vgroupInfo[i].vgId), &rsp->vgroupInfo[i], sizeof(rsp->vgroupInfo[i]))) { - tscError("hash push failed, errno:%d", errno); - taosHashCleanup(vgInfo.vgHash); - return TSDB_CODE_TSC_OUT_OF_MEMORY; - } - } - - code = catalogUpdateDBVgroup(pCatalog, rsp->db, &vgInfo); - if (code) { - taosHashCleanup(vgInfo.vgHash); - } -*/ - } - - if (code) { - return code; + catalogUpdateSTableMeta(pCatalog, rsp); } msgLen += sizeof(STableMetaRsp) + schemaNum * sizeof(SSchema); diff --git a/source/libs/catalog/inc/catalogInt.h b/source/libs/catalog/inc/catalogInt.h index 1c4a19530d..021a119d6c 100644 --- a/source/libs/catalog/inc/catalogInt.h +++ b/source/libs/catalog/inc/catalogInt.h @@ -140,8 +140,8 @@ typedef uint32_t (*tableNameHashFp)(const char *, uint32_t); #define CTG_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; 
} return _code; } while (0) #define CTG_ERR_JRET(c) do { code = c; if (code != TSDB_CODE_SUCCESS) { terrno = code; goto _return; } } while (0) -#define CTG_LOCK_DEBUG(...) do { if (gCTGDebug.lockDebug) { ctgDebug(__VA_ARGS__); } } while (0) -#define CTG_CACHE_DEBUG(...) do { if (gCTGDebug.cacheDebug) { ctgDebug(__VA_ARGS__); } } while (0) +#define CTG_LOCK_DEBUG(...) do { if (gCTGDebug.lockDebug) { qDebug(__VA_ARGS__); } } while (0) +#define CTG_CACHE_DEBUG(...) do { if (gCTGDebug.cacheDebug) { qDebug(__VA_ARGS__); } } while (0) #define TD_RWLATCH_WRITE_FLAG_COPY 0x40000000 diff --git a/source/libs/catalog/src/catalog.c b/source/libs/catalog/src/catalog.c index 3776969d90..f0ea22197b 100644 --- a/source/libs/catalog/src/catalog.c +++ b/source/libs/catalog/src/catalog.c @@ -22,7 +22,68 @@ SCatalogMgmt ctgMgmt = {0}; SCtgDebug gCTGDebug = {0}; -void ctgShowDBCache(SHashObj *dbHash) { +int32_t ctgDbgGetTbMetaNum(SCtgDBCache *dbCache) { + return dbCache->tbCache.metaCache ? (int32_t)taosHashGetSize(dbCache->tbCache.metaCache) : 0; +} + +int32_t ctgDbgGetStbNum(SCtgDBCache *dbCache) { + return dbCache->tbCache.stbCache ? 
(int32_t)taosHashGetSize(dbCache->tbCache.stbCache) : 0; +} + +int32_t ctgDbgGetRentNum(SCtgRentMgmt *rent) { + int32_t num = 0; + for (uint16_t i = 0; i < rent->slotNum; ++i) { + SCtgRentSlot *slot = &rent->slots[i]; + if (NULL == slot->meta) { + continue; + } + + num += taosArrayGetSize(slot->meta); + } + + return num; +} + +int32_t ctgDbgGetClusterCacheNum(struct SCatalog* pCatalog, int32_t type) { + if (NULL == pCatalog || NULL == pCatalog->dbCache) { + return 0; + } + + switch (type) { + case CTG_DBG_DB_NUM: + return (int32_t)taosHashGetSize(pCatalog->dbCache); + case CTG_DBG_DB_RENT_NUM: + return ctgDbgGetRentNum(&pCatalog->dbRent); + case CTG_DBG_STB_RENT_NUM: + return ctgDbgGetRentNum(&pCatalog->stbRent); + default: + break; + } + + SCtgDBCache *dbCache = NULL; + int32_t num = 0; + void *pIter = taosHashIterate(pCatalog->dbCache, NULL); + while (pIter) { + dbCache = (SCtgDBCache *)pIter; + switch (type) { + case CTG_DBG_META_NUM: + num += ctgDbgGetTbMetaNum(dbCache); + break; + case CTG_DBG_STB_NUM: + num += ctgDbgGetStbNum(dbCache); + break; + default: + ctgError("invalid type:%d", type); + break; + } + pIter = taosHashIterate(pCatalog->dbCache, pIter); + } + + return num; +} + + +void ctgDbgShowDBCache(SHashObj *dbHash) { if (NULL == dbHash) { return; } @@ -36,23 +97,27 @@ void ctgShowDBCache(SHashObj *dbHash) { dbCache = (SCtgDBCache *)pIter; - taosHashGetKey(dbCache, &dbFName, &len); + taosHashGetKey(dbCache, (void **)&dbFName, &len); - CTG_CACHE_DEBUG("** %dth db [%.*s] **", i, len, dbFName); + CTG_CACHE_DEBUG("** %dth db [%.*s][%"PRIx64"] **", i, (int32_t)len, dbFName, dbCache->dbId); pIter = taosHashIterate(dbHash, pIter); } } -void ctgShowClusterCache(struct SCatalog* pCatalog) { + + + +void ctgDbgShowClusterCache(struct SCatalog* pCatalog) { if (NULL == pCatalog) { return; } - CTG_CACHE_DEBUG("## cluster %"PRIx64" cache Info ##", pCatalog->clusterId); - CTG_CACHE_DEBUG("db cache number:%d", pCatalog->dbCache ? 
taosHashGetSize(pCatalog->dbCache) : 0); - ctgShowDBCache(pCatalog->dbCache); - + CTG_CACHE_DEBUG("## cluster %"PRIx64" %p cache Info ##", pCatalog->clusterId, pCatalog); + CTG_CACHE_DEBUG("db:%d meta:%d stb:%d dbRent:%d stbRent:%d", ctgDbgGetClusterCacheNum(pCatalog, CTG_DBG_DB_NUM), ctgDbgGetClusterCacheNum(pCatalog, CTG_DBG_META_NUM), + ctgDbgGetClusterCacheNum(pCatalog, CTG_DBG_STB_NUM), ctgDbgGetClusterCacheNum(pCatalog, CTG_DBG_DB_RENT_NUM), ctgDbgGetClusterCacheNum(pCatalog, CTG_DBG_STB_RENT_NUM)); + + ctgDbgShowDBCache(pCatalog->dbCache); } int32_t ctgInitDBCache(struct SCatalog* pCatalog) { @@ -843,7 +908,7 @@ int32_t ctgMetaRentGet(SCtgRentMgmt *mgmt, void **res, uint32_t *num, int32_t si return TSDB_CODE_SUCCESS; } -int32_t ctgAddDBCache(struct SCatalog *pCatalog, char *dbFName, SCtgDBCache *dbCache) { +int32_t ctgAddDBCache(struct SCatalog *pCatalog, const char *dbFName, SCtgDBCache *dbCache) { int32_t code = 0; if (taosHashPut(pCatalog->dbCache, dbFName, strlen(dbFName), dbCache, sizeof(SCtgDBCache))) { ctgError("taosHashPut db to cache failed, db:%s", dbFName); @@ -867,6 +932,111 @@ _return: } +void ctgRemoveAndFreeTableMeta(struct SCatalog* pCatalog, SCtgTbMetaCache *cache) { + CTG_LOCK(CTG_WRITE, &cache->stbLock); + if (cache->stbCache) { + void *pIter = taosHashIterate(cache->stbCache, NULL); + while (pIter) { + uint64_t *suid = NULL; + taosHashGetKey(pIter, (void **)&suid, NULL); + + if (TSDB_CODE_SUCCESS == ctgMetaRentRemove(&pCatalog->stbRent, *suid, ctgSTableVersionCompare)) { + ctgDebug("stb removed from rent, suid:%"PRIx64, *suid); + } + + pIter = taosHashIterate(cache->stbCache, pIter); + } + } + CTG_UNLOCK(CTG_WRITE, &cache->stbLock); + + ctgFreeTableMetaCache(cache); +} + + +int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, SCtgDBCache *dbCache, const char* dbFName) { + if (taosHashRemove(pCatalog->dbCache, dbFName, strlen(dbFName))) { + ctgError("taosHashRemove from dbCache failed, dbFName:%s", dbFName); + 
CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); + } + + atomic_store_8(&dbCache->deleted, 1); + + CTG_LOCK(CTG_WRITE, &dbCache->vgLock); + if (dbCache->vgInfo) { + ctgInfo("cleanup db vgInfo, dbFName:%s, dbId:%"PRIx64, dbFName, dbCache->dbId); + + if (dbCache->vgInfo->vgHash) { + taosHashCleanup(dbCache->vgInfo->vgHash); + } + + tfree(dbCache->vgInfo); + } + CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); + + ctgRemoveAndFreeTableMeta(pCatalog, &dbCache->tbCache); + + ctgInfo("db removed from cache, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); + + CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->dbRent, dbCache->dbId, ctgDbVgVersionCompare)); + + ctgDebug("db removed from rent, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); + + return TSDB_CODE_SUCCESS; +} + + +int32_t ctgAcquireDBCache(struct SCatalog* pCatalog, const char *dbFName, uint64_t dbId, SCtgDBCache **pCache) { + int32_t code = 0; + SCtgDBCache *dbCache = NULL; + + while (true) { + dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); + if (dbCache) { + // TODO OPEN IT +#if 0 + if (dbCache->dbId == dbId) { + *pCache = dbCache; + return TSDB_CODE_SUCCESS; + } +#else + if (0 == dbId) { + *pCache = dbCache; + return TSDB_CODE_SUCCESS; + } + + if (dbId && (dbCache->dbId == 0)) { + dbCache->dbId = dbId; + *pCache = dbCache; + return TSDB_CODE_SUCCESS; + } + + if (dbCache->dbId == dbId) { + *pCache = dbCache; + return TSDB_CODE_SUCCESS; + } +#endif + CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); + taosHashRelease(pCatalog->dbCache, dbCache); + dbCache = NULL; + } + + SCtgDBCache newDBCache = {0}; + newDBCache.dbId = dbId; + + CTG_ERR_JRET(ctgAddDBCache(pCatalog, dbFName, &newDBCache)); + } + +_return: + + if (dbCache) { + taosHashRelease(pCatalog->dbCache, dbCache); + } + + CTG_RET(code); +} + + + int32_t ctgUpdateTbMetaImpl(struct SCatalog *pCatalog, SCtgTbMetaCache *tbCache, char *dbFName, char *tbName, STableMeta *meta, int32_t metaSize) { CTG_LOCK(CTG_READ, 
&tbCache->metaLock); if (taosHashPut(tbCache->metaCache, tbName, strlen(tbName), meta, metaSize) != 0) { @@ -954,7 +1124,7 @@ int32_t ctgUpdateTableMetaCache(struct SCatalog *pCatalog, STableMetaOutput *out CTG_ERR_JRET(ctgInitStbCache(pCatalog, dbCache)); if (CTG_IS_META_CTABLE(output->metaType) || CTG_IS_META_BOTH(output->metaType)) { - CTG_ERR_JRET(ctgUpdateTbMetaImpl(pCatalog, &dbCache->tbCache, output->ctbName, (STableMeta *)&output->ctbMeta, sizeof(output->ctbMeta))); + CTG_ERR_JRET(ctgUpdateTbMetaImpl(pCatalog, &dbCache->tbCache, output->dbFName, output->ctbName, (STableMeta *)&output->ctbMeta, sizeof(output->ctbMeta))); } if (CTG_IS_META_CTABLE(output->metaType)) { @@ -1002,7 +1172,7 @@ int32_t ctgGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgm while (true) { CTG_ERR_RET(ctgGetDBVgroupFromMnode(pCatalog, pRpc, pMgmtEps, &input, &DbOut)); - CTG_ERR_RET(catalogUpdateDBVgroup(pCatalog, dbFName, DbOut.dbVgroup)); + CTG_ERR_RET(catalogUpdateDBVgroup(pCatalog, dbFName, DbOut.dbId, DbOut.dbVgroup)); CTG_ERR_RET(ctgGetDBVgroupFromCache(pCatalog, dbFName, dbCache, &inCache)); if (!inCache) { @@ -1016,90 +1186,6 @@ int32_t ctgGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgm return TSDB_CODE_SUCCESS; } -void ctgRemoveAndFreeTableMeta(struct SCatalog* pCatalog, SCtgTbMetaCache *cache) { - CTG_LOCK(CTG_WRITE, &cache->stbLock); - if (cache->stbCache) { - void *pIter = taosHashIterate(cache->stbCache, NULL); - while (pIter) { - uint64_t suid = 0; - taosHashGetKey(pIter, &suid, NULL); - - CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->stbRent, suid, ctgSTableVersionCompare)); - ctgDebug("stb removed from rent, suid:%"PRIx64, suid); - - pIter = taosHashIterate(cache->stbCache, pIter); - } - } - CTG_UNLOCK(CTG_WRITE, &cache->stbLock); - - ctgFreeTableMetaCache(cache); -} - -int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, SCtgDBCache *dbCache, const char* dbFName) { - if (taosHashRemove(pCatalog->dbCache, dbFName, 
strlen(dbFName))) { - ctgError("taosHashRemove from dbCache failed, dbFName:%s", dbFName); - CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); - } - - atomic_store_8(&dbCache->deleted, 1); - - CTG_LOCK(CTG_WRITE, &dbCache->vgLock); - if (dbCache->vgInfo) { - ctgInfo("cleanup db vgInfo, dbFName:%s, dbId:%"PRIx64, dbFName, dbCache->dbId); - - if (dbCache->vgInfo->vgHash) { - taosHashCleanup(dbCache->vgInfo->vgHash); - } - - tfree(dbCache->vgInfo); - } - CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); - - ctgRemoveAndFreeTableMeta(pCatalog, &dbCache->tbCache); - - ctgInfo("db removed from cache, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); - - CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->dbRent, dbCache->dbId, ctgDbVgVersionCompare)); - - ctgDebug("db removed from rent, dbFName:%s, uid:%"PRIx64, dbFName, dbCache->dbId); - - return TSDB_CODE_SUCCESS; -} - - -int32_t ctgAcquireDBCache(struct SCatalog* pCatalog, const char *dbFName, uint64_t dbId, SCtgDBCache **pCache) { - int32_t code = 0; - SCtgDBCache *dbCache = NULL; - - while (true) { - dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); - if (dbCache) { - if (dbCache->dbId == dbId) { - *pCache = dbCache; - return TSDB_CODE_SUCCESS; - } - - CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); - taosHashRelease(pCatalog->dbCache, dbCache); - dbCache = NULL; - } - - SCtgDBCache newDBCache = {0}; - newDBCache.dbId = dbId; - - CTG_ERR_JRET(ctgAddDBCache(pCatalog, dbFName, &newDBCache)); - } - -_return: - - if (dbCache) { - taosHashRelease(pCatalog->dbCache, dbCache); - } - - CTG_RET(code); -} - - int32_t ctgValidateAndRemoveStbMeta(struct SCatalog* pCatalog, const char* dbName, const char* stbName, uint64_t suid, bool *removed) { *removed = false; @@ -1407,7 +1493,7 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, } int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbFName, bool forceUpdate, 
SArray** vgroupList) { - if (NULL == pCatalog || NULL == dbName || NULL == pRpc || NULL == pMgmtEps || NULL == vgroupList) { + if (NULL == pCatalog || NULL == dbFName || NULL == pRpc || NULL == pMgmtEps || NULL == vgroupList) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } @@ -1480,7 +1566,7 @@ int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbFName, ui CTG_LOCK(CTG_WRITE, &dbCache->vgLock); if (dbCache->deleted) { - ctgInfo("db is dropping, dbFName:%s, dbId:%"PRIx64, dbFName, dbInfo->dbId); + ctgInfo("db is dropping, dbFName:%s, dbId:%"PRIx64, dbFName, dbId); CTG_UNLOCK(CTG_WRITE, &dbCache->vgLock); taosHashRelease(pCatalog->dbCache, dbCache); CTG_ERR_JRET(TSDB_CODE_CTG_DB_DROPPED); diff --git a/source/libs/catalog/test/catalogTests.cpp b/source/libs/catalog/test/catalogTests.cpp index 5b58794651..0214078287 100644 --- a/source/libs/catalog/test/catalogTests.cpp +++ b/source/libs/catalog/test/catalogTests.cpp @@ -39,6 +39,7 @@ namespace { extern "C" int32_t ctgGetTableMetaFromCache(struct SCatalog *pCatalog, const SName *pTableName, STableMeta **pTableMeta, int32_t *exist); extern "C" int32_t ctgUpdateTableMetaCache(struct SCatalog *pCatalog, STableMetaOutput *output); +extern "C" int32_t ctgDbgGetClusterCacheNum(struct SCatalog* pCatalog, int32_t type); void ctgTestSetPrepareTableMeta(); void ctgTestSetPrepareCTableMeta(); @@ -208,6 +209,45 @@ void ctgTestBuildDBVgroup(SDBVgroupInfo **pdbVgroup) { *pdbVgroup = dbVgroup; } + +void ctgTestBuildSTableMetaRsp(STableMetaRsp *rspMsg) { + strcpy(rspMsg->dbFName, ctgTestDbname); + sprintf(rspMsg->tbName, "%s", ctgTestSTablename); + sprintf(rspMsg->stbName, "%s", ctgTestSTablename); + rspMsg->numOfTags = ctgTestTagNum; + rspMsg->numOfColumns = ctgTestColNum; + rspMsg->precision = 1 + 1; + rspMsg->tableType = TSDB_SUPER_TABLE; + rspMsg->update = 1 + 1; + rspMsg->sversion = ctgTestSVersion + 1; + rspMsg->tversion = ctgTestTVersion + 1; + rspMsg->suid = ctgTestSuid + 1; + rspMsg->tuid = ctgTestSuid + 1; + 
rspMsg->vgId = 1; + + SSchema *s = NULL; + s = &rspMsg->pSchema[0]; + s->type = TSDB_DATA_TYPE_TIMESTAMP; + s->colId = 1; + s->bytes = 8; + strcpy(s->name, "ts"); + + s = &rspMsg->pSchema[1]; + s->type = TSDB_DATA_TYPE_INT; + s->colId = 2; + s->bytes = 4; + strcpy(s->name, "col1s"); + + s = &rspMsg->pSchema[2]; + s->type = TSDB_DATA_TYPE_BINARY; + s->colId = 3; + s->bytes = 12 + 1; + strcpy(s->name, "tag1s"); + + return; +} + + void ctgTestPrepareDbVgroups(void *shandle, SEpSet *pEpSet, SRpcMsg *pMsg, SRpcMsg *pRsp) { SUseDbRsp *rspMsg = NULL; // todo @@ -963,6 +1003,124 @@ TEST(tableMeta, superTableCase) { catalogDestroy(); } +TEST(tableMeta, rmStbMeta) { + struct SCatalog *pCtg = NULL; + void *mockPointer = (void *)0x1; + SVgroupInfo vgInfo = {0}; + + ctgTestInitLogFile(); + + ctgTestSetPrepareDbVgroupsAndSuperMeta(); + + initQueryModuleMsgHandle(); + + int32_t code = catalogInit(NULL); + ASSERT_EQ(code, 0); + + // sendCreateDbMsg(pConn->pTransporter, &pConn->pAppInfo->mgmtEp.epSet); + code = catalogGetHandle(ctgTestClusterId, &pCtg); + ASSERT_EQ(code, 0); + + SName n = {.type = TSDB_TABLE_NAME_T, .acctId = 1}; + strcpy(n.dbname, "db1"); + strcpy(n.tname, ctgTestSTablename); + + STableMeta *tableMeta = NULL; + code = catalogGetTableMeta(pCtg, mockPointer, (const SEpSet *)mockPointer, &n, &tableMeta); + ASSERT_EQ(code, 0); + ASSERT_EQ(tableMeta->vgId, 0); + ASSERT_EQ(tableMeta->tableType, TSDB_SUPER_TABLE); + ASSERT_EQ(tableMeta->sversion, ctgTestSVersion); + ASSERT_EQ(tableMeta->tversion, ctgTestTVersion); + ASSERT_EQ(tableMeta->uid, ctgTestSuid); + ASSERT_EQ(tableMeta->suid, ctgTestSuid); + ASSERT_EQ(tableMeta->tableInfo.numOfColumns, ctgTestColNum); + ASSERT_EQ(tableMeta->tableInfo.numOfTags, ctgTestTagNum); + ASSERT_EQ(tableMeta->tableInfo.precision, 1); + ASSERT_EQ(tableMeta->tableInfo.rowSize, 12); + + code = catalogRemoveSTableMeta(pCtg, "1.db1", ctgTestSTablename, ctgTestSuid); + ASSERT_EQ(code, 0); + + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, 
CTG_DBG_DB_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_META_NUM), 0); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_STB_NUM), 0); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_DB_RENT_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_STB_RENT_NUM), 0); + + catalogDestroy(); +} + +TEST(tableMeta, updateStbMeta) { + struct SCatalog *pCtg = NULL; + void *mockPointer = (void *)0x1; + SVgroupInfo vgInfo = {0}; + + ctgTestInitLogFile(); + + ctgTestSetPrepareDbVgroupsAndSuperMeta(); + + initQueryModuleMsgHandle(); + + int32_t code = catalogInit(NULL); + ASSERT_EQ(code, 0); + + // sendCreateDbMsg(pConn->pTransporter, &pConn->pAppInfo->mgmtEp.epSet); + code = catalogGetHandle(ctgTestClusterId, &pCtg); + ASSERT_EQ(code, 0); + + SName n = {.type = TSDB_TABLE_NAME_T, .acctId = 1}; + strcpy(n.dbname, "db1"); + strcpy(n.tname, ctgTestSTablename); + + STableMeta *tableMeta = NULL; + code = catalogGetTableMeta(pCtg, mockPointer, (const SEpSet *)mockPointer, &n, &tableMeta); + ASSERT_EQ(code, 0); + ASSERT_EQ(tableMeta->vgId, 0); + ASSERT_EQ(tableMeta->tableType, TSDB_SUPER_TABLE); + ASSERT_EQ(tableMeta->sversion, ctgTestSVersion); + ASSERT_EQ(tableMeta->tversion, ctgTestTVersion); + ASSERT_EQ(tableMeta->uid, ctgTestSuid); + ASSERT_EQ(tableMeta->suid, ctgTestSuid); + ASSERT_EQ(tableMeta->tableInfo.numOfColumns, ctgTestColNum); + ASSERT_EQ(tableMeta->tableInfo.numOfTags, ctgTestTagNum); + ASSERT_EQ(tableMeta->tableInfo.precision, 1); + ASSERT_EQ(tableMeta->tableInfo.rowSize, 12); + + tfree(tableMeta); + + STableMetaRsp rsp = {0}; + ctgTestBuildSTableMetaRsp(&rsp); + + code = catalogUpdateSTableMeta(pCtg, &rsp); + ASSERT_EQ(code, 0); + + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_DB_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_META_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_STB_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, CTG_DBG_DB_RENT_NUM), 1); + ASSERT_EQ(ctgDbgGetClusterCacheNum(pCtg, 
CTG_DBG_STB_RENT_NUM), 1); + + code = catalogGetTableMeta(pCtg, mockPointer, (const SEpSet *)mockPointer, &n, &tableMeta); + ASSERT_EQ(code, 0); + ASSERT_EQ(tableMeta->vgId, 0); + ASSERT_EQ(tableMeta->tableType, TSDB_SUPER_TABLE); + ASSERT_EQ(tableMeta->sversion, ctgTestSVersion + 1); + ASSERT_EQ(tableMeta->tversion, ctgTestTVersion + 1); + ASSERT_EQ(tableMeta->uid, ctgTestSuid + 1); + ASSERT_EQ(tableMeta->suid, ctgTestSuid + 1); + ASSERT_EQ(tableMeta->tableInfo.numOfColumns, ctgTestColNum); + ASSERT_EQ(tableMeta->tableInfo.numOfTags, ctgTestTagNum); + ASSERT_EQ(tableMeta->tableInfo.precision, 1 + 1); + ASSERT_EQ(tableMeta->tableInfo.rowSize, 12); + + tfree(tableMeta); + + catalogDestroy(); +} + + + TEST(tableDistVgroup, normalTable) { struct SCatalog *pCtg = NULL; void *mockPointer = (void *)0x1; From c901a21a8c800ec888c1e9cb250481cfdb9e4188 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Tue, 8 Feb 2022 21:23:43 +0800 Subject: [PATCH 07/12] enhance interface --- source/libs/transport/inc/transComm.h | 10 +- source/libs/transport/src/rpcMain.c | 4 +- source/libs/transport/src/trans.c | 34 +++- source/libs/transport/src/transCli.c | 69 ++++++- source/libs/transport/src/transSrv.c | 24 ++- source/libs/transport/test/CMakeLists.txt | 19 +- source/libs/transport/test/rclient.c | 1 - source/libs/transport/test/syncClient.c | 220 ++++++++++++++++++++++ source/libs/transport/test/transUT.cc | 21 +++ 9 files changed, 380 insertions(+), 22 deletions(-) create mode 100644 source/libs/transport/test/syncClient.c diff --git a/source/libs/transport/inc/transComm.h b/source/libs/transport/inc/transComm.h index 082c89fed4..846f2d5099 100644 --- a/source/libs/transport/inc/transComm.h +++ b/source/libs/transport/inc/transComm.h @@ -134,10 +134,12 @@ typedef struct { // int16_t numOfTry; // number of try for different servers // int8_t oldInUse; // server EP inUse passed by app // int8_t redirect; // flag to indicate redirect - int8_t connType; // connection type - int64_t rid; // 
refId returned by taosAddRef - SRpcMsg* pRsp; // for synchronous API - tsem_t* pSem; // for synchronous API + int8_t connType; // connection type + int64_t rid; // refId returned by taosAddRef + + SRpcMsg* pRsp; // for synchronous API + tsem_t* pSem; // for synchronous API + char* ip; uint32_t port; // SEpSet* pSet; // for synchronous API diff --git a/source/libs/transport/src/rpcMain.c b/source/libs/transport/src/rpcMain.c index a286482fc1..d8ef0462fb 100644 --- a/source/libs/transport/src/rpcMain.c +++ b/source/libs/transport/src/rpcMain.c @@ -813,8 +813,8 @@ static SRpcConn *rpcSetupConnToServer(SRpcReqContext *pContext) { SRpcInfo *pRpc = pContext->pRpc; SEpSet * pEpSet = &pContext->epSet; - pConn = - rpcGetConnFromCache(pRpc->pCache, pEpSet->eps[pEpSet->inUse].fqdn, pEpSet->eps[pEpSet->inUse].port, pContext->connType); + pConn = rpcGetConnFromCache(pRpc->pCache, pEpSet->eps[pEpSet->inUse].fqdn, pEpSet->eps[pEpSet->inUse].port, + pContext->connType); if (pConn == NULL || pConn->user[0] == 0) { pConn = rpcOpenConn(pRpc, pEpSet->eps[pEpSet->inUse].fqdn, pEpSet->eps[pEpSet->inUse].port, pContext->connType); } diff --git a/source/libs/transport/src/trans.c b/source/libs/transport/src/trans.c index 91f9a8ead2..a6040a3873 100644 --- a/source/libs/transport/src/trans.c +++ b/source/libs/transport/src/trans.c @@ -63,17 +63,41 @@ void rpcFreeCont(void* cont) { } free((char*)cont - TRANS_MSG_OVERHEAD); } -void* rpcReallocCont(void* ptr, int contLen) { return NULL; } +void* rpcReallocCont(void* ptr, int contLen) { + if (ptr == NULL) { + return rpcMallocCont(contLen); + } + char* st = (char*)ptr - TRANS_MSG_OVERHEAD; + int sz = contLen + TRANS_MSG_OVERHEAD; + st = realloc(st, sz); + if (st == NULL) { + return NULL; + } + return st + TRANS_MSG_OVERHEAD; +} + +void rpcSendRedirectRsp(void* thandle, const SEpSet* pEpSet) { + SRpcMsg rpcMsg; + memset(&rpcMsg, 0, sizeof(rpcMsg)); + + rpcMsg.contLen = sizeof(SEpSet); + rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen); + if 
(rpcMsg.pCont == NULL) return; + + memcpy(rpcMsg.pCont, pEpSet, sizeof(SEpSet)); + + rpcMsg.code = TSDB_CODE_RPC_REDIRECT; + rpcMsg.handle = thandle; + + rpcSendResponse(&rpcMsg); +} -void rpcSendRedirectRsp(void* pConn, const SEpSet* pEpSet) {} -int rpcGetConnInfo(void* thandle, SRpcConnInfo* pInfo) { return -1; } -void rpcSendRecv(void* shandle, SEpSet* pEpSet, SRpcMsg* pReq, SRpcMsg* pRsp) { return; } int rpcReportProgress(void* pConn, char* pCont, int contLen) { return -1; } void rpcCancelRequest(int64_t rid) { return; } int32_t rpcInit(void) { // impl later - return -1; + return 0; } void rpcCleanup(void) { diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 24ff5e956a..3d93049c6a 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -123,9 +123,14 @@ static void clientHandleResp(SCliConn* conn) { rpcMsg.code = pHead->code; rpcMsg.msgType = pHead->msgType; rpcMsg.ahandle = pCtx->ahandle; - - tDebug("conn %p handle resp", conn); - (pRpc->cfp)(NULL, &rpcMsg, NULL); + if (pCtx->pSem == NULL) { + tDebug("conn %p handle resp", conn); + (pRpc->cfp)(NULL, &rpcMsg, NULL); + } else { + tDebug("conn %p handle resp", conn); + memcpy((char*)pCtx->pRsp, (char*)&rpcMsg, sizeof(rpcMsg)); + tsem_post(pCtx->pSem); + } conn->notifyCount += 1; // buf's mem alread translated to rpcMsg.pCont @@ -159,14 +164,20 @@ static void clientHandleExcept(SCliConn* pConn) { SRpcMsg rpcMsg = {0}; rpcMsg.ahandle = pCtx->ahandle; rpcMsg.code = TSDB_CODE_RPC_NETWORK_UNAVAIL; - // SRpcInfo* pRpc = pMsg->ctx->pRpc; - (pCtx->pTransInst->cfp)(NULL, &rpcMsg, NULL); - pConn->notifyCount += 1; + if (pCtx->pSem == NULL) { + // SRpcInfo* pRpc = pMsg->ctx->pRpc; + (pCtx->pTransInst->cfp)(NULL, &rpcMsg, NULL); + } else { + memcpy((char*)(pCtx->pRsp), (char*)(&rpcMsg), sizeof(rpcMsg)); + // SRpcMsg rpcMsg + tsem_post(pCtx->pSem); + } destroyCmsg(pMsg); pConn->data = NULL; // transDestroyConnCtx(pCtx); 
clientConnDestroy(pConn, true); + pConn->notifyCount += 1; } static void clientTimeoutCb(uv_timer_t* handle) { @@ -463,6 +474,7 @@ static void clientAsyncCb(uv_async_t* handle) { static void* clientThread(void* arg) { SCliThrdObj* pThrd = (SCliThrdObj*)arg; + setThreadName("trans-client-work"); uv_run(pThrd->loop, UV_RUN_DEFAULT); } @@ -568,8 +580,8 @@ void taosCloseClient(void* arg) { } void rpcSendRequest(void* shandle, const SEpSet* pEpSet, SRpcMsg* pMsg, int64_t* pRid) { // impl later - char* ip = (char*)(pEpSet->fqdn[pEpSet->inUse]); - uint32_t port = pEpSet->port[pEpSet->inUse]; + char* ip = (char*)(pEpSet->eps[pEpSet->inUse].fqdn); + uint32_t port = pEpSet->eps[pEpSet->inUse].port; SRpcInfo* pRpc = (SRpcInfo*)shandle; @@ -609,4 +621,45 @@ void rpcSendRequest(void* shandle, const SEpSet* pEpSet, SRpcMsg* pMsg, int64_t* // int end = taosGetTimestampUs() - start; // tError("client sent to rpc, time cost: %d", (int)end); } +void rpcSendRecv(void* shandle, SEpSet* pEpSet, SRpcMsg* pReq, SRpcMsg* pRsp) { + char* ip = (char*)(pEpSet->eps[pEpSet->inUse].fqdn); + uint32_t port = pEpSet->eps[pEpSet->inUse].port; + + SRpcInfo* pRpc = (SRpcInfo*)shandle; + + STransConnCtx* pCtx = calloc(1, sizeof(STransConnCtx)); + pCtx->pTransInst = (SRpcInfo*)shandle; + pCtx->ahandle = pReq->ahandle; + pCtx->msgType = pReq->msgType; + pCtx->ip = strdup(ip); + pCtx->port = port; + pCtx->pSem = calloc(1, sizeof(tsem_t)); + pCtx->pRsp = pRsp; + tsem_init(pCtx->pSem, 0, 0); + + int64_t index = pRpc->index; + if (pRpc->index++ >= pRpc->numOfThreads) { + pRpc->index = 0; + } + SCliMsg* cliMsg = malloc(sizeof(SCliMsg)); + cliMsg->ctx = pCtx; + cliMsg->msg = *pReq; + cliMsg->st = taosGetTimestampUs(); + + SCliThrdObj* thrd = ((SClientObj*)pRpc->tcphandle)->pThreadObj[index % pRpc->numOfThreads]; + + // pthread_mutex_lock(&thrd->msgMtx); + // QUEUE_PUSH(&thrd->msg, &cliMsg->q); + // pthread_mutex_unlock(&thrd->msgMtx); + + // int start = taosGetTimestampUs(); + transSendAsync(thrd->asyncPool, 
&(cliMsg->q)); + + tsem_t* pSem = pCtx->pSem; + tsem_wait(pSem); + tsem_destroy(pSem); + free(pSem); + + return; +} #endif diff --git a/source/libs/transport/src/transSrv.c b/source/libs/transport/src/transSrv.c index a005b31fe4..4d2ac434dd 100644 --- a/source/libs/transport/src/transSrv.c +++ b/source/libs/transport/src/transSrv.c @@ -33,6 +33,8 @@ typedef struct SSrvConn { void* hostThrd; void* pSrvMsg; + struct sockaddr peername; + // SRpcMsg sendMsg; // del later char secured; @@ -487,7 +489,13 @@ void uvOnConnectionCb(uv_stream_t* q, ssize_t nread, const uv_buf_t* buf) { uv_os_fd_t fd; uv_fileno((const uv_handle_t*)pConn->pTcp, &fd); tDebug("conn %p created, fd: %d", pConn, fd); - uv_read_start((uv_stream_t*)(pConn->pTcp), uvAllocReadBufferCb, uvOnReadCb); + int namelen = sizeof(pConn->peername); + if (0 != uv_tcp_getpeername(pConn->pTcp, &pConn->peername, &namelen)) { + tError("failed to get peer name"); + destroyConn(pConn, true); + } else { + uv_read_start((uv_stream_t*)(pConn->pTcp), uvAllocReadBufferCb, uvOnReadCb); + } } else { tDebug("failed to create new connection"); destroyConn(pConn, true); @@ -496,6 +504,7 @@ void uvOnConnectionCb(uv_stream_t* q, ssize_t nread, const uv_buf_t* buf) { void* acceptThread(void* arg) { // opt + setThreadName("trans-accept"); SServerObj* srv = (SServerObj*)arg; uv_run(srv->loop, UV_RUN_DEFAULT); } @@ -548,6 +557,7 @@ static bool addHandleToAcceptloop(void* arg) { return true; } void* workerThread(void* arg) { + setThreadName("trans-worker"); SWorkThrdObj* pThrd = (SWorkThrdObj*)arg; uv_run(pThrd->loop, UV_RUN_DEFAULT); } @@ -723,4 +733,16 @@ void rpcSendResponse(const SRpcMsg* pMsg) { // uv_async_send(pThrd->workerAsync); } +int rpcGetConnInfo(void* thandle, SRpcConnInfo* pInfo) { + SSrvConn* pConn = thandle; + struct sockaddr* pPeerName = &pConn->peername; + + struct sockaddr_in caddr = *(struct sockaddr_in*)(pPeerName); + pInfo->clientIp = (uint32_t)(caddr.sin_addr.s_addr); + pInfo->clientPort = ntohs(caddr.sin_port); 
+ + tstrncpy(pInfo->user, pConn->user, sizeof(pInfo->user)); + return 0; +} + #endif diff --git a/source/libs/transport/test/CMakeLists.txt b/source/libs/transport/test/CMakeLists.txt index 3d9c396336..3c9c40f46a 100644 --- a/source/libs/transport/test/CMakeLists.txt +++ b/source/libs/transport/test/CMakeLists.txt @@ -2,6 +2,7 @@ add_executable(transportTest "") add_executable(client "") add_executable(server "") add_executable(transUT "") +add_executable(syncClient "") target_sources(transUT PRIVATE @@ -20,6 +21,10 @@ target_sources (server PRIVATE "rserver.c" ) +target_sources (syncClient + PRIVATE + "syncClient.c" +) target_include_directories(transportTest PUBLIC @@ -67,7 +72,6 @@ target_include_directories(transUT "${CMAKE_CURRENT_SOURCE_DIR}/../inc" ) - target_link_libraries (server os util @@ -75,4 +79,17 @@ target_link_libraries (server gtest_main transport ) +target_include_directories(syncClient + PUBLIC + "${CMAKE_SOURCE_DIR}/include/libs/transport" + "${CMAKE_CURRENT_SOURCE_DIR}/../inc" +) +target_link_libraries (syncClient + os + util + common + gtest_main + transport +) + diff --git a/source/libs/transport/test/rclient.c b/source/libs/transport/test/rclient.c index 308b7b54bd..4e29c02508 100644 --- a/source/libs/transport/test/rclient.c +++ b/source/libs/transport/test/rclient.c @@ -33,7 +33,6 @@ typedef struct { pthread_t thread; void * pRpc; } SInfo; - static void processResponse(void *pParent, SRpcMsg *pMsg, SEpSet *pEpSet) { SInfo *pInfo = (SInfo *)pMsg->ahandle; tDebug("thread:%d, response is received, type:%d contLen:%d code:0x%x", pInfo->index, pMsg->msgType, pMsg->contLen, diff --git a/source/libs/transport/test/syncClient.c b/source/libs/transport/test/syncClient.c new file mode 100644 index 0000000000..c5d7f5664a --- /dev/null +++ b/source/libs/transport/test/syncClient.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. 
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#include + +#include +#include "os.h" +#include "rpcLog.h" +#include "taoserror.h" +#include "tglobal.h" +#include "trpc.h" +#include "tutil.h" + +typedef struct { + int index; + SEpSet epSet; + int num; + int numOfReqs; + int msgSize; + tsem_t rspSem; + tsem_t * pOverSem; + pthread_t thread; + void * pRpc; +} SInfo; +static void processResponse(void *pParent, SRpcMsg *pMsg, SEpSet *pEpSet) { + SInfo *pInfo = (SInfo *)pMsg->ahandle; + tDebug("thread:%d, response is received, type:%d contLen:%d code:0x%x", pInfo->index, pMsg->msgType, pMsg->contLen, + pMsg->code); + + if (pEpSet) pInfo->epSet = *pEpSet; + + rpcFreeCont(pMsg->pCont); + // tsem_post(&pInfo->rspSem); + tsem_post(&pInfo->rspSem); +} + +static int tcount = 0; + +static void *sendRequest(void *param) { + SInfo * pInfo = (SInfo *)param; + SRpcMsg rpcMsg = {0}; + + tDebug("thread:%d, start to send request", pInfo->index); + + tDebug("thread:%d, reqs: %d", pInfo->index, pInfo->numOfReqs); + int u100 = 0; + int u500 = 0; + int u1000 = 0; + int u10000 = 0; + SRpcMsg respMsg = {0}; + while (pInfo->numOfReqs == 0 || pInfo->num < pInfo->numOfReqs) { + pInfo->num++; + rpcMsg.pCont = rpcMallocCont(pInfo->msgSize); + rpcMsg.contLen = pInfo->msgSize; + rpcMsg.ahandle = pInfo; + rpcMsg.msgType = 1; + // tDebug("thread:%d, send request, contLen:%d num:%d", pInfo->index, pInfo->msgSize, pInfo->num); + int64_t start = taosGetTimestampUs(); + rpcSendRecv(pInfo->pRpc, &pInfo->epSet, 
&rpcMsg, &respMsg); + // rpcSendRequest(pInfo->pRpc, &pInfo->epSet, &rpcMsg, NULL); + if (pInfo->num % 20000 == 0) tInfo("thread:%d, %d requests have been sent", pInfo->index, pInfo->num); + // tsem_wait(&pInfo->rspSem); + // wtsem_wait(&pInfo->rspSem); + int64_t end = taosGetTimestampUs() - start; + if (end <= 100) { + u100++; + } else if (end > 100 && end <= 500) { + u500++; + } else if (end > 500 && end < 1000) { + u1000++; + } else { + u10000++; + } + + tDebug("recv response succefully"); + + // usleep(100000000); + } + + tError("send and recv sum: %d, %d, %d, %d", u100, u500, u1000, u10000); + tDebug("thread:%d, it is over", pInfo->index); + tcount++; + + return NULL; +} + +int main(int argc, char *argv[]) { + SRpcInit rpcInit; + SEpSet epSet = {0}; + int msgSize = 128; + int numOfReqs = 0; + int appThreads = 1; + char serverIp[40] = "127.0.0.1"; + char secret[20] = "mypassword"; + struct timeval systemTime; + int64_t startTime, endTime; + pthread_attr_t thattr; + + // server info + epSet.inUse = 0; + addEpIntoEpSet(&epSet, serverIp, 7000); + addEpIntoEpSet(&epSet, "192.168.0.1", 7000); + + // client info + memset(&rpcInit, 0, sizeof(rpcInit)); + rpcInit.localPort = 0; + rpcInit.label = "APP"; + rpcInit.numOfThreads = 1; + rpcInit.cfp = processResponse; + rpcInit.sessions = 100; + rpcInit.idleTime = 100; + rpcInit.user = "michael"; + rpcInit.secret = secret; + rpcInit.ckey = "key"; + rpcInit.spi = 1; + rpcInit.connType = TAOS_CONN_CLIENT; + + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-p") == 0 && i < argc - 1) { + epSet.eps[0].port = atoi(argv[++i]); + } else if (strcmp(argv[i], "-i") == 0 && i < argc - 1) { + tstrncpy(epSet.eps[0].fqdn, argv[++i], sizeof(epSet.eps[0].fqdn)); + } else if (strcmp(argv[i], "-t") == 0 && i < argc - 1) { + rpcInit.numOfThreads = atoi(argv[++i]); + } else if (strcmp(argv[i], "-m") == 0 && i < argc - 1) { + msgSize = atoi(argv[++i]); + } else if (strcmp(argv[i], "-s") == 0 && i < argc - 1) { + rpcInit.sessions = 
atoi(argv[++i]); + } else if (strcmp(argv[i], "-n") == 0 && i < argc - 1) { + numOfReqs = atoi(argv[++i]); + } else if (strcmp(argv[i], "-a") == 0 && i < argc - 1) { + appThreads = atoi(argv[++i]); + } else if (strcmp(argv[i], "-o") == 0 && i < argc - 1) { + tsCompressMsgSize = atoi(argv[++i]); + } else if (strcmp(argv[i], "-u") == 0 && i < argc - 1) { + rpcInit.user = argv[++i]; + } else if (strcmp(argv[i], "-k") == 0 && i < argc - 1) { + rpcInit.secret = argv[++i]; + } else if (strcmp(argv[i], "-spi") == 0 && i < argc - 1) { + rpcInit.spi = atoi(argv[++i]); + } else if (strcmp(argv[i], "-d") == 0 && i < argc - 1) { + rpcDebugFlag = atoi(argv[++i]); + } else { + printf("\nusage: %s [options] \n", argv[0]); + printf(" [-i ip]: first server IP address, default is:%s\n", serverIp); + printf(" [-p port]: server port number, default is:%d\n", epSet.eps[0].port); + printf(" [-t threads]: number of rpc threads, default is:%d\n", rpcInit.numOfThreads); + printf(" [-s sessions]: number of rpc sessions, default is:%d\n", rpcInit.sessions); + printf(" [-m msgSize]: message body size, default is:%d\n", msgSize); + printf(" [-a threads]: number of app threads, default is:%d\n", appThreads); + printf(" [-n requests]: number of requests per thread, default is:%d\n", numOfReqs); + printf(" [-o compSize]: compression message size, default is:%d\n", tsCompressMsgSize); + printf(" [-u user]: user name for the connection, default is:%s\n", rpcInit.user); + printf(" [-k secret]: password for the connection, default is:%s\n", rpcInit.secret); + printf(" [-spi SPI]: security parameter index, default is:%d\n", rpcInit.spi); + printf(" [-d debugFlag]: debug flag, default:%d\n", rpcDebugFlag); + printf(" [-h help]: print out this help\n\n"); + exit(0); + } + } + + taosInitLog("client.log", 100000, 10); + + void *pRpc = rpcOpen(&rpcInit); + if (pRpc == NULL) { + tError("failed to initialize RPC"); + return -1; + } + + tInfo("client is initialized"); + tInfo("threads:%d msgSize:%d 
requests:%d", appThreads, msgSize, numOfReqs); + + gettimeofday(&systemTime, NULL); + startTime = systemTime.tv_sec * 1000000 + systemTime.tv_usec; + + SInfo *pInfo = (SInfo *)calloc(1, sizeof(SInfo) * appThreads); + + pthread_attr_init(&thattr); + pthread_attr_setdetachstate(&thattr, PTHREAD_CREATE_JOINABLE); + + for (int i = 0; i < appThreads; ++i) { + pInfo->index = i; + pInfo->epSet = epSet; + pInfo->numOfReqs = numOfReqs; + pInfo->msgSize = msgSize; + tsem_init(&pInfo->rspSem, 0, 0); + pInfo->pRpc = pRpc; + pthread_create(&pInfo->thread, &thattr, sendRequest, pInfo); + pInfo++; + } + + do { + usleep(1); + } while (tcount < appThreads); + + gettimeofday(&systemTime, NULL); + endTime = systemTime.tv_sec * 1000000 + systemTime.tv_usec; + float usedTime = (endTime - startTime) / 1000.0f; // mseconds + + tInfo("it takes %.3f mseconds to send %d requests to server", usedTime, numOfReqs * appThreads); + tInfo("Performance: %.3f requests per second, msgSize:%d bytes", 1000.0 * numOfReqs * appThreads / usedTime, msgSize); + + int ch = getchar(); + UNUSED(ch); + + taosCloseLog(); + + return 0; +} diff --git a/source/libs/transport/test/transUT.cc b/source/libs/transport/test/transUT.cc index 08c683590b..6f80ea42ac 100644 --- a/source/libs/transport/test/transUT.cc +++ b/source/libs/transport/test/transUT.cc @@ -15,6 +15,7 @@ #include #include #include +#include "tep.h" #include "trpc.h" using namespace std; @@ -50,6 +51,25 @@ class TransObj { trans = rpcOpen(&rpcInit); return trans != NULL ? 
true : false; } + + bool sendAndRecv() { + SEpSet epSet = {0}; + epSet.inUse = 0; + addEpIntoEpSet(&epSet, "192.168.1.1", 7000); + addEpIntoEpSet(&epSet, "192.168.0.1", 7000); + + if (trans == NULL) { + return false; + } + SRpcMsg rpcMsg = {0}, reqMsg = {0}; + reqMsg.pCont = rpcMallocCont(10); + reqMsg.contLen = 10; + reqMsg.ahandle = NULL; + rpcSendRecv(trans, &epSet, &reqMsg, &rpcMsg); + int code = rpcMsg.code; + std::cout << tstrerror(code) << std::endl; + return true; + } bool stop() { rpcClose(trans); trans = NULL; @@ -75,6 +95,7 @@ class TransEnv : public ::testing::Test { }; TEST_F(TransEnv, test_start_stop) { assert(tr->startCli()); + assert(tr->sendAndRecv()); assert(tr->stop()); assert(tr->startSrv()); From e2d4c81cdcfe8d87a8669c78857bd472ca949887 Mon Sep 17 00:00:00 2001 From: Cary Xu Date: Wed, 9 Feb 2022 10:30:39 +0800 Subject: [PATCH 08/12] Feature/td 11463 update colId to PRIMARYKEY_TIMESTAMP_COL_ID during commit (#10150) * initial commit * fix commit error * update colId to PRIMARYKEY_TIMESTAMP_COL_ID during commit * update colId to PRIMARYKEY_TIMESTAMP_COL_ID during commit * update colId to PRIMARYKEY_TIMESTAMP_COL_ID during commit Co-authored-by: Hongze Cheng --- source/dnode/vnode/src/tsdb/tsdbCommit.c | 2 +- source/dnode/vnode/src/tsdb/tsdbReadImpl.c | 2 +- source/dnode/vnode/src/vnd/vnodeBufferPool.c | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbCommit.c b/source/dnode/vnode/src/tsdb/tsdbCommit.c index 0f2d711a79..be6c086040 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCommit.c +++ b/source/dnode/vnode/src/tsdb/tsdbCommit.c @@ -1327,7 +1327,7 @@ static int tsdbMergeMemData(SCommitH *pCommith, SCommitIter *pIter, int bidx) { int nBlocks = pCommith->readh.pBlkIdx->numOfBlocks; SBlock * pBlock = pCommith->readh.pBlkInfo->blocks + bidx; TSKEY keyLimit; - int16_t colId = 0; + int16_t colId = PRIMARYKEY_TIMESTAMP_COL_ID; SMergeInfo mInfo; SBlock subBlocks[TSDB_MAX_SUBBLOCKS]; SBlock block, 
supBlock; diff --git a/source/dnode/vnode/src/tsdb/tsdbReadImpl.c b/source/dnode/vnode/src/tsdb/tsdbReadImpl.c index 3dcbb7888b..24c71fdc7e 100644 --- a/source/dnode/vnode/src/tsdb/tsdbReadImpl.c +++ b/source/dnode/vnode/src/tsdb/tsdbReadImpl.c @@ -472,7 +472,7 @@ static int tsdbLoadBlockDataImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDat continue; } - int16_t tcolId = 0; + int16_t tcolId = PRIMARYKEY_TIMESTAMP_COL_ID; uint32_t toffset = TSDB_KEY_COL_OFFSET; int32_t tlen = pBlock->keyLen; diff --git a/source/dnode/vnode/src/vnd/vnodeBufferPool.c b/source/dnode/vnode/src/vnd/vnodeBufferPool.c index 434498eef5..f7a72353eb 100644 --- a/source/dnode/vnode/src/vnd/vnodeBufferPool.c +++ b/source/dnode/vnode/src/vnd/vnodeBufferPool.c @@ -185,6 +185,7 @@ static void vBufPoolDestroyMA(SMemAllocatorFactory *pMAF, SMemAllocator *pMA) { free(pMA); if (--pVMA->_ref.val == 0) { TD_DLIST_POP(&(pVnode->pBufPool->incycle), pVMA); + vmaReset(pVMA); TD_DLIST_APPEND(&(pVnode->pBufPool->free), pVMA); } } \ No newline at end of file From 0b452701bc2dfc97c91232f70db927c7f5c58097 Mon Sep 17 00:00:00 2001 From: dapan1121 Date: Wed, 9 Feb 2022 10:56:22 +0800 Subject: [PATCH 09/12] feature/qnode --- include/common/tmsg.h | 1 + include/util/taoserror.h | 3 +- source/dnode/mnode/impl/src/mndStb.c | 1 + source/libs/catalog/inc/catalogInt.h | 14 +- source/libs/catalog/src/catalog.c | 179 ++++++++++++++++------ source/libs/catalog/test/catalogTests.cpp | 89 ++++++++++- source/libs/qcom/src/querymsg.c | 3 + source/util/src/terror.c | 1 + 8 files changed, 237 insertions(+), 54 deletions(-) diff --git a/include/common/tmsg.h b/include/common/tmsg.h index c641fbb1a3..7f9710b03f 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -725,6 +725,7 @@ typedef struct { char tbName[TSDB_TABLE_NAME_LEN]; char stbName[TSDB_TABLE_NAME_LEN]; char dbFName[TSDB_DB_FNAME_LEN]; + uint64_t dbId; int32_t numOfTags; int32_t numOfColumns; int8_t precision; diff --git a/include/util/taoserror.h 
b/include/util/taoserror.h index 6237de36ff..db195c8c76 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -435,6 +435,8 @@ int32_t* taosGetErrno(); #define TSDB_CODE_CTG_NOT_READY TAOS_DEF_ERROR_CODE(0, 0x2402) //catalog is not ready #define TSDB_CODE_CTG_MEM_ERROR TAOS_DEF_ERROR_CODE(0, 0x2403) //catalog memory error #define TSDB_CODE_CTG_SYS_ERROR TAOS_DEF_ERROR_CODE(0, 0x2404) //catalog system error +#define TSDB_CODE_CTG_DB_DROPPED TAOS_DEF_ERROR_CODE(0, 0x2405) //Database is dropped +#define TSDB_CODE_CTG_OUT_OF_SERVICE TAOS_DEF_ERROR_CODE(0, 0x2406) //catalog is out of service //scheduler #define TSDB_CODE_SCH_STATUS_ERROR TAOS_DEF_ERROR_CODE(0, 0x2501) //scheduler status error @@ -450,4 +452,3 @@ int32_t* taosGetErrno(); #endif #endif /*_TD_COMMON_TAOS_ERROR_H_*/ - \ No newline at end of file diff --git a/source/dnode/mnode/impl/src/mndStb.c b/source/dnode/mnode/impl/src/mndStb.c index 4ccd4b63c4..51d5fd62d4 100644 --- a/source/dnode/mnode/impl/src/mndStb.c +++ b/source/dnode/mnode/impl/src/mndStb.c @@ -760,6 +760,7 @@ static int32_t mndProcessStbMetaReq(SMnodeMsg *pReq) { strcpy(pMeta->dbFName, pStb->db); strcpy(pMeta->tbName, pInfo->tbName); strcpy(pMeta->stbName, pInfo->tbName); + pMeta->dbId = htobe64(pDb->uid); pMeta->numOfTags = htonl(pStb->numOfTags); pMeta->numOfColumns = htonl(pStb->numOfColumns); pMeta->precision = pDb->cfg.precision; diff --git a/source/libs/catalog/inc/catalogInt.h b/source/libs/catalog/inc/catalogInt.h index 021a119d6c..8f24374387 100644 --- a/source/libs/catalog/inc/catalogInt.h +++ b/source/libs/catalog/inc/catalogInt.h @@ -85,6 +85,7 @@ typedef struct SCtgRentMgmt { typedef struct SCatalog { uint64_t clusterId; + SRWLatch dbLock; SHashObj *dbCache; //key:dbname, value:SCtgDBCache SCtgRentMgmt dbRent; SCtgRentMgmt stbRent; @@ -109,6 +110,8 @@ typedef struct SCatalogStat { } SCatalogStat; typedef struct SCatalogMgmt { + bool exit; + SRWLatch lock; SHashObj *pCluster; //key: clusterId, value: SCatalog* 
SCatalogStat stat; SCatalogCfg cfg; @@ -136,10 +139,6 @@ typedef uint32_t (*tableNameHashFp)(const char *, uint32_t); #define ctgDebug(param, ...) qDebug("CTG:%p " param, pCatalog, __VA_ARGS__) #define ctgTrace(param, ...) qTrace("CTG:%p " param, pCatalog, __VA_ARGS__) -#define CTG_ERR_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; return _code; } } while (0) -#define CTG_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; } return _code; } while (0) -#define CTG_ERR_JRET(c) do { code = c; if (code != TSDB_CODE_SUCCESS) { terrno = code; goto _return; } } while (0) - #define CTG_LOCK_DEBUG(...) do { if (gCTGDebug.lockDebug) { qDebug(__VA_ARGS__); } } while (0) #define CTG_CACHE_DEBUG(...) do { if (gCTGDebug.cacheDebug) { qDebug(__VA_ARGS__); } } while (0) @@ -177,6 +176,13 @@ typedef uint32_t (*tableNameHashFp)(const char *, uint32_t); } \ } while (0) + +#define CTG_ERR_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; return _code; } } while (0) +#define CTG_RET(c) do { int32_t _code = c; if (_code != TSDB_CODE_SUCCESS) { terrno = _code; } return _code; } while (0) +#define CTG_ERR_JRET(c) do { code = c; if (code != TSDB_CODE_SUCCESS) { terrno = code; goto _return; } } while (0) + +#define CTG_API_ENTER() do { CTG_LOCK(CTG_READ, &ctgMgmt.lock); if (atomic_load_8(&ctgMgmt.exit)) { CTG_RET(TSDB_CODE_CTG_OUT_OF_SERVICE); } } while (0) +#define CTG_API_LEAVE(c) do { int32_t __code = c; CTG_UNLOCK(CTG_READ, &ctgMgmt.lock); CTG_RET(__code); } while (0) diff --git a/source/libs/catalog/src/catalog.c b/source/libs/catalog/src/catalog.c index f0ea22197b..268322a86e 100644 --- a/source/libs/catalog/src/catalog.c +++ b/source/libs/catalog/src/catalog.c @@ -122,7 +122,7 @@ void ctgDbgShowClusterCache(struct SCatalog* pCatalog) { int32_t ctgInitDBCache(struct SCatalog* pCatalog) { if (NULL == pCatalog->dbCache) { - SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxDBCacheNum, 
taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + SHashObj *cache = taosHashInit(ctgMgmt.cfg.maxDBCacheNum, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_ENTRY_LOCK); if (NULL == cache) { ctgError("taosHashInit %d failed", CTG_DEFAULT_CACHE_DB_NUMBER); CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); @@ -693,6 +693,7 @@ int32_t ctgGetVgInfoFromHashValue(struct SCatalog *pCatalog, SDBVgroupInfo *dbIn CTG_RET(code); } +#if 1 int32_t ctgSTableVersionCompare(const void* key1, const void* key2) { if (*(uint64_t *)key1 < ((SSTableMetaVersion*)key2)->suid) { return -1; @@ -712,7 +713,29 @@ int32_t ctgDbVgVersionCompare(const void* key1, const void* key2) { return 0; } } +#else +int32_t ctgSTableVersionCompare(const void* key1, const void* key2) { + if (((SSTableMetaVersion*)key1)->suid < ((SSTableMetaVersion*)key2)->suid) { + return -1; + } else if (((SSTableMetaVersion*)key1)->suid > ((SSTableMetaVersion*)key2)->suid) { + return 1; + } else { + return 0; + } +} + +int32_t ctgDbVgVersionCompare(const void* key1, const void* key2) { + if (((SDbVgVersion*)key1)->dbId < ((SDbVgVersion*)key2)->dbId) { + return -1; + } else if (((SDbVgVersion*)key1)->dbId > ((SDbVgVersion*)key2)->dbId) { + return 1; + } else { + return 0; + } +} + +#endif int32_t ctgMetaRentInit(SCtgRentMgmt *mgmt, uint32_t rentSec, int8_t type) { mgmt->slotRIdx = 0; @@ -776,14 +799,15 @@ int32_t ctgMetaRentUpdate(SCtgRentMgmt *mgmt, void *meta, int64_t id, int32_t si } if (slot->needSort) { + qDebug("meta slot before sorte, slot idx:%d, type:%d, size:%d", widx, mgmt->type, (int32_t)taosArrayGetSize(slot->meta)); taosArraySort(slot->meta, compare); slot->needSort = false; - qDebug("meta slot sorted, slot idx:%d, type:%d", widx, mgmt->type); + qDebug("meta slot sorted, slot idx:%d, type:%d, size:%d", widx, mgmt->type, (int32_t)taosArrayGetSize(slot->meta)); } void *orig = taosArraySearch(slot->meta, &id, compare, TD_EQ); if (NULL == orig) { - qError("meta not found in slot, 
id:%"PRIx64", slot idx:%d, type:%d", id, widx, mgmt->type); + qError("meta not found in slot, id:%"PRIx64", slot idx:%d, type:%d, size:%d", id, widx, mgmt->type, (int32_t)taosArrayGetSize(slot->meta)); CTG_ERR_JRET(TSDB_CODE_CTG_INTERNAL_ERROR); } @@ -910,8 +934,15 @@ int32_t ctgMetaRentGet(SCtgRentMgmt *mgmt, void **res, uint32_t *num, int32_t si int32_t ctgAddDBCache(struct SCatalog *pCatalog, const char *dbFName, SCtgDBCache *dbCache) { int32_t code = 0; - if (taosHashPut(pCatalog->dbCache, dbFName, strlen(dbFName), dbCache, sizeof(SCtgDBCache))) { - ctgError("taosHashPut db to cache failed, db:%s", dbFName); + + code = taosHashPut(pCatalog->dbCache, dbFName, strlen(dbFName), dbCache, sizeof(SCtgDBCache)); + if (code) { + if (HASH_NODE_EXIST(code)) { + ctgDebug("db already in cache, dbFName:%s", dbFName); + return TSDB_CODE_SUCCESS; + } + + ctgError("taosHashPut db to cache failed, dbFName:%s", dbFName); CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); } @@ -919,7 +950,7 @@ int32_t ctgAddDBCache(struct SCatalog *pCatalog, const char *dbFName, SCtgDBCach strncpy(vgVersion.dbFName, dbFName, sizeof(vgVersion.dbFName)); ctgDebug("dbCache added, dbFName:%s, vgVersion:%d, dbId:%"PRIx64, dbFName, vgVersion.vgVersion, dbCache->dbId); - + CTG_ERR_JRET(ctgMetaRentAdd(&pCatalog->dbRent, &vgVersion, vgVersion.dbId, sizeof(SDbVgVersion))); return TSDB_CODE_SUCCESS; @@ -955,8 +986,8 @@ void ctgRemoveAndFreeTableMeta(struct SCatalog* pCatalog, SCtgTbMetaCache *cache int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, SCtgDBCache *dbCache, const char* dbFName) { if (taosHashRemove(pCatalog->dbCache, dbFName, strlen(dbFName))) { - ctgError("taosHashRemove from dbCache failed, dbFName:%s", dbFName); - CTG_ERR_RET(TSDB_CODE_CTG_INTERNAL_ERROR); + ctgInfo("taosHashRemove from dbCache failed, may be removed, dbFName:%s", dbFName); + CTG_ERR_RET(TSDB_CODE_CTG_DB_DROPPED); } atomic_store_8(&dbCache->deleted, 1); @@ -965,7 +996,7 @@ int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, 
SCtgDBCache *dbCache, if (dbCache->vgInfo) { ctgInfo("cleanup db vgInfo, dbFName:%s, dbId:%"PRIx64, dbFName, dbCache->dbId); - if (dbCache->vgInfo->vgHash) { + if (dbCache->vgInfo->vgHash) { taosHashCleanup(dbCache->vgInfo->vgHash); } @@ -988,6 +1019,8 @@ int32_t ctgValidateAndRemoveDb(struct SCatalog* pCatalog, SCtgDBCache *dbCache, int32_t ctgAcquireDBCache(struct SCatalog* pCatalog, const char *dbFName, uint64_t dbId, SCtgDBCache **pCache) { int32_t code = 0; SCtgDBCache *dbCache = NULL; + + CTG_LOCK(CTG_WRITE, &pCatalog->dbLock); while (true) { dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); @@ -1015,9 +1048,16 @@ int32_t ctgAcquireDBCache(struct SCatalog* pCatalog, const char *dbFName, uint64 return TSDB_CODE_SUCCESS; } #endif - CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); + code = ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName); taosHashRelease(pCatalog->dbCache, dbCache); dbCache = NULL; + if (code) { + if (TSDB_CODE_CTG_DB_DROPPED == code) { + continue; + } + + CTG_ERR_JRET(code); + } } SCtgDBCache newDBCache = {0}; @@ -1031,6 +1071,8 @@ _return: if (dbCache) { taosHashRelease(pCatalog->dbCache, dbCache); } + + CTG_UNLOCK(CTG_WRITE, &pCatalog->dbLock); CTG_RET(code); } @@ -1147,7 +1189,8 @@ int32_t ctgUpdateTableMetaCache(struct SCatalog *pCatalog, STableMetaOutput *out _return: if (dbCache) { - taosHashRelease(pCatalog->dbCache, dbCache); + taosHashRelease(pCatalog->dbCache, dbCache); + CTG_UNLOCK(CTG_WRITE, &pCatalog->dbLock); } CTG_RET(code); @@ -1459,17 +1502,19 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + CTG_API_ENTER(); + if (NULL == pCatalog->dbCache) { *version = CTG_DEFAULT_INVALID_VERSION; ctgInfo("empty db cache, dbName:%s", dbName); - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } SCtgDBCache *db = taosHashAcquire(pCatalog->dbCache, dbName, strlen(dbName)); if (NULL == db) { 
*version = CTG_DEFAULT_INVALID_VERSION; ctgInfo("db not in cache, dbName:%s", dbName); - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } CTG_LOCK(CTG_READ, &db->vgLock); @@ -1479,7 +1524,7 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, *version = CTG_DEFAULT_INVALID_VERSION; ctgInfo("db not in cache, dbName:%s", dbName); - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } *version = db->vgInfo->vgVersion; @@ -1489,7 +1534,7 @@ int32_t catalogGetDBVgroupVersion(struct SCatalog* pCatalog, const char* dbName, ctgDebug("Got db vgVersion from cache, dbName:%s, vgVersion:%d", dbName, *version); - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const char* dbFName, bool forceUpdate, SArray** vgroupList) { @@ -1497,6 +1542,8 @@ int32_t catalogGetDBVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + CTG_API_ENTER(); + SCtgDBCache* dbCache = NULL; SVgroupInfo *vgInfo = NULL; @@ -1540,12 +1587,14 @@ _return: vgList = NULL; } - CTG_RET(code); + CTG_API_LEAVE(code); } int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbFName, uint64_t dbId, SDBVgroupInfo* dbInfo) { int32_t code = 0; + + CTG_API_ENTER(); if (NULL == pCatalog || NULL == dbFName || NULL == dbInfo) { CTG_ERR_JRET(TSDB_CODE_CTG_INVALID_INPUT); @@ -1584,7 +1633,7 @@ int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbFName, ui } if (dbCache->vgInfo->vgHash) { - ctgInfo("cleanup db vgHash, dbFName:%s", dbFName); + ctgDebug("cleanup db vgHash, dbFName:%s", dbFName); taosHashCleanup(dbCache->vgInfo->vgHash); dbCache->vgInfo->vgHash = NULL; } @@ -1605,13 +1654,17 @@ int32_t catalogUpdateDBVgroup(struct SCatalog* pCatalog, const char* dbFName, ui _return: + if (dbCache) { + CTG_UNLOCK(CTG_WRITE, &pCatalog->dbLock); + } + if (dbInfo) { 
taosHashCleanup(dbInfo->vgHash); dbInfo->vgHash = NULL; tfree(dbInfo); } - CTG_RET(code); + CTG_API_LEAVE(code); } @@ -1622,19 +1675,22 @@ int32_t catalogRemoveDB(struct SCatalog* pCatalog, const char* dbFName, uint64_t CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + CTG_API_ENTER(); + if (NULL == pCatalog->dbCache) { - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } SCtgDBCache *dbCache = (SCtgDBCache *)taosHashAcquire(pCatalog->dbCache, dbFName, strlen(dbFName)); if (NULL == dbCache) { ctgInfo("db not exist in dbCache, may be removed, dbFName:%s", dbFName); - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } if (dbCache->dbId != dbId) { ctgInfo("db id already updated, dbFName:%s, dbId:%"PRIx64 ", targetId:%"PRIx64, dbFName, dbCache->dbId, dbId); - return TSDB_CODE_SUCCESS; + taosHashRelease(pCatalog->dbCache, dbCache); + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } CTG_ERR_JRET(ctgValidateAndRemoveDb(pCatalog, dbCache, dbFName)); @@ -1643,7 +1699,7 @@ _return: taosHashRelease(pCatalog->dbCache, dbCache); - CTG_RET(code); + CTG_API_LEAVE(code); } int32_t catalogRemoveSTableMeta(struct SCatalog* pCatalog, const char* dbName, const char* stbName, uint64_t suid) { @@ -1654,43 +1710,53 @@ int32_t catalogRemoveSTableMeta(struct SCatalog* pCatalog, const char* dbName, c CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + CTG_API_ENTER(); + if (NULL == pCatalog->dbCache) { - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } CTG_ERR_RET(ctgValidateAndRemoveStbMeta(pCatalog, dbName, stbName, suid, &removed)); if (!removed) { - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } ctgInfo("stb removed from cache, db:%s, stbName:%s, suid:%"PRIx64, dbName, stbName, suid); - CTG_ERR_RET(ctgMetaRentRemove(&pCatalog->stbRent, suid, ctgSTableVersionCompare)); + CTG_ERR_JRET(ctgMetaRentRemove(&pCatalog->stbRent, suid, ctgSTableVersionCompare)); ctgDebug("stb removed from rent, db:%s, stbName:%s, suid:%"PRIx64, dbName, stbName, suid); + 
+_return: - CTG_RET(code); + CTG_API_LEAVE(code); } int32_t catalogGetTableMeta(struct SCatalog* pCatalog, void *pTransporter, const SEpSet* pMgmtEps, const SName* pTableName, STableMeta** pTableMeta) { - return ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, false, pTableMeta, -1); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, false, pTableMeta, -1)); } int32_t catalogGetSTableMeta(struct SCatalog* pCatalog, void * pTransporter, const SEpSet* pMgmtEps, const SName* pTableName, STableMeta** pTableMeta) { - return ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, false, pTableMeta, 1); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, false, pTableMeta, 1)); } int32_t catalogUpdateSTableMeta(struct SCatalog* pCatalog, STableMetaRsp *rspMsg) { STableMetaOutput output = {0}; int32_t code = 0; + CTG_API_ENTER(); + strcpy(output.dbFName, rspMsg->dbFName); strcpy(output.tbName, rspMsg->tbName); SET_META_TYPE_TABLE(output.metaType); - CTG_ERR_RET(queryCreateTableMetaFromMsg(rspMsg, true, &output.tbMeta)); + CTG_ERR_JRET(queryCreateTableMetaFromMsg(rspMsg, true, &output.tbMeta)); CTG_ERR_JRET(ctgUpdateTableMetaCache(pCatalog, &output)); @@ -1698,7 +1764,7 @@ _return: tfree(output.tbMeta); - CTG_RET(code); + CTG_API_LEAVE(code); } @@ -1707,17 +1773,23 @@ int32_t catalogRenewTableMeta(struct SCatalog* pCatalog, void *pTransporter, con CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } - return ctgRenewTableMetaImpl(pCatalog, pTransporter, pMgmtEps, pTableName, isSTable); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgRenewTableMetaImpl(pCatalog, pTransporter, pMgmtEps, pTableName, isSTable)); } int32_t catalogRenewAndGetTableMeta(struct SCatalog* pCatalog, void *pTransporter, const SEpSet* pMgmtEps, const SName* pTableName, STableMeta** pTableMeta, int32_t isSTable) { - return ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, true, pTableMeta, 
isSTable); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgGetTableMeta(pCatalog, pTransporter, pMgmtEps, pTableName, true, pTableMeta, isSTable)); } int32_t catalogGetTableDistVgroup(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, const SName* pTableName, SArray** pVgroupList) { if (NULL == pCatalog || NULL == pRpc || NULL == pMgmtEps || NULL == pTableName || NULL == pVgroupList) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + + CTG_API_ENTER(); STableMeta *tbMeta = NULL; int32_t code = 0; @@ -1733,7 +1805,7 @@ int32_t catalogGetTableDistVgroup(struct SCatalog* pCatalog, void *pRpc, const S tNameGetFullDbName(pTableName, db); CTG_ERR_JRET(ctgGetDBVgroup(pCatalog, pRpc, pMgmtEps, db, false, &dbCache)); - // REMOEV THIS .... + // TODO REMOEV THIS .... if (0 == tbMeta->vgId) { SVgroupInfo vgroup = {0}; @@ -1741,7 +1813,7 @@ int32_t catalogGetTableDistVgroup(struct SCatalog* pCatalog, void *pRpc, const S tbMeta->vgId = vgroup.vgId; } - // REMOVE THIS .... + // TODO REMOVE THIS .... if (tbMeta->tableType == TSDB_SUPER_TABLE) { CTG_ERR_JRET(ctgGetVgInfoFromDB(pCatalog, pRpc, pMgmtEps, dbCache->vgInfo, pVgroupList)); @@ -1780,7 +1852,7 @@ _return: vgList = NULL; } - CTG_RET(code); + CTG_API_LEAVE(code); } @@ -1788,10 +1860,12 @@ int32_t catalogGetTableHashVgroup(struct SCatalog *pCatalog, void *pTransporter, SCtgDBCache* dbCache = NULL; int32_t code = 0; + CTG_API_ENTER(); + char db[TSDB_DB_FNAME_LEN] = {0}; tNameGetFullDbName(pTableName, db); - CTG_ERR_RET(ctgGetDBVgroup(pCatalog, pTransporter, pMgmtEps, db, false, &dbCache)); + CTG_ERR_JRET(ctgGetDBVgroup(pCatalog, pTransporter, pMgmtEps, db, false, &dbCache)); CTG_ERR_JRET(ctgGetVgInfoFromHashValue(pCatalog, dbCache->vgInfo, pTableName, pVgroup)); @@ -1802,7 +1876,7 @@ _return: taosHashRelease(pCatalog->dbCache, dbCache); } - CTG_RET(code); + CTG_API_LEAVE(code); } @@ -1811,19 +1885,22 @@ int32_t catalogGetAllMeta(struct SCatalog* pCatalog, void *pTransporter, const S CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } 
+ CTG_API_ENTER(); + int32_t code = 0; + pRsp->pTableMeta = NULL; if (pReq->pTableName) { int32_t tbNum = (int32_t)taosArrayGetSize(pReq->pTableName); if (tbNum <= 0) { ctgError("empty table name list, tbNum:%d", tbNum); - CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); + CTG_ERR_JRET(TSDB_CODE_CTG_INVALID_INPUT); } pRsp->pTableMeta = taosArrayInit(tbNum, POINTER_BYTES); if (NULL == pRsp->pTableMeta) { ctgError("taosArrayInit %d failed", tbNum); - CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + CTG_ERR_JRET(TSDB_CODE_CTG_MEM_ERROR); } for (int32_t i = 0; i < tbNum; ++i) { @@ -1840,7 +1917,7 @@ int32_t catalogGetAllMeta(struct SCatalog* pCatalog, void *pTransporter, const S } } - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); _return: @@ -1855,7 +1932,7 @@ _return: pRsp->pTableMeta = NULL; } - CTG_RET(code); + CTG_API_LEAVE(code); } int32_t catalogGetQnodeList(struct SCatalog* pCatalog, void *pRpc, const SEpSet* pMgmtEps, SArray* pQnodeList) { @@ -1863,9 +1940,11 @@ int32_t catalogGetQnodeList(struct SCatalog* pCatalog, void *pRpc, const SEpSet* CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + CTG_API_ENTER(); + //TODO - return TSDB_CODE_SUCCESS; + CTG_API_LEAVE(TSDB_CODE_SUCCESS); } int32_t catalogGetExpiredSTables(struct SCatalog* pCatalog, SSTableMetaVersion **stables, uint32_t *num) { @@ -1873,7 +1952,9 @@ int32_t catalogGetExpiredSTables(struct SCatalog* pCatalog, SSTableMetaVersion * CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } - CTG_RET(ctgMetaRentGet(&pCatalog->stbRent, (void **)stables, num, sizeof(SSTableMetaVersion))); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgMetaRentGet(&pCatalog->stbRent, (void **)stables, num, sizeof(SSTableMetaVersion))); } int32_t catalogGetExpiredDBs(struct SCatalog* pCatalog, SDbVgVersion **dbs, uint32_t *num) { @@ -1881,15 +1962,21 @@ int32_t catalogGetExpiredDBs(struct SCatalog* pCatalog, SDbVgVersion **dbs, uint CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } - CTG_RET(ctgMetaRentGet(&pCatalog->dbRent, (void **)dbs, num, 
sizeof(SDbVgVersion))); + CTG_API_ENTER(); + + CTG_API_LEAVE(ctgMetaRentGet(&pCatalog->dbRent, (void **)dbs, num, sizeof(SDbVgVersion))); } void catalogDestroy(void) { - if (NULL == ctgMgmt.pCluster) { + if (NULL == ctgMgmt.pCluster || atomic_load_8(&ctgMgmt.exit)) { return; } + atomic_store_8(&ctgMgmt.exit, true); + + CTG_LOCK(CTG_WRITE, &ctgMgmt.lock); + SCatalog *pCatalog = NULL; void *pIter = taosHashIterate(ctgMgmt.pCluster, NULL); while (pIter) { @@ -1905,6 +1992,8 @@ void catalogDestroy(void) { taosHashCleanup(ctgMgmt.pCluster); ctgMgmt.pCluster = NULL; + CTG_UNLOCK(CTG_WRITE, &ctgMgmt.lock); + qInfo("catalog destroyed"); } diff --git a/source/libs/catalog/test/catalogTests.cpp b/source/libs/catalog/test/catalogTests.cpp index 0214078287..1284f1d7d3 100644 --- a/source/libs/catalog/test/catalogTests.cpp +++ b/source/libs/catalog/test/catalogTests.cpp @@ -108,6 +108,7 @@ void ctgTestInitLogFile() { const int32_t maxLogFileNum = 10; tsAsyncLog = 0; + qDebugFlag = 159; char temp[128] = {0}; sprintf(temp, "%s/%s", tsLogDir, defaultLogFileNamePrefix); @@ -631,7 +632,7 @@ void *ctgTestGetDbVgroupThread(void *param) { return NULL; } -void *ctgTestSetDbVgroupThread(void *param) { +void *ctgTestSetSameDbVgroupThread(void *param) { struct SCatalog *pCtg = (struct SCatalog *)param; int32_t code = 0; SDBVgroupInfo *dbVgroup = NULL; @@ -655,6 +656,32 @@ void *ctgTestSetDbVgroupThread(void *param) { return NULL; } + +void *ctgTestSetDiffDbVgroupThread(void *param) { + struct SCatalog *pCtg = (struct SCatalog *)param; + int32_t code = 0; + SDBVgroupInfo *dbVgroup = NULL; + int32_t n = 0; + + while (!ctgTestStop) { + ctgTestBuildDBVgroup(&dbVgroup); + code = catalogUpdateDBVgroup(pCtg, ctgTestDbname, ctgTestDbId++, dbVgroup); + if (code) { + assert(0); + } + + if (ctgTestEnableSleep) { + usleep(rand() % 5); + } + if (++n % ctgTestPrintNum == 0) { + printf("Set:%d\n", n); + } + } + + return NULL; +} + + void *ctgTestGetCtableMetaThread(void *param) { struct SCatalog *pCtg = 
(struct SCatalog *)param; int32_t code = 0; @@ -720,6 +747,8 @@ TEST(tableMeta, normalTable) { void *mockPointer = (void *)0x1; SVgroupInfo vgInfo = {0}; + ctgTestInitLogFile(); + ctgTestSetPrepareDbVgroups(); initQueryModuleMsgHandle(); @@ -1285,7 +1314,7 @@ TEST(dbVgroup, getSetDbVgroupCase) { catalogDestroy(); } -TEST(multiThread, getSetDbVgroupCase) { +TEST(multiThread, getSetRmSameDbVgroup) { struct SCatalog *pCtg = NULL; void *mockPointer = (void *)0x1; SVgroupInfo vgInfo = {0}; @@ -1316,10 +1345,10 @@ TEST(multiThread, getSetDbVgroupCase) { pthread_attr_init(&thattr); pthread_t thread1, thread2; - pthread_create(&(thread1), &thattr, ctgTestSetDbVgroupThread, pCtg); + pthread_create(&(thread1), &thattr, ctgTestSetSameDbVgroupThread, pCtg); sleep(1); - pthread_create(&(thread1), &thattr, ctgTestGetDbVgroupThread, pCtg); + pthread_create(&(thread2), &thattr, ctgTestGetDbVgroupThread, pCtg); while (true) { if (ctgTestDeadLoop) { @@ -1336,6 +1365,58 @@ TEST(multiThread, getSetDbVgroupCase) { catalogDestroy(); } +TEST(multiThread, getSetRmDiffDbVgroup) { + struct SCatalog *pCtg = NULL; + void *mockPointer = (void *)0x1; + SVgroupInfo vgInfo = {0}; + SVgroupInfo *pvgInfo = NULL; + SDBVgroupInfo dbVgroup = {0}; + SArray *vgList = NULL; + ctgTestStop = false; + + ctgTestInitLogFile(); + + ctgTestSetPrepareDbVgroups(); + + initQueryModuleMsgHandle(); + + // sendCreateDbMsg(pConn->pTransporter, &pConn->pAppInfo->mgmtEp.epSet); + + int32_t code = catalogInit(NULL); + ASSERT_EQ(code, 0); + + code = catalogGetHandle(ctgTestClusterId, &pCtg); + ASSERT_EQ(code, 0); + + SName n = {.type = TSDB_TABLE_NAME_T, .acctId = 1}; + strcpy(n.dbname, "db1"); + strcpy(n.tname, ctgTestTablename); + + pthread_attr_t thattr; + pthread_attr_init(&thattr); + + pthread_t thread1, thread2; + pthread_create(&(thread1), &thattr, ctgTestSetDiffDbVgroupThread, pCtg); + + sleep(1); + pthread_create(&(thread2), &thattr, ctgTestGetDbVgroupThread, pCtg); + + while (true) { + if (ctgTestDeadLoop) { + 
sleep(1); + } else { + sleep(ctgTestMTRunSec); + break; + } + } + + ctgTestStop = true; + sleep(1); + + catalogDestroy(); +} + + TEST(multiThread, ctableMeta) { struct SCatalog *pCtg = NULL; diff --git a/source/libs/qcom/src/querymsg.c b/source/libs/qcom/src/querymsg.c index 55099d9972..3e14bfca09 100644 --- a/source/libs/qcom/src/querymsg.c +++ b/source/libs/qcom/src/querymsg.c @@ -159,6 +159,7 @@ _return: } static int32_t queryConvertTableMetaMsg(STableMetaRsp* pMetaMsg) { + pMetaMsg->dbId = be64toh(pMetaMsg->dbId); pMetaMsg->numOfTags = ntohl(pMetaMsg->numOfTags); pMetaMsg->numOfColumns = ntohl(pMetaMsg->numOfColumns); pMetaMsg->sversion = ntohl(pMetaMsg->sversion); @@ -258,6 +259,8 @@ int32_t queryProcessTableMetaRsp(void* output, char *msg, int32_t msgSize) { } strcpy(pOut->dbFName, pMetaMsg->dbFName); + + pOut->dbId = pMetaMsg->dbId; if (pMetaMsg->tableType == TSDB_CHILD_TABLE) { SET_META_TYPE_BOTH_TABLE(pOut->metaType); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 55d40ff98c..937d5d035d 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -419,6 +419,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_CTG_NOT_READY, "catalog is not ready" TAOS_DEFINE_ERROR(TSDB_CODE_CTG_MEM_ERROR, "catalog memory error") TAOS_DEFINE_ERROR(TSDB_CODE_CTG_SYS_ERROR, "catalog system error") TAOS_DEFINE_ERROR(TSDB_CODE_CTG_DB_DROPPED, "Database is dropped") +TAOS_DEFINE_ERROR(TSDB_CODE_CTG_OUT_OF_SERVICE, "catalog is out of service") //scheduler TAOS_DEFINE_ERROR(TSDB_CODE_SCH_STATUS_ERROR, "scheduler status error") From e31e83360a2827391e0ef986354f4d8333d20070 Mon Sep 17 00:00:00 2001 From: dapan1121 Date: Wed, 9 Feb 2022 11:28:38 +0800 Subject: [PATCH 10/12] feature/qnode --- source/libs/catalog/src/catalog.c | 2 ++ source/libs/catalog/test/catalogTests.cpp | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/libs/catalog/src/catalog.c b/source/libs/catalog/src/catalog.c index 268322a86e..6ecff87a89 100644 --- 
a/source/libs/catalog/src/catalog.c +++ b/source/libs/catalog/src/catalog.c @@ -1384,6 +1384,8 @@ int32_t catalogInit(SCatalogCfg *cfg) { CTG_ERR_RET(TSDB_CODE_CTG_INVALID_INPUT); } + atomic_store_8(&ctgMgmt.exit, false); + if (cfg) { memcpy(&ctgMgmt.cfg, cfg, sizeof(*cfg)); diff --git a/source/libs/catalog/test/catalogTests.cpp b/source/libs/catalog/test/catalogTests.cpp index 1284f1d7d3..d0f98e3c2a 100644 --- a/source/libs/catalog/test/catalogTests.cpp +++ b/source/libs/catalog/test/catalogTests.cpp @@ -50,7 +50,7 @@ bool ctgTestStop = false; bool ctgTestEnableSleep = false; bool ctgTestDeadLoop = false; int32_t ctgTestPrintNum = 200000; -int32_t ctgTestMTRunSec = 30; +int32_t ctgTestMTRunSec = 5; int32_t ctgTestCurrentVgVersion = 0; int32_t ctgTestVgVersion = 1; @@ -839,6 +839,8 @@ TEST(tableMeta, childTableCase) { void *mockPointer = (void *)0x1; SVgroupInfo vgInfo = {0}; + ctgTestInitLogFile(); + ctgTestSetPrepareDbVgroupsAndChildMeta(); initQueryModuleMsgHandle(); From ec70e160ef8f6652cbe07e0fb0435c0495c1c3d6 Mon Sep 17 00:00:00 2001 From: Minghao Li Date: Wed, 9 Feb 2022 14:13:06 +0800 Subject: [PATCH 11/12] rm old sync code --- source/libs/sync/CMakeLists.txt | 16 - source/libs/sync/inc/raft.h | 146 ------- source/libs/sync/inc/raft_log.h | 76 ---- source/libs/sync/inc/raft_message.h | 237 ---------- source/libs/sync/inc/raft_replication.h | 30 -- source/libs/sync/inc/raft_unstable_log.h | 115 ----- source/libs/sync/inc/syncInt.h | 71 --- source/libs/sync/inc/sync_const.h | 25 -- .../libs/sync/inc/sync_raft_config_change.h | 47 -- source/libs/sync/inc/sync_raft_impl.h | 58 --- source/libs/sync/inc/sync_raft_inflights.h | 70 --- source/libs/sync/inc/sync_raft_node_map.h | 49 --- source/libs/sync/inc/sync_raft_progress.h | 259 ----------- .../sync/inc/sync_raft_progress_tracker.h | 132 ------ source/libs/sync/inc/sync_raft_proto.h | 77 ---- source/libs/sync/inc/sync_raft_quorum.h | 40 -- source/libs/sync/inc/sync_raft_quorum_joint.h | 84 ---- 
.../libs/sync/inc/sync_raft_quorum_majority.h | 36 -- source/libs/sync/inc/sync_raft_restore.h | 33 -- source/libs/sync/inc/sync_type.h | 91 ---- source/libs/sync/src/raft.c | 325 -------------- .../src/raft_handle_append_entries_message.c | 48 -- .../sync/src/raft_handle_election_message.c | 29 -- .../libs/sync/src/raft_handle_vote_message.c | 61 --- .../sync/src/raft_handle_vote_resp_message.c | 60 --- source/libs/sync/src/raft_log.c | 66 --- source/libs/sync/src/raft_message.c | 22 - source/libs/sync/src/raft_replication.c | 110 ----- source/libs/sync/src/raft_unstable_log.c | 23 - source/libs/sync/src/sync.c | 302 ------------- .../libs/sync/src/sync_raft_config_change.c | 409 ------------------ source/libs/sync/src/sync_raft_election.c | 114 ----- source/libs/sync/src/sync_raft_impl.c | 369 ---------------- source/libs/sync/src/sync_raft_inflights.c | 97 ----- source/libs/sync/src/sync_raft_node_map.c | 82 ---- source/libs/sync/src/sync_raft_progress.c | 260 ----------- .../sync/src/sync_raft_progress_tracker.c | 156 ------- source/libs/sync/src/sync_raft_quorum_joint.c | 75 ---- .../libs/sync/src/sync_raft_quorum_majority.c | 121 ------ source/libs/sync/src/sync_raft_restore.c | 180 -------- source/libs/sync/test/raftTests.cpp | 0 41 files changed, 4601 deletions(-) delete mode 100644 source/libs/sync/inc/raft.h delete mode 100644 source/libs/sync/inc/raft_log.h delete mode 100644 source/libs/sync/inc/raft_message.h delete mode 100644 source/libs/sync/inc/raft_replication.h delete mode 100644 source/libs/sync/inc/raft_unstable_log.h delete mode 100644 source/libs/sync/inc/syncInt.h delete mode 100644 source/libs/sync/inc/sync_const.h delete mode 100644 source/libs/sync/inc/sync_raft_config_change.h delete mode 100644 source/libs/sync/inc/sync_raft_impl.h delete mode 100644 source/libs/sync/inc/sync_raft_inflights.h delete mode 100644 source/libs/sync/inc/sync_raft_node_map.h delete mode 100644 source/libs/sync/inc/sync_raft_progress.h delete mode 100644 
source/libs/sync/inc/sync_raft_progress_tracker.h delete mode 100644 source/libs/sync/inc/sync_raft_proto.h delete mode 100644 source/libs/sync/inc/sync_raft_quorum.h delete mode 100644 source/libs/sync/inc/sync_raft_quorum_joint.h delete mode 100644 source/libs/sync/inc/sync_raft_quorum_majority.h delete mode 100644 source/libs/sync/inc/sync_raft_restore.h delete mode 100644 source/libs/sync/inc/sync_type.h delete mode 100644 source/libs/sync/src/raft.c delete mode 100644 source/libs/sync/src/raft_handle_append_entries_message.c delete mode 100644 source/libs/sync/src/raft_handle_election_message.c delete mode 100644 source/libs/sync/src/raft_handle_vote_message.c delete mode 100644 source/libs/sync/src/raft_handle_vote_resp_message.c delete mode 100644 source/libs/sync/src/raft_log.c delete mode 100644 source/libs/sync/src/raft_message.c delete mode 100644 source/libs/sync/src/raft_replication.c delete mode 100644 source/libs/sync/src/raft_unstable_log.c delete mode 100644 source/libs/sync/src/sync.c delete mode 100644 source/libs/sync/src/sync_raft_config_change.c delete mode 100644 source/libs/sync/src/sync_raft_election.c delete mode 100644 source/libs/sync/src/sync_raft_impl.c delete mode 100644 source/libs/sync/src/sync_raft_inflights.c delete mode 100644 source/libs/sync/src/sync_raft_node_map.c delete mode 100644 source/libs/sync/src/sync_raft_progress.c delete mode 100644 source/libs/sync/src/sync_raft_progress_tracker.c delete mode 100644 source/libs/sync/src/sync_raft_quorum_joint.c delete mode 100644 source/libs/sync/src/sync_raft_quorum_majority.c delete mode 100644 source/libs/sync/src/sync_raft_restore.c delete mode 100644 source/libs/sync/test/raftTests.cpp diff --git a/source/libs/sync/CMakeLists.txt b/source/libs/sync/CMakeLists.txt index 37ee5194c8..e69de29bb2 100644 --- a/source/libs/sync/CMakeLists.txt +++ b/source/libs/sync/CMakeLists.txt @@ -1,16 +0,0 @@ -aux_source_directory(src SYNC_SRC) -add_library(sync ${SYNC_SRC}) - 
-target_link_libraries( - sync - PUBLIC common - PUBLIC transport - PUBLIC util - PUBLIC wal -) - -target_include_directories( - sync - PUBLIC "${CMAKE_SOURCE_DIR}/include/libs/sync" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/inc" -) \ No newline at end of file diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h deleted file mode 100644 index 129f0f4dbc..0000000000 --- a/source/libs/sync/inc/raft.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_RAFT_H -#define _TD_LIBS_SYNC_RAFT_H - -#include "sync.h" -#include "sync_type.h" -#include "thash.h" -#include "raft_message.h" -#include "sync_raft_impl.h" -#include "sync_raft_quorum.h" - -typedef struct RaftLeaderState { - -} RaftLeaderState; - -typedef struct RaftCandidateState { - /* true if in pre-vote phase */ - bool inPreVote; -} RaftCandidateState; - -typedef struct SSyncRaftIOMethods { - // send SSyncMessage to node - int (*send)(const SSyncMessage* pMsg, const SNodeInfo* pNode); -} SSyncRaftIOMethods; - -typedef int (*SyncRaftStepFp)(SSyncRaft* pRaft, const SSyncMessage* pMsg); -typedef void (*SyncRaftTickFp)(SSyncRaft* pRaft); - -struct SSyncRaft { - // owner sync node - SSyncNode* pNode; - - // hash map nodeId -> SNodeInfo* - SHashObj* nodeInfoMap; - - SyncNodeId selfId; - SyncGroupId selfGroupId; - - SSyncRaftIOMethods io; - - SSyncFSM fsm; - SSyncLogStore logStore; - SStateManager stateManager; - - union { - RaftLeaderState leaderState; - RaftCandidateState candidateState; - }; - - SyncTerm term; - SyncNodeId voteFor; - - SSyncRaftLog *log; - - uint64_t maxMsgSize; - uint64_t maxUncommittedSize; - SSyncRaftProgressTracker *tracker; - - ESyncState state; - - // isLearner is true if the local raft node is a learner. - bool isLearner; - - /** - * the leader id - **/ - SyncNodeId leaderId; - - /** - * leadTransferee is id of the leader transfer target when its value is not zero. - * Follow the procedure defined in raft thesis 3.10. - **/ - SyncNodeId leadTransferee; - - /** - * Only one conf change may be pending (in the log, but not yet - * applied) at a time. This is enforced via pendingConfIndex, which - * is set to a value >= the log index of the latest pending - * configuration change (if any). Config changes are only allowed to - * be proposed if the leader's applied index is greater than this - * value. - **/ - SyncIndex pendingConfigIndex; - - /** - * an estimate of the size of the uncommitted tail of the Raft log. 
Used to - * prevent unbounded log growth. Only maintained by the leader. Reset on - * term changes. - **/ - uint32_t uncommittedSize; - - /** - * number of ticks since it reached last electionTimeout when it is leader - * or candidate. - * number of ticks since it reached last electionTimeout or received a - * valid message from current leader when it is a follower. - **/ - uint16_t electionElapsed; - - /** - * number of ticks since it reached last heartbeatTimeout. - * only leader keeps heartbeatElapsed. - **/ - uint16_t heartbeatElapsed; - - bool preVote; - bool checkQuorum; - - int heartbeatTimeout; - int electionTimeout; - - /** - * randomizedElectionTimeout is a random number between - * [electiontimeout, 2 * electiontimeout - 1]. It gets reset - * when raft changes its state to follower or candidate. - **/ - int randomizedElectionTimeout; - bool disableProposalForwarding; - - // current tick count since start up - uint32_t currentTick; - - SyncRaftStepFp stepFp; - - SyncRaftTickFp tickFp; -}; - -int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo); -int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg); -int32_t syncRaftTick(SSyncRaft* pRaft); - -#endif /* _TD_LIBS_SYNC_RAFT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h deleted file mode 100644 index 117ed42c2c..0000000000 --- a/source/libs/sync/inc/raft_log.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef _TD_LIBS_SYNC_RAFT_LOG_H -#define _TD_LIBS_SYNC_RAFT_LOG_H - -#include "sync.h" -#include "sync_type.h" - -typedef enum ESyncRaftEntryType { - SYNC_ENTRY_TYPE_LOG = 1, -} ESyncRaftEntryType; - -struct SSyncRaftEntry { - SyncTerm term; - - SyncIndex index; - - ESyncRaftEntryType type; - - SSyncBuffer buffer; -}; - -struct SSyncRaftLog { - SyncIndex uncommittedConfigIndex; - - SyncIndex commitIndex; - - SyncIndex appliedIndex; -}; - -SSyncRaftLog* syncRaftLogOpen(); - -SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog); - -SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog); - -SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog); - -void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex); - -bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term); - -int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog); - -bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog); - -SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index); - -int syncRaftLogAppend(SSyncRaftLog* pLog, SSyncRaftEntry *pEntries, int n); - -int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, - SSyncRaftEntry **ppEntries, int *n); - -void syncRaftLogRelease(SSyncRaftLog* pLog, SyncIndex index, - SSyncRaftEntry *pEntries, int n); - -bool syncRaftLogMatchTerm(); - -static FORCE_INLINE bool syncRaftLogIsCommitted(SSyncRaftLog* pLog, SyncIndex index) { - return pLog->commitIndex > index; -} - -#endif /* _TD_LIBS_SYNC_RAFT_LOG_H */ diff --git a/source/libs/sync/inc/raft_message.h b/source/libs/sync/inc/raft_message.h deleted file mode 100644 index 0d81511756..0000000000 --- a/source/libs/sync/inc/raft_message.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. 
- * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef _TD_LIBS_SYNC_RAFT_MESSAGE_H -#define _TD_LIBS_SYNC_RAFT_MESSAGE_H - -#include "sync.h" -#include "sync_type.h" - -/** - * below define message type which handled by Raft. - * - * internal message, which communicate between threads, start with RAFT_MSG_INTERNAL_*. - * internal message use pointer only and stack memory, need not to be decode/encode and free. - * - * outter message start with RAFT_MSG_*, which communicate between cluster peers, - * need to implement its decode/encode functions. - **/ -typedef enum ESyncRaftMessageType { - // client propose a cmd - RAFT_MSG_INTERNAL_PROP = 1, - - // node election timeout - RAFT_MSG_INTERNAL_ELECTION = 2, - - RAFT_MSG_VOTE = 3, - RAFT_MSG_VOTE_RESP = 4, - - RAFT_MSG_APPEND = 5, - RAFT_MSG_APPEND_RESP = 6, -} ESyncRaftMessageType; - -typedef struct RaftMsgInternal_Prop { - const SSyncBuffer *pBuf; - bool isWeak; - void* pData; -} RaftMsgInternal_Prop; - -typedef struct RaftMsgInternal_Election { - -} RaftMsgInternal_Election; - -typedef struct RaftMsg_Vote { - ESyncRaftElectionType cType; - SyncIndex lastIndex; - SyncTerm lastTerm; -} RaftMsg_Vote; - -typedef struct RaftMsg_VoteResp { - bool rejected; - ESyncRaftElectionType cType; -} RaftMsg_VoteResp; - -typedef struct RaftMsg_Append_Entries { - // index of log entry preceeding new ones - SyncIndex index; - - // term of entry at prevIndex - SyncTerm term; - - // leader's commit index. 
- SyncIndex commitIndex; - - // size of the log entries array - int nEntries; - - // log entries array - SSyncRaftEntry* entries; -} RaftMsg_Append_Entries; - -typedef struct RaftMsg_Append_Resp { - SyncIndex index; -} RaftMsg_Append_Resp; - -typedef struct SSyncMessage { - ESyncRaftMessageType msgType; - SyncTerm term; - SyncGroupId groupId; - SyncNodeId from; - - union { - RaftMsgInternal_Prop propose; - - RaftMsgInternal_Election election; - - RaftMsg_Vote vote; - RaftMsg_VoteResp voteResp; - - RaftMsg_Append_Entries appendEntries; - RaftMsg_Append_Resp appendResp; - }; -} SSyncMessage; - -static FORCE_INLINE SSyncMessage* syncInitPropMsg(SSyncMessage* pMsg, const SSyncBuffer* pBuf, void* pData, bool isWeak) { - *pMsg = (SSyncMessage) { - .msgType = RAFT_MSG_INTERNAL_PROP, - .term = 0, - .propose = (RaftMsgInternal_Prop) { - .isWeak = isWeak, - .pBuf = pBuf, - .pData = pData, - }, - }; - - return pMsg; -} - -static FORCE_INLINE SSyncMessage* syncInitElectionMsg(SSyncMessage* pMsg, SyncNodeId from) { - *pMsg = (SSyncMessage) { - .msgType = RAFT_MSG_INTERNAL_ELECTION, - .term = 0, - .from = from, - .election = (RaftMsgInternal_Election) { - - }, - }; - - return pMsg; -} - -static FORCE_INLINE SSyncMessage* syncNewVoteMsg(SyncGroupId groupId, SyncNodeId from, - SyncTerm term, ESyncRaftElectionType cType, - SyncIndex lastIndex, SyncTerm lastTerm) { - SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); - if (pMsg == NULL) { - return NULL; - } - *pMsg = (SSyncMessage) { - .groupId = groupId, - .from = from, - .term = term, - .msgType = RAFT_MSG_VOTE, - .vote = (RaftMsg_Vote) { - .cType = cType, - .lastIndex = lastIndex, - .lastTerm = lastTerm, - }, - }; - - return pMsg; -} - -static FORCE_INLINE SSyncMessage* syncNewVoteRespMsg(SyncGroupId groupId, SyncNodeId from, - ESyncRaftElectionType cType, bool rejected) { - SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); - if (pMsg == NULL) { - return NULL; - } - *pMsg = (SSyncMessage) { - 
.groupId = groupId, - .from = from, - .msgType = RAFT_MSG_VOTE_RESP, - .voteResp = (RaftMsg_VoteResp) { - .cType = cType, - .rejected = rejected, - }, - }; - - return pMsg; -} - -static FORCE_INLINE SSyncMessage* syncNewAppendMsg(SyncGroupId groupId, SyncNodeId from, - SyncTerm term, SyncIndex logIndex, SyncTerm logTerm, - SyncIndex commitIndex, int nEntries, SSyncRaftEntry* entries) { - SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); - if (pMsg == NULL) { - return NULL; - } - *pMsg = (SSyncMessage) { - .groupId = groupId, - .from = from, - .term = term, - .msgType = RAFT_MSG_APPEND, - .appendEntries = (RaftMsg_Append_Entries) { - .index = logIndex, - .term = logTerm, - .commitIndex = commitIndex, - .nEntries = nEntries, - .entries = entries, - }, - }; - - return pMsg; -} - -static FORCE_INLINE SSyncMessage* syncNewEmptyAppendRespMsg(SyncGroupId groupId, SyncNodeId from, SyncTerm term) { - SSyncMessage* pMsg = (SSyncMessage*)malloc(sizeof(SSyncMessage)); - if (pMsg == NULL) { - return NULL; - } - *pMsg = (SSyncMessage) { - .groupId = groupId, - .from = from, - .term = term, - .msgType = RAFT_MSG_APPEND_RESP, - .appendResp = (RaftMsg_Append_Resp) { - - }, - }; - - return pMsg; -} - -static FORCE_INLINE bool syncIsInternalMsg(ESyncRaftMessageType msgType) { - return msgType == RAFT_MSG_INTERNAL_PROP || - msgType == RAFT_MSG_INTERNAL_ELECTION; -} - -static FORCE_INLINE bool syncIsPreVoteRespMsg(const SSyncMessage* pMsg) { - return pMsg->msgType == RAFT_MSG_VOTE_RESP && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; -} - -static FORCE_INLINE bool syncIsPreVoteMsg(const SSyncMessage* pMsg) { - return pMsg->msgType == RAFT_MSG_VOTE && pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION; -} - -void syncFreeMessage(const SSyncMessage* pMsg); - -// message handlers -int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -int 
syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); - -#endif /* _TD_LIBS_SYNC_RAFT_MESSAGE_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/raft_replication.h b/source/libs/sync/inc/raft_replication.h deleted file mode 100644 index 180a2db61f..0000000000 --- a/source/libs/sync/inc/raft_replication.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_REPLICATION_H -#define TD_SYNC_RAFT_REPLICATION_H - -#include "sync.h" -#include "syncInt.h" -#include "sync_type.h" - -// syncRaftMaybeSendAppend sends an append RPC with new entries to the given peer, -// if necessary. Returns true if a message was sent. The sendIfEmpty -// argument controls whether messages with no entries will be sent -// ("empty" messages are useful to convey updated Commit indexes, but -// are undesirable when we're sending multiple messages in a batch). -bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty); - -#endif /* TD_SYNC_RAFT_REPLICATION_H */ diff --git a/source/libs/sync/inc/raft_unstable_log.h b/source/libs/sync/inc/raft_unstable_log.h deleted file mode 100644 index 0748a425a1..0000000000 --- a/source/libs/sync/inc/raft_unstable_log.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. 
- * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_UNSTABLE_LOG_H -#define TD_SYNC_RAFT_UNSTABLE_LOG_H - -#include "sync_type.h" - -/* in-memory unstable raft log storage */ -struct SSyncRaftUnstableLog { -#if 0 - /* Circular buffer of log entries */ - RaftEntry *entries; - - /* size of Circular buffer */ - int size; - - /* Indexes of used slots [front, back) */ - int front, back; - - /* Index of first entry is offset + 1 */ - SyncIndex offset; - - /* meta data of snapshot */ - SSyncRaftUnstableLog snapshot; -#endif -}; - -/** - * return index of last in memory log, return 0 if log is empty - **/ -//SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog); - -#if 0 -void raftLogInit(RaftLog* pLog); - -void raftLogClose(RaftLog* pLog); - -/** - * When startup populating log entrues loaded from disk, - * init raft memory log with snapshot index,term and log start idnex. - **/ -/* -void raftLogStart(RaftLog* pLog, - RaftSnapshotMeta snapshot, - SyncIndex startIndex); -*/ -/** - * Get the number of entries the log. - **/ -int raftLogNumEntries(const RaftLog* pLog); - - - -/** - * return last term of in memory log, return 0 if log is empty - **/ -SyncTerm raftLogLastTerm(RaftLog* pLog); - -/** - * return term of log with the given index, return 0 if the term of index cannot be found - * , errCode will save the error code. 
- **/ -SyncTerm raftLogTermOf(RaftLog* pLog, SyncIndex index, RaftCode* errCode); - -/** - * Get the last index of the most recent snapshot. Return 0 if there are no * - * snapshots. - **/ -SyncIndex raftLogSnapshotIndex(RaftLog* pLog); - -/* Append a new entry to the log. */ -int raftLogAppend(RaftLog* pLog, - SyncTerm term, - const SSyncBuffer *buf); - -/** - * acquire log from given index onwards. - **/ -/* -int raftLogAcquire(RaftLog* pLog, - SyncIndex index, - RaftEntry **ppEntries, - int *n); - -void raftLogRelease(RaftLog* pLog, - SyncIndex index, - RaftEntry *pEntries, - int n); -*/ -/* Delete all entries from the given index (included) onwards. */ -void raftLogTruncate(RaftLog* pLog, SyncIndex index); - -/** - * when taking a new snapshot, the function will update the last snapshot information and delete - * all entries up last_index - trailing (included). If the log contains no entry - * a last_index - trailing, then no entry will be deleted. - **/ -void raftLogSnapshot(RaftLog* pLog, SyncIndex index, SyncIndex trailing); - -#endif - -#endif /* TD_SYNC_RAFT_UNSTABLE_LOG_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h deleted file mode 100644 index f99fb066ae..0000000000 --- a/source/libs/sync/inc/syncInt.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_INT_H -#define _TD_LIBS_SYNC_INT_H - -#include "thash.h" -#include "os.h" -#include "sync.h" -#include "sync_type.h" -#include "raft.h" -#include "tlog.h" - -#define TAOS_SYNC_MAX_WORKER 3 - -typedef struct SSyncWorker { - pthread_t thread; -} SSyncWorker; - -struct SSyncNode { - pthread_mutex_t mutex; - int32_t refCount; - SyncGroupId vgId; - SSyncRaft raft; - void* syncTimer; -}; - -typedef struct SSyncManager { - pthread_mutex_t mutex; - - // sync server rpc - void* serverRpc; - // rpc server hash table base on FQDN:port key - SHashObj* rpcServerTable; - - // sync client rpc - void* clientRpc; - - // worker threads - SSyncWorker worker[TAOS_SYNC_MAX_WORKER]; - - // vgroup hash table - SHashObj* vgroupTable; - - // timer manager - void* syncTimerManager; - -} SSyncManager; - -extern SSyncManager* gSyncManager; - -#define syncFatal(...) do { if (sDebugFlag & DEBUG_FATAL) { taosPrintLog("SYNC FATAL ", 255, __VA_ARGS__); }} while(0) -#define syncError(...) do { if (sDebugFlag & DEBUG_ERROR) { taosPrintLog("SYNC ERROR ", 255, __VA_ARGS__); }} while(0) -#define syncWarn(...) do { if (sDebugFlag & DEBUG_WARN) { taosPrintLog("SYNC WARN ", 255, __VA_ARGS__); }} while(0) -#define syncInfo(...) do { if (sDebugFlag & DEBUG_INFO) { taosPrintLog("SYNC ", 255, __VA_ARGS__); }} while(0) -#define syncDebug(...) do { if (sDebugFlag & DEBUG_DEBUG) { taosPrintLog("SYNC ", sDebugFlag, __VA_ARGS__); }} while(0) -#define syncTrace(...) do { if (sDebugFlag & DEBUG_TRACE) { taosPrintLog("SYNC ", sDebugFlag, __VA_ARGS__); }} while(0) - -#endif /* _TD_LIBS_SYNC_INT_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_const.h b/source/libs/sync/inc/sync_const.h deleted file mode 100644 index b49c17f82e..0000000000 --- a/source/libs/sync/inc/sync_const.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. 
- * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef _TD_LIBS_SYNC_CONST_H -#define _TD_LIBS_SYNC_CONST_H - -#include "sync.h" - -static int kSyncRaftMaxInflghtMsgs = 20; - -static SyncIndex kMaxCommitIndex = UINT64_MAX; - -#endif /* _TD_LIBS_SYNC_CONST_H */ diff --git a/source/libs/sync/inc/sync_raft_config_change.h b/source/libs/sync/inc/sync_raft_config_change.h deleted file mode 100644 index 75a29f35e8..0000000000 --- a/source/libs/sync/inc/sync_raft_config_change.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_CONFIG_CHANGE_H -#define TD_SYNC_RAFT_CONFIG_CHANGE_H - -#include "sync_type.h" -#include "sync_raft_proto.h" - -/** - * Changer facilitates configuration changes. It exposes methods to handle - * simple and joint consensus while performing the proper validation that allows - * refusing invalid configuration changes before they affect the active - * configuration. 
- **/ -struct SSyncRaftChanger { - SSyncRaftProgressTracker* tracker; - SyncIndex lastIndex; -}; - -typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); - -// Simple carries out a series of configuration changes that (in aggregate) -// mutates the incoming majority config Voters[0] by at most one. This method -// will return an error if that is not the case, if the resulting quorum is -// zero, or if the configuration is in a joint state (i.e. if there is an -// outgoing configuration). -int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); - -int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); - -#endif /* TD_SYNC_RAFT_CONFIG_CHANGE_H */ diff --git a/source/libs/sync/inc/sync_raft_impl.h b/source/libs/sync/inc/sync_raft_impl.h deleted file mode 100644 index 1a6c13f65f..0000000000 --- a/source/libs/sync/inc/sync_raft_impl.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_RAFT_IMPL_H -#define _TD_LIBS_SYNC_RAFT_IMPL_H - -#include "sync.h" -#include "sync_type.h" -#include "raft_message.h" -#include "sync_raft_quorum.h" - -void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId); -void syncRaftBecomePreCandidate(SSyncRaft* pRaft); -void syncRaftBecomeCandidate(SSyncRaft* pRaft); -void syncRaftBecomeLeader(SSyncRaft* pRaft); - -void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType); - -void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType); - -void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); - -void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); -bool syncRaftIsPromotable(SSyncRaft* pRaft); -bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft); -int syncRaftQuorum(SSyncRaft* pRaft); - -bool syncRaftMaybeCommit(SSyncRaft* pRaft); - -ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id, - bool preVote, bool accept, - int* rejectNum, int *granted); - -static FORCE_INLINE bool syncRaftIsEmptyServerState(const SSyncServerState* serverState) { - return serverState->commitIndex == 0 && - serverState->term == SYNC_NON_TERM && - serverState->voteFor == SYNC_NON_NODE_ID; -} - -void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState); - -void syncRaftBroadcastAppend(SSyncRaft* pRaft); - -SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id); - -#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */ diff --git a/source/libs/sync/inc/sync_raft_inflights.h b/source/libs/sync/inc/sync_raft_inflights.h deleted file mode 100644 index 627bf9a26f..0000000000 --- a/source/libs/sync/inc/sync_raft_inflights.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_INFLIGHTS_H -#define TD_SYNC_RAFT_INFLIGHTS_H - -#include "sync.h" - -// Inflights limits the number of MsgApp (represented by the largest index -// contained within) sent to followers but not yet acknowledged by them. Callers -// use Full() to check whether more messages can be sent, call Add() whenever -// they are sending a new append, and release "quota" via FreeLE() whenever an -// ack is received. -typedef struct SSyncRaftInflights { - // the starting index in the buffer - int start; - - // number of inflights in the buffer - int count; - - // the size of the buffer - int size; - - // buffer contains the index of the last entry - // inside one message. - SyncIndex* buffer; -} SSyncRaftInflights; - -SSyncRaftInflights* syncRaftOpenInflights(int size); -void syncRaftCloseInflights(SSyncRaftInflights*); - -// reset frees all inflights. -static FORCE_INLINE void syncRaftInflightReset(SSyncRaftInflights* inflights) { - inflights->count = 0; - inflights->start = 0; -} - -// Full returns true if no more messages can be sent at the moment. -static FORCE_INLINE bool syncRaftInflightFull(SSyncRaftInflights* inflights) { - return inflights->count == inflights->size; -} - -// Add notifies the Inflights that a new message with the given index is being -// dispatched. Full() must be called prior to Add() to verify that there is room -// for one more message, and consecutive calls to add Add() must provide a -// monotonic sequence of indexes. -void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); - -// FreeLE frees the inflights smaller or equal to the given `to` flight. 
-void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex); - -/** - * syncRaftInflightFreeFirstOne releases the first inflight. - * This is a no-op if nothing is inflight. - **/ -void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights); - -#endif /* TD_SYNC_RAFT_INFLIGHTS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h deleted file mode 100644 index b4cf04056d..0000000000 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_RAFT_NODE_MAP_H -#define _TD_LIBS_SYNC_RAFT_NODE_MAP_H - -#include "thash.h" -#include "sync.h" -#include "sync_type.h" - -struct SSyncRaftNodeMap { - SHashObj* nodeIdMap; -}; - -void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap); -void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap); - -void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap); - -bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); - -void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to); - -void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); - -void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); - -void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); - -int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap); - -// return true if reach the end -bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId); - -bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap); - -#endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h deleted file mode 100644 index 32c21281cd..0000000000 --- a/source/libs/sync/inc/sync_raft_progress.h +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef TD_SYNC_RAFT_PROGRESS_H -#define TD_SYNC_RAFT_PROGRESS_H - -#include "sync_type.h" -#include "sync_raft_inflights.h" -#include "thash.h" - -/** - * State defines how the leader should interact with the follower. - * - * When in PROGRESS_STATE_PROBE, leader sends at most one replication message - * per heartbeat interval. It also probes actual progress of the follower. - * - * When in PROGRESS_STATE_REPLICATE, leader optimistically increases next - * to the latest entry sent after sending replication message. This is - * an optimized state for fast replicating log entries to the follower. - * - * When in PROGRESS_STATE_SNAPSHOT, leader should have sent out snapshot - * before and stops sending any replication message. - * - * PROGRESS_STATE_PROBE is the initial state. - **/ -typedef enum ESyncRaftProgressState { - /** - * StateProbe indicates a follower whose last index isn't known. Such a - * follower is "probed" (i.e. an append sent periodically) to narrow down - * its last index. In the ideal (and common) case, only one round of probing - * is necessary as the follower will react with a hint. Followers that are - * probed over extended periods of time are often offline. - **/ - PROGRESS_STATE_PROBE = 0, - - /** - * StateReplicate is the state steady in which a follower eagerly receives - * log entries to append to its log. - **/ - PROGRESS_STATE_REPLICATE, - - /** - * StateSnapshot indicates a follower that needs log entries not available - * from the leader's Raft log. Such a follower needs a full snapshot to - * return to StateReplicate. - **/ - PROGRESS_STATE_SNAPSHOT, -} ESyncRaftProgressState; - -static const char* kProgressStateString[] = { - "Probe", - "Replicate", - "Snapshot", -}; - -// Progress represents a follower’s progress in the view of the leader. Leader -// maintains progresses of all followers, and sends entries to the follower -// based on its progress. 
-// -// NB(tbg): Progress is basically a state machine whose transitions are mostly -// strewn around `*raft.raft`. Additionally, some fields are only used when in a -// certain State. All of this isn't ideal. -struct SSyncRaftProgress { - SyncGroupId groupId; - - SyncNodeId id; - - int16_t refCount; - - SyncIndex nextIndex; - - SyncIndex matchIndex; - - // State defines how the leader should interact with the follower. - // - // When in StateProbe, leader sends at most one replication message - // per heartbeat interval. It also probes actual progress of the follower. - // - // When in StateReplicate, leader optimistically increases next - // to the latest entry sent after sending replication message. This is - // an optimized state for fast replicating log entries to the follower. - // - // When in StateSnapshot, leader should have sent out snapshot - // before and stops sending any replication message. - ESyncRaftProgressState state; - - // PendingSnapshot is used in StateSnapshot. - // If there is a pending snapshot, the pendingSnapshot will be set to the - // index of the snapshot. If pendingSnapshot is set, the replication process of - // this Progress will be paused. raft will not resend snapshot until the pending one - // is reported to be failed. - SyncIndex pendingSnapshotIndex; - - // RecentActive is true if the progress is recently active. Receiving any messages - // from the corresponding follower indicates the progress is active. - // RecentActive can be reset to false after an election timeout. - // - // TODO(tbg): the leader should always have this set to true. - bool recentActive; - - // ProbeSent is used while this follower is in StateProbe. When ProbeSent is - // true, raft should pause sending replication message to this peer until - // ProbeSent is reset. See ProbeAcked() and IsPaused(). - bool probeSent; - - // Inflights is a sliding window for the inflight messages. - // Each inflight message contains one or more log entries. 
- // The max number of entries per message is defined in raft config as MaxSizePerMsg. - // Thus inflight effectively limits both the number of inflight messages - // and the bandwidth each Progress can use. - // When inflights is Full, no more message should be sent. - // When a leader sends out a message, the index of the last - // entry should be added to inflights. The index MUST be added - // into inflights in order. - // When a leader receives a reply, the previous inflights should - // be freed by calling inflights.FreeLE with the index of the last - // received entry. - SSyncRaftInflights* inflights; - - // IsLearner is true if this progress is tracked for a learner. - bool isLearner; -}; - -struct SSyncRaftProgressMap { - // map nodeId -> SSyncRaftProgress* - SHashObj* progressMap; -}; - -static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) { - return kProgressStateString[progress->state]; -} - -void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress); - -// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or, -// optionally and if larger, the index of the pending snapshot. -void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress); - -// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1. -void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress); - -// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the -// index acked by it. The method returns false if the given n index comes from -// an outdated message. Otherwise it updates the progress and returns true. -bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex); - -// OptimisticUpdate signals that appends all the way up to and including index n -// are in-flight. As a result, Next is increased to n+1. 
-static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { - progress->nextIndex = nextIndex + 1; -} - -// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The -// arguments are the index of the append message rejected by the follower, and -// the hint that we want to decrease to. -// -// Rejections can happen spuriously as messages are sent out of order or -// duplicated. In such cases, the rejection pertains to an index that the -// Progress already knows were previously acknowledged, and false is returned -// without changing the Progress. -// -// If the rejection is genuine, Next is lowered sensibly, and the Progress is -// cleared for sending log entries. -bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, - SyncIndex rejected, SyncIndex matchHint); - -// IsPaused returns whether sending log entries to this node has been throttled. -// This is done when a node has rejected recent MsgApps, is currently waiting -// for a snapshot, or has reached the MaxInflightMsgs limit. In normal -// operation, this is false. A throttled node will be contacted less frequently -// until it has reached a state in which it's able to accept a steady stream of -// log entries again. 
-bool syncRaftProgressIsPaused(SSyncRaftProgress* progress); - -static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) { - return progress->nextIndex; -} - -static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInReplicate(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_STATE_REPLICATE; -} - -static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInSnapshot(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_STATE_SNAPSHOT; -} - -static FORCE_INLINE ESyncRaftProgressState syncRaftProgressInProbe(SSyncRaftProgress* progress) { - return progress->state == PROGRESS_STATE_PROBE; -} - -static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progress) { - return progress->recentActive; -} - -void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap); -void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap); - -void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap); -void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to); - -SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id); - -int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress); - -void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); - -bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); - -/** - * return true if progress's log is up-todate - **/ -bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress); - -// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending -// snapshot index. 
-void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); - -void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to); - -// return true if reach the end -bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress); - -bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg); - -#if 0 - -void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); - - - -SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i); - -void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i); - -void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i); - -bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i); - -void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i); - - - -void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i); - -#endif - -#endif /* TD_SYNC_RAFT_PROGRESS_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h deleted file mode 100644 index 0a3c7dd6fc..0000000000 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H -#define _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H - -#include "sync_type.h" -#include "sync_raft_quorum.h" -#include "sync_raft_quorum_joint.h" -#include "sync_raft_progress.h" -#include "sync_raft_proto.h" -#include "thash.h" - -// Config reflects the configuration tracked in a ProgressTracker. -struct SSyncRaftProgressTrackerConfig { - SSyncRaftQuorumJointConfig voters; - - // autoLeave is true if the configuration is joint and a transition to the - // incoming configuration should be carried out automatically by Raft when - // this is possible. If false, the configuration will be joint until the - // application initiates the transition manually. - bool autoLeave; - - // Learners is a set of IDs corresponding to the learners active in the - // current configuration. - // - // Invariant: Learners and Voters does not intersect, i.e. if a peer is in - // either half of the joint config, it can't be a learner; if it is a - // learner it can't be in either half of the joint config. This invariant - // simplifies the implementation since it allows peers to have clarity about - // its current role without taking into account joint consensus. - SSyncRaftNodeMap learners; - - // When we turn a voter into a learner during a joint consensus transition, - // we cannot add the learner directly when entering the joint state. This is - // because this would violate the invariant that the intersection of - // voters and learners is empty. For example, assume a Voter is removed and - // immediately re-added as a learner (or in other words, it is demoted): - // - // Initially, the configuration will be - // - // voters: {1 2 3} - // learners: {} - // - // and we want to demote 3. Entering the joint configuration, we naively get - // - // voters: {1 2} & {1 2 3} - // learners: {3} - // - // but this violates the invariant (3 is both voter and learner). 
Instead, - // we get - // - // voters: {1 2} & {1 2 3} - // learners: {} - // next_learners: {3} - // - // Where 3 is now still purely a voter, but we are remembering the intention - // to make it a learner upon transitioning into the final configuration: - // - // voters: {1 2} - // learners: {3} - // next_learners: {} - // - // Note that next_learners is not used while adding a learner that is not - // also a voter in the joint config. In this case, the learner is added - // right away when entering the joint configuration, so that it is caught up - // as soon as possible. - SSyncRaftNodeMap learnersNext; -}; - -struct SSyncRaftProgressTracker { - SSyncRaftProgressTrackerConfig config; - - SSyncRaftProgressMap progressMap; - - // nodeid -> ESyncRaftVoteType map - SHashObj* votesMap; - - int maxInflightMsgs; - - SSyncRaft* pRaft; -}; - -SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft); - -void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config); -void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); - -void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); - -// ResetVotes prepares for a new round of vote counting via recordVote. -void syncRaftResetVotes(SSyncRaftProgressTracker*); - -void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); - -// RecordVote records that the node with the given id voted for this Raft -// instance if v == true (and declined it otherwise). -void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant); - -void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to); - -int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); - -// TallyVotes returns the number of granted and rejected Votes, and whether the -// election outcome is known. 
-ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted); - -void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs); - -// Committed returns the largest log index known to be committed based on what -// the voting members of the group have acknowledged. -SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker); - -// QuorumActive returns true if the quorum is active from the view of the local -// raft state machine. Otherwise, it returns false. -bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker); - -bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); - -#endif /* _TD_LIBS_SYNC_RAFT_PROGRESS_TRACKER_H */ diff --git a/source/libs/sync/inc/sync_raft_proto.h b/source/libs/sync/inc/sync_raft_proto.h deleted file mode 100644 index 29371e328d..0000000000 --- a/source/libs/sync/inc/sync_raft_proto.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_PROTO_H -#define TD_SYNC_RAFT_PROTO_H - -#include "sync_type.h" -#include "sync_raft_node_map.h" - -typedef enum ESyncRaftConfChangeType { - SYNC_RAFT_Conf_AddNode = 0, - SYNC_RAFT_Conf_RemoveNode = 1, - SYNC_RAFT_Conf_UpdateNode = 2, - SYNC_RAFT_Conf_AddLearnerNode = 3, -} ESyncRaftConfChangeType; - -// ConfChangeSingle is an individual configuration change operation. 
Multiple -// such operations can be carried out atomically via a ConfChangeV2. -typedef struct SSyncConfChangeSingle { - ESyncRaftConfChangeType type; - SyncNodeId nodeId; -} SSyncConfChangeSingle; - -typedef struct SSyncConfChangeSingleArray { - int n; - SSyncConfChangeSingle* changes; -} SSyncConfChangeSingleArray; - -typedef struct SSyncConfigState { - // The voters in the incoming config. (If the configuration is not joint, - // then the outgoing config is empty). - SSyncRaftNodeMap voters; - - // The learners in the incoming config. - SSyncRaftNodeMap learners; - - // The voters in the outgoing config. - SSyncRaftNodeMap votersOutgoing; - - // The nodes that will become learners when the outgoing config is removed. - // These nodes are necessarily currently in nodes_joint (or they would have - // been added to the incoming config right away). - SSyncRaftNodeMap learnersNext; - - // If set, the config is joint and Raft will automatically transition into - // the final config (i.e. remove the outgoing config) when this is safe. - bool autoLeave; -} SSyncConfigState; - -static FORCE_INLINE bool syncRaftConfArrayIsEmpty(const SSyncConfChangeSingleArray* ary) { - return ary->n == 0; -} - -static FORCE_INLINE void syncRaftInitConfArray(SSyncConfChangeSingleArray* ary) { - *ary = (SSyncConfChangeSingleArray) { - .changes = NULL, - .n = 0, - }; -} - -static FORCE_INLINE void syncRaftFreeConfArray(SSyncConfChangeSingleArray* ary) { - if (ary->changes != NULL) free(ary->changes); -} - -#endif /* TD_SYNC_RAFT_PROTO_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum.h b/source/libs/sync/inc/sync_raft_quorum.h deleted file mode 100644 index 16ac1cd029..0000000000 --- a/source/libs/sync/inc/sync_raft_quorum.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. 
- * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_QUORUM_H -#define TD_SYNC_RAFT_QUORUM_H - -/** - * ESyncRaftVoteResult indicates the outcome of a vote. - **/ -typedef enum { - /** - * SYNC_RAFT_VOTE_PENDING indicates that the decision of the vote depends on future - * votes, i.e. neither "yes" or "no" has reached quorum yet. - **/ - SYNC_RAFT_VOTE_PENDING = 1, - - /** - * SYNC_RAFT_VOTE_LOST indicates that the quorum has voted "no". - **/ - SYNC_RAFT_VOTE_LOST = 2, - - /** - * SYNC_RAFT_VOTE_WON indicates that the quorum has voted "yes". - **/ - SYNC_RAFT_VOTE_WON = 3, -} ESyncRaftVoteResult; - -#endif /* TD_SYNC_RAFT_QUORUM_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h deleted file mode 100644 index 9d5f10ab51..0000000000 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H -#define _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H - -#include "taosdef.h" -#include "sync.h" -#include "sync_type.h" -#include "sync_raft_node_map.h" -#include "thash.h" - -// JointConfig is a configuration of two groups of (possibly overlapping) -// majority configurations. Decisions require the support of both majorities. -typedef struct SSyncRaftQuorumJointConfig { - SSyncRaftNodeMap outgoing; - SSyncRaftNodeMap incoming; -} SSyncRaftQuorumJointConfig; - -// IDs returns a newly initialized map representing the set of voters present -// in the joint configuration. -void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap); - -// CommittedIndex returns the largest committed index for the given joint -// quorum. An index is jointly committed if it is committed in both constituent -// majorities. -SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg); - -// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns -// a result indicating whether the vote is pending, lost, or won. A joint quorum -// requires both majority quorums to vote in favor. 
-ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap); - -void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config); - -static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - return syncRaftIsInNodeMap(&config->outgoing, id); -} - -static FORCE_INLINE bool syncRaftJointConfigInIncoming(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - return syncRaftIsInNodeMap(&config->incoming, id); -} - -void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id); - -void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id); - -static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigIncoming(const SSyncRaftQuorumJointConfig* config) { - return &config->incoming; -} - -static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SSyncRaftQuorumJointConfig* config) { - return &config->outgoing; -} - -static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) { - syncRaftClearNodeMap(&config->outgoing); -} - -static FORCE_INLINE bool syncRaftJointConfigIsIncomingEmpty(const SSyncRaftQuorumJointConfig* config) { - return syncRaftNodeMapSize(&config->incoming) == 0; -} - -static FORCE_INLINE bool syncRaftJointConfigIsOutgoingEmpty(const SSyncRaftQuorumJointConfig* config) { - return syncRaftNodeMapSize(&config->outgoing) == 0; -} - -static FORCE_INLINE bool syncRaftJointConfigIsInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - return syncRaftIsInNodeMap(&config->outgoing, id); -} - -#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum_majority.h b/source/libs/sync/inc/sync_raft_quorum_majority.h deleted file mode 100644 index 399bd71db8..0000000000 --- a/source/libs/sync/inc/sync_raft_quorum_majority.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, 
Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H -#define _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H - -#include "sync.h" -#include "sync_type.h" -#include "sync_raft_quorum.h" -#include "thash.h" - -/** - * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns - * a result indicating whether the vote is pending (i.e. neither a quorum of - * yes/no has been reached), won (a quorum of yes has been reached), or lost (a - * quorum of no has been reached). - **/ -ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap); - -// CommittedIndex computes the committed index from those supplied via the -// provided AckedIndexer (for the active config). -SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg); - -#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */ diff --git a/source/libs/sync/inc/sync_raft_restore.h b/source/libs/sync/inc/sync_raft_restore.h deleted file mode 100644 index df4448cab8..0000000000 --- a/source/libs/sync/inc/sync_raft_restore.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#ifndef TD_SYNC_RAFT_RESTORE_H -#define TD_SYNC_RAFT_RESTORE_H - -#include "sync_type.h" -#include "sync_raft_proto.h" - -// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and -// runs a sequence of changes enacting the configuration described in the -// ConfState. -// -// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure -// the Changer only needs a ProgressMap (not a whole Tracker) at which point -// this can just take LastIndex and MaxInflight directly instead and cook up -// the results from that alone. -int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); - -#endif /* TD_SYNC_RAFT_RESTORE_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h deleted file mode 100644 index c5c4cc3a76..0000000000 --- a/source/libs/sync/inc/sync_type.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_TYPE_H -#define _TD_LIBS_SYNC_TYPE_H - -#include -#include "sync.h" -#include "osMath.h" - -#define SYNC_NON_NODE_ID -1 -#define SYNC_NON_TERM 0 - -typedef int32_t SyncTime; -typedef uint32_t SyncTick; - -typedef struct SSyncRaft SSyncRaft; - -typedef struct SSyncRaftProgress SSyncRaftProgress; -typedef struct SSyncRaftProgressMap SSyncRaftProgressMap; -typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig; - -typedef struct SSyncRaftNodeMap SSyncRaftNodeMap; - -typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker; - -typedef struct SSyncRaftChanger SSyncRaftChanger; - -typedef struct SSyncRaftLog SSyncRaftLog; - -typedef struct SSyncRaftEntry SSyncRaftEntry; - -#if 0 -#ifndef TMIN -#define TMIN(x, y) (((x) < (y)) ? (x) : (y)) -#endif - -#ifndef TMAX -#define TMAX(x, y) (((x) > (y)) ? (x) : (y)) -#endif -#endif - - -typedef struct SSyncServerState { - SyncNodeId voteFor; - SyncTerm term; - SyncIndex commitIndex; -} SSyncServerState; - -typedef struct SSyncClusterConfig { - // Log index number of current cluster config. - SyncIndex index; - - // Log index number of previous cluster config. 
- SyncIndex prevIndex; - - // current cluster - const SSyncCluster* cluster; -} SSyncClusterConfig; - -typedef enum { - SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, - SYNC_RAFT_CAMPAIGN_ELECTION = 1, - SYNC_RAFT_CAMPAIGN_TRANSFER = 2, -} ESyncRaftElectionType; - -typedef enum { - // grant the vote request - SYNC_RAFT_VOTE_RESP_GRANT = 1, - - // reject the vote request - SYNC_RAFT_VOTE_RESP_REJECT = 2, -} ESyncRaftVoteType; - -typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg); - -typedef void (*matchAckIndexerFp)(SyncNodeId id, void* arg, SyncIndex* index); - -#endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c deleted file mode 100644 index 72b0d268a8..0000000000 --- a/source/libs/sync/src/raft.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft.h" -#include "sync_raft_impl.h" -#include "raft_log.h" -#include "sync_raft_restore.h" -#include "raft_replication.h" -#include "sync_raft_config_change.h" -#include "sync_raft_progress_tracker.h" -#include "syncInt.h" - -#define RAFT_READ_LOG_MAX_NUM 100 - -static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n); -static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n); - -static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config, - const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs); - -static void abortLeaderTransfer(SSyncRaft* pRaft); - -static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); - -int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { - SSyncNode* pNode = pRaft->pNode; - SSyncServerState serverState; - SSyncConfigState confState; - SStateManager* stateManager; - SSyncLogStore* logStore; - SSyncFSM* fsm; - SSyncBuffer buffer[RAFT_READ_LOG_MAX_NUM]; - int nBuf, limit, i; - char* buf; - int n; - SSyncRaftChanger changer; - - memset(pRaft, 0, sizeof(SSyncRaft)); - - memcpy(&pRaft->fsm, &pInfo->fsm, sizeof(SSyncFSM)); - memcpy(&pRaft->logStore, &pInfo->logStore, sizeof(SSyncLogStore)); - memcpy(&pRaft->stateManager, &pInfo->stateManager, sizeof(SStateManager)); - - stateManager = &(pRaft->stateManager); - logStore = &(pRaft->logStore); - fsm = &(pRaft->fsm); - - pRaft->nodeInfoMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); - if (pRaft->nodeInfoMap == NULL) { - return -1; - } - - // init progress tracker - pRaft->tracker = syncRaftOpenProgressTracker(pRaft); - if (pRaft->tracker == NULL) { - return -1; - } - - // open raft log - if ((pRaft->log = 
syncRaftLogOpen()) == NULL) { - return -1; - } - // read server state - if (stateManager->readServerState(stateManager, &buf, &n) != 0) { - syncError("readServerState for vgid %d fail", pInfo->vgId); - return -1; - } - if (deserializeServerStateFromBuffer(&serverState, buf, n) != 0) { - syncError("deserializeServerStateFromBuffer for vgid %d fail", pInfo->vgId); - return -1; - } - free(buf); - //assert(initIndex <= serverState.commitIndex); - - // read config state - if (stateManager->readClusterState(stateManager, &buf, &n) != 0) { - syncError("readClusterState for vgid %d fail", pInfo->vgId); - return -1; - } - if (deserializeClusterStateFromBuffer(&confState, buf, n) != 0) { - syncError("deserializeClusterStateFromBuffer for vgid %d fail", pInfo->vgId); - return -1; - } - free(buf); - - changer = (SSyncRaftChanger) { - .tracker = pRaft->tracker, - .lastIndex = syncRaftLogLastIndex(pRaft->log), - }; - SSyncRaftProgressTrackerConfig config; - SSyncRaftProgressMap progressMap; - - if (syncRaftRestoreConfig(&changer, &confState, &config, &progressMap) < 0) { - syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId); - return -1; - } - - // save restored config and progress map to tracker - syncRaftCopyProgressMap(&progressMap, &pRaft->tracker->progressMap); - syncRaftCopyTrackerConfig(&config, &pRaft->tracker->config); - - // free progress map and config - syncRaftFreeProgressMap(&progressMap); - syncRaftFreeTrackConfig(&config); - - if (!syncRaftIsEmptyServerState(&serverState)) { - syncRaftLoadState(pRaft, &serverState); - } - - if (pInfo->appliedIndex > 0) { - syncRaftLogAppliedTo(pRaft->log, pInfo->appliedIndex); - } - - syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); - - syncInfo("[%d:%d] restore vgid %d state: snapshot index success", - pRaft->selfGroupId, pRaft->selfId, pInfo->vgId); - return 0; -} - -int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - syncDebug("from %d, type:%d, term:%" PRId64 ", state:%d", - 
pMsg->from, pMsg->msgType, pMsg->term, pRaft->state); - - if (preHandleMessage(pRaft, pMsg)) { - syncFreeMessage(pMsg); - return 0; - } - - ESyncRaftMessageType msgType = pMsg->msgType; - if (msgType == RAFT_MSG_INTERNAL_ELECTION) { - syncRaftHandleElectionMessage(pRaft, pMsg); - } else if (msgType == RAFT_MSG_VOTE) { - syncRaftHandleVoteMessage(pRaft, pMsg); - } else { - pRaft->stepFp(pRaft, pMsg); - } - - syncFreeMessage(pMsg); - return 0; -} - -int32_t syncRaftTick(SSyncRaft* pRaft) { - pRaft->currentTick += 1; - pRaft->tickFp(pRaft); - return 0; -} - -static int deserializeServerStateFromBuffer(SSyncServerState* server, const char* buffer, int n) { - return 0; -} - -static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const char* buffer, int n) { - return 0; -} - -static void visitProgressMaybeSendAppend(SSyncRaftProgress* progress, void* arg) { - syncRaftMaybeSendAppend(arg, progress, false); -} - -// switchToConfig reconfigures this node to use the provided configuration. It -// updates the in-memory state and, when necessary, carries out additional -// actions such as reacting to the removal of nodes or changed quorum -// requirements. -// -// The inputs usually result from restoring a ConfState or applying a ConfChange. -static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfig* config, - const SSyncRaftProgressMap* progressMap, SSyncConfigState* cs) { - SyncNodeId selfId = pRaft->selfId; - int i; - bool exist; - SSyncRaftProgress* progress = NULL; - - syncRaftConfigState(pRaft->tracker, cs); - progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, selfId); - exist = (progress != NULL); - - // Update whether the node itself is a learner, resetting to false when the - // node is removed. 
- if (exist) { - pRaft->isLearner = progress->isLearner; - } else { - pRaft->isLearner = false; - } - - if ((!exist || pRaft->isLearner) && pRaft->state == TAOS_SYNC_STATE_LEADER) { - // This node is leader and was removed or demoted. We prevent demotions - // at the time writing but hypothetically we handle them the same way as - // removing the leader: stepping down into the next Term. - // - // TODO(tbg): step down (for sanity) and ask follower with largest Match - // to TimeoutNow (to avoid interruption). This might still drop some - // proposals but it's better than nothing. - // - // TODO(tbg): test this branch. It is untested at the time of writing. - return; - } - - // The remaining steps only make sense if this node is the leader and there - // are other nodes. - if (pRaft->state != TAOS_SYNC_STATE_LEADER || syncRaftNodeMapSize(&cs->voters) == 0) { - return; - } - - if (syncRaftMaybeCommit(pRaft)) { - // If the configuration change means that more entries are committed now, - // broadcast/append to everyone in the updated config. - syncRaftBroadcastAppend(pRaft); - } else { - // Otherwise, still probe the newly added replicas; there's no reason to - // let them wait out a heartbeat interval (or the next incoming - // proposal). - syncRaftProgressVisit(pRaft->tracker, visitProgressMaybeSendAppend, pRaft); - - // If the the leadTransferee was removed or demoted, abort the leadership transfer. - SyncNodeId leadTransferee = pRaft->leadTransferee; - if (leadTransferee != SYNC_NON_NODE_ID) { - if (!syncRaftIsInNodeMap(&pRaft->tracker->config.voters.incoming, leadTransferee) && - !syncRaftIsInNodeMap(&pRaft->tracker->config.voters.outgoing, leadTransferee)) { - abortLeaderTransfer(pRaft); - } - } - } -} - -static void abortLeaderTransfer(SSyncRaft* pRaft) { - pRaft->leadTransferee = SYNC_NON_NODE_ID; -} - -/** - * pre-handle message, return true means no need to continue - * Handle the message term, which may result in our stepping down to a follower. 
- **/ -static bool preHandleMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - // local message? - if (pMsg->term == 0) { - return false; - } - - if (pMsg->term > pRaft->term) { - return preHandleNewTermMessage(pRaft, pMsg); - } else if (pMsg->term < pRaft->term) { - return preHandleOldTermMessage(pRaft, pMsg); - } - - return false; -} - -static bool preHandleNewTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - SyncNodeId leaderId = pMsg->from; - ESyncRaftMessageType msgType = pMsg->msgType; - - if (msgType == RAFT_MSG_VOTE) { - // TODO - leaderId = SYNC_NON_NODE_ID; - } - - if (syncIsPreVoteMsg(pMsg)) { - // Never change our term in response to a PreVote - } else if (syncIsPreVoteRespMsg(pMsg) && !pMsg->voteResp.rejected) { - /** - * We send pre-vote requests with a term in our future. If the - * pre-vote is granted, we will increment our term when we get a - * quorum. If it is not, the term comes from the node that - * rejected our vote so we should become a follower at the new - * term. - **/ - } else { - syncInfo("[%d:%d] [term:%" PRId64 "] received a %d message with higher term from %d [term:%" PRId64 "]", - pRaft->selfGroupId, pRaft->selfId, pRaft->term, msgType, pMsg->from, pMsg->term); - syncRaftBecomeFollower(pRaft, pMsg->term, leaderId); - } - - return false; -} - -static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - if (pRaft->checkQuorum && pMsg->msgType == RAFT_MSG_APPEND) { - /** - * We have received messages from a leader at a lower term. It is possible - * that these messages were simply delayed in the network, but this could - * also mean that this node has advanced its term number during a network - * partition, and it is now unable to either win an election or to rejoin - * the majority on the old term. 
If checkQuorum is false, this will be - * handled by incrementing term numbers in response to MsgVote with a - * higher term, but if checkQuorum is true we may not advance the term on - * MsgVote and must generate other messages to advance the term. The net - * result of these two features is to minimize the disruption caused by - * nodes that have been removed from the cluster's configuration: a - * removed node will send MsgVotes (or MsgPreVotes) which will be ignored, - * but it will not receive MsgApp or MsgHeartbeat, so it will not create - * disruptive term increases - **/ - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); - if (pNode == NULL) { - return true; - } - SSyncMessage* msg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term); - if (msg == NULL) { - return true; - } - - pRaft->io.send(msg, pNode); - } else { - // ignore other cases - syncInfo("[%d:%d] [term:%" PRId64 "] ignored a %d message with lower term from %d [term:%" PRId64 "]", - pRaft->selfGroupId, pRaft->selfId, pRaft->term, pMsg->msgType, pMsg->from, pMsg->term); - } - - return true; -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_append_entries_message.c b/source/libs/sync/src/raft_handle_append_entries_message.c deleted file mode 100644 index 92ebfe75f5..0000000000 --- a/source/libs/sync/src/raft_handle_append_entries_message.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. 
If not, see . - */ - -#include "syncInt.h" -#include "raft.h" -#include "raft_log.h" -#include "sync_raft_impl.h" -#include "raft_message.h" - -int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - const RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries); - - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); - if (pNode == NULL) { - return 0; - } - - SSyncMessage* pRespMsg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term); - if (pRespMsg == NULL) { - return 0; - } - - RaftMsg_Append_Resp *appendResp = &(pRespMsg->appendResp); - // ignore committed logs - if (syncRaftLogIsCommitted(pRaft->log, appendEntries->index)) { - appendResp->index = pRaft->log->commitIndex; - goto out; - } - - syncInfo("[%d:%d] recv append from %d index %" PRId64"", - pRaft->selfGroupId, pRaft->selfId, pMsg->from, appendEntries->index); - -out: - pRaft->io.send(pRespMsg, pNode); - return 0; -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c deleted file mode 100644 index a58c8ba5cf..0000000000 --- a/source/libs/sync/src/raft_handle_election_message.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "syncInt.h" -#include "raft.h" -#include "raft_log.h" -#include "raft_message.h" - -int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - if (pRaft->preVote) { - syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION); - } else { - syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); - } - - return 0; -} diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c deleted file mode 100644 index 4d940732dc..0000000000 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "syncInt.h" -#include "raft.h" -#include "sync_raft_impl.h" -#include "raft_log.h" -#include "raft_message.h" - -static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); - -int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - SSyncMessage* pRespMsg; - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); - if (pNode == NULL) { - return 0; - } - - bool grant; - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); - - grant = canGrantVoteMessage(pRaft, pMsg); - pRespMsg = syncNewVoteRespMsg(pRaft->selfGroupId, pRaft->selfId, pMsg->vote.cType, !grant); - if (pRespMsg == NULL) { - return 0; - } - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 ", vote: %d] %s for %d" - "[logterm: %" PRId64 ", index: %" PRId64 "] at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, pRaft->voteFor, - grant ? "grant" : "reject", - pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term); - - pRaft->io.send(pRespMsg, pNode); - return 0; -} - -static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - bool canVote = - // We can vote if this is a repeat of a vote we've already cast... - pRaft->voteFor == pMsg->from || - // ...we haven't voted and we don't think there's a leader yet in this term... - (pRaft->voteFor == SYNC_NON_NODE_ID && pRaft->leaderId == SYNC_NON_NODE_ID) || - // ...or this is a PreVote for a future term... - (pMsg->vote.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION && pMsg->term > pRaft->term); - - // ...and we believe the candidate is up to date. 
- return canVote && syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm); -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c deleted file mode 100644 index 87a5cfcd15..0000000000 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "syncInt.h" -#include "raft.h" -#include "sync_raft_impl.h" -#include "raft_message.h" - -int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - int granted, rejected; - int quorum; - int voterIndex; - - assert(pRaft->state == TAOS_SYNC_STATE_CANDIDATE); - - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); - if (pNode == NULL) { - syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from); - return 0; - } - - if (pRaft->state != TAOS_SYNC_STATE_CANDIDATE) { - syncError("[%d:%d] is not candidate, ignore vote resp", pRaft->selfGroupId, pRaft->selfId); - return 0; - } - - ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pMsg->from, - pMsg->voteResp.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION, - !pMsg->voteResp.rejected, &rejected, &granted); - - syncInfo("[%d:%d] [quorum:%d] has received %d votes and %d vote rejections", - pRaft->selfGroupId, pRaft->selfId, quorum, granted, rejected); - - if (result == 
SYNC_RAFT_VOTE_WON) { - if (pRaft->candidateState.inPreVote) { - syncRaftCampaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); - } else { - syncRaftBecomeLeader(pRaft); - syncRaftBroadcastAppend(pRaft); - } - } else if (result == SYNC_RAFT_VOTE_LOST) { - // pb.MsgPreVoteResp contains future term of pre-candidate - // m.Term > r.Term; reuse r.Term - syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); - } - - return 0; -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_log.c b/source/libs/sync/src/raft_log.c deleted file mode 100644 index b6e6d292e8..0000000000 --- a/source/libs/sync/src/raft_log.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft_log.h" - -SSyncRaftLog* syncRaftLogOpen() { - return NULL; -} - -SyncIndex syncRaftLogLastIndex(SSyncRaftLog* pLog) { - return 0; -} - -SyncIndex syncRaftLogSnapshotIndex(SSyncRaftLog* pLog) { - return 0; -} - -SyncTerm syncRaftLogLastTerm(SSyncRaftLog* pLog) { - return 0; -} - -void syncRaftLogAppliedTo(SSyncRaftLog* pLog, SyncIndex appliedIndex) { - -} - -bool syncRaftLogIsUptodate(SSyncRaftLog* pLog, SyncIndex index, SyncTerm term) { - return true; -} - -int syncRaftLogNumOfPendingConf(SSyncRaftLog* pLog) { - return 0; -} - -bool syncRaftHasUnappliedLog(SSyncRaftLog* pLog) { - return pLog->commitIndex > pLog->appliedIndex; -} - -SyncTerm syncRaftLogTermOf(SSyncRaftLog* pLog, SyncIndex index) { - return SYNC_NON_TERM; -} - -int syncRaftLogAppend(SSyncRaftLog* pLog, SSyncRaftEntry *pEntries, int n) { - -} - -int syncRaftLogAcquire(SSyncRaftLog* pLog, SyncIndex index, int maxMsgSize, - SSyncRaftEntry **ppEntries, int *n) { - return 0; -} - -void syncRaftLogRelease(SSyncRaftLog* pLog, SyncIndex index, - SSyncRaftEntry *pEntries, int n) { - return; -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_message.c b/source/libs/sync/src/raft_message.c deleted file mode 100644 index e706127f29..0000000000 --- a/source/libs/sync/src/raft_message.c +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft_message.h" - -void syncFreeMessage(const SSyncMessage* pMsg) { - if (!syncIsInternalMsg(pMsg->msgType)) { - free((SSyncMessage*)pMsg); - } -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c deleted file mode 100644 index c8c2d2c379..0000000000 --- a/source/libs/sync/src/raft_replication.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "raft.h" -#include "raft_log.h" -#include "sync_raft_progress.h" -#include "syncInt.h" -#include "raft_replication.h" - -static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress); -static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, - SyncIndex prevIndex, SyncTerm prevTerm, - SSyncRaftEntry *entries, int nEntry); - -// maybeSendAppend sends an append RPC with new entries to the given peer, -// if necessary. Returns true if a message was sent. The sendIfEmpty -// argument controls whether messages with no entries will be sent -// ("empty" messages are useful to convey updated Commit indexes, but -// are undesirable when we're sending multiple messages in a batch). 
-bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) { - assert(pRaft->state == TAOS_SYNC_STATE_LEADER); - SyncNodeId nodeId = progress->id; - - if (syncRaftProgressIsPaused(progress)) { - syncInfo("node [%d:%d] paused", pRaft->selfGroupId, nodeId); - return false; - } - - SyncIndex nextIndex = syncRaftProgressNextIndex(progress); - SSyncRaftEntry *entries; - int nEntry; - SyncIndex prevIndex; - SyncTerm prevTerm; - - prevIndex = nextIndex - 1; - prevTerm = syncRaftLogTermOf(pRaft->log, prevIndex); - int ret = syncRaftLogAcquire(pRaft->log, nextIndex, pRaft->maxMsgSize, &entries, &nEntry); - - if (nEntry == 0 && !sendIfEmpty) { - return false; - } - - if (ret != 0 || prevTerm == SYNC_NON_TERM) { - return sendSnapshot(pRaft, progress); - } - - return sendAppendEntries(pRaft, progress, prevIndex, prevTerm, entries, nEntry); -} - -static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) { - if (!syncRaftProgressRecentActive(progress)) { - return false; - } - return true; -} - -static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, - SyncIndex prevIndex, SyncTerm prevTerm, - SSyncRaftEntry *entries, int nEntry) { - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, progress->id); - if (pNode == NULL) { - return false; - } - SyncIndex lastIndex; - SyncTerm logTerm = prevTerm; - - SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term, - prevIndex, prevTerm, pRaft->log->commitIndex, - nEntry, entries); - - if (msg == NULL) { - goto err_release_log; - } - - if (nEntry != 0) { - switch (progress->state) { - // optimistically increase the next when in StateReplicate - case PROGRESS_STATE_REPLICATE: - lastIndex = entries[nEntry - 1].index; - syncRaftProgressOptimisticNextIndex(progress, lastIndex); - syncRaftInflightAdd(progress->inflights, lastIndex); - break; - case PROGRESS_STATE_PROBE: - progress->probeSent = true; - break; - default: - syncFatal("[%d:%d] is sending 
append in unhandled state %s", - pRaft->selfGroupId, pRaft->selfId, syncRaftProgressStateString(progress)); - break; - } - } - pRaft->io.send(msg, pNode); - return true; - -err_release_log: - syncRaftLogRelease(pRaft->log, prevIndex + 1, entries, nEntry); - return false; -} diff --git a/source/libs/sync/src/raft_unstable_log.c b/source/libs/sync/src/raft_unstable_log.c deleted file mode 100644 index e798e20662..0000000000 --- a/source/libs/sync/src/raft_unstable_log.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "sync.h" -#include "raft_unstable_log.h" - -/* -SyncIndex syncRaftLogLastIndex(SSyncRaftUnstableLog* pLog) { - return 0; -} -*/ \ No newline at end of file diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c deleted file mode 100644 index 321b03d2ee..0000000000 --- a/source/libs/sync/src/sync.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "syncInt.h" -#include "trpc.h" -#include "ttimer.h" - -SSyncManager* gSyncManager = NULL; - -#define SYNC_TICK_TIMER 50 -#define SYNC_ACTIVITY_TIMER 5 -#define SYNC_SERVER_WORKER 2 - -static void syncProcessRsp(void *parent, SRpcMsg *pMsg, SEpSet *pEpSet); -static void syncProcessReqMsg(void *parent, SRpcMsg *pMsg, SEpSet *pEpSet); - -static int syncInitRpcServer(SSyncManager* syncManager, const SSyncCluster* pSyncCfg); -static int syncInitRpcClient(SSyncManager* syncManager); -static int syncOpenWorkerPool(SSyncManager* syncManager); -static int syncCloseWorkerPool(SSyncManager* syncManager); -static void *syncWorkerMain(void *argv); -static void syncNodeTick(void *param, void *tmrId); - -int32_t syncInit() { - if (gSyncManager != NULL) { - return 0; - } - - gSyncManager = (SSyncManager*)calloc(sizeof(SSyncManager), 0); - if (gSyncManager == NULL) { - syncError("malloc SSyncManager fail"); - return -1; - } - - pthread_mutex_init(&gSyncManager->mutex, NULL); - - // init client rpc - if (syncInitRpcClient(gSyncManager) != 0) { - syncCleanUp(); - return -1; - } - - // init sync timer manager - gSyncManager->syncTimerManager = taosTmrInit(1000, 50, 10000, "SYNC"); - if (gSyncManager->syncTimerManager == NULL) { - syncCleanUp(); - return -1; - } - - // init worker pool - if (syncOpenWorkerPool(gSyncManager) != 0) { - syncCleanUp(); - return -1; - } - - // init vgroup hash table - gSyncManager->vgroupTable = taosHashInit(TSDB_MIN_VNODES, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); - if (gSyncManager->vgroupTable == NULL) { - syncCleanUp(); - return -1; - } - return 0; -} - -void syncCleanUp() { - if (gSyncManager == NULL) { - return; - } - pthread_mutex_lock(&gSyncManager->mutex); - if (gSyncManager->vgroupTable) { - taosHashCleanup(gSyncManager->vgroupTable); - } - if (gSyncManager->clientRpc) { - 
rpcClose(gSyncManager->clientRpc); - syncInfo("sync inter-sync rpc client is closed"); - } - if (gSyncManager->syncTimerManager) { - taosTmrCleanUp(gSyncManager->syncTimerManager); - } - syncCloseWorkerPool(gSyncManager); - pthread_mutex_unlock(&gSyncManager->mutex); - pthread_mutex_destroy(&gSyncManager->mutex); - free(gSyncManager); - gSyncManager = NULL; -} - -SSyncNode* syncStart(const SSyncInfo* pInfo) { - pthread_mutex_lock(&gSyncManager->mutex); - - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId*)); - if (ppNode != NULL) { - syncInfo("vgroup %d already exist", pInfo->vgId); - pthread_mutex_unlock(&gSyncManager->mutex); - return *ppNode; - } - - // init rpc server - if (syncInitRpcServer(gSyncManager, &pInfo->syncCfg) != 0) { - pthread_mutex_unlock(&gSyncManager->mutex); - return NULL; - } - - SSyncNode *pNode = (SSyncNode*)malloc(sizeof(SSyncNode)); - if (pNode == NULL) { - syncError("malloc vgroup %d node fail", pInfo->vgId); - pthread_mutex_unlock(&gSyncManager->mutex); - return NULL; - } - - pNode->syncTimer = taosTmrStart(syncNodeTick, SYNC_TICK_TIMER, (void*)((int64_t)pInfo->vgId), gSyncManager->syncTimerManager); - - // start raft - pNode->raft.pNode = pNode; - if (syncRaftStart(&pNode->raft, pInfo) != 0) { - syncError("raft start at %d node fail", pInfo->vgId); - pthread_mutex_unlock(&gSyncManager->mutex); - return NULL; - } - - pthread_mutex_init(&pNode->mutex, NULL); - - taosHashPut(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId), &pNode, sizeof(SSyncNode *)); - - pthread_mutex_unlock(&gSyncManager->mutex); - return NULL; -} - -void syncStop(const SSyncNode* pNode) { - pthread_mutex_lock(&gSyncManager->mutex); - - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId*)); - if (ppNode == NULL) { - syncInfo("vgroup %d not exist", pNode->vgId); - pthread_mutex_unlock(&gSyncManager->mutex); - return; - } - assert(*ppNode == pNode); - 
taosTmrStop(pNode->syncTimer); - - taosHashRemove(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); - pthread_mutex_unlock(&gSyncManager->mutex); - - pthread_mutex_destroy(&((*ppNode)->mutex)); - free(*ppNode); -} - -int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, bool isWeak) { - SSyncMessage msg; - - pthread_mutex_lock(&syncNode->mutex); - int32_t ret = syncRaftStep(&syncNode->raft, syncInitPropMsg(&msg, pBuf, pData, isWeak)); - pthread_mutex_unlock(&syncNode->mutex); - return ret; -} - -void syncReconfig(const SSyncNode* pNode, const SSyncCluster* pCfg) {} - -int32_t syncAddNode(SSyncNode syncNode, const SNodeInfo *pNode) { - return 0; -} - -int32_t syncRemoveNode(SSyncNode syncNode, const SNodeInfo *pNode) { - return 0; -} - -// process rpc rsp message from other sync server -static void syncProcessRsp(void *parent, SRpcMsg *pMsg, SEpSet *pEpSet) { - -} - -// process rpc message from other sync server -static void syncProcessReqMsg(void *parent, SRpcMsg *pMsg, SEpSet *pEpSet) { - -} - -static int syncInitRpcServer(SSyncManager* syncManager, const SSyncCluster* pSyncCfg) { - if (gSyncManager->rpcServerTable == NULL) { - gSyncManager->rpcServerTable = taosHashInit(TSDB_MIN_VNODES, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); - if (gSyncManager->rpcServerTable == NULL) { - syncError("init sync rpc server hash table error"); - return -1; - } - } - assert(pSyncCfg->selfIndex < pSyncCfg->replica && pSyncCfg->selfIndex >= 0); - const SNodeInfo* pNode = &(pSyncCfg->nodeInfo[pSyncCfg->replica]); - char buffer[156] = {'\0'}; - snprintf(buffer, sizeof(buffer), "%s:%d", &(pNode->nodeFqdn[0]), pNode->nodePort); - size_t len = strlen(buffer); - void** ppRpcServer = taosHashGet(gSyncManager->rpcServerTable, buffer, len); - if (ppRpcServer != NULL) { - // already inited - syncInfo("sync rpc server for %s already exist", buffer); - return 0; - } - - SRpcInit rpcInit; - memset(&rpcInit, 0, 
sizeof(rpcInit)); - rpcInit.localPort = pNode->nodePort; - rpcInit.label = "sync-server"; - rpcInit.numOfThreads = SYNC_SERVER_WORKER; - rpcInit.cfp = syncProcessReqMsg; - rpcInit.sessions = TSDB_MAX_VNODES << 4; - rpcInit.connType = TAOS_CONN_SERVER; - rpcInit.idleTime = SYNC_ACTIVITY_TIMER * 1000; - - void* rpcServer = rpcOpen(&rpcInit); - if (rpcServer == NULL) { - syncInfo("rpcOpen for sync rpc server for %s fail", buffer); - return -1; - } - - taosHashPut(gSyncManager->rpcServerTable, buffer, strlen(buffer), rpcServer, len); - syncInfo("sync rpc server for %s init success", buffer); - - return 0; -} - -static int syncInitRpcClient(SSyncManager* syncManager) { - char secret[TSDB_PASSWORD_LEN] = "secret"; - SRpcInit rpcInit; - memset(&rpcInit, 0, sizeof(rpcInit)); - rpcInit.label = "sync-client"; - rpcInit.numOfThreads = 1; - rpcInit.cfp = syncProcessRsp; - rpcInit.sessions = TSDB_MAX_VNODES << 4; - rpcInit.connType = TAOS_CONN_CLIENT; - rpcInit.idleTime = SYNC_ACTIVITY_TIMER * 1000; - rpcInit.user = "t"; - rpcInit.ckey = "key"; - rpcInit.secret = secret; - - syncManager->clientRpc = rpcOpen(&rpcInit); - if (syncManager->clientRpc == NULL) { - syncError("failed to init sync rpc client"); - return -1; - } - - syncInfo("sync inter-sync rpc client is initialized"); - return 0; -} - -static int syncOpenWorkerPool(SSyncManager* syncManager) { - int i; - pthread_attr_t thattr; - - pthread_attr_init(&thattr); - pthread_attr_setdetachstate(&thattr, PTHREAD_CREATE_JOINABLE); - - for (i = 0; i < TAOS_SYNC_MAX_WORKER; ++i) { - SSyncWorker* pWorker = &(syncManager->worker[i]); - - if (pthread_create(&(pWorker->thread), &thattr, (void *)syncWorkerMain, pWorker) != 0) { - syncError("failed to create sync worker since %s", strerror(errno)); - - return -1; - } - } - - pthread_attr_destroy(&thattr); - - return 0; -} - -static int syncCloseWorkerPool(SSyncManager* syncManager) { - return 0; -} - -static void *syncWorkerMain(void *argv) { - SSyncWorker* pWorker = (SSyncWorker 
*)argv; - - taosBlockSIGPIPE(); - setThreadName("syncWorker"); - - return NULL; -} - -static void syncNodeTick(void *param, void *tmrId) { - SyncGroupId vgId = (SyncGroupId)((int64_t)param); - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId*)); - if (ppNode == NULL) { - return; - } - SSyncNode *pNode = *ppNode; - - pthread_mutex_lock(&pNode->mutex); - syncRaftTick(&pNode->raft); - pthread_mutex_unlock(&pNode->mutex); - - pNode->syncTimer = taosTmrStart(syncNodeTick, SYNC_TICK_TIMER, (void*)(int64_t)pNode->vgId, gSyncManager->syncTimerManager); -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c deleted file mode 100644 index de790b5876..0000000000 --- a/source/libs/sync/src/sync_raft_config_change.c +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft.h" -#include "syncInt.h" -#include "sync_raft_config_change.h" -#include "sync_raft_progress.h" -#include "sync_raft_progress_tracker.h" -#include "sync_raft_quorum_joint.h" - -static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); -static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); -static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); -static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); -static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config); -static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css); - -static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r); - -static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner); - -static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id); -static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id); - -static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id); -static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id); -static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id); - -// EnterJoint verifies that the outgoing (=right) majority config of the joint -// config is empty and initializes it with a copy of the incoming (=left) -// majority config. That is, it transitions from -// -// (1 2 3)&&() -// to -// (1 2 3)&&(1 2 3). 
-// -// The supplied changes are then applied to the incoming majority config, -// resulting in a joint configuration that in terms of the Raft thesis[1] -// (Section 4.3) corresponds to `C_{new,old}`. -// -// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf -int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - int ret; - - ret = checkAndCopy(changer, config, progressMap); - if (ret != 0) { - return ret; - } - - if (hasJointConfig(config)) { - syncError("config is already joint"); - return -1; - } - - if(syncRaftJointConfigIsIncomingEmpty(&config->voters) == 0) { - // We allow adding nodes to an empty config for convenience (testing and - // bootstrap), but you can't enter a joint state. - syncError("can't make a zero-voter config joint"); - return -1; - } - - // Clear the outgoing config. - syncRaftJointConfigClearOutgoing(&config->voters); - - // Copy incoming to outgoing. - syncRaftCopyNodeMap(&config->voters.incoming, &config->voters.outgoing); - - ret = applyConfig(changer, config, progressMap, css); - if (ret != 0) { - return ret; - } - - config->autoLeave = autoLeave; - return checkAndReturn(config, progressMap); -} - -// Simple carries out a series of configuration changes that (in aggregate) -// mutates the incoming majority config Voters[0] by at most one. This method -// will return an error if that is not the case, if the resulting quorum is -// zero, or if the configuration is in a joint state (i.e. if there is an -// outgoing configuration). 
-int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - int ret; - - ret = checkAndCopy(changer, config, progressMap); - if (ret != 0) { - return ret; - } - - if (hasJointConfig(config)) { - syncError("can't apply simple config change in joint config"); - return -1; - } - - ret = applyConfig(changer, config, progressMap, css); - if (ret != 0) { - return ret; - } - - int n = symDiff(syncRaftJointConfigIncoming(&changer->tracker->config.voters), - syncRaftJointConfigIncoming(&config->voters)); - if (n > 1) { - syncError("more than one voter changed without entering joint config"); - return -1; - } - - return checkAndReturn(config, progressMap); -} - -// apply a change to the configuration. By convention, changes to voters are -// always made to the incoming majority config Voters[0]. Voters[1] is either -// empty or preserves the outgoing majority configuration while in a joint state. -static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) { - int i; - - for (i = 0; i < css->n; ++i) { - const SSyncConfChangeSingle* cs = &(css->changes[i]); - if (cs->nodeId == SYNC_NON_NODE_ID) { - continue; - } - - ESyncRaftConfChangeType type = cs->type; - switch (type) { - case SYNC_RAFT_Conf_AddNode: - makeVoter(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_AddLearnerNode: - makeLearner(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_RemoveNode: - removeNodeId(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_UpdateNode: - break; - } - } - - if (syncRaftJointConfigIsIncomingEmpty(&config->voters)) { - syncError("removed all voters"); - return -1; - } - - return 0; -} - - -// makeVoter adds or promotes the given ID to be a voter in the incoming -// majority config. 
-static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == NULL) { - initProgress(changer, config, progressMap, id, false); - return; - } - - progress->isLearner = false; - nilAwareDelete(&config->learners, id); - nilAwareDelete(&config->learnersNext, id); - syncRaftJointConfigAddToIncoming(&config->voters, id); -} - -// makeLearner makes the given ID a learner or stages it to be a learner once -// an active joint configuration is exited. -// -// The former happens when the peer is not a part of the outgoing config, in -// which case we either add a new learner or demote a voter in the incoming -// config. -// -// The latter case occurs when the configuration is joint and the peer is a -// voter in the outgoing config. In that case, we do not want to add the peer -// as a learner because then we'd have to track a peer as a voter and learner -// simultaneously. Instead, we add the learner to LearnersNext, so that it will -// be added to Learners the moment the outgoing config is removed by -// LeaveJoint(). -static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == NULL) { - initProgress(changer, config, progressMap, id, true); - return; - } - - if (progress->isLearner) { - return; - } - // Remove any existing voter in the incoming config... - removeNodeId(changer, config, progressMap, id); - - // ... but save the Progress. - syncRaftAddToProgressMap(progressMap, progress); - - // Use LearnersNext if we can't add the learner to Learners directly, i.e. - // if the peer is still tracked as a voter in the outgoing config. It will - // be turned into a learner in LeaveJoint(). 
- // - // Otherwise, add a regular learner right away. - bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id); - if (inInOutgoing) { - nilAwareAdd(&config->learnersNext, id); - } else { - nilAwareAdd(&config->learners, id); - progress->isLearner = true; - } -} - -// removeNodeId this peer as a voter or learner from the incoming config. -static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == NULL) { - return; - } - - syncRaftJointConfigRemoveFromIncoming(&config->voters, id); - nilAwareDelete(&config->learners, id); - nilAwareDelete(&config->learnersNext, id); - - // If the peer is still a voter in the outgoing config, keep the Progress. - bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id); - if (!inInOutgoing) { - syncRaftRemoveFromProgressMap(progressMap, id); - } -} - -// initProgress initializes a new progress for the given node or learner. -static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) { - if (!isLearner) { - syncRaftJointConfigAddToIncoming(&config->voters, id); - } else { - nilAwareAdd(&config->learners, id); - } - - SSyncRaftProgress* pProgress = (SSyncRaftProgress*)malloc(sizeof(SSyncRaftProgress)); - assert (pProgress != NULL); - *pProgress = (SSyncRaftProgress) { - // Initializing the Progress with the last index means that the follower - // can be probed (with the last index). - // - // TODO(tbg): seems awfully optimistic. Using the first index would be - // better. The general expectation here is that the follower has no log - // at all (and will thus likely need a snapshot), though the app may - // have applied a snapshot out of band before adding the replica (thus - // making the first index the better choice). 
- .id = id, - .groupId = changer->tracker->pRaft->selfGroupId, - .nextIndex = changer->lastIndex, - .matchIndex = 0, - .state = PROGRESS_STATE_PROBE, - .pendingSnapshotIndex = 0, - .probeSent = false, - .inflights = syncRaftOpenInflights(changer->tracker->maxInflightMsgs), - .isLearner = isLearner, - // When a node is first added, we should mark it as recently active. - // Otherwise, CheckQuorum may cause us to step down if it is invoked - // before the added node has had a chance to communicate with us. - .recentActive = true, - .refCount = 0, - }; - - syncRaftAddToProgressMap(progressMap, pProgress); -} - -// checkInvariants makes sure that the config and progress are compatible with -// each other. This is used to check both what the Changer is initialized with, -// as well as what it returns. -static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - int ret = syncRaftCheckTrackerConfigInProgress(config, progressMap); - if (ret != 0) { - return ret; - } - - // Any staged learner was staged because it could not be directly added due - // to a conflicting voter in the outgoing config. - SyncNodeId* pNodeId = NULL; - while (!syncRaftIterateNodeMap(&config->learnersNext, pNodeId)) { - SyncNodeId nodeId = *pNodeId; - if (!syncRaftJointConfigInOutgoing(&config->voters, nodeId)) { - syncError("[%d] is in LearnersNext, but not outgoing", nodeId); - return -1; - } - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId); - assert(progress); - assert(progress->id == nodeId); - if (progress->isLearner) { - syncError("[%d:%d] is in LearnersNext, but is already marked as learner", progress->groupId, nodeId); - return -1; - } - } - - // Conversely Learners and Voters doesn't intersect at all. 
- pNodeId = NULL; - while (!syncRaftIterateNodeMap(&config->learners, pNodeId)) { - SyncNodeId nodeId = *pNodeId; - if (syncRaftJointConfigInOutgoing(&config->voters, nodeId)) { - syncError("%d is in Learners and outgoing", nodeId); - return -1; - } - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId); - assert(progress); - assert(progress->id == nodeId); - - if (!progress->isLearner) { - syncError("[%d:%d] is in Learners, but is not marked as learner", progress->groupId, nodeId); - return -1; - } - } - - if (!hasJointConfig(config)) { - // We enforce that empty maps are nil instead of zero. - if (syncRaftNodeMapSize(&config->learnersNext) > 0) { - syncError("cfg.LearnersNext must be nil when not joint"); - return -1; - } - if (config->autoLeave) { - syncError("AutoLeave must be false when not joint"); - return -1; - } - } - - return 0; -} - -// checkAndCopy copies the tracker's config and progress map (deeply enough for -// the purposes of the Changer) and returns those copies. It returns an error -// if checkInvariants does. -static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - syncRaftCopyTrackerConfig(&changer->tracker->config, config); - syncRaftClearProgressMap(progressMap); - - SSyncRaftProgress* pProgress = NULL; - while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) { - syncRaftAddToProgressMap(progressMap, pProgress); - } - - return checkAndReturn(config, progressMap); -} - -// checkAndReturn calls checkInvariants on the input and returns either the -// resulting error or the input. 
-static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - if (checkInvariants(config, progressMap) != 0) { - return -1; - } - - return 0; -} - -static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) { - return !syncRaftJointConfigIsOutgoingEmpty(&config->voters); -} - -// symdiff returns the count of the symmetric difference between the sets of -// uint64s, i.e. len( (l - r) \union (r - l)). -static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) { - int n; - int i; - int j0, j1; - const SSyncRaftNodeMap* pairs[2][2] = { - {l, r}, // count elems in l but not in r - {r, l}, // count elems in r but not in l - }; - - for (n = 0, i = 0; i < 2; ++i) { - const SSyncRaftNodeMap** pp = pairs[i]; - - const SSyncRaftNodeMap* p0 = pp[0]; - const SSyncRaftNodeMap* p1 = pp[1]; - SyncNodeId* pNodeId; - while (!syncRaftIterateNodeMap(p0, pNodeId)) { - if (!syncRaftIsInNodeMap(p1, *pNodeId)) { - n+=1; - } - } - } - - return n; -} - -// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. -static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { - syncRaftRemoveFromNodeMap(nodeMap, id); -} - -// nilAwareAdd populates a map entry, creating the map if necessary. -static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { - syncRaftAddToNodeMap(nodeMap, id); -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c deleted file mode 100644 index fe2e0fd9d3..0000000000 --- a/source/libs/sync/src/sync_raft_election.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "syncInt.h" -#include "raft.h" -#include "raft_log.h" -#include "raft_message.h" -#include "sync_raft_progress_tracker.h" - -void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { - if (pRaft->state == TAOS_SYNC_STATE_LEADER) { - syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId); - return; - } - - if (!syncRaftIsPromotable(pRaft)) { - syncWarn("[%d:%d] is unpromotable and can not syncRaftCampaign", pRaft->selfGroupId, pRaft->selfId); - return; - } - - // if there is pending uncommitted config,cannot start election - if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { - syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply", - pRaft->selfGroupId, pRaft->selfId, pRaft->term); - return; - } - - syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - - syncRaftCampaign(pRaft, cType); -} - -// syncRaftCampaign transitions the raft instance to candidate state. This must only be -// called after verifying that this is a legitimate transition. 
-void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { - bool preVote; - SyncTerm term; - - if (syncRaftIsPromotable(pRaft)) { - syncDebug("[%d:%d] is unpromotable; syncRaftCampaign() should have been called", pRaft->selfGroupId, pRaft->selfId); - return; - } - - if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { - syncRaftBecomePreCandidate(pRaft); - preVote = true; - // PreVote RPCs are sent for the next term before we've incremented r.Term. - term = pRaft->term + 1; - } else { - syncRaftBecomeCandidate(pRaft); - term = pRaft->term; - preVote = false; - } - - int quorum = syncRaftQuorum(pRaft); - ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL); - if (result == SYNC_RAFT_VOTE_WON) { - // We won the election after voting for ourselves (which must mean that - // this is a single-node cluster). Advance to the next state. - if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { - syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); - } else { - syncRaftBecomeLeader(pRaft); - } - return; - } - - // broadcast vote message to other peers - int i; - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); - SSyncRaftNodeMap nodeMap; - syncRaftJointConfigIDs(&pRaft->tracker->config.voters, &nodeMap); - SyncNodeId *pNodeId = NULL; - while (!syncRaftIterateNodeMap(&nodeMap, pNodeId)) { - SyncNodeId nodeId = *pNodeId; - if (nodeId == SYNC_NON_NODE_ID) { - continue; - } - - if (nodeId == pRaft->selfId) { - continue; - } - - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, nodeId); - if (pNode == NULL) { - continue; - } - - SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, - term, cType, lastIndex, lastTerm); - if (pMsg == NULL) { - continue; - } - - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent vote request to %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, lastTerm, - lastIndex, nodeId, pRaft->term); - - 
pRaft->io.send(pMsg, pNode); - } -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c deleted file mode 100644 index 3050bb2c8a..0000000000 --- a/source/libs/sync/src/sync_raft_impl.c +++ /dev/null @@ -1,369 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "raft.h" -#include "sync_raft_impl.h" -#include "raft_log.h" -#include "raft_replication.h" -#include "sync_raft_progress_tracker.h" -#include "syncInt.h" - -static int convertClear(SSyncRaft* pRaft); -static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); -static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); - -static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); - -static int triggerAll(SSyncRaft* pRaft); - -static void tickElection(SSyncRaft* pRaft); -static void tickHeartbeat(SSyncRaft* pRaft); - -static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); - -static void abortLeaderTransfer(SSyncRaft* pRaft); - -static void resetRaft(SSyncRaft* pRaft, SyncTerm term); - -void syncRaftBecomeFollower(SSyncRaft* pRaft, SyncTerm term, SyncNodeId leaderId) { - convertClear(pRaft); - - pRaft->stepFp = stepFollower; - resetRaft(pRaft, term); - pRaft->tickFp = tickElection; - pRaft->leaderId = leaderId; - pRaft->state = TAOS_SYNC_STATE_FOLLOWER; - 
syncInfo("[%d:%d] became followe at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomePreCandidate(SSyncRaft* pRaft) { - convertClear(pRaft); - - /** - * Becoming a pre-candidate changes our step functions and state, - * but doesn't change anything else. In particular it does not increase - * r.Term or change r.Vote. - **/ - pRaft->stepFp = stepCandidate; - pRaft->tickFp = tickElection; - pRaft->state = TAOS_SYNC_STATE_CANDIDATE; - pRaft->candidateState.inPreVote = true; - syncInfo("[%d:%d] became pre-candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomeCandidate(SSyncRaft* pRaft) { - convertClear(pRaft); - - pRaft->candidateState.inPreVote = false; - pRaft->stepFp = stepCandidate; - // become candidate make term+1 - resetRaft(pRaft, pRaft->term + 1); - pRaft->tickFp = tickElection; - pRaft->voteFor = pRaft->selfId; - pRaft->state = TAOS_SYNC_STATE_CANDIDATE; - syncInfo("[%d:%d] became candidate at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftBecomeLeader(SSyncRaft* pRaft) { - assert(pRaft->state != TAOS_SYNC_STATE_FOLLOWER); - - pRaft->stepFp = stepLeader; - resetRaft(pRaft, pRaft->term); - pRaft->leaderId = pRaft->leaderId; - pRaft->state = TAOS_SYNC_STATE_LEADER; - - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId); - assert(progress != NULL); - // Followers enter replicate mode when they've been successfully probed - // (perhaps after having received a snapshot as a result). The leader is - // trivially in this state. Note that r.reset() has initialized this - // progress with the last index already. - syncRaftProgressBecomeReplicate(progress); - - // Conservatively set the pendingConfIndex to the last index in the - // log. 
There may or may not be a pending config change, but it's - // safe to delay any future proposals until we commit all our - // pending log entries, and scanning the entire tail of the log - // could be expensive. - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - pRaft->pendingConfigIndex = lastIndex; - - // after become leader, send a no-op log - SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry)); - if (entry == NULL) { - return; - } - *entry = (SSyncRaftEntry) { - .buffer = (SSyncBuffer) { - .data = NULL, - .len = 0, - } - }; - appendEntries(pRaft, entry, 1); - //syncRaftTriggerHeartbeat(pRaft); - syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); -} - -void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { - triggerAll(pRaft); -} - -void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft) { - // electionTimeoutTick in [3,6] tick - pRaft->randomizedElectionTimeout = taosRand() % 4 + 3; -} - -bool syncRaftIsPromotable(SSyncRaft* pRaft) { - return pRaft->selfId != SYNC_NON_NODE_ID; -} - -bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) { - return pRaft->electionElapsed >= pRaft->randomizedElectionTimeout; -} - -int syncRaftQuorum(SSyncRaft* pRaft) { - return 0; - //return pRaft->cluster.replica / 2 + 1; -} - -ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id, - bool preVote, bool grant, - int* rejected, int *granted) { - SNodeInfo* pNode = syncRaftGetNodeById(pRaft, id); - if (pNode == NULL) { - return true; - } - - if (grant) { - syncInfo("[%d:%d] received grant (pre-vote %d) from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); - } else { - syncInfo("[%d:%d] received rejection (pre-vote %d) from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); - } - - syncRaftRecordVote(pRaft->tracker, pNode->nodeId, grant); - return syncRaftTallyVotes(pRaft->tracker, rejected, granted); -} -/* - if 
(accept) { - syncInfo("[%d:%d] received (pre-vote %d) from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term); - } else { - syncInfo("[%d:%d] received rejection from %d at term %" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, id, pRaft->term); - } - - int voteIndex = syncRaftGetNodeById(pRaft, id); - assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0); - assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN); - - pRaft->candidateState.votes[voteIndex] = accept ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; - int granted = 0, rejected = 0; - int i; - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) granted++; - else if (pRaft->candidateState.votes[i] == SYNC_RAFT_VOTE_RESP_REJECT) rejected++; - } - - if (rejectNum) *rejectNum = rejected; - return granted; -*/ - -void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState) { - SyncIndex commitIndex = serverState->commitIndex; - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - - if (commitIndex < pRaft->log->commitIndex || commitIndex > lastIndex) { - syncFatal("[%d:%d] state.commit %"PRId64" is out of range [%" PRId64 ",%" PRId64 "", - pRaft->selfGroupId, pRaft->selfId, commitIndex, pRaft->log->commitIndex, lastIndex); - return; - } - - pRaft->log->commitIndex = commitIndex; - pRaft->term = serverState->term; - pRaft->voteFor = serverState->voteFor; -} - -static void visitProgressSendAppend(SSyncRaftProgress* progress, void* arg) { - SSyncRaft* pRaft = (SSyncRaft*)arg; - if (pRaft->selfId == progress->id) { - return; - } - - syncRaftMaybeSendAppend(arg, progress, true); -} - -// bcastAppend sends RPC, with entries to all peers that are not up-to-date -// according to the progress recorded in r.prs. 
-void syncRaftBroadcastAppend(SSyncRaft* pRaft) { - syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft); -} - -SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id) { - SNodeInfo **ppNode = taosHashGet(pRaft->nodeInfoMap, &id, sizeof(SyncNodeId*)); - if (ppNode != NULL) { - return *ppNode; - } - - return NULL; -} - -static int convertClear(SSyncRaft* pRaft) { - -} - -static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - - return 0; -} - -static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - /** - * Only handle vote responses corresponding to our candidacy (while in - * StateCandidate, we may get stale MsgPreVoteResp messages in this term from - * our pre-candidate state). - **/ - ESyncRaftMessageType msgType = pMsg->msgType; - - if (msgType == RAFT_MSG_INTERNAL_PROP) { - return 0; - } - - if (msgType == RAFT_MSG_VOTE_RESP) { - syncRaftHandleVoteRespMessage(pRaft, pMsg); - return 0; - } else if (msgType == RAFT_MSG_APPEND) { - syncRaftBecomeFollower(pRaft, pMsg->term, pMsg->from); - syncRaftHandleAppendEntriesMessage(pRaft, pMsg); - } - return 0; -} - -static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - convertClear(pRaft); - return 0; -} - -// tickElection is run by followers and candidates after r.electionTimeout. -static void tickElection(SSyncRaft* pRaft) { - pRaft->electionElapsed += 1; - - if (!syncRaftIsPromotable(pRaft)) { - return; - } - - if (!syncRaftIsPastElectionTimeout(pRaft)) { - return; - } - - // election timeout - pRaft->electionElapsed = 0; - SSyncMessage msg; - syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId)); -} - -// tickHeartbeat is run by leaders to send a MsgBeat after r.heartbeatTimeout. 
-static void tickHeartbeat(SSyncRaft* pRaft) { - -} - -// TODO -static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { - return false; -} - -static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { - SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); - SyncTerm term = pRaft->term; - int i; - - for (i = 0; i < n; ++i) { - entries[i].term = term; - entries[i].index = lastIndex + 1 + i; - } - - // Track the size of this uncommitted proposal. - if (!increaseUncommittedSize(pRaft, entries, n)) { - // Drop the proposal. - return; - } - - syncRaftLogAppend(pRaft->log, entries, n); - - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId); - assert(progress != NULL); - syncRaftProgressMaybeUpdate(progress, lastIndex); - // Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend. - syncRaftMaybeCommit(pRaft); -} - -// syncRaftMaybeCommit attempts to advance the commit index. Returns true if -// the commit index changed (in which case the caller should call -// r.bcastAppend). -bool syncRaftMaybeCommit(SSyncRaft* pRaft) { - - return true; -} - -/** - * trigger I/O requests for newly appended log entries or heartbeats. 
- **/ -static int triggerAll(SSyncRaft* pRaft) { - #if 0 - assert(pRaft->state == TAOS_SYNC_STATE_LEADER); - int i; - - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (i == pRaft->cluster.selfIndex) { - continue; - } - - syncRaftMaybeSendAppend(pRaft, pRaft->tracker->progressMap.progress[i], true); - } - #endif - return 0; -} - -static void abortLeaderTransfer(SSyncRaft* pRaft) { - pRaft->leadTransferee = SYNC_NON_NODE_ID; -} - -static void resetProgress(SSyncRaftProgress* progress, void* arg) { - syncRaftResetProgress((SSyncRaft*)arg, progress); -} - -static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { - if (pRaft->term != term) { - pRaft->term = term; - pRaft->voteFor = SYNC_NON_NODE_ID; - } - - pRaft->leaderId = SYNC_NON_NODE_ID; - - pRaft->electionElapsed = 0; - pRaft->heartbeatElapsed = 0; - - syncRaftRandomizedElectionTimeout(pRaft); - - abortLeaderTransfer(pRaft); - - syncRaftResetVotes(pRaft->tracker); - syncRaftProgressVisit(pRaft->tracker, resetProgress, pRaft); - - pRaft->pendingConfigIndex = 0; - pRaft->uncommittedSize = 0; -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_inflights.c b/source/libs/sync/src/sync_raft_inflights.c deleted file mode 100644 index 7b97aca014..0000000000 --- a/source/libs/sync/src/sync_raft_inflights.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "sync_raft_inflights.h" - -SSyncRaftInflights* syncRaftOpenInflights(int size) { - SSyncRaftInflights* inflights = (SSyncRaftInflights*)malloc(sizeof(SSyncRaftInflights)); - if (inflights == NULL) { - return NULL; - } - SyncIndex* buffer = (SyncIndex*)malloc(sizeof(SyncIndex) * size); - if (buffer == NULL) { - free(inflights); - return NULL; - } - *inflights = (SSyncRaftInflights) { - .buffer = buffer, - .count = 0, - .size = 0, - .start = 0, - }; - - return inflights; -} - -void syncRaftCloseInflights(SSyncRaftInflights* inflights) { - free(inflights->buffer); - free(inflights); -} - -// Add notifies the Inflights that a new message with the given index is being -// dispatched. Full() must be called prior to Add() to verify that there is room -// for one more message, and consecutive calls to add Add() must provide a -// monotonic sequence of indexes. -void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) { - assert(!syncRaftInflightFull(inflights)); - - int next = inflights->start + inflights->count; - int size = inflights->size; - - if (next >= size) { - next -= size; - } - - inflights->buffer[next] = inflightIndex; - inflights->count++; -} - -// FreeLE frees the inflights smaller or equal to the given `to` flight. 
-void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) { - if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) { - // out of the left side of the window - return; - } - - int i, idx; - for (i = 0, idx = inflights->start; i < inflights->count; i++) { - if (toIndex < inflights->buffer[idx]) { // found the first large inflight - break; - } - - // increase index and maybe rotate - int size = inflights->size; - idx++; - if (idx >= size) { - idx -= size; - } - } - - // free i inflights and set new start index - inflights->count -= i; - inflights->start = idx; - assert(inflights->count >= 0); - if (inflights->count == 0) { - // inflights is empty, reset the start index so that we don't grow the - // buffer unnecessarily. - inflights->start = 0; - } -} - -// FreeFirstOne releases the first inflight. This is a no-op if nothing is -// inflight. -void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { - syncRaftInflightFreeLE(inflights, inflights->buffer[inflights->start]); -} diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c deleted file mode 100644 index 642eebe65b..0000000000 --- a/source/libs/sync/src/sync_raft_node_map.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "sync_raft_node_map.h" -#include "sync_type.h" -#include "sync_raft_progress.h" - -void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap) { - nodeMap->nodeIdMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); -} - -void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap) { - taosHashCleanup(nodeMap->nodeIdMap); -} - -void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap) { - taosHashClear(nodeMap->nodeIdMap); -} - -bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - SyncNodeId** ppId = (SyncNodeId**)taosHashGet(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*)); - if (ppId == NULL) { - return false; - } - return true; -} - -void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to) { - SyncNodeId *pId = NULL; - while (!syncRaftIterateNodeMap(from, pId)) { - taosHashPut(to->nodeIdMap, &pId, sizeof(SyncNodeId*), &pId, sizeof(SyncNodeId*)); - } -} - -bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId) { - SyncNodeId **ppId = taosHashIterate(nodeMap->nodeIdMap, pId); - if (ppId == NULL) { - return true; - } - - *pId = *(*ppId); - return false; -} - -bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap) { - SyncNodeId *pId = NULL; - while (!syncRaftIterateNodeMap(nodeMap, pId)) { - if (!syncRaftIsInProgressMap(progressMap, *pId)) { - return false; - } - } - - return true; -} - -void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { - syncRaftCopyNodeMap(nodeMap, to); -} - -void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - taosHashPut(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*), &nodeId, sizeof(SyncNodeId*)); -} - -void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - taosHashRemove(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*)); -} - -int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap) { - return 
taosHashGetSize(nodeMap->nodeIdMap); -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c deleted file mode 100644 index 6577972b29..0000000000 --- a/source/libs/sync/src/sync_raft_progress.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "raft.h" -#include "raft_log.h" -#include "sync_raft_progress.h" -#include "sync_raft_progress_tracker.h" -#include "sync.h" -#include "syncInt.h" - -static void copyProgress(SSyncRaftProgress* progress, void* arg); - -static void refProgress(SSyncRaftProgress* progress); -static void unrefProgress(SSyncRaftProgress* progress, void*); - -static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state); -static void probeAcked(SSyncRaftProgress* progress); - -static void resumeProgress(SSyncRaftProgress* progress); - -void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) { - if (progress->inflights) { - syncRaftCloseInflights(progress->inflights); - } - SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs); - if (inflights == NULL) { - return; - } - *progress = (SSyncRaftProgress) { - .matchIndex = progress->id == pRaft->selfId ? 
syncRaftLogLastIndex(pRaft->log) : 0, - .nextIndex = syncRaftLogLastIndex(pRaft->log) + 1, - .inflights = inflights, - .isLearner = false, - .state = PROGRESS_STATE_PROBE, - }; -} - -// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the -// index acked by it. The method returns false if the given n index comes from -// an outdated message. Otherwise it updates the progress and returns true. -bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) { - bool updated = false; - - if (progress->matchIndex < lastIndex) { - progress->matchIndex = lastIndex; - updated = true; - probeAcked(progress); - } - - progress->nextIndex = TMAX(progress->nextIndex, lastIndex + 1); - - return updated; -} - -// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The -// arguments are the index of the append message rejected by the follower, and -// the hint that we want to decrease to. -// -// Rejections can happen spuriously as messages are sent out of order or -// duplicated. In such cases, the rejection pertains to an index that the -// Progress already knows were previously acknowledged, and false is returned -// without changing the Progress. -// -// If the rejection is genuine, Next is lowered sensibly, and the Progress is -// cleared for sending log entries. -bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, - SyncIndex rejected, SyncIndex matchHint) { - if (progress->state == PROGRESS_STATE_REPLICATE) { - // The rejection must be stale if the progress has matched and "rejected" - // is smaller than "match". - if (rejected <= progress->matchIndex) { - syncDebug("match index is up to date,ignore"); - return false; - } - - // Directly decrease next to match + 1. - // - // TODO(tbg): why not use matchHint if it's larger? - progress->nextIndex = progress->matchIndex + 1; - return true; - } - - // The rejection must be stale if "rejected" does not match next - 1. 
This - // is because non-replicating followers are probed one entry at a time. - if (rejected != progress->nextIndex - 1) { - syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore" - , rejected, progress->nextIndex); - return false; - } - - progress->nextIndex = TMAX(TMIN(rejected, matchHint + 1), 1); - - progress->probeSent = false; - return true; -} - -// IsPaused returns whether sending log entries to this node has been throttled. -// This is done when a node has rejected recent MsgApps, is currently waiting -// for a snapshot, or has reached the MaxInflightMsgs limit. In normal -// operation, this is false. A throttled node will be contacted less frequently -// until it has reached a state in which it's able to accept a steady stream of -// log entries again. -bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { - switch (progress->state) { - case PROGRESS_STATE_PROBE: - return progress->probeSent; - case PROGRESS_STATE_REPLICATE: - return syncRaftInflightFull(progress->inflights); - case PROGRESS_STATE_SNAPSHOT: - return true; - default: - syncFatal("error sync state:%d", progress->state); - } -} - -SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)); - if (ppProgress == NULL) { - return NULL; - } - - return *ppProgress; -} - -int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress) { - refProgress(progress); - taosHashPut(progressMap->progressMap, &progress->id, sizeof(SyncNodeId*), &progress, sizeof(SSyncRaftProgress*)); -} - -void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)); - if (ppProgress == NULL) { - return; - } - unrefProgress(*ppProgress, NULL); - - 
taosHashRemove(progressMap->progressMap, &id, sizeof(SyncNodeId*)); -} - -bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { - return taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)) != NULL; -} - -bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { - return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; -} - -// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or, -// optionally and if larger, the index of the pending snapshot. -void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) { - // If the original state is StateSnapshot, progress knows that - // the pending snapshot has been sent to this peer successfully, then - // probes from pendingSnapshot + 1. - if (progress->state == PROGRESS_STATE_SNAPSHOT) { - SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; - resetProgressState(progress, PROGRESS_STATE_PROBE); - progress->nextIndex = TMAX(progress->matchIndex + 1, pendingSnapshotIndex + 1); - } else { - resetProgressState(progress, PROGRESS_STATE_PROBE); - progress->nextIndex = progress->matchIndex + 1; - } -} - -// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1. -void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress) { - resetProgressState(progress, PROGRESS_STATE_REPLICATE); - progress->nextIndex = progress->matchIndex + 1; -} - -// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending -// snapshot index. 
-void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) { - resetProgressState(progress, PROGRESS_STATE_SNAPSHOT); - progress->pendingSnapshotIndex = snapshotIndex; -} - -void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* out) { - memcpy(out, progress, sizeof(SSyncRaftProgress)); -} - -void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap) { - progressMap->progressMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); -} - -void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap) { - syncRaftVisitProgressMap(progressMap, unrefProgress, NULL); - taosHashCleanup(progressMap->progressMap); -} - -void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap) { - taosHashClear(progressMap->progressMap); -} - -void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to) { - syncRaftVisitProgressMap(from, copyProgress, to); -} - -bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress) { - SSyncRaftProgress **ppProgress = taosHashIterate(progressMap->progressMap, pProgress); - if (ppProgress == NULL) { - return true; - } - - *pProgress = *(*ppProgress); - return false; -} - -bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg) { - SSyncRaftProgress *pProgress; - while (!syncRaftIterateProgressMap(progressMap, pProgress)) { - fp(pProgress, arg); - } -} - -static void copyProgress(SSyncRaftProgress* progress, void* arg) { - assert(progress->refCount > 0); - SSyncRaftProgressMap* to = (SSyncRaftProgressMap*)arg; - syncRaftAddToProgressMap(to, progress); -} - -static void refProgress(SSyncRaftProgress* progress) { - progress->refCount += 1; -} - -static void unrefProgress(SSyncRaftProgress* progress, void* arg) { - (void)arg; - progress->refCount -= 1; - assert(progress->refCount >= 0); - if (progress->refCount == 0) { - 
free(progress); - } -} - -// ResetState moves the Progress into the specified State, resetting ProbeSent, -// PendingSnapshot, and Inflights. -static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) { - progress->probeSent = false; - progress->pendingSnapshotIndex = 0; - progress->state = state; - syncRaftInflightReset(progress->inflights); -} - -// ProbeAcked is called when this peer has accepted an append. It resets -// ProbeSent to signal that additional append messages should be sent without -// further delay. -static void probeAcked(SSyncRaftProgress* progress) { - progress->probeSent = false; -} diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c deleted file mode 100644 index e0b4afae21..0000000000 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft.h" -#include "sync_const.h" -#include "sync_raft_progress_tracker.h" -#include "sync_raft_proto.h" - -SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft) { - SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker)); - if (tracker == NULL) { - return NULL; - } - - tracker->votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); - - syncRaftInitTrackConfig(&tracker->config); - tracker->pRaft = pRaft; - tracker->maxInflightMsgs = kSyncRaftMaxInflghtMsgs; - - return tracker; -} - -void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config) { - syncRaftInitNodeMap(&config->learners); - syncRaftInitNodeMap(&config->learnersNext); - syncRaftInitQuorumJointConfig(&config->voters); - config->autoLeave = false; -} - -void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config) { - syncRaftFreeNodeMap(&config->learners); - syncRaftFreeNodeMap(&config->learnersNext); - syncRaftFreeNodeMap(&config->voters.incoming); - syncRaftFreeNodeMap(&config->voters.outgoing); -} - -// ResetVotes prepares for a new round of vote counting via recordVote. -void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { - taosHashClear(tracker->votesMap); -} - -void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) { - syncRaftVisitProgressMap(&tracker->progressMap, visit, arg); -} - -// RecordVote records that the node with the given id voted for this Raft -// instance if v == true (and declined it otherwise). 
-void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant) { - ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &id, sizeof(SyncNodeId*)); - if (pType != NULL) { - return; - } - - taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &grant, sizeof(bool*)); -} - -void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { - memcpy(to, from, sizeof(SSyncRaftProgressTrackerConfig)); -} - -int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - // NB: intentionally allow the empty config. In production we'll never see a - // non-empty config (we prevent it from being created) but we will need to - // be able to *create* an initial config, for example during bootstrap (or - // during tests). Instead of having to hand-code this, we allow - // transitioning from an empty config into any other legal and non-empty - // config. - if (!syncRaftIsAllNodeInProgressMap(&config->voters.incoming, progressMap)) return -1; - if (!syncRaftIsAllNodeInProgressMap(&config->voters.outgoing, progressMap)) return -1; - if (!syncRaftIsAllNodeInProgressMap(&config->learners, progressMap)) return -1; - if (!syncRaftIsAllNodeInProgressMap(&config->learnersNext, progressMap)) return -1; - return 0; -} - -// TallyVotes returns the number of granted and rejected Votes, and whether the -// election outcome is known. -ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) { - SSyncRaftProgress* progress = NULL; - int r, g; - - // Make sure to populate granted/rejected correctly even if the Votes slice - // contains members no longer part of the configuration. This doesn't really - // matter in the way the numbers are used (they're informational), but might - // as well get it right. 
- while (!syncRaftIterateProgressMap(&tracker->progressMap, progress)) { - if (progress->id == SYNC_NON_NODE_ID) { - continue; - } - - bool* v = taosHashGet(tracker->votesMap, &progress->id, sizeof(SyncNodeId*)); - if (v == NULL) { - continue; - } - - if (*v) { - g++; - } else { - r++; - } - } - - if (rejected) *rejected = r; - if (granted) *granted = g; - return syncRaftVoteResult(&(tracker->config.voters), tracker->votesMap); -} - -void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { - syncRaftCopyNodeMap(&tracker->config.voters.incoming, &cs->voters); - syncRaftCopyNodeMap(&tracker->config.voters.outgoing, &cs->votersOutgoing); - syncRaftCopyNodeMap(&tracker->config.learners, &cs->learners); - syncRaftCopyNodeMap(&tracker->config.learnersNext, &cs->learnersNext); - cs->autoLeave = tracker->config.autoLeave; -} - -static void matchAckIndexer(SyncNodeId id, void* arg, SyncIndex* index) { - SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)arg; - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&tracker->progressMap, id); - if (progress == NULL) { - *index = 0; - return; - } - *index = progress->matchIndex; -} - -// Committed returns the largest log index known to be committed based on what -// the voting members of the group have acknowledged. -SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker) { - return syncRaftJointConfigCommittedIndex(&tracker->config.voters, matchAckIndexer, tracker); -} - -static void visitProgressActive(SSyncRaftProgress* progress, void* arg) { - SHashObj* votesMap = (SHashObj*)arg; - taosHashPut(votesMap, &progress->id, sizeof(SyncNodeId), &progress->recentActive, sizeof(bool)); -} - -// QuorumActive returns true if the quorum is active from the view of the local -// raft state machine. Otherwise, it returns false. 
-bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker) { - SHashObj* votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); - syncRaftVisitProgressMap(&tracker->progressMap, visitProgressActive, votesMap); - - return syncRaftVoteResult(&tracker->config.voters, votesMap) == SYNC_RAFT_VOTE_WON; -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c deleted file mode 100644 index 70c078b6f5..0000000000 --- a/source/libs/sync/src/sync_raft_quorum_joint.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "sync_raft_node_map.h" -#include "sync_raft_quorum_majority.h" -#include "sync_raft_quorum_joint.h" -#include "sync_raft_quorum.h" - -/** - * syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns - * a result indicating whether the vote is pending, lost, or won. A joint quorum - * requires both majority quorums to vote in favor. - **/ -ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap) { - ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votesMap); - ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votesMap); - - if (r1 == r2) { - // If they agree, return the agreed state. 
- return r1; - } - - if (r1 == SYNC_RAFT_VOTE_LOST || r2 == SYNC_RAFT_VOTE_LOST) { - // If either config has lost, loss is the only possible outcome. - return SYNC_RAFT_VOTE_LOST; - } - - // One side won, the other one is pending, so the whole outcome is. - return SYNC_RAFT_VOTE_PENDING; -} - -void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config) { - syncRaftInitNodeMap(&config->incoming); - syncRaftInitNodeMap(&config->outgoing); -} - -void syncRaftFreeQuorumJointConfig(SSyncRaftQuorumJointConfig* config) { - syncRaftFreeNodeMap(&config->incoming); - syncRaftFreeNodeMap(&config->outgoing); -} - -void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - syncRaftAddToNodeMap(&config->incoming, id); -} - -void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - syncRaftRemoveFromNodeMap(&config->incoming, id); -} - -void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) { - syncRaftCopyNodeMap(&config->incoming, nodeMap); - - syncRaftUnionNodeMap(&config->outgoing, nodeMap); -} - -SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg) { - SyncIndex index0, index1; - - index0 = syncRaftMajorityConfigCommittedIndex(&config->incoming, indexer, arg); - index1 = syncRaftMajorityConfigCommittedIndex(&config->outgoing, indexer, arg); - - return index0 < index1 ? index0 : index1; -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c deleted file mode 100644 index 313f213cda..0000000000 --- a/source/libs/sync/src/sync_raft_quorum_majority.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. 
- * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include "sync_const.h" -#include "sync_raft_quorum.h" -#include "sync_raft_quorum_majority.h" -#include "sync_raft_node_map.h" - -// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns -// a result indicating whether the vote is pending (i.e. neither a quorum of -// yes/no has been reached), won (a quorum of yes has been reached), or lost (a -// quorum of no has been reached). -ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap) { - int n = syncRaftNodeMapSize(config); - if (n == 0) { - // By convention, the elections on an empty config win. This comes in - // handy with joint quorums because it'll make a half-populated joint - // quorum behave like a majority quorum. 
- return SYNC_RAFT_VOTE_WON; - } - - int i, g, r, missing; - i = g = r = missing = 0; - SyncNodeId* pId = NULL; - while (!syncRaftIterateNodeMap(config, pId)) { - const bool* v = (const bool*)taosHashGet(votesMap, pId, sizeof(SyncNodeId*)); - if (v == NULL) { - missing += 1; - continue; - } - - if (*v) { - g +=1; - } else { - r += 1; - } - } - - int quorum = n / 2 + 1; - if (g >= quorum) { - return SYNC_RAFT_VOTE_WON; - } - if (g + missing >= quorum) { - return SYNC_RAFT_VOTE_PENDING; - } - - return SYNC_RAFT_VOTE_LOST; -} - -int compSyncIndex(const void * elem1, const void * elem2) { - SyncIndex index1 = *((SyncIndex*)elem1); - SyncIndex index2 = *((SyncIndex*)elem1); - if (index1 > index2) return 1; - if (index1 < index2) return -1; - return 0; -} - -SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg) { - int n = syncRaftNodeMapSize(config); - if (n == 0) { - // This plays well with joint quorums which, when one half is the zero - // MajorityConfig, should behave like the other half. - return kMaxCommitIndex; - } - - // Use an on-stack slice to collect the committed indexes when n <= 7 - // (otherwise we alloc). The alternative is to stash a slice on - // MajorityConfig, but this impairs usability (as is, MajorityConfig is just - // a map, and that's nice). The assumption is that running with a - // replication factor of >7 is rare, and in cases in which it happens - // performance is a lesser concern (additionally the performance - // implications of an allocation here are far from drastic). - SyncIndex* srt = NULL; - SyncIndex srk[TSDB_MAX_REPLICA]; - if (n > TSDB_MAX_REPLICA) { - srt = (SyncIndex*)malloc(sizeof(SyncIndex) * n); - if (srt == NULL) { - return kMaxCommitIndex; - } - } else { - srt = &srk[0]; - } - - // Fill the slice with the indexes observed. Any unused slots will be - // left as zero; these correspond to voters that may report in, but - // haven't yet. 
We fill from the right (since the zeroes will end up on - // the left after sorting below anyway). - SyncNodeId *pId = NULL; - int i = 0; - SyncIndex index; - while (!syncRaftIterateNodeMap(config, pId)) { - indexer(*pId, arg, &index); - srt[i++] = index; - } - - // Sort by index. Use a bespoke algorithm (copied from the stdlib's sort - // package) to keep srt on the stack. - qsort(srt, n, sizeof(SyncIndex), compSyncIndex); - - // The smallest index into the array for which the value is acked by a - // quorum. In other words, from the end of the slice, move n/2+1 to the - // left (accounting for zero-indexing). - index = srt[n - (n/2 + 1)]; - if (srt != &srk[0]) { - free(srt); - } - - return index; -} \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_restore.c b/source/libs/sync/src/sync_raft_restore.c deleted file mode 100644 index d1acd3e8e9..0000000000 --- a/source/libs/sync/src/sync_raft_restore.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "sync_raft_config_change.h" -#include "sync_raft_restore.h" -#include "sync_raft_progress_tracker.h" - -static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t); -static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in); - -// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and -// runs a sequence of changes enacting the configuration described in the -// ConfState. -// -// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure -// the Changer only needs a ProgressMap (not a whole Tracker) at which point -// this can just take LastIndex and MaxInflight directly instead and cook up -// the results from that alone. -int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - SSyncConfChangeSingleArray outgoing; - SSyncConfChangeSingleArray incoming; - SSyncConfChangeSingleArray css; - SSyncRaftProgressTracker* tracker = changer->tracker; - int i, ret; - - syncRaftInitConfArray(&outgoing); - syncRaftInitConfArray(&incoming); - - syncRaftInitTrackConfig(config); - syncRaftInitProgressMap(progressMap); - - ret = toConfChangeSingle(cs, &outgoing, &incoming); - if (ret != 0) { - goto out; - } - - if (syncRaftConfArrayIsEmpty(&outgoing)) { - // No outgoing config, so just apply the incoming changes one by one. - for (i = 0; i < incoming.n; ++i) { - css = (SSyncConfChangeSingleArray) { - .n = 1, - .changes = &incoming.changes[i], - }; - ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap); - if (ret != 0) { - goto out; - } - - syncRaftCopyTrackerConfig(config, &changer->tracker->config); - syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap); - } - } else { - // The ConfState describes a joint configuration. 
- // - // First, apply all of the changes of the outgoing config one by one, so - // that it temporarily becomes the incoming active config. For example, - // if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&(). - for (i = 0; i < outgoing.n; ++i) { - css = (SSyncConfChangeSingleArray) { - .n = 1, - .changes = &outgoing.changes[i], - }; - ret = syncRaftChangerSimpleConfig(changer, &css, config, progressMap); - if (ret != 0) { - goto out; - } - syncRaftCopyTrackerConfig(config, &changer->tracker->config); - syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap); - } - - ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap); - if (ret != 0) { - goto out; - } - } - -out: - syncRaftFreeConfArray(&incoming); - syncRaftFreeConfArray(&outgoing); - - return ret; -} - -static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t) { - SyncNodeId* pId = NULL; - - while (!syncRaftIterateNodeMap(nodeMap, pId)) { - out->changes[*i] = (SSyncConfChangeSingle) { - .type = t, - .nodeId = *pId, - }; - *i += 1; - } -} - -// toConfChangeSingle translates a conf state into 1) a slice of operations creating -// first the config that will become the outgoing one, and then the incoming one, and -// b) another slice that, when applied to the config resulted from 1), represents the -// ConfState. 
-static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) { - int i; - - out->n = syncRaftNodeMapSize(&cs->votersOutgoing); - out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n); - if (out->changes == NULL) { - out->n = 0; - return -1; - } - in->n = syncRaftNodeMapSize(&cs->votersOutgoing) + - syncRaftNodeMapSize(&cs->voters) + - syncRaftNodeMapSize(&cs->learners) + - syncRaftNodeMapSize(&cs->learnersNext); - out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n); - if (in->changes == NULL) { - in->n = 0; - return -1; - } - - // Example to follow along this code: - // voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4) - // - // This means that before entering the joint config, the configuration - // had voters (1 2 4 6) and perhaps some learners that are already gone. - // The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6) - // are no longer voters; however 4 is poised to become a learner upon leaving - // the joint state. - // We can't tell whether 5 was a learner before entering the joint config, - // but it doesn't matter (we'll pretend that it wasn't). - // - // The code below will construct - // outgoing = add 1; add 2; add 4; add 6 - // incoming = remove 1; remove 2; remove 4; remove 6 - // add 1; add 2; add 3; - // add-learner 5; - // add-learner 4; - // - // So, when starting with an empty config, after applying 'outgoing' we have - // - // quorum=(1 2 4 6) - // - // From which we enter a joint state via 'incoming' - // - // quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4) - // - // as desired. - - // If there are outgoing voters, first add them one by one so that the - // (non-joint) config has them all. 
- i = 0; - addToConfChangeSingleArray(out, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_AddNode); - assert(i == out->n); - - // We're done constructing the outgoing slice, now on to the incoming one - // (which will apply on top of the config created by the outgoing slice). - i = 0; - - // First, we'll remove all of the outgoing voters. - addToConfChangeSingleArray(in, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_RemoveNode); - - // Then we'll add the incoming voters and learners. - addToConfChangeSingleArray(in, &i, &cs->voters, SYNC_RAFT_Conf_AddNode); - addToConfChangeSingleArray(in, &i, &cs->learners, SYNC_RAFT_Conf_AddLearnerNode); - addToConfChangeSingleArray(in, &i, &cs->learnersNext, SYNC_RAFT_Conf_AddLearnerNode); - assert(i == in->n); - - return 0; -} \ No newline at end of file diff --git a/source/libs/sync/test/raftTests.cpp b/source/libs/sync/test/raftTests.cpp deleted file mode 100644 index e69de29bb2..0000000000 From c478fd06f5d5b50b56815547dc5e5d205cd4f0b9 Mon Sep 17 00:00:00 2001 From: Minghao Li Date: Wed, 9 Feb 2022 14:18:49 +0800 Subject: [PATCH 12/12] rm old sync code --- include/libs/sync/sync.h | 159 --------------------------------------- 1 file changed, 159 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 283604508f..e69de29bb2 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_H -#define _TD_LIBS_SYNC_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include "taosdef.h" - -typedef int32_t SyncNodeId; -typedef int32_t SyncGroupId; -typedef int64_t SyncIndex; -typedef uint64_t SyncTerm; - -typedef enum { - TAOS_SYNC_STATE_FOLLOWER = 0, - TAOS_SYNC_STATE_CANDIDATE = 1, - TAOS_SYNC_STATE_LEADER = 2, -} ESyncState; - -typedef struct { - void* data; - size_t len; -} SSyncBuffer; - -typedef struct { - SyncNodeId nodeId; - uint16_t nodePort; // node sync Port - char nodeFqdn[TSDB_FQDN_LEN]; // node FQDN -} SNodeInfo; - -typedef struct { - int32_t selfIndex; - int32_t replica; - SNodeInfo nodeInfo[TSDB_MAX_REPLICA]; -} SSyncCluster; - -typedef struct { - int32_t selfIndex; - int32_t replica; - SNodeInfo node[TSDB_MAX_REPLICA]; - ESyncState role[TSDB_MAX_REPLICA]; -} SNodesRole; - -typedef struct SSyncFSM { - void* pData; - - // apply committed log, bufs will be free by sync module - int32_t (*applyLog)(struct SSyncFSM* fsm, SyncIndex index, const SSyncBuffer* buf, void* pData); - - // cluster commit callback - int32_t (*onClusterChanged)(struct SSyncFSM* fsm, const SSyncCluster* cluster, void* pData); - - // fsm return snapshot in ppBuf, bufs will be free by sync module - // TODO: getSnapshot SHOULD be async? 
- int32_t (*getSnapshot)(struct SSyncFSM* fsm, SSyncBuffer** ppBuf, int32_t* objId, bool* isLast); - - // fsm apply snapshot with pBuf data - int32_t (*applySnapshot)(struct SSyncFSM* fsm, SSyncBuffer* pBuf, int32_t objId, bool isLast); - - // call when restore snapshot and log done - int32_t (*onRestoreDone)(struct SSyncFSM* fsm); - - void (*onRollback)(struct SSyncFSM* fsm, SyncIndex index, const SSyncBuffer* buf); - - void (*onRoleChanged)(struct SSyncFSM* fsm, const SNodesRole* pRole); - -} SSyncFSM; - -typedef struct SSyncLogStore { - void* pData; - - // write log with given index - int32_t (*logWrite)(struct SSyncLogStore* logStore, SyncIndex index, SSyncBuffer* pBuf); - - /** - * read log from given index(included) with limit, return the actual num in nBuf, - * pBuf will be free in sync module - **/ - int32_t (*logRead)(struct SSyncLogStore* logStore, SyncIndex index, int limit, - SSyncBuffer* pBuf, int* nBuf); - - // mark log with given index has been commtted - int32_t (*logCommit)(struct SSyncLogStore* logStore, SyncIndex index); - - // prune log before given index(not included) - int32_t (*logPrune)(struct SSyncLogStore* logStore, SyncIndex index); - - // rollback log after given index(included) - int32_t (*logRollback)(struct SSyncLogStore* logStore, SyncIndex index); - - // return last index of log - SyncIndex (*logLastIndex)(struct SSyncLogStore* logStore); -} SSyncLogStore; - -typedef struct SStateManager { - void* pData; - - // save serialized server state data, buffer will be free by Sync - int32_t (*saveServerState)(struct SStateManager* stateMng, const char* buffer, int n); - - // read serialized server state data, buffer will be free by Sync - int32_t (*readServerState)(struct SStateManager* stateMng, char** ppBuffer, int* n); - - // save serialized cluster state data, buffer will be free by Sync - void (*saveClusterState)(struct SStateManager* stateMng, const char* buffer, int n); - - // read serialized cluster state data, buffer will be free 
by Sync - int32_t (*readClusterState)(struct SStateManager* stateMng, char** ppBuffer, int* n); -} SStateManager; - -typedef struct { - SyncGroupId vgId; - SyncIndex appliedIndex; - SSyncCluster syncCfg; - SSyncFSM fsm; - SSyncLogStore logStore; - SStateManager stateManager; -} SSyncInfo; - -struct SSyncNode; -typedef struct SSyncNode SSyncNode; - -int32_t syncInit(); -void syncCleanUp(); - -SSyncNode* syncStart(const SSyncInfo*); -void syncReconfig(const SSyncNode*, const SSyncCluster*); -void syncStop(const SSyncNode*); - -int32_t syncPropose(SSyncNode* syncNode, const SSyncBuffer* pBuf, void* pData, bool isWeak); - -int32_t syncAddNode(SSyncNode syncNode, const SNodeInfo *pNode); - -int32_t syncRemoveNode(SSyncNode syncNode, const SNodeInfo *pNode); - -extern int32_t sDebugFlag; - -#ifdef __cplusplus -} -#endif - -#endif /*_TD_LIBS_SYNC_H*/