From 235fb59ede06b94d5469416d2d3394ed6390ccee Mon Sep 17 00:00:00 2001 From: Shungang Li Date: Thu, 22 Aug 2024 18:02:23 +0800 Subject: [PATCH] fix: (ttl) continue flush cache after error occurs --- include/util/taoserror.h | 1 + source/dnode/vnode/src/meta/metaTtl.c | 36 ++++++++++++++++----------- source/util/src/terror.c | 5 ++-- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 1911c48d26..58f906e3f3 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -530,6 +530,7 @@ int32_t taosGetErrSize(); #define TSDB_CODE_VND_COLUMN_COMPRESS_ALREADY_EXIST TAOS_DEF_ERROR_CODE(0, 0x0536) #define TSDB_CODE_VND_ARB_NOT_SYNCED TAOS_DEF_ERROR_CODE(0, 0x0537) // internal #define TSDB_CODE_VND_WRITE_DISABLED TAOS_DEF_ERROR_CODE(0, 0x0538) // internal +#define TSDB_CODE_VND_TTL_FLUSH_INCOMPLETION TAOS_DEF_ERROR_CODE(0, 0x0539) // internal // tsdb #define TSDB_CODE_TDB_INVALID_TABLE_ID TAOS_DEF_ERROR_CODE(0, 0x0600) diff --git a/source/dnode/vnode/src/meta/metaTtl.c b/source/dnode/vnode/src/meta/metaTtl.c index cd1aa7bcad..0b5b9280df 100644 --- a/source/dnode/vnode/src/meta/metaTtl.c +++ b/source/dnode/vnode/src/meta/metaTtl.c @@ -408,7 +408,7 @@ int32_t ttlMgrFlush(STtlManger *pTtlMgr, TXN *pTxn) { int64_t startNs = taosGetTimestampNs(); int64_t endNs = startNs; - metaTrace("%s, ttl mgr flush start. dirty uids:%d", pTtlMgr->logPrefix, taosHashGetSize(pTtlMgr->pDirtyUids)); + metaTrace("%s, ttl mgr flush start. num of dirty uids:%d", pTtlMgr->logPrefix, taosHashGetSize(pTtlMgr->pDirtyUids)); int32_t code = TSDB_CODE_SUCCESS; @@ -419,13 +419,8 @@ int32_t ttlMgrFlush(STtlManger *pTtlMgr, TXN *pTxn) { STtlCacheEntry *cacheEntry = taosHashGet(pTtlMgr->pTtlCache, pUid, sizeof(*pUid)); if (cacheEntry == NULL) { - metaInfo("%s, ttlMgr flush failed to get ttl cache, uid: %" PRId64 ", type: %d", pTtlMgr->logPrefix, *pUid, + metaError("%s, ttlMgr flush failed to get ttl cache, uid: %" PRId64 ", type: %d", pTtlMgr->logPrefix, *pUid, pEntry->type); - code = taosHashRemove(pTtlMgr->pDirtyUids, pUid, sizeof(*pUid)); - if (TSDB_CODE_SUCCESS != code) { - metaError("%s, ttlMgr flush failed to remove dirty uid since %s", pTtlMgr->logPrefix, tstrerror(code)); - goto _out; - } continue; } @@ -437,31 +432,35 @@ int32_t ttlMgrFlush(STtlManger *pTtlMgr, TXN *pTxn) { if (pEntry->type == ENTRY_TYPE_UPSERT) { // delete old key & upsert new key - (void)tdbTbDelete(pTtlMgr->pTtlIdx, &ttlKey, sizeof(ttlKey), pTxn); // maybe first insert, ignore error + code = tdbTbDelete(pTtlMgr->pTtlIdx, &ttlKey, sizeof(ttlKey), pTxn); // maybe first insert, ignore error + if (TSDB_CODE_SUCCESS != code && TSDB_CODE_NOT_FOUND != code) { + metaError("%s, ttlMgr flush failed to delete since %s", pTtlMgr->logPrefix, tstrerror(code)); + continue; + } code = tdbTbUpsert(pTtlMgr->pTtlIdx, &ttlKeyDirty, sizeof(ttlKeyDirty), &cacheEntry->ttlDaysDirty, sizeof(cacheEntry->ttlDaysDirty), pTxn); if (TSDB_CODE_SUCCESS != code) { metaError("%s, ttlMgr flush failed to upsert since %s", pTtlMgr->logPrefix, tstrerror(code)); - goto _out; + continue; } cacheEntry->ttlDays = cacheEntry->ttlDaysDirty; cacheEntry->changeTimeMs = cacheEntry->changeTimeMsDirty; } else if (pEntry->type == ENTRY_TYPE_DELETE) { code = tdbTbDelete(pTtlMgr->pTtlIdx, &ttlKey, sizeof(ttlKey), pTxn); - if (TSDB_CODE_SUCCESS != code) { + if (TSDB_CODE_SUCCESS != code && TSDB_CODE_NOT_FOUND != code) { metaError("%s, ttlMgr flush failed to delete since %s", pTtlMgr->logPrefix, tstrerror(code)); - goto _out; + continue; } code = taosHashRemove(pTtlMgr->pTtlCache, pUid, sizeof(*pUid)); if (TSDB_CODE_SUCCESS != code) { metaError("%s, ttlMgr flush failed to remove cache since %s", pTtlMgr->logPrefix, tstrerror(code)); - goto _out; + continue; } } else { metaError("%s, ttlMgr flush failed, unknown type: %d", pTtlMgr->logPrefix, pEntry->type); - goto _out; + continue; } metaDebug("isdel:%d", pEntry->type == ENTRY_TYPE_DELETE); @@ -471,11 +470,18 @@ int32_t ttlMgrFlush(STtlManger *pTtlMgr, TXN *pTxn) { code = taosHashRemove(pTtlMgr->pDirtyUids, pUid, sizeof(*pUid)); if (TSDB_CODE_SUCCESS != code) { metaError("%s, ttlMgr flush failed to remove dirty uid since %s", pTtlMgr->logPrefix, tstrerror(code)); - goto _out; + continue; } } - taosHashClear(pTtlMgr->pDirtyUids); + int32_t count = taosHashGetSize(pTtlMgr->pDirtyUids); + if (count != 0) { + taosHashClear(pTtlMgr->pDirtyUids); + metaError("%s, ttlMgr flush failed, dirty uids not empty, count: %d", pTtlMgr->logPrefix, count); + code = TSDB_CODE_VND_TTL_FLUSH_INCOMPLETION; + + goto _out; + } code = TSDB_CODE_SUCCESS; diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 396abf21a7..0b6d2674fd 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -57,8 +57,8 @@ TAOS_DEFINE_ERROR(TSDB_CODE_RPC_MAX_SESSIONS, "rpc open too many ses TAOS_DEFINE_ERROR(TSDB_CODE_RPC_NETWORK_ERROR, "rpc network error") TAOS_DEFINE_ERROR(TSDB_CODE_RPC_NETWORK_BUSY, "rpc network busy") TAOS_DEFINE_ERROR(TSDB_CODE_HTTP_MODULE_QUIT, "http-report already quit") -TAOS_DEFINE_ERROR(TSDB_CODE_RPC_MODULE_QUIT, "rpc module already quit") -TAOS_DEFINE_ERROR(TSDB_CODE_RPC_ASYNC_MODULE_QUIT, "rpc async module already quit") +TAOS_DEFINE_ERROR(TSDB_CODE_RPC_MODULE_QUIT, "rpc module already quit") +TAOS_DEFINE_ERROR(TSDB_CODE_RPC_ASYNC_MODULE_QUIT, "rpc async module already quit") //common & util TAOS_DEFINE_ERROR(TSDB_CODE_TIME_UNSYNCED, "Client and server's time is not synchronized") @@ -411,6 +411,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_VND_META_DATA_UNSAFE_DELETE, "Single replica vnode TAOS_DEFINE_ERROR(TSDB_CODE_VND_ARB_NOT_SYNCED, "Vgroup peer is not synced") TAOS_DEFINE_ERROR(TSDB_CODE_VND_WRITE_DISABLED, "Vnode write is disabled for snapshot") TAOS_DEFINE_ERROR(TSDB_CODE_VND_COLUMN_COMPRESS_ALREADY_EXIST,"Same with old param") +TAOS_DEFINE_ERROR(TSDB_CODE_VND_TTL_FLUSH_INCOMPLETION, "Failed to flush all ttl modification to tdb") // tsdb