From e34da43e38220d01d70faf09c4db0490fe4e8be5 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 23 Oct 2023 19:32:44 +0800 Subject: [PATCH 01/31] feat: support pipelining of snap replication --- include/util/tdef.h | 2 + source/libs/sync/inc/syncInt.h | 4 +- source/libs/sync/inc/syncSnapshot.h | 19 ++- source/libs/sync/src/syncSnapshot.c | 251 +++++++++++++++++++++------- 4 files changed, 215 insertions(+), 61 deletions(-) diff --git a/include/util/tdef.h b/include/util/tdef.h index 7f8fe22340..9ea6240b1f 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -294,6 +294,8 @@ typedef enum ELogicConditionType { #define TSDB_SYNC_APPLYQ_SIZE_LIMIT 512 #define TSDB_SYNC_NEGOTIATION_WIN 512 +#define TSDB_SYNC_SNAP_BUFFER_SIZE 2048 + #define TSDB_TBNAME_COLUMN_INDEX (-1) #define TSDB_MULTI_TABLEMETA_MAX_NUM 100000 // maximum batch size allowed to load table meta diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index cec1a12024..637c18e97d 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -249,8 +249,8 @@ int32_t syncNodeOnRequestVote(SSyncNode* pNode, const SRpcMsg* pMsg); int32_t syncNodeOnRequestVoteReply(SSyncNode* pNode, const SRpcMsg* pMsg); int32_t syncNodeOnAppendEntries(SSyncNode* pNode, const SRpcMsg* pMsg); int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pMsg); -int32_t syncNodeOnSnapshot(SSyncNode* ths, const SRpcMsg* pMsg); -int32_t syncNodeOnSnapshotRsp(SSyncNode* ths, const SRpcMsg* pMsg); +int32_t syncNodeOnSnapshot(SSyncNode* ths, SRpcMsg* pMsg); +int32_t syncNodeOnSnapshotRsp(SSyncNode* ths, SRpcMsg* pMsg); int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pMsg); int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pMsg); int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pMsg); diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index 95382132b5..0332204769 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -30,6 +30,15 @@ extern "C" { #define SYNC_SNAPSHOT_RETRY_MS 5000 +typedef struct SSyncSnapBuffer { + void *entries[TSDB_SYNC_SNAP_BUFFER_SIZE]; + int64_t start; + int64_t cursor; + int64_t end; + int64_t size; + TdThreadMutex mutex; +} SSyncSnapBuffer; + typedef struct SSyncSnapshotSender { int8_t start; int32_t seq; @@ -47,6 +56,9 @@ typedef struct SSyncSnapshotSender { int64_t lastSendTime; bool finish; + // buffer + SSyncSnapBuffer *pSndBuf; + // init when create SSyncNode *pSyncNode; int32_t replicaIndex; @@ -72,6 +84,9 @@ typedef struct SSyncSnapshotReceiver { SSnapshotParam snapshotParam; SSnapshot snapshot; + // buffer + SSyncSnapBuffer *pRcvBuf; + // init when create SSyncNode *pSyncNode; } SSyncSnapshotReceiver; @@ -83,8 +98,8 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver); bool snapshotReceiverIsStart(SSyncSnapshotReceiver *pReceiver); // on message -int32_t syncNodeOnSnapshot(SSyncNode *ths, const SRpcMsg *pMsg); -int32_t syncNodeOnSnapshotRsp(SSyncNode *ths, const SRpcMsg *pMsg); +// int32_t syncNodeOnSnapshot(SSyncNode *ths, const SRpcMsg *pMsg); +// int32_t syncNodeOnSnapshotRsp(SSyncNode *ths, const SRpcMsg *pMsg); SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode *pSyncNode, SyncIndex snapshotLastApplyIndex); diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 924813eb98..43726307e6 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -23,6 +23,42 @@ #include "syncReplication.h" #include "syncUtil.h" +static void syncSnapBufferReset(SSyncSnapBuffer *pBuf) { + taosThreadMutexLock(&pBuf->mutex); + for (int64_t i = pBuf->start; i < pBuf->end; ++i) { + rpcFreeCont(pBuf->entries[i % pBuf->size]); + pBuf->entries[i % pBuf->size] = NULL; + } + pBuf->start = 1; + pBuf->end = 1; + pBuf->cursor = 0; + taosThreadMutexUnlock(&pBuf->mutex); +} + +static void syncSnapBufferDestroy(SSyncSnapBuffer **ppBuf) { + if (ppBuf == NULL || ppBuf[0] == NULL) return; + SSyncSnapBuffer *pBuf = ppBuf[0]; + + syncSnapBufferReset(pBuf); + + taosThreadMutexDestroy(&pBuf->mutex); + taosMemoryFree(ppBuf[0]); + ppBuf[0] = NULL; + return; +} + +static SSyncSnapBuffer *syncSnapBufferCreate() { + SSyncSnapBuffer *pBuf = taosMemoryCalloc(1, sizeof(SSyncSnapBuffer)); + if (pBuf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + pBuf->size = sizeof(pBuf->entries) / sizeof(void *); + ASSERT(pBuf->size == TSDB_SYNC_SNAP_BUFFER_SIZE); + taosThreadMutexInit(&pBuf->mutex, NULL); + return pBuf; +} + SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaIndex) { bool condition = (pSyncNode->pFsm->FpSnapshotStartRead != NULL) && (pSyncNode->pFsm->FpSnapshotStopRead != NULL) && (pSyncNode->pFsm->FpSnapshotDoRead != NULL); @@ -49,6 +85,14 @@ SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaI pSender->pSyncNode->pFsm->FpGetSnapshotInfo(pSender->pSyncNode->pFsm, &pSender->snapshot); pSender->finish = false; + pSender->pSndBuf = syncSnapBufferCreate(); + if (pSender->pSndBuf == NULL) { + taosMemoryFree(pSender); + pSender = NULL; + return NULL; + } + syncSnapBufferReset(pSender->pSndBuf); + return pSender; } @@ -67,6 +111,10 @@ void snapshotSenderDestroy(SSyncSnapshotSender *pSender) { pSender->pReader = NULL; } + // free snap buffer + if (pSender->pSndBuf) { + syncSnapBufferDestroy(&pSender->pSndBuf); + } // free sender taosMemoryFree(pSender); } @@ -181,6 +229,8 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { pSender->pCurrentBlock = NULL; pSender->blockLen = 0; } + + syncSnapBufferReset(pSender->pSndBuf); } // when sender receive ack, call this function to send msg from seq @@ -193,6 +243,8 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { pSender->blockLen = 0; } + pSender->seq++; + // read data int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, &pSender->pCurrentBlock, &pSender->blockLen); @@ -362,6 +414,14 @@ SSyncSnapshotReceiver *snapshotReceiverCreate(SSyncNode *pSyncNode, SRaftId from pReceiver->snapshot.lastApplyTerm = 0; pReceiver->snapshot.lastConfigIndex = SYNC_INDEX_INVALID; + pReceiver->pRcvBuf = syncSnapBufferCreate(); + if (pReceiver->pRcvBuf == NULL) { + taosMemoryFree(pReceiver); + pReceiver = NULL; + return NULL; + } + + syncSnapBufferReset(pReceiver->pRcvBuf); return pReceiver; } @@ -389,6 +449,11 @@ void snapshotReceiverDestroy(SSyncSnapshotReceiver *pReceiver) { pReceiver->snapshot.data = NULL; } + // free snap buf + if (pReceiver->pRcvBuf) { + syncSnapBufferDestroy(&pReceiver->pRcvBuf); + } + // free receiver taosMemoryFree(pReceiver); } @@ -472,6 +537,8 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { } else { sRInfo(pReceiver, "snapshot receiver stop, writer is null"); } + + syncSnapBufferReset(pReceiver->pRcvBuf); } // when recv last snapshot block, apply data into snapshot @@ -765,29 +832,8 @@ _SEND_REPLY: return code; } -static int32_t syncNodeOnSnapshotReceive(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) { - // condition 4 - // transfering - SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; - int64_t timeNow = taosGetTimestampMs(); - int32_t code = 0; - - if (snapshotReceiverSignatureCmp(pReceiver, pMsg) != 0) { - terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; - sRError(pReceiver, "failed to receive snapshot data since %s.", terrstr()); - code = terrno; - goto _SEND_REPLY; - } - - if (snapshotReceiverGotData(pReceiver, pMsg) != 0) { - code = terrno; - if (code >= SYNC_SNAPSHOT_SEQ_INVALID) { - code = TSDB_CODE_SYN_INTERNAL_ERROR; - } - } - -_SEND_REPLY:; - +static int32_t syncSnapSendRsp(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pMsg, int32_t code) { + SSyncNode *pSyncNode = pReceiver->pSyncNode; // build msg SRpcMsg rpcMsg = {0}; if (syncBuildSnapshotSendRsp(&rpcMsg, 0, pSyncNode->vgId)) { @@ -811,10 +857,76 @@ _SEND_REPLY:; sRError(pReceiver, "failed to send snapshot receiver resp since %s", terrstr()); return -1; } + return 0; +} +static int32_t syncSnapBufferRecv(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend **ppMsg) { + int32_t code = 0; + SSyncSnapBuffer *pRcvBuf = pReceiver->pRcvBuf; + SyncSnapshotSend *pMsg = ppMsg[0]; + terrno = TSDB_CODE_SUCCESS; + + taosThreadMutexLock(&pRcvBuf->mutex); + + if (pMsg->seq - pRcvBuf->start >= pRcvBuf->size) { + terrno = TSDB_CODE_SYN_BUFFER_FULL; + code = terrno; + goto _out; + } + + ASSERT(pRcvBuf->start <= pRcvBuf->cursor + 1 && pRcvBuf->cursor < pRcvBuf->end); + + if (pMsg->seq > pRcvBuf->cursor) { + pRcvBuf->entries[pMsg->seq % pRcvBuf->size] = pMsg; + ppMsg[0] = NULL; + pRcvBuf->end = TMAX(pMsg->seq + 1, pRcvBuf->end); + } + + for (int64_t seq = pRcvBuf->cursor + 1; seq < pRcvBuf->end; ++seq) { + if (pRcvBuf->entries[seq]) { + pRcvBuf->cursor = seq; + } else { + break; + } + } + + for (int64_t seq = pRcvBuf->start; seq <= pRcvBuf->cursor; ++seq) { + if (snapshotReceiverGotData(pReceiver, pRcvBuf->entries[seq % pRcvBuf->size]) != 0) { + code = terrno; + if (code >= SYNC_SNAPSHOT_SEQ_INVALID) { + code = TSDB_CODE_SYN_INTERNAL_ERROR; + } + } + pRcvBuf->start = seq + 1; + syncSnapSendRsp(pReceiver, pRcvBuf->entries[seq % pRcvBuf->size], code); + rpcFreeCont(pRcvBuf->entries[seq % pRcvBuf->size]); + pRcvBuf->entries[seq % pRcvBuf->size] = NULL; + if (code) goto _out; + } + +_out: + taosThreadMutexUnlock(&pRcvBuf->mutex); return code; } +static int32_t syncNodeOnSnapshotReceive(SSyncNode *pSyncNode, SyncSnapshotSend **ppMsg) { + // condition 4 + // transfering + SyncSnapshotSend *pMsg = ppMsg[0]; + ASSERT(pMsg); + SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; + int64_t timeNow = taosGetTimestampMs(); + int32_t code = 0; + + if (snapshotReceiverSignatureCmp(pReceiver, pMsg) != 0) { + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + sRError(pReceiver, "failed to receive snapshot data since %s.", terrstr()); + return syncSnapSendRsp(pReceiver, pMsg, terrno); + } + + return syncSnapBufferRecv(pReceiver, ppMsg); +} + static int32_t syncNodeOnSnapshotEnd(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) { // condition 2 // end, finish FSM @@ -885,8 +997,10 @@ _SEND_REPLY:; // // condition 5, got data, update ack // -int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { - SyncSnapshotSend *pMsg = pRpcMsg->pCont; +int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { + SyncSnapshotSend **ppMsg = (SyncSnapshotSend **)&pRpcMsg->pCont; + SyncSnapshotSend *pMsg = ppMsg[0]; + ASSERT(pMsg); SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; // if already drop replica, do not process @@ -935,9 +1049,9 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { // force close, no response syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process force stop"); snapshotReceiverStop(pReceiver); - } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { + } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq <= SYNC_SNAPSHOT_SEQ_END) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq data"); - code = syncNodeOnSnapshotReceive(pSyncNode, pMsg); + code = syncNodeOnSnapshotReceive(pSyncNode, ppMsg); } else { // error log sRError(pReceiver, "snapshot receiver recv error seq:%d, my ack:%d", pMsg->seq, pReceiver->ack); @@ -1038,14 +1152,62 @@ static int32_t snapshotSenderSignatureCmp(SSyncSnapshotSender *pSender, SyncSnap return 0; } +static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp **ppMsg) { + int32_t code = 0; + SSyncSnapBuffer *pSndBuf = pSender->pSndBuf; + SyncSnapshotRsp *pMsg = ppMsg[0]; + + taosThreadMutexLock(&pSndBuf->mutex); + + if (pMsg->ack - pSndBuf->start >= pSndBuf->size) { + terrno = TSDB_CODE_SYN_BUFFER_FULL; + code = terrno; + goto _out; + } + + ASSERT(pSndBuf->start <= pSndBuf->cursor + 1 && pSndBuf->cursor < pSndBuf->end); + + if (pMsg->ack > pSndBuf->cursor && pSndBuf->entries[pMsg->ack % pSndBuf->size] == NULL) { + pSndBuf->entries[pMsg->ack % pSndBuf->size] = pMsg; + ppMsg[0] = NULL; + pSndBuf->end = TMAX(pMsg->ack + 1, pSndBuf->end); + } + + for (int64_t ack = pSndBuf->cursor + 1; ack < pSndBuf->end; ++ack) { + if (pSndBuf->entries[ack % pSndBuf->size]) { + pSndBuf->cursor = ack; + } else { + break; + } + } + + for (int64_t ack = pSndBuf->start; ack < pSndBuf->cursor; ++ack) { + rpcFreeCont(pSndBuf->entries[ack % pSndBuf->size]); + pSndBuf->entries[ack % pSndBuf->size] = NULL; + pSndBuf->start = ack + 1; + } + + while (pSender->seq - pSndBuf->start < (pSndBuf->size >> 2)) { + if (snapshotSend(pSender) != 0) { + code = terrno; + goto _out; + } + } + +_out: + taosThreadMutexUnlock(&pSndBuf->mutex); + return code; +} + // sender on message // // condition 1 sender receives SYNC_SNAPSHOT_SEQ_END, close sender // condition 2 sender receives ack, set seq = ack + 1, send msg from seq // condition 3 sender receives error msg, just print error log // -int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { - SyncSnapshotRsp *pMsg = pRpcMsg->pCont; +int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { + SyncSnapshotRsp **ppMsg = (SyncSnapshotRsp **)&pRpcMsg->pCont; + SyncSnapshotRsp *pMsg = ppMsg[0]; // if already drop replica, do not process if (!syncNodeInRaftGroup(pSyncNode, &pMsg->srcId)) { @@ -1123,12 +1285,8 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { if (pMsg->ack == SYNC_SNAPSHOT_SEQ_BEGIN) { syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq begin"); - if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { - return -1; - } - if (snapshotSend(pSender) != 0) { - return -1; + goto _ERROR; } return 0; } @@ -1142,30 +1300,9 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { } // send next msg - if (pMsg->ack == pSender->seq) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq data"); - // update sender ack - if (snapshotSenderUpdateProgress(pSender, pMsg) != 0) { - return -1; - } - if (snapshotSend(pSender) != 0) { - return -1; - } - } else if (pMsg->ack == pSender->seq - 1) { - // maybe resend - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq and resend"); - if (snapshotReSend(pSender) != 0) { - return -1; - } - } else { - // error log - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "receive error ack"); - sSError(pSender, "snapshot sender receive error ack:%d, my seq:%d", pMsg->ack, pSender->seq); - snapshotSenderStop(pSender, true); - syncNodeReplicateReset(pSyncNode, &pMsg->srcId); - return -1; + if (syncSnapBufferSend(pSender, ppMsg) != 0) { + goto _ERROR; } - return 0; _ERROR: From f444cd7a6d7b813c09bfc3f6a910189479457e08 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 10:15:39 +0800 Subject: [PATCH 02/31] enh: log signature of snap sender/receiver while started or stopped --- source/libs/sync/src/syncSnapshot.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 43726307e6..3017f4e33c 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -175,7 +175,7 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { SyncSnapshotSend *pMsg = rpcMsg.pCont; pMsg->srcId = pSender->pSyncNode->myRaftId; pMsg->destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - pMsg->term = raftStoreGetTerm(pSender->pSyncNode); + pMsg->term = pSender->term; pMsg->beginIndex = pSender->snapshotParam.start; pMsg->lastIndex = pSender->snapshot.lastApplyIndex; pMsg->lastTerm = pSender->snapshot.lastApplyTerm; @@ -189,15 +189,15 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { memcpy(pMsg->data, snapInfo.data, dataLen); } - // event log - syncLogSendSyncSnapshotSend(pSender->pSyncNode, pMsg, "snapshot sender start"); - // send msg if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { sSError(pSender, "snapshot sender send msg failed since %s", terrstr()); goto _out; } + sSInfo(pSender, "snapshot sender started. signature:(%" PRId64 ", %" PRId64 "), to dnode:%d", pSender->term, + pSender->startTime, DID(&pMsg->destId)); + code = 0; _out: if (snapInfo.data) { @@ -231,6 +231,10 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { } syncSnapBufferReset(pSender->pSndBuf); + + SRaftId destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; + sSInfo(pSender, "snapshot sender stopped. signature:(%" PRId64 ", %" PRId64 "), to dnode:%d", pSender->term, + pSender->startTime, DID(&destId)); } // when sender receive ack, call this function to send msg from seq @@ -515,14 +519,14 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p pReceiver->startTime = pPreMsg->startTime; ASSERT(pReceiver->startTime); - // event log - sRInfo(pReceiver, "snapshot receiver is start"); + sRInfo(pReceiver, "snapshot receiver started. signature:(%" PRId64 ", %" PRId64 "), from dnode:%d", pReceiver->term, + pReceiver->startTime, DID(&pReceiver->fromId)); } // just set start = false // FpSnapshotStopWrite should not be called void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { - sRInfo(pReceiver, "snapshot receiver stop, not apply, writer:%p", pReceiver->pWriter); + sRDebug(pReceiver, "snapshot receiver stop, not apply, writer:%p", pReceiver->pWriter); int8_t stopped = !atomic_val_compare_exchange_8(&pReceiver->start, true, false); if (stopped) return; @@ -539,6 +543,9 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { } syncSnapBufferReset(pReceiver->pRcvBuf); + + sRInfo(pReceiver, "snapshot receiver stopped. signature:(%" PRId64 ", %" PRId64 "), from dnode:%d", pReceiver->term, + pReceiver->startTime, DID(&pReceiver->fromId)); } // when recv last snapshot block, apply data into snapshot From 1c60e67a8380813bb78f1cde4f86dce03d101de8 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 14:16:46 +0800 Subject: [PATCH 03/31] enh: send the END snap msg at last --- source/libs/sync/src/syncSnapshot.c | 52 ++++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 3017f4e33c..7e556b28f5 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -247,25 +247,28 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { pSender->blockLen = 0; } - pSender->seq++; + if (pSender->seq != SYNC_SNAPSHOT_SEQ_END) { + pSender->seq++; - // read data - int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, - &pSender->pCurrentBlock, &pSender->blockLen); - if (ret != 0) { - sSError(pSender, "snapshot sender read failed since %s", terrstr()); - return -1; - } + // read data + int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, + &pSender->pCurrentBlock, &pSender->blockLen); + if (ret != 0) { + sSError(pSender, "snapshot sender read failed since %s", terrstr()); + return -1; + } - if (pSender->blockLen > 0) { - // has read data - sSDebug(pSender, "vgId:%d, snapshot sender continue to read, blockLen:%d seq:%d", pSender->pSyncNode->vgId, - pSender->blockLen, pSender->seq); - } else { - // read finish, update seq to end - pSender->seq = SYNC_SNAPSHOT_SEQ_END; - sSInfo(pSender, "vgId:%d, snapshot sender read to the end, blockLen:%d seq:%d", pSender->pSyncNode->vgId, - pSender->blockLen, pSender->seq); + if (pSender->blockLen > 0) { + // has read data + sSDebug(pSender, "vgId:%d, snapshot sender continue to read, blockLen:%d seq:%d", pSender->pSyncNode->vgId, + pSender->blockLen, pSender->seq); + } else { + // read finish, update seq to end + pSender->seq = SYNC_SNAPSHOT_SEQ_END; + sSInfo(pSender, "vgId:%d, snapshot sender read to the end, blockLen:%d seq:%d", pSender->pSyncNode->vgId, + pSender->blockLen, pSender->seq); + return 0; + } } // build msg @@ -1188,19 +1191,28 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp } } - for (int64_t ack = pSndBuf->start; ack < pSndBuf->cursor; ++ack) { + for (int64_t ack = pSndBuf->start; ack <= pSndBuf->cursor; ++ack) { rpcFreeCont(pSndBuf->entries[ack % pSndBuf->size]); pSndBuf->entries[ack % pSndBuf->size] = NULL; pSndBuf->start = ack + 1; } - while (pSender->seq - pSndBuf->start < (pSndBuf->size >> 2)) { + while (pSender->seq != SYNC_SNAPSHOT_SEQ_END && pSender->seq - pSndBuf->start < (pSndBuf->size >> 2)) { + if (snapshotSend(pSender) != 0) { + code = terrno; + goto _out; + } + if (pSender->seq != SYNC_SNAPSHOT_SEQ_END) { + pSndBuf->end = TMAX(pSender->seq + 1, pSndBuf->end); + } + } + + if (pSender->seq == SYNC_SNAPSHOT_SEQ_END && pSndBuf->end <= pSndBuf->start) { if (snapshotSend(pSender) != 0) { code = terrno; goto _out; } } - _out: taosThreadMutexUnlock(&pSndBuf->mutex); return code; From 811f1bbbea25bbbd2fc470f7cd80919b71fb8dc2 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 15:47:47 +0800 Subject: [PATCH 04/31] enh: tidy up logging msg in syncNodeOnSnapshotRsp --- source/libs/sync/src/syncSnapshot.c | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 7e556b28f5..195354ac79 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -195,8 +195,8 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { goto _out; } - sSInfo(pSender, "snapshot sender started. signature:(%" PRId64 ", %" PRId64 "), to dnode:%d", pSender->term, - pSender->startTime, DID(&pMsg->destId)); + sSInfo(pSender, "snapshot sender start to dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", DID(&pMsg->destId), + pSender->term, pSender->startTime); code = 0; _out: @@ -233,8 +233,8 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { syncSnapBufferReset(pSender->pSndBuf); SRaftId destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - sSInfo(pSender, "snapshot sender stopped. signature:(%" PRId64 ", %" PRId64 "), to dnode:%d", pSender->term, - pSender->startTime, DID(&destId)); + sSInfo(pSender, "snapshot sender stop to dnode:%d. signature:(%" PRId64 ", %" PRId64 "), finish:%d", DID(&destId), + pSender->term, pSender->startTime, finish); } // when sender receive ack, call this function to send msg from seq @@ -522,8 +522,8 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p pReceiver->startTime = pPreMsg->startTime; ASSERT(pReceiver->startTime); - sRInfo(pReceiver, "snapshot receiver started. signature:(%" PRId64 ", %" PRId64 "), from dnode:%d", pReceiver->term, - pReceiver->startTime, DID(&pReceiver->fromId)); + sRInfo(pReceiver, "snapshot receiver start from dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", + DID(&pReceiver->fromId), pReceiver->term, pReceiver->startTime); } // just set start = false @@ -547,8 +547,8 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { syncSnapBufferReset(pReceiver->pRcvBuf); - sRInfo(pReceiver, "snapshot receiver stopped. signature:(%" PRId64 ", %" PRId64 "), from dnode:%d", pReceiver->term, - pReceiver->startTime, DID(&pReceiver->fromId)); + sRInfo(pReceiver, "snapshot receiver stop from dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", + DID(&pReceiver->fromId), pReceiver->term, pReceiver->startTime); } // when recv last snapshot block, apply data into snapshot @@ -1266,7 +1266,6 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // state, term, seq/ack if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender not leader"); sSError(pSender, "snapshot sender not leader"); terrno = TSDB_CODE_SYN_NOT_LEADER; goto _ERROR; @@ -1274,15 +1273,13 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { SyncTerm currentTerm = raftStoreGetTerm(pSyncNode); if (pMsg->term != currentTerm) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender and receiver term not match"); - sSError(pSender, "snapshot sender term not equal, msg term:%" PRId64 " currentTerm:%" PRId64, pMsg->term, + sSError(pSender, "snapshot sender term mismatch, msg term:%" PRId64 " currentTerm:%" PRId64, pMsg->term, currentTerm); terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; goto _ERROR; } if (pMsg->code != 0) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "receive error code"); sSError(pSender, "snapshot sender receive error:%s 0x%x and stop sender", tstrerror(pMsg->code), pMsg->code); terrno = pMsg->code; goto _ERROR; @@ -1290,12 +1287,10 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // prepare , send begin msg if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq pre-snapshot"); return syncNodeOnSnapshotPrepRsp(pSyncNode, pSender, pMsg); } if (pSender->pReader == NULL || pSender->finish) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender invalid"); sSError(pSender, "snapshot sender invalid error:%s 0x%x, pReader:%p finish:%d", tstrerror(pMsg->code), pMsg->code, pSender->pReader, pSender->finish); terrno = pMsg->code; @@ -1303,7 +1298,7 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { } if (pMsg->ack == SYNC_SNAPSHOT_SEQ_BEGIN) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq begin"); + sSInfo(pSender, "process seq begin"); if (snapshotSend(pSender) != 0) { goto _ERROR; } @@ -1312,7 +1307,7 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // receive ack is finish, close sender if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "process seq end"); + sSInfo(pSender, "process seq end"); snapshotSenderStop(pSender, true); syncNodeReplicateReset(pSyncNode, &pMsg->srcId); return 0; @@ -1320,12 +1315,13 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // send next msg if (syncSnapBufferSend(pSender, ppMsg) != 0) { + sSError(pSender, "failed to send snapshot msg since %s. seq:%d", terrstr(), pSender->seq); goto _ERROR; } return 0; _ERROR: - snapshotSenderStop(pSender, true); + snapshotSenderStop(pSender, false); syncNodeReplicateReset(pSyncNode, &pMsg->srcId); return -1; } From 4d61e87c0f66fddd0ae3946b553772b8530fc044 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 17:11:32 +0800 Subject: [PATCH 05/31] enh: add signature info in logging msgs of snap replication --- source/libs/sync/src/syncSnapshot.c | 56 +++++++++++++---------------- source/libs/sync/src/syncUtil.c | 51 +++++++++++++------------- 2 files changed, 49 insertions(+), 58 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 195354ac79..ad782bd51d 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -195,8 +195,7 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { goto _out; } - sSInfo(pSender, "snapshot sender start to dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", DID(&pMsg->destId), - pSender->term, pSender->startTime); + sSInfo(pSender, "snapshot sender start, to dnode:%d.", DID(&pMsg->destId)); code = 0; _out: @@ -233,8 +232,7 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { syncSnapBufferReset(pSender->pSndBuf); SRaftId destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - sSInfo(pSender, "snapshot sender stop to dnode:%d. signature:(%" PRId64 ", %" PRId64 "), finish:%d", DID(&destId), - pSender->term, pSender->startTime, finish); + sSInfo(pSender, "snapshot sender stop, to dnode:%d, finish:%d", DID(&destId), finish); } // when sender receive ack, call this function to send msg from seq @@ -522,8 +520,7 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p pReceiver->startTime = pPreMsg->startTime; ASSERT(pReceiver->startTime); - sRInfo(pReceiver, "snapshot receiver start from dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", - DID(&pReceiver->fromId), pReceiver->term, pReceiver->startTime); + sRInfo(pReceiver, "snapshot receiver start, from dnode:%d.", DID(&pReceiver->fromId)); } // just set start = false @@ -547,8 +544,7 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { syncSnapBufferReset(pReceiver->pRcvBuf); - sRInfo(pReceiver, "snapshot receiver stop from dnode:%d. signature:(%" PRId64 ", %" PRId64 ")", - DID(&pReceiver->fromId), pReceiver->term, pReceiver->startTime); + sRInfo(pReceiver, "snapshot receiver stop, from dnode:%d.", DID(&pReceiver->fromId)); } // when recv last snapshot block, apply data into snapshot @@ -676,22 +672,22 @@ static int32_t syncNodeOnSnapshotPrep(SSyncNode *pSyncNode, SyncSnapshotSend *pM int32_t order = 0; if ((order = snapshotReceiverSignatureCmp(pReceiver, pMsg)) < 0) { sRInfo(pReceiver, - "received a new snapshot preparation. restart receiver" - "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", - pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); + "received a new snapshot preparation. restart receiver." + " msg signature:(%" PRId64 ", %" PRId64 ")", + pMsg->term, pMsg->startTime); goto _START_RECEIVER; } else if (order == 0) { sRInfo(pReceiver, - "received a duplicate snapshot preparation. send reply" - "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", - pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); + "received a duplicate snapshot preparation. send reply." + " msg signature:(%" PRId64 ", %" PRId64 ")", + pMsg->term, pMsg->startTime); goto _SEND_REPLY; } else { // ignore sRError(pReceiver, - "received a stale snapshot preparation. ignore" - "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", - pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); + "received a stale snapshot preparation. ignore." + " msg signature:(%" PRId64 ", %" PRId64 ")", + pMsg->term, pMsg->startTime); terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; code = terrno; goto _SEND_REPLY; @@ -809,6 +805,8 @@ static int32_t syncNodeOnSnapshotBegin(SSyncNode *pSyncNode, SyncSnapshotSend *p goto _SEND_REPLY; } + sRInfo(pReceiver, "snapshot begin"); + code = 0; _SEND_REPLY: if (code != 0 && terrno != 0) { @@ -1009,8 +1007,7 @@ _SEND_REPLY:; // int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { SyncSnapshotSend **ppMsg = (SyncSnapshotSend **)&pRpcMsg->pCont; - SyncSnapshotSend *pMsg = ppMsg[0]; - ASSERT(pMsg); + SyncSnapshotSend *pMsg = ppMsg[0]; SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; // if already drop replica, do not process @@ -1040,16 +1037,16 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) { if (pMsg->term == raftStoreGetTerm(pSyncNode)) { if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { - sInfo("vgId:%d, receive pre-snapshot msg of snapshot replication. signature:(%" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, receive prepare msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotPrep(pSyncNode, pMsg); } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_BEGIN) { - sInfo("vgId:%d, receive begin msg of snapshot replication. signature:(%" PRId64 ", %" PRId64 ")", + sInfo("vgId:%d, receive begin msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotBegin(pSyncNode, pMsg); } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) { - sInfo("vgId:%d, receive end msg of snapshot replication. signature: (%" PRId64 ", %" PRId64 ")", - pSyncNode->vgId, pMsg->term, pMsg->startTime); + sInfo("vgId:%d, receive end msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, + pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotEnd(pSyncNode, pMsg); if (syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode) != 0) { sRError(pReceiver, "failed to reinit log buffer since %s", terrstr()); @@ -1059,7 +1056,7 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // force close, no response syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process force stop"); snapshotReceiverStop(pReceiver); - } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq <= SYNC_SNAPSHOT_SEQ_END) { + } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq data"); code = syncNodeOnSnapshotReceive(pSyncNode, ppMsg); } else { @@ -1139,10 +1136,7 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend pSendMsg->startTime = pSender->startTime; pSendMsg->seq = SYNC_SNAPSHOT_SEQ_BEGIN; - ASSERT(pSendMsg->startTime); - - sSInfo(pSender, "begin snapshot replication to dnode %d. startTime:%" PRId64, DID(&pSendMsg->destId), - pSendMsg->startTime); + sSInfo(pSender, "begin snapshot replication to dnode %d." PRId64, DID(&pSendMsg->destId)); // send msg syncLogSendSyncSnapshotSend(pSyncNode, pSendMsg, "snapshot sender reply pre"); @@ -1252,10 +1246,8 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // check signature int32_t order = 0; if ((order = snapshotSenderSignatureCmp(pSender, pMsg)) > 0) { - sSError(pSender, - "received a stale snapshot rsp. ignore it" - "sender signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", - pSender->term, pSender->startTime, pMsg->term, pMsg->startTime); + sSError(pSender, "received a stale snapshot rsp, msg signature:(%" PRId64 ", %" PRId64 "), ignore it.", pMsg->term, + pMsg->startTime); terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } else if (order < 0) { diff --git a/source/libs/sync/src/syncUtil.c b/source/libs/sync/src/syncUtil.c index 9acc17e130..dc16b0e958 100644 --- a/source/libs/sync/src/syncUtil.c +++ b/source/libs/sync/src/syncUtil.c @@ -266,22 +266,21 @@ void syncPrintSnapshotSenderLog(const char* flags, ELogLevel level, int32_t dfla int32_t writeLen = vsnprintf(eventLog, sizeof(eventLog), format, argpointer); va_end(argpointer); - taosPrintLog(flags, level, dflag, - "vgId:%d, %s, sync:%s, snap-sender:{%p start:%" PRId64 " end:%" PRId64 " last-index:%" PRId64 - " last-term:%" PRIu64 " last-cfg:%" PRId64 - ", seq:%d ack:%d finish:%d, as:%d dnode:%d}" - ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 - ", min-match:%" PRId64 ", snap:{last-index:%" PRId64 ", term:%" PRIu64 - "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 - ", chging:%d, restore:%d, quorum:%d, lc-timer:{elect:%" PRId64 ", hb:%" PRId64 "}, peer:%s, cfg:%s", - pNode->vgId, eventLog, syncStr(pNode->state), pSender, pSender->snapshotParam.start, - pSender->snapshotParam.end, pSender->snapshot.lastApplyIndex, pSender->snapshot.lastApplyTerm, - pSender->snapshot.lastConfigIndex, pSender->seq, pSender->ack, pSender->finish, pSender->replicaIndex, - DID(&pNode->replicasId[pSender->replicaIndex]), raftStoreGetTerm(pNode), pNode->commitIndex, - logBeginIndex, logLastIndex, pNode->minMatchIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm, - pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, pNode->replicaNum, pNode->raftCfg.lastConfigIndex, - pNode->changing, pNode->restoreFinish, syncNodeDynamicQuorum(pNode), pNode->electTimerLogicClock, - pNode->heartbeatTimerLogicClockUser, peerStr, cfgStr); + taosPrintLog( + flags, level, dflag, + "vgId:%d, %s, sync:%s, snap-sender:%p signature:(%" PRId64 ", %" PRId64 "), {start:%" PRId64 " end:%" PRId64 + " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 + ", seq:%d ack:%d finish:%d, as:%d, to-dnode:%d}" + ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 ", min-match:%" PRId64 + ", snap:{last-index:%" PRId64 ", term:%" PRIu64 "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 + ", chging:%d, restore:%d, quorum:%d, peer:%s, cfg:%s", + pNode->vgId, eventLog, syncStr(pNode->state), pSender, pSender->term, pSender->startTime, + pSender->snapshotParam.start, pSender->snapshotParam.end, pSender->snapshot.lastApplyIndex, + pSender->snapshot.lastApplyTerm, pSender->snapshot.lastConfigIndex, pSender->seq, pSender->ack, pSender->finish, + pSender->replicaIndex, DID(&pNode->replicasId[pSender->replicaIndex]), raftStoreGetTerm(pNode), + pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, snapshot.lastApplyIndex, + snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, pNode->replicaNum, + pNode->raftCfg.lastConfigIndex, pNode->changing, pNode->restoreFinish, pNode->quorum, peerStr, cfgStr); } void syncPrintSnapshotReceiverLog(const char* flags, ELogLevel level, int32_t dflag, SSyncSnapshotReceiver* pReceiver, @@ -316,19 +315,19 @@ void syncPrintSnapshotReceiverLog(const char* flags, ELogLevel level, int32_t df taosPrintLog( flags, level, dflag, "vgId:%d, %s, sync:%s," - " snap-receiver:{%p started:%d acked:%d term:%" PRIu64 " start-time:%" PRId64 " from-dnode:%d, start:%" PRId64 - " end:%" PRId64 " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 + " snap-receiver:%p signature:(%" PRId64 ", %" PRId64 "), {start:%d ack:%d term:%" PRIu64 " start-time:%" PRId64 + " from-dnode:%d, start:%" PRId64 " end:%" PRId64 " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 "}" ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 ", min-match:%" PRId64 ", snap:{last-index:%" PRId64 ", last-term:%" PRIu64 "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 - ", chging:%d, restore:%d, quorum:%d, lc-timers:{elect:%" PRId64 ", hb:%" PRId64 "}, peer:%s, cfg:%s", - pNode->vgId, eventLog, syncStr(pNode->state), pReceiver, pReceiver->start, pReceiver->ack, pReceiver->term, - pReceiver->startTime, DID(&pReceiver->fromId), pReceiver->snapshotParam.start, pReceiver->snapshotParam.end, - pReceiver->snapshot.lastApplyIndex, pReceiver->snapshot.lastApplyTerm, pReceiver->snapshot.lastConfigIndex, - raftStoreGetTerm(pNode), pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, - snapshot.lastApplyIndex, snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, - pNode->replicaNum, pNode->raftCfg.lastConfigIndex, pNode->changing, pNode->restoreFinish, - syncNodeDynamicQuorum(pNode), pNode->electTimerLogicClock, pNode->heartbeatTimerLogicClockUser, peerStr, cfgStr); + ", chging:%d, restore:%d, quorum:%d, peer:%s, cfg:%s", + pNode->vgId, eventLog, syncStr(pNode->state), pReceiver, pReceiver->term, pReceiver->startTime, pReceiver->start, + pReceiver->ack, pReceiver->term, pReceiver->startTime, DID(&pReceiver->fromId), pReceiver->snapshotParam.start, + pReceiver->snapshotParam.end, pReceiver->snapshot.lastApplyIndex, pReceiver->snapshot.lastApplyTerm, + pReceiver->snapshot.lastConfigIndex, raftStoreGetTerm(pNode), pNode->commitIndex, logBeginIndex, logLastIndex, + pNode->minMatchIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, + pNode->raftCfg.batchSize, pNode->replicaNum, pNode->raftCfg.lastConfigIndex, pNode->changing, + pNode->restoreFinish, pNode->quorum, peerStr, cfgStr); } void syncLogRecvTimer(SSyncNode* pSyncNode, const SyncTimeout* pMsg, const char* s) { From 95188ab8c387d10ad54c7a58a8c4264dc3274864 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 17:42:17 +0800 Subject: [PATCH 06/31] enh: adjust some logging msgs for sync log replication mgr --- source/libs/sync/src/syncPipeline.c | 34 ++++++++++++++--------------- source/libs/sync/src/syncSnapshot.c | 8 +++---- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index a7ee37cc3b..f2386797c1 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -797,7 +797,7 @@ _out: pMgr->retryBackoff = syncLogReplGetNextRetryBackoff(pMgr); SSyncLogBuffer* pBuf = pNode->pLogBuf; sInfo("vgId:%d, resend %d sync log entries. dest:%" PRIx64 ", indexes:%" PRId64 " ..., terms: ... %" PRId64 - ", retryWaitMs:%" PRId64 ", mgr: [%" PRId64 " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 + ", retryWaitMs:%" PRId64 ", repl-mgr:[%" PRId64 " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, count, pDestId->addr, firstIndex, term, retryWaitMs, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -815,9 +815,9 @@ int32_t syncLogReplRecover(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEn ASSERT(pMgr->matchIndex == 0); if (pMsg->matchIndex < 0) { pMgr->restored = true; - sInfo("vgId:%d, sync log repl restored. peer: dnode:%d (%" PRIx64 "), mgr: rs(%d) [%" PRId64 " %" PRId64 - ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, DID(&destId), destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + sInfo("vgId:%d, sync log repl restored. peer: dnode:%d (%" PRIx64 "), repl-mgr:[%" PRId64 " %" PRId64 ", %" PRId64 + "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, DID(&destId), destId.addr, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -832,9 +832,9 @@ int32_t syncLogReplRecover(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEn if (pMsg->success && pMsg->matchIndex == pMsg->lastSendIndex) { pMgr->matchIndex = pMsg->matchIndex; pMgr->restored = true; - sInfo("vgId:%d, sync log repl restored. peer: dnode:%d (%" PRIx64 "), mgr: rs(%d) [%" PRId64 " %" PRId64 - ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, DID(&destId), destId.addr, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + sInfo("vgId:%d, sync log repl restored. peer: dnode:%d (%" PRIx64 "), repl-mgr:[%" PRId64 " %" PRId64 ", %" PRId64 + "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, DID(&destId), destId.addr, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -958,10 +958,10 @@ int32_t syncLogReplProbe(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncIndex inde pMgr->endIndex = index + 1; SSyncLogBuffer* pBuf = pNode->pLogBuf; - sTrace("vgId:%d, probe peer:%" PRIx64 " with msg of index:%" PRId64 " term:%" PRId64 ". mgr (rs:%d): [%" PRId64 - " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pDestId->addr, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sTrace("vgId:%d, probe peer:%" PRIx64 " with msg of index:%" PRId64 " term:%" PRId64 ". repl-mgr:[%" PRId64 + " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pDestId->addr, index, term, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, pBuf->startIndex, + pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } @@ -1002,9 +1002,9 @@ int32_t syncLogReplAttempt(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { pMgr->endIndex = index + 1; if (barrier) { - sInfo("vgId:%d, replicated sync barrier to dnode:%d. index:%" PRId64 ", term:%" PRId64 - ", repl mgr: rs(%d) [%" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, DID(pDestId), index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); + sInfo("vgId:%d, replicated sync barrier to dnode:%d. index:%" PRId64 ", term:%" PRId64 ", repl-mgr:[%" PRId64 + " %" PRId64 ", %" PRId64 ")", + pNode->vgId, DID(pDestId), index, term, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); break; } } @@ -1013,10 +1013,10 @@ int32_t syncLogReplAttempt(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { SSyncLogBuffer* pBuf = pNode->pLogBuf; sTrace("vgId:%d, replicated %d msgs to peer:%" PRIx64 ". indexes:%" PRId64 "..., terms: ...%" PRId64 - ", mgr: (rs:%d) [%" PRId64 " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 + ", repl-mgr:[%" PRId64 " %" PRId64 ", %" PRId64 "), buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, count, pDestId->addr, firstIndex, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, - pMgr->endIndex, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + pNode->vgId, count, pDestId->addr, firstIndex, term, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex, + pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); return 0; } diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index ad782bd51d..7e431111f2 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -258,13 +258,11 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { if (pSender->blockLen > 0) { // has read data - sSDebug(pSender, "vgId:%d, snapshot sender continue to read, blockLen:%d seq:%d", pSender->pSyncNode->vgId, - pSender->blockLen, pSender->seq); + sSDebug(pSender, "snapshot sender continue to read, blockLen:%d seq:%d", pSender->blockLen, pSender->seq); } else { // read finish, update seq to end pSender->seq = SYNC_SNAPSHOT_SEQ_END; - sSInfo(pSender, "vgId:%d, snapshot sender read to the end, blockLen:%d seq:%d", pSender->pSyncNode->vgId, - pSender->blockLen, pSender->seq); + sSInfo(pSender, "snapshot sender read to the end, blockLen:%d seq:%d", pSender->blockLen, pSender->seq); return 0; } } @@ -1136,7 +1134,7 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend pSendMsg->startTime = pSender->startTime; pSendMsg->seq = SYNC_SNAPSHOT_SEQ_BEGIN; - sSInfo(pSender, "begin snapshot replication to dnode %d." PRId64, DID(&pSendMsg->destId)); + sSInfo(pSender, "begin snapshot replication to dnode %d.", DID(&pSendMsg->destId)); // send msg syncLogSendSyncSnapshotSend(pSyncNode, pSendMsg, "snapshot sender reply pre"); From 374217cefffefcedae3c68a2a52d9a4eeed47b86 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 19:20:53 +0800 Subject: [PATCH 07/31] fixup: adjust a logging msg in syncNodeStartSnapshot --- source/libs/sync/src/syncSnapshot.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 7e431111f2..c2ba30a6bb 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -383,8 +383,6 @@ int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) { return 0; } - sSInfo(pSender, "snapshot sender start"); - int32_t code = snapshotSenderStart(pSender); if (code != 0) { sSError(pSender, "snapshot sender start error since %s", terrstr()); @@ -803,8 +801,6 @@ static int32_t syncNodeOnSnapshotBegin(SSyncNode *pSyncNode, SyncSnapshotSend *p goto _SEND_REPLY; } - sRInfo(pReceiver, "snapshot begin"); - code = 0; _SEND_REPLY: if (code != 0 && terrno != 0) { From 4e3a8b893df4c6ee66eb887758f0da0126e617bc Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Tue, 24 Oct 2023 20:03:48 +0800 Subject: [PATCH 08/31] enh: restore wal log from snapshot after finish --- source/libs/sync/src/syncSnapshot.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index c2ba30a6bb..8a83891735 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -539,8 +539,6 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { } syncSnapBufferReset(pReceiver->pRcvBuf); - - sRInfo(pReceiver, "snapshot receiver stop, from dnode:%d.", DID(&pReceiver->fromId)); } // when recv last snapshot block, apply data into snapshot @@ -548,7 +546,7 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap int32_t code = 0; if (pReceiver->pWriter != NULL) { // write data - sRInfo(pReceiver, "snapshot receiver write finish, blockLen:%d seq:%d", pMsg->dataLen, pMsg->seq); + sRInfo(pReceiver, "snapshot receiver write about to finish, blockLen:%d seq:%d", pMsg->dataLen, pMsg->seq); if (pMsg->dataLen > 0) { code = pReceiver->pSyncNode->pFsm->FpSnapshotDoWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, pMsg->data, pMsg->dataLen); @@ -558,15 +556,6 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap } } - // reset wal - sRInfo(pReceiver, "snapshot receiver log restore"); - code = - pReceiver->pSyncNode->pLogStore->syncLogRestoreFromSnapshot(pReceiver->pSyncNode->pLogStore, pMsg->lastIndex); - if (code != 0) { - sRError(pReceiver, "failed to snapshot receiver log restore since %s", terrstr()); - return -1; - } - // update commit index if (pReceiver->snapshot.lastApplyIndex > pReceiver->pSyncNode->commitIndex) { pReceiver->pSyncNode->commitIndex = pReceiver->snapshot.lastApplyIndex; @@ -578,7 +567,6 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap } // stop writer, apply data - sRInfo(pReceiver, "snapshot receiver apply write"); code = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, true, &pReceiver->snapshot); if (code != 0) { @@ -586,10 +574,21 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap return -1; } pReceiver->pWriter = NULL; + sRInfo(pReceiver, "snapshot receiver write stopped"); // update progress pReceiver->ack = SYNC_SNAPSHOT_SEQ_END; + // reset wal + code = + pReceiver->pSyncNode->pLogStore->syncLogRestoreFromSnapshot(pReceiver->pSyncNode->pLogStore, pMsg->lastIndex); + if (code != 0) { + sRError(pReceiver, "failed to snapshot receiver log restore since %s", terrstr()); + return -1; + } + sRInfo(pReceiver, "wal log restored from snapshot"); + + // get fsmState SSnapshot snapshot = {0}; pReceiver->pSyncNode->pFsm->FpGetSnapshotInfo(pReceiver->pSyncNode->pFsm, &snapshot); pReceiver->pSyncNode->fsmState = snapshot.state; @@ -599,8 +598,6 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap return -1; } - // event log - sRInfo(pReceiver, "snapshot receiver got last data and apply snapshot finished"); return 0; } From 8b2d57504c50de10130808abc3ada23153790c84 Mon Sep 17 00:00:00 2001 From: shenglian zhou Date: Wed, 25 Oct 2023 13:54:45 +0800 Subject: [PATCH 09/31] enhance: bi mode default tag scan --- source/libs/parser/src/parAstCreater.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/libs/parser/src/parAstCreater.c b/source/libs/parser/src/parAstCreater.c index 06c098af92..7ad61f4324 100644 --- a/source/libs/parser/src/parAstCreater.c +++ b/source/libs/parser/src/parAstCreater.c @@ -978,6 +978,9 @@ SNode* createSelectStmt(SAstCreateContext* pCxt, bool isDistinct, SNodeList* pPr SNodeList* pHint) { CHECK_PARSER_STATUS(pCxt); SNode* select = createSelectStmtImpl(isDistinct, pProjectionList, pTable, pHint); + if (pCxt->pQueryCxt->biMode) { + setSelectStmtTagMode(pCxt, select, true); + } CHECK_OUT_OF_MEM(select); return select; } From 7d9d1c6877a8fee601822d4199c210eac297cb22 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 25 Oct 2023 14:50:53 +0800 Subject: [PATCH 10/31] enh: terminate snap replication on timeout --- include/libs/sync/sync.h | 1 + source/libs/sync/src/syncTimeout.c | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index ad525a2aa7..e3b5f3611c 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -46,6 +46,7 @@ extern "C" { #define SYNC_HEARTBEAT_SLOW_MS 1500 #define SYNC_HEARTBEAT_REPLY_SLOW_MS 1500 #define SYNC_SNAP_RESEND_MS 1000 * 60 +#define SYNC_SNAP_TIMEOUT_MS 1000 * 180 #define SYNC_VND_COMMIT_MIN_MS 3000 diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 37166805ce..91c7494fa4 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -78,11 +78,9 @@ static int32_t syncNodeTimerRoutine(SSyncNode* ths) { SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(ths->peersId[i])); if (pSender != NULL) { if (ths->isStart && ths->state == TAOS_SYNC_STATE_LEADER && pSender->start && - timeNow - pSender->lastSendTime > SYNC_SNAP_RESEND_MS) { - snapshotReSend(pSender); - } else { - sTrace("vgId:%d, do not resend: nstart%d, now:%" PRId64 ", lstsend:%" PRId64 ", diff:%" PRId64, ths->vgId, - ths->isStart, timeNow, pSender->lastSendTime, timeNow - pSender->lastSendTime); + timeNow - pSender->lastSendTime > SYNC_SNAP_TIMEOUT_MS) { + sSError(pSender, "snapshot timeout, terminate. lastSendTime:%d", pSender->lastSendTime); + snapshotSenderStop(pSender, false); } } } From ec5b1f2ec175b33c5aa8f6b40ce98e006787d2fa Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 25 Oct 2023 15:04:14 +0800 Subject: [PATCH 11/31] enh: logging buf info for snapshot reader and sender --- source/libs/sync/inc/syncSnapshot.h | 2 +- source/libs/sync/src/syncSnapshot.c | 16 ++-------- source/libs/sync/src/syncUtil.c | 49 ++++++++++++++++------------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index 0332204769..93c62544f2 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -56,7 +56,7 @@ typedef struct SSyncSnapshotSender { int64_t lastSendTime; bool finish; - // buffer + // ring buffer for ack SSyncSnapBuffer *pSndBuf; // init when create diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 8a83891735..751f777b18 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -345,20 +345,6 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { return 0; } -static int32_t snapshotSenderUpdateProgress(SSyncSnapshotSender *pSender, SyncSnapshotRsp *pMsg) { - if (pMsg->ack != pSender->seq) { - sSError(pSender, "snapshot sender update seq failed, ack:%d seq:%d", pMsg->ack, pSender->seq); - terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; - return -1; - } - - pSender->ack = pMsg->ack; - pSender->seq++; - - sSDebug(pSender, "snapshot sender update seq:%d", pSender->seq); - return 0; -} - // return 0, start ok // return 1, last snapshot finish ok // return -1, error @@ -1182,6 +1168,8 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp pSndBuf->start = ack + 1; } + pSender->ack = pSndBuf->start - 1; + while (pSender->seq != SYNC_SNAPSHOT_SEQ_END && pSender->seq - pSndBuf->start < (pSndBuf->size >> 2)) { if (snapshotSend(pSender) != 0) { code = terrno; diff --git a/source/libs/sync/src/syncUtil.c b/source/libs/sync/src/syncUtil.c index dc16b0e958..9e6ea94e78 100644 --- a/source/libs/sync/src/syncUtil.c +++ b/source/libs/sync/src/syncUtil.c @@ -266,21 +266,24 @@ void syncPrintSnapshotSenderLog(const char* flags, ELogLevel level, int32_t dfla int32_t writeLen = vsnprintf(eventLog, sizeof(eventLog), format, argpointer); va_end(argpointer); - taosPrintLog( - flags, level, dflag, - "vgId:%d, %s, sync:%s, snap-sender:%p signature:(%" PRId64 ", %" PRId64 "), {start:%" PRId64 " end:%" PRId64 - " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 - ", seq:%d ack:%d finish:%d, as:%d, to-dnode:%d}" - ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 ", min-match:%" PRId64 - ", snap:{last-index:%" PRId64 ", term:%" PRIu64 "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 - ", chging:%d, restore:%d, quorum:%d, peer:%s, cfg:%s", - pNode->vgId, eventLog, syncStr(pNode->state), pSender, pSender->term, pSender->startTime, - pSender->snapshotParam.start, pSender->snapshotParam.end, pSender->snapshot.lastApplyIndex, - pSender->snapshot.lastApplyTerm, pSender->snapshot.lastConfigIndex, pSender->seq, pSender->ack, pSender->finish, - pSender->replicaIndex, DID(&pNode->replicasId[pSender->replicaIndex]), raftStoreGetTerm(pNode), - pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, snapshot.lastApplyIndex, - snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, pNode->replicaNum, - pNode->raftCfg.lastConfigIndex, pNode->changing, pNode->restoreFinish, pNode->quorum, peerStr, cfgStr); + taosPrintLog(flags, level, dflag, + "vgId:%d, %s, sync:%s, snap-sender:%p signature:(%" PRId64 ", %" PRId64 "), {start:%" PRId64 + " end:%" PRId64 " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 + ", seq:%d, ack:%d, " + " buf:[%" PRId64 " %" PRId64 ", %" PRId64 + "), finish:%d, as:%d, to-dnode:%d}" + ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 + ", min-match:%" PRId64 ", snap:{last-index:%" PRId64 ", term:%" PRIu64 + "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 + ", chging:%d, restore:%d, quorum:%d, peer:%s, cfg:%s", + pNode->vgId, eventLog, syncStr(pNode->state), pSender, pSender->term, pSender->startTime, + pSender->snapshotParam.start, pSender->snapshotParam.end, pSender->snapshot.lastApplyIndex, + pSender->snapshot.lastApplyTerm, pSender->snapshot.lastConfigIndex, pSender->seq, pSender->ack, + pSender->pSndBuf->start, pSender->pSndBuf->cursor, pSender->pSndBuf->end, pSender->finish, + pSender->replicaIndex, DID(&pNode->replicasId[pSender->replicaIndex]), raftStoreGetTerm(pNode), + pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, snapshot.lastApplyIndex, + snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, pNode->replicaNum, + pNode->raftCfg.lastConfigIndex, pNode->changing, pNode->restoreFinish, pNode->quorum, peerStr, cfgStr); } void syncPrintSnapshotReceiverLog(const char* flags, ELogLevel level, int32_t dflag, SSyncSnapshotReceiver* pReceiver, @@ -315,19 +318,21 @@ void syncPrintSnapshotReceiverLog(const char* flags, ELogLevel level, int32_t df taosPrintLog( flags, level, dflag, "vgId:%d, %s, sync:%s," - " snap-receiver:%p signature:(%" PRId64 ", %" PRId64 "), {start:%d ack:%d term:%" PRIu64 " start-time:%" PRId64 + " snap-receiver:%p signature:(%" PRId64 ", %" PRId64 "), {start:%d ack:%d buf:[%" PRId64 " %" PRId64 ", %" PRId64 + ")" " from-dnode:%d, start:%" PRId64 " end:%" PRId64 " last-index:%" PRId64 " last-term:%" PRIu64 " last-cfg:%" PRId64 "}" ", term:%" PRIu64 ", commit-index:%" PRId64 ", firstver:%" PRId64 ", lastver:%" PRId64 ", min-match:%" PRId64 ", snap:{last-index:%" PRId64 ", last-term:%" PRIu64 "}, standby:%d, batch-sz:%d, replicas:%d, last-cfg:%" PRId64 ", chging:%d, restore:%d, quorum:%d, peer:%s, cfg:%s", pNode->vgId, eventLog, syncStr(pNode->state), pReceiver, pReceiver->term, pReceiver->startTime, pReceiver->start, - pReceiver->ack, pReceiver->term, pReceiver->startTime, DID(&pReceiver->fromId), pReceiver->snapshotParam.start, - pReceiver->snapshotParam.end, pReceiver->snapshot.lastApplyIndex, pReceiver->snapshot.lastApplyTerm, - pReceiver->snapshot.lastConfigIndex, raftStoreGetTerm(pNode), pNode->commitIndex, logBeginIndex, logLastIndex, - pNode->minMatchIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, - pNode->raftCfg.batchSize, pNode->replicaNum, pNode->raftCfg.lastConfigIndex, pNode->changing, - pNode->restoreFinish, pNode->quorum, peerStr, cfgStr); + pReceiver->ack, pReceiver->pRcvBuf->start, pReceiver->pRcvBuf->cursor, pReceiver->pRcvBuf->end, + DID(&pReceiver->fromId), pReceiver->snapshotParam.start, pReceiver->snapshotParam.end, + pReceiver->snapshot.lastApplyIndex, pReceiver->snapshot.lastApplyTerm, pReceiver->snapshot.lastConfigIndex, + raftStoreGetTerm(pNode), pNode->commitIndex, logBeginIndex, logLastIndex, pNode->minMatchIndex, + snapshot.lastApplyIndex, snapshot.lastApplyTerm, pNode->raftCfg.isStandBy, pNode->raftCfg.batchSize, + pNode->replicaNum, pNode->raftCfg.lastConfigIndex, pNode->changing, pNode->restoreFinish, pNode->quorum, peerStr, + cfgStr); } void syncLogRecvTimer(SSyncNode* pSyncNode, const SyncTimeout* pMsg, const char* s) { From 25aade13336149bcbcd3bd5384b554d3b9d368b0 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Wed, 25 Oct 2023 19:44:21 +0800 Subject: [PATCH 12/31] feat: alloc disk acorrding to avail disk space --- source/libs/tfs/src/tfsTier.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/source/libs/tfs/src/tfsTier.c b/source/libs/tfs/src/tfsTier.c index 1c47182e2a..d4f228a537 100644 --- a/source/libs/tfs/src/tfsTier.c +++ b/source/libs/tfs/src/tfsTier.c @@ -110,7 +110,9 @@ int32_t tfsAllocDiskOnTier(STfsTier *pTier) { } int32_t retId = -1; + int64_t avail = 0; for (int32_t id = 0; id < TFS_MAX_DISKS_PER_TIER; ++id) { +#if 0 // round-robin int32_t diskId = (pTier->nextid + id) % pTier->ndisk; STfsDisk *pDisk = pTier->disks[diskId]; @@ -126,6 +128,18 @@ int32_t tfsAllocDiskOnTier(STfsTier *pTier) { terrno = 0; pTier->nextid = (diskId + 1) % pTier->ndisk; break; +#else // select the disk with the most available space + STfsDisk *pDisk = pTier->disks[id]; + if (pDisk == NULL) continue; + + if (pDisk->size.avail < tsMinDiskFreeSize) continue; + + if (pDisk->size.avail > avail) { + avail = pDisk->size.avail; + retId = id; + terrno = 0; + } +#endif } tfsUnLockTier(pTier); From fe553352b85812fc4c24934f50fd440e6007f44e Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Wed, 25 Oct 2023 20:25:57 +0800 Subject: [PATCH 13/31] enh: cache snap blocks sent in snapshotSend for resending --- source/libs/sync/inc/syncSnapshot.h | 13 ++++ source/libs/sync/src/syncSnapshot.c | 108 +++++++++++++++++----------- 2 files changed, 81 insertions(+), 40 deletions(-) diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index 93c62544f2..e68702568a 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -37,8 +37,21 @@ typedef struct SSyncSnapBuffer { int64_t end; int64_t size; TdThreadMutex mutex; + void (*entryDeleteCb)(void *ptr); } SSyncSnapBuffer; +typedef struct SyncSnapBlock { + int32_t seq; + int8_t acked; + int64_t sendTimeMs; + + int16_t blockType; + void *pBlock; + int32_t blockLen; +} SyncSnapBlock; + +void syncSnapBlockDestroy(void *ptr); + typedef struct SSyncSnapshotSender { int8_t start; int32_t seq; diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 751f777b18..eee0ab2cc9 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -26,7 +26,9 @@ static void syncSnapBufferReset(SSyncSnapBuffer *pBuf) { taosThreadMutexLock(&pBuf->mutex); for (int64_t i = pBuf->start; i < pBuf->end; ++i) { - rpcFreeCont(pBuf->entries[i % pBuf->size]); + if (pBuf->entryDeleteCb) { + pBuf->entryDeleteCb(pBuf->entries[i % pBuf->size]); + } pBuf->entries[i % pBuf->size] = NULL; } pBuf->start = 1; @@ -85,17 +87,29 @@ SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaI pSender->pSyncNode->pFsm->FpGetSnapshotInfo(pSender->pSyncNode->pFsm, &pSender->snapshot); pSender->finish = false; - pSender->pSndBuf = syncSnapBufferCreate(); - if (pSender->pSndBuf == NULL) { + SSyncSnapBuffer *pSndBuf = syncSnapBufferCreate(); + if (pSndBuf == NULL) { taosMemoryFree(pSender); pSender = NULL; return NULL; } - syncSnapBufferReset(pSender->pSndBuf); + pSndBuf->entryDeleteCb = syncSnapBlockDestroy; + pSender->pSndBuf = pSndBuf; + syncSnapBufferReset(pSender->pSndBuf); return pSender; } +void syncSnapBlockDestroy(void *ptr) { + SyncSnapBlock *pBlk = ptr; + if (pBlk->pBlock != NULL) { + taosMemoryFree(pBlk->pBlock); + pBlk->pBlock = NULL; + pBlk->blockLen = 0; + } + taosMemoryFree(pBlk); +} + void snapshotSenderDestroy(SSyncSnapshotSender *pSender) { if (pSender == NULL) return; @@ -238,23 +252,26 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { // when sender receive ack, call this function to send msg from seq // seq = ack + 1, already updated static int32_t snapshotSend(SSyncSnapshotSender *pSender) { - // free memory last time (current seq - 1) - if (pSender->pCurrentBlock != NULL) { - taosMemoryFree(pSender->pCurrentBlock); - pSender->pCurrentBlock = NULL; - pSender->blockLen = 0; - } + int32_t code = -1; + SyncSnapBlock *pBlk = NULL; if (pSender->seq != SYNC_SNAPSHOT_SEQ_END) { pSender->seq++; + pBlk = taosMemoryCalloc(1, sizeof(SyncSnapBlock)); + if (pBlk == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _OUT; + } + // read data - int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, - &pSender->pCurrentBlock, &pSender->blockLen); + int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, &pBlk->pBlock, + &pBlk->blockLen); if (ret != 0) { sSError(pSender, "snapshot sender read failed since %s", terrstr()); - return -1; + goto _OUT; } + pBlk->seq = pSender->seq; if (pSender->blockLen > 0) { // has read data @@ -263,7 +280,8 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { // read finish, update seq to end pSender->seq = SYNC_SNAPSHOT_SEQ_END; sSInfo(pSender, "snapshot sender read to the end, blockLen:%d seq:%d", pSender->blockLen, pSender->seq); - return 0; + code = 0; + goto _OUT; } } @@ -271,7 +289,7 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { SRpcMsg rpcMsg = {0}; if (syncBuildSnapshotSend(&rpcMsg, pSender->blockLen, pSender->pSyncNode->vgId) != 0) { sSError(pSender, "vgId:%d, snapshot sender build msg failed since %s", pSender->pSyncNode->vgId, terrstr()); - return -1; + goto _OUT; } SyncSnapshotSend *pMsg = rpcMsg.pCont; @@ -286,25 +304,35 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { pMsg->startTime = pSender->startTime; pMsg->seq = pSender->seq; - if (pSender->pCurrentBlock != NULL) { - memcpy(pMsg->data, pSender->pCurrentBlock, pSender->blockLen); - } - - // event log - if (pSender->seq == SYNC_SNAPSHOT_SEQ_END) { - syncLogSendSyncSnapshotSend(pSender->pSyncNode, pMsg, "snapshot sender finish"); - } else { - syncLogSendSyncSnapshotSend(pSender->pSyncNode, pMsg, "snapshot sender sending"); + if (pBlk != NULL && pBlk->pBlock != NULL) { + memcpy(pMsg->data, pBlk->pBlock, pBlk->blockLen); } // send msg if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { sSError(pSender, "snapshot sender send msg failed since %s", terrstr()); - return -1; + goto _OUT; } - pSender->lastSendTime = taosGetTimestampMs(); - return 0; + // put in buffer + int64_t nowMs = taosGetTimestampMs(); + if (pBlk) { + ASSERT(pBlk->seq != SYNC_SNAPSHOT_SEQ_END); + pBlk->sendTimeMs = nowMs; + pSender->pSndBuf->entries[pSender->seq % pSender->pSndBuf->size] = pBlk; + pBlk = NULL; + pSender->pSndBuf->end = pSender->seq + 1; + } + + pSender->lastSendTime = nowMs; + code = 0; + +_OUT:; + if (pBlk != NULL) { + syncSnapBlockDestroy(pBlk); + pBlk = NULL; + } + return code; } // send snapshot data from cache @@ -319,7 +347,7 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { SyncSnapshotSend *pMsg = rpcMsg.pCont; pMsg->srcId = pSender->pSyncNode->myRaftId; pMsg->destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - pMsg->term = raftStoreGetTerm(pSender->pSyncNode); + pMsg->term = pSender->term; pMsg->beginIndex = pSender->snapshotParam.start; pMsg->lastIndex = pSender->snapshot.lastApplyIndex; pMsg->lastTerm = pSender->snapshot.lastApplyTerm; @@ -332,9 +360,6 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { memcpy(pMsg->data, pSender->pCurrentBlock, pSender->blockLen); } - // event log - syncLogSendSyncSnapshotSend(pSender->pSyncNode, pMsg, "snapshot sender resend"); - // send msg if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { sSError(pSender, "snapshot sender resend msg failed since %s", terrstr()); @@ -401,12 +426,14 @@ SSyncSnapshotReceiver *snapshotReceiverCreate(SSyncNode *pSyncNode, SRaftId from pReceiver->snapshot.lastApplyTerm = 0; pReceiver->snapshot.lastConfigIndex = SYNC_INDEX_INVALID; - pReceiver->pRcvBuf = syncSnapBufferCreate(); - if (pReceiver->pRcvBuf == NULL) { + SSyncSnapBuffer *pRcvBuf = syncSnapBufferCreate(); + if (pRcvBuf == NULL) { taosMemoryFree(pReceiver); pReceiver = NULL; return NULL; } + pRcvBuf->entryDeleteCb = rpcFreeCont; + pReceiver->pRcvBuf = pRcvBuf; syncSnapBufferReset(pReceiver->pRcvBuf); return pReceiver; @@ -884,7 +911,7 @@ static int32_t syncSnapBufferRecv(SSyncSnapshotReceiver *pReceiver, SyncSnapshot } pRcvBuf->start = seq + 1; syncSnapSendRsp(pReceiver, pRcvBuf->entries[seq % pRcvBuf->size], code); - rpcFreeCont(pRcvBuf->entries[seq % pRcvBuf->size]); + pRcvBuf->entryDeleteCb(pRcvBuf->entries[seq % pRcvBuf->size]); pRcvBuf->entries[seq % pRcvBuf->size] = NULL; if (code) goto _out; } @@ -1148,14 +1175,15 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp ASSERT(pSndBuf->start <= pSndBuf->cursor + 1 && pSndBuf->cursor < pSndBuf->end); - if (pMsg->ack > pSndBuf->cursor && pSndBuf->entries[pMsg->ack % pSndBuf->size] == NULL) { - pSndBuf->entries[pMsg->ack % pSndBuf->size] = pMsg; - ppMsg[0] = NULL; - pSndBuf->end = TMAX(pMsg->ack + 1, pSndBuf->end); + if (pMsg->ack > pSndBuf->cursor && pMsg->ack < pSndBuf->end) { + SyncSnapBlock *pBlk = pSndBuf->entries[pMsg->ack % pSndBuf->size]; + ASSERT(pBlk); + pBlk->acked = 1; } for (int64_t ack = pSndBuf->cursor + 1; ack < pSndBuf->end; ++ack) { - if (pSndBuf->entries[ack % pSndBuf->size]) { + SyncSnapBlock *pBlk = pSndBuf->entries[ack % pSndBuf->size]; + if (pBlk->acked) { pSndBuf->cursor = ack; } else { break; @@ -1163,7 +1191,7 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp } for (int64_t ack = pSndBuf->start; ack <= pSndBuf->cursor; ++ack) { - rpcFreeCont(pSndBuf->entries[ack % pSndBuf->size]); + pSndBuf->entryDeleteCb(pSndBuf->entries[ack % pSndBuf->size]); pSndBuf->entries[ack % pSndBuf->size] = NULL; pSndBuf->start = ack + 1; } From dc03d8cd1a7638b2b5d9b2e99bb12979f98767c6 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Thu, 26 Oct 2023 20:07:38 +0800 Subject: [PATCH 14/31] disable tfs unit test --- source/libs/tfs/test/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/libs/tfs/test/CMakeLists.txt b/source/libs/tfs/test/CMakeLists.txt index ee28dcf723..2fd0836a1d 100644 --- a/source/libs/tfs/test/CMakeLists.txt +++ b/source/libs/tfs/test/CMakeLists.txt @@ -8,7 +8,7 @@ target_link_libraries( PUBLIC gtest_main ) -add_test( - NAME tfs_test - COMMAND tfs_test -) +# add_test( +# NAME tfs_test +# COMMAND tfs_test +# ) From 418602808312b7ae6d71563c0e715ff2b876decb Mon Sep 17 00:00:00 2001 From: xsren <285808407@qq.com> Date: Fri, 27 Oct 2023 11:09:18 +0800 Subject: [PATCH 15/31] fix: interval more than 1000 years --- include/util/taoserror.h | 1 + source/libs/parser/src/parTranslater.c | 3 ++ source/libs/parser/src/parUtil.c | 2 + source/os/src/osTime.c | 69 +++++++++++--------------- source/util/src/terror.c | 1 + 5 files changed, 37 insertions(+), 39 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 6fbe4422ac..671d390638 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -657,6 +657,7 @@ int32_t* taosGetErrno(); #define TSDB_CODE_PAR_CORRESPONDING_STABLE_ERR TAOS_DEF_ERROR_CODE(0, 0x2618) #define TSDB_CODE_PAR_INVALID_DB_OPTION TAOS_DEF_ERROR_CODE(0, 0x2619) #define TSDB_CODE_PAR_INVALID_TABLE_OPTION TAOS_DEF_ERROR_CODE(0, 0x261A) +#define TSDB_CODE_PAR_INTER_VALUE_TOO_BIG TAOS_DEF_ERROR_CODE(0, 0x261B) #define TSDB_CODE_PAR_GROUPBY_WINDOW_COEXIST TAOS_DEF_ERROR_CODE(0, 0x2624) #define TSDB_CODE_PAR_AGG_FUNC_NESTING TAOS_DEF_ERROR_CODE(0, 0x2627) #define TSDB_CODE_PAR_INVALID_STATE_WIN_TYPE TAOS_DEF_ERROR_CODE(0, 0x2628) diff --git a/source/libs/parser/src/parTranslater.c b/source/libs/parser/src/parTranslater.c index 72293e2f8c..bff0b46b61 100644 --- a/source/libs/parser/src/parTranslater.c +++ b/source/libs/parser/src/parTranslater.c @@ -3512,6 +3512,7 @@ static void convertVarDuration(SValueNode* pOffset, uint8_t precision) { pOffset->unit = units[precision]; } +static const int64_t maxKeepMS = (int64_t)3600 * 1000 * 24 * (365 * 1000 + 250); static int32_t checkIntervalWindow(STranslateContext* pCxt, SIntervalWindowNode* pInterval) { uint8_t precision = ((SColumnNode*)pInterval->pCol)->node.resType.precision; @@ -3520,6 +3521,8 @@ static int32_t checkIntervalWindow(STranslateContext* pCxt, SIntervalWindowNode* if (pInter->datum.i <= 0 || (!valInter && pInter->datum.i < tsMinIntervalTime)) { return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_SMALL, tsMinIntervalTime, getPrecisionStr(precision)); + } else if (pInter->datum.i > maxKeepMS) { + return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_BIG, 1000, "years"); } if (NULL != pInterval->pOffset) { diff --git a/source/libs/parser/src/parUtil.c b/source/libs/parser/src/parUtil.c index 0ab8927bd0..825e02f9aa 100644 --- a/source/libs/parser/src/parUtil.c +++ b/source/libs/parser/src/parUtil.c @@ -65,6 +65,8 @@ static char* getSyntaxErrFormat(int32_t errCode) { return "This statement is no longer supported"; case TSDB_CODE_PAR_INTER_VALUE_TOO_SMALL: return "Interval cannot be less than %d %s"; + case TSDB_CODE_PAR_INTER_VALUE_TOO_BIG: + return "Interval cannot be more than %d %s"; case TSDB_CODE_PAR_DB_NOT_SPECIFIED: return "Database not specified"; case TSDB_CODE_PAR_INVALID_IDENTIFIER_NAME: diff --git a/source/os/src/osTime.c b/source/os/src/osTime.c index 05233065fa..e506ee603b 100644 --- a/source/os/src/osTime.c +++ b/source/os/src/osTime.c @@ -477,47 +477,38 @@ struct tm *taosLocalTime(const time_t *timep, struct tm *result, char *buf) { return res; } #ifdef WINDOWS - if (*timep < 0) { - if (*timep < -2208988800LL) { - if (buf != NULL) { - sprintf(buf, "NaN"); - } - return NULL; - } - - SYSTEMTIME s; - FILETIME f; - LARGE_INTEGER offset; - struct tm tm1; - time_t tt = 0; - if (localtime_s(&tm1, &tt) != 0 ) { - if (buf != NULL) { - sprintf(buf, "NaN"); - } - return NULL; - } - offset.QuadPart = TIMEEPOCH1900; - offset.QuadPart += *timep * 10000000; - f.dwLowDateTime = offset.QuadPart & 0xffffffff; - f.dwHighDateTime = (offset.QuadPart >> 32) & 0xffffffff; - FileTimeToSystemTime(&f, &s); - result->tm_sec = s.wSecond; - result->tm_min = s.wMinute; - result->tm_hour = s.wHour; - result->tm_mday = s.wDay; - result->tm_mon = s.wMonth - 1; - result->tm_year = s.wYear - 1900; - result->tm_wday = s.wDayOfWeek; - result->tm_yday = 0; - result->tm_isdst = 0; - } else { - if (localtime_s(result, timep) != 0) { - if (buf != NULL) { - sprintf(buf, "NaN"); - } - return NULL; + if (*timep < -2208988800LL) { + if (buf != NULL) { + sprintf(buf, "NaN"); } + return NULL; } + + SYSTEMTIME s; + FILETIME f; + LARGE_INTEGER offset; + struct tm tm1; + time_t tt = 0; + if (localtime_s(&tm1, &tt) != 0) { + if (buf != NULL) { + sprintf(buf, "NaN"); + } + return NULL; + } + offset.QuadPart = TIMEEPOCH1900; + offset.QuadPart += *timep * 10000000; + f.dwLowDateTime = offset.QuadPart & 0xffffffff; + f.dwHighDateTime = (offset.QuadPart >> 32) & 0xffffffff; + FileTimeToSystemTime(&f, &s); + result->tm_sec = s.wSecond; + result->tm_min = s.wMinute; + result->tm_hour = s.wHour; + result->tm_mday = s.wDay; + result->tm_mon = s.wMonth - 1; + result->tm_year = s.wYear - 1900; + result->tm_wday = s.wDayOfWeek; + result->tm_yday = 0; + result->tm_isdst = 0; #else res = localtime_r(timep, result); if (res == NULL && buf != NULL) { diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 4cc86d51b7..83b0ec7a82 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -518,6 +518,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_PORT, "Port should be an in TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_ENDPOINT, "Endpoint should be in the format of 'fqdn:port'") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_EXPRIE_STATEMENT, "This statement is no longer supported") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INTER_VALUE_TOO_SMALL, "Interval too small") +TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INTER_VALUE_TOO_BIG, "Interval too big") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_DB_NOT_SPECIFIED, "Database not specified") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_INVALID_IDENTIFIER_NAME, "Invalid identifier name") TAOS_DEFINE_ERROR(TSDB_CODE_PAR_CORRESPONDING_STABLE_ERR, "Corresponding super table not in this db") From b422c29d4e94dfe2a7c6550ea305b596160d83df Mon Sep 17 00:00:00 2001 From: xsren <285808407@qq.com> Date: Fri, 27 Oct 2023 15:16:03 +0800 Subject: [PATCH 16/31] fix interval limit on us/ns db --- source/libs/parser/src/parTranslater.c | 16 +++++++++++++++- source/os/test/osTimeTests.cpp | 4 ---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/source/libs/parser/src/parTranslater.c b/source/libs/parser/src/parTranslater.c index bff0b46b61..d65f97dce7 100644 --- a/source/libs/parser/src/parTranslater.c +++ b/source/libs/parser/src/parTranslater.c @@ -3499,6 +3499,20 @@ static const char* getPrecisionStr(uint8_t precision) { return "unknown"; } +static int64_t getPrecisionMultiple(uint8_t precision) { + switch (precision) { + case TSDB_TIME_PRECISION_MILLI: + return 1; + case TSDB_TIME_PRECISION_MICRO: + return 1000; + case TSDB_TIME_PRECISION_NANO: + return 1000000; + default: + break; + } + return 1; +} + static void convertVarDuration(SValueNode* pOffset, uint8_t precision) { const int64_t factors[3] = {NANOSECOND_PER_MSEC, NANOSECOND_PER_USEC, 1}; const int8_t units[3] = {TIME_UNIT_MILLISECOND, TIME_UNIT_MICROSECOND, TIME_UNIT_NANOSECOND}; @@ -3521,7 +3535,7 @@ static int32_t checkIntervalWindow(STranslateContext* pCxt, SIntervalWindowNode* if (pInter->datum.i <= 0 || (!valInter && pInter->datum.i < tsMinIntervalTime)) { return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_SMALL, tsMinIntervalTime, getPrecisionStr(precision)); - } else if (pInter->datum.i > maxKeepMS) { + } else if (pInter->datum.i / getPrecisionMultiple(precision) > maxKeepMS) { return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_BIG, 1000, "years"); } diff --git a/source/os/test/osTimeTests.cpp b/source/os/test/osTimeTests.cpp index 5dd15db4ab..90c089e310 100644 --- a/source/os/test/osTimeTests.cpp +++ b/source/os/test/osTimeTests.cpp @@ -114,10 +114,6 @@ TEST(osTimeTests, taosLocalTime) { ASSERT_EQ(local_time->tm_min, 0); ASSERT_EQ(local_time->tm_sec, 0); - time_t over_timep = 6406301441633558; - local_time = taosLocalTime(&over_timep, &result, NULL); - ASSERT_EQ(local_time, nullptr); - time_t neg_timep3 = -78115158887; local_time = taosLocalTime(&neg_timep3, &result, NULL); ASSERT_EQ(local_time, nullptr); From 43e6dec6c218fcc6b3993c247d18ed90d021bf79 Mon Sep 17 00:00:00 2001 From: dmchen Date: Fri, 27 Oct 2023 09:30:30 +0000 Subject: [PATCH 17/31] TD-26971 --- source/dnode/vnode/src/vnd/vnodeSvr.c | 72 +++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index c46ea15111..040ae96972 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -20,6 +20,7 @@ #include "vnode.h" #include "vnodeInt.h" #include "audit.h" +#include "tstrbuild.h" static int32_t vnodeProcessCreateStbReq(SVnode *pVnode, int64_t ver, void *pReq, int32_t len, SRpcMsg *pRsp); static int32_t vnodeProcessAlterStbReq(SVnode *pVnode, int64_t ver, void *pReq, int32_t len, SRpcMsg *pRsp); @@ -886,6 +887,7 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, char tbName[TSDB_TABLE_FNAME_LEN]; STbUidStore *pStore = NULL; SArray *tbUids = NULL; + SArray *tbNames = NULL; pRsp->msgType = TDMT_VND_CREATE_TABLE_RSP; pRsp->code = TSDB_CODE_SUCCESS; @@ -902,7 +904,8 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, rsp.pArray = taosArrayInit(req.nReqs, sizeof(cRsp)); tbUids = taosArrayInit(req.nReqs, sizeof(int64_t)); - if (rsp.pArray == NULL || tbUids == NULL) { + tbNames = taosArrayInit(req.nReqs, sizeof(char*)); + if (rsp.pArray == NULL || tbUids == NULL || tbNames == NULL) { rcode = -1; terrno = TSDB_CODE_OUT_OF_MEMORY; goto _exit; @@ -947,14 +950,9 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, taosArrayPush(rsp.pArray, &cRsp); - if(tsEnableAuditCreateTable){ - int64_t clusterId = pVnode->config.syncCfg.nodeInfo[0].clusterId; - - SName name = {0}; - tNameFromString(&name, pVnode->config.dbname, T_NAME_ACCT | T_NAME_DB); - - auditRecord(NULL, clusterId, "createTable", name.dbname, "", pCreateReq->name, strlen(pCreateReq->name)); - } + char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); + strcpy(str, pCreateReq->name); + taosArrayPush(tbNames, str); } vDebug("vgId:%d, add %d new created tables into query table list", TD_VID(pVnode), (int32_t)taosArrayGetSize(tbUids)); @@ -976,6 +974,29 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, tEncoderInit(&encoder, pRsp->pCont, pRsp->contLen); tEncodeSVCreateTbBatchRsp(&encoder, &rsp); + if(tsEnableAuditCreateTable){ + int64_t clusterId = pVnode->config.syncCfg.nodeInfo[0].clusterId; + + SName name = {0}; + tNameFromString(&name, pVnode->config.dbname, T_NAME_ACCT | T_NAME_DB); + + SStringBuilder sb = {0}; + for(int32_t iReq = 0; iReq < req.nReqs; iReq++){ + char* key = taosArrayGet(tbNames, iReq); + taosStringBuilderAppendStringLen(&sb, key, strlen(key)); + if(iReq < req.nReqs - 1){ + taosStringBuilderAppendChar(&sb, ','); + } + } + + size_t len = 0; + char* keyJoined = taosStringBuilderGetResult(&sb, &len); + + auditRecord(NULL, clusterId, "createTable", name.dbname, "", keyJoined, len); + + taosStringBuilderDestroy(&sb); + } + _exit: for (int32_t iReq = 0; iReq < req.nReqs; iReq++) { pCreateReq = req.pReqs + iReq; @@ -987,6 +1008,7 @@ _exit: taosArrayDestroy(tbUids); tDecoderClear(&decoder); tEncoderClear(&encoder); + taosArrayDestroy(tbNames); return rcode; } @@ -1120,6 +1142,7 @@ static int32_t vnodeProcessDropTbReq(SVnode *pVnode, int64_t ver, void *pReq, in int32_t ret; SArray *tbUids = NULL; STbUidStore *pStore = NULL; + SArray *tbNames = NULL; pRsp->msgType = TDMT_VND_DROP_TABLE_RSP; pRsp->pCont = NULL; @@ -1138,7 +1161,8 @@ static int32_t vnodeProcessDropTbReq(SVnode *pVnode, int64_t ver, void *pReq, in // process req tbUids = taosArrayInit(req.nReqs, sizeof(int64_t)); rsp.pArray = taosArrayInit(req.nReqs, sizeof(SVDropTbRsp)); - if (tbUids == NULL || rsp.pArray == NULL) goto _exit; + tbNames = taosArrayInit(req.nReqs, sizeof(char*)); + if (tbUids == NULL || rsp.pArray == NULL || tbNames == NULL) goto _exit; for (int32_t iReq = 0; iReq < req.nReqs; iReq++) { SVDropTbReq *pDropTbReq = req.pReqs + iReq; @@ -1159,11 +1183,38 @@ static int32_t vnodeProcessDropTbReq(SVnode *pVnode, int64_t ver, void *pReq, in } taosArrayPush(rsp.pArray, &dropTbRsp); + + char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); + strcpy(str, pDropTbReq->name); + taosArrayPush(tbNames, str); } tqUpdateTbUidList(pVnode->pTq, tbUids, false); tdUpdateTbUidList(pVnode->pSma, pStore, false); + if(tsEnableAuditCreateTable){ + int64_t clusterId = pVnode->config.syncCfg.nodeInfo[0].clusterId; + + SName name = {0}; + tNameFromString(&name, pVnode->config.dbname, T_NAME_ACCT | T_NAME_DB); + + SStringBuilder sb = {0}; + for(int32_t iReq = 0; iReq < req.nReqs; iReq++){ + char* key = taosArrayGet(tbNames, iReq); + taosStringBuilderAppendStringLen(&sb, key, strlen(key)); + if(iReq < req.nReqs - 1){ + taosStringBuilderAppendChar(&sb, ','); + } + } + + size_t len = 0; + char* keyJoined = taosStringBuilderGetResult(&sb, &len); + + auditRecord(NULL, clusterId, "createTable", name.dbname, "", keyJoined, len); + + taosStringBuilderDestroy(&sb); + } + _exit: taosArrayDestroy(tbUids); tdUidStoreFree(pStore); @@ -1174,6 +1225,7 @@ _exit: tEncodeSVDropTbBatchRsp(&encoder, &rsp); tEncoderClear(&encoder); taosArrayDestroy(rsp.pArray); + taosArrayDestroy(tbNames); return 0; } From cdb1d8296ddd322d8f0de91656ed9a153017d71f Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 27 Oct 2023 17:48:19 +0800 Subject: [PATCH 18/31] feat: concurrency on fileset --- source/dnode/vnode/src/inc/tsdb.h | 18 +- source/dnode/vnode/src/sma/smaRollup.c | 2 - source/dnode/vnode/src/tsdb/tsdbCommit.c | 20 +- source/dnode/vnode/src/tsdb/tsdbCommit2.c | 25 +- source/dnode/vnode/src/tsdb/tsdbFS2.c | 319 ++++++++++---------- source/dnode/vnode/src/tsdb/tsdbFS2.h | 50 +-- source/dnode/vnode/src/tsdb/tsdbFSet2.c | 33 +- source/dnode/vnode/src/tsdb/tsdbFSet2.h | 44 ++- source/dnode/vnode/src/tsdb/tsdbMerge.c | 117 ++++--- source/dnode/vnode/src/tsdb/tsdbOpen.c | 9 +- source/dnode/vnode/src/tsdb/tsdbRead2.c | 66 ++-- source/dnode/vnode/src/tsdb/tsdbRetention.c | 264 +++++++++------- source/dnode/vnode/src/tsdb/tsdbSnapshot.c | 18 +- 13 files changed, 542 insertions(+), 443 deletions(-) diff --git a/source/dnode/vnode/src/inc/tsdb.h b/source/dnode/vnode/src/inc/tsdb.h index 79112babc3..1efcd9417d 100644 --- a/source/dnode/vnode/src/inc/tsdb.h +++ b/source/dnode/vnode/src/inc/tsdb.h @@ -309,7 +309,7 @@ int32_t tsdbTakeReadSnap2(STsdbReader *pReader, _query_reseek_func_t reseek, STs void tsdbUntakeReadSnap2(STsdbReader *pReader, STsdbReadSnap *pSnap, bool proactive); // tsdbMerge.c ============================================================================================== -int32_t tsdbMerge(void *arg); +int32_t tsdbSchedMerge(STsdb *tsdb, int32_t fid); // tsdbDiskData ============================================================================================== int32_t tDiskDataBuilderCreate(SDiskDataBuilder **ppBuilder); @@ -371,7 +371,7 @@ struct STsdb { char *path; SVnode *pVnode; STsdbKeepCfg keepCfg; - TdThreadRwlock rwLock; + TdThreadMutex mutex; SMemTable *mem; SMemTable *imem; STsdbFS fs; // old @@ -668,8 +668,8 @@ struct SDelFWriter { }; #include "tarray2.h" -//#include "tsdbFS2.h" -// struct STFileSet; +// #include "tsdbFS2.h" +// struct STFileSet; typedef struct STFileSet STFileSet; typedef TARRAY2(STFileSet *) TFileSetArray; @@ -677,9 +677,9 @@ typedef struct STSnapRange STSnapRange; typedef TARRAY2(STSnapRange *) TSnapRangeArray; // disjoint snap ranges // util -int32_t tSerializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); -int32_t tDeserializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); -void tsdbSnapRangeArrayDestroy(TSnapRangeArray **ppSnap); +int32_t tSerializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); +int32_t tDeserializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); +void tsdbSnapRangeArrayDestroy(TSnapRangeArray **ppSnap); SHashObj *tsdbGetSnapRangeHash(TSnapRangeArray *pRanges); // snap partition list @@ -873,8 +873,8 @@ int32_t tMergeTreeOpen2(SMergeTree *pMTree, SMergeTreeConf *pConf); void tMergeTreeAddIter(SMergeTree *pMTree, SLDataIter *pIter); bool tMergeTreeNext(SMergeTree *pMTree); -void tMergeTreePinSttBlock(SMergeTree* pMTree); -void tMergeTreeUnpinSttBlock(SMergeTree* pMTree); +void tMergeTreePinSttBlock(SMergeTree *pMTree); +void tMergeTreeUnpinSttBlock(SMergeTree *pMTree); bool tMergeTreeIgnoreEarlierTs(SMergeTree *pMTree); void tMergeTreeClose(SMergeTree *pMTree); diff --git a/source/dnode/vnode/src/sma/smaRollup.c b/source/dnode/vnode/src/sma/smaRollup.c index 8da2fff5a6..14c5baa402 100644 --- a/source/dnode/vnode/src/sma/smaRollup.c +++ b/source/dnode/vnode/src/sma/smaRollup.c @@ -31,8 +31,6 @@ SSmaMgmt smaMgmt = { typedef struct SRSmaQTaskInfoItem SRSmaQTaskInfoItem; -extern int32_t tsdbDoRetention(STsdb *pTsdb, int64_t now); - static int32_t tdUidStorePut(STbUidStore *pStore, tb_uid_t suid, tb_uid_t *uid); static void tdUidStoreDestory(STbUidStore *pStore); static int32_t tdUpdateTbUidListImpl(SSma *pSma, tb_uid_t *suid, SArray *tbUids, bool isAdd); diff --git a/source/dnode/vnode/src/tsdb/tsdbCommit.c b/source/dnode/vnode/src/tsdb/tsdbCommit.c index b4c2c0a979..55ae25aee4 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCommit.c +++ b/source/dnode/vnode/src/tsdb/tsdbCommit.c @@ -131,7 +131,7 @@ int32_t tsdbBegin(STsdb *pTsdb) { TSDB_CHECK_CODE(code, lino, _exit); // lock - if ((code = taosThreadRwlockWrlock(&pTsdb->rwLock))) { + if ((code = taosThreadMutexLock(&pTsdb->mutex))) { code = TAOS_SYSTEM_ERROR(code); TSDB_CHECK_CODE(code, lino, _exit); } @@ -139,7 +139,7 @@ int32_t tsdbBegin(STsdb *pTsdb) { pTsdb->mem = pMemTable; // unlock - if ((code = taosThreadRwlockUnlock(&pTsdb->rwLock))) { + if ((code = taosThreadMutexUnlock(&pTsdb->mutex))) { code = TAOS_SYSTEM_ERROR(code); TSDB_CHECK_CODE(code, lino, _exit); } @@ -152,11 +152,11 @@ _exit: } int32_t tsdbPrepareCommit(STsdb *pTsdb) { - taosThreadRwlockWrlock(&pTsdb->rwLock); + taosThreadMutexLock(&pTsdb->mutex); ASSERT(pTsdb->imem == NULL); pTsdb->imem = pTsdb->mem; pTsdb->mem = NULL; - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); return 0; } @@ -171,9 +171,9 @@ int32_t tsdbCommit(STsdb *pTsdb, SCommitInfo *pInfo) { // check if (pMemTable->nRow == 0 && pMemTable->nDel == 0) { - taosThreadRwlockWrlock(&pTsdb->rwLock); + taosThreadMutexLock(&pTsdb->mutex); pTsdb->imem = NULL; - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); tsdbUnrefMemTable(pMemTable, NULL, true); goto _exit; @@ -501,6 +501,7 @@ static int32_t tsdbCommitFileDataStart(SCommitter *pCommitter) { int32_t lino = 0; STsdb *pTsdb = pCommitter->pTsdb; SDFileSet *pRSet = NULL; + // memory pCommitter->commitFid = tsdbKeyFid(pCommitter->nextKey, pCommitter->minutes, pCommitter->precision); pCommitter->expLevel = tsdbFidLevel(pCommitter->commitFid, &pCommitter->pTsdb->keepCfg, taosGetTimestampSec()); @@ -798,6 +799,7 @@ static int32_t tsdbCommitFileData(SCommitter *pCommitter) { int32_t lino = 0; STsdb *pTsdb = pCommitter->pTsdb; SMemTable *pMemTable = pTsdb->imem; + // commit file data start code = tsdbCommitFileDataStart(pCommitter); TSDB_CHECK_CODE(code, lino, _exit); @@ -1650,18 +1652,18 @@ int32_t tsdbFinishCommit(STsdb *pTsdb) { SMemTable *pMemTable = pTsdb->imem; // lock - taosThreadRwlockWrlock(&pTsdb->rwLock); + taosThreadMutexLock(&pTsdb->mutex); code = tsdbFSCommit(pTsdb); if (code) { - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); TSDB_CHECK_CODE(code, lino, _exit); } pTsdb->imem = NULL; // unlock - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); if (pMemTable) { tsdbUnrefMemTable(pMemTable, NULL, true); } diff --git a/source/dnode/vnode/src/tsdb/tsdbCommit2.c b/source/dnode/vnode/src/tsdb/tsdbCommit2.c index 6dc492f420..79ecdab15c 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCommit2.c +++ b/source/dnode/vnode/src/tsdb/tsdbCommit2.c @@ -367,7 +367,12 @@ static int32_t tsdbCommitFileSetBegin(SCommitter2 *committer) { int32_t lino = 0; STsdb *tsdb = committer->tsdb; - committer->ctx->fid = tsdbKeyFid(committer->ctx->nextKey, committer->minutes, committer->precision); + int32_t fid = tsdbKeyFid(committer->ctx->nextKey, committer->minutes, committer->precision); + + // check if can commit + tsdbFSCheckCommit(tsdb, fid); + + committer->ctx->fid = fid; committer->ctx->expLevel = tsdbFidLevel(committer->ctx->fid, &tsdb->keepCfg, committer->ctx->now); tsdbFidKeyRange(committer->ctx->fid, committer->minutes, committer->precision, &committer->ctx->minKey, &committer->ctx->maxKey); @@ -549,11 +554,11 @@ _exit: } int32_t tsdbPreCommit(STsdb *tsdb) { - taosThreadRwlockWrlock(&tsdb->rwLock); + taosThreadMutexLock(&tsdb->mutex); ASSERT(tsdb->imem == NULL); tsdb->imem = tsdb->mem; tsdb->mem = NULL; - taosThreadRwlockUnlock(&tsdb->rwLock); + taosThreadMutexUnlock(&tsdb->mutex); return 0; } @@ -568,15 +573,13 @@ int32_t tsdbCommitBegin(STsdb *tsdb, SCommitInfo *info) { int64_t nDel = imem->nDel; if (nRow == 0 && nDel == 0) { - taosThreadRwlockWrlock(&tsdb->rwLock); + taosThreadMutexLock(&tsdb->mutex); tsdb->imem = NULL; - taosThreadRwlockUnlock(&tsdb->rwLock); + taosThreadMutexUnlock(&tsdb->mutex); tsdbUnrefMemTable(imem, NULL, true); } else { SCommitter2 committer[1]; - tsdbFSCheckCommit(tsdb->pFS); - code = tsdbOpenCommitter(tsdb, info, committer); TSDB_CHECK_CODE(code, lino, _exit); @@ -605,14 +608,14 @@ int32_t tsdbCommitCommit(STsdb *tsdb) { if (tsdb->imem == NULL) goto _exit; SMemTable *pMemTable = tsdb->imem; - taosThreadRwlockWrlock(&tsdb->rwLock); + taosThreadMutexLock(&tsdb->mutex); code = tsdbFSEditCommit(tsdb->pFS); if (code) { - taosThreadRwlockUnlock(&tsdb->rwLock); + taosThreadMutexUnlock(&tsdb->mutex); TSDB_CHECK_CODE(code, lino, _exit); } tsdb->imem = NULL; - taosThreadRwlockUnlock(&tsdb->rwLock); + taosThreadMutexUnlock(&tsdb->mutex); tsdbUnrefMemTable(pMemTable, NULL, true); _exit: @@ -640,4 +643,4 @@ _exit: tsdbInfo("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); } return code; -} +} \ No newline at end of file diff --git a/source/dnode/vnode/src/tsdb/tsdbFS2.c b/source/dnode/vnode/src/tsdb/tsdbFS2.c index 93a16b5502..38d221d978 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFS2.c +++ b/source/dnode/vnode/src/tsdb/tsdbFS2.c @@ -55,25 +55,11 @@ static int32_t create_fs(STsdb *pTsdb, STFileSystem **fs) { TARRAY2_INIT(fs[0]->fSetArr); TARRAY2_INIT(fs[0]->fSetArrTmp); - // background task queue - taosThreadMutexInit(fs[0]->mutex, NULL); - fs[0]->bgTaskQueue->next = fs[0]->bgTaskQueue; - fs[0]->bgTaskQueue->prev = fs[0]->bgTaskQueue; - - taosThreadMutexInit(&fs[0]->commitMutex, NULL); - taosThreadCondInit(&fs[0]->canCommit, NULL); - fs[0]->blockCommit = false; - return 0; } static int32_t destroy_fs(STFileSystem **fs) { if (fs[0] == NULL) return 0; - taosThreadMutexDestroy(&fs[0]->commitMutex); - taosThreadCondDestroy(&fs[0]->canCommit); - taosThreadMutexDestroy(fs[0]->mutex); - - ASSERT(fs[0]->bgTaskNum == 0); TARRAY2_DESTROY(fs[0]->fSetArr, NULL); TARRAY2_DESTROY(fs[0]->fSetArrTmp, NULL); @@ -264,10 +250,11 @@ static int32_t apply_commit(STFileSystem *fs) { if (fset1 && fset2) { if (fset1->fid < fset2->fid) { // delete fset1 - TARRAY2_REMOVE(fsetArray1, i1, tsdbTFileSetRemove); + tsdbTFileSetRemove(fset1); + i1++; } else if (fset1->fid > fset2->fid) { // create new file set with fid of fset2->fid - code = tsdbTFileSetInitDup(fs->tsdb, fset2, &fset1); + code = tsdbTFileSetInitCopy(fs->tsdb, fset2, &fset1); if (code) return code; code = TARRAY2_SORT_INSERT(fsetArray1, fset1, tsdbTFileSetCmprFn); if (code) return code; @@ -282,10 +269,11 @@ static int32_t apply_commit(STFileSystem *fs) { } } else if (fset1) { // delete fset1 - TARRAY2_REMOVE(fsetArray1, i1, tsdbTFileSetRemove); + tsdbTFileSetRemove(fset1); + i1++; } else { // create new file set with fid of fset2->fid - code = tsdbTFileSetInitDup(fs->tsdb, fset2, &fset1); + code = tsdbTFileSetInitCopy(fs->tsdb, fset2, &fset1); if (code) return code; code = TARRAY2_SORT_INSERT(fsetArray1, fset1, tsdbTFileSetCmprFn); if (code) return code; @@ -512,7 +500,8 @@ static int32_t tsdbFSDoSanAndFix(STFileSystem *fs) { TARRAY2_FOREACH(lvl->fobjArr, fobj) { code = tsdbFSDoScanAndFixFile(fs, fobj); if (code) { - fset->maxVerValid = (fobj->f->minVer <= fobj->f->maxVer) ? TMIN(fset->maxVerValid, fobj->f->minVer - 1) : -1; + fset->maxVerValid = + (fobj->f->minVer <= fobj->f->maxVer) ? TMIN(fset->maxVerValid, fobj->f->minVer - 1) : -1; corrupt = true; } } @@ -592,7 +581,7 @@ static int32_t tsdbFSDupState(STFileSystem *fs) { const STFileSet *fset1; TARRAY2_FOREACH(src, fset1) { STFileSet *fset2; - code = tsdbTFileSetInitDup(fs->tsdb, fset1, &fset2); + code = tsdbTFileSetInitCopy(fs->tsdb, fset1, &fset2); if (code) return code; code = TARRAY2_APPEND(dst, fset2); if (code) return code; @@ -665,12 +654,6 @@ static int32_t close_file_system(STFileSystem *fs) { return 0; } -static int32_t apply_edit(STFileSystem *pFS) { - int32_t code = 0; - ASSERTS(0, "TODO: Not implemented yet"); - return code; -} - static int32_t fset_cmpr_fn(const struct STFileSet *pSet1, const struct STFileSet *pSet2) { if (pSet1->fid < pSet2->fid) { return -1; @@ -710,10 +693,23 @@ static int32_t edit_fs(STFileSystem *fs, const TFileOpArray *opArray) { TSDB_CHECK_CODE(code, lino, _exit); } - // remove empty file set + // remove empty empty stt level and empty file set int32_t i = 0; while (i < TARRAY2_SIZE(fsetArray)) { fset = TARRAY2_GET(fsetArray, i); + + SSttLvl *lvl; + int32_t j = 0; + while (j < TARRAY2_SIZE(fset->lvlArr)) { + lvl = TARRAY2_GET(fset->lvlArr, j); + + if (TARRAY2_SIZE(lvl->fobjArr) == 0) { + TARRAY2_REMOVE(fset->lvlArr, j, tsdbSttLvlClear); + } else { + j++; + } + } + if (tsdbTFileSetIsEmpty(fset)) { TARRAY2_REMOVE(fsetArray, i, tsdbTFileSetClear); } else { @@ -753,13 +749,13 @@ _exit: static void tsdbDoWaitBgTask(STFileSystem *fs, STFSBgTask *task) { task->numWait++; - taosThreadCondWait(task->done, fs->mutex); + taosThreadCondWait(task->done, &fs->tsdb->mutex); task->numWait--; if (task->numWait == 0) { taosThreadCondDestroy(task->done); - if (task->free) { - task->free(task->arg); + if (task->destroy) { + task->destroy(task->arg); } taosMemoryFree(task); } @@ -770,8 +766,8 @@ static void tsdbDoDoneBgTask(STFileSystem *fs, STFSBgTask *task) { taosThreadCondBroadcast(task->done); } else { taosThreadCondDestroy(task->done); - if (task->free) { - task->free(task->arg); + if (task->destroy) { + task->destroy(task->arg); } taosMemoryFree(task); } @@ -780,23 +776,16 @@ static void tsdbDoDoneBgTask(STFileSystem *fs, STFSBgTask *task) { int32_t tsdbCloseFS(STFileSystem **fs) { if (fs[0] == NULL) return 0; - taosThreadMutexLock(fs[0]->mutex); - fs[0]->stop = true; - - if (fs[0]->bgTaskRunning) { - tsdbDoWaitBgTask(fs[0], fs[0]->bgTaskRunning); - } - taosThreadMutexUnlock(fs[0]->mutex); - + tsdbFSDisableBgTask(fs[0]); close_file_system(fs[0]); destroy_fs(fs); return 0; } int64_t tsdbFSAllocEid(STFileSystem *fs) { - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); int64_t cid = ++fs->neid; - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); return cid; } @@ -837,27 +826,34 @@ _exit: return code; } -static int32_t tsdbFSSetBlockCommit(STFileSystem *fs, bool block) { - taosThreadMutexLock(&fs->commitMutex); +static int32_t tsdbFSSetBlockCommit(STFileSet *fset, bool block) { if (block) { - fs->blockCommit = true; + fset->blockCommit = true; } else { - fs->blockCommit = false; - taosThreadCondSignal(&fs->canCommit); + fset->blockCommit = false; + if (fset->numWaitCommit > 0) { + taosThreadCondSignal(&fset->canCommit); + } } - taosThreadMutexUnlock(&fs->commitMutex); return 0; } -int32_t tsdbFSCheckCommit(STFileSystem *fs) { - taosThreadMutexLock(&fs->commitMutex); - while (fs->blockCommit) { - taosThreadCondWait(&fs->canCommit, &fs->commitMutex); +int32_t tsdbFSCheckCommit(STsdb *tsdb, int32_t fid) { + taosThreadMutexLock(&tsdb->mutex); + STFileSet *fset; + tsdbFSGetFSet(tsdb->pFS, fid, &fset); + if (fset) { + while (fset->blockCommit) { + fset->numWaitCommit++; + taosThreadCondWait(&fset->canCommit, &tsdb->mutex); + fset->numWaitCommit--; + } } - taosThreadMutexUnlock(&fs->commitMutex); + taosThreadMutexUnlock(&tsdb->mutex); return 0; } +// IMPORTANT: the caller must hold fs->tsdb->mutex int32_t tsdbFSEditCommit(STFileSystem *fs) { int32_t code = 0; int32_t lino = 0; @@ -867,36 +863,57 @@ int32_t tsdbFSEditCommit(STFileSystem *fs) { TSDB_CHECK_CODE(code, lino, _exit); // schedule merge - if (fs->tsdb->pVnode->config.sttTrigger > 1) { + int32_t sttTrigger = fs->tsdb->pVnode->config.sttTrigger; + if (sttTrigger > 1) { STFileSet *fset; - int32_t sttTrigger = fs->tsdb->pVnode->config.sttTrigger; - bool schedMerge = false; - bool blockCommit = false; - TARRAY2_FOREACH_REVERSE(fs->fSetArr, fset) { - if (TARRAY2_SIZE(fset->lvlArr) == 0) continue; + if (TARRAY2_SIZE(fset->lvlArr) == 0) { + tsdbFSSetBlockCommit(fset, false); + continue; + } SSttLvl *lvl = TARRAY2_FIRST(fset->lvlArr); - if (lvl->level != 0) continue; + if (lvl->level != 0) { + tsdbFSSetBlockCommit(fset, false); + continue; + } int32_t numFile = TARRAY2_SIZE(lvl->fobjArr); if (numFile >= sttTrigger) { - schedMerge = true; + // launch merge + code = tsdbSchedMerge(fs->tsdb, fset->fid); + TSDB_CHECK_CODE(code, lino, _exit); } if (numFile >= sttTrigger * BLOCK_COMMIT_FACTOR) { - blockCommit = true; + tsdbFSSetBlockCommit(fset, true); + } else { + tsdbFSSetBlockCommit(fset, false); } + } + } - if (schedMerge && blockCommit) break; + // clear empty level and fset + int32_t i = 0; + while (i < TARRAY2_SIZE(fs->fSetArr)) { + STFileSet *fset = TARRAY2_GET(fs->fSetArr, i); + + int32_t j = 0; + while (j < TARRAY2_SIZE(fset->lvlArr)) { + SSttLvl *lvl = TARRAY2_GET(fset->lvlArr, j); + + if (TARRAY2_SIZE(lvl->fobjArr) == 0) { + TARRAY2_REMOVE(fset->lvlArr, j, tsdbSttLvlClear); + } else { + j++; + } } - if (schedMerge) { - code = tsdbFSScheduleBgTask(fs, TSDB_BG_TASK_MERGER, tsdbMerge, NULL, fs->tsdb, NULL); - TSDB_CHECK_CODE(code, lino, _exit); + if (tsdbTFileSetIsEmpty(fset) && fset->bgTaskRunning == NULL) { + TARRAY2_REMOVE(fs->fSetArr, i, tsdbTFileSetClear); + } else { + i++; } - - tsdbFSSetBlockCommit(fs, blockCommit); } _exit: @@ -933,15 +950,15 @@ int32_t tsdbFSCreateCopySnapshot(STFileSystem *fs, TFileSetArray **fsetArr) { TARRAY2_INIT(fsetArr[0]); - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); TARRAY2_FOREACH(fs->fSetArr, fset) { - code = tsdbTFileSetInitDup(fs->tsdb, fset, &fset1); + code = tsdbTFileSetInitCopy(fs->tsdb, fset, &fset1); if (code) break; code = TARRAY2_APPEND(fsetArr[0], fset1); if (code) break; } - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); if (code) { TARRAY2_DESTROY(fsetArr[0], tsdbTFileSetClear); @@ -961,9 +978,9 @@ int32_t tsdbFSDestroyCopySnapshot(TFileSetArray **fsetArr) { } int32_t tsdbFSCreateRefSnapshot(STFileSystem *fs, TFileSetArray **fsetArr) { - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); int32_t code = tsdbFSCreateRefSnapshotWithoutLock(fs, fsetArr); - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); return code; } @@ -1017,7 +1034,7 @@ int32_t tsdbFSCreateCopyRangedSnapshot(STFileSystem *fs, TSnapRangeArray *pRange } } - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); TARRAY2_FOREACH(fs->fSetArr, fset) { int64_t ever = VERSION_MAX; if (pHash) { @@ -1034,7 +1051,7 @@ int32_t tsdbFSCreateCopyRangedSnapshot(STFileSystem *fs, TSnapRangeArray *pRange code = TARRAY2_APPEND(fsetArr[0], fset1); if (code) break; } - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); _out: if (code) { @@ -1089,7 +1106,7 @@ int32_t tsdbFSCreateRefRangedSnapshot(STFileSystem *fs, int64_t sver, int64_t ev } } - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); TARRAY2_FOREACH(fs->fSetArr, fset) { int64_t sver1 = sver; int64_t ever1 = ever; @@ -1118,7 +1135,7 @@ int32_t tsdbFSCreateRefRangedSnapshot(STFileSystem *fs, int64_t sver, int64_t ev fsr1 = NULL; } - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); if (code) { tsdbTSnapRangeClear(&fsr1); @@ -1137,59 +1154,69 @@ _out: const char *gFSBgTaskName[] = {NULL, "MERGE", "RETENTION", "COMPACT"}; static int32_t tsdbFSRunBgTask(void *arg) { - STFileSystem *fs = (STFileSystem *)arg; + STFSBgTask *task = (STFSBgTask *)arg; + STFileSystem *fs = task->fs; + STFileSet *fset; - ASSERT(fs->bgTaskRunning != NULL); + tsdbFSGetFSet(fs, task->fid, &fset); - fs->bgTaskRunning->launchTime = taosGetTimestampMs(); - fs->bgTaskRunning->run(fs->bgTaskRunning->arg); - fs->bgTaskRunning->finishTime = taosGetTimestampMs(); + ASSERT(fset != NULL && fset->bgTaskRunning == task); + + task->launchTime = taosGetTimestampMs(); + task->run(task->arg); + task->finishTime = taosGetTimestampMs(); tsdbDebug("vgId:%d bg task:%s task id:%" PRId64 " finished, schedule time:%" PRId64 " launch time:%" PRId64 " finish time:%" PRId64, - TD_VID(fs->tsdb->pVnode), gFSBgTaskName[fs->bgTaskRunning->type], fs->bgTaskRunning->taskid, - fs->bgTaskRunning->scheduleTime, fs->bgTaskRunning->launchTime, fs->bgTaskRunning->finishTime); + TD_VID(fs->tsdb->pVnode), gFSBgTaskName[task->type], task->taskid, task->scheduleTime, task->launchTime, + task->finishTime); - taosThreadMutexLock(fs->mutex); + taosThreadMutexLock(&fs->tsdb->mutex); // free last - tsdbDoDoneBgTask(fs, fs->bgTaskRunning); - fs->bgTaskRunning = NULL; + tsdbDoDoneBgTask(fs, task); + fset->bgTaskRunning = NULL; // schedule next - if (fs->bgTaskNum > 0) { + if (fset->bgTaskNum > 0) { if (fs->stop) { - while (fs->bgTaskNum > 0) { - STFSBgTask *task = fs->bgTaskQueue->next; - task->prev->next = task->next; - task->next->prev = task->prev; - fs->bgTaskNum--; - tsdbDoDoneBgTask(fs, task); + while (fset->bgTaskNum > 0) { + STFSBgTask *nextTask = fset->bgTaskQueue->next; + nextTask->prev->next = nextTask->next; + nextTask->next->prev = nextTask->prev; + fset->bgTaskNum--; + tsdbDoDoneBgTask(fs, nextTask); } } else { // pop task from head - fs->bgTaskRunning = fs->bgTaskQueue->next; - fs->bgTaskRunning->prev->next = fs->bgTaskRunning->next; - fs->bgTaskRunning->next->prev = fs->bgTaskRunning->prev; - fs->bgTaskNum--; - vnodeScheduleTaskEx(1, tsdbFSRunBgTask, arg); + fset->bgTaskRunning = fset->bgTaskQueue->next; + fset->bgTaskRunning->prev->next = fset->bgTaskRunning->next; + fset->bgTaskRunning->next->prev = fset->bgTaskRunning->prev; + fset->bgTaskNum--; + vnodeScheduleTaskEx(1, tsdbFSRunBgTask, fset->bgTaskRunning); } } - taosThreadMutexUnlock(fs->mutex); + taosThreadMutexUnlock(&fs->tsdb->mutex); return 0; } -static int32_t tsdbFSScheduleBgTaskImpl(STFileSystem *fs, EFSBgTaskT type, int32_t (*run)(void *), - void (*destroy)(void *), void *arg, int64_t *taskid) { +// IMPORTANT: the caller must hold the fs->tsdb->mutex +int32_t tsdbFSScheduleBgTask(STFileSystem *fs, int32_t fid, EFSBgTaskT type, int32_t (*run)(void *), + void (*destroy)(void *), void *arg, int64_t *taskid) { if (fs->stop) { if (destroy) { destroy(arg); } - return 0; // TODO: use a better error code + return 0; } - for (STFSBgTask *task = fs->bgTaskQueue->next; task != fs->bgTaskQueue; task = task->next) { + STFileSet *fset; + tsdbFSGetFSet(fs, fid, &fset); + + ASSERT(fset != NULL); + + for (STFSBgTask *task = fset->bgTaskQueue->next; task != fset->bgTaskQueue; task = task->next) { if (task->type == type) { if (destroy) { destroy(arg); @@ -1203,22 +1230,24 @@ static int32_t tsdbFSScheduleBgTaskImpl(STFileSystem *fs, EFSBgTaskT type, int if (task == NULL) return TSDB_CODE_OUT_OF_MEMORY; taosThreadCondInit(task->done, NULL); + task->fs = fs; + task->fid = fid; task->type = type; task->run = run; - task->free = destroy; + task->destroy = destroy; task->arg = arg; task->scheduleTime = taosGetTimestampMs(); task->taskid = ++fs->taskid; - if (fs->bgTaskRunning == NULL && fs->bgTaskNum == 0) { + if (fset->bgTaskRunning == NULL && fset->bgTaskNum == 0) { // launch task directly - fs->bgTaskRunning = task; - vnodeScheduleTaskEx(1, tsdbFSRunBgTask, fs); + fset->bgTaskRunning = task; + vnodeScheduleTaskEx(1, tsdbFSRunBgTask, task); } else { // add to the queue tail - fs->bgTaskNum++; - task->next = fs->bgTaskQueue; - task->prev = fs->bgTaskQueue->prev; + fset->bgTaskNum++; + task->next = fset->bgTaskQueue; + task->prev = fset->bgTaskQueue->prev; task->prev->next = task; task->next->prev = task; } @@ -1227,68 +1256,30 @@ static int32_t tsdbFSScheduleBgTaskImpl(STFileSystem *fs, EFSBgTaskT type, int return 0; } -int32_t tsdbFSScheduleBgTask(STFileSystem *fs, EFSBgTaskT type, int32_t (*run)(void *), void (*free)(void *), void *arg, - int64_t *taskid) { - taosThreadMutexLock(fs->mutex); - int32_t code = tsdbFSScheduleBgTaskImpl(fs, type, run, free, arg, taskid); - taosThreadMutexUnlock(fs->mutex); - return code; -} +int32_t tsdbFSDisableBgTask(STFileSystem *fs) { + taosThreadMutexLock(&fs->tsdb->mutex); + for (;;) { + fs->stop = true; + bool done = true; -int32_t tsdbFSWaitBgTask(STFileSystem *fs, int64_t taskid) { - STFSBgTask *task = NULL; - - taosThreadMutexLock(fs->mutex); - - if (fs->bgTaskRunning && fs->bgTaskRunning->taskid == taskid) { - task = fs->bgTaskRunning; - } else { - for (STFSBgTask *taskt = fs->bgTaskQueue->next; taskt != fs->bgTaskQueue; taskt = taskt->next) { - if (taskt->taskid == taskid) { - task = taskt; + STFileSet *fset; + TARRAY2_FOREACH(fs->fSetArr, fset) { + if (fset->bgTaskRunning) { + tsdbDoWaitBgTask(fs, fset->bgTaskRunning); + done = false; break; } } - } - if (task) { - tsdbDoWaitBgTask(fs, task); + if (done) break; } - - taosThreadMutexUnlock(fs->mutex); + taosThreadMutexUnlock(&fs->tsdb->mutex); return 0; } -int32_t tsdbFSWaitAllBgTask(STFileSystem *fs) { - taosThreadMutexLock(fs->mutex); - - while (fs->bgTaskRunning) { - taosThreadCondWait(fs->bgTaskRunning->done, fs->mutex); - } - - taosThreadMutexUnlock(fs->mutex); - return 0; -} - -static int32_t tsdbFSDoDisableBgTask(STFileSystem *fs) { - fs->stop = true; - - if (fs->bgTaskRunning) { - tsdbDoWaitBgTask(fs, fs->bgTaskRunning); - } - return 0; -} - -int32_t tsdbFSDisableBgTask(STFileSystem *fs) { - taosThreadMutexLock(fs->mutex); - int32_t code = tsdbFSDoDisableBgTask(fs); - taosThreadMutexUnlock(fs->mutex); - return code; -} - int32_t tsdbFSEnableBgTask(STFileSystem *fs) { - taosThreadMutexLock(fs->mutex); + taosThreadMutexLock(&fs->tsdb->mutex); fs->stop = false; - taosThreadMutexUnlock(fs->mutex); + taosThreadMutexUnlock(&fs->tsdb->mutex); return 0; -} +} \ No newline at end of file diff --git a/source/dnode/vnode/src/tsdb/tsdbFS2.h b/source/dnode/vnode/src/tsdb/tsdbFS2.h index 31b98e5656..a3a8e2f575 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFS2.h +++ b/source/dnode/vnode/src/tsdb/tsdbFS2.h @@ -22,22 +22,11 @@ extern "C" { #endif -/* Exposed Handle */ -typedef struct STFileSystem STFileSystem; -typedef struct STFSBgTask STFSBgTask; -// typedef TARRAY2(STFileSet *) TFileSetArray; - typedef enum { TSDB_FEDIT_COMMIT = 1, // TSDB_FEDIT_MERGE } EFEditT; -typedef enum { - TSDB_BG_TASK_MERGER = 1, - TSDB_BG_TASK_RETENTION, - TSDB_BG_TASK_COMPACT, -} EFSBgTaskT; - typedef enum { TSDB_FCURRENT = 1, TSDB_FCURRENT_C, // for commit @@ -67,37 +56,17 @@ int32_t tsdbFSEditBegin(STFileSystem *fs, const TFileOpArray *opArray, EFEditT e int32_t tsdbFSEditCommit(STFileSystem *fs); int32_t tsdbFSEditAbort(STFileSystem *fs); // background task -int32_t tsdbFSScheduleBgTask(STFileSystem *fs, EFSBgTaskT type, int32_t (*run)(void *), void (*free)(void *), void *arg, - int64_t *taskid); -int32_t tsdbFSWaitBgTask(STFileSystem *fs, int64_t taskid); -int32_t tsdbFSWaitAllBgTask(STFileSystem *fs); +int32_t tsdbFSScheduleBgTask(STFileSystem *fs, int32_t fid, EFSBgTaskT type, int32_t (*run)(void *), + void (*destroy)(void *), void *arg, int64_t *taskid); int32_t tsdbFSDisableBgTask(STFileSystem *fs); int32_t tsdbFSEnableBgTask(STFileSystem *fs); // other int32_t tsdbFSGetFSet(STFileSystem *fs, int32_t fid, STFileSet **fset); -int32_t tsdbFSCheckCommit(STFileSystem *fs); +int32_t tsdbFSCheckCommit(STsdb *tsdb, int32_t fid); // utils int32_t save_fs(const TFileSetArray *arr, const char *fname); int32_t current_fname(STsdb *pTsdb, char *fname, EFCurrentT ftype); -struct STFSBgTask { - EFSBgTaskT type; - int32_t (*run)(void *arg); - void (*free)(void *arg); - void *arg; - - TdThreadCond done[1]; - int32_t numWait; - - int64_t taskid; - int64_t scheduleTime; - int64_t launchTime; - int64_t finishTime; - - struct STFSBgTask *prev; - struct STFSBgTask *next; -}; - /* Exposed Structs */ struct STFileSystem { STsdb *tsdb; @@ -109,17 +78,8 @@ struct STFileSystem { TFileSetArray fSetArrTmp[1]; // background task queue - TdThreadMutex mutex[1]; - bool stop; - int64_t taskid; - int32_t bgTaskNum; - STFSBgTask bgTaskQueue[1]; - STFSBgTask *bgTaskRunning; - - // block commit variables - TdThreadMutex commitMutex; - TdThreadCond canCommit; - bool blockCommit; + bool stop; + int64_t taskid; }; #ifdef __cplusplus diff --git a/source/dnode/vnode/src/tsdb/tsdbFSet2.c b/source/dnode/vnode/src/tsdb/tsdbFSet2.c index 620fcb3a47..642d555366 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSet2.c +++ b/source/dnode/vnode/src/tsdb/tsdbFSet2.c @@ -342,11 +342,6 @@ int32_t tsdbTFileSetEdit(STsdb *pTsdb, STFileSet *fset, const STFileOp *op) { int32_t idx = TARRAY2_SEARCH_IDX(lvl->fobjArr, &tfobjp, tsdbTFileObjCmpr, TD_EQ); ASSERT(idx >= 0); TARRAY2_REMOVE(lvl->fobjArr, idx, tsdbSttLvlClearFObj); - - if (TARRAY2_SIZE(lvl->fobjArr) == 0) { - // TODO: remove the stt level if no file exists anymore - // TARRAY2_REMOVE(&fset->lvlArr, lvl - fset->lvlArr.data, tsdbSttLvlClear); - } } else { ASSERT(tsdbIsSameTFile(&op->of, fset->farr[op->of.type]->f)); tsdbTFileObjUnref(fset->farr[op->of.type]); @@ -454,10 +449,22 @@ int32_t tsdbTFileSetInit(int32_t fid, STFileSet **fset) { fset[0]->fid = fid; fset[0]->maxVerValid = VERSION_MAX; TARRAY2_INIT(fset[0]->lvlArr); + + // background task queue + fset[0]->bgTaskNum = 0; + fset[0]->bgTaskQueue->next = fset[0]->bgTaskQueue; + fset[0]->bgTaskQueue->prev = fset[0]->bgTaskQueue; + fset[0]->bgTaskRunning = NULL; + + // block commit variables + taosThreadCondInit(&fset[0]->canCommit, NULL); + fset[0]->numWaitCommit = 0; + fset[0]->blockCommit = false; + return 0; } -int32_t tsdbTFileSetInitDup(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset) { +int32_t tsdbTFileSetInitCopy(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset) { int32_t code = tsdbTFileSetInit(fset1->fid, fset); if (code) return code; @@ -588,21 +595,23 @@ int32_t tsdbTFileSetClear(STFileSet **fset) { TARRAY2_DESTROY(fset[0]->lvlArr, tsdbSttLvlClear); + taosThreadCondDestroy(&fset[0]->canCommit); taosMemoryFree(fset[0]); fset[0] = NULL; return 0; } -int32_t tsdbTFileSetRemove(STFileSet **fset) { +int32_t tsdbTFileSetRemove(STFileSet *fset) { + if (fset == NULL) return 0; + for (tsdb_ftype_t ftype = TSDB_FTYPE_MIN; ftype < TSDB_FTYPE_MAX; ++ftype) { - if (fset[0]->farr[ftype] == NULL) continue; - tsdbTFileObjRemove(fset[0]->farr[ftype]); + if (fset->farr[ftype] == NULL) continue; + tsdbTFileObjRemove(fset->farr[ftype]); } - TARRAY2_DESTROY(fset[0]->lvlArr, tsdbSttLvlRemove); - taosMemoryFree(fset[0]); - fset[0] = NULL; + TARRAY2_DESTROY(fset->lvlArr, tsdbSttLvlRemove); + return 0; } diff --git a/source/dnode/vnode/src/tsdb/tsdbFSet2.h b/source/dnode/vnode/src/tsdb/tsdbFSet2.h index ea0f99f68e..34f174ade7 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSet2.h +++ b/source/dnode/vnode/src/tsdb/tsdbFSet2.h @@ -28,6 +28,8 @@ typedef struct SSttLvl SSttLvl; typedef TARRAY2(STFileObj *) TFileObjArray; typedef TARRAY2(SSttLvl *) TSttLvlArray; typedef TARRAY2(STFileOp) TFileOpArray; +typedef struct STFileSystem STFileSystem; +typedef struct STFSBgTask STFSBgTask; typedef enum { TSDB_FOP_NONE = 0, @@ -41,10 +43,10 @@ typedef enum { // init/clear int32_t tsdbTFileSetInit(int32_t fid, STFileSet **fset); -int32_t tsdbTFileSetInitDup(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset); +int32_t tsdbTFileSetInitCopy(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset); int32_t tsdbTFileSetInitRef(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset); int32_t tsdbTFileSetClear(STFileSet **fset); -int32_t tsdbTFileSetRemove(STFileSet **fset); +int32_t tsdbTFileSetRemove(STFileSet *fset); int32_t tsdbTFileSetFilteredInitDup(STsdb *pTsdb, const STFileSet *fset1, int64_t ever, STFileSet **fset, TFileOpArray *fopArr); @@ -58,6 +60,7 @@ int32_t tsdbJsonToTFileSet(STsdb *pTsdb, const cJSON *json, STFileSet **fset); // cmpr int32_t tsdbTFileSetCmprFn(const STFileSet **fset1, const STFileSet **fset2); // edit +int32_t tsdbSttLvlClear(SSttLvl **lvl); int32_t tsdbTFileSetEdit(STsdb *pTsdb, STFileSet *fset, const STFileOp *op); int32_t tsdbTFileSetApplyEdit(STsdb *pTsdb, const STFileSet *fset1, STFileSet *fset); // max commit id @@ -70,6 +73,33 @@ bool tsdbTFileSetIsEmpty(const STFileSet *fset); int32_t tsdbSttLvlInit(int32_t level, SSttLvl **lvl); int32_t tsdbSttLvlClear(SSttLvl **lvl); +typedef enum { + TSDB_BG_TASK_MERGER = 1, + TSDB_BG_TASK_RETENTION, + TSDB_BG_TASK_COMPACT, +} EFSBgTaskT; + +struct STFSBgTask { + STFileSystem *fs; + int32_t fid; + + EFSBgTaskT type; + int32_t (*run)(void *arg); + void (*destroy)(void *arg); + void *arg; + + TdThreadCond done[1]; + int32_t numWait; + + int64_t taskid; + int64_t scheduleTime; + int64_t launchTime; + int64_t finishTime; + + struct STFSBgTask *prev; + struct STFSBgTask *next; +}; + struct STFileOp { tsdb_fop_t optype; int32_t fid; @@ -87,6 +117,16 @@ struct STFileSet { int64_t maxVerValid; STFileObj *farr[TSDB_FTYPE_MAX]; // file array TSttLvlArray lvlArr[1]; // level array + + // background task queue + int32_t bgTaskNum; + STFSBgTask bgTaskQueue[1]; + STFSBgTask *bgTaskRunning; + + // block commit variables + TdThreadCond canCommit; + int32_t numWaitCommit; + bool blockCommit; }; struct STSnapRange { diff --git a/source/dnode/vnode/src/tsdb/tsdbMerge.c b/source/dnode/vnode/src/tsdb/tsdbMerge.c index e659cedba3..0c20a342d3 100644 --- a/source/dnode/vnode/src/tsdb/tsdbMerge.c +++ b/source/dnode/vnode/src/tsdb/tsdbMerge.c @@ -15,11 +15,17 @@ #include "tsdbMerge.h" -#define TSDB_MAX_LEVEL 6 // means max level is 7 +#define TSDB_MAX_LEVEL 2 // means max level is 3 typedef struct { - STsdb *tsdb; - TFileSetArray *fsetArr; + STsdb *tsdb; + int32_t fid; +} SMergeArg; + +typedef struct { + STsdb *tsdb; + int32_t fid; + STFileSet *fset; int32_t sttTrigger; int32_t maxRow; @@ -313,7 +319,6 @@ static int32_t tsdbMergeFileSetBeginOpenWriter(SMerger *merger) { if (merger->ctx->fset->farr[ftype]) { config.files[ftype].exist = true; config.files[ftype].file = merger->ctx->fset->farr[ftype]->f[0]; - } else { config.files[ftype].exist = false; } @@ -397,13 +402,13 @@ static int32_t tsdbMergeFileSetEnd(SMerger *merger) { code = tsdbFSEditBegin(merger->tsdb->pFS, merger->fopArr, TSDB_FEDIT_MERGE); TSDB_CHECK_CODE(code, lino, _exit); - taosThreadRwlockWrlock(&merger->tsdb->rwLock); + taosThreadMutexLock(&merger->tsdb->mutex); code = tsdbFSEditCommit(merger->tsdb->pFS); if (code) { - taosThreadRwlockUnlock(&merger->tsdb->rwLock); + taosThreadMutexUnlock(&merger->tsdb->mutex); TSDB_CHECK_CODE(code, lino, _exit); } - taosThreadRwlockUnlock(&merger->tsdb->rwLock); + taosThreadMutexUnlock(&merger->tsdb->mutex); _exit: if (code) { @@ -478,30 +483,21 @@ _exit: } static int32_t tsdbDoMerge(SMerger *merger) { - int32_t code = 0; - int32_t lino = 0; + int32_t code = 0; + int32_t lino = 0; + SSttLvl *lvl = TARRAY2_FIRST(merger->fset->lvlArr); - STFileSet *fset; - TARRAY2_FOREACH(merger->fsetArr, fset) { - if (TARRAY2_SIZE(fset->lvlArr) == 0) continue; + if (TARRAY2_SIZE(merger->fset->lvlArr) == 0) return 0; + if (lvl->level != 0 || TARRAY2_SIZE(lvl->fobjArr) < merger->sttTrigger) return 0; - SSttLvl *lvl = TARRAY2_FIRST(fset->lvlArr); + code = tsdbMergerOpen(merger); + TSDB_CHECK_CODE(code, lino, _exit); - if (lvl->level != 0 || TARRAY2_SIZE(lvl->fobjArr) < merger->sttTrigger) continue; + code = tsdbMergeFileSet(merger, merger->fset); + TSDB_CHECK_CODE(code, lino, _exit); - if (!merger->ctx->opened) { - code = tsdbMergerOpen(merger); - TSDB_CHECK_CODE(code, lino, _exit); - } - - code = tsdbMergeFileSet(merger, fset); - TSDB_CHECK_CODE(code, lino, _exit); - } - - if (merger->ctx->opened) { - code = tsdbMergerClose(merger); - TSDB_CHECK_CODE(code, lino, _exit); - } + code = tsdbMergerClose(merger); + TSDB_CHECK_CODE(code, lino, _exit); _exit: if (code) { @@ -512,36 +508,73 @@ _exit: return code; } -int32_t tsdbMerge(void *arg) { - int32_t code = 0; - int32_t lino = 0; - STsdb *tsdb = (STsdb *)arg; +static int32_t tsdbMergeGetFSet(SMerger *merger) { + STFileSet *fset; - SMerger merger[1] = {{ - .tsdb = tsdb, - .sttTrigger = tsdb->pVnode->config.sttTrigger, - }}; - - if (merger->sttTrigger <= 1) { + taosThreadMutexLock(&merger->tsdb->mutex); + tsdbFSGetFSet(merger->tsdb->pFS, merger->fid, &fset); + if (fset == NULL) { + taosThreadMutexUnlock(&merger->tsdb->mutex); return 0; } - code = tsdbFSCreateCopySnapshot(tsdb->pFS, &merger->fsetArr); + int32_t code = tsdbTFileSetInitCopy(merger->tsdb, fset, &merger->fset); + if (code) { + taosThreadMutexUnlock(&merger->tsdb->mutex); + return code; + } + taosThreadMutexUnlock(&merger->tsdb->mutex); + return 0; +} + +static int32_t tsdbMerge(void *arg) { + int32_t code = 0; + int32_t lino = 0; + SMergeArg *mergeArg = (SMergeArg *)arg; + STsdb *tsdb = mergeArg->tsdb; + + SMerger merger[1] = {{ + .tsdb = tsdb, + .fid = mergeArg->fid, + .sttTrigger = tsdb->pVnode->config.sttTrigger, + }}; + + if (merger->sttTrigger <= 1) return 0; + + // copy snapshot + code = tsdbMergeGetFSet(merger); TSDB_CHECK_CODE(code, lino, _exit); + if (merger->fset == NULL) return 0; + + // do merge + tsdbDebug("vgId:%d merge begin, fid:%d", TD_VID(tsdb->pVnode), merger->fid); code = tsdbDoMerge(merger); + tsdbDebug("vgId:%d merge done, fid:%d", TD_VID(tsdb->pVnode), mergeArg->fid); TSDB_CHECK_CODE(code, lino, _exit); - tsdbFSDestroyCopySnapshot(&merger->fsetArr); - _exit: if (code) { TSDB_ERROR_LOG(TD_VID(tsdb->pVnode), lino, code); tsdbFatal("vgId:%d, failed to merge stt files since %s. code:%d", TD_VID(tsdb->pVnode), terrstr(), code); taosMsleep(100); exit(EXIT_FAILURE); - } else if (merger->ctx->opened) { - tsdbDebug("vgId:%d %s done", TD_VID(tsdb->pVnode), __func__); } + tsdbTFileSetClear(&merger->fset); return code; } + +int32_t tsdbSchedMerge(STsdb *tsdb, int32_t fid) { + SMergeArg *arg = taosMemoryMalloc(sizeof(*arg)); + if (arg == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + arg->tsdb = tsdb; + arg->fid = fid; + + int32_t code = tsdbFSScheduleBgTask(tsdb->pFS, fid, TSDB_BG_TASK_MERGER, tsdbMerge, taosMemoryFree, arg, NULL); + if (code) taosMemoryFree(arg); + + return code; +} \ No newline at end of file diff --git a/source/dnode/vnode/src/tsdb/tsdbOpen.c b/source/dnode/vnode/src/tsdb/tsdbOpen.c index 6dd66c7a40..c32b2eedd7 100644 --- a/source/dnode/vnode/src/tsdb/tsdbOpen.c +++ b/source/dnode/vnode/src/tsdb/tsdbOpen.c @@ -53,7 +53,7 @@ int tsdbOpen(SVnode *pVnode, STsdb **ppTsdb, const char *dir, STsdbKeepCfg *pKee snprintf(pTsdb->path, TD_PATH_MAX, "%s%s%s", pVnode->path, TD_DIRSEP, dir); // taosRealPath(pTsdb->path, NULL, slen); pTsdb->pVnode = pVnode; - taosThreadRwlockInit(&pTsdb->rwLock, NULL); + taosThreadMutexInit(&pTsdb->mutex, NULL); if (!pKeepCfg) { tsdbSetKeepCfg(pTsdb, &pVnode->config.tsdbCfg); } else { @@ -99,15 +99,14 @@ int tsdbClose(STsdb **pTsdb) { tsdbDebug("vgId:%d, tsdb is close at %s, days:%d, keep:%d,%d,%d, keepTimeOffset:%d", TD_VID(pdb->pVnode), pdb->path, pdb->keepCfg.days, pdb->keepCfg.keep0, pdb->keepCfg.keep1, pdb->keepCfg.keep2, pdb->keepCfg.keepTimeOffset); - taosThreadRwlockWrlock(&(*pTsdb)->rwLock); + taosThreadMutexLock(&(*pTsdb)->mutex); tsdbMemTableDestroy((*pTsdb)->mem, true); (*pTsdb)->mem = NULL; - taosThreadRwlockUnlock(&(*pTsdb)->rwLock); - - taosThreadRwlockDestroy(&(*pTsdb)->rwLock); + taosThreadMutexUnlock(&(*pTsdb)->mutex); tsdbCloseFS(&(*pTsdb)->pFS); tsdbCloseCache(*pTsdb); + taosThreadMutexDestroy(&(*pTsdb)->mutex); taosMemoryFreeClear(*pTsdb); } return 0; diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 65cebf0ca0..41e0bd373e 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -1105,8 +1105,9 @@ static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* p (pVerRange->maxVer < pBlock->record.maxVer && pVerRange->maxVer >= pBlock->record.minVer); } -static bool getNeighborBlockOfSameTable(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pTableBlockScanInfo, - int32_t* nextIndex, int32_t order, SBrinRecord* pRecord) { +static bool getNeighborBlockOfSameTable(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pBlockInfo, + STableBlockScanInfo* pTableBlockScanInfo, int32_t* nextIndex, int32_t order, + SBrinRecord* pRecord) { bool asc = ASCENDING_TRAVERSE(order); if (asc && pBlockInfo->tbBlockIdx >= taosArrayGetSize(pTableBlockScanInfo->pBlockIdxList) - 1) { return false; @@ -1119,7 +1120,8 @@ static bool getNeighborBlockOfSameTable(SDataBlockIter* pBlockIter, SFileDataBlo int32_t step = asc ? 1 : -1; // *nextIndex = pBlockInfo->tbBlockIdx + step; // *pBlockIndex = *(SBlockIndex*)taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex); - STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pTableBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx + step); + STableDataBlockIdx* pTableDataBlockIdx = + taosArrayGet(pTableBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx + step); SFileDataBlockInfo* p = taosArrayGet(pBlockIter->blockList, pTableDataBlockIdx->globalIndex); memcpy(pRecord, &p->record, sizeof(SBrinRecord)); @@ -1145,7 +1147,8 @@ static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlock return -1; } -static int32_t setFileBlockActiveInBlockIter(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t index, int32_t step) { +static int32_t setFileBlockActiveInBlockIter(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t index, + int32_t step) { if (index < 0 || index >= pBlockIter->numOfBlocks) { return -1; } @@ -1153,12 +1156,13 @@ static int32_t setFileBlockActiveInBlockIter(STsdbReader* pReader, SDataBlockIte SFileDataBlockInfo fblock = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index); pBlockIter->index += step; - if (index != pBlockIter->index) { + if (index != pBlockIter->index) { if (index > pBlockIter->index) { for (int32_t i = index - 1; i >= pBlockIter->index; --i) { SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, i); - STableBlockScanInfo* pBlockScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr); + STableBlockScanInfo* pBlockScanInfo = + getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr); STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx); pTableDataBlockIdx->globalIndex = i + 1; @@ -1168,13 +1172,13 @@ static int32_t setFileBlockActiveInBlockIter(STsdbReader* pReader, SDataBlockIte for (int32_t i = index + 1; i <= pBlockIter->index; ++i) { SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, i); - STableBlockScanInfo* pBlockScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr); + STableBlockScanInfo* pBlockScanInfo = + getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr); STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx); pTableDataBlockIdx->globalIndex = i - 1; taosArraySet(pBlockIter->blockList, i - 1, pBlockInfo); } - } taosArraySet(pBlockIter->blockList, pBlockIter->index, &fblock); @@ -1286,7 +1290,8 @@ static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* int32_t neighborIndex = 0; SBrinRecord rec = {0}; - bool hasNeighbor = getNeighborBlockOfSameTable(&pReader->status.blockIter, pBlockInfo, pScanInfo, &neighborIndex, pReader->info.order, &rec); + bool hasNeighbor = getNeighborBlockOfSameTable(&pReader->status.blockIter, pBlockInfo, pScanInfo, &neighborIndex, + pReader->info.order, &rec); // overlap with neighbor if (hasNeighbor) { @@ -1420,9 +1425,7 @@ static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBloc } } -static void doPinSttBlock(SLastBlockReader* pLastBlockReader) { - tMergeTreePinSttBlock(&pLastBlockReader->mergeTree); -} +static void doPinSttBlock(SLastBlockReader* pLastBlockReader) { tMergeTreePinSttBlock(&pLastBlockReader->mergeTree); } static void doUnpinSttBlock(SLastBlockReader* pLastBlockReader) { tMergeTreeUnpinSttBlock(&pLastBlockReader->mergeTree); @@ -1568,7 +1571,7 @@ static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* if (minKey == tsLast) { TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree); - int32_t code = tsdbRowMergerAdd(pMerger, fRow1, NULL); + int32_t code = tsdbRowMergerAdd(pMerger, fRow1, NULL); if (code != TSDB_CODE_SUCCESS) { return code; } @@ -1618,7 +1621,7 @@ static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* if (minKey == tsLast) { TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree); - int32_t code = tsdbRowMergerAdd(pMerger, fRow1, NULL); + int32_t code = tsdbRowMergerAdd(pMerger, fRow1, NULL); if (code != TSDB_CODE_SUCCESS) { return code; } @@ -1826,8 +1829,8 @@ static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* int64_t key = hasDataInFileBlock(pBlockData, pDumpInfo) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN; - TSDBKEY k = TSDBROW_KEY(pRow); - TSDBKEY ik = TSDBROW_KEY(piRow); + TSDBKEY k = TSDBROW_KEY(pRow); + TSDBKEY ik = TSDBROW_KEY(piRow); STSchema* pSchema = NULL; if (pRow->type == TSDBROW_ROW_FMT) { @@ -2219,8 +2222,9 @@ static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanI SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo; TSDBROW *pRow = NULL, *piRow = NULL; - int64_t key = (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : - (ASCENDING_TRAVERSE(pReader->info.order) ? INT64_MAX : INT64_MIN); + int64_t key = (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) + ? pBlockData->aTSKEY[pDumpInfo->rowIndex] + : (ASCENDING_TRAVERSE(pReader->info.order) ? INT64_MAX : INT64_MIN); if (pBlockScanInfo->iter.hasVal) { pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader); } @@ -2257,7 +2261,8 @@ static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlock *loadNeighbor = false; SBrinRecord rec = {0}; - bool hasNeighbor = getNeighborBlockOfSameTable(&pReader->status.blockIter, pBlockInfo, pBlockScanInfo, &nextIndex, pReader->info.order, &rec); + bool hasNeighbor = getNeighborBlockOfSameTable(&pReader->status.blockIter, pBlockInfo, pBlockScanInfo, &nextIndex, + pReader->info.order, &rec); if (!hasNeighbor) { // do nothing return code; } @@ -2268,7 +2273,7 @@ static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlock // 1. find the next neighbor block in the scan block list STableDataBlockIdx* tableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, nextIndex); - int32_t neighborIndex = tableDataBlockIdx->globalIndex; + int32_t neighborIndex = tableDataBlockIdx->globalIndex; // 2. remove it from the scan block list setFileBlockActiveInBlockIter(pReader, pBlockIter, neighborIndex, step); @@ -2704,7 +2709,7 @@ static int32_t doBuildDataBlock(STsdbReader* pReader) { (ASCENDING_TRAVERSE(pReader->info.order)) ? pBlockInfo->record.firstKey : pBlockInfo->record.lastKey; code = buildDataBlockFromBuf(pReader, pScanInfo, endKey); } else { - bool bHasDataInLastBlock = hasDataInLastBlock(pLastBlockReader); + bool bHasDataInLastBlock = hasDataInLastBlock(pLastBlockReader); int64_t tsLast = bHasDataInLastBlock ? getCurrentKeyInLastBlock(pLastBlockReader) : INT64_MIN; if (!bHasDataInLastBlock || ((asc && pBlockInfo->record.lastKey < tsLast) || (!asc && pBlockInfo->record.firstKey > tsLast))) { @@ -3479,7 +3484,7 @@ int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, // start to merge duplicated rows STSchema* pTSchema = NULL; - if (current.type == TSDBROW_ROW_FMT) { // get the correct schema for row-wise data in memory + if (current.type == TSDBROW_ROW_FMT) { // get the correct schema for row-wise data in memory pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(¤t), pReader, uid); if (pTSchema == NULL) { return terrno; @@ -3525,8 +3530,8 @@ int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* p SRow** pTSRow) { SRowMerger* pMerger = &pReader->status.merger; - TSDBKEY k = TSDBROW_KEY(pRow); - TSDBKEY ik = TSDBROW_KEY(piRow); + TSDBKEY k = TSDBROW_KEY(pRow); + TSDBKEY ik = TSDBROW_KEY(piRow); STSchema* pSchema = NULL; if (pRow->type == TSDBROW_ROW_FMT) { @@ -4907,12 +4912,12 @@ int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STs SVersionRange* pRange = &pReader->info.verRange; // lock - taosThreadRwlockRdlock(&pTsdb->rwLock); + taosThreadMutexLock(&pTsdb->mutex); // alloc STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap)); if (pSnap == NULL) { - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); code = TSDB_CODE_OUT_OF_MEMORY; goto _exit; } @@ -4922,7 +4927,7 @@ int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STs pSnap->pMem = pTsdb->mem; pSnap->pNode = taosMemoryMalloc(sizeof(*pSnap->pNode)); if (pSnap->pNode == NULL) { - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); code = TSDB_CODE_OUT_OF_MEMORY; goto _exit; } @@ -4937,7 +4942,7 @@ int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STs pSnap->pIMem = pTsdb->imem; pSnap->pINode = taosMemoryMalloc(sizeof(*pSnap->pINode)); if (pSnap->pINode == NULL) { - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); code = TSDB_CODE_OUT_OF_MEMORY; goto _exit; } @@ -4952,7 +4957,7 @@ int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STs code = tsdbFSCreateRefSnapshotWithoutLock(pTsdb->pFS, &pSnap->pfSetArray); // unlock - taosThreadRwlockUnlock(&pTsdb->rwLock); + taosThreadMutexUnlock(&pTsdb->mutex); if (code == TSDB_CODE_SUCCESS) { tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode)); @@ -5005,4 +5010,5 @@ void tsdbReaderSetId2(STsdbReader* pReader, const char* idstr) { pReader->status.fileIter.pLastBlockReader->mergeTree.idStr = pReader->idStr; } -void tsdbReaderSetCloseFlag(STsdbReader* pReader) { /*pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;*/ } +void tsdbReaderSetCloseFlag(STsdbReader* pReader) { /*pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;*/ +} diff --git a/source/dnode/vnode/src/tsdb/tsdbRetention.c b/source/dnode/vnode/src/tsdb/tsdbRetention.c index f2665dcf26..6c41b46c73 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRetention.c +++ b/source/dnode/vnode/src/tsdb/tsdbRetention.c @@ -25,11 +25,6 @@ typedef struct { TFileSetArray *fsetArr; TFileOpArray fopArr[1]; - - struct { - int32_t fsetArrIdx; - STFileSet *fset; - } ctx[1]; } SRTNer; static int32_t tsdbDoRemoveFileObject(SRTNer *rtner, const STFileObj *fobj) { @@ -227,8 +222,8 @@ _exit: typedef struct { STsdb *tsdb; - int32_t sync; int64_t now; + int32_t fid; } SRtnArg; static int32_t tsdbDoRetentionBegin(SRtnArg *arg, SRTNer *rtner) { @@ -263,15 +258,15 @@ static int32_t tsdbDoRetentionEnd(SRTNer *rtner) { code = tsdbFSEditBegin(rtner->tsdb->pFS, rtner->fopArr, TSDB_FEDIT_MERGE); TSDB_CHECK_CODE(code, lino, _exit); - taosThreadRwlockWrlock(&rtner->tsdb->rwLock); + taosThreadMutexLock(&rtner->tsdb->mutex); code = tsdbFSEditCommit(rtner->tsdb->pFS); if (code) { - taosThreadRwlockUnlock(&rtner->tsdb->rwLock); + taosThreadMutexUnlock(&rtner->tsdb->mutex); TSDB_CHECK_CODE(code, lino, _exit); } - taosThreadRwlockUnlock(&rtner->tsdb->rwLock); + taosThreadMutexUnlock(&rtner->tsdb->mutex); TARRAY2_DESTROY(rtner->fopArr, NULL); @@ -285,95 +280,83 @@ _exit: return code; } -static int32_t tsdbDoRetention2(void *arg) { - int32_t code = 0; - int32_t lino = 0; - SRTNer rtner[1] = {0}; +static int32_t tsdbDoRetentionOnFileSet(SRTNer *rtner, STFileSet *fset) { + int32_t code = 0; + int32_t lino = 0; + STFileObj *fobj = NULL; + int32_t expLevel = tsdbFidLevel(fset->fid, &rtner->tsdb->keepCfg, rtner->now); - code = tsdbDoRetentionBegin(arg, rtner); - TSDB_CHECK_CODE(code, lino, _exit); + if (expLevel < 0) { // remove the fileset + for (int32_t ftype = 0; (ftype < TSDB_FTYPE_MAX) && (fobj = fset->farr[ftype], 1); ++ftype) { + if (fobj == NULL) continue; - for (rtner->ctx->fsetArrIdx = 0; rtner->ctx->fsetArrIdx < TARRAY2_SIZE(rtner->fsetArr); rtner->ctx->fsetArrIdx++) { - rtner->ctx->fset = TARRAY2_GET(rtner->fsetArr, rtner->ctx->fsetArrIdx); - - STFileObj *fobj; - int32_t expLevel = tsdbFidLevel(rtner->ctx->fset->fid, &rtner->tsdb->keepCfg, rtner->now); - - if (expLevel < 0) { // remove the file set - for (int32_t ftype = 0; (ftype < TSDB_FTYPE_MAX) && (fobj = rtner->ctx->fset->farr[ftype], 1); ++ftype) { - if (fobj == NULL) continue; - - int32_t nlevel = tfsGetLevel(rtner->tsdb->pVnode->pTfs); - if (tsS3Enabled && nlevel > 1 && TSDB_FTYPE_DATA == ftype && fobj->f->did.level == nlevel - 1) { - code = tsdbRemoveFileObjectS3(rtner, fobj); - TSDB_CHECK_CODE(code, lino, _exit); - } else { - code = tsdbDoRemoveFileObject(rtner, fobj); - TSDB_CHECK_CODE(code, lino, _exit); - } - } - - SSttLvl *lvl; - TARRAY2_FOREACH(rtner->ctx->fset->lvlArr, lvl) { - TARRAY2_FOREACH(lvl->fobjArr, fobj) { - code = tsdbDoRemoveFileObject(rtner, fobj); - TSDB_CHECK_CODE(code, lino, _exit); - } - } - } else if (expLevel == 0) { - continue; - } else { - SDiskID did; - - if (tfsAllocDisk(rtner->tsdb->pVnode->pTfs, expLevel, &did) < 0) { - code = terrno; + int32_t nlevel = tfsGetLevel(rtner->tsdb->pVnode->pTfs); + if (tsS3Enabled && nlevel > 1 && TSDB_FTYPE_DATA == ftype && fobj->f->did.level == nlevel - 1) { + code = tsdbRemoveFileObjectS3(rtner, fobj); + TSDB_CHECK_CODE(code, lino, _exit); + } else { + code = tsdbDoRemoveFileObject(rtner, fobj); TSDB_CHECK_CODE(code, lino, _exit); } - tfsMkdirRecurAt(rtner->tsdb->pVnode->pTfs, rtner->tsdb->path, did); + } - // data - for (int32_t ftype = 0; ftype < TSDB_FTYPE_MAX && (fobj = rtner->ctx->fset->farr[ftype], 1); ++ftype) { - if (fobj == NULL) continue; + SSttLvl *lvl; + TARRAY2_FOREACH(fset->lvlArr, lvl) { + TARRAY2_FOREACH(lvl->fobjArr, fobj) { + code = tsdbDoRemoveFileObject(rtner, fobj); + TSDB_CHECK_CODE(code, lino, _exit); + } + } + } else if (expLevel == 0) { // only migrate to upper level + return 0; + } else { // migrate + SDiskID did; + if (tfsAllocDisk(rtner->tsdb->pVnode->pTfs, expLevel, &did) < 0) { + code = terrno; + TSDB_CHECK_CODE(code, lino, _exit); + } + tfsMkdirRecurAt(rtner->tsdb->pVnode->pTfs, rtner->tsdb->path, did); + + // data + for (int32_t ftype = 0; ftype < TSDB_FTYPE_MAX && (fobj = fset->farr[ftype], 1); ++ftype) { + if (fobj == NULL) continue; + + if (fobj->f->did.level == did.level) continue; + + int32_t nlevel = tfsGetLevel(rtner->tsdb->pVnode->pTfs); + if (tsS3Enabled && nlevel > 1 && TSDB_FTYPE_DATA == ftype && did.level == nlevel - 1) { + code = tsdbMigrateDataFileS3(rtner, fobj, &did); + TSDB_CHECK_CODE(code, lino, _exit); + } else { + if (tsS3Enabled) { + int64_t fsize = 0; + if (taosStatFile(fobj->fname, &fsize, NULL, NULL) < 0) { + code = TAOS_SYSTEM_ERROR(terrno); + tsdbError("vgId:%d %s failed since file:%s stat failed, reason:%s", TD_VID(rtner->tsdb->pVnode), __func__, + fobj->fname, tstrerror(code)); + TSDB_CHECK_CODE(code, lino, _exit); + } + s3EvictCache(fobj->fname, fsize * 2); + } + + code = tsdbDoMigrateFileObj(rtner, fobj, &did); + TSDB_CHECK_CODE(code, lino, _exit); + } + } + + // stt + SSttLvl *lvl; + TARRAY2_FOREACH(fset->lvlArr, lvl) { + TARRAY2_FOREACH(lvl->fobjArr, fobj) { if (fobj->f->did.level == did.level) continue; - int32_t nlevel = tfsGetLevel(rtner->tsdb->pVnode->pTfs); - if (tsS3Enabled && nlevel > 1 && TSDB_FTYPE_DATA == ftype && did.level == nlevel - 1) { - code = tsdbMigrateDataFileS3(rtner, fobj, &did); - TSDB_CHECK_CODE(code, lino, _exit); - } else { - if (tsS3Enabled) { - int64_t fsize = 0; - if (taosStatFile(fobj->fname, &fsize, NULL, NULL) < 0) { - code = TAOS_SYSTEM_ERROR(terrno); - tsdbError("vgId:%d %s failed since file:%s stat failed, reason:%s", TD_VID(rtner->tsdb->pVnode), __func__, - fobj->fname, tstrerror(code)); - TSDB_CHECK_CODE(code, lino, _exit); - } - s3EvictCache(fobj->fname, fsize * 2); - } - - code = tsdbDoMigrateFileObj(rtner, fobj, &did); - TSDB_CHECK_CODE(code, lino, _exit); - } - } - - // stt - SSttLvl *lvl; - TARRAY2_FOREACH(rtner->ctx->fset->lvlArr, lvl) { - TARRAY2_FOREACH(lvl->fobjArr, fobj) { - if (fobj->f->did.level == did.level) continue; - - code = tsdbDoMigrateFileObj(rtner, fobj, &did); - TSDB_CHECK_CODE(code, lino, _exit); - } + code = tsdbDoMigrateFileObj(rtner, fobj, &did); + TSDB_CHECK_CODE(code, lino, _exit); } } } - code = tsdbDoRetentionEnd(rtner); - TSDB_CHECK_CODE(code, lino, _exit); - _exit: if (code) { if (TARRAY2_DATA(rtner->fopArr)) { @@ -389,30 +372,105 @@ _exit: return code; } -static void tsdbFreeRtnArg(void *arg) { - SRtnArg *rArg = (SRtnArg *)arg; - if (rArg->sync) { - tsem_post(&rArg->tsdb->pVnode->canCommit); +static int32_t tsdbDoRetentionSync(void *arg) { + int32_t code = 0; + int32_t lino = 0; + SRTNer rtner[1] = {0}; + + code = tsdbDoRetentionBegin(arg, rtner); + TSDB_CHECK_CODE(code, lino, _exit); + + STFileSet *fset; + TARRAY2_FOREACH(rtner->fsetArr, fset) { + code = tsdbDoRetentionOnFileSet(rtner, fset); + TSDB_CHECK_CODE(code, lino, _exit); } - taosMemoryFree(arg); + + code = tsdbDoRetentionEnd(rtner); + TSDB_CHECK_CODE(code, lino, _exit); + +_exit: + if (code) { + TSDB_ERROR_LOG(TD_VID(rtner->tsdb->pVnode), lino, code); + } + tsem_post(&((SRtnArg *)arg)->tsdb->pVnode->canCommit); + return code; } -int32_t tsdbRetention(STsdb *tsdb, int64_t now, int32_t sync) { - SRtnArg *arg = taosMemoryMalloc(sizeof(*arg)); - if (arg == NULL) return TSDB_CODE_OUT_OF_MEMORY; - arg->tsdb = tsdb; - arg->sync = sync; - arg->now = now; +static int32_t tsdbDoRetentionAsync(void *arg) { + int32_t code = 0; + int32_t lino = 0; + SRTNer rtner[1] = {0}; - if (sync) { - tsem_wait(&tsdb->pVnode->canCommit); + code = tsdbDoRetentionBegin(arg, rtner); + TSDB_CHECK_CODE(code, lino, _exit); + + STFileSet *fset; + TARRAY2_FOREACH(rtner->fsetArr, fset) { + if (fset->fid != ((SRtnArg *)arg)->fid) continue; + + code = tsdbDoRetentionOnFileSet(rtner, fset); + TSDB_CHECK_CODE(code, lino, _exit); } - int64_t taskid; - int32_t code = - tsdbFSScheduleBgTask(tsdb->pFS, TSDB_BG_TASK_RETENTION, tsdbDoRetention2, tsdbFreeRtnArg, arg, &taskid); + code = tsdbDoRetentionEnd(rtner); + TSDB_CHECK_CODE(code, lino, _exit); + +_exit: if (code) { - tsdbFreeRtnArg(arg); + TSDB_ERROR_LOG(TD_VID(rtner->tsdb->pVnode), lino, code); } return code; } + +static void tsdbFreeRtnArg(void *arg) { taosMemoryFree(arg); } + +int32_t tsdbRetention(STsdb *tsdb, int64_t now, int32_t sync) { + int32_t code = 0; + + if (sync) { // sync retention + SRtnArg *arg = taosMemoryMalloc(sizeof(*arg)); + if (arg == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + arg->tsdb = tsdb; + arg->now = now; + arg->fid = INT32_MAX; + + tsem_wait(&tsdb->pVnode->canCommit); + code = vnodeScheduleTask(tsdbDoRetentionSync, arg); + if (code) { + tsem_post(&tsdb->pVnode->canCommit); + taosMemoryFree(arg); + return code; + } + } else { // async retention + taosThreadMutexLock(&tsdb->mutex); + + STFileSet *fset; + TARRAY2_FOREACH(tsdb->pFS->fSetArr, fset) { + SRtnArg *arg = taosMemoryMalloc(sizeof(*arg)); + if (arg == NULL) { + taosThreadMutexUnlock(&tsdb->mutex); + return TSDB_CODE_OUT_OF_MEMORY; + } + + arg->tsdb = tsdb; + arg->now = now; + arg->fid = fset->fid; + + code = tsdbFSScheduleBgTask(tsdb->pFS, fset->fid, TSDB_BG_TASK_RETENTION, tsdbDoRetentionAsync, tsdbFreeRtnArg, + arg, NULL); + if (code) { + tsdbFreeRtnArg(arg); + taosThreadMutexUnlock(&tsdb->mutex); + return code; + } + } + + taosThreadMutexUnlock(&tsdb->mutex); + } + + return code; +} diff --git a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c index 3b4827a6be..df7154b775 100644 --- a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c +++ b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c @@ -38,8 +38,8 @@ struct STsdbSnapReader { struct { int32_t fsrArrIdx; STSnapRange* fsr; - bool isDataDone; - bool isTombDone; + bool isDataDone; + bool isTombDone; } ctx[1]; // reader @@ -1095,17 +1095,17 @@ int32_t tsdbSnapWriterClose(STsdbSnapWriter** writer, int8_t rollback) { code = tsdbFSEditAbort(writer[0]->tsdb->pFS); TSDB_CHECK_CODE(code, lino, _exit); } else { - taosThreadRwlockWrlock(&writer[0]->tsdb->rwLock); + taosThreadMutexLock(&writer[0]->tsdb->mutex); code = tsdbFSEditCommit(writer[0]->tsdb->pFS); if (code) { - taosThreadRwlockUnlock(&writer[0]->tsdb->rwLock); + taosThreadMutexUnlock(&writer[0]->tsdb->mutex); TSDB_CHECK_CODE(code, lino, _exit); } writer[0]->tsdb->pFS->fsstate = TSDB_FS_STATE_NORMAL; - taosThreadRwlockUnlock(&writer[0]->tsdb->rwLock); + taosThreadMutexUnlock(&writer[0]->tsdb->mutex); } tsdbFSEnableBgTask(tsdb->pFS); @@ -1236,7 +1236,7 @@ static int32_t tsdbTFileSetToSnapPart(STFileSet* fset, STsdbSnapPartition** ppSP if (fset->farr[ftype] == NULL) continue; typ = tsdbFTypeToSRangeTyp(ftype); ASSERT(typ < TSDB_SNAP_RANGE_TYP_MAX); - STFile* f = fset->farr[ftype]->f; + STFile* f = fset->farr[ftype]->f; if (f->maxVer > fset->maxVerValid) { corrupt = true; tsdbError("skip incomplete data file: fid:%d, maxVerValid:%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 @@ -1255,7 +1255,7 @@ static int32_t tsdbTFileSetToSnapPart(STFileSet* fset, STsdbSnapPartition** ppSP TARRAY2_FOREACH(fset->lvlArr, lvl) { STFileObj* fobj; TARRAY2_FOREACH(lvl->fobjArr, fobj) { - STFile* f = fobj->f; + STFile* f = fobj->f; if (f->maxVer > fset->maxVerValid) { corrupt = true; tsdbError("skip incomplete stt file.fid:%d, maxVerValid:%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 @@ -1299,7 +1299,7 @@ static STsdbSnapPartList* tsdbGetSnapPartList(STFileSystem* fs) { } int32_t code = 0; - taosThreadRwlockRdlock(&fs->tsdb->rwLock); + taosThreadMutexLock(&fs->tsdb->mutex); STFileSet* fset; TARRAY2_FOREACH(fs->fSetArr, fset) { STsdbSnapPartition* pItem = NULL; @@ -1311,7 +1311,7 @@ static STsdbSnapPartList* tsdbGetSnapPartList(STFileSystem* fs) { code = TARRAY2_SORT_INSERT(pList, pItem, tsdbSnapPartCmprFn); ASSERT(code == 0); } - taosThreadRwlockUnlock(&fs->tsdb->rwLock); + taosThreadMutexUnlock(&fs->tsdb->mutex); if (code) { TARRAY2_DESTROY(pList, tsdbSnapPartitionClear); From 4163a3be7c8233d9ca32de6d570b766ba8e5c00e Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 27 Oct 2023 15:21:55 +0800 Subject: [PATCH 19/31] feat: resend snap replication of data on timeout --- source/dnode/vnode/src/vnd/vnodeSnapshot.c | 1 + source/dnode/vnode/src/vnd/vnodeSync.c | 5 +- source/libs/sync/inc/syncSnapshot.h | 6 +- source/libs/sync/src/syncMain.c | 3 +- source/libs/sync/src/syncSnapshot.c | 265 +++++++++------------ source/libs/sync/src/syncTimeout.c | 5 +- 6 files changed, 126 insertions(+), 159 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index 87b407efcb..91244e321f 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -584,6 +584,7 @@ int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot * // commit json if (!rollback) { + ASSERT(pVnode->config.vgId == pWriter->info.config.vgId); pWriter->info.state.committed = pWriter->ever; pVnode->config = pWriter->info.config; pVnode->state = (SVState){.committed = pWriter->info.state.committed, diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 6c03ed68e9..c9e805d80b 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -516,7 +516,10 @@ static int32_t vnodeSnapshotStopWrite(const SSyncFSM *pFsm, void *pWriter, bool pVnode->config.vgId, isApply, pSnapshot->lastApplyIndex, pSnapshot->lastApplyTerm, pSnapshot->lastConfigIndex); int32_t code = vnodeSnapWriterClose(pWriter, !isApply, pSnapshot); - vInfo("vgId:%d, apply vnode snapshot finished, code:0x%x", pVnode->config.vgId, code); + if (code != 0) { + vError("vgId:%d, failed to finish applying vnode snapshot since %s, code:0x%x", pVnode->config.vgId, terrstr(), + code); + } return code; } diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index e68702568a..f8ee99e8a0 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -22,9 +22,9 @@ extern "C" { #include "syncInt.h" -#define SYNC_SNAPSHOT_SEQ_INVALID -2 #define SYNC_SNAPSHOT_SEQ_FORCE_CLOSE -3 -#define SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT -1 +#define SYNC_SNAPSHOT_SEQ_INVALID -2 +#define SYNC_SNAPSHOT_SEQ_PREP -1 #define SYNC_SNAPSHOT_SEQ_BEGIN 0 #define SYNC_SNAPSHOT_SEQ_END 0x7FFFFFFF @@ -57,8 +57,6 @@ typedef struct SSyncSnapshotSender { int32_t seq; int32_t ack; void *pReader; - void *pCurrentBlock; - int32_t blockLen; SSnapshotParam snapshotParam; SSnapshot snapshot; SSyncCfg lastConfig; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index f9dc10da02..199c7a1445 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -818,7 +818,8 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo, int32_t vnodeVersion) { if (!taosCheckExistFile(pSyncNode->configPath)) { // create a new raft config file - sInfo("vgId:%d, create a new raft config file", pSyncNode->vgId); + sInfo("vgId:%d, create a new raft config file", pSyncInfo->vgId); + pSyncNode->vgId = pSyncInfo->vgId; pSyncNode->raftCfg.isStandBy = pSyncInfo->isStandBy; pSyncNode->raftCfg.snapshotStrategy = pSyncInfo->snapshotStrategy; pSyncNode->raftCfg.lastConfigIndex = pSyncInfo->syncCfg.lastIndex; diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index eee0ab2cc9..92d9571906 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -31,9 +31,9 @@ static void syncSnapBufferReset(SSyncSnapBuffer *pBuf) { } pBuf->entries[i % pBuf->size] = NULL; } - pBuf->start = 1; - pBuf->end = 1; - pBuf->cursor = 0; + pBuf->start = SYNC_SNAPSHOT_SEQ_BEGIN + 1; + pBuf->end = pBuf->start; + pBuf->cursor = pBuf->start - 1; taosThreadMutexUnlock(&pBuf->mutex); } @@ -76,8 +76,6 @@ SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaI pSender->seq = SYNC_SNAPSHOT_SEQ_INVALID; pSender->ack = SYNC_SNAPSHOT_SEQ_INVALID; pSender->pReader = NULL; - pSender->pCurrentBlock = NULL; - pSender->blockLen = 0; pSender->sendingMS = SYNC_SNAPSHOT_RETRY_MS; pSender->pSyncNode = pSyncNode; pSender->replicaIndex = replicaIndex; @@ -113,12 +111,6 @@ void syncSnapBlockDestroy(void *ptr) { void snapshotSenderDestroy(SSyncSnapshotSender *pSender) { if (pSender == NULL) return; - // free current block - if (pSender->pCurrentBlock != NULL) { - taosMemoryFree(pSender->pCurrentBlock); - pSender->pCurrentBlock = NULL; - } - // close reader if (pSender->pReader != NULL) { pSender->pSyncNode->pFsm->FpSnapshotStopRead(pSender->pSyncNode->pFsm, pSender->pReader); @@ -141,11 +133,9 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { int8_t started = atomic_val_compare_exchange_8(&pSender->start, false, true); if (started) return 0; - pSender->seq = SYNC_SNAPSHOT_SEQ_BEGIN; + pSender->seq = SYNC_SNAPSHOT_SEQ_PREP; pSender->ack = SYNC_SNAPSHOT_SEQ_INVALID; pSender->pReader = NULL; - pSender->pCurrentBlock = NULL; - pSender->blockLen = 0; pSender->snapshotParam.start = SYNC_INDEX_INVALID; pSender->snapshotParam.end = SYNC_INDEX_INVALID; pSender->snapshot.data = NULL; @@ -196,7 +186,7 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; pMsg->lastConfig = pSender->lastConfig; pMsg->startTime = pSender->startTime; - pMsg->seq = SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT; + pMsg->seq = pSender->seq; if (dataLen > 0) { pMsg->payloadType = snapInfo.type; @@ -236,13 +226,6 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { pSender->pReader = NULL; } - // free current block - if (pSender->pCurrentBlock != NULL) { - taosMemoryFree(pSender->pCurrentBlock); - pSender->pCurrentBlock = NULL; - pSender->blockLen = 0; - } - syncSnapBufferReset(pSender->pSndBuf); SRaftId destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; @@ -255,39 +238,45 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { int32_t code = -1; SyncSnapBlock *pBlk = NULL; - if (pSender->seq != SYNC_SNAPSHOT_SEQ_END) { + if (pSender->seq < SYNC_SNAPSHOT_SEQ_END) { pSender->seq++; - pBlk = taosMemoryCalloc(1, sizeof(SyncSnapBlock)); - if (pBlk == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - goto _OUT; - } + if (pSender->seq > SYNC_SNAPSHOT_SEQ_BEGIN) { + pBlk = taosMemoryCalloc(1, sizeof(SyncSnapBlock)); + if (pBlk == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _OUT; + } - // read data - int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, &pBlk->pBlock, - &pBlk->blockLen); - if (ret != 0) { - sSError(pSender, "snapshot sender read failed since %s", terrstr()); - goto _OUT; - } - pBlk->seq = pSender->seq; + pBlk->seq = pSender->seq; - if (pSender->blockLen > 0) { - // has read data - sSDebug(pSender, "snapshot sender continue to read, blockLen:%d seq:%d", pSender->blockLen, pSender->seq); - } else { - // read finish, update seq to end - pSender->seq = SYNC_SNAPSHOT_SEQ_END; - sSInfo(pSender, "snapshot sender read to the end, blockLen:%d seq:%d", pSender->blockLen, pSender->seq); - code = 0; - goto _OUT; + // read data + int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader, + &pBlk->pBlock, &pBlk->blockLen); + if (ret != 0) { + sSError(pSender, "snapshot sender read failed since %s", terrstr()); + goto _OUT; + } + + if (pBlk->blockLen > 0) { + // has read data + sSDebug(pSender, "snapshot sender continue to read, blockLen:%d seq:%d", pBlk->blockLen, pBlk->seq); + } else { + // read finish, update seq to end + pSender->seq = SYNC_SNAPSHOT_SEQ_END; + sSInfo(pSender, "snapshot sender read to the end"); + code = 0; + goto _OUT; + } } } + ASSERT(pSender->seq >= SYNC_SNAPSHOT_SEQ_BEGIN && pSender->seq <= SYNC_SNAPSHOT_SEQ_END); + + int32_t blockLen = (pBlk != NULL) ? pBlk->blockLen : 0; // build msg SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSend(&rpcMsg, pSender->blockLen, pSender->pSyncNode->vgId) != 0) { + if (syncBuildSnapshotSend(&rpcMsg, blockLen, pSender->pSyncNode->vgId) != 0) { sSError(pSender, "vgId:%d, snapshot sender build msg failed since %s", pSender->pSyncNode->vgId, terrstr()); goto _OUT; } @@ -304,7 +293,7 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { pMsg->startTime = pSender->startTime; pMsg->seq = pSender->seq; - if (pBlk != NULL && pBlk->pBlock != NULL) { + if (pBlk != NULL && pBlk->pBlock != NULL && pBlk->blockLen > 0) { memcpy(pMsg->data, pBlk->pBlock, pBlk->blockLen); } @@ -317,13 +306,12 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { // put in buffer int64_t nowMs = taosGetTimestampMs(); if (pBlk) { - ASSERT(pBlk->seq != SYNC_SNAPSHOT_SEQ_END); + ASSERT(pBlk->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pBlk->seq < SYNC_SNAPSHOT_SEQ_END); pBlk->sendTimeMs = nowMs; pSender->pSndBuf->entries[pSender->seq % pSender->pSndBuf->size] = pBlk; pBlk = NULL; - pSender->pSndBuf->end = pSender->seq + 1; + pSender->pSndBuf->end = TMAX(pSender->seq + 1, pSender->pSndBuf->end); } - pSender->lastSendTime = nowMs; code = 0; @@ -337,36 +325,52 @@ _OUT:; // send snapshot data from cache int32_t snapshotReSend(SSyncSnapshotSender *pSender) { - // build msg - SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSend(&rpcMsg, pSender->blockLen, pSender->pSyncNode->vgId) != 0) { - sSError(pSender, "snapshot sender build msg failed since %s", terrstr()); - return -1; + SSyncSnapBuffer *pSndBuf = pSender->pSndBuf; + int32_t code = -1; + + taosThreadMutexLock(&pSndBuf->mutex); + + for (int32_t seq = pSndBuf->cursor + 1; seq < pSndBuf->end; ++seq) { + SyncSnapBlock *pBlk = pSndBuf->entries[seq % pSndBuf->size]; + ASSERT(pBlk && !pBlk->acked); + int64_t nowMs = taosGetTimestampMs(); + if (nowMs < pBlk->sendTimeMs + SYNC_SNAP_RESEND_MS) { + continue; + } + // build msg + SRpcMsg rpcMsg = {0}; + if (syncBuildSnapshotSend(&rpcMsg, pBlk->blockLen, pSender->pSyncNode->vgId) != 0) { + sSError(pSender, "snapshot sender build msg failed since %s", terrstr()); + goto _out; + } + + SyncSnapshotSend *pMsg = rpcMsg.pCont; + pMsg->srcId = pSender->pSyncNode->myRaftId; + pMsg->destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; + pMsg->term = pSender->term; + pMsg->beginIndex = pSender->snapshotParam.start; + pMsg->lastIndex = pSender->snapshot.lastApplyIndex; + pMsg->lastTerm = pSender->snapshot.lastApplyTerm; + pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; + pMsg->lastConfig = pSender->lastConfig; + pMsg->startTime = pSender->startTime; + pMsg->seq = pBlk->seq; + + if (pBlk->pBlock != NULL && pBlk->blockLen > 0) { + memcpy(pMsg->data, pBlk->pBlock, pBlk->blockLen); + } + + // send msg + if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { + sSError(pSender, "snapshot sender resend msg failed since %s", terrstr()); + goto _out; + } + pBlk->sendTimeMs = nowMs; + pSender->lastSendTime = nowMs; } - - SyncSnapshotSend *pMsg = rpcMsg.pCont; - pMsg->srcId = pSender->pSyncNode->myRaftId; - pMsg->destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - pMsg->term = pSender->term; - pMsg->beginIndex = pSender->snapshotParam.start; - pMsg->lastIndex = pSender->snapshot.lastApplyIndex; - pMsg->lastTerm = pSender->snapshot.lastApplyTerm; - pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; - pMsg->lastConfig = pSender->lastConfig; - pMsg->startTime = pSender->startTime; - pMsg->seq = pSender->seq; - - if (pSender->pCurrentBlock != NULL && pSender->blockLen > 0) { - memcpy(pMsg->data, pSender->pCurrentBlock, pSender->blockLen); - } - - // send msg - if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { - sSError(pSender, "snapshot sender resend msg failed since %s", terrstr()); - return -1; - } - - pSender->lastSendTime = taosGetTimestampMs(); + code = 0; +_out:; + taosThreadMutexUnlock(&pSndBuf->mutex); return 0; } @@ -523,7 +527,7 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p int8_t started = atomic_val_compare_exchange_8(&pReceiver->start, false, true); if (started) return; - pReceiver->ack = SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT; + pReceiver->ack = SYNC_SNAPSHOT_SEQ_PREP; pReceiver->term = pPreMsg->term; pReceiver->fromId = pPreMsg->srcId; pReceiver->startTime = pPreMsg->startTime; @@ -592,6 +596,11 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap // update progress pReceiver->ack = SYNC_SNAPSHOT_SEQ_END; + // get fsmState + SSnapshot snapshot = {0}; + pReceiver->pSyncNode->pFsm->FpGetSnapshotInfo(pReceiver->pSyncNode->pFsm, &snapshot); + pReceiver->pSyncNode->fsmState = snapshot.state; + // reset wal code = pReceiver->pSyncNode->pLogStore->syncLogRestoreFromSnapshot(pReceiver->pSyncNode->pLogStore, pMsg->lastIndex); @@ -600,12 +609,6 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap return -1; } sRInfo(pReceiver, "wal log restored from snapshot"); - - // get fsmState - SSnapshot snapshot = {0}; - pReceiver->pSyncNode->pFsm->FpGetSnapshotInfo(pReceiver->pSyncNode->pFsm, &snapshot); - pReceiver->pSyncNode->fsmState = snapshot.state; - } else { sRError(pReceiver, "snapshot receiver finish error since writer is null"); return -1; @@ -892,6 +895,9 @@ static int32_t syncSnapBufferRecv(SSyncSnapshotReceiver *pReceiver, SyncSnapshot pRcvBuf->entries[pMsg->seq % pRcvBuf->size] = pMsg; ppMsg[0] = NULL; pRcvBuf->end = TMAX(pMsg->seq + 1, pRcvBuf->end); + } else { + syncSnapSendRsp(pReceiver, pMsg, code); + goto _out; } for (int64_t seq = pRcvBuf->cursor + 1; seq < pRcvBuf->end; ++seq) { @@ -991,7 +997,7 @@ _SEND_REPLY:; // receiver on message // -// condition 1, recv SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT +// condition 1, recv SYNC_SNAPSHOT_SEQ_PREP // if receiver already start // if sender.start-time > receiver.start-time, restart receiver(reply snapshot start) // if sender.start-time = receiver.start-time, maybe duplicate msg @@ -1040,7 +1046,7 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { int32_t code = 0; if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) { if (pMsg->term == raftStoreGetTerm(pSyncNode)) { - if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { + if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP) { sInfo("vgId:%d, receive prepare msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotPrep(pSyncNode, pMsg); @@ -1052,16 +1058,14 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { sInfo("vgId:%d, receive end msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotEnd(pSyncNode, pMsg); - if (syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode) != 0) { + if (code != 0) { + sRError(pReceiver, "failed to end snapshot."); + code = -1; + } else if (syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode) != 0) { sRError(pReceiver, "failed to reinit log buffer since %s", terrstr()); code = -1; } - } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_FORCE_CLOSE) { - // force close, no response - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process force stop"); - snapshotReceiverStop(pReceiver); } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq data"); code = syncNodeOnSnapshotReceive(pSyncNode, ppMsg); } else { // error log @@ -1118,38 +1122,7 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend // update next index syncIndexMgrSetIndex(pSyncNode->pNextIndex, &pMsg->srcId, snapshot.lastApplyIndex + 1); - // update seq - pSender->seq = SYNC_SNAPSHOT_SEQ_BEGIN; - - // build begin msg - SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSend(&rpcMsg, 0, pSender->pSyncNode->vgId) != 0) { - sSError(pSender, "prepare snapshot failed since build msg error"); - return -1; - } - - SyncSnapshotSend *pSendMsg = rpcMsg.pCont; - pSendMsg->srcId = pSender->pSyncNode->myRaftId; - pSendMsg->destId = pSender->pSyncNode->replicasId[pSender->replicaIndex]; - pSendMsg->term = raftStoreGetTerm(pSender->pSyncNode); - pSendMsg->beginIndex = pSender->snapshotParam.start; - pSendMsg->lastIndex = pSender->snapshot.lastApplyIndex; - pSendMsg->lastTerm = pSender->snapshot.lastApplyTerm; - pSendMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; - pSendMsg->lastConfig = pSender->lastConfig; - pSendMsg->startTime = pSender->startTime; - pSendMsg->seq = SYNC_SNAPSHOT_SEQ_BEGIN; - - sSInfo(pSender, "begin snapshot replication to dnode %d.", DID(&pSendMsg->destId)); - - // send msg - syncLogSendSyncSnapshotSend(pSyncNode, pSendMsg, "snapshot sender reply pre"); - if (syncNodeSendMsgById(&pSendMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { - sSError(pSender, "prepare snapshot failed since send msg error"); - return -1; - } - - return 0; + return snapshotSend(pSender); } static int32_t snapshotSenderSignatureCmp(SSyncSnapshotSender *pSender, SyncSnapshotRsp *pMsg) { @@ -1166,10 +1139,18 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp SyncSnapshotRsp *pMsg = ppMsg[0]; taosThreadMutexLock(&pSndBuf->mutex); + if (snapshotSenderSignatureCmp(pSender, pMsg) != 0) { + code = terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + goto _out; + } + + if (pSender->pReader == NULL || pSender->finish) { + code = terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + goto _out; + } if (pMsg->ack - pSndBuf->start >= pSndBuf->size) { - terrno = TSDB_CODE_SYN_BUFFER_FULL; - code = terrno; + code = terrno = TSDB_CODE_SYN_BUFFER_FULL; goto _out; } @@ -1196,16 +1177,11 @@ static int32_t syncSnapBufferSend(SSyncSnapshotSender *pSender, SyncSnapshotRsp pSndBuf->start = ack + 1; } - pSender->ack = pSndBuf->start - 1; - while (pSender->seq != SYNC_SNAPSHOT_SEQ_END && pSender->seq - pSndBuf->start < (pSndBuf->size >> 2)) { if (snapshotSend(pSender) != 0) { code = terrno; goto _out; } - if (pSender->seq != SYNC_SNAPSHOT_SEQ_END) { - pSndBuf->end = TMAX(pSender->seq + 1, pSndBuf->end); - } } if (pSender->seq == SYNC_SNAPSHOT_SEQ_END && pSndBuf->end <= pSndBuf->start) { @@ -1285,23 +1261,17 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { } // prepare , send begin msg - if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { + if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PREP) { return syncNodeOnSnapshotPrepRsp(pSyncNode, pSender, pMsg); } - if (pSender->pReader == NULL || pSender->finish) { - sSError(pSender, "snapshot sender invalid error:%s 0x%x, pReader:%p finish:%d", tstrerror(pMsg->code), pMsg->code, - pSender->pReader, pSender->finish); - terrno = pMsg->code; - goto _ERROR; - } - - if (pMsg->ack == SYNC_SNAPSHOT_SEQ_BEGIN) { - sSInfo(pSender, "process seq begin"); - if (snapshotSend(pSender) != 0) { + // send next msg + if (pMsg->ack >= SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->ack < SYNC_SNAPSHOT_SEQ_END) { + if (syncSnapBufferSend(pSender, ppMsg) != 0) { + sSError(pSender, "failed to replicate snap since %s. seq:%d, pReader:%p, finish:%d", terrstr(), pSender->seq, + pSender->pReader, pSender->finish); goto _ERROR; } - return 0; } // receive ack is finish, close sender @@ -1312,11 +1282,6 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { return 0; } - // send next msg - if (syncSnapBufferSend(pSender, ppMsg) != 0) { - sSError(pSender, "failed to send snapshot msg since %s. seq:%d", terrstr(), pSender->seq); - goto _ERROR; - } return 0; _ERROR: diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 91c7494fa4..5837308e59 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -78,9 +78,8 @@ static int32_t syncNodeTimerRoutine(SSyncNode* ths) { SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(ths->peersId[i])); if (pSender != NULL) { if (ths->isStart && ths->state == TAOS_SYNC_STATE_LEADER && pSender->start && - timeNow - pSender->lastSendTime > SYNC_SNAP_TIMEOUT_MS) { - sSError(pSender, "snapshot timeout, terminate. lastSendTime:%d", pSender->lastSendTime); - snapshotSenderStop(pSender, false); + timeNow - pSender->lastSendTime > SYNC_SNAP_RESEND_MS) { + snapshotReSend(pSender); } } } From a7f3041ff3e28179d7916a103e45967608fad1db Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 27 Oct 2023 17:59:27 +0800 Subject: [PATCH 20/31] more fix --- source/dnode/vnode/src/tsdb/tsdbMemTable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbMemTable.c b/source/dnode/vnode/src/tsdb/tsdbMemTable.c index ee3abf7559..cc77474e79 100644 --- a/source/dnode/vnode/src/tsdb/tsdbMemTable.c +++ b/source/dnode/vnode/src/tsdb/tsdbMemTable.c @@ -191,7 +191,7 @@ int32_t tsdbDeleteTableData(STsdb *pTsdb, int64_t version, tb_uid_t suid, tb_uid pMemTable->nDel++; pMemTable->minVer = TMIN(pMemTable->minVer, version); - pMemTable->maxVer = TMIN(pMemTable->maxVer, version); + pMemTable->maxVer = TMAX(pMemTable->maxVer, version); /* if (TSDB_CACHE_LAST_ROW(pMemTable->pTsdb->pVnode->config) && tsdbKeyCmprFn(&lastKey, &pTbData->maxKey) >= 0) { tsdbCacheDeleteLastrow(pTsdb->lruCache, pTbData->uid, eKey); From c3f9cae36bfa9fa8ce7df630834259999b14e3f6 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 27 Oct 2023 18:55:30 +0800 Subject: [PATCH 21/31] refact: improve code of syncNodeOnSnapshot --- source/libs/sync/src/syncSnapshot.c | 106 +++++++++++++++------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 92d9571906..56735d479e 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -1019,6 +1019,7 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { SyncSnapshotSend **ppMsg = (SyncSnapshotSend **)&pRpcMsg->pCont; SyncSnapshotSend *pMsg = ppMsg[0]; SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; + int32_t code = 0; // if already drop replica, do not process if (!syncNodeInRaftGroup(pSyncNode, &pMsg->srcId)) { @@ -1042,47 +1043,56 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { syncNodeUpdateTermWithoutStepDown(pSyncNode, pMsg->term); } - // state, term, seq/ack - int32_t code = 0; - if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) { - if (pMsg->term == raftStoreGetTerm(pSyncNode)) { - if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP) { - sInfo("vgId:%d, receive prepare msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", - pSyncNode->vgId, pMsg->term, pMsg->startTime); - code = syncNodeOnSnapshotPrep(pSyncNode, pMsg); - } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_BEGIN) { - sInfo("vgId:%d, receive begin msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", - pSyncNode->vgId, pMsg->term, pMsg->startTime); - code = syncNodeOnSnapshotBegin(pSyncNode, pMsg); - } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) { - sInfo("vgId:%d, receive end msg of snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, - pMsg->term, pMsg->startTime); - code = syncNodeOnSnapshotEnd(pSyncNode, pMsg); - if (code != 0) { - sRError(pReceiver, "failed to end snapshot."); - code = -1; - } else if (syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode) != 0) { - sRError(pReceiver, "failed to reinit log buffer since %s", terrstr()); - code = -1; - } - } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { - code = syncNodeOnSnapshotReceive(pSyncNode, ppMsg); - } else { - // error log - sRError(pReceiver, "snapshot receiver recv error seq:%d, my ack:%d", pMsg->seq, pReceiver->ack); - code = -1; - } - } else { - // error log - sRError(pReceiver, "snapshot receiver term not equal"); - code = -1; - } - } else { - // error log - sRError(pReceiver, "snapshot receiver not follower"); - code = -1; + if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER && pSyncNode->state != TAOS_SYNC_STATE_LEARNER) { + sRError(pReceiver, "snapshot receiver not a follower or learner"); + return -1; } + if (pMsg->seq < SYNC_SNAPSHOT_SEQ_PREP || pMsg->seq > SYNC_SNAPSHOT_SEQ_END) { + sRError(pReceiver, "snap replication msg with invalid seq:%d", pMsg->seq); + return -1; + } + + // prepare + if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP) { + sInfo("vgId:%d, prepare snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, + pMsg->startTime); + code = syncNodeOnSnapshotPrep(pSyncNode, pMsg); + goto _out; + } + + // begin + if (pMsg->seq == SYNC_SNAPSHOT_SEQ_BEGIN) { + sInfo("vgId:%d, begin snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, + pMsg->startTime); + code = syncNodeOnSnapshotBegin(pSyncNode, pMsg); + goto _out; + } + + // data + if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) { + code = syncNodeOnSnapshotReceive(pSyncNode, ppMsg); + goto _out; + } + + // end + if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) { + sInfo("vgId:%d, end snap replication. msg signature:(%" PRId64 ", %" PRId64 ")", pSyncNode->vgId, pMsg->term, + pMsg->startTime); + code = syncNodeOnSnapshotEnd(pSyncNode, pMsg); + if (code != 0) { + sRError(pReceiver, "failed to end snapshot."); + goto _out; + } + + code = syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode); + if (code != 0) { + sRError(pReceiver, "failed to reinit log buffer since %s", terrstr()); + } + goto _out; + } + +_out:; syncNodeResetElectTimer(pSyncNode); return code; } @@ -1229,8 +1239,7 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { // check signature int32_t order = 0; if ((order = snapshotSenderSignatureCmp(pSender, pMsg)) > 0) { - sSError(pSender, "received a stale snapshot rsp, msg signature:(%" PRId64 ", %" PRId64 "), ignore it.", pMsg->term, - pMsg->startTime); + sSWarn(pSender, "ignore a stale snap rsp, msg signature:(%" PRId64 ", %" PRId64 ").", pMsg->term, pMsg->startTime); terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } else if (order < 0) { @@ -1239,7 +1248,6 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { goto _ERROR; } - // state, term, seq/ack if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { sSError(pSender, "snapshot sender not leader"); terrno = TSDB_CODE_SYN_NOT_LEADER; @@ -1260,12 +1268,15 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { goto _ERROR; } - // prepare , send begin msg + // send begin if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PREP) { - return syncNodeOnSnapshotPrepRsp(pSyncNode, pSender, pMsg); + sSInfo(pSender, "process prepare rsp"); + if (syncNodeOnSnapshotPrepRsp(pSyncNode, pSender, pMsg) != 0) { + goto _ERROR; + } } - // send next msg + // send msg of data or end if (pMsg->ack >= SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->ack < SYNC_SNAPSHOT_SEQ_END) { if (syncSnapBufferSend(pSender, ppMsg) != 0) { sSError(pSender, "failed to replicate snap since %s. seq:%d, pReader:%p, finish:%d", terrstr(), pSender->seq, @@ -1274,12 +1285,11 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, SRpcMsg *pRpcMsg) { } } - // receive ack is finish, close sender + // end if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) { - sSInfo(pSender, "process seq end"); + sSInfo(pSender, "process end rsp"); snapshotSenderStop(pSender, true); syncNodeReplicateReset(pSyncNode, &pMsg->srcId); - return 0; } return 0; From 55f6d2b7a96e50e364c286c0bdd813ba35ff5717 Mon Sep 17 00:00:00 2001 From: facetosea <285808407@qq.com> Date: Mon, 30 Oct 2023 09:30:35 +0800 Subject: [PATCH 22/31] use TSDB_MAX_KEEP --- source/libs/parser/src/parTranslater.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/parser/src/parTranslater.c b/source/libs/parser/src/parTranslater.c index d65f97dce7..5c7a6180c0 100644 --- a/source/libs/parser/src/parTranslater.c +++ b/source/libs/parser/src/parTranslater.c @@ -3526,7 +3526,7 @@ static void convertVarDuration(SValueNode* pOffset, uint8_t precision) { pOffset->unit = units[precision]; } -static const int64_t maxKeepMS = (int64_t)3600 * 1000 * 24 * (365 * 1000 + 250); +static const int64_t tsdbMaxKeepMS = (int64_t)60 * 1000 * TSDB_MAX_KEEP; static int32_t checkIntervalWindow(STranslateContext* pCxt, SIntervalWindowNode* pInterval) { uint8_t precision = ((SColumnNode*)pInterval->pCol)->node.resType.precision; @@ -3535,7 +3535,7 @@ static int32_t checkIntervalWindow(STranslateContext* pCxt, SIntervalWindowNode* if (pInter->datum.i <= 0 || (!valInter && pInter->datum.i < tsMinIntervalTime)) { return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_SMALL, tsMinIntervalTime, getPrecisionStr(precision)); - } else if (pInter->datum.i / getPrecisionMultiple(precision) > maxKeepMS) { + } else if (pInter->datum.i / getPrecisionMultiple(precision) > tsdbMaxKeepMS) { return generateSyntaxErrMsg(&pCxt->msgBuf, TSDB_CODE_PAR_INTER_VALUE_TOO_BIG, 1000, "years"); } From 4b6f927e6a719e763cc1b34f44fcd669cfa2f3be Mon Sep 17 00:00:00 2001 From: slzhou Date: Mon, 30 Oct 2023 10:11:57 +0800 Subject: [PATCH 23/31] fix: add test case for bi tag scan --- source/libs/parser/src/parAstCreater.c | 7 +++--- tests/parallel_test/cases.task | 1 + tests/script/tsim/query/bi_tag_scan.sim | 31 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 tests/script/tsim/query/bi_tag_scan.sim diff --git a/source/libs/parser/src/parAstCreater.c b/source/libs/parser/src/parAstCreater.c index 8196a9b329..de7ffce2d3 100644 --- a/source/libs/parser/src/parAstCreater.c +++ b/source/libs/parser/src/parAstCreater.c @@ -1024,15 +1024,14 @@ SNode* createSelectStmt(SAstCreateContext* pCxt, bool isDistinct, SNodeList* pPr SNodeList* pHint) { CHECK_PARSER_STATUS(pCxt); SNode* select = createSelectStmtImpl(isDistinct, pProjectionList, pTable, pHint); - if (pCxt->pQueryCxt->biMode) { - setSelectStmtTagMode(pCxt, select, true); - } CHECK_OUT_OF_MEM(select); return select; } SNode* setSelectStmtTagMode(SAstCreateContext* pCxt, SNode* pStmt, bool bSelectTags) { - if (pStmt && QUERY_NODE_SELECT_STMT == nodeType(pStmt)) { + if (pCxt->pQueryCxt->biMode) { + ((SSelectStmt*)pStmt)->tagScan = true; + } else if (pStmt && QUERY_NODE_SELECT_STMT == nodeType(pStmt)) { ((SSelectStmt*)pStmt)->tagScan = bSelectTags; } return pStmt; diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index 6915f802c3..1206e11c75 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -1044,6 +1044,7 @@ e ,,y,script,./test.sh -f tsim/query/tableCount.sim ,,y,script,./test.sh -f tsim/query/show_db_table_kind.sim ,,y,script,./test.sh -f tsim/query/bi_star_table.sim +,,y,script,./test.sh -f tsim/query/bi_tag_scan.sim ,,y,script,./test.sh -f tsim/query/tag_scan.sim ,,y,script,./test.sh -f tsim/query/nullColSma.sim ,,y,script,./test.sh -f tsim/query/bug3398.sim diff --git a/tests/script/tsim/query/bi_tag_scan.sim b/tests/script/tsim/query/bi_tag_scan.sim new file mode 100644 index 0000000000..5b8af68b5a --- /dev/null +++ b/tests/script/tsim/query/bi_tag_scan.sim @@ -0,0 +1,31 @@ +system sh/stop_dnodes.sh +system sh/deploy.sh -n dnode1 -i 1 +system sh/exec.sh -n dnode1 -s start +sql connect + +sql drop database if exists db1; +sql create database db1 vgroups 3; +sql create database db1; +sql use db1; +sql create stable sta (ts timestamp, f1 int, f2 binary(200)) tags(t1 int, t2 int, t3 int); +sql create stable stb (ts timestamp, f1 int, f2 binary(200)) tags(t1 int, t2 int, t3 int); +sql create table tba1 using sta tags(1, 1, 1); +sql create table tba2 using sta tags(2, 2, 2); +sql insert into tba1 values(now, 1, "1")(now+3s, 3, "3")(now+5s, 5, "5"); +sql insert into tba2 values(now + 1s, 2, "2")(now+2s, 2, "2")(now+4s, 4, "4"); +sql create table tbn1 (ts timestamp, f1 int); + +set_bi_mode 1 +sql select t1,t2,t3 from db1.sta order by t1; +print $rows $data00 $data10 +if $rows != 2 then + return -1 +endi +if $data00 != 1 then + return -1 +endi +if $data10 != 2 then + return -1 +endi + +system sh/exec.sh -n dnode1 -s stop -x SIGINT From 70e261f66217e2d1ff8f1c6edd5374db7ed921b6 Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Fri, 27 Oct 2023 19:21:54 +0800 Subject: [PATCH 24/31] enh: terminate snap replication on timeout --- include/libs/sync/sync.h | 2 +- source/libs/sync/src/syncSnapshot.c | 4 +--- source/libs/sync/src/syncTimeout.c | 16 +++++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index e3b5f3611c..a19b249bc7 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -46,7 +46,7 @@ extern "C" { #define SYNC_HEARTBEAT_SLOW_MS 1500 #define SYNC_HEARTBEAT_REPLY_SLOW_MS 1500 #define SYNC_SNAP_RESEND_MS 1000 * 60 -#define SYNC_SNAP_TIMEOUT_MS 1000 * 180 +#define SYNC_SNAP_TIMEOUT_MS 1000 * 600 #define SYNC_VND_COMMIT_MIN_MS 3000 diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 56735d479e..0b94d377f1 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -327,7 +327,6 @@ _OUT:; int32_t snapshotReSend(SSyncSnapshotSender *pSender) { SSyncSnapBuffer *pSndBuf = pSender->pSndBuf; int32_t code = -1; - taosThreadMutexLock(&pSndBuf->mutex); for (int32_t seq = pSndBuf->cursor + 1; seq < pSndBuf->end; ++seq) { @@ -366,12 +365,11 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { goto _out; } pBlk->sendTimeMs = nowMs; - pSender->lastSendTime = nowMs; } code = 0; _out:; taosThreadMutexUnlock(&pSndBuf->mutex); - return 0; + return code; } // return 0, start ok diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index 5837308e59..a57dfbee53 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -77,9 +77,19 @@ static int32_t syncNodeTimerRoutine(SSyncNode* ths) { for (int i = 0; i < ths->peersNum; ++i) { SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(ths, &(ths->peersId[i])); if (pSender != NULL) { - if (ths->isStart && ths->state == TAOS_SYNC_STATE_LEADER && pSender->start && - timeNow - pSender->lastSendTime > SYNC_SNAP_RESEND_MS) { - snapshotReSend(pSender); + if (ths->isStart && ths->state == TAOS_SYNC_STATE_LEADER && pSender->start) { + int64_t elapsedMs = timeNow - pSender->lastSendTime; + if (elapsedMs < SYNC_SNAP_RESEND_MS) { + continue; + } + + if (elapsedMs > SYNC_SNAP_TIMEOUT_MS) { + sSError(pSender, "snap replication timeout, terminate."); + snapshotSenderStop(pSender, false); + } else { + sSWarn(pSender, "snap replication resend."); + snapshotReSend(pSender); + } } } } From 78cce0003767bb44b4b876a1e8e4954a9bc2a8bc Mon Sep 17 00:00:00 2001 From: Benguang Zhao Date: Mon, 30 Oct 2023 10:46:17 +0800 Subject: [PATCH 25/31] test: pause 1s after creating stream in testcase sliding.sim --- tests/script/tsim/stream/sliding.sim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/script/tsim/stream/sliding.sim b/tests/script/tsim/stream/sliding.sim index 18893245fa..a92da7f472 100644 --- a/tests/script/tsim/stream/sliding.sim +++ b/tests/script/tsim/stream/sliding.sim @@ -21,6 +21,7 @@ sql create stream streams1 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 in sql create stream streams2 trigger at_once watermark 1d IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt2 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from t1 interval(10s) sliding (5s); sql create stream stream_t1 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamtST as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from st interval(10s) sliding (5s); sql create stream stream_t2 trigger at_once watermark 1d IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamtST2 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from st interval(10s) sliding (5s); +sleep 1000 sql insert into t1 values(1648791210000,1,2,3,1.0); sql insert into t1 values(1648791216000,2,2,3,1.1); @@ -311,6 +312,7 @@ sql create table t2 using st tags(2,2,2); sql create stream streams11 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from t1 interval(10s, 5s); sql create stream streams12 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt2 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from st interval(10s, 5s); +sleep 1000 sql insert into t1 values(1648791213000,1,2,3,1.0); sql insert into t1 values(1648791223001,2,2,3,1.1); @@ -444,6 +446,7 @@ sql create table t2 using st tags(2,2,2); sql create stream streams21 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt21 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from t1 interval(10s, 5s); sql create stream streams22 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt22 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from st interval(10s, 5s); +sleep 1000 sql insert into t1 values(1648791213000,1,1,1,1.0); sql insert into t1 values(1648791223001,2,2,2,1.1); @@ -582,6 +585,7 @@ sql create table t1 using st tags(1,1,1); sql create table t2 using st tags(2,2,2); sql create stream streams23 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt23 as select _wstart, count(*) c1, sum(a) c3 , max(b) c4, min(c) c5 from st interval(20s) sliding(10s); +sleep 1000 sql insert into t1 values(1648791213000,1,1,1,1.0); sql insert into t1 values(1648791223001,2,2,2,1.1); @@ -706,6 +710,7 @@ sql create table t1 using st tags(1,1,1); sql create table t2 using st tags(2,2,2); sql create stream streams4 trigger at_once IGNORE EXPIRED 0 IGNORE UPDATE 0 into streamt4 as select _wstart as ts, count(*),min(a) c1 from st interval(10s) sliding(5s); +sleep 1000 sql insert into t1 values(1648791213000,1,1,1,1.0); sql insert into t1 values(1648791243000,2,1,1,1.0); @@ -818,4 +823,4 @@ print ============loop_all=$loop_all #=goto looptest -system sh/stop_dnodes.sh \ No newline at end of file +system sh/stop_dnodes.sh From be0a85a0ffd4fca365d150682af58ae1d562675e Mon Sep 17 00:00:00 2001 From: dmchen Date: Mon, 30 Oct 2023 07:19:08 +0000 Subject: [PATCH 26/31] memory leak --- source/dnode/vnode/src/vnd/vnodeSvr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 040ae96972..d4bea4332b 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -1008,7 +1008,7 @@ _exit: taosArrayDestroy(tbUids); tDecoderClear(&decoder); tEncoderClear(&encoder); - taosArrayDestroy(tbNames); + taosArrayDestroyP(tbNames, taosMemoryFree); return rcode; } @@ -1225,7 +1225,7 @@ _exit: tEncodeSVDropTbBatchRsp(&encoder, &rsp); tEncoderClear(&encoder); taosArrayDestroy(rsp.pArray); - taosArrayDestroy(tbNames); + taosArrayDestroyP(tbNames, taosMemoryFree); return 0; } From 0f40e1dbb90fb638328412d83354103eafccc889 Mon Sep 17 00:00:00 2001 From: dmchen Date: Mon, 30 Oct 2023 08:57:07 +0000 Subject: [PATCH 27/31] memory leak --- source/dnode/vnode/src/vnd/vnodeSvr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index d4bea4332b..81baa5a54e 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -1008,7 +1008,7 @@ _exit: taosArrayDestroy(tbUids); tDecoderClear(&decoder); tEncoderClear(&encoder); - taosArrayDestroyP(tbNames, taosMemoryFree); + taosArrayDestroyEx(tbNames, taosMemoryFree); return rcode; } @@ -1225,7 +1225,7 @@ _exit: tEncodeSVDropTbBatchRsp(&encoder, &rsp); tEncoderClear(&encoder); taosArrayDestroy(rsp.pArray); - taosArrayDestroyP(tbNames, taosMemoryFree); + taosArrayDestroyEx(tbNames, taosMemoryFree); return 0; } From f852f87f94d9fff0156852bfc9f7cc35119735e3 Mon Sep 17 00:00:00 2001 From: tjuzyp Date: Mon, 30 Oct 2023 17:31:42 +0800 Subject: [PATCH 28/31] feat: do not build taosx in ci --- tests/parallel_test/container_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/parallel_test/container_build.sh b/tests/parallel_test/container_build.sh index 94704b1c25..f4db7edb8b 100755 --- a/tests/parallel_test/container_build.sh +++ b/tests/parallel_test/container_build.sh @@ -69,7 +69,7 @@ docker run \ -v ${REP_REAL_PATH}/community/contrib/libuv/:${REP_DIR}/community/contrib/libuv \ -v ${REP_REAL_PATH}/community/contrib/lz4/:${REP_DIR}/community/contrib/lz4 \ -v ${REP_REAL_PATH}/community/contrib/zlib/:${REP_DIR}/community/contrib/zlib \ - --rm --ulimit core=-1 taos_test:v1.0 sh -c "pip uninstall taospy -y;pip3 install taospy==2.7.2;cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_TAOSX=true -DJEMALLOC_ENABLED=0;make -j 10|| exit 1" + --rm --ulimit core=-1 taos_test:v1.0 sh -c "pip uninstall taospy -y;pip3 install taospy==2.7.2;cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_TAOSX=false -DJEMALLOC_ENABLED=0;make -j 10|| exit 1" # -v ${REP_REAL_PATH}/community/contrib/jemalloc/:${REP_DIR}/community/contrib/jemalloc \ if [[ -d ${WORKDIR}/debugNoSan ]] ;then @@ -99,7 +99,7 @@ docker run \ -v ${REP_REAL_PATH}/community/contrib/lz4/:${REP_DIR}/community/contrib/lz4 \ -v ${REP_REAL_PATH}/community/contrib/zlib/:${REP_DIR}/community/contrib/zlib \ -v ${REP_REAL_PATH}/community/contrib/jemalloc/:${REP_DIR}/community/contrib/jemalloc \ - --rm --ulimit core=-1 taos_test:v1.0 sh -c "pip uninstall taospy -y;pip3 install taospy==2.7.2;cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_SANITIZER=1 -DTOOLS_SANITIZE=true -DTOOLS_BUILD_TYPE=Debug -DBUILD_TAOSX=true -DJEMALLOC_ENABLED=0;make -j 10|| exit 1 " + --rm --ulimit core=-1 taos_test:v1.0 sh -c "pip uninstall taospy -y;pip3 install taospy==2.7.2;cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_SANITIZER=1 -DTOOLS_SANITIZE=true -DTOOLS_BUILD_TYPE=Debug -DBUILD_TAOSX=false -DJEMALLOC_ENABLED=0;make -j 10|| exit 1 " mv ${REP_REAL_PATH}/debug ${WORKDIR}/debugSan From fe044051c8ffd90d136bdf0192b9e53ae07ccf9f Mon Sep 17 00:00:00 2001 From: dmchen Date: Mon, 30 Oct 2023 11:34:34 +0000 Subject: [PATCH 29/31] memory leak --- source/dnode/vnode/src/vnd/vnodeSvr.c | 34 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 81baa5a54e..b6a4aaf388 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -950,9 +950,11 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, taosArrayPush(rsp.pArray, &cRsp); - char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); - strcpy(str, pCreateReq->name); - taosArrayPush(tbNames, str); + if(tsEnableAuditCreateTable){ + char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); + strcpy(str, pCreateReq->name); + taosArrayPush(tbNames, &str); + } } vDebug("vgId:%d, add %d new created tables into query table list", TD_VID(pVnode), (int32_t)taosArrayGetSize(tbUids)); @@ -982,11 +984,12 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t ver, void *pReq, SStringBuilder sb = {0}; for(int32_t iReq = 0; iReq < req.nReqs; iReq++){ - char* key = taosArrayGet(tbNames, iReq); - taosStringBuilderAppendStringLen(&sb, key, strlen(key)); + char** key = (char**)taosArrayGet(tbNames, iReq); + taosStringBuilderAppendStringLen(&sb, *key, strlen(*key)); if(iReq < req.nReqs - 1){ taosStringBuilderAppendChar(&sb, ','); } + taosMemoryFreeClear(*key); } size_t len = 0; @@ -1002,13 +1005,13 @@ _exit: pCreateReq = req.pReqs + iReq; taosMemoryFree(pCreateReq->sql); taosMemoryFree(pCreateReq->comment); - taosArrayDestroy(pCreateReq->ctb.tagName); + taosArrayDestroy(pCreateReq->ctb.tagName); } taosArrayDestroyEx(rsp.pArray, tFreeSVCreateTbRsp); taosArrayDestroy(tbUids); tDecoderClear(&decoder); tEncoderClear(&encoder); - taosArrayDestroyEx(tbNames, taosMemoryFree); + taosArrayDestroy(tbNames); return rcode; } @@ -1184,9 +1187,11 @@ static int32_t vnodeProcessDropTbReq(SVnode *pVnode, int64_t ver, void *pReq, in taosArrayPush(rsp.pArray, &dropTbRsp); - char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); - strcpy(str, pDropTbReq->name); - taosArrayPush(tbNames, str); + if(tsEnableAuditCreateTable){ + char* str = taosMemoryCalloc(1, TSDB_TABLE_FNAME_LEN); + strcpy(str, pDropTbReq->name); + taosArrayPush(tbNames, &str); + } } tqUpdateTbUidList(pVnode->pTq, tbUids, false); @@ -1200,17 +1205,18 @@ static int32_t vnodeProcessDropTbReq(SVnode *pVnode, int64_t ver, void *pReq, in SStringBuilder sb = {0}; for(int32_t iReq = 0; iReq < req.nReqs; iReq++){ - char* key = taosArrayGet(tbNames, iReq); - taosStringBuilderAppendStringLen(&sb, key, strlen(key)); + char** key = (char**)taosArrayGet(tbNames, iReq); + taosStringBuilderAppendStringLen(&sb, *key, strlen(*key)); if(iReq < req.nReqs - 1){ taosStringBuilderAppendChar(&sb, ','); } + taosMemoryFreeClear(*key); } size_t len = 0; char* keyJoined = taosStringBuilderGetResult(&sb, &len); - auditRecord(NULL, clusterId, "createTable", name.dbname, "", keyJoined, len); + auditRecord(NULL, clusterId, "dropTable", name.dbname, "", keyJoined, len); taosStringBuilderDestroy(&sb); } @@ -1225,7 +1231,7 @@ _exit: tEncodeSVDropTbBatchRsp(&encoder, &rsp); tEncoderClear(&encoder); taosArrayDestroy(rsp.pArray); - taosArrayDestroyEx(tbNames, taosMemoryFree); + taosArrayDestroy(tbNames); return 0; } From e800c993dd81b16175cbbf7622402d2262673587 Mon Sep 17 00:00:00 2001 From: slzhou Date: Tue, 31 Oct 2023 07:37:56 +0800 Subject: [PATCH 30/31] fix: set scan tags when bi mode --- source/libs/parser/src/parAstCreater.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/source/libs/parser/src/parAstCreater.c b/source/libs/parser/src/parAstCreater.c index de7ffce2d3..c6d70667bf 100644 --- a/source/libs/parser/src/parAstCreater.c +++ b/source/libs/parser/src/parAstCreater.c @@ -1028,11 +1028,13 @@ SNode* createSelectStmt(SAstCreateContext* pCxt, bool isDistinct, SNodeList* pPr return select; } -SNode* setSelectStmtTagMode(SAstCreateContext* pCxt, SNode* pStmt, bool bSelectTags) { - if (pCxt->pQueryCxt->biMode) { - ((SSelectStmt*)pStmt)->tagScan = true; - } else if (pStmt && QUERY_NODE_SELECT_STMT == nodeType(pStmt)) { - ((SSelectStmt*)pStmt)->tagScan = bSelectTags; +SNode* setSelectStmtTagMode(SAstCreateContext* pCxt, SNode* pStmt, bool bSelectTags) { + if (pStmt && QUERY_NODE_SELECT_STMT == nodeType(pStmt)) { + if (pCxt->pQueryCxt->biMode) { + ((SSelectStmt*)pStmt)->tagScan = true; + } else { + ((SSelectStmt*)pStmt)->tagScan = bSelectTags; + } } return pStmt; } From 72a866988bb602a9d782b5c32780714004257eec Mon Sep 17 00:00:00 2001 From: dapan1121 Date: Tue, 31 Oct 2023 11:42:00 +0800 Subject: [PATCH 31/31] enh: disable telemetry in enterprise version --- source/common/src/tglobal.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index 0a155f4ea1..c6cff27011 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -104,13 +104,21 @@ uint16_t tsAuditPort = 6043; bool tsEnableAuditCreateTable = true; // telem +#ifdef TD_ENTERPRISE +bool tsEnableTelem = false; +#else bool tsEnableTelem = true; +#endif int32_t tsTelemInterval = 43200; char tsTelemServer[TSDB_FQDN_LEN] = "telemetry.tdengine.com"; uint16_t tsTelemPort = 80; char *tsTelemUri = "/report"; +#ifdef TD_ENTERPRISE +bool tsEnableCrashReport = false; +#else bool tsEnableCrashReport = true; +#endif char *tsClientCrashReportUri = "/ccrashreport"; char *tsSvrCrashReportUri = "/dcrashreport";