refactor: do some internal refactor.

This commit is contained in:
Haojun Liao 2024-09-15 01:19:30 +08:00
parent c4a49a7bd9
commit 3332a0b822
5 changed files with 86 additions and 59 deletions

View File

@ -494,6 +494,14 @@ typedef struct SScanWalInfo {
tmr_h scanTimer; tmr_h scanTimer;
} SScanWalInfo; } SScanWalInfo;
typedef struct SFatalErrInfo {
int32_t code;
int64_t ts;
int32_t threadId;
int32_t line;
char func[128];
} SFatalErrInfo;
// meta // meta
typedef struct SStreamMeta { typedef struct SStreamMeta {
char* path; char* path;
@ -523,13 +531,12 @@ typedef struct SStreamMeta {
int32_t numOfStreamTasks; // this value should be increased when a new task is added into the meta int32_t numOfStreamTasks; // this value should be increased when a new task is added into the meta
int32_t numOfPausedTasks; int32_t numOfPausedTasks;
int64_t rid; int64_t rid;
SFatalErrInfo fatalInfo; // fatal error occurs, stream stop to execute
int64_t chkpId; int64_t chkpId;
int32_t chkpCap; int32_t chkpCap;
SArray* chkpSaved; SArray* chkpSaved;
SArray* chkpInUse; SArray* chkpInUse;
SRWLatch chkpDirLock; SRWLatch chkpDirLock;
void* qHandle; // todo remove it
void* bkdChkptMgt; void* bkdChkptMgt;
} SStreamMeta; } SStreamMeta;
@ -776,6 +783,9 @@ void streamMetaRLock(SStreamMeta* pMeta);
void streamMetaRUnLock(SStreamMeta* pMeta); void streamMetaRUnLock(SStreamMeta* pMeta);
void streamMetaWLock(SStreamMeta* pMeta); void streamMetaWLock(SStreamMeta* pMeta);
void streamMetaWUnLock(SStreamMeta* pMeta); void streamMetaWUnLock(SStreamMeta* pMeta);
void streamSetFatalError(SStreamMeta* pMeta, int32_t code, const char* funcName, int32_t lino);
int32_t streamGetFatalError(const SStreamMeta* pMeta);
void streamMetaResetStartInfo(STaskStartInfo* pMeta, int32_t vgId); void streamMetaResetStartInfo(STaskStartInfo* pMeta, int32_t vgId);
int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pTaskList); int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pTaskList);
void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader); void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader);

View File

@ -1143,18 +1143,16 @@ int32_t extractStreamNodeList(SMnode *pMnode) {
return taosArrayGetSize(execInfo.pNodeList); return taosArrayGetSize(execInfo.pNodeList);
} }
static bool taskNodeIsUpdated(SMnode *pMnode) { static int32_t doCheckForUpdated(SMnode *pMnode, SArray **ppNodeSnapshot) {
bool allReady = true; bool allReady = false;
SArray *pNodeSnapshot = NULL; bool nodeUpdated = false;
SVgroupChangeInfo changeInfo = {0};
// check if the node update happens or not
streamMutexLock(&execInfo.lock);
int32_t numOfNodes = extractStreamNodeList(pMnode); int32_t numOfNodes = extractStreamNodeList(pMnode);
if (numOfNodes == 0) { if (numOfNodes == 0) {
mDebug("stream task node change checking done, no vgroups exist, do nothing"); mDebug("stream task node change checking done, no vgroups exist, do nothing");
execInfo.ts = taosGetTimestampSec(); execInfo.ts = taosGetTimestampSec();
streamMutexUnlock(&execInfo.lock);
return false; return false;
} }
@ -1166,43 +1164,46 @@ static bool taskNodeIsUpdated(SMnode *pMnode) {
if (pNodeEntry->stageUpdated) { if (pNodeEntry->stageUpdated) {
mDebug("stream task not ready due to node update detected, checkpoint not issued"); mDebug("stream task not ready due to node update detected, checkpoint not issued");
streamMutexUnlock(&execInfo.lock);
return true; return true;
} }
} }
int32_t code = mndTakeVgroupSnapshot(pMnode, &allReady, &pNodeSnapshot); int32_t code = mndTakeVgroupSnapshot(pMnode, &allReady, ppNodeSnapshot);
if (code) { if (code) {
mError("failed to get the vgroup snapshot, ignore it and continue"); mError("failed to get the vgroup snapshot, ignore it and continue");
} }
if (!allReady) { if (!allReady) {
mWarn("not all vnodes ready, quit from vnodes status check"); mWarn("not all vnodes ready, quit from vnodes status check");
taosArrayDestroy(pNodeSnapshot);
streamMutexUnlock(&execInfo.lock);
return true; return true;
} }
SVgroupChangeInfo changeInfo = {0}; code = mndFindChangedNodeInfo(pMnode, execInfo.pNodeList, *ppNodeSnapshot, &changeInfo);
code = mndFindChangedNodeInfo(pMnode, execInfo.pNodeList, pNodeSnapshot, &changeInfo);
if (code) { if (code) {
streamMutexUnlock(&execInfo.lock); nodeUpdated = false;
return false; } else {
} nodeUpdated = (taosArrayGetSize(changeInfo.pUpdateNodeList) > 0);
bool nodeUpdated = (taosArrayGetSize(changeInfo.pUpdateNodeList) > 0);
mndDestroyVgroupChangeInfo(&changeInfo);
taosArrayDestroy(pNodeSnapshot);
if (nodeUpdated) { if (nodeUpdated) {
mDebug("stream tasks not ready due to node update"); mDebug("stream tasks not ready due to node update");
} }
}
streamMutexUnlock(&execInfo.lock); mndDestroyVgroupChangeInfo(&changeInfo);
return nodeUpdated; return nodeUpdated;
} }
// check if the node update happens or not
static bool taskNodeIsUpdated(SMnode *pMnode) {
SArray *pNodeSnapshot = NULL;
streamMutexLock(&execInfo.lock);
bool updated = doCheckForUpdated(pMnode, &pNodeSnapshot);
streamMutexUnlock(&execInfo.lock);
taosArrayDestroy(pNodeSnapshot);
return updated;
}
static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) {
bool ready = true; bool ready = true;
if (taskNodeIsUpdated(pMnode)) { if (taskNodeIsUpdated(pMnode)) {
@ -1993,7 +1994,7 @@ static int32_t mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pPrevNodeLis
if (pInfo->pUpdateNodeList == NULL || pInfo->pDBMap == NULL) { if (pInfo->pUpdateNodeList == NULL || pInfo->pDBMap == NULL) {
mndDestroyVgroupChangeInfo(pInfo); mndDestroyVgroupChangeInfo(pInfo);
return terrno; TSDB_CHECK_NULL(NULL, code, lino, _err, terrno);
} }
int32_t numOfNodes = taosArrayGetSize(pPrevNodeList); int32_t numOfNodes = taosArrayGetSize(pPrevNodeList);
@ -2048,6 +2049,7 @@ static int32_t mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pPrevNodeLis
return code; return code;
_err: _err:
mError("failed to find node change info, code:%s at %s line:%d", tstrerror(code), __func__, lino);
mndDestroyVgroupChangeInfo(pInfo); mndDestroyVgroupChangeInfo(pInfo);
return code; return code;
} }

View File

@ -160,6 +160,13 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM
tDecoderClear(&decoder); tDecoderClear(&decoder);
int32_t gError = streamGetFatalError(pMeta);
if (gError != 0) {
tqError("vgId:%d global fatal occurs, code:%s, ts:%" PRId64 " func:%s", pMeta->vgId, tstrerror(gError),
pMeta->fatalInfo.ts, pMeta->fatalInfo.func);
return 0;
}
// update the nodeEpset when it exists // update the nodeEpset when it exists
streamMetaWLock(pMeta); streamMetaWLock(pMeta);
@ -290,8 +297,11 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM
tqDebug("vgId:%d closed tasks:%d, unclosed:%d, all tasks will be started when nodeEp update completed", vgId, tqDebug("vgId:%d closed tasks:%d, unclosed:%d, all tasks will be started when nodeEp update completed", vgId,
updateTasks, (numOfTasks - updateTasks)); updateTasks, (numOfTasks - updateTasks));
} else { } else {
if (streamMetaCommit(pMeta) < 0) { if ((code = streamMetaCommit(pMeta)) < 0) {
// persist to disk // always return true
streamMetaWUnLock(pMeta);
taosArrayDestroy(req.pNodeList);
return TSDB_CODE_SUCCESS;
} }
streamMetaClearSetUpdateTaskListComplete(pMeta); streamMetaClearSetUpdateTaskListComplete(pMeta);
@ -754,8 +764,9 @@ int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored
} }
streamMetaWUnLock(pMeta); streamMetaWUnLock(pMeta);
// always return success when handling the requirement issued by mnode during transaction. // always return success when handling the requirement issued by mnode during transaction.
return code; return TSDB_CODE_SUCCESS;
} }
static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) {
@ -1197,10 +1208,6 @@ int32_t tqStreamProcessReqCheckpointRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { ret
int32_t tqStreamProcessChkptReportRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { return doProcessDummyRspMsg(pMeta, pMsg); } int32_t tqStreamProcessChkptReportRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { return doProcessDummyRspMsg(pMeta, pMsg); }
int32_t tqStreamProcessConsensusChkptRsp2(SStreamMeta* pMeta, SRpcMsg* pMsg) {
return doProcessDummyRspMsg(pMeta, pMsg);
}
int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) {
SMStreamCheckpointReadyRspMsg* pRsp = pMsg->pCont; SMStreamCheckpointReadyRspMsg* pRsp = pMsg->pCont;
@ -1221,14 +1228,13 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) {
int32_t vgId = pMeta->vgId; int32_t vgId = pMeta->vgId;
int32_t code = 0; int32_t code = 0;
SStreamTask* pTask = NULL; SStreamTask* pTask = NULL;
SRestoreCheckpointInfo req = {0};
char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead));
int32_t len = pMsg->contLen - sizeof(SMsgHead); int32_t len = pMsg->contLen - sizeof(SMsgHead);
int64_t now = taosGetTimestampMs(); int64_t now = taosGetTimestampMs();
SDecoder decoder; SDecoder decoder;
tDecoderInit(&decoder, (uint8_t*)msg, len); SRestoreCheckpointInfo req = {0};
tDecoderInit(&decoder, (uint8_t*)msg, len);
if (tDecodeRestoreCheckpointInfo(&decoder, &req) < 0) { if (tDecodeRestoreCheckpointInfo(&decoder, &req) < 0) {
tqError("vgId:%d failed to decode set consensus checkpointId req, code:%s", vgId, tstrerror(code)); tqError("vgId:%d failed to decode set consensus checkpointId req, code:%s", vgId, tstrerror(code));
tDecoderClear(&decoder); tDecoderClear(&decoder);
@ -1239,8 +1245,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) {
code = streamMetaAcquireTask(pMeta, req.streamId, req.taskId, &pTask); code = streamMetaAcquireTask(pMeta, req.streamId, req.taskId, &pTask);
if (pTask == NULL || (code != 0)) { if (pTask == NULL || (code != 0)) {
tqError( tqError("vgId:%d process consensus checkpointId req, failed to acquire task:0x%x, it may have been dropped already",
"vgId:%d process set consensus checkpointId req, failed to acquire task:0x%x, it may have been dropped already",
pMeta->vgId, req.taskId); pMeta->vgId, req.taskId);
// ignore this code to avoid error code over write // ignore this code to avoid error code over write
int32_t ret = streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); int32_t ret = streamMetaAddFailedTask(pMeta, req.streamId, req.taskId);
@ -1248,7 +1253,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) {
tqError("s-task:0x%x failed add check downstream failed, core:%s", req.taskId, tstrerror(ret)); tqError("s-task:0x%x failed add check downstream failed, core:%s", req.taskId, tstrerror(ret));
} }
return code; return 0;
} }
// discard the rsp, since it is expired. // discard the rsp, since it is expired.
@ -1272,7 +1277,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) {
streamMutexUnlock(&pTask->lock); streamMutexUnlock(&pTask->lock);
streamMetaReleaseTask(pMeta, pTask); streamMetaReleaseTask(pMeta, pTask);
return TSDB_CODE_STREAM_INTERNAL_ERROR; return 0;
} }
SConsenChkptInfo* pConsenInfo = &pTask->status.consenChkptInfo; SConsenChkptInfo* pConsenInfo = &pTask->status.consenChkptInfo;
@ -1299,10 +1304,13 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) {
if (pMeta->role == NODE_ROLE_LEADER) { if (pMeta->role == NODE_ROLE_LEADER) {
code = tqStreamStartOneTaskAsync(pMeta, pTask->pMsgCb, req.streamId, req.taskId); code = tqStreamStartOneTaskAsync(pMeta, pTask->pMsgCb, req.streamId, req.taskId);
if (code) {
tqError("s-task:0x%x vgId:%d failed start task async, code:%s", req.taskId, vgId, tstrerror(code));
}
} else { } else {
tqDebug("vgId:%d follower not start task:%s", vgId, pTask->id.idStr); tqDebug("vgId:%d follower not start task:%s", vgId, pTask->id.idStr);
} }
streamMetaReleaseTask(pMeta, pTask); streamMetaReleaseTask(pMeta, pTask);
return code; return 0;
} }

View File

@ -923,32 +923,38 @@ int32_t streamMetaBegin(SStreamMeta* pMeta) {
streamMetaWLock(pMeta); streamMetaWLock(pMeta);
int32_t code = tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL, int32_t code = tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL,
TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED); TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED);
if (code) {
streamSetFatalError(pMeta, code, __func__, __LINE__);
}
streamMetaWUnLock(pMeta); streamMetaWUnLock(pMeta);
return code; return code;
} }
int32_t streamMetaCommit(SStreamMeta* pMeta) { int32_t streamMetaCommit(SStreamMeta* pMeta) {
int32_t code = 0; int32_t code = tdbCommit(pMeta->db, pMeta->txn);
code = tdbCommit(pMeta->db, pMeta->txn);
if (code != 0) { if (code != 0) {
stError("vgId:%d failed to commit stream meta", pMeta->vgId); streamSetFatalError(pMeta, code, __func__, __LINE__);
return code; stFatal("vgId:%d failed to commit stream meta, code:%s, line:%d", pMeta->vgId, tstrerror(code),
pMeta->fatalInfo.line);
} }
code = tdbPostCommit(pMeta->db, pMeta->txn); code = tdbPostCommit(pMeta->db, pMeta->txn);
if (code != 0) { if (code != 0) {
stError("vgId:%d failed to do post-commit stream meta", pMeta->vgId); streamSetFatalError(pMeta, code, __func__, __LINE__);
stFatal("vgId:%d failed to do post-commit stream meta, code:%s, line:%d", pMeta->vgId, tstrerror(code),
pMeta->fatalInfo.line);
return code; return code;
} }
code = tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL, code = tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL,
TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED); TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED);
if (code != 0) { if (code != 0) {
stError("vgId:%d failed to begin trans", pMeta->vgId); streamSetFatalError(pMeta, code, __func__, __LINE__);
return code; stFatal("vgId:%d failed to begin trans, code:%s, line:%d", pMeta->vgId, tstrerror(code), pMeta->fatalInfo.line);
} else {
stDebug("vgId:%d stream meta file commit completed", pMeta->vgId);
} }
stDebug("vgId:%d stream meta file commit completed", pMeta->vgId);
return code; return code;
} }

View File

@ -77,9 +77,10 @@ void streamSetFatalError(SStreamMeta* pMeta, int32_t code, const char* funcName,
pMeta->fatalInfo.threadId = taosGetSelfPthreadId(); pMeta->fatalInfo.threadId = taosGetSelfPthreadId();
tstrncpy(pMeta->fatalInfo.func, funcName, tListLen(pMeta->fatalInfo.func)); tstrncpy(pMeta->fatalInfo.func, funcName, tListLen(pMeta->fatalInfo.func));
pMeta->fatalInfo.line = lino; pMeta->fatalInfo.line = lino;
stInfo("vgId:%d set global fatal error, code:%s %s line:%d", pMeta->vgId, tstrerror(code), funcName, lino); stInfo("vgId:%d set fatal error, code:%s %s line:%d", pMeta->vgId, tstrerror(code), funcName, lino);
} else { } else {
stFatal("vgId:%d existed global fatal eror:%s, failed to set new fatal error code:%s", pMeta->vgId, code); stFatal("vgId:%d existed fatal error:%s, ts:%" PRId64 " failed to set new fatal error code:%s", pMeta->vgId,
pMeta->fatalInfo.ts, tstrerror(code));
} }
} }