fix(stream): drop the related fill-history task & set the task to be ready, if the drop fill-history task msg is missing due to the vnode restarting.
This commit is contained in:
parent
de47ea6c81
commit
0a90c33b4d
|
@ -3271,7 +3271,7 @@ typedef struct {
|
|||
SMsgHead head;
|
||||
int64_t streamId;
|
||||
int32_t taskId;
|
||||
} SVPauseStreamTaskReq, SVResetStreamTaskReq;
|
||||
} SVPauseStreamTaskReq, SVResetStreamTaskReq, SVDropHTaskReq;
|
||||
|
||||
typedef struct {
|
||||
int8_t reserved;
|
||||
|
|
|
@ -271,6 +271,7 @@ enum { // WARN: new msg should be appended to segment tail
|
|||
TD_DEF_MSG_TYPE(TDMT_STREAM_TASK_PAUSE, "stream-task-pause", NULL, NULL)
|
||||
TD_DEF_MSG_TYPE(TDMT_STREAM_TASK_RESUME, "stream-task-resume", NULL, NULL)
|
||||
TD_DEF_MSG_TYPE(TDMT_STREAM_TASK_STOP, "stream-task-stop", NULL, NULL)
|
||||
TD_DEF_MSG_TYPE(TDMT_STREAM_HTASK_DROP, "stream-htask-drop", NULL, NULL)
|
||||
TD_DEF_MSG_TYPE(TDMT_STREAM_MAX_MSG, "stream-max", NULL, NULL)
|
||||
|
||||
TD_NEW_MSG_SEG(TDMT_MON_MSG)
|
||||
|
|
|
@ -655,17 +655,19 @@ int32_t tDecodeStreamCheckpointReadyMsg(SDecoder* pDecoder, SStreamCheckpointRea
|
|||
typedef struct STaskStatusEntry {
|
||||
STaskId id;
|
||||
int32_t status;
|
||||
int64_t stage;
|
||||
int32_t statusLastDuration; // to record the last duration of current status
|
||||
int32_t stage;
|
||||
int32_t nodeId;
|
||||
int64_t verStart; // start version in WAL, only valid for source task
|
||||
int64_t verEnd; // end version in WAL, only valid for source task
|
||||
int64_t processedVer; // only valid for source task
|
||||
int32_t relatedHTask; // has related fill-history task
|
||||
int64_t activeCheckpointId; // current active checkpoint id
|
||||
bool checkpointFailed; // denote if the checkpoint is failed or not
|
||||
double inputQUsed; // in MiB
|
||||
double inputRate;
|
||||
double sinkQuota; // existed quota size for sink task
|
||||
double sinkDataSize; // sink to dest data size
|
||||
double sinkDataSize; // sink to dst data size
|
||||
} STaskStatusEntry;
|
||||
|
||||
typedef struct SStreamHbMsg {
|
||||
|
|
|
@ -84,6 +84,7 @@ SArray *smGetMsgHandles() {
|
|||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_PAUSE, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_RESUME, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_STOP, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_HTASK_DROP, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_CHECK, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_CHECK_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_SCAN_HISTORY_FINISH, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER;
|
||||
|
|
|
@ -833,6 +833,7 @@ SArray *vmGetMsgHandles() {
|
|||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_PAUSE, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_RESUME, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_STOP, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_HTASK_DROP, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_CHECK_POINT_SOURCE, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_CHECKPOINT_READY, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER;
|
||||
if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_UPDATE, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
|
||||
|
|
|
@ -78,6 +78,8 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg);
|
|||
static SArray *extractNodeListFromStream(SMnode *pMnode);
|
||||
static SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady);
|
||||
|
||||
static SStreamObj *mndGetStreamObj(SMnode *pMnode, int64_t streamId);
|
||||
|
||||
static SVgroupChangeInfo mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pPrevNodeList, const SArray *pNodeList);
|
||||
|
||||
static STrans *doCreateTrans(SMnode *pMnode, SStreamObj *pStream, SRpcMsg *pReq, const char *name, const char* pMsg);
|
||||
|
@ -1570,42 +1572,7 @@ static void mndCancelGetNextStream(SMnode *pMnode, void *pIter) {
|
|||
sdbCancelFetch(pSdb, pIter);
|
||||
}
|
||||
|
||||
static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBlock, int32_t rowsCapacity) {
|
||||
SMnode * pMnode = pReq->info.node;
|
||||
SSdb * pSdb = pMnode->pSdb;
|
||||
int32_t numOfRows = 0;
|
||||
SStreamObj *pStream = NULL;
|
||||
|
||||
while (numOfRows < rowsCapacity) {
|
||||
pShow->pIter = sdbFetch(pSdb, SDB_STREAM, pShow->pIter, (void **)&pStream);
|
||||
if (pShow->pIter == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
// lock
|
||||
taosRLockLatch(&pStream->lock);
|
||||
|
||||
// count task num
|
||||
int32_t sz = taosArrayGetSize(pStream->tasks);
|
||||
|
||||
int32_t count = 0;
|
||||
for (int32_t i = 0; i < sz; i++) {
|
||||
SArray *pLevel = taosArrayGetP(pStream->tasks, i);
|
||||
count += taosArrayGetSize(pLevel);
|
||||
}
|
||||
|
||||
if (numOfRows + count > rowsCapacity) {
|
||||
blockDataEnsureCapacity(pBlock, numOfRows + count);
|
||||
}
|
||||
|
||||
// add row for each task
|
||||
for (int32_t i = 0; i < sz; i++) {
|
||||
SArray *pLevel = taosArrayGetP(pStream->tasks, i);
|
||||
int32_t levelCnt = taosArrayGetSize(pLevel);
|
||||
|
||||
for (int32_t j = 0; j < levelCnt; j++) {
|
||||
SStreamTask *pTask = taosArrayGetP(pLevel, j);
|
||||
|
||||
static void setTaskAttrInResBlock(SStreamObj* pStream, SStreamTask* pTask, SSDataBlock* pBlock, int32_t numOfRows) {
|
||||
SColumnInfoData *pColInfo;
|
||||
int32_t cols = 0;
|
||||
|
||||
|
@ -1660,11 +1627,11 @@ static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock
|
|||
|
||||
// status
|
||||
char status[20 + VARSTR_HEADER_SIZE] = {0};
|
||||
|
||||
STaskId id = {.streamId = pTask->id.streamId, .taskId = pTask->id.taskId};
|
||||
|
||||
STaskStatusEntry *pe = taosHashGet(execInfo.pTaskMap, &id, sizeof(id));
|
||||
if (pe == NULL) {
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
const char *pStatus = streamTaskGetStatusStr(pe->status);
|
||||
|
@ -1708,14 +1675,54 @@ static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock
|
|||
|
||||
pColInfo = taosArrayGet(pBlock->pDataBlock, cols++);
|
||||
colDataSetVal(pColInfo, numOfRows, (const char *)vbuf, false);
|
||||
}
|
||||
|
||||
static int32_t getNumOfTasks(SArray* pTaskList) {
|
||||
int32_t numOfLevels = taosArrayGetSize(pTaskList);
|
||||
|
||||
int32_t count = 0;
|
||||
for (int32_t i = 0; i < numOfLevels; i++) {
|
||||
SArray *pLevel = taosArrayGetP(pTaskList, i);
|
||||
count += taosArrayGetSize(pLevel);
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock *pBlock, int32_t rowsCapacity) {
|
||||
SMnode * pMnode = pReq->info.node;
|
||||
SSdb * pSdb = pMnode->pSdb;
|
||||
int32_t numOfRows = 0;
|
||||
SStreamObj *pStream = NULL;
|
||||
|
||||
while (numOfRows < rowsCapacity) {
|
||||
pShow->pIter = sdbFetch(pSdb, SDB_STREAM, pShow->pIter, (void **)&pStream);
|
||||
if (pShow->pIter == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
// lock
|
||||
taosRLockLatch(&pStream->lock);
|
||||
|
||||
int32_t count = getNumOfTasks(pStream->tasks);
|
||||
if (numOfRows + count > rowsCapacity) {
|
||||
blockDataEnsureCapacity(pBlock, numOfRows + count);
|
||||
}
|
||||
|
||||
// add row for each task
|
||||
for (int32_t i = 0; i < taosArrayGetSize(pStream->tasks); i++) {
|
||||
SArray *pLevel = taosArrayGetP(pStream->tasks, i);
|
||||
|
||||
int32_t numOfLevels = taosArrayGetSize(pLevel);
|
||||
for (int32_t j = 0; j < numOfLevels; j++) {
|
||||
SStreamTask *pTask = taosArrayGetP(pLevel, j);
|
||||
setTaskAttrInResBlock(pStream, pTask, pBlock, numOfRows);
|
||||
numOfRows++;
|
||||
}
|
||||
}
|
||||
|
||||
// unlock
|
||||
taosRUnLockLatch(&pStream->lock);
|
||||
|
||||
sdbRelease(pSdb, pStream);
|
||||
}
|
||||
|
||||
|
@ -2729,7 +2736,7 @@ int32_t killActiveCheckpointTrans(SMnode *pMnode, const char* pDBName, size_t le
|
|||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t mndResetStatusFromCheckpoint(SMnode *pMnode, int32_t transId) {
|
||||
static int32_t mndResetStatusFromCheckpoint(SMnode *pMnode) {
|
||||
STrans* pTrans = mndAcquireTrans(pMnode, transId);
|
||||
if (pTrans != NULL) {
|
||||
mInfo("kill checkpoint transId:%d to reset task status", transId);
|
||||
|
@ -2765,6 +2772,88 @@ int32_t mndResetStatusFromCheckpoint(SMnode *pMnode, int32_t transId) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static SStreamTask* mndGetStreamTask(STaskId* pId, SStreamObj* pStream) {
|
||||
for (int32_t i = 0; i < taosArrayGetSize(pStream->tasks); i++) {
|
||||
SArray *pLevel = taosArrayGetP(pStream->tasks, i);
|
||||
|
||||
int32_t numOfLevels = taosArrayGetSize(pLevel);
|
||||
for (int32_t j = 0; j < numOfLevels; j++) {
|
||||
SStreamTask *pTask = taosArrayGetP(pLevel, j);
|
||||
if (pTask->id.taskId == pId->taskId) {
|
||||
return pTask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool needDropRelatedFillhistoryTask(STaskStatusEntry *pTaskEntry, SStreamExecInfo *pExecNode) {
|
||||
if (pTaskEntry->status == TASK_STATUS__STREAM_SCAN_HISTORY && pTaskEntry->statusLastDuration >= 10) {
|
||||
if (fabs(pTaskEntry->inputQUsed) <= DBL_EPSILON) {
|
||||
int32_t numOfReady = 0;
|
||||
int32_t numOfTotal = 0;
|
||||
for (int32_t k = 0; k < taosArrayGetSize(pExecNode->pTaskList); ++k) {
|
||||
STaskId *pId = taosArrayGet(pExecNode->pTaskList, k);
|
||||
if (pTaskEntry->id.streamId == pId->streamId) {
|
||||
numOfTotal++;
|
||||
|
||||
if (pTaskEntry->id.taskId != pId->taskId && pTaskEntry->status == TASK_STATUS__READY) {
|
||||
numOfReady++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numOfReady > 0) {
|
||||
mDebug("stream:0x%" PRIx64
|
||||
" %d tasks are ready, %d tasks in stream-scan-history for more than 50s, drop related fill-history task",
|
||||
pTaskEntry->id.streamId, numOfReady, numOfTotal - numOfReady);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// currently only handle the sink task
|
||||
// 1. sink task, drop related fill-history task msg is missing
|
||||
// 2. other tasks are in ready state for at least 3 * hb_interval
|
||||
static int32_t mndDropRelatedFillhistoryTask(SMnode *pMnode, STaskStatusEntry *pTaskEntry, SStreamObj *pStream) {
|
||||
SStreamTask *pTask = mndGetStreamTask(&pTaskEntry->id, pStream);
|
||||
if (pTask == NULL) {
|
||||
mError("failed to get the stream task:0x%x, may have been dropped", (int32_t) pTaskEntry->id.taskId);
|
||||
return -1;
|
||||
}
|
||||
|
||||
SVDropHTaskReq *pReq = rpcMallocCont(sizeof(SVDropHTaskReq));
|
||||
if (pReq == NULL) {
|
||||
mError("failed to malloc in drop related fill-history task, size:%" PRIzu ", code:%s", sizeof(SVDropHTaskReq),
|
||||
tstrerror(TSDB_CODE_OUT_OF_MEMORY));
|
||||
terrno = TSDB_CODE_OUT_OF_MEMORY;
|
||||
return -1;
|
||||
}
|
||||
|
||||
pReq->head.vgId = htonl(pTask->info.nodeId);
|
||||
pReq->taskId = pTask->id.taskId;
|
||||
pReq->streamId = pTask->id.streamId;
|
||||
|
||||
SRpcMsg msg = {.info.noResp = 1};
|
||||
|
||||
initRpcMsg(&msg, TDMT_STREAM_HTASK_DROP, pReq, sizeof(SVDropHTaskReq));
|
||||
|
||||
mDebug("build and send drop related fill-history task for task:0x%x", pTask->id.taskId);
|
||||
|
||||
SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId);
|
||||
SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj);
|
||||
mndReleaseVgroup(pMnode, pVgObj);
|
||||
|
||||
tmsgSendReq(&epset, &msg);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t setNodeEpsetExpiredFlag(const SArray *pNodeList) {
|
||||
int32_t num = taosArrayGetSize(pNodeList);
|
||||
mInfo("set node expired for %d nodes", num);
|
||||
|
@ -2865,9 +2954,28 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) {
|
|||
}
|
||||
}
|
||||
|
||||
if (p->status == pTaskEntry->status) {
|
||||
pTaskEntry->statusLastDuration++;
|
||||
} else {
|
||||
pTaskEntry->status = p->status;
|
||||
pTaskEntry->statusLastDuration = 0;
|
||||
}
|
||||
|
||||
if (p->status != TASK_STATUS__READY) {
|
||||
mDebug("received s-task:0x%" PRIx64 " not in ready status:%s", p->id.taskId, streamTaskGetStatusStr(p->status));
|
||||
|
||||
if (p->status == TASK_STATUS__STREAM_SCAN_HISTORY) {
|
||||
bool drop = needDropRelatedFillhistoryTask(pTaskEntry, &execInfo);
|
||||
if(drop) {
|
||||
SStreamObj *pStreamObj = mndGetStreamObj(pMnode, pTaskEntry->id.streamId);
|
||||
if (pStreamObj == NULL) {
|
||||
mError("failed to acquire the streamObj:0x%" PRIx64 " it may have been dropped", pStreamObj->uid);
|
||||
} else {
|
||||
mndDropRelatedFillhistoryTask(pMnode, pTaskEntry, pStreamObj);
|
||||
mndReleaseStream(pMnode, pStreamObj);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2894,3 +3002,18 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) {
|
|||
taosArrayDestroy(req.pUpdateNodes);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
SStreamObj *mndGetStreamObj(SMnode *pMnode, int64_t streamId) {
|
||||
void *pIter = NULL;
|
||||
SSdb *pSdb = pMnode->pSdb;
|
||||
SStreamObj *pStream = NULL;
|
||||
|
||||
while ((pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream)) != NULL) {
|
||||
if (pStream->uid == streamId) {
|
||||
sdbCancelFetch(pSdb, pIter);
|
||||
return pStream;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
|
@ -232,6 +232,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp)
|
|||
int32_t tqProcessTaskCheckpointReadyMsg(STQ* pTq, SRpcMsg* pMsg);
|
||||
int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg);
|
||||
int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg);
|
||||
int32_t tqProcessTaskDropHTask(STQ* pTq, SRpcMsg* pMsg);
|
||||
|
||||
int32_t tqStartStreamTaskAsync(STQ* pTq, bool restart);
|
||||
int32_t tqRestartStreamTasks(STQ* pTq);
|
||||
|
|
|
@ -2014,3 +2014,34 @@ int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg) {
|
|||
streamMetaReleaseTask(pMeta, pTask);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t tqProcessTaskDropHTask(STQ* pTq, SRpcMsg* pMsg) {
|
||||
SVDropHTaskReq* pReq = (SVDropHTaskReq*) pMsg->pCont;
|
||||
|
||||
SStreamMeta* pMeta = pTq->pStreamMeta;
|
||||
SStreamTask* pTask = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId);
|
||||
if (pTask == NULL) {
|
||||
tqError("vgId:%d process drop fill-history task req, failed to acquire task:0x%x, it may have been dropped already",
|
||||
pMeta->vgId, pReq->taskId);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
tqDebug("s-task:%s receive drop fill-history msg from mnode", pTask->id.idStr);
|
||||
if (pTask->hTaskInfo.id.taskId == 0) {
|
||||
tqError("vgId:%d s-task:%s not have related fill-history task", pMeta->vgId, pTask->id.idStr);
|
||||
streamMetaReleaseTask(pMeta, pTask);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
ETaskStatus status = streamTaskGetStatus(pTask, NULL);
|
||||
ASSERT(status == TASK_STATUS__STREAM_SCAN_HISTORY);
|
||||
|
||||
streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE);
|
||||
|
||||
SStreamTaskId id = {.streamId = pTask->hTaskInfo.id.streamId, .taskId = pTask->hTaskInfo.id.taskId};
|
||||
streamBuildAndSendDropTaskMsg(pTask->pMsgCb, pMeta->vgId, &id);
|
||||
|
||||
streamMetaReleaseTask(pMeta, pTask);
|
||||
return TSDB_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
|
@ -595,6 +595,11 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t ver, SRpcMsg
|
|||
tqProcessTaskResetReq(pVnode->pTq, pMsg);
|
||||
}
|
||||
} break;
|
||||
case TDMT_STREAM_HTASK_DROP: {
|
||||
if (pVnode->restored && vnodeIsLeader(pVnode)) {
|
||||
tqProcessTaskDropHTask(pVnode->pTq, pMsg);
|
||||
}
|
||||
} break;
|
||||
case TDMT_VND_ALTER_CONFIRM:
|
||||
needCommit = pVnode->config.hashChange;
|
||||
if (vnodeProcessAlterConfirmReq(pVnode, ver, pReq, len, pRsp) < 0) {
|
||||
|
|
|
@ -401,7 +401,7 @@ int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) {
|
|||
// 5. save to disk
|
||||
pStreamTask->status.taskStatus = streamTaskGetStatus(pStreamTask, NULL);
|
||||
|
||||
// 6. pause allowed.
|
||||
// 6. add empty delete block
|
||||
if ((pStreamTask->info.taskLevel == TASK_LEVEL__SOURCE) && taosQueueEmpty(pStreamTask->inputq.queue->pQueue)) {
|
||||
SStreamRefDataBlock* pItem = taosAllocateQitem(sizeof(SStreamRefDataBlock), DEF_QITEM, 0);
|
||||
|
||||
|
|
Loading…
Reference in New Issue