Merge pull request #21957 from taosdata/FIX/TD-24149-main
enh: terminate the process on I/O errors while appending WAL logs or flushing data
This commit is contained in:
commit
53263bf962
|
@ -360,7 +360,12 @@ static int32_t vnodeCommitTask(void *arg) {
|
||||||
|
|
||||||
// commit
|
// commit
|
||||||
code = vnodeCommitImpl(pInfo);
|
code = vnodeCommitImpl(pInfo);
|
||||||
if (code) goto _exit;
|
if (code) {
|
||||||
|
vFatal("vgId:%d, failed to commit vnode since %s", TD_VID(pVnode), terrstr());
|
||||||
|
taosMsleep(100);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
goto _exit;
|
||||||
|
}
|
||||||
|
|
||||||
vnodeReturnBufPool(pVnode);
|
vnodeReturnBufPool(pVnode);
|
||||||
|
|
||||||
|
|
|
@ -602,18 +602,18 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
|
||||||
// ftruncate idx file
|
// ftruncate idx file
|
||||||
if (offset < fileSize) {
|
if (offset < fileSize) {
|
||||||
if (taosFtruncateFile(pIdxFile, offset) < 0) {
|
if (taosFtruncateFile(pIdxFile, offset) < 0) {
|
||||||
wError("vgId:%d, failed to ftruncate file due to %s. offset:%" PRId64 ", file:%s", pWal->cfg.vgId,
|
|
||||||
strerror(errno), offset, fnameStr);
|
|
||||||
terrno = TAOS_SYSTEM_ERROR(errno);
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
|
wError("vgId:%d, failed to ftruncate file since %s. offset:%" PRId64 ", file:%s", pWal->cfg.vgId, terrstr(),
|
||||||
|
offset, fnameStr);
|
||||||
goto _err;
|
goto _err;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// rebuild idx file
|
// rebuild idx file
|
||||||
if (taosLSeekFile(pIdxFile, 0, SEEK_END) < 0) {
|
if (taosLSeekFile(pIdxFile, 0, SEEK_END) < 0) {
|
||||||
wError("vgId:%d, failed to seek file due to %s. offset:%" PRId64 ", file:%s", pWal->cfg.vgId, strerror(errno),
|
|
||||||
offset, fnameStr);
|
|
||||||
terrno = TAOS_SYSTEM_ERROR(errno);
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
|
wError("vgId:%d, failed to seek file since %s. offset:%" PRId64 ", file:%s", pWal->cfg.vgId, terrstr(), offset,
|
||||||
|
fnameStr);
|
||||||
goto _err;
|
goto _err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -625,11 +625,12 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
|
||||||
idxEntry.offset += sizeof(SWalCkHead) + ckHead.head.bodyLen;
|
idxEntry.offset += sizeof(SWalCkHead) + ckHead.head.bodyLen;
|
||||||
|
|
||||||
if (walReadLogHead(pLogFile, idxEntry.offset, &ckHead) < 0) {
|
if (walReadLogHead(pLogFile, idxEntry.offset, &ckHead) < 0) {
|
||||||
wError("vgId:%d, failed to read wal log head since %s. offset:%" PRId64 ", file:%s", pWal->cfg.vgId, terrstr(),
|
wError("vgId:%d, failed to read wal log head since %s. index:%" PRId64 ", offset:%" PRId64 ", file:%s",
|
||||||
idxEntry.offset, fLogNameStr);
|
pWal->cfg.vgId, terrstr(), idxEntry.ver, idxEntry.offset, fLogNameStr);
|
||||||
goto _err;
|
goto _err;
|
||||||
}
|
}
|
||||||
if (taosWriteFile(pIdxFile, &idxEntry, sizeof(SWalIdxEntry)) < 0) {
|
if (taosWriteFile(pIdxFile, &idxEntry, sizeof(SWalIdxEntry)) < 0) {
|
||||||
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
wError("vgId:%d, failed to append file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
|
wError("vgId:%d, failed to append file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
|
||||||
goto _err;
|
goto _err;
|
||||||
}
|
}
|
||||||
|
@ -637,6 +638,7 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (taosFsyncFile(pIdxFile) < 0) {
|
if (taosFsyncFile(pIdxFile) < 0) {
|
||||||
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
wError("vgId:%d, faild to fsync file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
|
wError("vgId:%d, faild to fsync file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
|
||||||
goto _err;
|
goto _err;
|
||||||
}
|
}
|
||||||
|
|
|
@ -473,7 +473,10 @@ static int32_t walWriteIndex(SWal *pWal, int64_t ver, int64_t offset) {
|
||||||
// check alignment of idx entries
|
// check alignment of idx entries
|
||||||
int64_t endOffset = taosLSeekFile(pWal->pIdxFile, 0, SEEK_END);
|
int64_t endOffset = taosLSeekFile(pWal->pIdxFile, 0, SEEK_END);
|
||||||
if (endOffset < 0) {
|
if (endOffset < 0) {
|
||||||
wFatal("vgId:%d, failed to seek end of idxfile due to %s. ver:%" PRId64 "", pWal->cfg.vgId, strerror(errno), ver);
|
wFatal("vgId:%d, failed to seek end of WAL idxfile due to %s. ver:%" PRId64 "", pWal->cfg.vgId, strerror(errno),
|
||||||
|
ver);
|
||||||
|
taosMsleep(100);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -533,16 +536,20 @@ static FORCE_INLINE int32_t walWriteImpl(SWal *pWal, int64_t index, tmsg_t msgTy
|
||||||
END:
|
END:
|
||||||
// recover in a reverse order
|
// recover in a reverse order
|
||||||
if (taosFtruncateFile(pWal->pLogFile, offset) < 0) {
|
if (taosFtruncateFile(pWal->pLogFile, offset) < 0) {
|
||||||
wFatal("vgId:%d, failed to ftruncate logfile to offset:%" PRId64 " during recovery due to %s", pWal->cfg.vgId,
|
|
||||||
offset, strerror(errno));
|
|
||||||
terrno = TAOS_SYSTEM_ERROR(errno);
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
|
wFatal("vgId:%d, failed to recover WAL logfile from write error since %s, offset:%" PRId64, pWal->cfg.vgId,
|
||||||
|
terrstr(), offset);
|
||||||
|
taosMsleep(100);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t idxOffset = (index - pFileInfo->firstVer) * sizeof(SWalIdxEntry);
|
int64_t idxOffset = (index - pFileInfo->firstVer) * sizeof(SWalIdxEntry);
|
||||||
if (taosFtruncateFile(pWal->pIdxFile, idxOffset) < 0) {
|
if (taosFtruncateFile(pWal->pIdxFile, idxOffset) < 0) {
|
||||||
wFatal("vgId:%d, failed to ftruncate idxfile to offset:%" PRId64 "during recovery due to %s", pWal->cfg.vgId,
|
|
||||||
idxOffset, strerror(errno));
|
|
||||||
terrno = TAOS_SYSTEM_ERROR(errno);
|
terrno = TAOS_SYSTEM_ERROR(errno);
|
||||||
|
wFatal("vgId:%d, failed to recover WAL idxfile from write error since %s, offset:%" PRId64, pWal->cfg.vgId,
|
||||||
|
terrstr(), idxOffset);
|
||||||
|
taosMsleep(100);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -486,24 +486,11 @@ static inline int32_t taosBuildLogHead(char *buffer, const char *flags) {
|
||||||
static inline void taosPrintLogImp(ELogLevel level, int32_t dflag, const char *buffer, int32_t len) {
|
static inline void taosPrintLogImp(ELogLevel level, int32_t dflag, const char *buffer, int32_t len) {
|
||||||
if ((dflag & DEBUG_FILE) && tsLogObj.logHandle && tsLogObj.logHandle->pFile != NULL && osLogSpaceAvailable()) {
|
if ((dflag & DEBUG_FILE) && tsLogObj.logHandle && tsLogObj.logHandle->pFile != NULL && osLogSpaceAvailable()) {
|
||||||
taosUpdateLogNums(level);
|
taosUpdateLogNums(level);
|
||||||
#if 0
|
|
||||||
// DEBUG_FATAL and DEBUG_ERROR are duplicated
|
|
||||||
// fsync will cause thread blocking and may also generate log misalignment in case of asyncLog
|
|
||||||
if (tsAsyncLog && level != DEBUG_FATAL) {
|
|
||||||
taosPushLogBuffer(tsLogObj.logHandle, buffer, len);
|
|
||||||
} else {
|
|
||||||
taosWriteFile(tsLogObj.logHandle->pFile, buffer, len);
|
|
||||||
if (level == DEBUG_FATAL) {
|
|
||||||
taosFsyncFile(tsLogObj.logHandle->pFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (tsAsyncLog) {
|
if (tsAsyncLog) {
|
||||||
taosPushLogBuffer(tsLogObj.logHandle, buffer, len);
|
taosPushLogBuffer(tsLogObj.logHandle, buffer, len);
|
||||||
} else {
|
} else {
|
||||||
taosWriteFile(tsLogObj.logHandle->pFile, buffer, len);
|
taosWriteFile(tsLogObj.logHandle->pFile, buffer, len);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
if (tsLogObj.maxLines > 0) {
|
if (tsLogObj.maxLines > 0) {
|
||||||
atomic_add_fetch_32(&tsLogObj.lines, 1);
|
atomic_add_fetch_32(&tsLogObj.lines, 1);
|
||||||
|
|
Loading…
Reference in New Issue