diff --git a/source/dnode/vnode/tsdb2/CMakeLists.txt b/source/dnode/vnode/tsdb2/CMakeLists.txt new file mode 100644 index 0000000000..875bb62581 --- /dev/null +++ b/source/dnode/vnode/tsdb2/CMakeLists.txt @@ -0,0 +1,18 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.0...3.20) +PROJECT(TDengine) + +INCLUDE_DIRECTORIES(inc) +INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/query/inc) +INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/deps/cJson/inc) +AUX_SOURCE_DIRECTORY(src SRC) +ADD_LIBRARY(tsdb ${SRC}) +TARGET_LINK_LIBRARIES(tsdb tfs common tutil cJson) + +IF (TD_TSDB_PLUGINS) + TARGET_LINK_LIBRARIES(tsdb tsdbPlugins) +ENDIF () + +IF (TD_LINUX) + # Someone has no gtest directory, so comment it + # ADD_SUBDIRECTORY(tests) +ENDIF () diff --git a/source/dnode/vnode/tsdb2/inc/tsdbBuffer.h b/source/dnode/vnode/tsdb2/inc/tsdbBuffer.h new file mode 100644 index 0000000000..4b650d3993 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbBuffer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_BUFFER_H_ +#define _TD_TSDB_BUFFER_H_ + +typedef struct { + int64_t blockId; + int offset; + int remain; + char data[]; +} STsdbBufBlock; + +typedef struct { + pthread_cond_t poolNotEmpty; + int bufBlockSize; + int tBufBlocks; + int nBufBlocks; + int nRecycleBlocks; + int nElasticBlocks; + int64_t index; + SList* bufBlockList; +} STsdbBufPool; + +#define TSDB_BUFFER_RESERVE 1024 // Reseve 1K as commit threshold + +STsdbBufPool* tsdbNewBufPool(); +void tsdbFreeBufPool(STsdbBufPool* pBufPool); +int tsdbOpenBufPool(STsdbRepo* pRepo); +void tsdbCloseBufPool(STsdbRepo* pRepo); +SListNode* tsdbAllocBufBlockFromPool(STsdbRepo* pRepo); +int tsdbExpandPool(STsdbRepo* pRepo, int32_t oldTotalBlocks); +void tsdbRecycleBufferBlock(STsdbBufPool* pPool, SListNode *pNode, bool bELastic); + +// health cite +STsdbBufBlock *tsdbNewBufBlock(int bufBlockSize); +void tsdbFreeBufBlock(STsdbBufBlock *pBufBlock); + +#endif /* _TD_TSDB_BUFFER_H_ */ diff --git a/source/dnode/vnode/tsdb2/inc/tsdbCommit.h b/source/dnode/vnode/tsdb2/inc/tsdbCommit.h new file mode 100644 index 0000000000..9cb8417c45 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbCommit.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_COMMIT_H_ +#define _TD_TSDB_COMMIT_H_ + +typedef struct { + int minFid; + int midFid; + int maxFid; + TSKEY minKey; +} SRtn; + +typedef struct { + uint64_t uid; + int64_t offset; + int64_t size; +} SKVRecord; + +#define TSDB_DEFAULT_BLOCK_ROWS(maxRows) ((maxRows)*4 / 5) + +void tsdbGetRtnSnap(STsdbRepo *pRepo, SRtn *pRtn); +int tsdbEncodeKVRecord(void **buf, SKVRecord *pRecord); +void *tsdbDecodeKVRecord(void *buf, SKVRecord *pRecord); +void *tsdbCommitData(STsdbRepo *pRepo); +int tsdbApplyRtnOnFSet(STsdbRepo *pRepo, SDFileSet *pSet, SRtn *pRtn); +int tsdbWriteBlockInfoImpl(SDFile *pHeadf, STable *pTable, SArray *pSupA, SArray *pSubA, void **ppBuf, SBlockIdx *pIdx); +int tsdbWriteBlockIdx(SDFile *pHeadf, SArray *pIdxA, void **ppBuf); +int tsdbWriteBlockImpl(STsdbRepo *pRepo, STable *pTable, SDFile *pDFile, SDFile *pDFileAggr, SDataCols *pDataCols, + SBlock *pBlock, bool isLast, bool isSuper, void **ppBuf, void **ppCBuf, void **ppExBuf); +int tsdbApplyRtn(STsdbRepo *pRepo); + +static FORCE_INLINE int tsdbGetFidLevel(int fid, SRtn *pRtn) { + if (fid >= pRtn->maxFid) { + return 0; + } else if (fid >= pRtn->midFid) { + return 1; + } else if (fid >= pRtn->minFid) { + return 2; + } else { + return -1; + } +} + +#endif /* _TD_TSDB_COMMIT_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbCommitQueue.h b/source/dnode/vnode/tsdb2/inc/tsdbCommitQueue.h new file mode 100644 index 0000000000..b690e3bdc2 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbCommitQueue.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_COMMIT_QUEUE_H_ +#define _TD_TSDB_COMMIT_QUEUE_H_ + +typedef enum { COMMIT_REQ, COMPACT_REQ,COMMIT_CONFIG_REQ } TSDB_REQ_T; + +int tsdbScheduleCommit(STsdbRepo *pRepo, TSDB_REQ_T req); + +#endif /* _TD_TSDB_COMMIT_QUEUE_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbCompact.h b/source/dnode/vnode/tsdb2/inc/tsdbCompact.h new file mode 100644 index 0000000000..5a382de5e0 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbCompact.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#ifndef _TD_TSDB_COMPACT_H_ +#define _TD_TSDB_COMPACT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +void *tsdbCompactImpl(STsdbRepo *pRepo); + +#ifdef __cplusplus +} +#endif + +#endif /* _TD_TSDB_COMPACT_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbFS.h b/source/dnode/vnode/tsdb2/inc/tsdbFS.h new file mode 100644 index 0000000000..f3a5e29c0b --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbFS.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_FS_H_ +#define _TD_TSDB_FS_H_ + +/** + * 1. The fileset .head/.data/.last use the same fver 0 before 2021.10.10. + * 2. .head fver is 1 when extract aggregate block data from .data/.last file and save to separate .smad/.smal file + * since 2021.10.10 + * // TODO update date and add release version. + */ +typedef enum { + TSDB_FS_VER_0 = 0, + TSDB_FS_VER_1, +} ETsdbFsVer; + +#define TSDB_FVER_TYPE uint32_t +#define TSDB_LATEST_FVER TSDB_FS_VER_1 // latest version for DFile +#define TSDB_LATEST_SFS_VER TSDB_FS_VER_1 // latest version for 'current' file + +static FORCE_INLINE uint32_t tsdbGetDFSVersion(TSDB_FILE_T fType) { // latest version for DFile + switch (fType) { + case TSDB_FILE_HEAD: + return TSDB_FS_VER_1; + default: + return TSDB_FS_VER_0; + } +} + +// ================== TSDB global config +extern bool tsdbForceKeepFile; + +// ================== CURRENT file header info +typedef struct { + uint32_t version; // Current file system version (relating to code) + uint32_t len; // Encode content length (including checksum) +} SFSHeader; + +// ================== TSDB File System Meta +typedef struct { + uint32_t version; // Commit version from 0 to increase + int64_t totalPoints; // total points + int64_t totalStorage; // Uncompressed total storage +} STsdbFSMeta; + +// ================== +typedef struct { + STsdbFSMeta meta; // FS meta + SMFile* pmf; // meta file pointer + SMFile mf; // meta file + SArray* df; // data file array +} SFSStatus; + +typedef struct { + pthread_rwlock_t lock; + + SFSStatus* cstatus; // current status + SHashObj* metaCache; // meta cache + SHashObj* metaCacheComp; // meta cache for compact + bool intxn; + SFSStatus* nstatus; // new status +} STsdbFS; + +#define FS_CURRENT_STATUS(pfs) ((pfs)->cstatus) +#define FS_NEW_STATUS(pfs) ((pfs)->nstatus) +#define FS_IN_TXN(pfs) (pfs)->intxn +#define FS_VERSION(pfs) ((pfs)->cstatus->meta.version) +#define FS_TXN_VERSION(pfs) ((pfs)->nstatus->meta.version) + +typedef struct { + int direction; + uint64_t version; // current FS version + STsdbFS* pfs; + int index; // used to position next fset when version the same + int fid; // used to seek when version is changed + SDFileSet* pSet; +} SFSIter; + +#define TSDB_FS_ITER_FORWARD TSDB_ORDER_ASC +#define TSDB_FS_ITER_BACKWARD TSDB_ORDER_DESC + +STsdbFS *tsdbNewFS(STsdbCfg *pCfg); +void * tsdbFreeFS(STsdbFS *pfs); +int tsdbOpenFS(STsdbRepo *pRepo); +void tsdbCloseFS(STsdbRepo *pRepo); +void tsdbStartFSTxn(STsdbRepo *pRepo, int64_t pointsAdd, int64_t storageAdd); +int tsdbEndFSTxn(STsdbRepo *pRepo); +int tsdbEndFSTxnWithError(STsdbFS *pfs); +void tsdbUpdateFSTxnMeta(STsdbFS *pfs, STsdbFSMeta *pMeta); +void tsdbUpdateMFile(STsdbFS *pfs, const SMFile *pMFile); +int tsdbUpdateDFileSet(STsdbFS *pfs, const SDFileSet *pSet); + +void tsdbFSIterInit(SFSIter *pIter, STsdbFS *pfs, int direction); +void tsdbFSIterSeek(SFSIter *pIter, int fid); +SDFileSet *tsdbFSIterNext(SFSIter *pIter); +int tsdbLoadMetaCache(STsdbRepo *pRepo, bool recoverMeta); + +static FORCE_INLINE int tsdbRLockFS(STsdbFS* pFs) { + int code = pthread_rwlock_rdlock(&(pFs->lock)); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + return 0; +} + +static FORCE_INLINE int tsdbWLockFS(STsdbFS* pFs) { + int code = pthread_rwlock_wrlock(&(pFs->lock)); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + return 0; +} + +static FORCE_INLINE int tsdbUnLockFS(STsdbFS* pFs) { + int code = pthread_rwlock_unlock(&(pFs->lock)); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + return 0; +} + +#endif /* _TD_TSDB_FS_H_ */ diff --git a/source/dnode/vnode/tsdb2/inc/tsdbFile.h b/source/dnode/vnode/tsdb2/inc/tsdbFile.h new file mode 100644 index 0000000000..6d1e0cf246 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbFile.h @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TS_TSDB_FILE_H_ +#define _TS_TSDB_FILE_H_ + +#define TSDB_FILE_HEAD_SIZE 512 +#define TSDB_FILE_DELIMITER 0xF00AFA0F +#define TSDB_FILE_INIT_MAGIC 0xFFFFFFFF +#define TSDB_IVLD_FID INT_MIN +#define TSDB_FILE_STATE_OK 0 +#define TSDB_FILE_STATE_BAD 1 + +#define TSDB_FILE_INFO(tf) (&((tf)->info)) +#define TSDB_FILE_F(tf) (&((tf)->f)) +#define TSDB_FILE_FD(tf) ((tf)->fd) +#define TSDB_FILE_FULL_NAME(tf) TFILE_NAME(TSDB_FILE_F(tf)) +#define TSDB_FILE_OPENED(tf) (TSDB_FILE_FD(tf) >= 0) +#define TSDB_FILE_CLOSED(tf) (!TSDB_FILE_OPENED(tf)) +#define TSDB_FILE_SET_CLOSED(f) (TSDB_FILE_FD(f) = -1) +#define TSDB_FILE_LEVEL(tf) TFILE_LEVEL(TSDB_FILE_F(tf)) +#define TSDB_FILE_ID(tf) TFILE_ID(TSDB_FILE_F(tf)) +#define TSDB_FILE_FSYNC(tf) taosFsync(TSDB_FILE_FD(tf)) +#define TSDB_FILE_STATE(tf) ((tf)->state) +#define TSDB_FILE_SET_STATE(tf, s) ((tf)->state = (s)) +#define TSDB_FILE_IS_OK(tf) (TSDB_FILE_STATE(tf) == TSDB_FILE_STATE_OK) +#define TSDB_FILE_IS_BAD(tf) (TSDB_FILE_STATE(tf) == TSDB_FILE_STATE_BAD) +#define ASSERT_TSDB_FSET_NFILES_VALID(s) \ + do { \ + uint8_t nDFiles = tsdbGetNFiles(s); \ + ASSERT((nDFiles >= TSDB_FILE_MIN) && (nDFiles <= TSDB_FILE_MAX)); \ + } while (0) +typedef enum { + TSDB_FILE_HEAD = 0, + TSDB_FILE_DATA, + TSDB_FILE_LAST, + TSDB_FILE_SMAD, // sma for .data + TSDB_FILE_SMAL, // sma for .last + TSDB_FILE_MAX, + TSDB_FILE_META +} TSDB_FILE_T; + +#define TSDB_FILE_MIN 3U // min valid number of files in one DFileSet(.head/.data/.last) + +// =============== SMFile +typedef struct { + int64_t size; + int64_t tombSize; + int64_t nRecords; + int64_t nDels; + uint32_t magic; +} SMFInfo; + +typedef struct { + SMFInfo info; + TFILE f; + int fd; + uint8_t state; +} SMFile; + +void tsdbInitMFile(SMFile* pMFile, SDiskID did, int vid, uint32_t ver); +void tsdbInitMFileEx(SMFile* pMFile, const SMFile* pOMFile); +int tsdbEncodeSMFile(void** buf, SMFile* pMFile); +void* tsdbDecodeSMFile(void* buf, SMFile* pMFile); +int tsdbEncodeSMFileEx(void** buf, SMFile* pMFile); +void* tsdbDecodeSMFileEx(void* buf, SMFile* pMFile); +int tsdbApplyMFileChange(SMFile* from, SMFile* to); +int tsdbCreateMFile(SMFile* pMFile, bool updateHeader); +int tsdbUpdateMFileHeader(SMFile* pMFile); +int tsdbLoadMFileHeader(SMFile* pMFile, SMFInfo* pInfo); +int tsdbScanAndTryFixMFile(STsdbRepo* pRepo); +int tsdbEncodeMFInfo(void** buf, SMFInfo* pInfo); +void* tsdbDecodeMFInfo(void* buf, SMFInfo* pInfo); + +static FORCE_INLINE void tsdbSetMFileInfo(SMFile* pMFile, SMFInfo* pInfo) { pMFile->info = *pInfo; } + +static FORCE_INLINE int tsdbOpenMFile(SMFile* pMFile, int flags) { + ASSERT(TSDB_FILE_CLOSED(pMFile)); + + pMFile->fd = open(TSDB_FILE_FULL_NAME(pMFile), flags | O_BINARY); + if (pMFile->fd < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return 0; +} + +static FORCE_INLINE void tsdbCloseMFile(SMFile* pMFile) { + if (TSDB_FILE_OPENED(pMFile)) { + close(pMFile->fd); + TSDB_FILE_SET_CLOSED(pMFile); + } +} + +static FORCE_INLINE int64_t tsdbSeekMFile(SMFile* pMFile, int64_t offset, int whence) { + ASSERT(TSDB_FILE_OPENED(pMFile)); + + int64_t loffset = taosLSeek(TSDB_FILE_FD(pMFile), offset, whence); + if (loffset < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return loffset; +} + +static FORCE_INLINE int64_t tsdbWriteMFile(SMFile* pMFile, void* buf, int64_t nbyte) { + ASSERT(TSDB_FILE_OPENED(pMFile)); + + int64_t nwrite = taosWrite(pMFile->fd, buf, nbyte); + if (nwrite < nbyte) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return nwrite; +} + +static FORCE_INLINE void tsdbUpdateMFileMagic(SMFile* pMFile, void* pCksum) { + pMFile->info.magic = taosCalcChecksum(pMFile->info.magic, (uint8_t*)(pCksum), sizeof(TSCKSUM)); +} + +static FORCE_INLINE int tsdbAppendMFile(SMFile* pMFile, void* buf, int64_t nbyte, int64_t* offset) { + ASSERT(TSDB_FILE_OPENED(pMFile)); + + int64_t toffset; + + if ((toffset = tsdbSeekMFile(pMFile, 0, SEEK_END)) < 0) { + return -1; + } + + ASSERT(pMFile->info.size == toffset); + + if (offset) { + *offset = toffset; + } + + if (tsdbWriteMFile(pMFile, buf, nbyte) < 0) { + return -1; + } + + pMFile->info.size += nbyte; + + return (int)nbyte; +} + +static FORCE_INLINE int tsdbRemoveMFile(SMFile* pMFile) { return tfsremove(TSDB_FILE_F(pMFile)); } + +static FORCE_INLINE int64_t tsdbReadMFile(SMFile* pMFile, void* buf, int64_t nbyte) { + ASSERT(TSDB_FILE_OPENED(pMFile)); + + int64_t nread = taosRead(pMFile->fd, buf, nbyte); + if (nread < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return nread; +} + +// =============== SDFile +typedef struct { + uint32_t magic; + uint32_t len; + uint32_t totalBlocks; + uint32_t totalSubBlocks; + uint32_t offset; + uint64_t size; + uint64_t tombSize; + uint32_t fver; +} SDFInfo; + +typedef struct { + SDFInfo info; + TFILE f; + int fd; + uint8_t state; +} SDFile; + +void tsdbInitDFile(SDFile* pDFile, SDiskID did, int vid, int fid, uint32_t ver, TSDB_FILE_T ftype); +void tsdbInitDFileEx(SDFile* pDFile, SDFile* pODFile); +int tsdbEncodeSDFile(void** buf, SDFile* pDFile); +void* tsdbDecodeSDFile(void* buf, SDFile* pDFile, uint32_t sfver); +int tsdbCreateDFile(SDFile* pDFile, bool updateHeader, TSDB_FILE_T ftype); +int tsdbUpdateDFileHeader(SDFile* pDFile); +int tsdbLoadDFileHeader(SDFile* pDFile, SDFInfo* pInfo); +int tsdbParseDFilename(const char* fname, int* vid, int* fid, TSDB_FILE_T* ftype, uint32_t* version); + +static FORCE_INLINE void tsdbSetDFileInfo(SDFile* pDFile, SDFInfo* pInfo) { pDFile->info = *pInfo; } + +static FORCE_INLINE int tsdbOpenDFile(SDFile* pDFile, int flags) { + ASSERT(!TSDB_FILE_OPENED(pDFile)); + + pDFile->fd = open(TSDB_FILE_FULL_NAME(pDFile), flags | O_BINARY); + if (pDFile->fd < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return 0; +} + +static FORCE_INLINE void tsdbCloseDFile(SDFile* pDFile) { + if (TSDB_FILE_OPENED(pDFile)) { + close(pDFile->fd); + TSDB_FILE_SET_CLOSED(pDFile); + } +} + +static FORCE_INLINE int64_t tsdbSeekDFile(SDFile* pDFile, int64_t offset, int whence) { + ASSERT(TSDB_FILE_OPENED(pDFile)); + + int64_t loffset = taosLSeek(TSDB_FILE_FD(pDFile), offset, whence); + if (loffset < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return loffset; +} + +static FORCE_INLINE int64_t tsdbWriteDFile(SDFile* pDFile, void* buf, int64_t nbyte) { + ASSERT(TSDB_FILE_OPENED(pDFile)); + + int64_t nwrite = taosWrite(pDFile->fd, buf, nbyte); + if (nwrite < nbyte) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return nwrite; +} + +static FORCE_INLINE void tsdbUpdateDFileMagic(SDFile* pDFile, void* pCksm) { + pDFile->info.magic = taosCalcChecksum(pDFile->info.magic, (uint8_t*)(pCksm), sizeof(TSCKSUM)); +} + +static FORCE_INLINE int tsdbAppendDFile(SDFile* pDFile, void* buf, int64_t nbyte, int64_t* offset) { + ASSERT(TSDB_FILE_OPENED(pDFile)); + + int64_t toffset; + + if ((toffset = tsdbSeekDFile(pDFile, 0, SEEK_END)) < 0) { + return -1; + } + + ASSERT(pDFile->info.size == toffset); + + if (offset) { + *offset = toffset; + } + + if (tsdbWriteDFile(pDFile, buf, nbyte) < 0) { + return -1; + } + + pDFile->info.size += nbyte; + + return (int)nbyte; +} + +static FORCE_INLINE int tsdbRemoveDFile(SDFile* pDFile) { return tfsremove(TSDB_FILE_F(pDFile)); } + +static FORCE_INLINE int64_t tsdbReadDFile(SDFile* pDFile, void* buf, int64_t nbyte) { + ASSERT(TSDB_FILE_OPENED(pDFile)); + + int64_t nread = taosRead(pDFile->fd, buf, nbyte); + if (nread < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return nread; +} + +static FORCE_INLINE int tsdbCopyDFile(SDFile* pSrc, SDFile* pDest) { + if (tfscopy(TSDB_FILE_F(pSrc), TSDB_FILE_F(pDest)) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + tsdbSetDFileInfo(pDest, TSDB_FILE_INFO(pSrc)); + return 0; +} + +// =============== SDFileSet +typedef struct { + int fid; + int state; + uint16_t ver; // fset version + SDFile files[TSDB_FILE_MAX]; +} SDFileSet; + +typedef enum { + TSDB_FSET_VER_0 = 0, // .head/.data/.last + TSDB_FSET_VER_1, // .head/.data/.last/.smad/.smal +} ETsdbFSetVer; + +#define TSDB_LATEST_FSET_VER TSDB_FSET_VER_1 + +// get nDFiles in SDFileSet +static FORCE_INLINE uint8_t tsdbGetNFiles(SDFileSet* pSet) { + switch (pSet->ver) { + case TSDB_FSET_VER_0: + return TSDB_FILE_MIN; + case TSDB_FSET_VER_1: + default: + return TSDB_FILE_MAX; + } +} +#define TSDB_FSET_FID(s) ((s)->fid) +#define TSDB_DFILE_IN_SET(s, t) ((s)->files + (t)) +#define TSDB_FSET_LEVEL(s) TSDB_FILE_LEVEL(TSDB_DFILE_IN_SET(s, 0)) +#define TSDB_FSET_ID(s) TSDB_FILE_ID(TSDB_DFILE_IN_SET(s, 0)) +#define TSDB_FSET_SET_CLOSED(s) \ + do { \ + for (TSDB_FILE_T ftype = TSDB_FILE_HEAD; ftype < TSDB_FILE_MAX; ftype++) { \ + TSDB_FILE_SET_CLOSED(TSDB_DFILE_IN_SET(s, ftype)); \ + } \ + } while (0); +#define TSDB_FSET_FSYNC(s) \ + do { \ + for (TSDB_FILE_T ftype = TSDB_FILE_HEAD; ftype < tsdbGetNFiles(s); ftype++) { \ + TSDB_FILE_FSYNC(TSDB_DFILE_IN_SET(s, ftype)); \ + } \ + } while (0); + +void tsdbInitDFileSet(SDFileSet* pSet, SDiskID did, int vid, int fid, uint32_t ver, uint16_t fsetVer); +void tsdbInitDFileSetEx(SDFileSet* pSet, SDFileSet* pOSet); +int tsdbEncodeDFileSet(void** buf, SDFileSet* pSet); +void* tsdbDecodeDFileSet(void* buf, SDFileSet* pSet, uint32_t sfver); +int tsdbEncodeDFileSetEx(void** buf, SDFileSet* pSet); +void* tsdbDecodeDFileSetEx(void* buf, SDFileSet* pSet); +int tsdbApplyDFileSetChange(SDFileSet* from, SDFileSet* to); +int tsdbCreateDFileSet(SDFileSet* pSet, bool updateHeader); +int tsdbUpdateDFileSetHeader(SDFileSet* pSet); +int tsdbScanAndTryFixDFileSet(STsdbRepo *pRepo, SDFileSet* pSet); + +static FORCE_INLINE void tsdbCloseDFileSet(SDFileSet* pSet) { + ASSERT_TSDB_FSET_NFILES_VALID(pSet); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + tsdbCloseDFile(TSDB_DFILE_IN_SET(pSet, ftype)); + } +} + +static FORCE_INLINE int tsdbOpenDFileSet(SDFileSet* pSet, int flags) { + ASSERT_TSDB_FSET_NFILES_VALID(pSet); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + if (tsdbOpenDFile(TSDB_DFILE_IN_SET(pSet, ftype), flags) < 0) { + tsdbCloseDFileSet(pSet); + return -1; + } + } + return 0; +} + +static FORCE_INLINE void tsdbRemoveDFileSet(SDFileSet* pSet) { + ASSERT_TSDB_FSET_NFILES_VALID(pSet); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + (void)tsdbRemoveDFile(TSDB_DFILE_IN_SET(pSet, ftype)); + } +} + +static FORCE_INLINE int tsdbCopyDFileSet(SDFileSet* pSrc, SDFileSet* pDest) { + ASSERT_TSDB_FSET_NFILES_VALID(pSrc); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSrc); ftype++) { + if (tsdbCopyDFile(TSDB_DFILE_IN_SET(pSrc, ftype), TSDB_DFILE_IN_SET(pDest, ftype)) < 0) { + tsdbRemoveDFileSet(pDest); + return -1; + } + } + + return 0; +} + +static FORCE_INLINE void tsdbGetFidKeyRange(int days, int8_t precision, int fid, TSKEY* minKey, TSKEY* maxKey) { + *minKey = fid * days * tsTickPerDay[precision]; + *maxKey = *minKey + days * tsTickPerDay[precision] - 1; +} + +static FORCE_INLINE bool tsdbFSetIsOk(SDFileSet* pSet) { + for (TSDB_FILE_T ftype = 0; ftype < TSDB_FILE_MAX; ftype++) { + if (TSDB_FILE_IS_BAD(TSDB_DFILE_IN_SET(pSet, ftype))) { + return false; + } + } + + return true; +} + +#endif /* _TS_TSDB_FILE_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbHealth.h b/source/dnode/vnode/tsdb2/inc/tsdbHealth.h new file mode 100644 index 0000000000..324f4312e0 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbHealth.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_HEALTH_H_ +#define _TD_TSDB_HEALTH_H_ + +bool tsdbUrgeQueryFree(STsdbRepo* pRepo); +int32_t tsdbInsertNewBlock(STsdbRepo* pRepo); + +bool tsdbIdleMemEnough(); +bool tsdbAllowNewBlock(STsdbRepo* pRepo); + +#endif /* _TD_TSDB_BUFFER_H_ */ diff --git a/source/dnode/vnode/tsdb2/inc/tsdbLog.h b/source/dnode/vnode/tsdb2/inc/tsdbLog.h new file mode 100644 index 0000000000..fdd04e968a --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbLog.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_LOG_H_ +#define _TD_TSDB_LOG_H_ + +extern int32_t tsdbDebugFlag; + +#define tsdbFatal(...) do { if (tsdbDebugFlag & DEBUG_FATAL) { taosPrintLog("TDB FATAL ", 255, __VA_ARGS__); }} while(0) +#define tsdbError(...) do { if (tsdbDebugFlag & DEBUG_ERROR) { taosPrintLog("TDB ERROR ", 255, __VA_ARGS__); }} while(0) +#define tsdbWarn(...) do { if (tsdbDebugFlag & DEBUG_WARN) { taosPrintLog("TDB WARN ", 255, __VA_ARGS__); }} while(0) +#define tsdbInfo(...) do { if (tsdbDebugFlag & DEBUG_INFO) { taosPrintLog("TDB ", 255, __VA_ARGS__); }} while(0) +#define tsdbDebug(...) do { if (tsdbDebugFlag & DEBUG_DEBUG) { taosPrintLog("TDB ", tsdbDebugFlag, __VA_ARGS__); }} while(0) +#define tsdbTrace(...) do { if (tsdbDebugFlag & DEBUG_TRACE) { taosPrintLog("TDB ", tsdbDebugFlag, __VA_ARGS__); }} while(0) + +#endif /* _TD_TSDB_LOG_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbMemTable.h b/source/dnode/vnode/tsdb2/inc/tsdbMemTable.h new file mode 100644 index 0000000000..67e9976c70 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbMemTable.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_MEMTABLE_H_ +#define _TD_TSDB_MEMTABLE_H_ + +typedef struct { + int rowsInserted; + int rowsUpdated; + int rowsDeleteSucceed; + int rowsDeleteFailed; + int nOperations; + TSKEY keyFirst; + TSKEY keyLast; +} SMergeInfo; + +typedef struct { + STable * pTable; + SSkipListIterator *pIter; +} SCommitIter; + +struct STableData { + uint64_t uid; + TSKEY keyFirst; + TSKEY keyLast; + int64_t numOfRows; + SSkipList* pData; + T_REF_DECLARE() +}; + +enum { TSDB_UPDATE_META, TSDB_DROP_META }; + +#ifdef WINDOWS +#pragma pack(push ,1) +typedef struct { +#else +typedef struct __attribute__((packed)){ +#endif + char act; + uint64_t uid; +} SActObj; +#ifdef WINDOWS +#pragma pack(pop) +#endif + +typedef struct { + int len; + char cont[]; +} SActCont; + +int tsdbRefMemTable(STsdbRepo* pRepo, SMemTable* pMemTable); +int tsdbUnRefMemTable(STsdbRepo* pRepo, SMemTable* pMemTable); +int tsdbTakeMemSnapshot(STsdbRepo* pRepo, SMemSnapshot* pSnapshot, SArray* pATable); +void tsdbUnTakeMemSnapShot(STsdbRepo* pRepo, SMemSnapshot* pSnapshot); +void* tsdbAllocBytes(STsdbRepo* pRepo, int bytes); +int tsdbAsyncCommit(STsdbRepo* pRepo); +int tsdbSyncCommitConfig(STsdbRepo* pRepo); +int tsdbLoadDataFromCache(STable* pTable, SSkipListIterator* pIter, TSKEY maxKey, int maxRowsToRead, SDataCols* pCols, + TKEY* filterKeys, int nFilterKeys, bool keepDup, SMergeInfo* pMergeInfo); +void* tsdbCommitData(STsdbRepo* pRepo); + +static FORCE_INLINE SMemRow tsdbNextIterRow(SSkipListIterator* pIter) { + if (pIter == NULL) return NULL; + + SSkipListNode* node = tSkipListIterGet(pIter); + if (node == NULL) return NULL; + + return (SMemRow)SL_GET_NODE_DATA(node); +} + +static FORCE_INLINE TSKEY tsdbNextIterKey(SSkipListIterator* pIter) { + SMemRow row = tsdbNextIterRow(pIter); + if (row == NULL) return TSDB_DATA_TIMESTAMP_NULL; + + return memRowKey(row); +} + +static FORCE_INLINE TKEY tsdbNextIterTKey(SSkipListIterator* pIter) { + SMemRow row = tsdbNextIterRow(pIter); + if (row == NULL) return TKEY_NULL; + + return memRowTKey(row); +} + +#endif /* _TD_TSDB_MEMTABLE_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbMeta.h b/source/dnode/vnode/tsdb2/inc/tsdbMeta.h new file mode 100644 index 0000000000..9cdb8a83aa --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbMeta.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_META_H_ +#define _TD_TSDB_META_H_ + +#define TSDB_MAX_TABLE_SCHEMAS 16 + +#pragma pack (push,1) +typedef struct jsonMapValue{ + void* table; // STable * + int16_t colId; // the json col ID. +}JsonMapValue; + +#pragma pack (pop) + +typedef struct STable { + STableId tableId; + ETableType type; + tstr* name; // NOTE: there a flexible string here + uint64_t suid; + struct STable* pSuper; // super table pointer + SArray* schema; + STSchema* tagSchema; + SKVRow tagVal; + SSkipList* pIndex; // For TSDB_SUPER_TABLE, it is the skiplist index + SHashObj* jsonKeyMap; // For json tag key {"key":[t1, t2, t3]} + void* eventHandler; // TODO + void* streamHandler; // TODO + TSKEY lastKey; + SMemRow lastRow; + char* sql; + void* cqhandle; + SRWLatch latch; // TODO: implementa latch functions + + SDataCol *lastCols; + int16_t maxColNum; + int16_t restoreColumnNum; + bool hasRestoreLastColumn; + int lastColSVersion; + int16_t cacheLastConfigVersion; + T_REF_DECLARE() +} STable; + +typedef struct { + pthread_rwlock_t rwLock; + + int32_t nTables; + int32_t maxTables; + STable** tables; + SList* superList; + SHashObj* uidMap; + int maxRowBytes; + int maxCols; +} STsdbMeta; + +#define TSDB_INIT_NTABLES 1024 +#define TABLE_TYPE(t) (t)->type +#define TABLE_NAME(t) (t)->name +#define TABLE_CHAR_NAME(t) TABLE_NAME(t)->data +#define TABLE_UID(t) (t)->tableId.uid +#define TABLE_TID(t) (t)->tableId.tid +#define TABLE_SUID(t) (t)->suid +// #define TSDB_META_FILE_MAGIC(m) KVSTORE_MAGIC((m)->pStore) +#define TSDB_RLOCK_TABLE(t) taosRLockLatch(&((t)->latch)) +#define TSDB_RUNLOCK_TABLE(t) taosRUnLockLatch(&((t)->latch)) +#define TSDB_WLOCK_TABLE(t) taosWLockLatch(&((t)->latch)) +#define TSDB_WUNLOCK_TABLE(t) taosWUnLockLatch(&((t)->latch)) + +STsdbMeta* tsdbNewMeta(STsdbCfg* pCfg); +void tsdbFreeMeta(STsdbMeta* pMeta); +int tsdbOpenMeta(STsdbRepo* pRepo); +int tsdbCloseMeta(STsdbRepo* pRepo); +STable* tsdbGetTableByUid(STsdbMeta* pMeta, uint64_t uid); +STSchema* tsdbGetTableSchemaByVersion(STable* pTable, int16_t _version, int8_t rowType); +int tsdbWLockRepoMeta(STsdbRepo* pRepo); +int tsdbRLockRepoMeta(STsdbRepo* pRepo); +int tsdbUnlockRepoMeta(STsdbRepo* pRepo); +void tsdbRefTable(STable* pTable); +void tsdbUnRefTable(STable* pTable); +void tsdbUpdateTableSchema(STsdbRepo* pRepo, STable* pTable, STSchema* pSchema, bool insertAct); +int tsdbRestoreTable(STsdbRepo* pRepo, void* cont, int contLen); +void tsdbOrgMeta(STsdbRepo* pRepo); +int tsdbInitColIdCacheWithSchema(STable* pTable, STSchema* pSchema); +int16_t tsdbGetLastColumnsIndexByColId(STable* pTable, int16_t colId); +int tsdbUpdateLastColSchema(STable *pTable, STSchema *pNewSchema); +STSchema* tsdbGetTableLatestSchema(STable *pTable); +void tsdbFreeLastColumns(STable* pTable); +int tsdbCompareJsonMapValue(const void* a, const void* b); +void* tsdbGetJsonTagValue(STable* pTable, char* key, int32_t keyLen, int16_t* colId); + +static FORCE_INLINE int tsdbCompareSchemaVersion(const void *key1, const void *key2) { + if (*(int16_t *)key1 < schemaVersion(*(STSchema **)key2)) { + return -1; + } else if (*(int16_t *)key1 > schemaVersion(*(STSchema **)key2)) { + return 1; + } else { + return 0; + } +} + +static FORCE_INLINE STSchema* tsdbGetTableSchemaImpl(STable* pTable, bool lock, bool copy, int16_t _version, int8_t rowType) { + STable* pDTable = (pTable->pSuper != NULL) ? pTable->pSuper : pTable; // for performance purpose + STSchema* pSchema = NULL; + STSchema* pTSchema = NULL; + + if (lock) TSDB_RLOCK_TABLE(pDTable); + if (_version < 0) { // get the latest version of schema + pTSchema = *(STSchema **)taosArrayGetLast(pDTable->schema); + } else { // get the schema with version + void* ptr = taosArraySearch(pDTable->schema, &_version, tsdbCompareSchemaVersion, TD_EQ); + if (ptr == NULL) { + if (rowType == SMEM_ROW_KV) { + ptr = taosArrayGetLast(pDTable->schema); + } else { + terrno = TSDB_CODE_TDB_IVD_TB_SCHEMA_VERSION; + goto _exit; + } + } + pTSchema = *(STSchema**)ptr; + } + + ASSERT(pTSchema != NULL); + + if (copy) { + if ((pSchema = tdDupSchema(pTSchema)) == NULL) terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + } else { + pSchema = pTSchema; + } + +_exit: + if (lock) TSDB_RUNLOCK_TABLE(pDTable); + return pSchema; +} + +static FORCE_INLINE STSchema* tsdbGetTableSchema(STable* pTable) { + return tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); +} + +static FORCE_INLINE STSchema *tsdbGetTableTagSchema(STable *pTable) { + if (pTable->type == TSDB_CHILD_TABLE) { // check child table first + STable *pSuper = pTable->pSuper; + if (pSuper == NULL) return NULL; + return pSuper->tagSchema; + } else if (pTable->type == TSDB_SUPER_TABLE) { + return pTable->tagSchema; + } else { + return NULL; + } +} + +static FORCE_INLINE TSKEY tsdbGetTableLastKeyImpl(STable* pTable) { + ASSERT((pTable->lastRow == NULL) || (pTable->lastKey == memRowKey(pTable->lastRow))); + return pTable->lastKey; +} + +#endif /* _TD_TSDB_META_H_ */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/inc/tsdbReadImpl.h b/source/dnode/vnode/tsdb2/inc/tsdbReadImpl.h new file mode 100644 index 0000000000..20d8b88c83 --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbReadImpl.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_READ_IMPL_H_ +#define _TD_TSDB_READ_IMPL_H_ + +#include "tfs.h" +#include "tsdb.h" +#include "os.h" +#include "tsdbFile.h" +#include "tskiplist.h" +#include "tsdbMeta.h" + +typedef struct SReadH SReadH; + +typedef struct { + int32_t tid; + uint32_t len; + uint32_t offset; + uint32_t hasLast : 2; + uint32_t numOfBlocks : 30; + uint64_t uid; + TSKEY maxKey; +} SBlockIdx; + +#if 0 +typedef struct { + int64_t last : 1; + int64_t offset : 63; + int32_t algorithm : 8; + int32_t numOfRows : 24; + int32_t len; + int32_t keyLen; // key column length, keyOffset = offset+sizeof(SBlockData)+sizeof(SBlockCol)*numOfCols + int16_t numOfSubBlocks; + int16_t numOfCols; // not including timestamp column + TSKEY keyFirst; + TSKEY keyLast; + } SBlock; +#endif + +/** + * keyLen; // key column length, keyOffset = offset+sizeof(SBlockData)+sizeof(SBlockCol)*numOfCols + * numOfCols; // not including timestamp column + */ +#define SBlockFieldsP0 \ + int64_t last : 1; \ + int64_t offset : 63; \ + int32_t algorithm : 8; \ + int32_t numOfRows : 24; \ + int32_t len; \ + int32_t keyLen; \ + int16_t numOfSubBlocks; \ + int16_t numOfCols; \ + TSKEY keyFirst; \ + TSKEY keyLast + +/** + * aggrStat; // only valid when blkVer > 0. 0 - no aggr part in .data/.last/.smad/.smal, 1 - has aggr in .smad/.smal + * blkVer; // 0 - original block, 1 - block since importing .smad/.smal + * aggrOffset; // only valid when blkVer > 0 and aggrStat > 0 + */ +#define SBlockFieldsP1 \ + uint64_t aggrStat : 1; \ + uint64_t blkVer : 7; \ + uint64_t aggrOffset : 56 + +typedef struct { + SBlockFieldsP0; +} SBlockV0; + +typedef struct { + SBlockFieldsP0; + SBlockFieldsP1; +} SBlockV1; + +typedef enum { + TSDB_SBLK_VER_0 = 0, + TSDB_SBLK_VER_1, +} ESBlockVer; + +#define SBlockVerLatest TSDB_SBLK_VER_1 + +#define SBlock SBlockV1 // latest SBlock definition + +// lastest SBlockInfo definition +typedef struct { + int32_t delimiter; // For recovery usage + int32_t tid; + uint64_t uid; + SBlock blocks[]; +} SBlockInfo; + +typedef struct { + int16_t colId; + int32_t len; + uint32_t type : 8; + uint32_t offset : 24; + int64_t sum; + int64_t max; + int64_t min; + int16_t maxIndex; + int16_t minIndex; + int16_t numOfNull; + uint8_t offsetH; + char padding[1]; +} SBlockColV0; + +typedef struct { + int16_t colId; + uint8_t offsetH; + uint8_t reserved; // reserved field, not used + int32_t len; + uint32_t type : 8; + uint32_t offset : 24; +} SBlockColV1; + +#define SBlockCol SBlockColV1 // latest SBlockCol definition + +typedef struct { + int16_t colId; + int16_t maxIndex; + int16_t minIndex; + int16_t numOfNull; + int64_t sum; + int64_t max; + int64_t min; +} SAggrBlkColV1; + +#define SAggrBlkCol SAggrBlkColV1 // latest SAggrBlkCol definition + +// Code here just for back-ward compatibility +static FORCE_INLINE void tsdbSetBlockColOffset(SBlockCol *pBlockCol, uint32_t offset) { + pBlockCol->offset = offset & ((((uint32_t)1) << 24) - 1); + pBlockCol->offsetH = (uint8_t)(offset >> 24); +} + +static FORCE_INLINE uint32_t tsdbGetBlockColOffset(SBlockCol *pBlockCol) { + uint32_t offset1 = pBlockCol->offset; + uint32_t offset2 = pBlockCol->offsetH; + return (offset1 | (offset2 << 24)); +} + +typedef struct { + int32_t delimiter; // For recovery usage + int32_t numOfCols; // For recovery usage + uint64_t uid; // For recovery usage + SBlockCol cols[]; +} SBlockData; + +typedef void SAggrBlkData; // SBlockCol cols[]; + +struct SReadH { + STsdbRepo * pRepo; + SDFileSet rSet; // FSET to read + SArray * aBlkIdx; // SBlockIdx array + STable * pTable; // table to read + SBlockIdx * pBlkIdx; // current reading table SBlockIdx + int cidx; + SBlockInfo * pBlkInfo; // SBlockInfoV# + SBlockData *pBlkData; // Block info + SAggrBlkData *pAggrBlkData; // Aggregate Block info + SDataCols * pDCols[2]; + void * pBuf; // buffer + void * pCBuf; // compression buffer + void * pExBuf; // extra buffer +}; + +#define TSDB_READ_REPO(rh) ((rh)->pRepo) +#define TSDB_READ_REPO_ID(rh) REPO_ID(TSDB_READ_REPO(rh)) +#define TSDB_READ_FSET(rh) (&((rh)->rSet)) +#define TSDB_READ_TABLE(rh) ((rh)->pTable) +#define TSDB_READ_HEAD_FILE(rh) TSDB_DFILE_IN_SET(TSDB_READ_FSET(rh), TSDB_FILE_HEAD) +#define TSDB_READ_DATA_FILE(rh) TSDB_DFILE_IN_SET(TSDB_READ_FSET(rh), TSDB_FILE_DATA) +#define TSDB_READ_LAST_FILE(rh) TSDB_DFILE_IN_SET(TSDB_READ_FSET(rh), TSDB_FILE_LAST) +#define TSDB_READ_SMAD_FILE(rh) TSDB_DFILE_IN_SET(TSDB_READ_FSET(rh), TSDB_FILE_SMAD) +#define TSDB_READ_SMAL_FILE(rh) TSDB_DFILE_IN_SET(TSDB_READ_FSET(rh), TSDB_FILE_SMAL) +#define TSDB_READ_BUF(rh) ((rh)->pBuf) +#define TSDB_READ_COMP_BUF(rh) ((rh)->pCBuf) +#define TSDB_READ_EXBUF(rh) ((rh)->pExBuf) + +#define TSDB_BLOCK_STATIS_SIZE(ncols, blkVer) \ + (sizeof(SBlockData) + sizeof(SBlockColV##blkVer) * (ncols) + sizeof(TSCKSUM)) + +static FORCE_INLINE size_t tsdbBlockStatisSize(int nCols, uint32_t blkVer) { + switch (blkVer) { + case TSDB_SBLK_VER_0: + return TSDB_BLOCK_STATIS_SIZE(nCols, 0); + case TSDB_SBLK_VER_1: + default: + return TSDB_BLOCK_STATIS_SIZE(nCols, 1); + } +} + +#define TSDB_BLOCK_AGGR_SIZE(ncols, blkVer) (sizeof(SAggrBlkColV##blkVer) * (ncols) + sizeof(TSCKSUM)) + +static FORCE_INLINE size_t tsdbBlockAggrSize(int nCols, uint32_t blkVer) { + switch (blkVer) { + case TSDB_SBLK_VER_0: + ASSERT(false); + return 0; + case TSDB_SBLK_VER_1: + default: + return TSDB_BLOCK_AGGR_SIZE(nCols, 1); + } +} + +int tsdbInitReadH(SReadH *pReadh, STsdbRepo *pRepo); +void tsdbDestroyReadH(SReadH *pReadh); +int tsdbSetAndOpenReadFSet(SReadH *pReadh, SDFileSet *pSet); +void tsdbCloseAndUnsetFSet(SReadH *pReadh); +int tsdbLoadBlockIdx(SReadH *pReadh); +int tsdbSetReadTable(SReadH *pReadh, STable *pTable); +int tsdbLoadBlockInfo(SReadH *pReadh, void **pTarget, uint32_t *extendedLen); +int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlockInfo); +int tsdbLoadBlockDataCols(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo, int16_t *colIds, int numOfColsIds); +int tsdbLoadBlockStatis(SReadH *pReadh, SBlock *pBlock); +int tsdbLoadBlockOffset(SReadH *pReadh, SBlock *pBlock); +int tsdbEncodeSBlockIdx(void **buf, SBlockIdx *pIdx); +void *tsdbDecodeSBlockIdx(void *buf, SBlockIdx *pIdx); +void tsdbGetBlockStatis(SReadH *pReadh, SDataStatis *pStatis, int numOfCols, SBlock *pBlock); + +static FORCE_INLINE int tsdbMakeRoom(void **ppBuf, size_t size) { + void * pBuf = *ppBuf; + size_t tsize = taosTSizeof(pBuf); + + if (tsize < size) { + if (tsize == 0) tsize = 1024; + + while (tsize < size) { + tsize *= 2; + } + + *ppBuf = taosTRealloc(pBuf, tsize); + if (*ppBuf == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } + + return 0; +} + +static FORCE_INLINE SBlockCol *tsdbGetSBlockCol(SBlock *pBlock, SBlockCol **pDestBlkCol, SBlockCol *pBlkCols, + int colIdx) { + if (pBlock->blkVer == SBlockVerLatest) { + *pDestBlkCol = pBlkCols + colIdx; + return *pDestBlkCol; + } + if (pBlock->blkVer == TSDB_SBLK_VER_0) { + SBlockColV0 *pBlkCol = (SBlockColV0 *)pBlkCols + colIdx; + (*pDestBlkCol)->colId = pBlkCol->colId; + (*pDestBlkCol)->len = pBlkCol->len; + (*pDestBlkCol)->type = pBlkCol->type; + (*pDestBlkCol)->offset = pBlkCol->offset; + (*pDestBlkCol)->offsetH = pBlkCol->offsetH; + } + return *pDestBlkCol; +} + +#endif /*_TD_TSDB_READ_IMPL_H_*/ diff --git a/source/dnode/vnode/tsdb2/inc/tsdbRowMergeBuf.h b/source/dnode/vnode/tsdb2/inc/tsdbRowMergeBuf.h new file mode 100644 index 0000000000..cefa9b27fb --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbRowMergeBuf.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef TSDB_ROW_MERGE_BUF_H +#define TSDB_ROW_MERGE_BUF_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "tsdb.h" +#include "tchecksum.h" +#include "tsdbReadImpl.h" + +typedef void* SMergeBuf; + +SDataRow tsdbMergeTwoRows(SMergeBuf *pBuf, SMemRow row1, SMemRow row2, STSchema *pSchema1, STSchema *pSchema2); + +static FORCE_INLINE int tsdbMergeBufMakeSureRoom(SMergeBuf *pBuf, STSchema* pSchema1, STSchema* pSchema2) { + size_t len1 = dataRowMaxBytesFromSchema(pSchema1); + size_t len2 = dataRowMaxBytesFromSchema(pSchema2); + return tsdbMakeRoom(pBuf, MAX(len1, len2)); +} + +static FORCE_INLINE void tsdbFreeMergeBuf(SMergeBuf buf) { + taosTZfree(buf); +} + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef TSDB_ROW_MERGE_BUF_H */ diff --git a/source/dnode/vnode/tsdb2/inc/tsdbint.h b/source/dnode/vnode/tsdb2/inc/tsdbint.h new file mode 100644 index 0000000000..a7f08e61ed --- /dev/null +++ b/source/dnode/vnode/tsdb2/inc/tsdbint.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_TSDB_INT_H_ +#define _TD_TSDB_INT_H_ + +// // TODO: remove the include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include + +#include "os.h" +#include "tlog.h" +#include "taosdef.h" +#include "taoserror.h" +#include "tchecksum.h" +#include "tskiplist.h" +#include "tdataformat.h" +#include "tcoding.h" +#include "tscompression.h" +#include "tlockfree.h" +#include "tlist.h" +#include "hash.h" +#include "tarray.h" +#include "tfs.h" +#include "tsocket.h" + +#include "tsdb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Log +#include "tsdbLog.h" +// Meta +#include "tsdbMeta.h" +// Buffer +#include "tsdbBuffer.h" +// MemTable +#include "tsdbMemTable.h" +// File +#include "tsdbFile.h" +// FS +#include "tsdbFS.h" +// ReadImpl +#include "tsdbReadImpl.h" +// Commit +#include "tsdbCommit.h" +// Compact +#include "tsdbCompact.h" +// Commit Queue +#include "tsdbCommitQueue.h" + +#include "tsdbRowMergeBuf.h" +// Main definitions +struct STsdbRepo { + uint8_t state; + + STsdbCfg config; + + STsdbCfg save_config; // save apply config + bool config_changed; // config changed flag + pthread_mutex_t save_mutex; // protect save config + + int16_t cacheLastConfigVersion; + + STsdbAppH appH; + STsdbStat stat; + STsdbMeta* tsdbMeta; + STsdbBufPool* pPool; + SMemTable* mem; + SMemTable* imem; + STsdbFS* fs; + SRtn rtn; + tsem_t readyToCommit; + pthread_mutex_t mutex; + bool repoLocked; + int32_t code; // Commit code + + SMergeBuf mergeBuf; //used when update=2 + int8_t compactState; // compact state: inCompact/noCompact/waitingCompact? + pthread_t* pthread; +}; + +#define REPO_ID(r) (r)->config.tsdbId +#define REPO_CFG(r) (&((r)->config)) +#define REPO_FS(r) ((r)->fs) +#define IS_REPO_LOCKED(r) (r)->repoLocked +#define TSDB_SUBMIT_MSG_HEAD_SIZE sizeof(SSubmitMsg) + +int tsdbLockRepo(STsdbRepo* pRepo); +int tsdbUnlockRepo(STsdbRepo* pRepo); +STsdbMeta* tsdbGetMeta(STsdbRepo* pRepo); +int tsdbCheckCommit(STsdbRepo* pRepo); +int tsdbRestoreInfo(STsdbRepo* pRepo); +UNUSED_FUNC int tsdbCacheLastData(STsdbRepo *pRepo, STsdbCfg* oldCfg); +int32_t tsdbLoadLastCache(STsdbRepo *pRepo, STable* pTable); +void tsdbGetRootDir(int repoid, char dirName[]); +void tsdbGetDataDir(int repoid, char dirName[]); + +static FORCE_INLINE STsdbBufBlock* tsdbGetCurrBufBlock(STsdbRepo* pRepo) { + ASSERT(pRepo != NULL); + if (pRepo->mem == NULL) return NULL; + + SListNode* pNode = listTail(pRepo->mem->bufBlockList); + if (pNode == NULL) return NULL; + + STsdbBufBlock* pBufBlock = NULL; + tdListNodeGetData(pRepo->mem->bufBlockList, pNode, (void*)(&pBufBlock)); + + return pBufBlock; +} + +static FORCE_INLINE int tsdbGetNextMaxTables(int tid) { + ASSERT(tid >= 1 && tid <= TSDB_MAX_TABLES); + int maxTables = TSDB_INIT_NTABLES; + while (true) { + maxTables = MIN(maxTables, TSDB_MAX_TABLES); + if (tid <= maxTables) break; + maxTables *= 2; + } + + return maxTables + 1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _TD_TSDB_INT_H_ */ diff --git a/source/dnode/vnode/tsdb2/src/tsdbBuffer.c b/source/dnode/vnode/tsdb2/src/tsdbBuffer.c new file mode 100644 index 0000000000..70589031f6 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbBuffer.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbint.h" +#include "tsdbHealth.h" + +#define POOL_IS_EMPTY(b) (listNEles((b)->bufBlockList) == 0) + +// ---------------- INTERNAL FUNCTIONS ---------------- +STsdbBufPool *tsdbNewBufPool() { + STsdbBufPool *pBufPool = (STsdbBufPool *)calloc(1, sizeof(*pBufPool)); + if (pBufPool == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + int code = pthread_cond_init(&(pBufPool->poolNotEmpty), NULL); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + goto _err; + } + + pBufPool->bufBlockList = tdListNew(sizeof(STsdbBufBlock *)); + if (pBufPool->bufBlockList == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + return pBufPool; + +_err: + tsdbFreeBufPool(pBufPool); + return NULL; +} + +void tsdbFreeBufPool(STsdbBufPool *pBufPool) { + if (pBufPool) { + if (pBufPool->bufBlockList) { + ASSERT(listNEles(pBufPool->bufBlockList) == 0); + tdListFree(pBufPool->bufBlockList); + } + + pthread_cond_destroy(&pBufPool->poolNotEmpty); + + free(pBufPool); + } +} + +int tsdbOpenBufPool(STsdbRepo *pRepo) { + STsdbCfg * pCfg = &(pRepo->config); + STsdbBufPool *pPool = pRepo->pPool; + + ASSERT(pPool != NULL); + pPool->bufBlockSize = pCfg->cacheBlockSize * 1024 * 1024; // MB + pPool->tBufBlocks = pCfg->totalBlocks; + pPool->nBufBlocks = 0; + pPool->nElasticBlocks = 0; + pPool->index = 0; + pPool->nRecycleBlocks = 0; + + for (int i = 0; i < pCfg->totalBlocks; i++) { + STsdbBufBlock *pBufBlock = tsdbNewBufBlock(pPool->bufBlockSize); + if (pBufBlock == NULL) goto _err; + + if (tdListAppend(pPool->bufBlockList, (void *)(&pBufBlock)) < 0) { + tsdbFreeBufBlock(pBufBlock); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pPool->nBufBlocks++; + } + + tsdbDebug("vgId:%d buffer pool is opened! bufBlockSize:%d tBufBlocks:%d nBufBlocks:%d", REPO_ID(pRepo), + pPool->bufBlockSize, pPool->tBufBlocks, pPool->nBufBlocks); + + return 0; + +_err: + tsdbCloseBufPool(pRepo); + return -1; +} + +void tsdbCloseBufPool(STsdbRepo *pRepo) { + if (pRepo == NULL) return; + + STsdbBufPool * pBufPool = pRepo->pPool; + STsdbBufBlock *pBufBlock = NULL; + + if (pBufPool) { + SListNode *pNode = NULL; + while ((pNode = tdListPopHead(pBufPool->bufBlockList)) != NULL) { + tdListNodeGetData(pBufPool->bufBlockList, pNode, (void *)(&pBufBlock)); + tsdbFreeBufBlock(pBufBlock); + free(pNode); + } + } + + tsdbDebug("vgId:%d, buffer pool is closed", REPO_ID(pRepo)); +} + +SListNode *tsdbAllocBufBlockFromPool(STsdbRepo *pRepo) { + ASSERT(pRepo != NULL && pRepo->pPool != NULL); + ASSERT(IS_REPO_LOCKED(pRepo)); + + STsdbBufPool *pBufPool = pRepo->pPool; + + while (POOL_IS_EMPTY(pBufPool)) { + if(tsDeadLockKillQuery) { + // supply new Block + if(tsdbInsertNewBlock(pRepo) > 0) { + tsdbWarn("vgId:%d add new elastic block . elasticBlocks=%d cur free Blocks=%d", REPO_ID(pRepo), pBufPool->nElasticBlocks, pBufPool->bufBlockList->numOfEles); + break; + } else { + // no newBlock, kill query free + if(!tsdbUrgeQueryFree(pRepo)) + tsdbWarn("vgId:%d Urge query free thread start failed.", REPO_ID(pRepo)); + } + } + + pRepo->repoLocked = false; + pthread_cond_wait(&(pBufPool->poolNotEmpty), &(pRepo->mutex)); + pRepo->repoLocked = true; + } + + SListNode * pNode = tdListPopHead(pBufPool->bufBlockList); + ASSERT(pNode != NULL); + STsdbBufBlock *pBufBlock = NULL; + tdListNodeGetData(pBufPool->bufBlockList, pNode, (void *)(&pBufBlock)); + + pBufBlock->blockId = pBufPool->index++; + pBufBlock->offset = 0; + pBufBlock->remain = pBufPool->bufBlockSize; + + tsdbDebug("vgId:%d, buffer block is allocated, blockId:%" PRId64, REPO_ID(pRepo), pBufBlock->blockId); + return pNode; +} + +// ---------------- LOCAL FUNCTIONS ---------------- +STsdbBufBlock *tsdbNewBufBlock(int bufBlockSize) { + STsdbBufBlock *pBufBlock = (STsdbBufBlock *)malloc(sizeof(*pBufBlock) + bufBlockSize); + if (pBufBlock == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + pBufBlock->blockId = 0; + pBufBlock->offset = 0; + pBufBlock->remain = bufBlockSize; + + return pBufBlock; +} + + void tsdbFreeBufBlock(STsdbBufBlock *pBufBlock) { tfree(pBufBlock); } + +int tsdbExpandPool(STsdbRepo* pRepo, int32_t oldTotalBlocks) { + if (oldTotalBlocks == pRepo->config.totalBlocks) { + return TSDB_CODE_SUCCESS; + } + + int err = TSDB_CODE_SUCCESS; + + if (tsdbLockRepo(pRepo) < 0) return terrno; + STsdbBufPool* pPool = pRepo->pPool; + + if (pRepo->config.totalBlocks > oldTotalBlocks) { + for (int i = 0; i < pRepo->config.totalBlocks - oldTotalBlocks; i++) { + STsdbBufBlock *pBufBlock = tsdbNewBufBlock(pPool->bufBlockSize); + if (pBufBlock == NULL) goto err; + + if (tdListAppend(pPool->bufBlockList, (void *)(&pBufBlock)) < 0) { + tsdbFreeBufBlock(pBufBlock); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + err = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto err; + } + + pPool->nBufBlocks++; + } + pthread_cond_signal(&pPool->poolNotEmpty); + } else { + pPool->nRecycleBlocks = oldTotalBlocks - pRepo->config.totalBlocks; + } + +err: + tsdbUnlockRepo(pRepo); + return err; +} + +void tsdbRecycleBufferBlock(STsdbBufPool* pPool, SListNode *pNode, bool bELastic) { + STsdbBufBlock *pBufBlock = NULL; + tdListNodeGetData(pPool->bufBlockList, pNode, (void *)(&pBufBlock)); + tsdbFreeBufBlock(pBufBlock); + free(pNode); + if(bELastic) + { + pPool->nElasticBlocks--; + tsdbWarn("pPool=%p elastic block reduce one . nElasticBlocks=%d cur free Blocks=%d", pPool, pPool->nElasticBlocks, pPool->bufBlockList->numOfEles); + } + else + pPool->nBufBlocks--; +} \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/src/tsdbCommit.c b/source/dnode/vnode/tsdb2/src/tsdbCommit.c new file mode 100644 index 0000000000..db675d0427 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbCommit.c @@ -0,0 +1,1776 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#include "tsdbint.h" + +extern int32_t tsTsdbMetaCompactRatio; + +#define TSDB_MAX_SUBBLOCKS 8 +static FORCE_INLINE int TSDB_KEY_FID(TSKEY key, int32_t days, int8_t precision) { + if (key < 0) { + return (int)((key + 1) / tsTickPerDay[precision] / days - 1); + } else { + return (int)((key / tsTickPerDay[precision] / days)); + } +} + +typedef struct { + SRtn rtn; // retention snapshot + SFSIter fsIter; // tsdb file iterator + int niters; // memory iterators + SCommitIter *iters; + bool isRFileSet; // read and commit FSET + SReadH readh; + SDFileSet wSet; + bool isDFileSame; + bool isLFileSame; + TSKEY minKey; + TSKEY maxKey; + SArray * aBlkIdx; // SBlockIdx array + STable * pTable; + SArray * aSupBlk; // Table super-block array + SArray * aSubBlk; // table sub-block array + SDataCols * pDataCols; +} SCommitH; + +#define TSDB_COMMIT_REPO(ch) TSDB_READ_REPO(&(ch->readh)) +#define TSDB_COMMIT_REPO_ID(ch) REPO_ID(TSDB_READ_REPO(&(ch->readh))) +#define TSDB_COMMIT_WRITE_FSET(ch) (&((ch)->wSet)) +#define TSDB_COMMIT_TABLE(ch) ((ch)->pTable) +#define TSDB_COMMIT_HEAD_FILE(ch) TSDB_DFILE_IN_SET(TSDB_COMMIT_WRITE_FSET(ch), TSDB_FILE_HEAD) +#define TSDB_COMMIT_DATA_FILE(ch) TSDB_DFILE_IN_SET(TSDB_COMMIT_WRITE_FSET(ch), TSDB_FILE_DATA) +#define TSDB_COMMIT_LAST_FILE(ch) TSDB_DFILE_IN_SET(TSDB_COMMIT_WRITE_FSET(ch), TSDB_FILE_LAST) +#define TSDB_COMMIT_SMAD_FILE(ch) TSDB_DFILE_IN_SET(TSDB_COMMIT_WRITE_FSET(ch), TSDB_FILE_SMAD) +#define TSDB_COMMIT_SMAL_FILE(ch) TSDB_DFILE_IN_SET(TSDB_COMMIT_WRITE_FSET(ch), TSDB_FILE_SMAL) +#define TSDB_COMMIT_BUF(ch) TSDB_READ_BUF(&((ch)->readh)) +#define TSDB_COMMIT_COMP_BUF(ch) TSDB_READ_COMP_BUF(&((ch)->readh)) +#define TSDB_COMMIT_EXBUF(ch) TSDB_READ_EXBUF(&((ch)->readh)) +#define TSDB_COMMIT_DEFAULT_ROWS(ch) TSDB_DEFAULT_BLOCK_ROWS(TSDB_COMMIT_REPO(ch)->config.maxRowsPerFileBlock) +#define TSDB_COMMIT_TXN_VERSION(ch) FS_TXN_VERSION(REPO_FS(TSDB_COMMIT_REPO(ch))) + +static int tsdbCommitMeta(STsdbRepo *pRepo); +static int tsdbUpdateMetaRecord(STsdbFS *pfs, SMFile *pMFile, uint64_t uid, void *cont, int contLen, bool compact); +static int tsdbDropMetaRecord(STsdbFS *pfs, SMFile *pMFile, uint64_t uid); +static int tsdbCompactMetaFile(STsdbRepo *pRepo, STsdbFS *pfs, SMFile *pMFile); +static int tsdbCommitTSData(STsdbRepo *pRepo); +static void tsdbStartCommit(STsdbRepo *pRepo); +static void tsdbEndCommit(STsdbRepo *pRepo, int eno); +static int tsdbCommitToFile(SCommitH *pCommith, SDFileSet *pSet, int fid); +static int tsdbCreateCommitIters(SCommitH *pCommith); +static void tsdbDestroyCommitIters(SCommitH *pCommith); +static void tsdbSeekCommitIter(SCommitH *pCommith, TSKEY key); +static int tsdbInitCommitH(SCommitH *pCommith, STsdbRepo *pRepo); +static void tsdbDestroyCommitH(SCommitH *pCommith); +static int tsdbGetFidLevel(int fid, SRtn *pRtn); +static int tsdbNextCommitFid(SCommitH *pCommith); +static int tsdbCommitToTable(SCommitH *pCommith, int tid); +static int tsdbSetCommitTable(SCommitH *pCommith, STable *pTable); +static int tsdbComparKeyBlock(const void *arg1, const void *arg2); +static int tsdbWriteBlockInfo(SCommitH *pCommih); +static int tsdbCommitMemData(SCommitH *pCommith, SCommitIter *pIter, TSKEY keyLimit, bool toData); +static int tsdbMergeMemData(SCommitH *pCommith, SCommitIter *pIter, int bidx); +static int tsdbMoveBlock(SCommitH *pCommith, int bidx); +static int tsdbCommitAddBlock(SCommitH *pCommith, const SBlock *pSupBlock, const SBlock *pSubBlocks, int nSubBlocks); +static int tsdbMergeBlockData(SCommitH *pCommith, SCommitIter *pIter, SDataCols *pDataCols, TSKEY keyLimit, + bool isLastOneBlock); +static void tsdbResetCommitFile(SCommitH *pCommith); +static void tsdbResetCommitTable(SCommitH *pCommith); +static int tsdbSetAndOpenCommitFile(SCommitH *pCommith, SDFileSet *pSet, int fid); +static void tsdbCloseCommitFile(SCommitH *pCommith, bool hasError); +static bool tsdbCanAddSubBlock(SCommitH *pCommith, SBlock *pBlock, SMergeInfo *pInfo); +static void tsdbLoadAndMergeFromCache(SDataCols *pDataCols, int *iter, SCommitIter *pCommitIter, SDataCols *pTarget, + TSKEY maxKey, int maxRows, int8_t update); + +void *tsdbCommitData(STsdbRepo *pRepo) { + if (pRepo->imem == NULL) { + return NULL; + } + tsdbStartCommit(pRepo); + + // Commit to update meta file + if (tsdbCommitMeta(pRepo) < 0) { + tsdbError("vgId:%d error occurs while committing META data since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + // Create the iterator to read from cache + if (tsdbCommitTSData(pRepo) < 0) { + tsdbError("vgId:%d error occurs while committing TS data since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + tsdbEndCommit(pRepo, TSDB_CODE_SUCCESS); + return NULL; + +_err: + ASSERT(terrno != TSDB_CODE_SUCCESS); + pRepo->code = terrno; + + tsdbEndCommit(pRepo, terrno); + return NULL; +} + +int tsdbApplyRtnOnFSet(STsdbRepo *pRepo, SDFileSet *pSet, SRtn *pRtn) { + SDiskID did; + SDFileSet nSet; + STsdbFS * pfs = REPO_FS(pRepo); + int level; + + ASSERT(pSet->fid >= pRtn->minFid); + + level = tsdbGetFidLevel(pSet->fid, pRtn); + + tfsAllocDisk(level, &(did.level), &(did.id)); + if (did.level == TFS_UNDECIDED_LEVEL) { + terrno = TSDB_CODE_TDB_NO_AVAIL_DISK; + return -1; + } + + if (did.level > TSDB_FSET_LEVEL(pSet)) { + // Need to move the FSET to higher level + tsdbInitDFileSet(&nSet, did, REPO_ID(pRepo), pSet->fid, FS_TXN_VERSION(pfs), pSet->ver); + + if (tsdbCopyDFileSet(pSet, &nSet) < 0) { + tsdbError("vgId:%d failed to copy FSET %d from level %d to level %d since %s", REPO_ID(pRepo), pSet->fid, + TSDB_FSET_LEVEL(pSet), did.level, tstrerror(terrno)); + return -1; + } + + if (tsdbUpdateDFileSet(pfs, &nSet) < 0) { + return -1; + } + + tsdbInfo("vgId:%d FSET %d is copied from level %d disk id %d to level %d disk id %d", REPO_ID(pRepo), pSet->fid, + TSDB_FSET_LEVEL(pSet), TSDB_FSET_ID(pSet), did.level, did.id); + } else { + // On a correct level + if (tsdbUpdateDFileSet(pfs, pSet) < 0) { + return -1; + } + } + + return 0; +} + +int tsdbWriteBlockInfoImpl(SDFile *pHeadf, STable *pTable, SArray *pSupA, SArray *pSubA, void **ppBuf, + SBlockIdx *pIdx) { + size_t nSupBlocks; + size_t nSubBlocks; + uint32_t tlen; + SBlockInfo *pBlkInfo; + int64_t offset; + SBlock * pBlock; + + memset(pIdx, 0, sizeof(*pIdx)); + + nSupBlocks = taosArrayGetSize(pSupA); + nSubBlocks = (pSubA == NULL) ? 0 : taosArrayGetSize(pSubA); + + if (nSupBlocks <= 0) { + // No data (data all deleted) + return 0; + } + + tlen = (uint32_t)(sizeof(SBlockInfo) + sizeof(SBlock) * (nSupBlocks + nSubBlocks) + sizeof(TSCKSUM)); + if (tsdbMakeRoom(ppBuf, tlen) < 0) return -1; + pBlkInfo = *ppBuf; + + pBlkInfo->delimiter = TSDB_FILE_DELIMITER; + pBlkInfo->tid = TABLE_TID(pTable); + pBlkInfo->uid = TABLE_UID(pTable); + + memcpy((void *)(pBlkInfo->blocks), taosArrayGet(pSupA, 0), nSupBlocks * sizeof(SBlock)); + if (nSubBlocks > 0) { + memcpy((void *)(pBlkInfo->blocks + nSupBlocks), taosArrayGet(pSubA, 0), nSubBlocks * sizeof(SBlock)); + + for (int i = 0; i < nSupBlocks; i++) { + pBlock = pBlkInfo->blocks + i; + + if (pBlock->numOfSubBlocks > 1) { + pBlock->offset += (sizeof(SBlockInfo) + sizeof(SBlock) * nSupBlocks); + } + } + } + + taosCalcChecksumAppend(0, (uint8_t *)pBlkInfo, tlen); + + if (tsdbAppendDFile(pHeadf, (void *)pBlkInfo, tlen, &offset) < 0) { + return -1; + } + + tsdbUpdateDFileMagic(pHeadf, POINTER_SHIFT(pBlkInfo, tlen - sizeof(TSCKSUM))); + + // Set pIdx + pBlock = taosArrayGetLast(pSupA); + + pIdx->tid = TABLE_TID(pTable); + pIdx->uid = TABLE_UID(pTable); + pIdx->hasLast = pBlock->last ? 1 : 0; + pIdx->maxKey = pBlock->keyLast; + pIdx->numOfBlocks = (uint32_t)nSupBlocks; + pIdx->len = tlen; + pIdx->offset = (uint32_t)offset; + + return 0; +} + +int tsdbWriteBlockIdx(SDFile *pHeadf, SArray *pIdxA, void **ppBuf) { + SBlockIdx *pBlkIdx; + size_t nidx = taosArrayGetSize(pIdxA); + int tlen = 0, size; + int64_t offset = 0; + + if (nidx <= 0) { + // All data are deleted + pHeadf->info.offset = 0; + pHeadf->info.len = 0; + return 0; + } + + for (size_t i = 0; i < nidx; i++) { + pBlkIdx = (SBlockIdx *)taosArrayGet(pIdxA, i); + + size = tsdbEncodeSBlockIdx(NULL, pBlkIdx); + if (tsdbMakeRoom(ppBuf, tlen + size) < 0) return -1; + + void *ptr = POINTER_SHIFT(*ppBuf, tlen); + tsdbEncodeSBlockIdx(&ptr, pBlkIdx); + + tlen += size; + } + + tlen += sizeof(TSCKSUM); + if (tsdbMakeRoom(ppBuf, tlen) < 0) return -1; + taosCalcChecksumAppend(0, (uint8_t *)(*ppBuf), tlen); + + if (tsdbAppendDFile(pHeadf, *ppBuf, tlen, &offset) < tlen) { + return -1; + } + + tsdbUpdateDFileMagic(pHeadf, POINTER_SHIFT(*ppBuf, tlen - sizeof(TSCKSUM))); + pHeadf->info.offset = (uint32_t)offset; + pHeadf->info.len = tlen; + + return 0; +} + + +// =================== Commit Meta Data +static int tsdbInitCommitMetaFile(STsdbRepo *pRepo, SMFile* pMf, bool open) { + STsdbFS * pfs = REPO_FS(pRepo); + SMFile * pOMFile = pfs->cstatus->pmf; + SDiskID did; + + // Create/Open a meta file or open the existing file + if (pOMFile == NULL) { + // Create a new meta file + did.level = TFS_PRIMARY_LEVEL; + did.id = TFS_PRIMARY_ID; + tsdbInitMFile(pMf, did, REPO_ID(pRepo), FS_TXN_VERSION(REPO_FS(pRepo))); + + if (open && tsdbCreateMFile(pMf, true) < 0) { + tsdbError("vgId:%d failed to create META file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d meta file %s is created to commit", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMf)); + } else { + tsdbInitMFileEx(pMf, pOMFile); + if (open && tsdbOpenMFile(pMf, O_WRONLY) < 0) { + tsdbError("vgId:%d failed to open META file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } + + return 0; +} + +static int tsdbCommitMeta(STsdbRepo *pRepo) { + STsdbFS * pfs = REPO_FS(pRepo); + SMemTable *pMem = pRepo->imem; + SMFile * pOMFile = pfs->cstatus->pmf; + SMFile mf; + SActObj * pAct = NULL; + SActCont * pCont = NULL; + SListNode *pNode = NULL; + + ASSERT(pOMFile != NULL || listNEles(pMem->actList) > 0); + + if (listNEles(pMem->actList) <= 0) { + // no meta data to commit, just keep the old meta file + tsdbUpdateMFile(pfs, pOMFile); + if (tsTsdbMetaCompactRatio > 0) { + if (tsdbInitCommitMetaFile(pRepo, &mf, false) < 0) { + return -1; + } + int ret = tsdbCompactMetaFile(pRepo, pfs, &mf); + if (ret < 0) tsdbError("compact meta file error"); + + return ret; + } + return 0; + } else { + if (tsdbInitCommitMetaFile(pRepo, &mf, true) < 0) { + return -1; + } + } + + // Loop to write + while ((pNode = tdListPopHead(pMem->actList)) != NULL) { + pAct = (SActObj *)pNode->data; + if (pAct->act == TSDB_UPDATE_META) { + pCont = (SActCont *)POINTER_SHIFT(pAct, sizeof(SActObj)); + if (tsdbUpdateMetaRecord(pfs, &mf, pAct->uid, (void *)(pCont->cont), pCont->len, false) < 0) { + tsdbError("vgId:%d failed to update META record, uid %" PRIu64 " since %s", REPO_ID(pRepo), pAct->uid, + tstrerror(terrno)); + tsdbCloseMFile(&mf); + (void)tsdbApplyMFileChange(&mf, pOMFile); + // TODO: need to reload metaCache + return -1; + } + } else if (pAct->act == TSDB_DROP_META) { + if (tsdbDropMetaRecord(pfs, &mf, pAct->uid) < 0) { + tsdbError("vgId:%d failed to drop META record, uid %" PRIu64 " since %s", REPO_ID(pRepo), pAct->uid, + tstrerror(terrno)); + tsdbCloseMFile(&mf); + tsdbApplyMFileChange(&mf, pOMFile); + // TODO: need to reload metaCache + return -1; + } + } else { + ASSERT(false); + } + } + + if (tsdbUpdateMFileHeader(&mf) < 0) { + tsdbError("vgId:%d failed to update META file header since %s, revert it", REPO_ID(pRepo), tstrerror(terrno)); + tsdbApplyMFileChange(&mf, pOMFile); + // TODO: need to reload metaCache + return -1; + } + + TSDB_FILE_FSYNC(&mf); + tsdbCloseMFile(&mf); + tsdbUpdateMFile(pfs, &mf); + + if (tsTsdbMetaCompactRatio > 0 && tsdbCompactMetaFile(pRepo, pfs, &mf) < 0) { + tsdbError("compact meta file error"); + } + + return 0; +} + +int tsdbEncodeKVRecord(void **buf, SKVRecord *pRecord) { + int tlen = 0; + tlen += taosEncodeFixedU64(buf, pRecord->uid); + tlen += taosEncodeFixedI64(buf, pRecord->offset); + tlen += taosEncodeFixedI64(buf, pRecord->size); + + return tlen; +} + +void *tsdbDecodeKVRecord(void *buf, SKVRecord *pRecord) { + buf = taosDecodeFixedU64(buf, &(pRecord->uid)); + buf = taosDecodeFixedI64(buf, &(pRecord->offset)); + buf = taosDecodeFixedI64(buf, &(pRecord->size)); + + return buf; +} + +void tsdbGetRtnSnap(STsdbRepo *pRepo, SRtn *pRtn) { + STsdbCfg *pCfg = REPO_CFG(pRepo); + TSKEY minKey, midKey, maxKey, now; + + now = taosGetTimestamp(pCfg->precision); + minKey = now - pCfg->keep * tsTickPerDay[pCfg->precision]; + midKey = now - pCfg->keep2 * tsTickPerDay[pCfg->precision]; + maxKey = now - pCfg->keep1 * tsTickPerDay[pCfg->precision]; + + pRtn->minKey = minKey; + pRtn->minFid = (int)(TSDB_KEY_FID(minKey, pCfg->daysPerFile, pCfg->precision)); + pRtn->midFid = (int)(TSDB_KEY_FID(midKey, pCfg->daysPerFile, pCfg->precision)); + pRtn->maxFid = (int)(TSDB_KEY_FID(maxKey, pCfg->daysPerFile, pCfg->precision)); + tsdbDebug("vgId:%d now:%" PRId64 " minKey:%" PRId64 " minFid:%d, midFid:%d, maxFid:%d", REPO_ID(pRepo), now, minKey, + pRtn->minFid, pRtn->midFid, pRtn->maxFid); +} + +static int tsdbUpdateMetaRecord(STsdbFS *pfs, SMFile *pMFile, uint64_t uid, void *cont, int contLen, bool compact) { + char buf[64] = "\0"; + void * pBuf = buf; + SKVRecord rInfo; + int64_t offset; + + // Seek to end of meta file + offset = tsdbSeekMFile(pMFile, 0, SEEK_END); + if (offset < 0) { + return -1; + } + + rInfo.offset = offset; + rInfo.uid = uid; + rInfo.size = contLen; + + int tlen = tsdbEncodeKVRecord((void **)(&pBuf), &rInfo); + if (tsdbAppendMFile(pMFile, buf, tlen, NULL) < tlen) { + return -1; + } + + if (tsdbAppendMFile(pMFile, cont, contLen, NULL) < contLen) { + return -1; + } + + tsdbUpdateMFileMagic(pMFile, POINTER_SHIFT(cont, contLen - sizeof(TSCKSUM))); + + SHashObj* cache = compact ? pfs->metaCacheComp : pfs->metaCache; + + pMFile->info.nRecords++; + + SKVRecord *pRecord = taosHashGet(cache, (void *)&uid, sizeof(uid)); + if (pRecord != NULL) { + pMFile->info.tombSize += (pRecord->size + sizeof(SKVRecord)); + } else { + pMFile->info.nRecords++; + } + taosHashPut(cache, (void *)(&uid), sizeof(uid), (void *)(&rInfo), sizeof(rInfo)); + + return 0; +} + +static int tsdbDropMetaRecord(STsdbFS *pfs, SMFile *pMFile, uint64_t uid) { + SKVRecord rInfo = {0}; + char buf[128] = "\0"; + + SKVRecord *pRecord = taosHashGet(pfs->metaCache, (void *)(&uid), sizeof(uid)); + if (pRecord == NULL) { + tsdbError("failed to drop META record with key %" PRIu64 " since not find", uid); + return -1; + } + + rInfo.offset = -pRecord->offset; + rInfo.uid = pRecord->uid; + rInfo.size = pRecord->size; + + void *pBuf = buf; + tsdbEncodeKVRecord(&pBuf, &rInfo); + + if (tsdbAppendMFile(pMFile, buf, sizeof(SKVRecord), NULL) < 0) { + return -1; + } + + pMFile->info.magic = taosCalcChecksum(pMFile->info.magic, (uint8_t *)buf, sizeof(SKVRecord)); + pMFile->info.nDels++; + pMFile->info.nRecords--; + pMFile->info.tombSize += (rInfo.size + sizeof(SKVRecord) * 2); + + taosHashRemove(pfs->metaCache, (void *)(&uid), sizeof(uid)); + return 0; +} + +static int tsdbCompactMetaFile(STsdbRepo *pRepo, STsdbFS *pfs, SMFile *pMFile) { + float delPercent = (float)(pMFile->info.nDels) / (float)(pMFile->info.nRecords); + float tombPercent = (float)(pMFile->info.tombSize) / (float)(pMFile->info.size); + float compactRatio = (float)(tsTsdbMetaCompactRatio)/100; + + if (delPercent < compactRatio && tombPercent < compactRatio) { + return 0; + } + + if (tsdbOpenMFile(pMFile, O_RDONLY) < 0) { + tsdbError("open meta file %s compact fail", pMFile->f.rname); + return -1; + } + + tsdbInfo("begin compact tsdb meta file, ratio:%d, nDels:%" PRId64 ",nRecords:%" PRId64 ",tombSize:%" PRId64 ",size:%" PRId64, + tsTsdbMetaCompactRatio, pMFile->info.nDels,pMFile->info.nRecords,pMFile->info.tombSize,pMFile->info.size); + + SMFile mf; + SDiskID did; + + // first create tmp meta file + did.level = TFS_PRIMARY_LEVEL; + did.id = TFS_PRIMARY_ID; + tsdbInitMFile(&mf, did, REPO_ID(pRepo), FS_TXN_VERSION(REPO_FS(pRepo)) + 1); + + if (tsdbCreateMFile(&mf, true) < 0) { + tsdbError("vgId:%d failed to create META file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d meta file %s is created to compact meta data", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(&mf)); + + // second iterator metaCache + int code = -1; + int64_t maxBufSize = 1024; + SKVRecord *pRecord; + void *pBuf = NULL; + + pBuf = malloc((size_t)maxBufSize); + if (pBuf == NULL) { + goto _err; + } + + // init Comp + assert(pfs->metaCacheComp == NULL); + pfs->metaCacheComp = taosHashInit(4096, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_NO_LOCK); + if (pfs->metaCacheComp == NULL) { + goto _err; + } + + pRecord = taosHashIterate(pfs->metaCache, NULL); + while (pRecord) { + if (tsdbSeekMFile(pMFile, pRecord->offset + sizeof(SKVRecord), SEEK_SET) < 0) { + tsdbError("vgId:%d failed to seek file %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + tstrerror(terrno)); + goto _err; + } + if (pRecord->size > maxBufSize) { + maxBufSize = pRecord->size; + void* tmp = realloc(pBuf, (size_t)maxBufSize); + if (tmp == NULL) { + goto _err; + } + pBuf = tmp; + } + int nread = (int)tsdbReadMFile(pMFile, pBuf, pRecord->size); + if (nread < 0) { + tsdbError("vgId:%d failed to read file %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + tstrerror(terrno)); + goto _err; + } + + if (nread < pRecord->size) { + tsdbError("vgId:%d failed to read file %s since file corrupted, expected read:%" PRId64 " actual read:%d", + REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), pRecord->size, nread); + goto _err; + } + + if (tsdbUpdateMetaRecord(pfs, &mf, pRecord->uid, pBuf, (int)pRecord->size, true) < 0) { + tsdbError("vgId:%d failed to update META record, uid %" PRIu64 " since %s", REPO_ID(pRepo), pRecord->uid, + tstrerror(terrno)); + goto _err; + } + + pRecord = taosHashIterate(pfs->metaCache, pRecord); + } + code = 0; + +_err: + if (code == 0) TSDB_FILE_FSYNC(&mf); + tsdbCloseMFile(&mf); + tsdbCloseMFile(pMFile); + + if (code == 0) { + // rename meta.tmp -> meta + tsdbInfo("vgId:%d meta file rename %s -> %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(&mf), TSDB_FILE_FULL_NAME(pMFile)); + taosRename(mf.f.aname,pMFile->f.aname); + tstrncpy(mf.f.aname, pMFile->f.aname, TSDB_FILENAME_LEN); + tstrncpy(mf.f.rname, pMFile->f.rname, TSDB_FILENAME_LEN); + // update current meta file info + pfs->nstatus->pmf = NULL; + tsdbUpdateMFile(pfs, &mf); + + taosHashCleanup(pfs->metaCache); + pfs->metaCache = pfs->metaCacheComp; + pfs->metaCacheComp = NULL; + } else { + // remove meta.tmp file + remove(mf.f.aname); + taosHashCleanup(pfs->metaCacheComp); + pfs->metaCacheComp = NULL; + } + + tfree(pBuf); + + ASSERT(mf.info.nDels == 0); + ASSERT(mf.info.tombSize == 0); + + tsdbInfo("end compact tsdb meta file,code:%d,nRecords:%" PRId64 ",size:%" PRId64, + code,mf.info.nRecords,mf.info.size); + return code; +} + +// =================== Commit Time-Series Data +static int tsdbCommitTSData(STsdbRepo *pRepo) { + SMemTable *pMem = pRepo->imem; + SCommitH commith; + SDFileSet *pSet = NULL; + int fid; + + memset(&commith, 0, sizeof(commith)); + + if (pMem->numOfRows <= 0) { + // No memory data, just apply retention on each file on disk + if (tsdbApplyRtn(pRepo) < 0) { + return -1; + } + return 0; + } + + // Resource initialization + if (tsdbInitCommitH(&commith, pRepo) < 0) { + return -1; + } + + // Skip expired memory data and expired FSET + tsdbSeekCommitIter(&commith, commith.rtn.minKey); + while ((pSet = tsdbFSIterNext(&(commith.fsIter)))) { + if (pSet->fid < commith.rtn.minFid) { + tsdbInfo("vgId:%d FSET %d on level %d disk id %d expires, remove it", REPO_ID(pRepo), pSet->fid, + TSDB_FSET_LEVEL(pSet), TSDB_FSET_ID(pSet)); + } else { + break; + } + } + + // Loop to commit to each file + fid = tsdbNextCommitFid(&(commith)); + while (true) { + // Loop over both on disk and memory + if (pSet == NULL && fid == TSDB_IVLD_FID) break; + + if (pSet && (fid == TSDB_IVLD_FID || pSet->fid < fid)) { + // Only has existing FSET but no memory data to commit in this + // existing FSET, only check if file in correct retention + if (tsdbApplyRtnOnFSet(pRepo, pSet, &(commith.rtn)) < 0) { + tsdbDestroyCommitH(&commith); + return -1; + } + + pSet = tsdbFSIterNext(&(commith.fsIter)); + } else { + // Has memory data to commit + SDFileSet *pCSet; + int cfid; + + if (pSet == NULL || pSet->fid > fid) { + // Commit to a new FSET with fid: fid + pCSet = NULL; + cfid = fid; + } else { + // Commit to an existing FSET + pCSet = pSet; + cfid = pSet->fid; + pSet = tsdbFSIterNext(&(commith.fsIter)); + } + + if (tsdbCommitToFile(&commith, pCSet, cfid) < 0) { + tsdbDestroyCommitH(&commith); + return -1; + } + + fid = tsdbNextCommitFid(&commith); + } + } + + tsdbDestroyCommitH(&commith); + return 0; +} + +static void tsdbStartCommit(STsdbRepo *pRepo) { + SMemTable *pMem = pRepo->imem; + + ASSERT(pMem->numOfRows > 0 || listNEles(pMem->actList) > 0); + + tsdbInfo("vgId:%d start to commit! keyFirst %" PRId64 " keyLast %" PRId64 " numOfRows %" PRId64 " meta rows: %d", + REPO_ID(pRepo), pMem->keyFirst, pMem->keyLast, pMem->numOfRows, listNEles(pMem->actList)); + + tsdbStartFSTxn(pRepo, pMem->pointsAdd, pMem->storageAdd); + + pRepo->code = TSDB_CODE_SUCCESS; +} + +static void tsdbEndCommit(STsdbRepo *pRepo, int eno) { + if (eno != TSDB_CODE_SUCCESS) { + tsdbEndFSTxnWithError(REPO_FS(pRepo)); + } else { + tsdbEndFSTxn(pRepo); + } + + tsdbInfo("vgId:%d commit over, %s", REPO_ID(pRepo), (eno == TSDB_CODE_SUCCESS) ? "succeed" : "failed"); + + if (pRepo->appH.notifyStatus) pRepo->appH.notifyStatus(pRepo->appH.appH, TSDB_STATUS_COMMIT_OVER, eno); + + SMemTable *pIMem = pRepo->imem; + (void)tsdbLockRepo(pRepo); + pRepo->imem = NULL; + (void)tsdbUnlockRepo(pRepo); + tsdbUnRefMemTable(pRepo, pIMem); + tsem_post(&(pRepo->readyToCommit)); +} + +#if 0 +static bool tsdbHasDataToCommit(SCommitIter *iters, int nIters, TSKEY minKey, TSKEY maxKey) { + for (int i = 0; i < nIters; i++) { + TSKEY nextKey = tsdbNextIterKey((iters + i)->pIter); + if (nextKey != TSDB_DATA_TIMESTAMP_NULL && (nextKey >= minKey && nextKey <= maxKey)) return true; + } + return false; +} +#endif + +static int tsdbCommitToFile(SCommitH *pCommith, SDFileSet *pSet, int fid) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + + ASSERT(pSet == NULL || pSet->fid == fid); + + tsdbResetCommitFile(pCommith); + tsdbGetFidKeyRange(pCfg->daysPerFile, pCfg->precision, fid, &(pCommith->minKey), &(pCommith->maxKey)); + + // Set and open files + if (tsdbSetAndOpenCommitFile(pCommith, pSet, fid) < 0) { + return -1; + } + + // Loop to commit each table data + for (int tid = 1; tid < pCommith->niters; tid++) { + SCommitIter *pIter = pCommith->iters + tid; + + if (pIter->pTable == NULL) continue; + + if (tsdbCommitToTable(pCommith, tid) < 0) { + tsdbCloseCommitFile(pCommith, true); + // revert the file change + tsdbApplyDFileSetChange(TSDB_COMMIT_WRITE_FSET(pCommith), pSet); + return -1; + } + } + + if (tsdbWriteBlockIdx(TSDB_COMMIT_HEAD_FILE(pCommith), pCommith->aBlkIdx, (void **)(&(TSDB_COMMIT_BUF(pCommith)))) < + 0) { + tsdbError("vgId:%d failed to write SBlockIdx part to FSET %d since %s", REPO_ID(pRepo), fid, tstrerror(terrno)); + tsdbCloseCommitFile(pCommith, true); + // revert the file change + tsdbApplyDFileSetChange(TSDB_COMMIT_WRITE_FSET(pCommith), pSet); + return -1; + } + + if (tsdbUpdateDFileSetHeader(&(pCommith->wSet)) < 0) { + tsdbError("vgId:%d failed to update FSET %d header since %s", REPO_ID(pRepo), fid, tstrerror(terrno)); + tsdbCloseCommitFile(pCommith, true); + // revert the file change + tsdbApplyDFileSetChange(TSDB_COMMIT_WRITE_FSET(pCommith), pSet); + return -1; + } + + // Close commit file + tsdbCloseCommitFile(pCommith, false); + + if (tsdbUpdateDFileSet(REPO_FS(pRepo), &(pCommith->wSet)) < 0) { + return -1; + } + + return 0; +} + +static int tsdbCreateCommitIters(SCommitH *pCommith) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + SMemTable *pMem = pRepo->imem; + STsdbMeta *pMeta = pRepo->tsdbMeta; + + pCommith->niters = pMem->maxTables; + pCommith->iters = (SCommitIter *)calloc(pMem->maxTables, sizeof(SCommitIter)); + if (pCommith->iters == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + if (tsdbRLockRepoMeta(pRepo) < 0) return -1; + + // reference all tables + for (int i = 0; i < pMem->maxTables; i++) { + if (pMeta->tables[i] != NULL) { + tsdbRefTable(pMeta->tables[i]); + pCommith->iters[i].pTable = pMeta->tables[i]; + } + } + + if (tsdbUnlockRepoMeta(pRepo) < 0) return -1; + + for (int i = 0; i < pMem->maxTables; i++) { + if ((pCommith->iters[i].pTable != NULL) && (pMem->tData[i] != NULL) && + (TABLE_UID(pCommith->iters[i].pTable) == pMem->tData[i]->uid)) { + if ((pCommith->iters[i].pIter = tSkipListCreateIter(pMem->tData[i]->pData)) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + tSkipListIterNext(pCommith->iters[i].pIter); + } + } + + return 0; +} + +static void tsdbDestroyCommitIters(SCommitH *pCommith) { + if (pCommith->iters == NULL) return; + + for (int i = 1; i < pCommith->niters; i++) { + if (pCommith->iters[i].pTable != NULL) { + tsdbUnRefTable(pCommith->iters[i].pTable); + tSkipListDestroyIter(pCommith->iters[i].pIter); + } + } + + free(pCommith->iters); + pCommith->iters = NULL; + pCommith->niters = 0; +} + +// Skip all keys until key (not included) +static void tsdbSeekCommitIter(SCommitH *pCommith, TSKEY key) { + for (int i = 0; i < pCommith->niters; i++) { + SCommitIter *pIter = pCommith->iters + i; + if (pIter->pTable == NULL || pIter->pIter == NULL) continue; + + tsdbLoadDataFromCache(pIter->pTable, pIter->pIter, key - 1, INT32_MAX, NULL, NULL, 0, true, NULL); + } +} + +static int tsdbInitCommitH(SCommitH *pCommith, STsdbRepo *pRepo) { + STsdbCfg *pCfg = REPO_CFG(pRepo); + + memset(pCommith, 0, sizeof(*pCommith)); + tsdbGetRtnSnap(pRepo, &(pCommith->rtn)); + + TSDB_FSET_SET_CLOSED(TSDB_COMMIT_WRITE_FSET(pCommith)); + + // Init read handle + if (tsdbInitReadH(&(pCommith->readh), pRepo) < 0) { + return -1; + } + + // Init file iterator + tsdbFSIterInit(&(pCommith->fsIter), REPO_FS(pRepo), TSDB_FS_ITER_FORWARD); + + if (tsdbCreateCommitIters(pCommith) < 0) { + tsdbDestroyCommitH(pCommith); + return -1; + } + + pCommith->aBlkIdx = taosArrayInit(1024, sizeof(SBlockIdx)); + if (pCommith->aBlkIdx == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCommitH(pCommith); + return -1; + } + + pCommith->aSupBlk = taosArrayInit(1024, sizeof(SBlock)); + if (pCommith->aSupBlk == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCommitH(pCommith); + return -1; + } + + pCommith->aSubBlk = taosArrayInit(1024, sizeof(SBlock)); + if (pCommith->aSubBlk == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCommitH(pCommith); + return -1; + } + + pCommith->pDataCols = tdNewDataCols(0, pCfg->maxRowsPerFileBlock); + if (pCommith->pDataCols == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCommitH(pCommith); + return -1; + } + + return 0; +} + +static void tsdbDestroyCommitH(SCommitH *pCommith) { + pCommith->pDataCols = tdFreeDataCols(pCommith->pDataCols); + pCommith->aSubBlk = taosArrayDestroy(&pCommith->aSubBlk); + pCommith->aSupBlk = taosArrayDestroy(&pCommith->aSupBlk); + pCommith->aBlkIdx = taosArrayDestroy(&pCommith->aBlkIdx); + tsdbDestroyCommitIters(pCommith); + tsdbDestroyReadH(&(pCommith->readh)); + tsdbCloseDFileSet(TSDB_COMMIT_WRITE_FSET(pCommith)); +} + +static int tsdbNextCommitFid(SCommitH *pCommith) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + int fid = TSDB_IVLD_FID; + + for (int i = 0; i < pCommith->niters; i++) { + SCommitIter *pIter = pCommith->iters + i; + if (pIter->pTable == NULL || pIter->pIter == NULL) continue; + + TSKEY nextKey = tsdbNextIterKey(pIter->pIter); + if (nextKey == TSDB_DATA_TIMESTAMP_NULL) { + continue; + } else { + int tfid = (int)(TSDB_KEY_FID(nextKey, pCfg->daysPerFile, pCfg->precision)); + if (fid == TSDB_IVLD_FID || fid > tfid) { + fid = tfid; // find the least fid + } + } + } + + return fid; +} + +static int tsdbCommitToTable(SCommitH *pCommith, int tid) { + SCommitIter *pIter = pCommith->iters + tid; + TSKEY nextKey = tsdbNextIterKey(pIter->pIter); + + tsdbResetCommitTable(pCommith); + + TSDB_RLOCK_TABLE(pIter->pTable); + + // Set commit table + if (tsdbSetCommitTable(pCommith, pIter->pTable) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + + // No disk data and no memory data, just return + if (pCommith->readh.pBlkIdx == NULL && (nextKey == TSDB_DATA_TIMESTAMP_NULL || nextKey > pCommith->maxKey)) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return 0; + } + + // Must has disk data or has memory data + int nBlocks; + int bidx = 0; + SBlock *pBlock; + + if (pCommith->readh.pBlkIdx) { + if (tsdbLoadBlockInfo(&(pCommith->readh), NULL, NULL) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + + nBlocks = pCommith->readh.pBlkIdx->numOfBlocks; + } else { + nBlocks = 0; + } + + if (bidx < nBlocks) { + pBlock = pCommith->readh.pBlkInfo->blocks + bidx; + } else { + pBlock = NULL; + } + + while (true) { + if (pBlock == NULL && (nextKey == TSDB_DATA_TIMESTAMP_NULL || nextKey > pCommith->maxKey)) break; + + if ((nextKey == TSDB_DATA_TIMESTAMP_NULL || nextKey > pCommith->maxKey) || + (pBlock && (!pBlock->last) && tsdbComparKeyBlock((void *)(&nextKey), pBlock) > 0)) { + if (tsdbMoveBlock(pCommith, bidx) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + + bidx++; + if (bidx < nBlocks) { + pBlock = pCommith->readh.pBlkInfo->blocks + bidx; + } else { + pBlock = NULL; + } + } else if (pBlock && (pBlock->last || tsdbComparKeyBlock((void *)(&nextKey), pBlock) == 0)) { + // merge pBlock data and memory data + if (tsdbMergeMemData(pCommith, pIter, bidx) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + + bidx++; + if (bidx < nBlocks) { + pBlock = pCommith->readh.pBlkInfo->blocks + bidx; + } else { + pBlock = NULL; + } + nextKey = tsdbNextIterKey(pIter->pIter); + } else { + // Only commit memory data + if (pBlock == NULL) { + if (tsdbCommitMemData(pCommith, pIter, pCommith->maxKey, false) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + } else { + if (tsdbCommitMemData(pCommith, pIter, pBlock->keyFirst - 1, true) < 0) { + TSDB_RUNLOCK_TABLE(pIter->pTable); + return -1; + } + } + nextKey = tsdbNextIterKey(pIter->pIter); + } + } + + TSDB_RUNLOCK_TABLE(pIter->pTable); + + if (tsdbWriteBlockInfo(pCommith) < 0) { + tsdbError("vgId:%d failed to write SBlockInfo part into file %s since %s", TSDB_COMMIT_REPO_ID(pCommith), + TSDB_FILE_FULL_NAME(TSDB_COMMIT_HEAD_FILE(pCommith)), tstrerror(terrno)); + return -1; + } + + return 0; +} + +static int tsdbSetCommitTable(SCommitH *pCommith, STable *pTable) { + STSchema *pSchema = tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); + + pCommith->pTable = pTable; + + if (tdInitDataCols(pCommith->pDataCols, pSchema) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + if (pCommith->isRFileSet) { + if (tsdbSetReadTable(&(pCommith->readh), pTable) < 0) { + return -1; + } + } else { + pCommith->readh.pBlkIdx = NULL; + } + return 0; +} + +static int tsdbComparKeyBlock(const void *arg1, const void *arg2) { + TSKEY key = *(TSKEY *)arg1; + SBlock *pBlock = (SBlock *)arg2; + + if (key < pBlock->keyFirst) { + return -1; + } else if (key > pBlock->keyLast) { + return 1; + } else { + return 0; + } +} + +int tsdbWriteBlockImpl(STsdbRepo *pRepo, STable *pTable, SDFile *pDFile, SDFile *pDFileAggr, SDataCols *pDataCols, + SBlock *pBlock, bool isLast, bool isSuper, void **ppBuf, void **ppCBuf, void **ppExBuf) { + STsdbCfg * pCfg = REPO_CFG(pRepo); + SBlockData *pBlockData; + SAggrBlkData *pAggrBlkData = NULL; + int64_t offset = 0, offsetAggr = 0; + int rowsToWrite = pDataCols->numOfRows; + + ASSERT(rowsToWrite > 0 && rowsToWrite <= pCfg->maxRowsPerFileBlock); + ASSERT((!isLast) || rowsToWrite < pCfg->minRowsPerFileBlock); + + // Make buffer space + if (tsdbMakeRoom(ppBuf, tsdbBlockStatisSize(pDataCols->numOfCols, SBlockVerLatest)) < 0) { + return -1; + } + pBlockData = (SBlockData *)(*ppBuf); + + if (tsdbMakeRoom(ppExBuf, tsdbBlockAggrSize(pDataCols->numOfCols, SBlockVerLatest)) < 0) { + return -1; + } + pAggrBlkData = (SAggrBlkData *)(*ppExBuf); + + // Get # of cols not all NULL(not including key column) + int nColsNotAllNull = 0; + for (int ncol = 1; ncol < pDataCols->numOfCols; ncol++) { // ncol from 1, we skip the timestamp column + SDataCol * pDataCol = pDataCols->cols + ncol; + SBlockCol * pBlockCol = pBlockData->cols + nColsNotAllNull; + SAggrBlkCol *pAggrBlkCol = (SAggrBlkCol *)pAggrBlkData + nColsNotAllNull; + + if (isAllRowsNull(pDataCol)) { // all data to commit are NULL, just ignore it + continue; + } + + memset(pBlockCol, 0, sizeof(*pBlockCol)); + memset(pAggrBlkCol, 0, sizeof(*pAggrBlkCol)); + + pBlockCol->colId = pDataCol->colId; + pBlockCol->type = pDataCol->type; + pAggrBlkCol->colId = pDataCol->colId; + + if (tDataTypes[pDataCol->type].statisFunc) { +#if 0 + (*tDataTypes[pDataCol->type].statisFunc)(pDataCol->pData, rowsToWrite, &(pBlockCol->min), &(pBlockCol->max), + &(pBlockCol->sum), &(pBlockCol->minIndex), &(pBlockCol->maxIndex), + &(pBlockCol->numOfNull)); +#endif + (*tDataTypes[pDataCol->type].statisFunc)(pDataCol->pData, rowsToWrite, &(pAggrBlkCol->min), &(pAggrBlkCol->max), + &(pAggrBlkCol->sum), &(pAggrBlkCol->minIndex), &(pAggrBlkCol->maxIndex), + &(pAggrBlkCol->numOfNull)); + } + nColsNotAllNull++; + } + + ASSERT(nColsNotAllNull >= 0 && nColsNotAllNull <= pDataCols->numOfCols); + + // Compress the data if neccessary + int tcol = 0; // counter of not all NULL and written columns + uint32_t toffset = 0; + int32_t tsize = (int32_t)tsdbBlockStatisSize(nColsNotAllNull, SBlockVerLatest); + int32_t lsize = tsize; + int32_t keyLen = 0; + + uint32_t tsizeAggr = (uint32_t)tsdbBlockAggrSize(nColsNotAllNull, SBlockVerLatest); + + for (int ncol = 0; ncol < pDataCols->numOfCols; ncol++) { + // All not NULL columns finish + if (ncol != 0 && tcol >= nColsNotAllNull) break; + + SDataCol * pDataCol = pDataCols->cols + ncol; + SBlockCol *pBlockCol = pBlockData->cols + tcol; + + if (ncol != 0 && (pDataCol->colId != pBlockCol->colId)) continue; + + int32_t flen; // final length + int32_t tlen = dataColGetNEleLen(pDataCol, rowsToWrite); + void * tptr; + + // Make room + if (tsdbMakeRoom(ppBuf, lsize + tlen + COMP_OVERFLOW_BYTES + sizeof(TSCKSUM)) < 0) { + return -1; + } + pBlockData = (SBlockData *)(*ppBuf); + pBlockCol = pBlockData->cols + tcol; + tptr = POINTER_SHIFT(pBlockData, lsize); + + if (pCfg->compression == TWO_STAGE_COMP && + tsdbMakeRoom(ppCBuf, tlen + COMP_OVERFLOW_BYTES) < 0) { + return -1; + } + + // Compress or just copy + if (pCfg->compression) { + flen = (*(tDataTypes[pDataCol->type].compFunc))((char *)pDataCol->pData, tlen, rowsToWrite, tptr, + tlen + COMP_OVERFLOW_BYTES, pCfg->compression, *ppCBuf, + tlen + COMP_OVERFLOW_BYTES); + } else { + flen = tlen; + memcpy(tptr, pDataCol->pData, flen); + } + + // Add checksum + ASSERT(flen > 0); + flen += sizeof(TSCKSUM); + taosCalcChecksumAppend(0, (uint8_t *)tptr, flen); + tsdbUpdateDFileMagic(pDFile, POINTER_SHIFT(tptr, flen - sizeof(TSCKSUM))); + + if (ncol != 0) { + tsdbSetBlockColOffset(pBlockCol, toffset); + pBlockCol->len = flen; + tcol++; + } else { + keyLen = flen; + } + + toffset += flen; + lsize += flen; + } + + pBlockData->delimiter = TSDB_FILE_DELIMITER; + pBlockData->uid = TABLE_UID(pTable); + pBlockData->numOfCols = nColsNotAllNull; + + taosCalcChecksumAppend(0, (uint8_t *)pBlockData, tsize); + tsdbUpdateDFileMagic(pDFile, POINTER_SHIFT(pBlockData, tsize - sizeof(TSCKSUM))); + + // Write the whole block to file + if (tsdbAppendDFile(pDFile, (void *)pBlockData, lsize, &offset) < lsize) { + return -1; + } + + uint32_t aggrStatus = nColsNotAllNull > 0 ? 1 : 0; + if (aggrStatus > 0) { + + taosCalcChecksumAppend(0, (uint8_t *)pAggrBlkData, tsizeAggr); + tsdbUpdateDFileMagic(pDFileAggr, POINTER_SHIFT(pAggrBlkData, tsizeAggr - sizeof(TSCKSUM))); + + // Write the whole block to file + if (tsdbAppendDFile(pDFileAggr, (void *)pAggrBlkData, tsizeAggr, &offsetAggr) < tsizeAggr) { + return -1; + } + } + + // Update pBlock membership variables + pBlock->last = isLast; + pBlock->offset = offset; + pBlock->algorithm = pCfg->compression; + pBlock->numOfRows = rowsToWrite; + pBlock->len = lsize; + pBlock->keyLen = keyLen; + pBlock->numOfSubBlocks = isSuper ? 1 : 0; + pBlock->numOfCols = nColsNotAllNull; + pBlock->keyFirst = dataColsKeyFirst(pDataCols); + pBlock->keyLast = dataColsKeyLast(pDataCols); + // since blkVer1 + pBlock->aggrStat = aggrStatus; + pBlock->blkVer = SBlockVerLatest; + pBlock->aggrOffset = (uint64_t)offsetAggr; + + tsdbDebug("vgId:%d tid:%d a block of data is written to file %s, offset %" PRId64 + " numOfRows %d len %d numOfCols %" PRId16 " keyFirst %" PRId64 " keyLast %" PRId64, + REPO_ID(pRepo), TABLE_TID(pTable), TSDB_FILE_FULL_NAME(pDFile), offset, rowsToWrite, pBlock->len, + pBlock->numOfCols, pBlock->keyFirst, pBlock->keyLast); + + return 0; +} + +static int tsdbWriteBlock(SCommitH *pCommith, SDFile *pDFile, SDataCols *pDataCols, SBlock *pBlock, bool isLast, + bool isSuper) { + return tsdbWriteBlockImpl(TSDB_COMMIT_REPO(pCommith), TSDB_COMMIT_TABLE(pCommith), pDFile, + isLast ? TSDB_COMMIT_SMAL_FILE(pCommith) : TSDB_COMMIT_SMAD_FILE(pCommith), pDataCols, + pBlock, isLast, isSuper, (void **)(&(TSDB_COMMIT_BUF(pCommith))), + (void **)(&(TSDB_COMMIT_COMP_BUF(pCommith))), (void **)(&(TSDB_COMMIT_EXBUF(pCommith)))); +} + +static int tsdbWriteBlockInfo(SCommitH *pCommih) { + SDFile * pHeadf = TSDB_COMMIT_HEAD_FILE(pCommih); + SBlockIdx blkIdx; + STable * pTable = TSDB_COMMIT_TABLE(pCommih); + + if (tsdbWriteBlockInfoImpl(pHeadf, pTable, pCommih->aSupBlk, pCommih->aSubBlk, (void **)(&(TSDB_COMMIT_BUF(pCommih))), + &blkIdx) < 0) { + return -1; + } + + if (blkIdx.numOfBlocks == 0) { + return 0; + } + + if (taosArrayPush(pCommih->aBlkIdx, (void *)(&blkIdx)) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + return 0; +} + +static int tsdbCommitMemData(SCommitH *pCommith, SCommitIter *pIter, TSKEY keyLimit, bool toData) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + SMergeInfo mInfo; + int32_t defaultRows = TSDB_COMMIT_DEFAULT_ROWS(pCommith); + SDFile * pDFile; + bool isLast; + SBlock block; + + while (true) { + tsdbLoadDataFromCache(pIter->pTable, pIter->pIter, keyLimit, defaultRows, pCommith->pDataCols, NULL, 0, + pCfg->update, &mInfo); + + if (pCommith->pDataCols->numOfRows <= 0) break; + + if (toData || pCommith->pDataCols->numOfRows >= pCfg->minRowsPerFileBlock) { + pDFile = TSDB_COMMIT_DATA_FILE(pCommith); + isLast = false; + } else { + pDFile = TSDB_COMMIT_LAST_FILE(pCommith); + isLast = true; + } + + if (tsdbWriteBlock(pCommith, pDFile, pCommith->pDataCols, &block, isLast, true) < 0) return -1; + + if (tsdbCommitAddBlock(pCommith, &block, NULL, 0) < 0) { + return -1; + } + } + + return 0; +} + +static int tsdbMergeMemData(SCommitH *pCommith, SCommitIter *pIter, int bidx) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + int nBlocks = pCommith->readh.pBlkIdx->numOfBlocks; + SBlock * pBlock = pCommith->readh.pBlkInfo->blocks + bidx; + TSKEY keyLimit; + int16_t colId = 0; + SMergeInfo mInfo; + SBlock subBlocks[TSDB_MAX_SUBBLOCKS]; + SBlock block, supBlock; + SDFile * pDFile; + + if (bidx == nBlocks - 1) { + keyLimit = pCommith->maxKey; + } else { + keyLimit = pBlock[1].keyFirst - 1; + } + + SSkipListIterator titer = *(pIter->pIter); + if (tsdbLoadBlockDataCols(&(pCommith->readh), pBlock, NULL, &colId, 1) < 0) return -1; + + tsdbLoadDataFromCache(pIter->pTable, &titer, keyLimit, INT32_MAX, NULL, pCommith->readh.pDCols[0]->cols[0].pData, + pCommith->readh.pDCols[0]->numOfRows, pCfg->update, &mInfo); + + if (mInfo.nOperations == 0) { + // no new data to insert (all updates denied) + if (tsdbMoveBlock(pCommith, bidx) < 0) { + return -1; + } + *(pIter->pIter) = titer; + } else if (pBlock->numOfRows + mInfo.rowsInserted - mInfo.rowsDeleteSucceed == 0) { + // Ignore the block + ASSERT(0); + *(pIter->pIter) = titer; + } else if (tsdbCanAddSubBlock(pCommith, pBlock, &mInfo)) { + // Add a sub-block + tsdbLoadDataFromCache(pIter->pTable, pIter->pIter, keyLimit, INT32_MAX, pCommith->pDataCols, + pCommith->readh.pDCols[0]->cols[0].pData, pCommith->readh.pDCols[0]->numOfRows, pCfg->update, + &mInfo); + if (pBlock->last) { + pDFile = TSDB_COMMIT_LAST_FILE(pCommith); + } else { + pDFile = TSDB_COMMIT_DATA_FILE(pCommith); + } + + if (tsdbWriteBlock(pCommith, pDFile, pCommith->pDataCols, &block, pBlock->last, false) < 0) return -1; + + if (pBlock->numOfSubBlocks == 1) { + subBlocks[0] = *pBlock; + subBlocks[0].numOfSubBlocks = 0; + } else { + memcpy(subBlocks, POINTER_SHIFT(pCommith->readh.pBlkInfo, pBlock->offset), + sizeof(SBlock) * pBlock->numOfSubBlocks); + } + subBlocks[pBlock->numOfSubBlocks] = block; + supBlock = *pBlock; + supBlock.keyFirst = mInfo.keyFirst; + supBlock.keyLast = mInfo.keyLast; + supBlock.numOfSubBlocks++; + supBlock.numOfRows = pBlock->numOfRows + mInfo.rowsInserted - mInfo.rowsDeleteSucceed; + supBlock.offset = taosArrayGetSize(pCommith->aSubBlk) * sizeof(SBlock); + + if (tsdbCommitAddBlock(pCommith, &supBlock, subBlocks, supBlock.numOfSubBlocks) < 0) return -1; + } else { + if (tsdbLoadBlockData(&(pCommith->readh), pBlock, NULL) < 0) return -1; + if (tsdbMergeBlockData(pCommith, pIter, pCommith->readh.pDCols[0], keyLimit, bidx == (nBlocks - 1)) < 0) return -1; + } + + return 0; +} + +static int tsdbMoveBlock(SCommitH *pCommith, int bidx) { + SBlock *pBlock = pCommith->readh.pBlkInfo->blocks + bidx; + SDFile *pDFile; + SBlock block; + bool isSameFile; + + ASSERT(pBlock->numOfSubBlocks > 0); + + if (pBlock->last) { + pDFile = TSDB_COMMIT_LAST_FILE(pCommith); + isSameFile = pCommith->isLFileSame; + } else { + pDFile = TSDB_COMMIT_DATA_FILE(pCommith); + isSameFile = pCommith->isDFileSame; + } + + if (isSameFile) { + if (pBlock->numOfSubBlocks == 1) { + if (tsdbCommitAddBlock(pCommith, pBlock, NULL, 0) < 0) { + return -1; + } + } else { + block = *pBlock; + block.offset = sizeof(SBlock) * taosArrayGetSize(pCommith->aSubBlk); + + if (tsdbCommitAddBlock(pCommith, &block, POINTER_SHIFT(pCommith->readh.pBlkInfo, pBlock->offset), + pBlock->numOfSubBlocks) < 0) { + return -1; + } + } + } else { + if (tsdbLoadBlockData(&(pCommith->readh), pBlock, NULL) < 0) return -1; + if (tsdbWriteBlock(pCommith, pDFile, pCommith->readh.pDCols[0], &block, pBlock->last, true) < 0) return -1; + if (tsdbCommitAddBlock(pCommith, &block, NULL, 0) < 0) return -1; + } + + return 0; +} + +static int tsdbCommitAddBlock(SCommitH *pCommith, const SBlock *pSupBlock, const SBlock *pSubBlocks, int nSubBlocks) { + if (taosArrayPush(pCommith->aSupBlk, pSupBlock) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + if (pSubBlocks && taosArrayAddBatch(pCommith->aSubBlk, pSubBlocks, nSubBlocks) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + return 0; +} + +static int tsdbMergeBlockData(SCommitH *pCommith, SCommitIter *pIter, SDataCols *pDataCols, TSKEY keyLimit, bool isLastOneBlock) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + SBlock block; + SDFile * pDFile; + bool isLast; + int32_t defaultRows = TSDB_COMMIT_DEFAULT_ROWS(pCommith); + + int biter = 0; + while (true) { + tsdbLoadAndMergeFromCache(pCommith->readh.pDCols[0], &biter, pIter, pCommith->pDataCols, keyLimit, defaultRows, + pCfg->update); + + if (pCommith->pDataCols->numOfRows == 0) break; + + if (isLastOneBlock) { + if (pCommith->pDataCols->numOfRows < pCfg->minRowsPerFileBlock) { + pDFile = TSDB_COMMIT_LAST_FILE(pCommith); + isLast = true; + } else { + pDFile = TSDB_COMMIT_DATA_FILE(pCommith); + isLast = false; + } + } else { + pDFile = TSDB_COMMIT_DATA_FILE(pCommith); + isLast = false; + } + + if (tsdbWriteBlock(pCommith, pDFile, pCommith->pDataCols, &block, isLast, true) < 0) return -1; + if (tsdbCommitAddBlock(pCommith, &block, NULL, 0) < 0) return -1; + } + + return 0; +} + +static void tsdbLoadAndMergeFromCache(SDataCols *pDataCols, int *iter, SCommitIter *pCommitIter, SDataCols *pTarget, + TSKEY maxKey, int maxRows, int8_t update) { + TSKEY key1 = INT64_MAX; + TSKEY key2 = INT64_MAX; + STSchema *pSchema = NULL; + + ASSERT(maxRows > 0 && dataColsKeyLast(pDataCols) <= maxKey); + tdResetDataCols(pTarget); + + while (true) { + key1 = (*iter >= pDataCols->numOfRows) ? INT64_MAX : dataColsKeyAt(pDataCols, *iter); + SMemRow row = tsdbNextIterRow(pCommitIter->pIter); + if (row == NULL || memRowKey(row) > maxKey) { + key2 = INT64_MAX; + } else { + key2 = memRowKey(row); + } + + if (key1 == INT64_MAX && key2 == INT64_MAX) break; + + if (key1 < key2) { + for (int i = 0; i < pDataCols->numOfCols; i++) { + //TODO: dataColAppendVal may fail + dataColAppendVal(pTarget->cols + i, tdGetColDataOfRow(pDataCols->cols + i, *iter), pTarget->numOfRows, + pTarget->maxPoints, 0); + } + + pTarget->numOfRows++; + (*iter)++; + } else if (key1 > key2) { + if (pSchema == NULL || schemaVersion(pSchema) != memRowVersion(row)) { + pSchema = + tsdbGetTableSchemaImpl(pCommitIter->pTable, false, false, memRowVersion(row), (int8_t)memRowType(row)); + ASSERT(pSchema != NULL); + } + + tdAppendMemRowToDataCol(row, pSchema, pTarget, true, 0); + + tSkipListIterNext(pCommitIter->pIter); + } else { + if (update != TD_ROW_OVERWRITE_UPDATE) { + //copy disk data + for (int i = 0; i < pDataCols->numOfCols; i++) { + //TODO: dataColAppendVal may fail + dataColAppendVal(pTarget->cols + i, tdGetColDataOfRow(pDataCols->cols + i, *iter), pTarget->numOfRows, + pTarget->maxPoints, 0); + } + + if(update == TD_ROW_DISCARD_UPDATE) pTarget->numOfRows++; + } + if (update != TD_ROW_DISCARD_UPDATE) { + //copy mem data + if (pSchema == NULL || schemaVersion(pSchema) != memRowVersion(row)) { + pSchema = + tsdbGetTableSchemaImpl(pCommitIter->pTable, false, false, memRowVersion(row), (int8_t)memRowType(row)); + ASSERT(pSchema != NULL); + } + + tdAppendMemRowToDataCol(row, pSchema, pTarget, update == TD_ROW_OVERWRITE_UPDATE, + update != TD_ROW_PARTIAL_UPDATE ? 0 : -1); + } + (*iter)++; + tSkipListIterNext(pCommitIter->pIter); + } + + if (pTarget->numOfRows >= maxRows) break; + } +} + +static void tsdbResetCommitFile(SCommitH *pCommith) { + pCommith->isRFileSet = false; + pCommith->isDFileSame = false; + pCommith->isLFileSame = false; + taosArrayClear(pCommith->aBlkIdx); +} + +static void tsdbResetCommitTable(SCommitH *pCommith) { + taosArrayClear(pCommith->aSubBlk); + taosArrayClear(pCommith->aSupBlk); + pCommith->pTable = NULL; +} + +static int tsdbSetAndOpenCommitFile(SCommitH *pCommith, SDFileSet *pSet, int fid) { + SDiskID did; + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + SDFileSet *pWSet = TSDB_COMMIT_WRITE_FSET(pCommith); + + tfsAllocDisk(tsdbGetFidLevel(fid, &(pCommith->rtn)), &(did.level), &(did.id)); + if (did.level == TFS_UNDECIDED_LEVEL) { + terrno = TSDB_CODE_TDB_NO_AVAIL_DISK; + return -1; + } + + // Open read FSET + if (pSet) { + if (tsdbSetAndOpenReadFSet(&(pCommith->readh), pSet) < 0) { + return -1; + } + + pCommith->isRFileSet = true; + + if (tsdbLoadBlockIdx(&(pCommith->readh)) < 0) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + + tsdbDebug("vgId:%d FSET %d at level %d disk id %d is opened to read to commit", REPO_ID(pRepo), TSDB_FSET_FID(pSet), + TSDB_FSET_LEVEL(pSet), TSDB_FSET_ID(pSet)); + } else { + pCommith->isRFileSet = false; + } + + // Set and open commit FSET + if (pSet == NULL || did.level > TSDB_FSET_LEVEL(pSet)) { + // Create a new FSET to write data + tsdbInitDFileSet(pWSet, did, REPO_ID(pRepo), fid, FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_LATEST_FSET_VER); + + if (tsdbCreateDFileSet(pWSet, true) < 0) { + tsdbError("vgId:%d failed to create FSET %d at level %d disk id %d since %s", REPO_ID(pRepo), + TSDB_FSET_FID(pWSet), TSDB_FSET_LEVEL(pWSet), TSDB_FSET_ID(pWSet), tstrerror(terrno)); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + } + return -1; + } + + pCommith->isDFileSame = false; + pCommith->isLFileSame = false; + + tsdbDebug("vgId:%d FSET %d at level %d disk id %d is created to commit", REPO_ID(pRepo), TSDB_FSET_FID(pWSet), + TSDB_FSET_LEVEL(pWSet), TSDB_FSET_ID(pWSet)); + } else { + did.level = TSDB_FSET_LEVEL(pSet); + did.id = TSDB_FSET_ID(pSet); + + pCommith->wSet.fid = fid; + pCommith->wSet.state = 0; + pCommith->wSet.ver = TSDB_LATEST_FSET_VER; + + // TSDB_FILE_HEAD + SDFile *pWHeadf = TSDB_COMMIT_HEAD_FILE(pCommith); + tsdbInitDFile(pWHeadf, did, REPO_ID(pRepo), fid, FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_FILE_HEAD); + if (tsdbCreateDFile(pWHeadf, true, TSDB_FILE_HEAD) < 0) { + tsdbError("vgId:%d failed to create file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWHeadf), + tstrerror(terrno)); + + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + + // TSDB_FILE_DATA + SDFile *pRDataf = TSDB_READ_DATA_FILE(&(pCommith->readh)); + SDFile *pWDataf = TSDB_COMMIT_DATA_FILE(pCommith); + tsdbInitDFileEx(pWDataf, pRDataf); + if (tsdbOpenDFile(pWDataf, O_WRONLY) < 0) { + tsdbError("vgId:%d failed to open file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWDataf), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + pCommith->isDFileSame = true; + + // TSDB_FILE_LAST + SDFile *pRLastf = TSDB_READ_LAST_FILE(&(pCommith->readh)); + SDFile *pWLastf = TSDB_COMMIT_LAST_FILE(pCommith); + if (pRLastf->info.size < 32 * 1024) { + tsdbInitDFileEx(pWLastf, pRLastf); + pCommith->isLFileSame = true; + + if (tsdbOpenDFile(pWLastf, O_WRONLY) < 0) { + tsdbError("vgId:%d failed to open file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWLastf), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } else { + tsdbInitDFile(pWLastf, did, REPO_ID(pRepo), fid, FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_FILE_LAST); + pCommith->isLFileSame = false; + + if (tsdbCreateDFile(pWLastf, true, TSDB_FILE_LAST) < 0) { + tsdbError("vgId:%d failed to create file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWLastf), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + (void)tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } + + // TSDB_FILE_SMAD + SDFile *pRSmadF = TSDB_READ_SMAD_FILE(&(pCommith->readh)); + SDFile *pWSmadF = TSDB_COMMIT_SMAD_FILE(pCommith); + + if (access(TSDB_FILE_FULL_NAME(pRSmadF), F_OK) != 0) { + tsdbDebug("vgId:%d create data file %s as not exist", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pRSmadF)); + tsdbInitDFile(pWSmadF, did, REPO_ID(pRepo), fid, FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_FILE_SMAD); + + if (tsdbCreateDFile(pWSmadF, true, TSDB_FILE_SMAD) < 0) { + tsdbError("vgId:%d failed to create file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWSmadF), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + (void)tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } else { + tsdbInitDFileEx(pWSmadF, pRSmadF); + if (tsdbOpenDFile(pWSmadF, O_RDWR) < 0) { + tsdbError("vgId:%d failed to open file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWSmadF), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } + + // TSDB_FILE_SMAL + ASSERT(tsdbGetNFiles(pWSet) >= TSDB_FILE_SMAL); + SDFile *pRSmalF = TSDB_READ_SMAL_FILE(&(pCommith->readh)); + SDFile *pWSmalF = TSDB_COMMIT_SMAL_FILE(pCommith); + + if ((pCommith->isLFileSame) && access(TSDB_FILE_FULL_NAME(pRSmalF), F_OK) == 0) { + tsdbInitDFileEx(pWSmalF, pRSmalF); + if (tsdbOpenDFile(pWSmalF, O_RDWR) < 0) { + tsdbError("vgId:%d failed to open file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWSmalF), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } else { + tsdbDebug("vgId:%d create data file %s as not exist", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pRSmalF)); + tsdbInitDFile(pWSmalF, did, REPO_ID(pRepo), fid, FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_FILE_SMAL); + + if (tsdbCreateDFile(pWSmalF, true, TSDB_FILE_SMAL) < 0) { + tsdbError("vgId:%d failed to create file %s to commit since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pWSmalF), + tstrerror(terrno)); + + tsdbCloseDFileSet(pWSet); + (void)tsdbRemoveDFile(pWHeadf); + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + return -1; + } + } + } + } + + return 0; +} + +static void tsdbCloseCommitFile(SCommitH *pCommith, bool hasError) { + if (pCommith->isRFileSet) { + tsdbCloseAndUnsetFSet(&(pCommith->readh)); + } + + if (!hasError) { + TSDB_FSET_FSYNC(TSDB_COMMIT_WRITE_FSET(pCommith)); + } + tsdbCloseDFileSet(TSDB_COMMIT_WRITE_FSET(pCommith)); +} + +static bool tsdbCanAddSubBlock(SCommitH *pCommith, SBlock *pBlock, SMergeInfo *pInfo) { + STsdbRepo *pRepo = TSDB_COMMIT_REPO(pCommith); + STsdbCfg * pCfg = REPO_CFG(pRepo); + int mergeRows = pBlock->numOfRows + pInfo->rowsInserted - pInfo->rowsDeleteSucceed; + + ASSERT(mergeRows > 0); + + if (pBlock->numOfSubBlocks < TSDB_MAX_SUBBLOCKS && pInfo->nOperations <= pCfg->maxRowsPerFileBlock) { + if (pBlock->last) { + if (pCommith->isLFileSame && mergeRows < pCfg->minRowsPerFileBlock) return true; + } else { + if (pCommith->isDFileSame && mergeRows <= pCfg->maxRowsPerFileBlock) return true; + } + } + + return false; +} + +int tsdbApplyRtn(STsdbRepo *pRepo) { + SRtn rtn; + SFSIter fsiter; + STsdbFS * pfs = REPO_FS(pRepo); + SDFileSet *pSet; + + // Get retention snapshot + tsdbGetRtnSnap(pRepo, &rtn); + + tsdbFSIterInit(&fsiter, pfs, TSDB_FS_ITER_FORWARD); + while ((pSet = tsdbFSIterNext(&fsiter))) { + if (pSet->fid < rtn.minFid) { + tsdbInfo("vgId:%d FSET %d at level %d disk id %d expires, remove it", REPO_ID(pRepo), pSet->fid, + TSDB_FSET_LEVEL(pSet), TSDB_FSET_ID(pSet)); + continue; + } + + if (tsdbApplyRtnOnFSet(pRepo, pSet, &rtn) < 0) { + return -1; + } + } + + return 0; +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbCommitQueue.c b/source/dnode/vnode/tsdb2/src/tsdbCommitQueue.c new file mode 100644 index 0000000000..dccb85af55 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbCommitQueue.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbint.h" + +typedef struct { + bool stop; + pthread_mutex_t lock; + pthread_cond_t queueNotEmpty; + int nthreads; + int refCount; + SList * queue; + pthread_t * threads; +} SCommitQueue; + +typedef struct { + TSDB_REQ_T req; + STsdbRepo *pRepo; +} SReq; + +static void *tsdbLoopCommit(void *arg); + +static SCommitQueue tsCommitQueue = {0}; + +int tsdbInitCommitQueue() { + int nthreads = tsNumOfCommitThreads; + SCommitQueue *pQueue = &tsCommitQueue; + + if (nthreads < 1) nthreads = 1; + + pQueue->stop = false; + pQueue->nthreads = nthreads; + + pQueue->queue = tdListNew(0); + if (pQueue->queue == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + pQueue->threads = (pthread_t *)calloc(nthreads, sizeof(pthread_t)); + if (pQueue->threads == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tdListFree(pQueue->queue); + return -1; + } + + pthread_mutex_init(&(pQueue->lock), NULL); + pthread_cond_init(&(pQueue->queueNotEmpty), NULL); + + for (int i = 0; i < nthreads; i++) { + pthread_create(pQueue->threads + i, NULL, tsdbLoopCommit, NULL); + } + + return 0; +} + +void tsdbDestroyCommitQueue() { + SCommitQueue *pQueue = &tsCommitQueue; + + pthread_mutex_lock(&(pQueue->lock)); + + if (pQueue->stop) { + pthread_mutex_unlock(&(pQueue->lock)); + return; + } + + pQueue->stop = true; + pthread_cond_broadcast(&(pQueue->queueNotEmpty)); + + pthread_mutex_unlock(&(pQueue->lock)); + + for (size_t i = 0; i < pQueue->nthreads; i++) { + pthread_join(pQueue->threads[i], NULL); + } + + free(pQueue->threads); + tdListFree(pQueue->queue); + pthread_cond_destroy(&(pQueue->queueNotEmpty)); + pthread_mutex_destroy(&(pQueue->lock)); +} + +int tsdbScheduleCommit(STsdbRepo *pRepo, TSDB_REQ_T req) { + SCommitQueue *pQueue = &tsCommitQueue; + + SListNode *pNode = (SListNode *)calloc(1, sizeof(SListNode) + sizeof(SReq)); + if (pNode == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + ((SReq *)pNode->data)->req = req; + ((SReq *)pNode->data)->pRepo = pRepo; + + pthread_mutex_lock(&(pQueue->lock)); + + // ASSERT(pQueue->stop); + + tdListAppendNode(pQueue->queue, pNode); + pthread_cond_signal(&(pQueue->queueNotEmpty)); + + pthread_mutex_unlock(&(pQueue->lock)); + return 0; +} + +static void tsdbApplyRepoConfig(STsdbRepo *pRepo) { + pthread_mutex_lock(&pRepo->save_mutex); + + pRepo->config_changed = false; + STsdbCfg * pSaveCfg = &pRepo->save_config; + STsdbCfg oldCfg; + int32_t oldTotalBlocks = pRepo->config.totalBlocks; + + memcpy(&oldCfg, &(pRepo->config), sizeof(STsdbCfg)); + + pRepo->config.compression = pRepo->save_config.compression; + pRepo->config.keep = pRepo->save_config.keep; + pRepo->config.keep1 = pRepo->save_config.keep1; + pRepo->config.keep2 = pRepo->save_config.keep2; + pRepo->config.cacheLastRow = pRepo->save_config.cacheLastRow; + pRepo->config.totalBlocks = pRepo->save_config.totalBlocks; + + pthread_mutex_unlock(&pRepo->save_mutex); + + tsdbInfo("vgId:%d apply new config: compression(%d), keep(%d,%d,%d), totalBlocks(%d), cacheLastRow(%d->%d),totalBlocks(%d->%d)", + REPO_ID(pRepo), + pSaveCfg->compression, pSaveCfg->keep,pSaveCfg->keep1, pSaveCfg->keep2, + pSaveCfg->totalBlocks, oldCfg.cacheLastRow, pSaveCfg->cacheLastRow, oldTotalBlocks, pSaveCfg->totalBlocks); + + int err = tsdbExpandPool(pRepo, oldTotalBlocks); + if (!TAOS_SUCCEEDED(err)) { + tsdbError("vgId:%d expand pool from %d to %d fail,reason:%s", + REPO_ID(pRepo), oldTotalBlocks, pSaveCfg->totalBlocks, tstrerror(err)); + } + + if (oldCfg.cacheLastRow != pRepo->config.cacheLastRow) { + if (tsdbLockRepo(pRepo) < 0) return; + // tsdbCacheLastData(pRepo, &oldCfg); + // lazy load last cache when query or update + ++pRepo->cacheLastConfigVersion; + tsdbUnlockRepo(pRepo); + } + +} + +static void *tsdbLoopCommit(void *arg) { + SCommitQueue *pQueue = &tsCommitQueue; + SListNode * pNode = NULL; + STsdbRepo * pRepo = NULL; + TSDB_REQ_T req; + + setThreadName("tsdbCommit"); + + while (true) { + pthread_mutex_lock(&(pQueue->lock)); + + while (true) { + pNode = tdListPopHead(pQueue->queue); + if (pNode == NULL) { + if (pQueue->stop && pQueue->refCount <= 0) { + pthread_mutex_unlock(&(pQueue->lock)); + goto _exit; + } else { + pthread_cond_wait(&(pQueue->queueNotEmpty), &(pQueue->lock)); + } + } else { + break; + } + } + + pthread_mutex_unlock(&(pQueue->lock)); + + req = ((SReq *)pNode->data)->req; + pRepo = ((SReq *)pNode->data)->pRepo; + + if (req == COMMIT_REQ) { + tsdbCommitData(pRepo); + } else if (req == COMPACT_REQ) { + tsdbCompactImpl(pRepo); + } else if (req == COMMIT_CONFIG_REQ) { + ASSERT(pRepo->config_changed); + tsdbApplyRepoConfig(pRepo); + tsem_post(&(pRepo->readyToCommit)); + } else { + ASSERT(0); + } + + listNodeFree(pNode); + } + +_exit: + return NULL; +} + +void tsdbIncCommitRef(int vgId) { + int refCount = atomic_add_fetch_32(&tsCommitQueue.refCount, 1); + tsdbDebug("vgId:%d, inc commit queue ref to %d", vgId, refCount); +} + +void tsdbDecCommitRef(int vgId) { + int refCount = atomic_sub_fetch_32(&tsCommitQueue.refCount, 1); + pthread_cond_broadcast(&(tsCommitQueue.queueNotEmpty)); + tsdbDebug("vgId:%d, dec commit queue ref to %d", vgId, refCount); +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbCompact.c b/source/dnode/vnode/tsdb2/src/tsdbCompact.c new file mode 100644 index 0000000000..f25c0bd981 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbCompact.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#include "tsdbint.h" + +typedef struct { + STable * pTable; + SBlockIdx * pBlkIdx; + SBlockIdx bindex; + SBlockInfo *pInfo; +} STableCompactH; + +typedef struct { + SRtn rtn; + SFSIter fsIter; + SArray * tbArray; // table array to cache table obj and block indexes + SReadH readh; + SDFileSet wSet; + SArray * aBlkIdx; + SArray * aSupBlk; + SDataCols *pDataCols; +} SCompactH; + +#define TSDB_COMPACT_WSET(pComph) (&((pComph)->wSet)) +#define TSDB_COMPACT_REPO(pComph) TSDB_READ_REPO(&((pComph)->readh)) +#define TSDB_COMPACT_HEAD_FILE(pComph) TSDB_DFILE_IN_SET(TSDB_COMPACT_WSET(pComph), TSDB_FILE_HEAD) +#define TSDB_COMPACT_DATA_FILE(pComph) TSDB_DFILE_IN_SET(TSDB_COMPACT_WSET(pComph), TSDB_FILE_DATA) +#define TSDB_COMPACT_LAST_FILE(pComph) TSDB_DFILE_IN_SET(TSDB_COMPACT_WSET(pComph), TSDB_FILE_LAST) +#define TSDB_COMPACT_SMAD_FILE(pComph) TSDB_DFILE_IN_SET(TSDB_COMPACT_WSET(pComph), TSDB_FILE_SMAD) +#define TSDB_COMPACT_SMAL_FILE(pComph) TSDB_DFILE_IN_SET(TSDB_COMPACT_WSET(pComph), TSDB_FILE_SMAL) +#define TSDB_COMPACT_BUF(pComph) TSDB_READ_BUF(&((pComph)->readh)) +#define TSDB_COMPACT_COMP_BUF(pComph) TSDB_READ_COMP_BUF(&((pComph)->readh)) +#define TSDB_COMPACT_EXBUF(pComph) TSDB_READ_EXBUF(&((pComph)->readh)) + +static int tsdbAsyncCompact(STsdbRepo *pRepo); +static void tsdbStartCompact(STsdbRepo *pRepo); +static void tsdbEndCompact(STsdbRepo *pRepo, int eno); +static int tsdbCompactMeta(STsdbRepo *pRepo); +static int tsdbCompactTSData(STsdbRepo *pRepo); +static int tsdbCompactFSet(SCompactH *pComph, SDFileSet *pSet); +static bool tsdbShouldCompact(SCompactH *pComph); +static int tsdbInitCompactH(SCompactH *pComph, STsdbRepo *pRepo); +static void tsdbDestroyCompactH(SCompactH *pComph); +static int tsdbInitCompTbArray(SCompactH *pComph); +static void tsdbDestroyCompTbArray(SCompactH *pComph); +static int tsdbCacheFSetIndex(SCompactH *pComph); +static int tsdbCompactFSetInit(SCompactH *pComph, SDFileSet *pSet); +static void tsdbCompactFSetEnd(SCompactH *pComph); +static int tsdbCompactFSetImpl(SCompactH *pComph); +static int tsdbWriteBlockToRightFile(SCompactH *pComph, STable *pTable, SDataCols *pDataCols, void **ppBuf, + void **ppCBuf, void **ppExBuf); + +enum { TSDB_NO_COMPACT, TSDB_IN_COMPACT, TSDB_WAITING_COMPACT}; +int tsdbCompact(STsdbRepo *pRepo) { return tsdbAsyncCompact(pRepo); } + +void *tsdbCompactImpl(STsdbRepo *pRepo) { + // Check if there are files in TSDB FS to compact + if (REPO_FS(pRepo)->cstatus->pmf == NULL) { + pRepo->compactState = TSDB_NO_COMPACT; + tsem_post(&(pRepo->readyToCommit)); + tsdbInfo("vgId:%d compact over, no file to compact in FS", REPO_ID(pRepo)); + return NULL; + } + + tsdbStartCompact(pRepo); + + if (tsdbCompactMeta(pRepo) < 0) { + tsdbError("vgId:%d failed to compact META data since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + if (tsdbCompactTSData(pRepo) < 0) { + tsdbError("vgId:%d failed to compact TS data since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + tsdbEndCompact(pRepo, TSDB_CODE_SUCCESS); + return NULL; + +_err: + pRepo->code = terrno; + tsdbEndCompact(pRepo, terrno); + return NULL; +} + +static int tsdbAsyncCompact(STsdbRepo *pRepo) { + if (pRepo->compactState != TSDB_NO_COMPACT) { + tsdbInfo("vgId:%d not compact tsdb again ", REPO_ID(pRepo)); + return 0; + } + pRepo->compactState = TSDB_WAITING_COMPACT; + tsem_wait(&(pRepo->readyToCommit)); + return tsdbScheduleCommit(pRepo, COMPACT_REQ); +} + +static void tsdbStartCompact(STsdbRepo *pRepo) { + assert(pRepo->compactState != TSDB_IN_COMPACT); + tsdbInfo("vgId:%d start to compact!", REPO_ID(pRepo)); + tsdbStartFSTxn(pRepo, 0, 0); + pRepo->code = TSDB_CODE_SUCCESS; + pRepo->compactState = TSDB_IN_COMPACT; +} + +static void tsdbEndCompact(STsdbRepo *pRepo, int eno) { + if (eno != TSDB_CODE_SUCCESS) { + tsdbEndFSTxnWithError(REPO_FS(pRepo)); + } else { + tsdbEndFSTxn(pRepo); + } + pRepo->compactState = TSDB_NO_COMPACT; + tsdbInfo("vgId:%d compact over, %s", REPO_ID(pRepo), (eno == TSDB_CODE_SUCCESS) ? "succeed" : "failed"); + tsem_post(&(pRepo->readyToCommit)); +} + +static int tsdbCompactMeta(STsdbRepo *pRepo) { + STsdbFS *pfs = REPO_FS(pRepo); + tsdbUpdateMFile(pfs, pfs->cstatus->pmf); + return 0; +} + + static int tsdbCompactTSData(STsdbRepo *pRepo) { + SCompactH compactH; + SDFileSet *pSet = NULL; + + tsdbDebug("vgId:%d start to compact TS data", REPO_ID(pRepo)); + + // If no file, just return 0; + if (taosArrayGetSize(REPO_FS(pRepo)->cstatus->df) <= 0) { + tsdbDebug("vgId:%d no TS data file to compact, compact over", REPO_ID(pRepo)); + return 0; + } + + if (tsdbInitCompactH(&compactH, pRepo) < 0) { + return -1; + } + + while ((pSet = tsdbFSIterNext(&(compactH.fsIter)))) { + // Remove those expired files + if (pSet->fid < compactH.rtn.minFid) { + tsdbInfo("vgId:%d FSET %d on level %d disk id %d expires, remove it", REPO_ID(pRepo), pSet->fid, + TSDB_FSET_LEVEL(pSet), TSDB_FSET_ID(pSet)); + continue; + } + + if (TSDB_FSET_LEVEL(pSet) == TFS_MAX_LEVEL) { + tsdbDebug("vgId:%d FSET %d on level %d, should not compact", REPO_ID(pRepo), pSet->fid, TFS_MAX_LEVEL); + tsdbUpdateDFileSet(REPO_FS(pRepo), pSet); + continue; + } + + if (tsdbCompactFSet(&compactH, pSet) < 0) { + tsdbDestroyCompactH(&compactH); + tsdbError("vgId:%d failed to compact FSET %d since %s", REPO_ID(pRepo), pSet->fid, tstrerror(terrno)); + return -1; + } + } + + tsdbDestroyCompactH(&compactH); + tsdbDebug("vgId:%d compact TS data over", REPO_ID(pRepo)); + return 0; + } + + static int tsdbCompactFSet(SCompactH *pComph, SDFileSet *pSet) { + STsdbRepo *pRepo = TSDB_COMPACT_REPO(pComph); + SDiskID did; + + tsdbDebug("vgId:%d start to compact FSET %d on level %d id %d", REPO_ID(pRepo), pSet->fid, TSDB_FSET_LEVEL(pSet), + TSDB_FSET_ID(pSet)); + + if (tsdbCompactFSetInit(pComph, pSet) < 0) { + return -1; + } + + if (!tsdbShouldCompact(pComph)) { + tsdbDebug("vgId:%d no need to compact FSET %d", REPO_ID(pRepo), pSet->fid); + if (tsdbApplyRtnOnFSet(TSDB_COMPACT_REPO(pComph), pSet, &(pComph->rtn)) < 0) { + tsdbCompactFSetEnd(pComph); + return -1; + } + } else { + // Create new fset as compacted fset + tfsAllocDisk(tsdbGetFidLevel(pSet->fid, &(pComph->rtn)), &(did.level), &(did.id)); + if (did.level == TFS_UNDECIDED_LEVEL) { + terrno = TSDB_CODE_TDB_NO_AVAIL_DISK; + tsdbError("vgId:%d failed to compact FSET %d since %s", REPO_ID(pRepo), pSet->fid, tstrerror(terrno)); + tsdbCompactFSetEnd(pComph); + return -1; + } + + tsdbInitDFileSet(TSDB_COMPACT_WSET(pComph), did, REPO_ID(pRepo), TSDB_FSET_FID(pSet), + FS_TXN_VERSION(REPO_FS(pRepo)), TSDB_LATEST_FSET_VER); + if (tsdbCreateDFileSet(TSDB_COMPACT_WSET(pComph), true) < 0) { + tsdbError("vgId:%d failed to compact FSET %d since %s", REPO_ID(pRepo), pSet->fid, tstrerror(terrno)); + tsdbCompactFSetEnd(pComph); + return -1; + } + + if (tsdbCompactFSetImpl(pComph) < 0) { + tsdbCloseDFileSet(TSDB_COMPACT_WSET(pComph)); + tsdbRemoveDFileSet(TSDB_COMPACT_WSET(pComph)); + tsdbCompactFSetEnd(pComph); + return -1; + } + + tsdbCloseDFileSet(TSDB_COMPACT_WSET(pComph)); + tsdbUpdateDFileSet(REPO_FS(pRepo), TSDB_COMPACT_WSET(pComph)); + tsdbDebug("vgId:%d FSET %d compact over", REPO_ID(pRepo), pSet->fid); + } + + tsdbCompactFSetEnd(pComph); + return 0; + } + + static bool tsdbShouldCompact(SCompactH *pComph) { + if (tsdbForceCompactFile) { + return true; + } + STsdbRepo * pRepo = TSDB_COMPACT_REPO(pComph); + STsdbCfg * pCfg = REPO_CFG(pRepo); + SReadH * pReadh = &(pComph->readh); + STableCompactH *pTh; + SBlock * pBlock; + int defaultRows = TSDB_DEFAULT_BLOCK_ROWS(pCfg->maxRowsPerFileBlock); + SDFile * pDataF = TSDB_READ_DATA_FILE(pReadh); + SDFile * pLastF = TSDB_READ_LAST_FILE(pReadh); + + int tblocks = 0; // total blocks + int nSubBlocks = 0; // # of blocks with sub-blocks + int nSmallBlocks = 0; // # of blocks with rows < defaultRows + int64_t tsize = 0; + + for (size_t i = 0; i < taosArrayGetSize(pComph->tbArray); i++) { + pTh = (STableCompactH *)taosArrayGet(pComph->tbArray, i); + + if (pTh->pTable == NULL || pTh->pBlkIdx == NULL) continue; + + for (size_t bidx = 0; bidx < pTh->pBlkIdx->numOfBlocks; bidx++) { + tblocks++; + pBlock = pTh->pInfo->blocks + bidx; + + if (pBlock->numOfRows < defaultRows) { + nSmallBlocks++; + } + + if (pBlock->numOfSubBlocks > 1) { + nSubBlocks++; + for (int k = 0; k < pBlock->numOfSubBlocks; k++) { + SBlock *iBlock = ((SBlock *)POINTER_SHIFT(pTh->pInfo, pBlock->offset)) + k; + tsize = tsize + iBlock->len; + } + } else if (pBlock->numOfSubBlocks == 1) { + tsize += pBlock->len; + } else { + ASSERT(0); + } + } + } + + return (((nSubBlocks * 1.0 / tblocks) > 0.33) || ((nSmallBlocks * 1.0 / tblocks) > 0.33) || + (tsize * 1.0 / (pDataF->info.size + pLastF->info.size - 2 * TSDB_FILE_HEAD_SIZE) < 0.85)); + } + + static int tsdbInitCompactH(SCompactH *pComph, STsdbRepo *pRepo) { + STsdbCfg *pCfg = REPO_CFG(pRepo); + + memset(pComph, 0, sizeof(*pComph)); + + TSDB_FSET_SET_CLOSED(TSDB_COMPACT_WSET(pComph)); + + tsdbGetRtnSnap(pRepo, &(pComph->rtn)); + tsdbFSIterInit(&(pComph->fsIter), REPO_FS(pRepo), TSDB_FS_ITER_FORWARD); + + if (tsdbInitReadH(&(pComph->readh), pRepo) < 0) { + return -1; + } + + if (tsdbInitCompTbArray(pComph) < 0) { + tsdbDestroyCompactH(pComph); + return -1; + } + + pComph->aBlkIdx = taosArrayInit(1024, sizeof(SBlockIdx)); + if (pComph->aBlkIdx == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCompactH(pComph); + return -1; + } + + pComph->aSupBlk = taosArrayInit(1024, sizeof(SBlock)); + if (pComph->aSupBlk == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCompactH(pComph); + return -1; + } + + pComph->pDataCols = tdNewDataCols(0, pCfg->maxRowsPerFileBlock); + if (pComph->pDataCols == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyCompactH(pComph); + return -1; + } + + return 0; + } + + static void tsdbDestroyCompactH(SCompactH *pComph) { + pComph->pDataCols = tdFreeDataCols(pComph->pDataCols); + pComph->aSupBlk = taosArrayDestroy(&pComph->aSupBlk); + pComph->aBlkIdx = taosArrayDestroy(&pComph->aBlkIdx); + tsdbDestroyCompTbArray(pComph); + tsdbDestroyReadH(&(pComph->readh)); + tsdbCloseDFileSet(TSDB_COMPACT_WSET(pComph)); + } + + static int tsdbInitCompTbArray(SCompactH *pComph) { // Init pComp->tbArray + STsdbRepo *pRepo = TSDB_COMPACT_REPO(pComph); + STsdbMeta *pMeta = pRepo->tsdbMeta; + + if (tsdbRLockRepoMeta(pRepo) < 0) return -1; + + pComph->tbArray = taosArrayInit(pMeta->maxTables, sizeof(STableCompactH)); + if (pComph->tbArray == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbUnlockRepoMeta(pRepo); + return -1; + } + + // Note here must start from 0 + for (int i = 0; i < pMeta->maxTables; i++) { + STableCompactH ch = {0}; + if (pMeta->tables[i] != NULL) { + tsdbRefTable(pMeta->tables[i]); + ch.pTable = pMeta->tables[i]; + } + + if (taosArrayPush(pComph->tbArray, &ch) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbUnlockRepoMeta(pRepo); + return -1; + } + } + + if (tsdbUnlockRepoMeta(pRepo) < 0) return -1; + return 0; + } + + static void tsdbDestroyCompTbArray(SCompactH *pComph) { + STableCompactH *pTh; + + if (pComph->tbArray == NULL) return; + + for (size_t i = 0; i < taosArrayGetSize(pComph->tbArray); i++) { + pTh = (STableCompactH *)taosArrayGet(pComph->tbArray, i); + if (pTh->pTable) { + tsdbUnRefTable(pTh->pTable); + } + + // pTh->pInfo = taosTZfree(pTh->pInfo); + tfree(pTh->pInfo); + } + + pComph->tbArray = taosArrayDestroy(&pComph->tbArray); + } + + static int tsdbCacheFSetIndex(SCompactH *pComph) { + SReadH *pReadH = &(pComph->readh); + + if (tsdbLoadBlockIdx(pReadH) < 0) { + return -1; + } + + for (int tid = 1; tid < taosArrayGetSize(pComph->tbArray); tid++) { + STableCompactH *pTh = (STableCompactH *)taosArrayGet(pComph->tbArray, tid); + pTh->pBlkIdx = NULL; + + if (pTh->pTable == NULL) continue; + if (tsdbSetReadTable(pReadH, pTh->pTable) < 0) { + return -1; + } + + if (pReadH->pBlkIdx == NULL) continue; + pTh->bindex = *(pReadH->pBlkIdx); + pTh->pBlkIdx = &(pTh->bindex); + + uint32_t originLen = 0; + if (tsdbLoadBlockInfo(pReadH, (void **)(&(pTh->pInfo)), &originLen) < 0) { + return -1; + } + } + + return 0; + } + + static int tsdbCompactFSetInit(SCompactH *pComph, SDFileSet *pSet) { + taosArrayClear(pComph->aBlkIdx); + taosArrayClear(pComph->aSupBlk); + + if (tsdbSetAndOpenReadFSet(&(pComph->readh), pSet) < 0) { + return -1; + } + + if (tsdbCacheFSetIndex(pComph) < 0) { + tsdbCloseAndUnsetFSet(&(pComph->readh)); + return -1; + } + + return 0; + } + + static void tsdbCompactFSetEnd(SCompactH *pComph) { tsdbCloseAndUnsetFSet(&(pComph->readh)); } + + static int tsdbCompactFSetImpl(SCompactH *pComph) { + STsdbRepo *pRepo = TSDB_COMPACT_REPO(pComph); + STsdbCfg * pCfg = REPO_CFG(pRepo); + SReadH * pReadh = &(pComph->readh); + SBlockIdx blkIdx; + void ** ppBuf = &(TSDB_COMPACT_BUF(pComph)); + void ** ppCBuf = &(TSDB_COMPACT_COMP_BUF(pComph)); + void ** ppExBuf = &(TSDB_COMPACT_EXBUF(pComph)); + int defaultRows = TSDB_DEFAULT_BLOCK_ROWS(pCfg->maxRowsPerFileBlock); + + taosArrayClear(pComph->aBlkIdx); + + for (int tid = 1; tid < taosArrayGetSize(pComph->tbArray); tid++) { + STableCompactH *pTh = (STableCompactH *)taosArrayGet(pComph->tbArray, tid); + STSchema * pSchema; + + if (pTh->pTable == NULL || pTh->pBlkIdx == NULL) continue; + + pSchema = tsdbGetTableSchemaImpl(pTh->pTable, true, true, -1, -1); + taosArrayClear(pComph->aSupBlk); + if ((tdInitDataCols(pComph->pDataCols, pSchema) < 0) || (tdInitDataCols(pReadh->pDCols[0], pSchema) < 0) || + (tdInitDataCols(pReadh->pDCols[1], pSchema) < 0)) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tdFreeSchema(pSchema); + return -1; + } + tdFreeSchema(pSchema); + + // Loop to compact each block data + for (int i = 0; i < pTh->pBlkIdx->numOfBlocks; i++) { + SBlock *pBlock = pTh->pInfo->blocks + i; + + // Load the block data + if (tsdbLoadBlockData(pReadh, pBlock, pTh->pInfo) < 0) { + return -1; + } + + // Merge pComph->pDataCols and pReadh->pDCols[0] and write data to file + if (pComph->pDataCols->numOfRows == 0 && pBlock->numOfRows >= defaultRows) { + if (tsdbWriteBlockToRightFile(pComph, pTh->pTable, pReadh->pDCols[0], ppBuf, ppCBuf, ppExBuf) < 0) { + return -1; + } + } else { + int ridx = 0; + + while (true) { + if (pReadh->pDCols[0]->numOfRows - ridx == 0) break; + int rowsToMerge = MIN(pReadh->pDCols[0]->numOfRows - ridx, defaultRows - pComph->pDataCols->numOfRows); + + tdMergeDataCols(pComph->pDataCols, pReadh->pDCols[0], rowsToMerge, &ridx, pCfg->update != TD_ROW_PARTIAL_UPDATE); + + if (pComph->pDataCols->numOfRows < defaultRows) { + break; + } + + if (tsdbWriteBlockToRightFile(pComph, pTh->pTable, pComph->pDataCols, ppBuf, ppCBuf, ppExBuf) < 0) { + return -1; + } + tdResetDataCols(pComph->pDataCols); + } + } + } + + if (pComph->pDataCols->numOfRows > 0 && + tsdbWriteBlockToRightFile(pComph, pTh->pTable, pComph->pDataCols, ppBuf, ppCBuf, ppExBuf) < 0) { + return -1; + } + + if (tsdbWriteBlockInfoImpl(TSDB_COMPACT_HEAD_FILE(pComph), pTh->pTable, pComph->aSupBlk, NULL, ppBuf, &blkIdx) < + 0) { + return -1; + } + + if ((blkIdx.numOfBlocks > 0) && (taosArrayPush(pComph->aBlkIdx, (void *)(&blkIdx)) == NULL)) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } + + if (tsdbWriteBlockIdx(TSDB_COMPACT_HEAD_FILE(pComph), pComph->aBlkIdx, ppBuf) < 0) { + return -1; + } + + return 0; + } + + static int tsdbWriteBlockToRightFile(SCompactH *pComph, STable *pTable, SDataCols *pDataCols, void **ppBuf, + void **ppCBuf, void **ppExBuf) { + STsdbRepo *pRepo = TSDB_COMPACT_REPO(pComph); + STsdbCfg * pCfg = REPO_CFG(pRepo); + SDFile * pDFile; + bool isLast; + SBlock block; + + ASSERT(pDataCols->numOfRows > 0); + + if (pDataCols->numOfRows < pCfg->minRowsPerFileBlock) { + pDFile = TSDB_COMPACT_LAST_FILE(pComph); + isLast = true; + } else { + pDFile = TSDB_COMPACT_DATA_FILE(pComph); + isLast = false; + } + + if (tsdbWriteBlockImpl(pRepo, pTable, pDFile, + isLast ? TSDB_COMPACT_SMAL_FILE(pComph) : TSDB_COMPACT_SMAD_FILE(pComph), pDataCols, &block, + isLast, true, ppBuf, ppCBuf, ppExBuf) < 0) { + return -1; + } + + if (taosArrayPush(pComph->aSupBlk, (void *)(&block)) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + return 0; + } + diff --git a/source/dnode/vnode/tsdb2/src/tsdbFS.c b/source/dnode/vnode/tsdb2/src/tsdbFS.c new file mode 100644 index 0000000000..1ef516b305 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbFS.c @@ -0,0 +1,1448 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "os.h" +#include "tsdbint.h" +#include + +typedef enum { TSDB_TXN_TEMP_FILE = 0, TSDB_TXN_CURR_FILE } TSDB_TXN_FILE_T; +static const char *tsdbTxnFname[] = {"current.t", "current"}; +#define TSDB_MAX_FSETS(keep, days) ((keep) / (days) + 3) + +static int tsdbComparFidFSet(const void *arg1, const void *arg2); +static void tsdbResetFSStatus(SFSStatus *pStatus); +static int tsdbSaveFSStatus(SFSStatus *pStatus, int vid); +static void tsdbApplyFSTxnOnDisk(SFSStatus *pFrom, SFSStatus *pTo); +static void tsdbGetTxnFname(int repoid, TSDB_TXN_FILE_T ftype, char fname[]); +static int tsdbOpenFSFromCurrent(STsdbRepo *pRepo); +static int tsdbScanAndTryFixFS(STsdbRepo *pRepo); +static int tsdbScanRootDir(STsdbRepo *pRepo); +static int tsdbScanDataDir(STsdbRepo *pRepo); +static bool tsdbIsTFileInFS(STsdbFS *pfs, const TFILE *pf); +static int tsdbRestoreCurrent(STsdbRepo *pRepo); +static int tsdbComparTFILE(const void *arg1, const void *arg2); +static void tsdbScanAndTryFixDFilesHeader(STsdbRepo *pRepo, int32_t *nExpired); +static int tsdbProcessExpiredFS(STsdbRepo *pRepo); +static int tsdbCreateMeta(STsdbRepo *pRepo); +static int tsdbFetchTFileSet(STsdbRepo *pRepo, SArray **fArray); + +// For backward compatibility +// ================== CURRENT file header info +static int tsdbEncodeFSHeader(void **buf, SFSHeader *pHeader) { + int tlen = 0; + + tlen += taosEncodeFixedU32(buf, pHeader->version); + tlen += taosEncodeFixedU32(buf, pHeader->len); + + return tlen; +} + +static void *tsdbDecodeFSHeader(void *buf, SFSHeader *pHeader) { + buf = taosDecodeFixedU32(buf, &(pHeader->version)); + buf = taosDecodeFixedU32(buf, &(pHeader->len)); + + return buf; +} + +// ================== STsdbFSMeta +static int tsdbEncodeFSMeta(void **buf, STsdbFSMeta *pMeta) { + int tlen = 0; + + tlen += taosEncodeFixedU32(buf, pMeta->version); + tlen += taosEncodeFixedI64(buf, pMeta->totalPoints); + tlen += taosEncodeFixedI64(buf, pMeta->totalStorage); + + return tlen; +} + +static void *tsdbDecodeFSMeta(void *buf, STsdbFSMeta *pMeta) { + buf = taosDecodeFixedU32(buf, &(pMeta->version)); + buf = taosDecodeFixedI64(buf, &(pMeta->totalPoints)); + buf = taosDecodeFixedI64(buf, &(pMeta->totalStorage)); + + return buf; +} + +// ================== SFSStatus +static int tsdbEncodeDFileSetArray(void **buf, SArray *pArray) { + int tlen = 0; + uint64_t nset = taosArrayGetSize(pArray); + + tlen += taosEncodeFixedU64(buf, nset); + for (size_t i = 0; i < nset; i++) { + SDFileSet *pSet = taosArrayGet(pArray, i); + + tlen += tsdbEncodeDFileSet(buf, pSet); + } + + return tlen; +} + +static int tsdbDecodeDFileSetArray(void **originBuf, void *buf, SArray *pArray, SFSHeader *pSFSHeader) { + uint64_t nset; + SDFileSet dset; + dset.ver = TSDB_FSET_VER_0; // default value + + taosArrayClear(pArray); + + buf = taosDecodeFixedU64(buf, &nset); + + if (pSFSHeader->version == TSDB_FS_VER_0) { + // record fver in new version of 'current' file + uint64_t extendedSize = pSFSHeader->len + nset * TSDB_FILE_MAX * sizeof(TSDB_FVER_TYPE); + if (taosTSizeof(*originBuf) < extendedSize) { + size_t ptrDistance = POINTER_DISTANCE(buf, *originBuf); + if (tsdbMakeRoom(originBuf, (size_t)extendedSize) < 0) { + terrno = TSDB_CODE_FS_OUT_OF_MEMORY; + return -1; + } + buf = POINTER_SHIFT(*originBuf, ptrDistance); + } + } + + for (size_t i = 0; i < nset; i++) { + buf = tsdbDecodeDFileSet(buf, &dset, pSFSHeader->version); + taosArrayPush(pArray, (void *)(&dset)); + } + return TSDB_CODE_SUCCESS; +} + +static int tsdbEncodeFSStatus(void **buf, SFSStatus *pStatus) { + ASSERT(pStatus->pmf); + + int tlen = 0; + + tlen += tsdbEncodeSMFile(buf, pStatus->pmf); + tlen += tsdbEncodeDFileSetArray(buf, pStatus->df); + + return tlen; +} + +static int tsdbDecodeFSStatus(void **originBuf, void *buf, SFSStatus *pStatus, SFSHeader *pSFSHeader) { + tsdbResetFSStatus(pStatus); + pStatus->pmf = &(pStatus->mf); + + buf = tsdbDecodeSMFile(buf, pStatus->pmf); + return tsdbDecodeDFileSetArray(originBuf, buf, pStatus->df, pSFSHeader); +} + +static SFSStatus *tsdbNewFSStatus(int maxFSet) { + SFSStatus *pStatus = (SFSStatus *)calloc(1, sizeof(*pStatus)); + if (pStatus == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + TSDB_FILE_SET_CLOSED(&(pStatus->mf)); + + pStatus->df = taosArrayInit(maxFSet, sizeof(SDFileSet)); + if (pStatus->df == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + free(pStatus); + return NULL; + } + + return pStatus; +} + +static SFSStatus *tsdbFreeFSStatus(SFSStatus *pStatus) { + if (pStatus) { + pStatus->df = taosArrayDestroy(&pStatus->df); + free(pStatus); + } + + return NULL; +} + +static void tsdbResetFSStatus(SFSStatus *pStatus) { + if (pStatus == NULL) { + return; + } + + TSDB_FILE_SET_CLOSED(&(pStatus->mf)); + + pStatus->pmf = NULL; + taosArrayClear(pStatus->df); +} + +static void tsdbSetStatusMFile(SFSStatus *pStatus, const SMFile *pMFile) { + ASSERT(pStatus->pmf == NULL); + + pStatus->pmf = &(pStatus->mf); + tsdbInitMFileEx(pStatus->pmf, (SMFile *)pMFile); +} + +static int tsdbAddDFileSetToStatus(SFSStatus *pStatus, const SDFileSet *pSet) { + if (taosArrayPush(pStatus->df, (void *)pSet) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + TSDB_FSET_SET_CLOSED(((SDFileSet *)taosArrayGetLast(pStatus->df))); + + return 0; +} + +// ================== STsdbFS +STsdbFS *tsdbNewFS(STsdbCfg *pCfg) { + int keep = pCfg->keep; + int days = pCfg->daysPerFile; + int maxFSet = TSDB_MAX_FSETS(keep, days); + STsdbFS *pfs; + + pfs = (STsdbFS *)calloc(1, sizeof(*pfs)); + if (pfs == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + int code = pthread_rwlock_init(&(pfs->lock), NULL); + if (code) { + terrno = TAOS_SYSTEM_ERROR(code); + free(pfs); + return NULL; + } + + pfs->cstatus = tsdbNewFSStatus(maxFSet); + if (pfs->cstatus == NULL) { + tsdbFreeFS(pfs); + return NULL; + } + + pfs->metaCache = taosHashInit(4096, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_NO_LOCK); + if (pfs->metaCache == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbFreeFS(pfs); + return NULL; + } + + pfs->intxn = false; + pfs->metaCacheComp = NULL; + + pfs->nstatus = tsdbNewFSStatus(maxFSet); + if (pfs->nstatus == NULL) { + tsdbFreeFS(pfs); + return NULL; + } + + return pfs; +} + +void *tsdbFreeFS(STsdbFS *pfs) { + if (pfs) { + pfs->nstatus = tsdbFreeFSStatus(pfs->nstatus); + taosHashCleanup(pfs->metaCache); + pfs->metaCache = NULL; + pfs->cstatus = tsdbFreeFSStatus(pfs->cstatus); + pthread_rwlock_destroy(&(pfs->lock)); + free(pfs); + } + + return NULL; +} + +static int tsdbProcessExpiredFS(STsdbRepo *pRepo) { + tsdbStartFSTxn(pRepo, 0, 0); + if (tsdbCreateMeta(pRepo) < 0) { + tsdbError("vgId:%d failed to create meta since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (tsdbApplyRtn(pRepo) < 0) { + tsdbEndFSTxnWithError(REPO_FS(pRepo)); + tsdbError("vgId:%d failed to apply rtn since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + if (tsdbEndFSTxn(pRepo) < 0) { + tsdbError("vgId:%d failed to end fs txn since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + return 0; +} + +static int tsdbCreateMeta(STsdbRepo *pRepo) { + STsdbFS *pfs = REPO_FS(pRepo); + SMFile * pOMFile = pfs->cstatus->pmf; + SMFile mf; + SDiskID did; + + if (pOMFile != NULL) { + // keep the old meta file + tsdbUpdateMFile(pfs, pOMFile); + return 0; + } + + // Create a new meta file + did.level = TFS_PRIMARY_LEVEL; + did.id = TFS_PRIMARY_ID; + tsdbInitMFile(&mf, did, REPO_ID(pRepo), FS_TXN_VERSION(REPO_FS(pRepo))); + + if (tsdbCreateMFile(&mf, true) < 0) { + tsdbError("vgId:%d failed to create META file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d meta file %s is created", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(&mf)); + + if (tsdbUpdateMFileHeader(&mf) < 0) { + tsdbError("vgId:%d failed to update META file header since %s, revert it", REPO_ID(pRepo), tstrerror(terrno)); + tsdbApplyMFileChange(&mf, pOMFile); + return -1; + } + + TSDB_FILE_FSYNC(&mf); + tsdbCloseMFile(&mf); + tsdbUpdateMFile(pfs, &mf); + + return 0; +} + +int tsdbOpenFS(STsdbRepo *pRepo) { + STsdbFS *pfs = REPO_FS(pRepo); + char current[TSDB_FILENAME_LEN] = "\0"; + int nExpired = 0; + + ASSERT(pfs != NULL); + + tsdbGetTxnFname(REPO_ID(pRepo), TSDB_TXN_CURR_FILE, current); + + tsdbGetRtnSnap(pRepo, &pRepo->rtn); + if (access(current, F_OK) == 0) { + if (tsdbOpenFSFromCurrent(pRepo) < 0) { + tsdbError("vgId:%d failed to open FS since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbScanAndTryFixDFilesHeader(pRepo, &nExpired); + if (nExpired > 0) { + tsdbProcessExpiredFS(pRepo); + } + } else { + // should skip expired fileset inside of the function + if (tsdbRestoreCurrent(pRepo) < 0) { + tsdbError("vgId:%d failed to restore current file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } + + if (tsdbScanAndTryFixFS(pRepo) < 0) { + tsdbError("vgId:%d failed to scan and fix FS since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + // Load meta cache if has meta file + if ((!(pRepo->state & TSDB_STATE_BAD_META)) && tsdbLoadMetaCache(pRepo, true) < 0) { + tsdbError("vgId:%d failed to open FS while loading meta cache since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + return 0; +} + +void tsdbCloseFS(STsdbRepo *pRepo) { + // Do nothing +} + +// Start a new transaction to modify the file system +void tsdbStartFSTxn(STsdbRepo *pRepo, int64_t pointsAdd, int64_t storageAdd) { + STsdbFS *pfs = REPO_FS(pRepo); + ASSERT(pfs->intxn == false); + + pfs->intxn = true; + tsdbResetFSStatus(pfs->nstatus); + pfs->nstatus->meta = pfs->cstatus->meta; + if (pfs->cstatus->pmf == NULL) { + pfs->nstatus->meta.version = 0; + } else { + pfs->nstatus->meta.version = pfs->cstatus->meta.version + 1; + } + pfs->nstatus->meta.totalPoints = pfs->cstatus->meta.totalPoints + pointsAdd; + pfs->nstatus->meta.totalStorage = pfs->cstatus->meta.totalStorage += storageAdd; +} + +void tsdbUpdateFSTxnMeta(STsdbFS *pfs, STsdbFSMeta *pMeta) { pfs->nstatus->meta = *pMeta; } + +int tsdbEndFSTxn(STsdbRepo *pRepo) { + STsdbFS *pfs = REPO_FS(pRepo); + ASSERT(FS_IN_TXN(pfs)); + SFSStatus *pStatus; + + // Write current file system snapshot + if (tsdbSaveFSStatus(pfs->nstatus, REPO_ID(pRepo)) < 0) { + tsdbEndFSTxnWithError(pfs); + return -1; + } + + // Make new + tsdbWLockFS(pfs); + pStatus = pfs->cstatus; + pfs->cstatus = pfs->nstatus; + pfs->nstatus = pStatus; + tsdbUnLockFS(pfs); + + // Apply actual change to each file and SDFileSet + tsdbApplyFSTxnOnDisk(pfs->nstatus, pfs->cstatus); + + pfs->intxn = false; + return 0; +} + +int tsdbEndFSTxnWithError(STsdbFS *pfs) { + tsdbApplyFSTxnOnDisk(pfs->nstatus, pfs->cstatus); + // TODO: if mf change, reload pfs->metaCache + pfs->intxn = false; + return 0; +} + +void tsdbUpdateMFile(STsdbFS *pfs, const SMFile *pMFile) { tsdbSetStatusMFile(pfs->nstatus, pMFile); } + +int tsdbUpdateDFileSet(STsdbFS *pfs, const SDFileSet *pSet) { return tsdbAddDFileSetToStatus(pfs->nstatus, pSet); } + +static int tsdbSaveFSStatus(SFSStatus *pStatus, int vid) { + SFSHeader fsheader; + void * pBuf = NULL; + void * ptr; + char hbuf[TSDB_FILE_HEAD_SIZE] = "\0"; + char tfname[TSDB_FILENAME_LEN] = "\0"; + char cfname[TSDB_FILENAME_LEN] = "\0"; + + tsdbGetTxnFname(vid, TSDB_TXN_TEMP_FILE, tfname); + tsdbGetTxnFname(vid, TSDB_TXN_CURR_FILE, cfname); + + int fd = open(tfname, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0755); + if (fd < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + fsheader.version = TSDB_LATEST_SFS_VER; + if (pStatus->pmf == NULL) { + ASSERT(taosArrayGetSize(pStatus->df) == 0); + fsheader.len = 0; + } else { + fsheader.len = tsdbEncodeFSStatus(NULL, pStatus) + sizeof(TSCKSUM); + } + + // Encode header part and write + ptr = hbuf; + tsdbEncodeFSHeader(&ptr, &fsheader); + tsdbEncodeFSMeta(&ptr, &(pStatus->meta)); + + taosCalcChecksumAppend(0, (uint8_t *)hbuf, TSDB_FILE_HEAD_SIZE); + + if (taosWrite(fd, hbuf, TSDB_FILE_HEAD_SIZE) < TSDB_FILE_HEAD_SIZE) { + terrno = TAOS_SYSTEM_ERROR(errno); + close(fd); + remove(tfname); + return -1; + } + + // Encode file status and write to file + if (fsheader.len > 0) { + if (tsdbMakeRoom(&(pBuf), fsheader.len) < 0) { + close(fd); + remove(tfname); + return -1; + } + + ptr = pBuf; + tsdbEncodeFSStatus(&ptr, pStatus); + taosCalcChecksumAppend(0, (uint8_t *)pBuf, fsheader.len); + + if (taosWrite(fd, pBuf, fsheader.len) < fsheader.len) { + terrno = TAOS_SYSTEM_ERROR(errno); + close(fd); + (void)remove(tfname); + taosTZfree(pBuf); + return -1; + } + } + + // fsync, close and rename + if (taosFsync(fd) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + close(fd); + remove(tfname); + taosTZfree(pBuf); + return -1; + } + + (void)close(fd); + (void)taosRename(tfname, cfname); + taosTZfree(pBuf); + + return 0; +} + +static void tsdbApplyFSTxnOnDisk(SFSStatus *pFrom, SFSStatus *pTo) { + int ifrom = 0; + int ito = 0; + size_t sizeFrom, sizeTo; + SDFileSet *pSetFrom; + SDFileSet *pSetTo; + + sizeFrom = taosArrayGetSize(pFrom->df); + sizeTo = taosArrayGetSize(pTo->df); + + // Apply meta file change + (void)tsdbApplyMFileChange(pFrom->pmf, pTo->pmf); + + // Apply SDFileSet change + if (ifrom >= sizeFrom) { + pSetFrom = NULL; + } else { + pSetFrom = taosArrayGet(pFrom->df, ifrom); + } + + if (ito >= sizeTo) { + pSetTo = NULL; + } else { + pSetTo = taosArrayGet(pTo->df, ito); + } + + while (true) { + if ((pSetTo == NULL) && (pSetFrom == NULL)) break; + + if (pSetTo == NULL || (pSetFrom && pSetFrom->fid < pSetTo->fid)) { + tsdbApplyDFileSetChange(pSetFrom, NULL); + + ifrom++; + if (ifrom >= sizeFrom) { + pSetFrom = NULL; + } else { + pSetFrom = taosArrayGet(pFrom->df, ifrom); + } + } else if (pSetFrom == NULL || pSetFrom->fid > pSetTo->fid) { + // Do nothing + ito++; + if (ito >= sizeTo) { + pSetTo = NULL; + } else { + pSetTo = taosArrayGet(pTo->df, ito); + } + } else { + tsdbApplyDFileSetChange(pSetFrom, pSetTo); + + ifrom++; + if (ifrom >= sizeFrom) { + pSetFrom = NULL; + } else { + pSetFrom = taosArrayGet(pFrom->df, ifrom); + } + + ito++; + if (ito >= sizeTo) { + pSetTo = NULL; + } else { + pSetTo = taosArrayGet(pTo->df, ito); + } + } + } +} + +// ================== SFSIter +// ASSUMPTIONS: the FS Should be read locked when calling these functions +void tsdbFSIterInit(SFSIter *pIter, STsdbFS *pfs, int direction) { + pIter->pfs = pfs; + pIter->direction = direction; + + size_t size = taosArrayGetSize(pfs->cstatus->df); + + pIter->version = pfs->cstatus->meta.version; + + if (size == 0) { + pIter->index = -1; + pIter->fid = TSDB_IVLD_FID; + } else { + if (direction == TSDB_FS_ITER_FORWARD) { + pIter->index = 0; + } else { + pIter->index = (int)(size - 1); + } + + pIter->fid = ((SDFileSet *)taosArrayGet(pfs->cstatus->df, pIter->index))->fid; + } +} + +void tsdbFSIterSeek(SFSIter *pIter, int fid) { + STsdbFS *pfs = pIter->pfs; + size_t size = taosArrayGetSize(pfs->cstatus->df); + + int flags; + if (pIter->direction == TSDB_FS_ITER_FORWARD) { + flags = TD_GE; + } else { + flags = TD_LE; + } + + void *ptr = taosbsearch(&fid, pfs->cstatus->df->pData, size, sizeof(SDFileSet), tsdbComparFidFSet, flags); + if (ptr == NULL) { + pIter->index = -1; + pIter->fid = TSDB_IVLD_FID; + } else { + pIter->index = (int)(TARRAY_ELEM_IDX(pfs->cstatus->df, ptr)); + pIter->fid = ((SDFileSet *)ptr)->fid; + } +} + +SDFileSet *tsdbFSIterNext(SFSIter *pIter) { + STsdbFS * pfs = pIter->pfs; + SDFileSet *pSet; + + if (pIter->index < 0) { + ASSERT(pIter->fid == TSDB_IVLD_FID); + return NULL; + } + + ASSERT(pIter->fid != TSDB_IVLD_FID); + + if (pIter->version != pfs->cstatus->meta.version) { + pIter->version = pfs->cstatus->meta.version; + tsdbFSIterSeek(pIter, pIter->fid); + } + + if (pIter->index < 0) { + return NULL; + } + + pSet = (SDFileSet *)taosArrayGet(pfs->cstatus->df, pIter->index); + ASSERT(pSet->fid == pIter->fid); + + if (pIter->direction == TSDB_FS_ITER_FORWARD) { + pIter->index++; + if (pIter->index >= taosArrayGetSize(pfs->cstatus->df)) { + pIter->index = -1; + } + } else { + pIter->index--; + } + + if (pIter->index >= 0) { + pIter->fid = ((SDFileSet *)taosArrayGet(pfs->cstatus->df, pIter->index))->fid; + } else { + pIter->fid = TSDB_IVLD_FID; + } + + return pSet; +} + +static int tsdbComparFidFSet(const void *arg1, const void *arg2) { + int fid = *(int *)arg1; + SDFileSet *pSet = (SDFileSet *)arg2; + + if (fid < pSet->fid) { + return -1; + } else if (fid == pSet->fid) { + return 0; + } else { + return 1; + } +} + +static void tsdbGetTxnFname(int repoid, TSDB_TXN_FILE_T ftype, char fname[]) { + snprintf(fname, TSDB_FILENAME_LEN, "%s/vnode/vnode%d/tsdb/%s", TFS_PRIMARY_PATH(), repoid, tsdbTxnFname[ftype]); +} + +static int tsdbOpenFSFromCurrent(STsdbRepo *pRepo) { + STsdbFS * pfs = REPO_FS(pRepo); + int fd = -1; + void * buffer = NULL; + SFSHeader fsheader; + char current[TSDB_FILENAME_LEN] = "\0"; + void * ptr; + + tsdbGetTxnFname(REPO_ID(pRepo), TSDB_TXN_CURR_FILE, current); + + // current file exists, try to recover + fd = open(current, O_RDONLY | O_BINARY); + if (fd < 0) { + tsdbError("vgId:%d failed to open file %s since %s", REPO_ID(pRepo), current, strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + + if (tsdbMakeRoom(&buffer, TSDB_FILE_HEAD_SIZE) < 0) { + goto _err; + } + + int nread = (int)taosRead(fd, buffer, TSDB_FILE_HEAD_SIZE); + if (nread < 0) { + tsdbError("vgId:%d failed to read %d bytes from file %s since %s", REPO_ID(pRepo), TSDB_FILENAME_LEN, current, + strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + + if (nread < TSDB_FILE_HEAD_SIZE) { + tsdbError("vgId:%d failed to read header of file %s, read bytes:%d", REPO_ID(pRepo), current, nread); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + goto _err; + } + + if (!taosCheckChecksumWhole((uint8_t *)buffer, TSDB_FILE_HEAD_SIZE)) { + tsdbError("vgId:%d header of file %s failed checksum check", REPO_ID(pRepo), current); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + goto _err; + } + + SFSStatus *pStatus = pfs->cstatus; + ptr = buffer; + ptr = tsdbDecodeFSHeader(ptr, &fsheader); + ptr = tsdbDecodeFSMeta(ptr, &(pStatus->meta)); + + if (fsheader.version != TSDB_FS_VER_0) { + // TODO: handle file version change + } + + if (fsheader.len > 0) { + if (tsdbMakeRoom(&buffer, fsheader.len) < 0) { + goto _err; + } + + nread = (int)taosRead(fd, buffer, fsheader.len); + if (nread < 0) { + tsdbError("vgId:%d failed to read file %s since %s", REPO_ID(pRepo), current, strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(errno); + goto _err; + } + + if (nread < fsheader.len) { + tsdbError("vgId:%d failed to read %d bytes from file %s", REPO_ID(pRepo), fsheader.len, current); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + goto _err; + } + + if (!taosCheckChecksumWhole((uint8_t *)buffer, fsheader.len)) { + tsdbError("vgId:%d file %s is corrupted since wrong checksum", REPO_ID(pRepo), current); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + goto _err; + } + + ptr = buffer; + if (tsdbDecodeFSStatus(&buffer, ptr, pStatus, &fsheader) < 0) { + goto _err; + } + } else { + tsdbResetFSStatus(pStatus); + } + + taosTZfree(buffer); + close(fd); + + return 0; + +_err: + if (fd >= 0) { + close(fd); + } + taosTZfree(buffer); + return -1; +} + +// Scan and try to fix incorrect files +static int tsdbScanAndTryFixFS(STsdbRepo *pRepo) { + STsdbFS * pfs = REPO_FS(pRepo); + SFSStatus *pStatus = pfs->cstatus; + + if (tsdbScanAndTryFixMFile(pRepo) < 0) { + tsdbError("vgId:%d failed to fix MFile since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + size_t size = taosArrayGetSize(pStatus->df); + + for (size_t i = 0; i < size; i++) { + SDFileSet *pSet = (SDFileSet *)taosArrayGet(pStatus->df, i); + + if (tsdbScanAndTryFixDFileSet(pRepo, pSet) < 0) { + tsdbError("vgId:%d failed to fix DFileSet since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } + + // remove those unused files + tsdbScanRootDir(pRepo); + tsdbScanDataDir(pRepo); + return 0; +} + +int tsdbLoadMetaCache(STsdbRepo *pRepo, bool recoverMeta) { + char tbuf[128]; + STsdbFS * pfs = REPO_FS(pRepo); + SMFile mf; + SMFile * pMFile = &mf; + void * pBuf = NULL; + SKVRecord rInfo; + int64_t maxBufSize = 0; + SMFInfo minfo; + + taosHashClear(pfs->metaCache); + + // No meta file, just return + if (pfs->cstatus->pmf == NULL) return 0; + + mf = pfs->cstatus->mf; + // Load cache first + if (tsdbOpenMFile(pMFile, O_RDONLY) < 0) { + return -1; + } + + if (tsdbLoadMFileHeader(pMFile, &minfo) < 0) { + tsdbCloseMFile(pMFile); + return -1; + } + + while (true) { + int64_t tsize = tsdbReadMFile(pMFile, tbuf, sizeof(SKVRecord)); + if (tsize == 0) break; + + if (tsize < 0) { + tsdbError("vgId:%d failed to read META file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (tsize < sizeof(SKVRecord)) { + tsdbError("vgId:%d failed to read %" PRIzu " bytes from file %s", REPO_ID(pRepo), sizeof(SKVRecord), + TSDB_FILE_FULL_NAME(pMFile)); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbCloseMFile(pMFile); + return -1; + } + + void *ptr = tsdbDecodeKVRecord(tbuf, &rInfo); + ASSERT(POINTER_DISTANCE(ptr, tbuf) == sizeof(SKVRecord)); + // ASSERT((rInfo.offset > 0) ? (pStore->info.size == rInfo.offset) : true); + + if (rInfo.offset < 0) { + taosHashRemove(pfs->metaCache, (void *)(&rInfo.uid), sizeof(rInfo.uid)); +#if 0 + pStore->info.size += sizeof(SKVRecord); + pStore->info.nRecords--; + pStore->info.nDels++; + pStore->info.tombSize += (rInfo.size + sizeof(SKVRecord) * 2); +#endif + } else { + ASSERT(rInfo.offset > 0 && rInfo.size > 0); + if (taosHashPut(pfs->metaCache, (void *)(&rInfo.uid), sizeof(rInfo.uid), &rInfo, sizeof(rInfo)) < 0) { + tsdbError("vgId:%d failed to load meta cache from file %s since OOM", REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pMFile)); + terrno = TSDB_CODE_COM_OUT_OF_MEMORY; + tsdbCloseMFile(pMFile); + return -1; + } + + maxBufSize = MAX(maxBufSize, rInfo.size); + + if (tsdbSeekMFile(pMFile, rInfo.size, SEEK_CUR) < 0) { + tsdbError("vgId:%d failed to lseek file %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + tstrerror(terrno)); + tsdbCloseMFile(pMFile); + return -1; + } + +#if 0 + pStore->info.size += (sizeof(SKVRecord) + rInfo.size); + pStore->info.nRecords++; +#endif + } + } + + if (recoverMeta) { + pBuf = malloc((size_t)maxBufSize); + if (pBuf == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbCloseMFile(pMFile); + return -1; + } + + SKVRecord *pRecord = taosHashIterate(pfs->metaCache, NULL); + while (pRecord) { + if (tsdbSeekMFile(pMFile, pRecord->offset + sizeof(SKVRecord), SEEK_SET) < 0) { + tsdbError("vgId:%d failed to seek file %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + tstrerror(terrno)); + tfree(pBuf); + tsdbCloseMFile(pMFile); + return -1; + } + + int nread = (int)tsdbReadMFile(pMFile, pBuf, pRecord->size); + if (nread < 0) { + tsdbError("vgId:%d failed to read file %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + tstrerror(terrno)); + tfree(pBuf); + tsdbCloseMFile(pMFile); + return -1; + } + + if (nread < pRecord->size) { + tsdbError("vgId:%d failed to read file %s since file corrupted, expected read:%" PRId64 " actual read:%d", + REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), pRecord->size, nread); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tfree(pBuf); + tsdbCloseMFile(pMFile); + return -1; + } + + if (tsdbRestoreTable(pRepo, pBuf, (int)pRecord->size) < 0) { + tsdbError("vgId:%d failed to restore table, uid %" PRId64 ", since %s" PRIu64, REPO_ID(pRepo), pRecord->uid, + tstrerror(terrno)); + tfree(pBuf); + tsdbCloseMFile(pMFile); + return -1; + } + + pRecord = taosHashIterate(pfs->metaCache, pRecord); + } + + tsdbOrgMeta(pRepo); + } + + tsdbCloseMFile(pMFile); + tfree(pBuf); + return 0; +} + +static int tsdbScanRootDir(STsdbRepo *pRepo) { + char rootDir[TSDB_FILENAME_LEN]; + char bname[TSDB_FILENAME_LEN]; + STsdbFS * pfs = REPO_FS(pRepo); + const TFILE *pf; + + tsdbGetRootDir(REPO_ID(pRepo), rootDir); + TDIR *tdir = tfsOpendir(rootDir); + if (tdir == NULL) { + tsdbError("vgId:%d failed to open directory %s since %s", REPO_ID(pRepo), rootDir, tstrerror(terrno)); + return -1; + } + + while ((pf = tfsReaddir(tdir))) { + tfsbasename(pf, bname); + + if (strcmp(bname, tsdbTxnFname[TSDB_TXN_CURR_FILE]) == 0 || strcmp(bname, "data") == 0) { + // Skip current file and data directory + continue; + } + + if (pfs->cstatus->pmf && tfsIsSameFile(pf, &(pfs->cstatus->pmf->f))) { + continue; + } + + (void)tfsremove(pf); + tsdbDebug("vgId:%d invalid file %s is removed", REPO_ID(pRepo), TFILE_NAME(pf)); + } + + tfsClosedir(tdir); + + return 0; +} + +static int tsdbScanDataDir(STsdbRepo *pRepo) { + char dataDir[TSDB_FILENAME_LEN]; + char bname[TSDB_FILENAME_LEN]; + STsdbFS * pfs = REPO_FS(pRepo); + const TFILE *pf; + + tsdbGetDataDir(REPO_ID(pRepo), dataDir); + TDIR *tdir = tfsOpendir(dataDir); + if (tdir == NULL) { + tsdbError("vgId:%d failed to open directory %s since %s", REPO_ID(pRepo), dataDir, tstrerror(terrno)); + return -1; + } + + while ((pf = tfsReaddir(tdir))) { + tfsbasename(pf, bname); + + if (!tsdbIsTFileInFS(pfs, pf)) { + (void)tfsremove(pf); + tsdbDebug("vgId:%d invalid file %s is removed", REPO_ID(pRepo), TFILE_NAME(pf)); + } + } + + tfsClosedir(tdir); + + return 0; +} + +static bool tsdbIsTFileInFS(STsdbFS *pfs, const TFILE *pf) { + SFSIter fsiter; + tsdbFSIterInit(&fsiter, pfs, TSDB_FS_ITER_FORWARD); + SDFileSet *pSet; + + while ((pSet = tsdbFSIterNext(&fsiter))) { + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + SDFile *pDFile = TSDB_DFILE_IN_SET(pSet, ftype); + if (tfsIsSameFile(pf, TSDB_FILE_F(pDFile))) { + return true; + } + } + } + + return false; +} + +static int tsdbRestoreMeta(STsdbRepo *pRepo) { + char rootDir[TSDB_FILENAME_LEN]; + char bname[TSDB_FILENAME_LEN]; + TDIR * tdir = NULL; + const TFILE *pf = NULL; + const char * pattern = "^meta(-ver[0-9]+)?$"; + regex_t regex; + STsdbFS * pfs = REPO_FS(pRepo); + + regcomp(®ex, pattern, REG_EXTENDED); + + tsdbInfo("vgId:%d try to restore meta", REPO_ID(pRepo)); + + tsdbGetRootDir(REPO_ID(pRepo), rootDir); + + tdir = tfsOpendir(rootDir); + if (tdir == NULL) { + tsdbError("vgId:%d failed to open dir %s since %s", REPO_ID(pRepo), rootDir, tstrerror(terrno)); + regfree(®ex); + return -1; + } + + while ((pf = tfsReaddir(tdir))) { + tfsbasename(pf, bname); + + if (strcmp(bname, "data") == 0) { + // Skip the data/ directory + continue; + } + + if (strcmp(bname, tsdbTxnFname[TSDB_TXN_TEMP_FILE]) == 0) { + // Skip current.t file + tsdbInfo("vgId:%d file %s exists, remove it", REPO_ID(pRepo), TFILE_NAME(pf)); + (void)tfsremove(pf); + continue; + } + + int code = regexec(®ex, bname, 0, NULL, 0); + if (code == 0) { + // Match + if (pfs->cstatus->pmf != NULL) { + tsdbError("vgId:%d failed to restore meta since two file exists, file1 %s and file2 %s", REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pfs->cstatus->pmf), TFILE_NAME(pf)); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tfsClosedir(tdir); + regfree(®ex); + return -1; + } else { + uint32_t _version = 0; + if (strcmp(bname, "meta") != 0) { + sscanf(bname, "meta-ver%" PRIu32, &_version); + pfs->cstatus->meta.version = _version; + } + + pfs->cstatus->pmf = &(pfs->cstatus->mf); + pfs->cstatus->pmf->f = *pf; + TSDB_FILE_SET_CLOSED(pfs->cstatus->pmf); + + if (tsdbOpenMFile(pfs->cstatus->pmf, O_RDONLY) < 0) { + tsdbError("vgId:%d failed to restore meta since %s", REPO_ID(pRepo), tstrerror(terrno)); + tfsClosedir(tdir); + regfree(®ex); + return -1; + } + + if (tsdbLoadMFileHeader(pfs->cstatus->pmf, &(pfs->cstatus->pmf->info)) < 0) { + tsdbError("vgId:%d failed to restore meta since %s", REPO_ID(pRepo), tstrerror(terrno)); + tsdbCloseMFile(pfs->cstatus->pmf); + tfsClosedir(tdir); + regfree(®ex); + return -1; + } + + if (tsdbForceKeepFile) { + struct stat tfstat; + + // Get real file size + if (fstat(pfs->cstatus->pmf->fd, &tfstat) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbCloseMFile(pfs->cstatus->pmf); + tfsClosedir(tdir); + regfree(®ex); + return -1; + } + + if (pfs->cstatus->pmf->info.size != tfstat.st_size) { + int64_t tfsize = pfs->cstatus->pmf->info.size; + pfs->cstatus->pmf->info.size = tfstat.st_size; + tsdbInfo("vgId:%d file %s header size is changed from %" PRId64 " to %" PRId64, REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pfs->cstatus->pmf), tfsize, pfs->cstatus->pmf->info.size); + } + } + + tsdbCloseMFile(pfs->cstatus->pmf); + } + } else if (code == REG_NOMATCH) { + // Not match + tsdbInfo("vgId:%d invalid file %s exists, remove it", REPO_ID(pRepo), TFILE_NAME(pf)); + tfsremove(pf); + continue; + } else { + // Has other error + tsdbError("vgId:%d failed to restore meta file while run regexec since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + tfsClosedir(tdir); + regfree(®ex); + return -1; + } + } + + if (pfs->cstatus->pmf) { + tsdbInfo("vgId:%d meta file %s is restored", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pfs->cstatus->pmf)); + } else { + tsdbInfo("vgId:%d no meta file is restored", REPO_ID(pRepo)); + } + + tfsClosedir(tdir); + regfree(®ex); + return 0; +} + +static int tsdbFetchTFileSet(STsdbRepo *pRepo, SArray **fArray) { + char dataDir[TSDB_FILENAME_LEN]; + char bname[TSDB_FILENAME_LEN]; + TDIR * tdir = NULL; + const TFILE *pf = NULL; + const char * pattern = "^v[0-9]+f[0-9]+\\.(head|data|last|smad|smal)(-ver[0-9]+)?$"; + regex_t regex; + + tsdbGetDataDir(REPO_ID(pRepo), dataDir); + + // Resource allocation and init + regcomp(®ex, pattern, REG_EXTENDED); + + *fArray = taosArrayInit(1024, sizeof(TFILE)); + if (*fArray == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("vgId:%d failed to fetch TFileSet while open directory %s since %s", REPO_ID(pRepo), dataDir, + tstrerror(terrno)); + regfree(®ex); + return -1; + } + + tdir = tfsOpendir(dataDir); + if (tdir == NULL) { + tsdbError("vgId:%d failed to fetch TFileSet while open directory %s since %s", REPO_ID(pRepo), dataDir, + tstrerror(terrno)); + taosArrayDestroy(fArray); + regfree(®ex); + return -1; + } + + while ((pf = tfsReaddir(tdir))) { + tfsbasename(pf, bname); + + int code = regexec(®ex, bname, 0, NULL, 0); + if (code == 0) { + if (taosArrayPush(*fArray, (void *)pf) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tfsClosedir(tdir); + taosArrayDestroy(fArray); + regfree(®ex); + return -1; + } + } else if (code == REG_NOMATCH) { + // Not match + tsdbInfo("vgId:%d invalid file %s exists, remove it", REPO_ID(pRepo), TFILE_NAME(pf)); + (void)tfsremove(pf); + continue; + } else { + // Has other error + tsdbError("vgId:%d failed to fetch TFileSet Array while run regexec since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + tfsClosedir(tdir); + taosArrayDestroy(fArray); + regfree(®ex); + return -1; + } + } + + tfsClosedir(tdir); + regfree(®ex); + + // Sort the array according to file name + taosArraySort(*fArray, tsdbComparTFILE); + return 0; +} + +// update the function if the DFileSet definition updates +static bool tsdbIsDFileSetValid(int nFiles) { + switch (nFiles) { + case TSDB_FILE_MIN: + case TSDB_FILE_MAX: + return true; + default: + return false; + } +} + +static int tsdbRestoreDFileSet(STsdbRepo *pRepo) { + const TFILE *pf = NULL; + SArray * fArray = NULL; + STsdbFS * pfs = REPO_FS(pRepo); + char dataDir[TSDB_FILENAME_LEN] = "\0"; + size_t fArraySize = 0; + + tsdbGetDataDir(REPO_ID(pRepo), dataDir); + + if (tsdbFetchTFileSet(pRepo, &fArray) < 0) { + tsdbError("vgId:%d failed to fetch TFileSet from %s to restore since %s", REPO_ID(pRepo), dataDir, + tstrerror(terrno)); + return -1; + } + + if ((fArraySize = taosArrayGetSize(fArray)) <= 0) { + taosArrayDestroy(&fArray); + tsdbInfo("vgId:%d size of DFileSet from %s is %" PRIu32, REPO_ID(pRepo), dataDir, (uint32_t)fArraySize); + return 0; + } + + // Loop to recover each file set + SDFileSet fset = {0}; + uint8_t nDFiles = 0; + bool isOneFSetFinish = true; + int lastFType = -1; + // one fileset ends when (1) the array ends or (2) encounter different fid + for (size_t index = 0; index < fArraySize; ++index) { + int tvid = -1, tfid = -1; + TSDB_FILE_T ttype = TSDB_FILE_MAX; + uint32_t tversion = -1; + char bname[TSDB_FILENAME_LEN] = "\0"; + + pf = taosArrayGet(fArray, index); + tfsbasename(pf, bname); + tsdbParseDFilename(bname, &tvid, &tfid, &ttype, &tversion); + ASSERT(tvid == REPO_ID(pRepo)); + SDFile *pDFile = TSDB_DFILE_IN_SET(&fset, ttype); + if (tfid < pRepo->rtn.minFid) { // skip the file expired + continue; + } + if ((isOneFSetFinish == false) && (lastFType == ttype)) { // only fetch the 1st file with same fid and type. + continue; + } + + lastFType = ttype; + + if (index == 0) { + memset(&fset, 0, sizeof(SDFileSet)); + TSDB_FSET_SET_CLOSED(&fset); + nDFiles = 1; + fset.fid = tfid; + pDFile->f = *pf; + isOneFSetFinish = false; + } else { + if (fset.fid == tfid) { + ++nDFiles; + pDFile->f = *pf; + // (1) the array ends + if (index == fArraySize - 1) { + if (tsdbIsDFileSetValid(nDFiles)) { + tsdbInfo("vgId:%d DFileSet %d is fetched, nDFiles=%" PRIu8, REPO_ID(pRepo), fset.fid, nDFiles); + isOneFSetFinish = true; + } else { + // return error in case of removing uncomplete DFileSets + terrno = TSDB_CODE_TDB_INCOMPLETE_DFILESET; + tsdbError("vgId:%d incomplete DFileSet, fid:%d, nDFiles=%" PRIu8, REPO_ID(pRepo), fset.fid, nDFiles); + taosArrayDestroy(&fArray); + return -1; + } + } + } else { + // (2) encounter different fid + if (tsdbIsDFileSetValid(nDFiles)) { + tsdbInfo("vgId:%d DFileSet %d is fetched, nDFiles=%" PRIu8, REPO_ID(pRepo), fset.fid, nDFiles); + isOneFSetFinish = true; + } else { + // return error in case of removing uncomplete DFileSets + terrno = TSDB_CODE_TDB_INCOMPLETE_DFILESET; + tsdbError("vgId:%d incomplete DFileSet, fid:%d, nDFiles=%" PRIu8, REPO_ID(pRepo), fset.fid, nDFiles); + taosArrayDestroy(&fArray); + return -1; +#if 0 + // next FSet + memset(&fset, 0, sizeof(SDFileSet)); + TSDB_FSET_SET_CLOSED(&fset); + nDFiles = 1; + fset.fid = tfid; + pDFile->f = *pf; + isOneFSetFinish = false; + continue; +#endif + } + } + } + + if (isOneFSetFinish) { + for (TSDB_FILE_T ftype = 0; ftype < nDFiles; ++ftype) { + SDFile * pDFile1 = TSDB_DFILE_IN_SET(&fset, ftype); + if (tsdbOpenDFile(pDFile1, O_RDONLY) < 0) { + tsdbError("vgId:%d failed to open DFile %s since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile1), + tstrerror(terrno)); + taosArrayDestroy(&fArray); + return -1; + } + + if (tsdbLoadDFileHeader(pDFile1, &(pDFile1->info)) < 0) { + tsdbError("vgId:%d failed to load DFile %s header since %s", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile1), + tstrerror(terrno)); + taosArrayDestroy(&fArray); + return -1; + } + + if (tsdbForceKeepFile) { + struct stat tfstat; + + // Get real file size + if (fstat(pDFile1->fd, &tfstat) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + taosArrayDestroy(&fArray); + return -1; + } + + if (pDFile1->info.size != tfstat.st_size) { + int64_t tfsize = pDFile1->info.size; + pDFile1->info.size = tfstat.st_size; + tsdbInfo("vgId:%d file %s header size is changed from %" PRId64 " to %" PRId64, REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pDFile1), tfsize, pDFile1->info.size); + } + } + + tsdbCloseDFile(pDFile1); + } + tsdbInfo("vgId:%d FSET %d is restored", REPO_ID(pRepo), fset.fid); + + // TODO: update the logic when TSDB_FSET_VER definition update. + if (nDFiles == TSDB_FILE_MIN) { + fset.ver = TSDB_FSET_VER_0; + } else { + fset.ver = TSDB_LATEST_FSET_VER; + } + + taosArrayPush(pfs->cstatus->df, &fset); + + // next FSet + memset(&fset, 0, sizeof(SDFileSet)); + TSDB_FSET_SET_CLOSED(&fset); + nDFiles = 1; + fset.fid = tfid; + pDFile->f = *pf; + isOneFSetFinish = false; + } + } + + // Resource release + taosArrayDestroy(&fArray); + + return 0; +} + +static int tsdbRestoreCurrent(STsdbRepo *pRepo) { + // Loop to recover mfile + if (tsdbRestoreMeta(pRepo) < 0) { + tsdbError("vgId:%d failed to restore current since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + // Loop to recover dfile set + if (tsdbRestoreDFileSet(pRepo) < 0) { + tsdbError("vgId:%d failed to restore DFileSet since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (tsdbSaveFSStatus(pRepo->fs->cstatus, REPO_ID(pRepo)) < 0) { + tsdbError("vgId:%d failed to restore corrent since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + return 0; +} + +static int tsdbComparTFILE(const void *arg1, const void *arg2) { + TFILE *pf1 = (TFILE *)arg1; + TFILE *pf2 = (TFILE *)arg2; + + int vid1, fid1, vid2, fid2; + TSDB_FILE_T ftype1, ftype2; + uint32_t version1, version2; + char bname1[TSDB_FILENAME_LEN]; + char bname2[TSDB_FILENAME_LEN]; + + tfsbasename(pf1, bname1); + tfsbasename(pf2, bname2); + tsdbParseDFilename(bname1, &vid1, &fid1, &ftype1, &version1); + tsdbParseDFilename(bname2, &vid2, &fid2, &ftype2, &version2); + + if (fid1 < fid2) { + return -1; + } else if (fid1 > fid2) { + return 1; + } else { + if (ftype1 < ftype2) { + return -1; + } else if (ftype1 > ftype2) { + return 1; + } else { + if (version1 < version2) { + return -1; + } else if (version1 > version2) { + return 1; + } else { + return 0; + } + } + } +} + +static void tsdbScanAndTryFixDFilesHeader(STsdbRepo *pRepo, int32_t *nExpired) { + STsdbFS * pfs = REPO_FS(pRepo); + SFSStatus *pStatus = pfs->cstatus; + SDFInfo info; + + for (size_t i = 0; i < taosArrayGetSize(pStatus->df); i++) { + SDFileSet fset; + tsdbInitDFileSetEx(&fset, (SDFileSet *)taosArrayGet(pStatus->df, i)); + if (fset.fid < pRepo->rtn.minFid) { + ++*nExpired; + } + tsdbDebug("vgId:%d scan DFileSet %d header", REPO_ID(pRepo), fset.fid); + + if (tsdbOpenDFileSet(&fset, O_RDWR) < 0) { + tsdbError("vgId:%d failed to open DFileSet %d since %s, continue", REPO_ID(pRepo), fset.fid, tstrerror(terrno)); + continue; + } + + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(&fset); ftype++) { + SDFile *pDFile = TSDB_DFILE_IN_SET(&fset, ftype); + + if ((tsdbLoadDFileHeader(pDFile, &info) < 0) || pDFile->info.size != info.size || + pDFile->info.magic != info.magic) { + if (tsdbUpdateDFileHeader(pDFile) < 0) { + tsdbError("vgId:%d failed to update DFile header of %s since %s, continue", REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pDFile), tstrerror(terrno)); + } else { + tsdbInfo("vgId:%d DFile header of %s is updated", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile)); + TSDB_FILE_FSYNC(pDFile); + } + } else { + tsdbDebug("vgId:%d DFile header of %s is correct", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile)); + } + } + + tsdbCloseDFileSet(&fset); + } +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbFile.c b/source/dnode/vnode/tsdb2/src/tsdbFile.c new file mode 100644 index 0000000000..77d172893e --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbFile.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbint.h" + +static const char *TSDB_FNAME_SUFFIX[] = { + "head", // TSDB_FILE_HEAD + "data", // TSDB_FILE_DATA + "last", // TSDB_FILE_LAST + "smad", // TSDB_FILE_SMA_DATA(Small Materialized Aggregate for .data File) + "smal", // TSDB_FILE_SMA_LAST(Small Materialized Aggregate for .last File) + "", // TSDB_FILE_MAX + "meta", // TSDB_FILE_META +}; + +static void tsdbGetFilename(int vid, int fid, uint32_t ver, TSDB_FILE_T ftype, char *fname); +static int tsdbRollBackMFile(SMFile *pMFile); +static int tsdbEncodeDFInfo(void **buf, SDFInfo *pInfo); +static void *tsdbDecodeDFInfo(void *buf, SDFInfo *pInfo, TSDB_FVER_TYPE sfver); +static int tsdbRollBackDFile(SDFile *pDFile); + +// ============== SMFile +void tsdbInitMFile(SMFile *pMFile, SDiskID did, int vid, uint32_t ver) { + char fname[TSDB_FILENAME_LEN]; + + TSDB_FILE_SET_STATE(pMFile, TSDB_FILE_STATE_OK); + + memset(&(pMFile->info), 0, sizeof(pMFile->info)); + pMFile->info.magic = TSDB_FILE_INIT_MAGIC; + + tsdbGetFilename(vid, 0, ver, TSDB_FILE_META, fname); + tfsInitFile(TSDB_FILE_F(pMFile), did.level, did.id, fname); +} + +void tsdbInitMFileEx(SMFile *pMFile, const SMFile *pOMFile) { + *pMFile = *pOMFile; + TSDB_FILE_SET_CLOSED(pMFile); +} + +int tsdbEncodeSMFile(void **buf, SMFile *pMFile) { + int tlen = 0; + + tlen += tsdbEncodeMFInfo(buf, &(pMFile->info)); + tlen += tfsEncodeFile(buf, &(pMFile->f)); + + return tlen; +} + +void *tsdbDecodeSMFile(void *buf, SMFile *pMFile) { + buf = tsdbDecodeMFInfo(buf, &(pMFile->info)); + buf = tfsDecodeFile(buf, &(pMFile->f)); + TSDB_FILE_SET_CLOSED(pMFile); + + return buf; +} + +int tsdbEncodeSMFileEx(void **buf, SMFile *pMFile) { + int tlen = 0; + + tlen += tsdbEncodeMFInfo(buf, &(pMFile->info)); + tlen += taosEncodeString(buf, TSDB_FILE_FULL_NAME(pMFile)); + + return tlen; +} + +void *tsdbDecodeSMFileEx(void *buf, SMFile *pMFile) { + char *aname; + buf = tsdbDecodeMFInfo(buf, &(pMFile->info)); + buf = taosDecodeString(buf, &aname); + tstrncpy(TSDB_FILE_FULL_NAME(pMFile), aname, TSDB_FILENAME_LEN); + TSDB_FILE_SET_CLOSED(pMFile); + + tfree(aname); + + return buf; +} + +int tsdbApplyMFileChange(SMFile *from, SMFile *to) { + if (from == NULL && to == NULL) return 0; + + if (from != NULL) { + if (to == NULL) { + return tsdbRemoveMFile(from); + } else { + if (tfsIsSameFile(TSDB_FILE_F(from), TSDB_FILE_F(to))) { + if (from->info.size > to->info.size) { + tsdbRollBackMFile(to); + } + } else { + return tsdbRemoveMFile(from); + } + } + } + + return 0; +} + +int tsdbCreateMFile(SMFile *pMFile, bool updateHeader) { + ASSERT(pMFile->info.size == 0 && pMFile->info.magic == TSDB_FILE_INIT_MAGIC); + + pMFile->fd = open(TSDB_FILE_FULL_NAME(pMFile), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0755); + if (pMFile->fd < 0) { + if (errno == ENOENT) { + // Try to create directory recursively + char *s = strdup(TFILE_REL_NAME(&(pMFile->f))); + if (tfsMkdirRecurAt(dirname(s), TSDB_FILE_LEVEL(pMFile), TSDB_FILE_ID(pMFile)) < 0) { + tfree(s); + return -1; + } + tfree(s); + + pMFile->fd = open(TSDB_FILE_FULL_NAME(pMFile), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0755); + if (pMFile->fd < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + } else { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + } + + if (!updateHeader) { + return 0; + } + + pMFile->info.size += TSDB_FILE_HEAD_SIZE; + + if (tsdbUpdateMFileHeader(pMFile) < 0) { + tsdbCloseMFile(pMFile); + tsdbRemoveMFile(pMFile); + return -1; + } + + return 0; +} + +int tsdbUpdateMFileHeader(SMFile *pMFile) { + char buf[TSDB_FILE_HEAD_SIZE] = "\0"; + + if (tsdbSeekMFile(pMFile, 0, SEEK_SET) < 0) { + return -1; + } + + void *ptr = buf; + tsdbEncodeMFInfo(&ptr, TSDB_FILE_INFO(pMFile)); + + taosCalcChecksumAppend(0, (uint8_t *)buf, TSDB_FILE_HEAD_SIZE); + if (tsdbWriteMFile(pMFile, buf, TSDB_FILE_HEAD_SIZE) < 0) { + return -1; + } + + return 0; +} + +int tsdbLoadMFileHeader(SMFile *pMFile, SMFInfo *pInfo) { + char buf[TSDB_FILE_HEAD_SIZE] = "\0"; + + ASSERT(TSDB_FILE_OPENED(pMFile)); + + if (tsdbSeekMFile(pMFile, 0, SEEK_SET) < 0) { + return -1; + } + + if (tsdbReadMFile(pMFile, buf, TSDB_FILE_HEAD_SIZE) < 0) { + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)buf, TSDB_FILE_HEAD_SIZE)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return -1; + } + + tsdbDecodeMFInfo(buf, pInfo); + return 0; +} + +int tsdbScanAndTryFixMFile(STsdbRepo *pRepo) { + SMFile * pMFile = pRepo->fs->cstatus->pmf; + struct stat mfstat; + SMFile mf; + + if (pMFile == NULL) { + // No meta file, no need to scan + return 0; + } + + tsdbInitMFileEx(&mf, pMFile); + + if (access(TSDB_FILE_FULL_NAME(pMFile), F_OK) != 0) { + tsdbError("vgId:%d meta file %s not exist, report to upper layer to fix it", REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pMFile)); + pRepo->state |= TSDB_STATE_BAD_META; + TSDB_FILE_SET_STATE(pMFile, TSDB_FILE_STATE_BAD); + return 0; + } + + if (stat(TSDB_FILE_FULL_NAME(&mf), &mfstat) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + if (pMFile->info.size < mfstat.st_size) { + if (tsdbOpenMFile(&mf, O_WRONLY) < 0) { + return -1; + } + + if (taosFtruncate(mf.fd, mf.info.size) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbCloseMFile(&mf); + return -1; + } + + if (tsdbUpdateMFileHeader(&mf) < 0) { + tsdbCloseMFile(&mf); + return -1; + } + + tsdbCloseMFile(&mf); + tsdbInfo("vgId:%d file %s is truncated from %" PRId64 " to %" PRId64, REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), + mfstat.st_size, pMFile->info.size); + } else if (pMFile->info.size > mfstat.st_size) { + tsdbError("vgId:%d meta file %s has wrong size %" PRId64 " expected %" PRId64 ", report to upper layer to fix it", + REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile), mfstat.st_size, pMFile->info.size); + pRepo->state |= TSDB_STATE_BAD_META; + TSDB_FILE_SET_STATE(pMFile, TSDB_FILE_STATE_BAD); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return 0; + } else { + tsdbDebug("vgId:%d meta file %s passes the scan", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pMFile)); + } + + return 0; +} + +int tsdbEncodeMFInfo(void **buf, SMFInfo *pInfo) { + int tlen = 0; + + tlen += taosEncodeVariantI64(buf, pInfo->size); + tlen += taosEncodeVariantI64(buf, pInfo->tombSize); + tlen += taosEncodeVariantI64(buf, pInfo->nRecords); + tlen += taosEncodeVariantI64(buf, pInfo->nDels); + tlen += taosEncodeFixedU32(buf, pInfo->magic); + + return tlen; +} + +void *tsdbDecodeMFInfo(void *buf, SMFInfo *pInfo) { + buf = taosDecodeVariantI64(buf, &(pInfo->size)); + buf = taosDecodeVariantI64(buf, &(pInfo->tombSize)); + buf = taosDecodeVariantI64(buf, &(pInfo->nRecords)); + buf = taosDecodeVariantI64(buf, &(pInfo->nDels)); + buf = taosDecodeFixedU32(buf, &(pInfo->magic)); + + return buf; +} + +static int tsdbRollBackMFile(SMFile *pMFile) { + SMFile mf; + + tsdbInitMFileEx(&mf, pMFile); + + if (tsdbOpenMFile(&mf, O_WRONLY) < 0) { + return -1; + } + + if (taosFtruncate(TSDB_FILE_FD(&mf), pMFile->info.size) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbCloseMFile(&mf); + return -1; + } + + if (tsdbUpdateMFileHeader(&mf) < 0) { + tsdbCloseMFile(&mf); + return -1; + } + + TSDB_FILE_FSYNC(&mf); + + tsdbCloseMFile(&mf); + return 0; +} + +// ============== Operations on SDFile +void tsdbInitDFile(SDFile *pDFile, SDiskID did, int vid, int fid, uint32_t ver, TSDB_FILE_T ftype) { + char fname[TSDB_FILENAME_LEN]; + + TSDB_FILE_SET_STATE(pDFile, TSDB_FILE_STATE_OK); + + TSDB_FILE_SET_CLOSED(pDFile); + + memset(&(pDFile->info), 0, sizeof(pDFile->info)); + pDFile->info.magic = TSDB_FILE_INIT_MAGIC; + pDFile->info.fver = tsdbGetDFSVersion(ftype); + + tsdbGetFilename(vid, fid, ver, ftype, fname); + tfsInitFile(&(pDFile->f), did.level, did.id, fname); +} + +void tsdbInitDFileEx(SDFile *pDFile, SDFile *pODFile) { + *pDFile = *pODFile; + TSDB_FILE_SET_CLOSED(pDFile); +} + +int tsdbEncodeSDFile(void **buf, SDFile *pDFile) { + int tlen = 0; + + tlen += tsdbEncodeDFInfo(buf, &(pDFile->info)); + tlen += tfsEncodeFile(buf, &(pDFile->f)); + + return tlen; +} + +void *tsdbDecodeSDFile(void *buf, SDFile *pDFile, uint32_t sfver) { + buf = tsdbDecodeDFInfo(buf, &(pDFile->info), sfver); + buf = tfsDecodeFile(buf, &(pDFile->f)); + TSDB_FILE_SET_CLOSED(pDFile); + + return buf; +} + +static int tsdbEncodeSDFileEx(void **buf, SDFile *pDFile) { + int tlen = 0; + + tlen += tsdbEncodeDFInfo(buf, &(pDFile->info)); + tlen += taosEncodeString(buf, TSDB_FILE_FULL_NAME(pDFile)); + + return tlen; +} + +static void *tsdbDecodeSDFileEx(void *buf, SDFile *pDFile) { + char *aname; + // The sync module would send DFileSet with latest verion. + buf = tsdbDecodeDFInfo(buf, &(pDFile->info), TSDB_LATEST_SFS_VER); + buf = taosDecodeString(buf, &aname); + tstrncpy(TSDB_FILE_FULL_NAME(pDFile), aname, TSDB_FILENAME_LEN); + TSDB_FILE_SET_CLOSED(pDFile); + tfree(aname); + + return buf; +} + +int tsdbCreateDFile(SDFile *pDFile, bool updateHeader, TSDB_FILE_T fType) { + ASSERT(pDFile->info.size == 0 && pDFile->info.magic == TSDB_FILE_INIT_MAGIC); + + pDFile->fd = open(TSDB_FILE_FULL_NAME(pDFile), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0755); + if (pDFile->fd < 0) { + if (errno == ENOENT) { + // Try to create directory recursively + char *s = strdup(TFILE_REL_NAME(&(pDFile->f))); + if (tfsMkdirRecurAt(dirname(s), TSDB_FILE_LEVEL(pDFile), TSDB_FILE_ID(pDFile)) < 0) { + tfree(s); + return -1; + } + tfree(s); + + pDFile->fd = open(TSDB_FILE_FULL_NAME(pDFile), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0755); + if (pDFile->fd < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + } else { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + } + + if (!updateHeader) { + return 0; + } + + pDFile->info.size += TSDB_FILE_HEAD_SIZE; + pDFile->info.fver = tsdbGetDFSVersion(fType); + + if (tsdbUpdateDFileHeader(pDFile) < 0) { + tsdbCloseDFile(pDFile); + tsdbRemoveDFile(pDFile); + return -1; + } + + return 0; +} + +int tsdbUpdateDFileHeader(SDFile *pDFile) { + char buf[TSDB_FILE_HEAD_SIZE] = "\0"; + + if (tsdbSeekDFile(pDFile, 0, SEEK_SET) < 0) { + return -1; + } + + void *ptr = buf; + tsdbEncodeDFInfo(&ptr, &(pDFile->info)); + + taosCalcChecksumAppend(0, (uint8_t *)buf, TSDB_FILE_HEAD_SIZE); + if (tsdbWriteDFile(pDFile, buf, TSDB_FILE_HEAD_SIZE) < 0) { + return -1; + } + + return 0; +} + +int tsdbLoadDFileHeader(SDFile *pDFile, SDFInfo *pInfo) { + char buf[TSDB_FILE_HEAD_SIZE] = "\0"; + // uint32_t _version; + + ASSERT(TSDB_FILE_OPENED(pDFile)); + + if (tsdbSeekDFile(pDFile, 0, SEEK_SET) < 0) { + return -1; + } + + if (tsdbReadDFile(pDFile, buf, TSDB_FILE_HEAD_SIZE) < 0) { + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)buf, TSDB_FILE_HEAD_SIZE)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return -1; + } + + void *pBuf = buf; + pBuf = tsdbDecodeDFInfo(pBuf, pInfo, TSDB_LATEST_FVER); // only make sure the parameter sfver > 0 + return 0; +} + +static int tsdbScanAndTryFixDFile(STsdbRepo *pRepo, SDFile *pDFile) { + struct stat dfstat; + SDFile df; + + tsdbInitDFileEx(&df, pDFile); + + if (access(TSDB_FILE_FULL_NAME(pDFile), F_OK) != 0) { + tsdbError("vgId:%d data file %s not exist, report to upper layer to fix it", REPO_ID(pRepo), + TSDB_FILE_FULL_NAME(pDFile)); + pRepo->state |= TSDB_STATE_BAD_DATA; + TSDB_FILE_SET_STATE(pDFile, TSDB_FILE_STATE_BAD); + return 0; + } + + if (stat(TSDB_FILE_FULL_NAME(&df), &dfstat) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + if (pDFile->info.size < dfstat.st_size) { + if (tsdbOpenDFile(&df, O_WRONLY) < 0) { + return -1; + } + + if (taosFtruncate(df.fd, df.info.size) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbCloseDFile(&df); + return -1; + } + + if (tsdbUpdateDFileHeader(&df) < 0) { + tsdbCloseDFile(&df); + return -1; + } + + tsdbCloseDFile(&df); + tsdbInfo("vgId:%d file %s is truncated from %" PRId64 " to %" PRId64, REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile), + dfstat.st_size, pDFile->info.size); + } else if (pDFile->info.size > dfstat.st_size) { + tsdbError("vgId:%d data file %s has wrong size %" PRId64 " expected %" PRId64 ", report to upper layer to fix it", + REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile), dfstat.st_size, pDFile->info.size); + pRepo->state |= TSDB_STATE_BAD_DATA; + TSDB_FILE_SET_STATE(pDFile, TSDB_FILE_STATE_BAD); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return 0; + } else { + tsdbDebug("vgId:%d file %s passes the scan", REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile)); + } + + return 0; +} + +static int tsdbEncodeDFInfo(void **buf, SDFInfo *pInfo) { + int tlen = 0; + tlen += taosEncodeFixedU32(buf, pInfo->fver); + tlen += taosEncodeFixedU32(buf, pInfo->magic); + tlen += taosEncodeFixedU32(buf, pInfo->len); + tlen += taosEncodeFixedU32(buf, pInfo->totalBlocks); + tlen += taosEncodeFixedU32(buf, pInfo->totalSubBlocks); + tlen += taosEncodeFixedU32(buf, pInfo->offset); + tlen += taosEncodeFixedU64(buf, pInfo->size); + tlen += taosEncodeFixedU64(buf, pInfo->tombSize); + + return tlen; +} + +static void *tsdbDecodeDFInfo(void *buf, SDFInfo *pInfo, TSDB_FVER_TYPE sfver) { + if (sfver > TSDB_FS_VER_0) { + buf = taosDecodeFixedU32(buf, &(pInfo->fver)); + } else { + pInfo->fver = TSDB_FS_VER_0; // default value + } + buf = taosDecodeFixedU32(buf, &(pInfo->magic)); + buf = taosDecodeFixedU32(buf, &(pInfo->len)); + buf = taosDecodeFixedU32(buf, &(pInfo->totalBlocks)); + buf = taosDecodeFixedU32(buf, &(pInfo->totalSubBlocks)); + buf = taosDecodeFixedU32(buf, &(pInfo->offset)); + buf = taosDecodeFixedU64(buf, &(pInfo->size)); + buf = taosDecodeFixedU64(buf, &(pInfo->tombSize)); + + return buf; +} + +static int tsdbApplyDFileChange(SDFile *from, SDFile *to) { + ASSERT(from != NULL || to != NULL); + + if (from != NULL) { + if (to == NULL) { + tsdbRemoveDFile(from); + } else { + if (tfsIsSameFile(TSDB_FILE_F(from), TSDB_FILE_F(to))) { + if (from->info.size > to->info.size) { + tsdbRollBackDFile(to); + } + } else { + (void)tsdbRemoveDFile(from); + } + } + } + + return 0; +} + +static int tsdbRollBackDFile(SDFile *pDFile) { + SDFile df = *pDFile; + + if (tsdbOpenDFile(&df, O_WRONLY) < 0) { + return -1; + } + + if (taosFtruncate(TSDB_FILE_FD(&df), pDFile->info.size) < 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbCloseDFile(&df); + return -1; + } + + if (tsdbUpdateDFileHeader(&df) < 0) { + tsdbCloseDFile(&df); + return -1; + } + + TSDB_FILE_FSYNC(&df); + + tsdbCloseDFile(&df); + return 0; +} + +// ============== Operations on SDFileSet +void tsdbInitDFileSet(SDFileSet *pSet, SDiskID did, int vid, int fid, uint32_t ver, uint16_t fsetVer) { + pSet->fid = fid; + pSet->state = 0; + pSet->ver = fsetVer; + + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + SDFile *pDFile = TSDB_DFILE_IN_SET(pSet, ftype); + tsdbInitDFile(pDFile, did, vid, fid, ver, ftype); + } +} + +void tsdbInitDFileSetEx(SDFileSet *pSet, SDFileSet *pOSet) { + ASSERT_TSDB_FSET_NFILES_VALID(pOSet); + pSet->fid = pOSet->fid; + pSet->ver = pOSet->ver; + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + tsdbInitDFileEx(TSDB_DFILE_IN_SET(pSet, ftype), TSDB_DFILE_IN_SET(pOSet, ftype)); + } +} + +int tsdbEncodeDFileSet(void **buf, SDFileSet *pSet) { + int tlen = 0; + + tlen += taosEncodeFixedI32(buf, pSet->fid); + tlen += taosEncodeFixedU16(buf, pSet->ver); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + tlen += tsdbEncodeSDFile(buf, TSDB_DFILE_IN_SET(pSet, ftype)); + } + + return tlen; +} + +void *tsdbDecodeDFileSet(void *buf, SDFileSet *pSet, uint32_t sfver) { + int32_t fid; + + buf = taosDecodeFixedI32(buf, &(fid)); + pSet->state = 0; + pSet->fid = fid; + + if (sfver > TSDB_FS_VER_0) { + buf = taosDecodeFixedU16(buf, &(pSet->ver)); + } + + ASSERT_TSDB_FSET_NFILES_VALID(pSet); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + buf = tsdbDecodeSDFile(buf, TSDB_DFILE_IN_SET(pSet, ftype), sfver); + } + return buf; +} + +int tsdbEncodeDFileSetEx(void **buf, SDFileSet *pSet) { + int tlen = 0; + + tlen += taosEncodeFixedI32(buf, pSet->fid); + tlen += taosEncodeFixedU16(buf, pSet->ver); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + tlen += tsdbEncodeSDFileEx(buf, TSDB_DFILE_IN_SET(pSet, ftype)); + } + + return tlen; +} + +void *tsdbDecodeDFileSetEx(void *buf, SDFileSet *pSet) { + int32_t fid; + + buf = taosDecodeFixedI32(buf, &(fid)); + buf = taosDecodeFixedU16(buf, &(pSet->ver)); + pSet->fid = fid; + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + buf = tsdbDecodeSDFileEx(buf, TSDB_DFILE_IN_SET(pSet, ftype)); + } + return buf; +} + +int tsdbApplyDFileSetChange(SDFileSet *from, SDFileSet *to) { + uint8_t nFilesFrom = from ? tsdbGetNFiles(from) : 0; + uint8_t nFilesTo = to ? tsdbGetNFiles(to) : 0; + for (TSDB_FILE_T ftype = 0; ftype < MAX(nFilesFrom, nFilesTo); ftype++) { + SDFile *pDFileFrom = ftype < nFilesFrom ? TSDB_DFILE_IN_SET(from, ftype) : NULL; + SDFile *pDFileTo = ftype < nFilesTo ? TSDB_DFILE_IN_SET(to, ftype) : NULL; + if (tsdbApplyDFileChange(pDFileFrom, pDFileTo) < 0) { + return -1; + } + } + + return 0; +} + +int tsdbCreateDFileSet(SDFileSet *pSet, bool updateHeader) { + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + if (tsdbCreateDFile(TSDB_DFILE_IN_SET(pSet, ftype), updateHeader, ftype) < 0) { + tsdbCloseDFileSet(pSet); + tsdbRemoveDFileSet(pSet); + return -1; + } + } + + return 0; +} + +int tsdbUpdateDFileSetHeader(SDFileSet *pSet) { + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + if (tsdbUpdateDFileHeader(TSDB_DFILE_IN_SET(pSet, ftype)) < 0) { + return -1; + } + } + return 0; +} + +int tsdbScanAndTryFixDFileSet(STsdbRepo *pRepo, SDFileSet *pSet) { + ASSERT_TSDB_FSET_NFILES_VALID(pSet); + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + if (tsdbScanAndTryFixDFile(pRepo, TSDB_DFILE_IN_SET(pSet, ftype)) < 0) { + return -1; + } + } + return 0; +} + +int tsdbParseDFilename(const char *fname, int *vid, int *fid, TSDB_FILE_T *ftype, uint32_t *_version) { + char *p = NULL; + *_version = 0; + *ftype = TSDB_FILE_MAX; + + sscanf(fname, "v%df%d.%m[a-z]-ver%" PRIu32, vid, fid, &p, _version); + for (TSDB_FILE_T i = 0; i < TSDB_FILE_MAX; i++) { + if (strcmp(p, TSDB_FNAME_SUFFIX[i]) == 0) { + *ftype = i; + break; + } + } + + tfree(p); + return 0; +} + +static void tsdbGetFilename(int vid, int fid, uint32_t ver, TSDB_FILE_T ftype, char *fname) { + ASSERT(ftype != TSDB_FILE_MAX); + + if (ftype < TSDB_FILE_MAX) { + if (ver == 0) { + snprintf(fname, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb/data/v%df%d.%s", vid, vid, fid, TSDB_FNAME_SUFFIX[ftype]); + } else { + snprintf(fname, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb/data/v%df%d.%s-ver%" PRIu32, vid, vid, fid, + TSDB_FNAME_SUFFIX[ftype], ver); + } + } else { + if (ver == 0) { + snprintf(fname, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb/%s", vid, TSDB_FNAME_SUFFIX[ftype]); + } else { + snprintf(fname, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb/%s-ver%" PRIu32, vid, TSDB_FNAME_SUFFIX[ftype], ver); + } + } +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbHealth.c b/source/dnode/vnode/tsdb2/src/tsdbHealth.c new file mode 100644 index 0000000000..8198c48033 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbHealth.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "os.h" +#include "taosmsg.h" +#include "tarray.h" +#include "query.h" +#include "tglobal.h" +#include "tlist.h" +#include "tsdbint.h" +#include "tsdbBuffer.h" +#include "tsdbLog.h" +#include "tsdbHealth.h" +#include "ttimer.h" +#include "tthread.h" + + +// return malloc new block count +int32_t tsdbInsertNewBlock(STsdbRepo * pRepo) { + STsdbBufPool *pPool = pRepo->pPool; + int32_t cnt = 0; + + if(tsdbAllowNewBlock(pRepo)) { + STsdbBufBlock *pBufBlock = tsdbNewBufBlock(pPool->bufBlockSize); + if (pBufBlock) { + if (tdListAppend(pPool->bufBlockList, (void *)(&pBufBlock)) < 0) { + // append error + tsdbFreeBufBlock(pBufBlock); + } else { + pPool->nElasticBlocks ++; + cnt ++ ; + } + } + } + return cnt; +} + +// switch anther thread to run +void* cbKillQueryFree(void* param) { + STsdbRepo* pRepo = (STsdbRepo*)param; + // vnode + if(pRepo->appH.notifyStatus) { + pRepo->appH.notifyStatus(pRepo->appH.appH, TSDB_STATUS_COMMIT_NOBLOCK, TSDB_CODE_SUCCESS); + } + + // free + if(pRepo->pthread){ + void* p = pRepo->pthread; + pRepo->pthread = NULL; + free(p); + } + + return NULL; +} + +// return true do free , false do nothing +bool tsdbUrgeQueryFree(STsdbRepo * pRepo) { + // check previous running + if(pRepo->pthread && taosThreadRunning(pRepo->pthread)) { + tsdbWarn("vgId:%d pre urge thread is runing. nBlocks=%d nElasticBlocks=%d", REPO_ID(pRepo), pRepo->pPool->nBufBlocks, pRepo->pPool->nElasticBlocks); + return false; + } + // create new + pRepo->pthread = taosCreateThread(cbKillQueryFree, pRepo); + if(pRepo->pthread == NULL) { + tsdbError("vgId:%d create urge thread error.", REPO_ID(pRepo)); + return false; + } + return true; +} + +bool tsdbAllowNewBlock(STsdbRepo* pRepo) { + int32_t nMaxElastic = pRepo->config.totalBlocks/3; + STsdbBufPool* pPool = pRepo->pPool; + if(pPool->nElasticBlocks >= nMaxElastic) { + tsdbWarn("vgId:%d tsdbAllowNewBlock return fasle. nElasticBlock(%d) >= MaxElasticBlocks(%d)", REPO_ID(pRepo), pPool->nElasticBlocks, nMaxElastic); + return false; + } + return true; +} + +bool tsdbNoProblem(STsdbRepo* pRepo) { + if(listNEles(pRepo->pPool->bufBlockList) == 0) + return false; + return true; +} \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/src/tsdbMain.c b/source/dnode/vnode/tsdb2/src/tsdbMain.c new file mode 100644 index 0000000000..45872b1dce --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbMain.c @@ -0,0 +1,1143 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +// no test file errors here +#include "taosdef.h" +#include "tsdbint.h" +#include "ttimer.h" +#include "tthread.h" + +#define IS_VALID_PRECISION(precision) \ + (((precision) >= TSDB_TIME_PRECISION_MILLI) && ((precision) <= TSDB_TIME_PRECISION_NANO)) +#define TSDB_DEFAULT_COMPRESSION TWO_STAGE_COMP +#define IS_VALID_COMPRESSION(compression) (((compression) >= NO_COMPRESSION) && ((compression) <= TWO_STAGE_COMP)) + +static int32_t tsdbCheckAndSetDefaultCfg(STsdbCfg *pCfg); +static STsdbRepo *tsdbNewRepo(STsdbCfg *pCfg, STsdbAppH *pAppH); +static void tsdbFreeRepo(STsdbRepo *pRepo); +static void tsdbStartStream(STsdbRepo *pRepo); +static void tsdbStopStream(STsdbRepo *pRepo); +static int tsdbRestoreLastColumns(STsdbRepo *pRepo, STable *pTable, SReadH* pReadh); +static int tsdbRestoreLastRow(STsdbRepo *pRepo, STable *pTable, SReadH* pReadh, SBlockIdx *pIdx); + +// Function declaration +int32_t tsdbCreateRepo(int repoid) { + char tsdbDir[TSDB_FILENAME_LEN] = "\0"; + char dataDir[TSDB_FILENAME_LEN] = "\0"; + + tsdbGetRootDir(repoid, tsdbDir); + if (tfsMkdir(tsdbDir) < 0) { + goto _err; + } + + tsdbGetDataDir(repoid, dataDir); + if (tfsMkdir(dataDir) < 0) { + goto _err; + } + + // TODO: need to create current file with nothing in + + return 0; + +_err: + tsdbError("vgId:%d failed to create TSDB repository since %s", repoid, tstrerror(terrno)); + return -1; +} + +int32_t tsdbDropRepo(int repoid) { + char tsdbDir[TSDB_FILENAME_LEN] = "\0"; + + tsdbGetRootDir(repoid, tsdbDir); + return tfsRmdir(tsdbDir); +} + +STsdbRepo *tsdbOpenRepo(STsdbCfg *pCfg, STsdbAppH *pAppH) { + STsdbRepo *pRepo; + STsdbCfg config = *pCfg; + + terrno = TSDB_CODE_SUCCESS; + + // Check and set default configurations + if (tsdbCheckAndSetDefaultCfg(&config) < 0) { + tsdbError("vgId:%d failed to open TSDB repository since %s", config.tsdbId, tstrerror(terrno)); + return NULL; + } + + // Create new TSDB object + if ((pRepo = tsdbNewRepo(&config, pAppH)) == NULL) { + tsdbError("vgId:%d failed to open TSDB repository while creating TSDB object since %s", config.tsdbId, + tstrerror(terrno)); + return NULL; + } + + // Open meta + if (tsdbOpenMeta(pRepo) < 0) { + tsdbError("vgId:%d failed to open TSDB repository while opening Meta since %s", config.tsdbId, tstrerror(terrno)); + tsdbCloseRepo(pRepo, false); + return NULL; + } + + if (tsdbOpenBufPool(pRepo) < 0) { + tsdbError("vgId:%d failed to open TSDB repository while opening buffer pool since %s", config.tsdbId, + tstrerror(terrno)); + tsdbCloseRepo(pRepo, false); + return NULL; + } + + if (tsdbOpenFS(pRepo) < 0) { + tsdbError("vgId:%d failed to open TSDB repository while opening FS since %s", config.tsdbId, tstrerror(terrno)); + tsdbCloseRepo(pRepo, false); + return NULL; + } + + // TODO: Restore information from data + if ((!(pRepo->state & TSDB_STATE_BAD_DATA)) && tsdbRestoreInfo(pRepo) < 0) { + tsdbError("vgId:%d failed to open TSDB repository while restore info since %s", config.tsdbId, tstrerror(terrno)); + tsdbCloseRepo(pRepo, false); + return NULL; + } + + pRepo->mergeBuf = NULL; + + tsdbStartStream(pRepo); + + tsdbDebug("vgId:%d, TSDB repository opened", REPO_ID(pRepo)); + + return pRepo; +} + +// Note: all working thread and query thread must stopped when calling this function +int tsdbCloseRepo(STsdbRepo *repo, int toCommit) { + if (repo == NULL) return 0; + + STsdbRepo *pRepo = repo; + int vgId = REPO_ID(pRepo); + + terrno = TSDB_CODE_SUCCESS; + + tsdbStopStream(pRepo); + if(pRepo->pthread){ + taosDestoryThread(pRepo->pthread); + pRepo->pthread = NULL; + } + + if (toCommit) { + tsdbSyncCommit(repo); + } + + tsem_wait(&(pRepo->readyToCommit)); + + tsdbUnRefMemTable(pRepo, pRepo->mem); + tsdbUnRefMemTable(pRepo, pRepo->imem); + pRepo->mem = NULL; + pRepo->imem = NULL; + + tsdbCloseFS(pRepo); + tsdbCloseBufPool(pRepo); + tsdbCloseMeta(pRepo); + tsdbFreeRepo(pRepo); + tsdbDebug("vgId:%d repository is closed", vgId); + + if (terrno != TSDB_CODE_SUCCESS) { + return -1; + } else { + return 0; + } +} + +STsdbCfg *tsdbGetCfg(const STsdbRepo *repo) { + ASSERT(repo != NULL); + return &((STsdbRepo *)repo)->config; +} + +int tsdbLockRepo(STsdbRepo *pRepo) { + int code = pthread_mutex_lock(&pRepo->mutex); + if (code != 0) { + tsdbError("vgId:%d failed to lock tsdb since %s", REPO_ID(pRepo), strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + pRepo->repoLocked = true; + return 0; +} + +int tsdbUnlockRepo(STsdbRepo *pRepo) { + ASSERT(IS_REPO_LOCKED(pRepo)); + pRepo->repoLocked = false; + int code = pthread_mutex_unlock(&pRepo->mutex); + if (code != 0) { + tsdbError("vgId:%d failed to unlock tsdb since %s", REPO_ID(pRepo), strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + return 0; +} + +int tsdbCheckWal(STsdbRepo *pRepo, uint32_t walSize) { // MB + STsdbCfg *pCfg = &(pRepo->config); + if ((walSize > tsdbWalFlushSize) && (walSize > (pCfg->totalBlocks / 2 * pCfg->cacheBlockSize))) { + if (tsdbAsyncCommit(pRepo) < 0) return -1; + } + return 0; +} + +int tsdbCheckCommit(STsdbRepo *pRepo) { + ASSERT(pRepo->mem != NULL); + STsdbCfg *pCfg = &(pRepo->config); + + STsdbBufBlock *pBufBlock = tsdbGetCurrBufBlock(pRepo); + ASSERT(pBufBlock != NULL); + if ((pRepo->mem->extraBuffList != NULL) || + ((listNEles(pRepo->mem->bufBlockList) >= pCfg->totalBlocks / 3) && (pBufBlock->remain < TSDB_BUFFER_RESERVE))) { + // trigger commit + if (tsdbAsyncCommit(pRepo) < 0) return -1; + } + return 0; +} + +STsdbMeta *tsdbGetMeta(STsdbRepo *pRepo) { return pRepo->tsdbMeta; } + +STsdbRepoInfo *tsdbGetStatus(STsdbRepo *pRepo) { return NULL; } + +int tsdbGetState(STsdbRepo *repo) { return repo->state; } + +int8_t tsdbGetCompactState(STsdbRepo *repo) { return (int8_t)(repo->compactState); } + +void tsdbReportStat(void *repo, int64_t *totalPoints, int64_t *totalStorage, int64_t *compStorage) { + ASSERT(repo != NULL); + STsdbRepo *pRepo = repo; + *totalPoints = pRepo->stat.pointsWritten; + *totalStorage = pRepo->stat.totalStorage; + *compStorage = pRepo->stat.compStorage; +} + +int32_t tsdbConfigRepo(STsdbRepo *repo, STsdbCfg *pCfg) { + // TODO: think about multithread cases + if (tsdbCheckAndSetDefaultCfg(pCfg) < 0) return -1; + + STsdbCfg * pRCfg = &repo->config; + + ASSERT(pRCfg->tsdbId == pCfg->tsdbId); + ASSERT(pRCfg->cacheBlockSize == pCfg->cacheBlockSize); + ASSERT(pRCfg->daysPerFile == pCfg->daysPerFile); + ASSERT(pRCfg->minRowsPerFileBlock == pCfg->minRowsPerFileBlock); + ASSERT(pRCfg->maxRowsPerFileBlock == pCfg->maxRowsPerFileBlock); + ASSERT(pRCfg->precision == pCfg->precision); + + bool configChanged = false; + if (pRCfg->compression != pCfg->compression) { + configChanged = true; + } + if (pRCfg->keep != pCfg->keep) { + configChanged = true; + } + if (pRCfg->keep1 != pCfg->keep1) { + configChanged = true; + } + if (pRCfg->keep2 != pCfg->keep2) { + configChanged = true; + } + if (pRCfg->cacheLastRow != pCfg->cacheLastRow) { + configChanged = true; + } + if (pRCfg->totalBlocks != pCfg->totalBlocks) { + configChanged = true; + } + + if (!configChanged) { + tsdbError("vgId:%d no config changed", REPO_ID(repo)); + } + + int code = pthread_mutex_lock(&repo->save_mutex); + if (code != 0) { + tsdbError("vgId:%d failed to lock tsdb save config mutex since %s", REPO_ID(repo), strerror(errno)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + + STsdbCfg * pSaveCfg = &repo->save_config; + *pSaveCfg = repo->config; + + pSaveCfg->compression = pCfg->compression; + pSaveCfg->keep = pCfg->keep; + pSaveCfg->keep1 = pCfg->keep1; + pSaveCfg->keep2 = pCfg->keep2; + pSaveCfg->cacheLastRow = pCfg->cacheLastRow; + pSaveCfg->totalBlocks = pCfg->totalBlocks; + + tsdbInfo("vgId:%d old config: compression(%d), keep(%d,%d,%d), cacheLastRow(%d),totalBlocks(%d)", + REPO_ID(repo), + pRCfg->compression, pRCfg->keep, pRCfg->keep1,pRCfg->keep2, + pRCfg->cacheLastRow, pRCfg->totalBlocks); + tsdbInfo("vgId:%d new config: compression(%d), keep(%d,%d,%d), cacheLastRow(%d),totalBlocks(%d)", + REPO_ID(repo), + pSaveCfg->compression, pSaveCfg->keep,pSaveCfg->keep1, pSaveCfg->keep2, + pSaveCfg->cacheLastRow,pSaveCfg->totalBlocks); + + repo->config_changed = true; + + pthread_mutex_unlock(&repo->save_mutex); + + // schedule a commit msg and wait for the new config applied + tsdbSyncCommitConfig(repo); + + return 0; +#if 0 + STsdbRepo *pRepo = (STsdbRepo *)repo; + STsdbCfg config = pRepo->config; + STsdbCfg * pRCfg = &pRepo->config; + + if (tsdbCheckAndSetDefaultCfg(pCfg) < 0) return -1; + + ASSERT(pRCfg->tsdbId == pCfg->tsdbId); + ASSERT(pRCfg->cacheBlockSize == pCfg->cacheBlockSize); + ASSERT(pRCfg->daysPerFile == pCfg->daysPerFile); + ASSERT(pRCfg->minRowsPerFileBlock == pCfg->minRowsPerFileBlock); + ASSERT(pRCfg->maxRowsPerFileBlock == pCfg->maxRowsPerFileBlock); + ASSERT(pRCfg->precision == pCfg->precision); + + bool configChanged = false; + if (pRCfg->compression != pCfg->compression) { + tsdbAlterCompression(pRepo, pCfg->compression); + config.compression = pCfg->compression; + configChanged = true; + } + if (pRCfg->keep != pCfg->keep) { + if (tsdbAlterKeep(pRepo, pCfg->keep) < 0) { + tsdbError("vgId:%d failed to configure repo when alter keep since %s", REPO_ID(pRepo), tstrerror(terrno)); + config.keep = pCfg->keep; + return -1; + } + configChanged = true; + } + if (pRCfg->totalBlocks != pCfg->totalBlocks) { + tsdbAlterCacheTotalBlocks(pRepo, pCfg->totalBlocks); + config.totalBlocks = pCfg->totalBlocks; + configChanged = true; + } + if (pRCfg->cacheLastRow != pCfg->cacheLastRow) { + config.cacheLastRow = pCfg->cacheLastRow; + configChanged = true; + } + + if (configChanged) { + if (tsdbSaveConfig(pRepo->rootDir, &config) < 0) { + tsdbError("vgId:%d failed to configure repository while save config since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } + + return 0; +#endif +} + +uint32_t tsdbGetFileInfo(STsdbRepo *repo, char *name, uint32_t *index, uint32_t eindex, int64_t *size) { + // TODO + return 0; +#if 0 + STsdbRepo *pRepo = (STsdbRepo *)repo; + // STsdbMeta *pMeta = pRepo->tsdbMeta; + STsdbFileH *pFileH = pRepo->tsdbFileH; + uint32_t magic = 0; + char * fname = NULL; + + struct stat fState; + + tsdbDebug("vgId:%d name:%s index:%d eindex:%d", pRepo->config.tsdbId, name, *index, eindex); + ASSERT(*index <= eindex); + + if (name[0] == 0) { // get the file from index or after, but not larger than eindex + int fid = (*index) / TSDB_FILE_TYPE_MAX; + + if (pFileH->nFGroups == 0 || fid > pFileH->pFGroup[pFileH->nFGroups - 1].fileId) { + if (*index <= TSDB_META_FILE_INDEX && TSDB_META_FILE_INDEX <= eindex) { + fname = tsdbGetMetaFileName(pRepo->rootDir); + *index = TSDB_META_FILE_INDEX; + magic = TSDB_META_FILE_MAGIC(pRepo->tsdbMeta); + sprintf(name, "tsdb/%s", TSDB_META_FILE_NAME); + } else { + return 0; + } + } else { + SFileGroup *pFGroup = + taosbsearch(&fid, pFileH->pFGroup, pFileH->nFGroups, sizeof(SFileGroup), keyFGroupCompFunc, TD_GE); + if (pFGroup->fileId == fid) { + SFile *pFile = &pFGroup->files[(*index) % TSDB_FILE_TYPE_MAX]; + fname = strdup(TSDB_FILE_NAME(pFile)); + magic = pFile->info.magic; + char *tfname = strdup(fname); + sprintf(name, "tsdb/%s/%s", TSDB_DATA_DIR_NAME, basename(tfname)); + tfree(tfname); + } else { + if ((pFGroup->fileId + 1) * TSDB_FILE_TYPE_MAX - 1 < (int)eindex) { + SFile *pFile = &pFGroup->files[0]; + fname = strdup(TSDB_FILE_NAME(pFile)); + *index = pFGroup->fileId * TSDB_FILE_TYPE_MAX; + magic = pFile->info.magic; + char *tfname = strdup(fname); + sprintf(name, "tsdb/%s/%s", TSDB_DATA_DIR_NAME, basename(tfname)); + tfree(tfname); + } else { + return 0; + } + } + } + } else { // get the named file at the specified index. If not there, return 0 + fname = malloc(256); + sprintf(fname, "%s/vnode/vnode%d/%s", TFS_PRIMARY_PATH(), REPO_ID(pRepo), name); + if (access(fname, F_OK) != 0) { + tfree(fname); + return 0; + } + if (*index == TSDB_META_FILE_INDEX) { // get meta file + tsdbGetStoreInfo(fname, &magic, size); + } else { + char tfname[TSDB_FILENAME_LEN] = "\0"; + sprintf(tfname, "vnode/vnode%d/tsdb/%s/%s", REPO_ID(pRepo), TSDB_DATA_DIR_NAME, basename(name)); + tsdbGetFileInfoImpl(tfname, &magic, size); + } + tfree(fname); + return magic; + } + + if (stat(fname, &fState) < 0) { + tfree(fname); + return 0; + } + + *size = fState.st_size; + // magic = *size; + + tfree(fname); + return magic; +#endif +} + +void tsdbGetRootDir(int repoid, char dirName[]) { + snprintf(dirName, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb", repoid); +} + +void tsdbGetDataDir(int repoid, char dirName[]) { + snprintf(dirName, TSDB_FILENAME_LEN, "vnode/vnode%d/tsdb/data", repoid); +} + +static int32_t tsdbCheckAndSetDefaultCfg(STsdbCfg *pCfg) { + // Check tsdbId + if (pCfg->tsdbId < 0) { + tsdbError("vgId:%d invalid vgroup ID", pCfg->tsdbId); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + + // Check precision + if (pCfg->precision == -1) { + pCfg->precision = TSDB_DEFAULT_PRECISION; + } else { + if (!IS_VALID_PRECISION(pCfg->precision)) { + tsdbError("vgId:%d invalid precision configuration %d", pCfg->tsdbId, pCfg->precision); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + // Check compression + if (pCfg->compression == -1) { + pCfg->compression = TSDB_DEFAULT_COMPRESSION; + } else { + if (!IS_VALID_COMPRESSION(pCfg->compression)) { + tsdbError("vgId:%d invalid compression configuration %d", pCfg->tsdbId, pCfg->precision); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + // Check daysPerFile + if (pCfg->daysPerFile == -1) { + pCfg->daysPerFile = TSDB_DEFAULT_DAYS_PER_FILE; + } else { + if (pCfg->daysPerFile < TSDB_MIN_DAYS_PER_FILE || pCfg->daysPerFile > TSDB_MAX_DAYS_PER_FILE) { + tsdbError( + "vgId:%d invalid daysPerFile configuration! daysPerFile %d TSDB_MIN_DAYS_PER_FILE %d TSDB_MAX_DAYS_PER_FILE " + "%d", + pCfg->tsdbId, pCfg->daysPerFile, TSDB_MIN_DAYS_PER_FILE, TSDB_MAX_DAYS_PER_FILE); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + // Check minRowsPerFileBlock and maxRowsPerFileBlock + if (pCfg->minRowsPerFileBlock == -1) { + pCfg->minRowsPerFileBlock = TSDB_DEFAULT_MIN_ROW_FBLOCK; + } else { + if (pCfg->minRowsPerFileBlock < TSDB_MIN_MIN_ROW_FBLOCK || pCfg->minRowsPerFileBlock > TSDB_MAX_MIN_ROW_FBLOCK) { + tsdbError( + "vgId:%d invalid minRowsPerFileBlock configuration! minRowsPerFileBlock %d TSDB_MIN_MIN_ROW_FBLOCK %d " + "TSDB_MAX_MIN_ROW_FBLOCK %d", + pCfg->tsdbId, pCfg->minRowsPerFileBlock, TSDB_MIN_MIN_ROW_FBLOCK, TSDB_MAX_MIN_ROW_FBLOCK); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + if (pCfg->maxRowsPerFileBlock == -1) { + pCfg->maxRowsPerFileBlock = TSDB_DEFAULT_MAX_ROW_FBLOCK; + } else { + if (pCfg->maxRowsPerFileBlock < TSDB_MIN_MAX_ROW_FBLOCK || pCfg->maxRowsPerFileBlock > TSDB_MAX_MAX_ROW_FBLOCK) { + tsdbError( + "vgId:%d invalid maxRowsPerFileBlock configuration! maxRowsPerFileBlock %d TSDB_MIN_MAX_ROW_FBLOCK %d " + "TSDB_MAX_MAX_ROW_FBLOCK %d", + pCfg->tsdbId, pCfg->maxRowsPerFileBlock, TSDB_MIN_MIN_ROW_FBLOCK, TSDB_MAX_MIN_ROW_FBLOCK); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + if (pCfg->minRowsPerFileBlock > pCfg->maxRowsPerFileBlock) { + tsdbError("vgId:%d invalid configuration! minRowsPerFileBlock %d maxRowsPerFileBlock %d", pCfg->tsdbId, + pCfg->minRowsPerFileBlock, pCfg->maxRowsPerFileBlock); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + + // Check keep +#if 0 // already checked and set in mnodeSetDefaultDbCfg + if (pCfg->keep == -1) { + pCfg->keep = TSDB_DEFAULT_KEEP; + } else { + if (pCfg->keep < TSDB_MIN_KEEP || pCfg->keep > TSDB_MAX_KEEP) { + tsdbError( + "vgId:%d invalid keep configuration! keep %d TSDB_MIN_KEEP %d " + "TSDB_MAX_KEEP %d", + pCfg->tsdbId, pCfg->keep, TSDB_MIN_KEEP, TSDB_MAX_KEEP); + terrno = TSDB_CODE_TDB_INVALID_CONFIG; + return -1; + } + } + + if (pCfg->keep1 == 0) { + pCfg->keep1 = pCfg->keep; + } + + if (pCfg->keep2 == 0) { + pCfg->keep2 = pCfg->keep; + } +#endif + + int32_t keepMin = pCfg->keep1; + int32_t keepMid = pCfg->keep2; + int32_t keepMax = pCfg->keep; + + if (keepMin > keepMid) { + SWAP(keepMin, keepMid, int32_t); + } + if (keepMin > keepMax) { + SWAP(keepMin, keepMax, int32_t); + } + if (keepMid > keepMax) { + SWAP(keepMid, keepMax, int32_t); + } + + pCfg->keep = keepMax; + pCfg->keep1 = keepMin; + pCfg->keep2 = keepMid; + // update check + if (pCfg->update < TD_ROW_DISCARD_UPDATE || pCfg->update > TD_ROW_PARTIAL_UPDATE) + pCfg->update = TD_ROW_DISCARD_UPDATE; + + // update cacheLastRow + if (pCfg->cacheLastRow != 0) { + if (pCfg->cacheLastRow > 3) + pCfg->cacheLastRow = 1; + } + return 0; +} + +static STsdbRepo *tsdbNewRepo(STsdbCfg *pCfg, STsdbAppH *pAppH) { + STsdbRepo *pRepo = (STsdbRepo *)calloc(1, sizeof(*pRepo)); + if (pRepo == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + pRepo->state = TSDB_STATE_OK; + pRepo->code = TSDB_CODE_SUCCESS; + pRepo->compactState = 0; + pRepo->config = *pCfg; + if (pAppH) { + pRepo->appH = *pAppH; + } + pRepo->repoLocked = false; + pRepo->pthread = NULL; + + int code = pthread_mutex_init(&(pRepo->mutex), NULL); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + tsdbFreeRepo(pRepo); + return NULL; + } + + code = pthread_mutex_init(&(pRepo->save_mutex), NULL); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(code); + tsdbFreeRepo(pRepo); + return NULL; + } + pRepo->config_changed = false; + pRepo->cacheLastConfigVersion = 0; + + code = tsem_init(&(pRepo->readyToCommit), 0, 1); + if (code != 0) { + code = errno; + terrno = TAOS_SYSTEM_ERROR(code); + tsdbFreeRepo(pRepo); + return NULL; + } + + pRepo->tsdbMeta = tsdbNewMeta(pCfg); + if (pRepo->tsdbMeta == NULL) { + tsdbError("vgId:%d failed to create meta since %s", REPO_ID(pRepo), tstrerror(terrno)); + tsdbFreeRepo(pRepo); + return NULL; + } + + pRepo->pPool = tsdbNewBufPool(pCfg); + if (pRepo->pPool == NULL) { + tsdbError("vgId:%d failed to create buffer pool since %s", REPO_ID(pRepo), tstrerror(terrno)); + tsdbFreeRepo(pRepo); + return NULL; + } + + pRepo->fs = tsdbNewFS(pCfg); + if (pRepo->fs == NULL) { + tsdbError("vgId:%d failed to TSDB file system since %s", REPO_ID(pRepo), tstrerror(terrno)); + tsdbFreeRepo(pRepo); + return NULL; + } + + return pRepo; +} + +static void tsdbFreeRepo(STsdbRepo *pRepo) { + if (pRepo) { + tsdbFreeFS(pRepo->fs); + tsdbFreeBufPool(pRepo->pPool); + tsdbFreeMeta(pRepo->tsdbMeta); + tsdbFreeMergeBuf(pRepo->mergeBuf); + // tsdbFreeMemTable(pRepo->mem); + // tsdbFreeMemTable(pRepo->imem); + tsem_destroy(&(pRepo->readyToCommit)); + pthread_mutex_destroy(&pRepo->mutex); + free(pRepo); + } +} + +static void tsdbStartStream(STsdbRepo *pRepo) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + + for (int i = 0; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable && pTable->type == TSDB_STREAM_TABLE) { + pTable->cqhandle = (*pRepo->appH.cqCreateFunc)(pRepo->appH.cqH, TABLE_UID(pTable), TABLE_TID(pTable), TABLE_NAME(pTable)->data, pTable->sql, + tsdbGetTableSchemaImpl(pTable, false, false, -1, -1), 0); + } + } +} + +static void tsdbStopStream(STsdbRepo *pRepo) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + + for (int i = 0; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable && pTable->type == TSDB_STREAM_TABLE) { + (*pRepo->appH.cqDropFunc)(pTable->cqhandle); + } + } +} + +static int tsdbRestoreLastColumns(STsdbRepo *pRepo, STable *pTable, SReadH* pReadh) { + //tsdbInfo("tsdbRestoreLastColumns of table %s", pTable->name->data); + + STSchema *pSchema = tsdbGetTableLatestSchema(pTable); + if (pSchema == NULL) { + tsdbError("tsdbGetTableLatestSchema of table %s fail", pTable->name->data); + return 0; + } + + SBlock* pBlock; + int numColumns; + int32_t blockIdx; + SDataStatis* pBlockStatis = NULL; + // SMemRow row = NULL; + // restore last column data with last schema + + int err = 0; + + numColumns = schemaNCols(pSchema); + if (numColumns <= pTable->restoreColumnNum) { + pTable->hasRestoreLastColumn = true; + return 0; + } + if (pTable->lastColSVersion != schemaVersion(pSchema)) { + if (tsdbInitColIdCacheWithSchema(pTable, pSchema) < 0) { + return -1; + } + } + + // row = taosTMalloc(memRowMaxBytesFromSchema(pSchema)); + // if (row == NULL) { + // terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + // err = -1; + // goto out; + // } + + // memRowSetType(row, SMEM_ROW_DATA); + // tdInitDataRow(memRowDataBody(row), pSchema); + + // first load block index info + if (tsdbLoadBlockInfo(pReadh, NULL, NULL) < 0) { + err = -1; + goto out; + } + + pBlockStatis = calloc(numColumns, sizeof(SDataStatis)); + if (pBlockStatis == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + err = -1; + goto out; + } + memset(pBlockStatis, 0, numColumns * sizeof(SDataStatis)); + for(int32_t i = 0; i < numColumns; ++i) { + STColumn *pCol = schemaColAt(pSchema, i); + pBlockStatis[i].colId = pCol->colId; + } + + // load block from backward + SBlockIdx *pIdx = pReadh->pBlkIdx; + blockIdx = (int32_t)(pIdx->numOfBlocks - 1); + + while (numColumns > pTable->restoreColumnNum && blockIdx >= 0) { + bool loadStatisData = false; + pBlock = pReadh->pBlkInfo->blocks + blockIdx; + blockIdx -= 1; + + // load block data + if (tsdbLoadBlockData(pReadh, pBlock, NULL) < 0) { + err = -1; + goto out; + } + + // file block with sub-blocks has no statistics data + if (pBlock->numOfSubBlocks <= 1) { + if (tsdbLoadBlockStatis(pReadh, pBlock) == TSDB_STATIS_OK) { + tsdbGetBlockStatis(pReadh, pBlockStatis, (int)numColumns, pBlock); + loadStatisData = true; + } + } + TSDB_WLOCK_TABLE(pTable); // lock when update pTable->lastCols[] + for (int16_t i = 0; i < numColumns && numColumns > pTable->restoreColumnNum; ++i) { + STColumn *pCol = schemaColAt(pSchema, i); + // ignore loaded columns + if (pTable->lastCols[i].bytes != 0) { + continue; + } + + // ignore block which has no not-null colId column + if (loadStatisData && pBlockStatis[i].numOfNull == pBlock->numOfRows) { + continue; + } + + // OK,let's load row from backward to get not-null column + for (int32_t rowId = pBlock->numOfRows - 1; rowId >= 0; rowId--) { + SDataCol *pDataCol = pReadh->pDCols[0]->cols + i; + const void* pColData = tdGetColDataOfRow(pDataCol, rowId); + // tdAppendColVal(memRowDataBody(row), pColData, pCol->type, pCol->offset); + // SDataCol *pDataCol = readh.pDCols[0]->cols + j; + // void *value = tdGetRowDataOfCol(memRowDataBody(row), (int8_t)pCol->type, TD_DATA_ROW_HEAD_SIZE + + // + // pCol->offset); + if (isNull(pColData, pCol->type)) { + continue; + } + + int16_t idx = tsdbGetLastColumnsIndexByColId(pTable, pCol->colId); + if (idx == -1) { + tsdbError("tsdbRestoreLastColumns restore vgId:%d,table:%s cache column %d fail", REPO_ID(pRepo), pTable->name->data, pCol->colId); + continue; + } + // save not-null column + uint16_t bytes = IS_VAR_DATA_TYPE(pCol->type) ? varDataTLen(pColData) : pCol->bytes; + SDataCol *pLastCol = &(pTable->lastCols[idx]); + pLastCol->pData = malloc(bytes); + pLastCol->bytes = bytes; + pLastCol->colId = pCol->colId; + memcpy(pLastCol->pData, pColData, bytes); + + // save row ts(in column 0) + pDataCol = pReadh->pDCols[0]->cols + 0; + // pCol = schemaColAt(pSchema, 0); + // tdAppendColVal(memRowDataBody(row), tdGetColDataOfRow(pDataCol, rowId), pCol->type, pCol->offset); + // pLastCol->ts = memRowKey(row); + pLastCol->ts = tdGetKey(*(TKEY *)(tdGetColDataOfRow(pDataCol, rowId))); + + pTable->restoreColumnNum += 1; + + tsdbDebug("tsdbRestoreLastColumns restore vgId:%d,table:%s cache column %d, %" PRId64, REPO_ID(pRepo), pTable->name->data, pLastCol->colId, pLastCol->ts); + break; + } + } + TSDB_WUNLOCK_TABLE(pTable); + } + +out: + // taosTZfree(row); + tfree(pBlockStatis); + + if (err == 0 && numColumns <= pTable->restoreColumnNum) { + pTable->hasRestoreLastColumn = true; + } + + return err; +} + +static int tsdbRestoreLastRow(STsdbRepo *pRepo, STable *pTable, SReadH* pReadh, SBlockIdx *pIdx) { + ASSERT(pTable->lastRow == NULL); + if (tsdbLoadBlockInfo(pReadh, NULL, NULL) < 0) { + return -1; + } + + SBlock* pBlock = pReadh->pBlkInfo->blocks + pIdx->numOfBlocks - 1; + + if (tsdbLoadBlockData(pReadh, pBlock, NULL) < 0) { + return -1; + } + + // Get the data in row + + STSchema *pSchema = tsdbGetTableSchema(pTable); + SMemRow lastRow = taosTMalloc(memRowMaxBytesFromSchema(pSchema)); + if (lastRow == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + memRowSetType(lastRow, SMEM_ROW_DATA); + tdInitDataRow(memRowDataBody(lastRow), pSchema); + for (int icol = 0; icol < schemaNCols(pSchema); icol++) { + STColumn *pCol = schemaColAt(pSchema, icol); + SDataCol *pDataCol = pReadh->pDCols[0]->cols + icol; + tdAppendColVal(memRowDataBody(lastRow), tdGetColDataOfRow(pDataCol, pBlock->numOfRows - 1), pCol->type, + pCol->offset); + } + + TSKEY lastKey = memRowKey(lastRow); + + // during the load data in file, new data would be inserted and last row has been updated + TSDB_WLOCK_TABLE(pTable); + if (pTable->lastRow == NULL) { + pTable->lastKey = lastKey; + pTable->lastRow = lastRow; + TSDB_WUNLOCK_TABLE(pTable); + } else { + TSDB_WUNLOCK_TABLE(pTable); + taosTZfree(lastRow); + } + + return 0; +} + +int tsdbRestoreInfo(STsdbRepo *pRepo) { + SFSIter fsiter; + SReadH readh; + SDFileSet *pSet; + STsdbMeta *pMeta = pRepo->tsdbMeta; + STsdbCfg * pCfg = REPO_CFG(pRepo); + + if (tsdbInitReadH(&readh, pRepo) < 0) { + return -1; + } + + tsdbFSIterInit(&fsiter, REPO_FS(pRepo), TSDB_FS_ITER_BACKWARD); + + if (CACHE_LAST_NULL_COLUMN(pCfg)) { + for (int i = 1; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable == NULL) continue; + pTable->restoreColumnNum = 0; + pTable->hasRestoreLastColumn = false; + } + } + + while ((pSet = tsdbFSIterNext(&fsiter)) != NULL) { + if (tsdbSetAndOpenReadFSet(&readh, pSet) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + if (tsdbLoadBlockIdx(&readh) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + for (int i = 1; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable == NULL) continue; + + //tsdbInfo("tsdbRestoreInfo restore vgId:%d,table:%s", REPO_ID(pRepo), pTable->name->data); + + if (tsdbSetReadTable(&readh, pTable) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + TSKEY lastKey = tsdbGetTableLastKeyImpl(pTable); + SBlockIdx *pIdx = readh.pBlkIdx; + if (pIdx && lastKey < pIdx->maxKey) { + pTable->lastKey = pIdx->maxKey; + + if (CACHE_LAST_ROW(pCfg) && tsdbRestoreLastRow(pRepo, pTable, &readh, pIdx) != 0) { + tsdbDestroyReadH(&readh); + return -1; + } + } + + // restore NULL columns + if (pIdx && CACHE_LAST_NULL_COLUMN(pCfg) && !pTable->hasRestoreLastColumn) { + if (tsdbRestoreLastColumns(pRepo, pTable, &readh) != 0) { + tsdbDestroyReadH(&readh); + return -1; + } + } + } + } + + tsdbDestroyReadH(&readh); + + // if (CACHE_LAST_NULL_COLUMN(pCfg)) { + // atomic_store_8(&pRepo->hasCachedLastColumn, 1); + // } + + return 0; +} + +int32_t tsdbLoadLastCache(STsdbRepo *pRepo, STable *pTable) { + SFSIter fsiter; + SReadH readh; + SDFileSet *pSet; + int cacheLastRowTableNum = 0; + int cacheLastColTableNum = 0; + + bool cacheLastRow = CACHE_LAST_ROW(&(pRepo->config)); + bool cacheLastCol = CACHE_LAST_NULL_COLUMN(&(pRepo->config)); + + tsdbDebug("tsdbLoadLastCache for %s, cacheLastRow:%d, cacheLastCol:%d", pTable->name->data, cacheLastRow, cacheLastCol); + + pTable->cacheLastConfigVersion = pRepo->cacheLastConfigVersion; + + if (!cacheLastRow && pTable->lastRow != NULL) { + taosTZfree(pTable->lastRow); + pTable->lastRow = NULL; + } + if (!cacheLastCol && pTable->lastCols != NULL) { + tsdbFreeLastColumns(pTable); + } + + if (!cacheLastRow && !cacheLastCol) { + return 0; + } + + cacheLastRowTableNum = (cacheLastRow && pTable->lastRow == NULL) ? 1 : 0; + cacheLastColTableNum = (cacheLastCol && pTable->lastCols == NULL) ? 1 : 0; + + if (cacheLastRowTableNum == 0 && cacheLastColTableNum == 0) { + return 0; + } + + if (tsdbInitReadH(&readh, pRepo) < 0) { + return -1; + } + + tsdbRLockFS(REPO_FS(pRepo)); + tsdbFSIterInit(&fsiter, REPO_FS(pRepo), TSDB_FS_ITER_BACKWARD); + + while ((cacheLastRowTableNum > 0 || cacheLastColTableNum > 0) && (pSet = tsdbFSIterNext(&fsiter)) != NULL) { + if (tsdbSetAndOpenReadFSet(&readh, pSet) < 0) { + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + return -1; + } + + if (tsdbLoadBlockIdx(&readh) < 0) { + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + return -1; + } + + // tsdbDebug("tsdbRestoreInfo restore vgId:%d,table:%s", REPO_ID(pRepo), pTable->name->data); + + if (tsdbSetReadTable(&readh, pTable) < 0) { + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + return -1; + } + + SBlockIdx *pIdx = readh.pBlkIdx; + + if (pIdx && (cacheLastRowTableNum > 0) && (pTable->lastRow == NULL)) { + if (tsdbRestoreLastRow(pRepo, pTable, &readh, pIdx) != 0) { + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + return -1; + } + cacheLastRowTableNum -= 1; + } + + // restore NULL columns + if (pIdx && (cacheLastColTableNum > 0) && !pTable->hasRestoreLastColumn) { + if (tsdbRestoreLastColumns(pRepo, pTable, &readh) != 0) { + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + return -1; + } + if (pTable->hasRestoreLastColumn) { + cacheLastColTableNum -= 1; + } + } + } + + tsdbUnLockFS(REPO_FS(pRepo)); + tsdbDestroyReadH(&readh); + + return 0; +} + +UNUSED_FUNC int tsdbCacheLastData(STsdbRepo *pRepo, STsdbCfg* oldCfg) { + bool cacheLastRow = false, cacheLastCol = false; + SFSIter fsiter; + SReadH readh; + SDFileSet *pSet; + STsdbMeta *pMeta = pRepo->tsdbMeta; + int tableNum = 0; + int maxTableIdx = 0; + int cacheLastRowTableNum = 0; + int cacheLastColTableNum = 0; + + bool need_free_last_row = CACHE_LAST_ROW(oldCfg) && !CACHE_LAST_ROW(&(pRepo->config)); + bool need_free_last_col = CACHE_LAST_NULL_COLUMN(oldCfg) && !CACHE_LAST_NULL_COLUMN(&(pRepo->config)); + + if (CACHE_LAST_ROW(&(pRepo->config)) || CACHE_LAST_NULL_COLUMN(&(pRepo->config))) { + tsdbInfo("tsdbCacheLastData cache last data since cacheLast option changed"); + cacheLastRow = !CACHE_LAST_ROW(oldCfg) && CACHE_LAST_ROW(&(pRepo->config)); + cacheLastCol = !CACHE_LAST_NULL_COLUMN(oldCfg) && CACHE_LAST_NULL_COLUMN(&(pRepo->config)); + } + + // calc max table idx and table num + for (int i = 1; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable == NULL) continue; + tableNum += 1; + maxTableIdx = i; + if (cacheLastCol) { + pTable->restoreColumnNum = 0; + pTable->hasRestoreLastColumn = false; + } + } + + // if close last option,need to free data + if (need_free_last_row || need_free_last_col) { + // if (need_free_last_col) { + // atomic_store_8(&pRepo->hasCachedLastColumn, 0); + // } + tsdbInfo("free cache last data since cacheLast option changed"); + for (int i = 1; i <= maxTableIdx; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable == NULL) continue; + if (need_free_last_row) { + taosTZfree(pTable->lastRow); + pTable->lastRow = NULL; + } + if (need_free_last_col) { + tsdbFreeLastColumns(pTable); + pTable->hasRestoreLastColumn = false; + } + } + } + + if (!cacheLastRow && !cacheLastCol) { + return 0; + } + + cacheLastRowTableNum = cacheLastRow ? tableNum : 0; + cacheLastColTableNum = cacheLastCol ? tableNum : 0; + + if (tsdbInitReadH(&readh, pRepo) < 0) { + return -1; + } + + tsdbFSIterInit(&fsiter, REPO_FS(pRepo), TSDB_FS_ITER_BACKWARD); + + while ((pSet = tsdbFSIterNext(&fsiter)) != NULL && (cacheLastRowTableNum > 0 || cacheLastColTableNum > 0)) { + if (tsdbSetAndOpenReadFSet(&readh, pSet) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + if (tsdbLoadBlockIdx(&readh) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + for (int i = 1; i <= maxTableIdx; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable == NULL) continue; + + //tsdbInfo("tsdbRestoreInfo restore vgId:%d,table:%s", REPO_ID(pRepo), pTable->name->data); + + if (tsdbSetReadTable(&readh, pTable) < 0) { + tsdbDestroyReadH(&readh); + return -1; + } + + SBlockIdx *pIdx = readh.pBlkIdx; + + if (pIdx && cacheLastRowTableNum > 0 && pTable->lastRow == NULL) { + pTable->lastKey = pIdx->maxKey; + + if (tsdbRestoreLastRow(pRepo, pTable, &readh, pIdx) != 0) { + tsdbDestroyReadH(&readh); + return -1; + } + cacheLastRowTableNum -= 1; + } + + // restore NULL columns + if (pIdx && cacheLastColTableNum > 0 && !pTable->hasRestoreLastColumn) { + if (tsdbRestoreLastColumns(pRepo, pTable, &readh) != 0) { + tsdbDestroyReadH(&readh); + return -1; + } + if (pTable->hasRestoreLastColumn) { + cacheLastColTableNum -= 1; + } + } + } + } + + tsdbDestroyReadH(&readh); + + // if (cacheLastCol) { + // atomic_store_8(&pRepo->hasCachedLastColumn, 1); + // } + + return 0; +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbMemTable.c b/source/dnode/vnode/tsdb2/src/tsdbMemTable.c new file mode 100644 index 0000000000..8958df3ced --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbMemTable.c @@ -0,0 +1,1080 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tdataformat.h" +#include "tfunctional.h" +#include "tsdbint.h" +#include "tskiplist.h" +#include "tsdbRowMergeBuf.h" + +#define TSDB_DATA_SKIPLIST_LEVEL 5 +#define TSDB_MAX_INSERT_BATCH 512 + +typedef struct { + int32_t totalLen; + int32_t len; + SMemRow row; +} SSubmitBlkIter; + +typedef struct { + int32_t totalLen; + int32_t len; + void * pMsg; +} SSubmitMsgIter; + +static SMemTable * tsdbNewMemTable(STsdbRepo *pRepo); +static void tsdbFreeMemTable(SMemTable *pMemTable); +static STableData* tsdbNewTableData(STsdbCfg *pCfg, STable *pTable); +static void tsdbFreeTableData(STableData *pTableData); +static char * tsdbGetTsTupleKey(const void *data); +static int tsdbAdjustMemMaxTables(SMemTable *pMemTable, int maxTables); +static int tsdbAppendTableRowToCols(STable *pTable, SDataCols *pCols, STSchema **ppSchema, SMemRow row); +static int tsdbInitSubmitBlkIter(SSubmitBlk *pBlock, SSubmitBlkIter *pIter); +static SMemRow tsdbGetSubmitBlkNext(SSubmitBlkIter *pIter); +static int tsdbScanAndConvertSubmitMsg(STsdbRepo *pRepo, SSubmitMsg *pMsg); +static int tsdbInsertDataToTable(STsdbRepo *pRepo, SSubmitBlk *pBlock, int32_t *affectedrows); +static int tsdbInitSubmitMsgIter(SSubmitMsg *pMsg, SSubmitMsgIter *pIter); +static int tsdbGetSubmitMsgNext(SSubmitMsgIter *pIter, SSubmitBlk **pPBlock); +static int tsdbCheckTableSchema(STsdbRepo *pRepo, SSubmitBlk *pBlock, STable *pTable); +static int tsdbUpdateTableLatestInfo(STsdbRepo *pRepo, STable *pTable, SMemRow row); + +static FORCE_INLINE int tsdbCheckRowRange(STsdbRepo *pRepo, STable *pTable, SMemRow row, TSKEY minKey, TSKEY maxKey, + TSKEY now); + +int32_t tsdbInsertData(STsdbRepo *repo, SSubmitMsg *pMsg, SShellSubmitRspMsg *pRsp) { + STsdbRepo * pRepo = repo; + SSubmitMsgIter msgIter = {0}; + SSubmitBlk * pBlock = NULL; + int32_t affectedrows = 0, numOfRows = 0; + + if (tsdbScanAndConvertSubmitMsg(pRepo, pMsg) < 0) { + if (terrno != TSDB_CODE_TDB_TABLE_RECONFIGURE) { + tsdbError("vgId:%d failed to insert data since %s", REPO_ID(pRepo), tstrerror(terrno)); + } + return -1; + } + + tsdbInitSubmitMsgIter(pMsg, &msgIter); + while (true) { + tsdbGetSubmitMsgNext(&msgIter, &pBlock); + if (pBlock == NULL) break; + if (tsdbInsertDataToTable(pRepo, pBlock, &affectedrows) < 0) { + return -1; + } + numOfRows += pBlock->numOfRows; + } + + if (pRsp != NULL) { + pRsp->affectedRows = htonl(affectedrows); + pRsp->numOfRows = htonl(numOfRows); + } + + if (tsdbCheckCommit(pRepo) < 0) return -1; + return 0; +} + +// ---------------- INTERNAL FUNCTIONS ---------------- +int tsdbRefMemTable(STsdbRepo *pRepo, SMemTable *pMemTable) { + if (pMemTable == NULL) return 0; + int ref = T_REF_INC(pMemTable); + tsdbDebug("vgId:%d ref memtable %p ref %d", REPO_ID(pRepo), pMemTable, ref); + return 0; +} + +// Need to lock the repository +int tsdbUnRefMemTable(STsdbRepo *pRepo, SMemTable *pMemTable) { + if (pMemTable == NULL) return 0; + + int ref = T_REF_DEC(pMemTable); + tsdbDebug("vgId:%d unref memtable %p ref %d", REPO_ID(pRepo), pMemTable, ref); + if (ref == 0) { + STsdbBufPool *pBufPool = pRepo->pPool; + + SListNode *pNode = NULL; + bool addNew = false; + if (tsdbLockRepo(pRepo) < 0) return -1; + while ((pNode = tdListPopHead(pMemTable->bufBlockList)) != NULL) { + if (pBufPool->nRecycleBlocks > 0) { + tsdbRecycleBufferBlock(pBufPool, pNode, false); + pBufPool->nRecycleBlocks -= 1; + } else { + if(pBufPool->nElasticBlocks > 0 && listNEles(pBufPool->bufBlockList) > 2) { + tsdbRecycleBufferBlock(pBufPool, pNode, true); + } else { + tdListAppendNode(pBufPool->bufBlockList, pNode); + addNew = true; + } + } + } + if (addNew) { + int code = pthread_cond_signal(&pBufPool->poolNotEmpty); + if (code != 0) { + if (tsdbUnlockRepo(pRepo) < 0) return -1; + tsdbError("vgId:%d failed to signal pool not empty since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + } + + if (tsdbUnlockRepo(pRepo) < 0) return -1; + + for (int i = 0; i < pMemTable->maxTables; i++) { + if (pMemTable->tData[i] != NULL) { + tsdbFreeTableData(pMemTable->tData[i]); + } + } + + tdListDiscard(pMemTable->actList); + tdListDiscard(pMemTable->bufBlockList); + tsdbFreeMemTable(pMemTable); + } + return 0; +} + +int tsdbTakeMemSnapshot(STsdbRepo *pRepo, SMemSnapshot *pSnapshot, SArray *pATable) { + memset(pSnapshot, 0, sizeof(*pSnapshot)); + + if (tsdbLockRepo(pRepo) < 0) return -1; + + pSnapshot->omem = pRepo->mem; + pSnapshot->imem = pRepo->imem; + tsdbRefMemTable(pRepo, pRepo->mem); + tsdbRefMemTable(pRepo, pRepo->imem); + + if (tsdbUnlockRepo(pRepo) < 0) return -1; + + if (pSnapshot->omem) { + taosRLockLatch(&(pSnapshot->omem->latch)); + + pSnapshot->mem = &(pSnapshot->mtable); + + pSnapshot->mem->tData = (STableData **)calloc(pSnapshot->omem->maxTables, sizeof(STableData *)); + if (pSnapshot->mem->tData == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + taosRUnLockLatch(&(pSnapshot->omem->latch)); + tsdbUnRefMemTable(pRepo, pSnapshot->omem); + tsdbUnRefMemTable(pRepo, pSnapshot->imem); + pSnapshot->mem = NULL; + pSnapshot->imem = NULL; + pSnapshot->omem = NULL; + return -1; + } + + pSnapshot->mem->keyFirst = pSnapshot->omem->keyFirst; + pSnapshot->mem->keyLast = pSnapshot->omem->keyLast; + pSnapshot->mem->numOfRows = pSnapshot->omem->numOfRows; + pSnapshot->mem->maxTables = pSnapshot->omem->maxTables; + + for (size_t i = 0; i < taosArrayGetSize(pATable); i++) { + STable * pTable = *(STable **)taosArrayGet(pATable, i); + int32_t tid = TABLE_TID(pTable); + STableData *pTableData = (tid < pSnapshot->omem->maxTables) ? pSnapshot->omem->tData[tid] : NULL; + + if ((pTableData == NULL) || (TABLE_UID(pTable) != pTableData->uid)) continue; + + pSnapshot->mem->tData[tid] = pTableData; + T_REF_INC(pTableData); + } + + taosRUnLockLatch(&(pSnapshot->omem->latch)); + } + + tsdbDebug("vgId:%d take memory snapshot, pMem %p pIMem %p", REPO_ID(pRepo), pSnapshot->omem, pSnapshot->imem); + return 0; +} + +void tsdbUnTakeMemSnapShot(STsdbRepo *pRepo, SMemSnapshot *pSnapshot) { + tsdbDebug("vgId:%d untake memory snapshot, pMem %p pIMem %p", REPO_ID(pRepo), pSnapshot->omem, pSnapshot->imem); + + if (pSnapshot->mem) { + ASSERT(pSnapshot->omem != NULL); + + for (size_t i = 0; i < pSnapshot->mem->maxTables; i++) { + STableData *pTableData = pSnapshot->mem->tData[i]; + if (pTableData) { + tsdbFreeTableData(pTableData); + } + } + tfree(pSnapshot->mem->tData); + + tsdbUnRefMemTable(pRepo, pSnapshot->omem); + } + + tsdbUnRefMemTable(pRepo, pSnapshot->imem); + + pSnapshot->mem = NULL; + pSnapshot->imem = NULL; + pSnapshot->omem = NULL; +} + +void *tsdbAllocBytes(STsdbRepo *pRepo, int bytes) { + STsdbCfg * pCfg = &pRepo->config; + STsdbBufBlock *pBufBlock = NULL; + void * ptr = NULL; + + // Either allocate from buffer blocks or from SYSTEM memory pool + if (pRepo->mem == NULL) { + SMemTable *pMemTable = tsdbNewMemTable(pRepo); + if (pMemTable == NULL) return NULL; + pRepo->mem = pMemTable; + } + + ASSERT(pRepo->mem != NULL); + + pBufBlock = tsdbGetCurrBufBlock(pRepo); + if ((pRepo->mem->extraBuffList != NULL) || + ((listNEles(pRepo->mem->bufBlockList) >= pCfg->totalBlocks / 3) && (pBufBlock->remain < bytes))) { + // allocate from SYSTEM buffer pool + if (pRepo->mem->extraBuffList == NULL) { + pRepo->mem->extraBuffList = tdListNew(0); + if (pRepo->mem->extraBuffList == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + } + + ASSERT(pRepo->mem->extraBuffList != NULL); + SListNode *pNode = (SListNode *)malloc(sizeof(SListNode) + bytes); + if (pNode == NULL) { + if (listNEles(pRepo->mem->extraBuffList) == 0) { + tdListFree(pRepo->mem->extraBuffList); + pRepo->mem->extraBuffList = NULL; + } + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + pNode->next = pNode->prev = NULL; + tdListAppendNode(pRepo->mem->extraBuffList, pNode); + ptr = (void *)(pNode->data); + tsdbTrace("vgId:%d allocate %d bytes from SYSTEM buffer block", REPO_ID(pRepo), bytes); + } else { // allocate from TSDB buffer pool + if (pBufBlock == NULL || pBufBlock->remain < bytes) { + ASSERT(listNEles(pRepo->mem->bufBlockList) < pCfg->totalBlocks / 3); + if (tsdbLockRepo(pRepo) < 0) return NULL; + SListNode *pNode = tsdbAllocBufBlockFromPool(pRepo); + tdListAppendNode(pRepo->mem->bufBlockList, pNode); + if (tsdbUnlockRepo(pRepo) < 0) return NULL; + pBufBlock = tsdbGetCurrBufBlock(pRepo); + } + + ASSERT(pBufBlock->remain >= bytes); + ptr = POINTER_SHIFT(pBufBlock->data, pBufBlock->offset); + pBufBlock->offset += bytes; + pBufBlock->remain -= bytes; + tsdbTrace("vgId:%d allocate %d bytes from TSDB buffer block, nBlocks %d offset %d remain %d", REPO_ID(pRepo), bytes, + listNEles(pRepo->mem->bufBlockList), pBufBlock->offset, pBufBlock->remain); + } + + return ptr; +} + +int tsdbSyncCommitConfig(STsdbRepo* pRepo) { + ASSERT(pRepo->config_changed == true); + tsem_wait(&(pRepo->readyToCommit)); + + if (pRepo->code != TSDB_CODE_SUCCESS) { + tsdbWarn("vgId:%d try to commit config when TSDB not in good state: %s", REPO_ID(pRepo), tstrerror(terrno)); + } + + if (tsdbLockRepo(pRepo) < 0) return -1; + tsdbScheduleCommit(pRepo, COMMIT_CONFIG_REQ); + if (tsdbUnlockRepo(pRepo) < 0) return -1; + + tsem_wait(&(pRepo->readyToCommit)); + tsem_post(&(pRepo->readyToCommit)); + + if (pRepo->code != TSDB_CODE_SUCCESS) { + terrno = pRepo->code; + return -1; + } + + terrno = TSDB_CODE_SUCCESS; + return 0; +} + +int tsdbAsyncCommit(STsdbRepo *pRepo) { + tsem_wait(&(pRepo->readyToCommit)); + + ASSERT(pRepo->imem == NULL); + if (pRepo->mem == NULL) { + tsem_post(&(pRepo->readyToCommit)); + return 0; + } + + if (pRepo->code != TSDB_CODE_SUCCESS) { + tsdbWarn("vgId:%d try to commit when TSDB not in good state: %s", REPO_ID(pRepo), tstrerror(terrno)); + } + + if (pRepo->appH.notifyStatus) pRepo->appH.notifyStatus(pRepo->appH.appH, TSDB_STATUS_COMMIT_START, TSDB_CODE_SUCCESS); + if (tsdbLockRepo(pRepo) < 0) return -1; + pRepo->imem = pRepo->mem; + pRepo->mem = NULL; + tsdbScheduleCommit(pRepo, COMMIT_REQ); + if (tsdbUnlockRepo(pRepo) < 0) return -1; + + return 0; +} + +int tsdbSyncCommit(STsdbRepo *repo) { + STsdbRepo *pRepo = repo; + + tsdbAsyncCommit(pRepo); + tsem_wait(&(pRepo->readyToCommit)); + tsem_post(&(pRepo->readyToCommit)); + + if (pRepo->code != TSDB_CODE_SUCCESS) { + terrno = pRepo->code; + return -1; + } else { + terrno = TSDB_CODE_SUCCESS; + return 0; + } +} + +/** + * This is an important function to load data or try to load data from memory skiplist iterator. + * + * This function load memory data until: + * 1. iterator ends + * 2. data key exceeds maxKey + * 3. rowsIncreased = rowsInserted - rowsDeleteSucceed >= maxRowsToRead + * 4. operations in pCols not exceeds its max capacity if pCols is given + * + * The function tries to procceed AS MUCH AS POSSIBLE. + */ +int tsdbLoadDataFromCache(STable *pTable, SSkipListIterator *pIter, TSKEY maxKey, int maxRowsToRead, SDataCols *pCols, + TKEY *filterKeys, int nFilterKeys, bool keepDup, SMergeInfo *pMergeInfo) { + ASSERT(maxRowsToRead > 0 && nFilterKeys >= 0); + if (pIter == NULL) return 0; + STSchema * pSchema = NULL; + TSKEY rowKey = 0; + TSKEY fKey = 0; + bool isRowDel = false; + int filterIter = 0; + SMemRow row = NULL; + SMergeInfo mInfo; + + if (pMergeInfo == NULL) pMergeInfo = &mInfo; + + memset(pMergeInfo, 0, sizeof(*pMergeInfo)); + pMergeInfo->keyFirst = INT64_MAX; + pMergeInfo->keyLast = INT64_MIN; + if (pCols) tdResetDataCols(pCols); + + row = tsdbNextIterRow(pIter); + if (row == NULL || memRowKey(row) > maxKey) { + rowKey = INT64_MAX; + isRowDel = false; + } else { + rowKey = memRowKey(row); + isRowDel = memRowDeleted(row); + } + + if (filterIter >= nFilterKeys) { + fKey = INT64_MAX; + } else { + fKey = tdGetKey(filterKeys[filterIter]); + } + + while (true) { + if (fKey == INT64_MAX && rowKey == INT64_MAX) break; + + if (fKey < rowKey) { + pMergeInfo->keyFirst = MIN(pMergeInfo->keyFirst, fKey); + pMergeInfo->keyLast = MAX(pMergeInfo->keyLast, fKey); + + filterIter++; + if (filterIter >= nFilterKeys) { + fKey = INT64_MAX; + } else { + fKey = tdGetKey(filterKeys[filterIter]); + } + } else if (fKey > rowKey) { + if (isRowDel) { + pMergeInfo->rowsDeleteFailed++; + } else { + if (pMergeInfo->rowsInserted - pMergeInfo->rowsDeleteSucceed >= maxRowsToRead) break; + if (pCols && pMergeInfo->nOperations >= pCols->maxPoints) break; + pMergeInfo->rowsInserted++; + pMergeInfo->nOperations++; + pMergeInfo->keyFirst = MIN(pMergeInfo->keyFirst, rowKey); + pMergeInfo->keyLast = MAX(pMergeInfo->keyLast, rowKey); + tsdbAppendTableRowToCols(pTable, pCols, &pSchema, row); + } + + tSkipListIterNext(pIter); + row = tsdbNextIterRow(pIter); + if (row == NULL || memRowKey(row) > maxKey) { + rowKey = INT64_MAX; + isRowDel = false; + } else { + rowKey = memRowKey(row); + isRowDel = memRowDeleted(row); + } + } else { + if (isRowDel) { + ASSERT(!keepDup); + if (pCols && pMergeInfo->nOperations >= pCols->maxPoints) break; + pMergeInfo->rowsDeleteSucceed++; + pMergeInfo->nOperations++; + tsdbAppendTableRowToCols(pTable, pCols, &pSchema, row); + } else { + if (keepDup) { + if (pCols && pMergeInfo->nOperations >= pCols->maxPoints) break; + pMergeInfo->rowsUpdated++; + pMergeInfo->nOperations++; + pMergeInfo->keyFirst = MIN(pMergeInfo->keyFirst, rowKey); + pMergeInfo->keyLast = MAX(pMergeInfo->keyLast, rowKey); + tsdbAppendTableRowToCols(pTable, pCols, &pSchema, row); + } else { + pMergeInfo->keyFirst = MIN(pMergeInfo->keyFirst, fKey); + pMergeInfo->keyLast = MAX(pMergeInfo->keyLast, fKey); + } + } + + tSkipListIterNext(pIter); + row = tsdbNextIterRow(pIter); + if (row == NULL || memRowKey(row) > maxKey) { + rowKey = INT64_MAX; + isRowDel = false; + } else { + rowKey = memRowKey(row); + isRowDel = memRowDeleted(row); + } + + filterIter++; + if (filterIter >= nFilterKeys) { + fKey = INT64_MAX; + } else { + fKey = tdGetKey(filterKeys[filterIter]); + } + } + } + + return 0; +} + +// ---------------- LOCAL FUNCTIONS ---------------- +static SMemTable* tsdbNewMemTable(STsdbRepo *pRepo) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + + SMemTable *pMemTable = (SMemTable *)calloc(1, sizeof(*pMemTable)); + if (pMemTable == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pMemTable->keyFirst = INT64_MAX; + pMemTable->keyLast = 0; + pMemTable->numOfRows = 0; + + pMemTable->maxTables = pMeta->maxTables; + pMemTable->tData = (STableData **)calloc(pMemTable->maxTables, sizeof(STableData *)); + if (pMemTable->tData == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pMemTable->actList = tdListNew(0); + if (pMemTable->actList == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pMemTable->bufBlockList = tdListNew(sizeof(STsdbBufBlock*)); + if (pMemTable->bufBlockList == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + T_REF_INC(pMemTable); + + return pMemTable; + +_err: + tsdbFreeMemTable(pMemTable); + return NULL; +} + +static void tsdbFreeMemTable(SMemTable* pMemTable) { + if (pMemTable) { + ASSERT((pMemTable->bufBlockList == NULL) ? true : (listNEles(pMemTable->bufBlockList) == 0)); + ASSERT((pMemTable->actList == NULL) ? true : (listNEles(pMemTable->actList) == 0)); + + tdListFree(pMemTable->extraBuffList); + tdListFree(pMemTable->bufBlockList); + tdListFree(pMemTable->actList); + tfree(pMemTable->tData); + free(pMemTable); + } +} + +static STableData *tsdbNewTableData(STsdbCfg *pCfg, STable *pTable) { + STableData *pTableData = (STableData *)calloc(1, sizeof(*pTableData)); + if (pTableData == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + pTableData->uid = TABLE_UID(pTable); + pTableData->keyFirst = INT64_MAX; + pTableData->keyLast = 0; + pTableData->numOfRows = 0; + + uint8_t skipListCreateFlags; + if(pCfg->update == TD_ROW_DISCARD_UPDATE) + skipListCreateFlags = SL_DISCARD_DUP_KEY; + else + skipListCreateFlags = SL_UPDATE_DUP_KEY; + + pTableData->pData = + tSkipListCreate(TSDB_DATA_SKIPLIST_LEVEL, TSDB_DATA_TYPE_TIMESTAMP, TYPE_BYTES[TSDB_DATA_TYPE_TIMESTAMP], + tkeyComparFn, skipListCreateFlags, tsdbGetTsTupleKey); + if (pTableData->pData == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + free(pTableData); + return NULL; + } + + T_REF_INC(pTableData); + + return pTableData; +} + +static void tsdbFreeTableData(STableData *pTableData) { + if (pTableData) { + int32_t ref = T_REF_DEC(pTableData); + if (ref == 0) { + tSkipListDestroy(pTableData->pData); + free(pTableData); + } + } +} + +static char *tsdbGetTsTupleKey(const void *data) { return memRowKeys((SMemRow)data); } + +static int tsdbAdjustMemMaxTables(SMemTable *pMemTable, int maxTables) { + ASSERT(pMemTable->maxTables < maxTables); + + STableData **pTableData = (STableData **)calloc(maxTables, sizeof(STableData *)); + if (pTableData == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + memcpy((void *)pTableData, (void *)pMemTable->tData, sizeof(STableData *) * pMemTable->maxTables); + + STableData **tData = pMemTable->tData; + + taosWLockLatch(&(pMemTable->latch)); + pMemTable->maxTables = maxTables; + pMemTable->tData = pTableData; + taosWUnLockLatch(&(pMemTable->latch)); + + tfree(tData); + + return 0; +} + +static int tsdbAppendTableRowToCols(STable *pTable, SDataCols *pCols, STSchema **ppSchema, SMemRow row) { + if (pCols) { + if (*ppSchema == NULL || schemaVersion(*ppSchema) != memRowVersion(row)) { + *ppSchema = tsdbGetTableSchemaImpl(pTable, false, false, memRowVersion(row), (int8_t)memRowType(row)); + if (*ppSchema == NULL) { + ASSERT(false); + return -1; + } + } + + tdAppendMemRowToDataCol(row, *ppSchema, pCols, true, 0); + } + + return 0; +} + +static int tsdbInitSubmitBlkIter(SSubmitBlk *pBlock, SSubmitBlkIter *pIter) { + if (pBlock->dataLen <= 0) return -1; + pIter->totalLen = pBlock->dataLen; + pIter->len = 0; + pIter->row = (SMemRow)(pBlock->data + pBlock->schemaLen); + return 0; +} + +static SMemRow tsdbGetSubmitBlkNext(SSubmitBlkIter *pIter) { + SMemRow row = pIter->row; // firstly, get current row + if (row == NULL) return NULL; + + pIter->len += memRowTLen(row); + if (pIter->len >= pIter->totalLen) { // reach the end + pIter->row = NULL; + } else { + pIter->row = (char *)row + memRowTLen(row); // secondly, move to next row + } + + return row; +} + +static FORCE_INLINE int tsdbCheckRowRange(STsdbRepo *pRepo, STable *pTable, SMemRow row, TSKEY minKey, TSKEY maxKey, + TSKEY now) { + TSKEY rowKey = memRowKey(row); + if (rowKey < minKey || rowKey > maxKey) { + tsdbError("vgId:%d table %s tid %d uid %" PRIu64 " timestamp is out of range! now %" PRId64 " minKey %" PRId64 + " maxKey %" PRId64 " row key %" PRId64, + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable), now, minKey, maxKey, + rowKey); + terrno = TSDB_CODE_TDB_TIMESTAMP_OUT_OF_RANGE; + return -1; + } + + return 0; +} + +static int tsdbScanAndConvertSubmitMsg(STsdbRepo *pRepo, SSubmitMsg *pMsg) { + ASSERT(pMsg != NULL); + STsdbMeta * pMeta = pRepo->tsdbMeta; + SSubmitMsgIter msgIter = {0}; + SSubmitBlk * pBlock = NULL; + SSubmitBlkIter blkIter = {0}; + SMemRow row = NULL; + TSKEY now = taosGetTimestamp(pRepo->config.precision); + TSKEY minKey = now - tsTickPerDay[pRepo->config.precision] * pRepo->config.keep; + TSKEY maxKey = now + tsTickPerDay[pRepo->config.precision] * pRepo->config.daysPerFile; + + terrno = TSDB_CODE_SUCCESS; + pMsg->length = htonl(pMsg->length); + pMsg->numOfBlocks = htonl(pMsg->numOfBlocks); + + if (tsdbInitSubmitMsgIter(pMsg, &msgIter) < 0) return -1; + while (true) { + if (tsdbGetSubmitMsgNext(&msgIter, &pBlock) < 0) return -1; + if (pBlock == NULL) break; + + pBlock->uid = htobe64(pBlock->uid); + pBlock->tid = htonl(pBlock->tid); + pBlock->sversion = htonl(pBlock->sversion); + pBlock->dataLen = htonl(pBlock->dataLen); + pBlock->schemaLen = htonl(pBlock->schemaLen); + pBlock->numOfRows = htons(pBlock->numOfRows); + + if (pBlock->tid <= 0 || pBlock->tid >= pMeta->maxTables) { + tsdbError("vgId:%d failed to get table to insert data, uid %" PRIu64 " tid %d", REPO_ID(pRepo), pBlock->uid, + pBlock->tid); + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + return -1; + } + + STable *pTable = pMeta->tables[pBlock->tid]; + if (pTable == NULL || TABLE_UID(pTable) != pBlock->uid) { + tsdbError("vgId:%d failed to get table to insert data, uid %" PRIu64 " tid %d", REPO_ID(pRepo), pBlock->uid, + pBlock->tid); + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + return -1; + } + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + tsdbError("vgId:%d invalid action trying to insert a super table %s", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable)); + terrno = TSDB_CODE_TDB_INVALID_ACTION; + return -1; + } + + // Check schema version and update schema if needed + if (tsdbCheckTableSchema(pRepo, pBlock, pTable) < 0) { + if (terrno == TSDB_CODE_TDB_TABLE_RECONFIGURE) { + continue; + } else { + return -1; + } + } + + tsdbInitSubmitBlkIter(pBlock, &blkIter); + while ((row = tsdbGetSubmitBlkNext(&blkIter)) != NULL) { + if (tsdbCheckRowRange(pRepo, pTable, row, minKey, maxKey, now) < 0) { + return -1; + } + } + } + + if (terrno != TSDB_CODE_SUCCESS) return -1; + return 0; +} + +//row1 has higher priority +static SMemRow tsdbInsertDupKeyMerge(SMemRow row1, SMemRow row2, STsdbRepo* pRepo, + STSchema **ppSchema1, STSchema **ppSchema2, + STable* pTable, int32_t* pPoints, SMemRow* pLastRow) { + + //for compatiblity, duplicate key inserted when update=0 should be also calculated as affected rows! + if(row1 == NULL && row2 == NULL && pRepo->config.update == TD_ROW_DISCARD_UPDATE) { + (*pPoints)++; + return NULL; + } + + tsdbTrace("vgId:%d a row is %s table %s tid %d uid %" PRIu64 " key %" PRIu64, REPO_ID(pRepo), + "updated in", TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable), + memRowKey(row1)); + + if(row2 == NULL || pRepo->config.update != TD_ROW_PARTIAL_UPDATE) { + void* pMem = tsdbAllocBytes(pRepo, memRowTLen(row1)); + if(pMem == NULL) return NULL; + memRowCpy(pMem, row1); + (*pPoints)++; + *pLastRow = pMem; + return pMem; + } + + STSchema *pSchema1 = *ppSchema1; + STSchema *pSchema2 = *ppSchema2; + SMergeBuf * pBuf = &pRepo->mergeBuf; + int dv1 = memRowVersion(row1); + int dv2 = memRowVersion(row2); + if(pSchema1 == NULL || schemaVersion(pSchema1) != dv1) { + if(pSchema2 != NULL && schemaVersion(pSchema2) == dv1) { + *ppSchema1 = pSchema2; + } else { + *ppSchema1 = tsdbGetTableSchemaImpl(pTable, false, false, memRowVersion(row1), (int8_t)memRowType(row1)); + } + pSchema1 = *ppSchema1; + } + + if(pSchema2 == NULL || schemaVersion(pSchema2) != dv2) { + if(schemaVersion(pSchema1) == dv2) { + pSchema2 = pSchema1; + } else { + *ppSchema2 = tsdbGetTableSchemaImpl(pTable, false, false, memRowVersion(row2), (int8_t)memRowType(row2)); + pSchema2 = *ppSchema2; + } + } + + SMemRow tmp = tsdbMergeTwoRows(pBuf, row1, row2, pSchema1, pSchema2); + + void* pMem = tsdbAllocBytes(pRepo, memRowTLen(tmp)); + if(pMem == NULL) return NULL; + memRowCpy(pMem, tmp); + + (*pPoints)++; + *pLastRow = pMem; + return pMem; +} + +static void* tsdbInsertDupKeyMergePacked(void** args) { + return tsdbInsertDupKeyMerge(args[0], args[1], args[2], (STSchema**)&args[3], (STSchema**)&args[4], args[5], args[6], args[7]); +} + +static void tsdbSetupSkipListHookFns(SSkipList* pSkipList, STsdbRepo *pRepo, STable *pTable, int32_t* pPoints, SMemRow* pLastRow) { + + if(pSkipList->insertHandleFn == NULL) { + tGenericSavedFunc *dupHandleSavedFunc = genericSavedFuncInit((GenericVaFunc)&tsdbInsertDupKeyMergePacked, 9); + dupHandleSavedFunc->args[2] = pRepo; + dupHandleSavedFunc->args[3] = NULL; + dupHandleSavedFunc->args[4] = NULL; + dupHandleSavedFunc->args[5] = pTable; + pSkipList->insertHandleFn = dupHandleSavedFunc; + } + pSkipList->insertHandleFn->args[6] = pPoints; + pSkipList->insertHandleFn->args[7] = pLastRow; +} + +static int tsdbInsertDataToTable(STsdbRepo* pRepo, SSubmitBlk* pBlock, int32_t *pAffectedRows) { + + STsdbMeta *pMeta = pRepo->tsdbMeta; + int32_t points = 0; + STable *pTable = NULL; + SSubmitBlkIter blkIter = {0}; + SMemTable *pMemTable = NULL; + STableData *pTableData = NULL; + STsdbCfg *pCfg = &(pRepo->config); + + tsdbInitSubmitBlkIter(pBlock, &blkIter); + if(blkIter.row == NULL) return 0; + TSKEY firstRowKey = memRowKey(blkIter.row); + + tsdbAllocBytes(pRepo, 0); + pMemTable = pRepo->mem; + + ASSERT(pMemTable != NULL); + ASSERT(pBlock->tid < pMeta->maxTables); + + pTable = pMeta->tables[pBlock->tid]; + + ASSERT(pTable != NULL && TABLE_UID(pTable) == pBlock->uid); + + + if (TABLE_TID(pTable) >= pMemTable->maxTables) { + if (tsdbAdjustMemMaxTables(pMemTable, pMeta->maxTables) < 0) { + return -1; + } + } + pTableData = pMemTable->tData[TABLE_TID(pTable)]; + + if (pTableData == NULL || pTableData->uid != TABLE_UID(pTable)) { + if (pTableData != NULL) { + taosWLockLatch(&(pMemTable->latch)); + pMemTable->tData[TABLE_TID(pTable)] = NULL; + tsdbFreeTableData(pTableData); + taosWUnLockLatch(&(pMemTable->latch)); + } + + pTableData = tsdbNewTableData(pCfg, pTable); + if (pTableData == NULL) { + tsdbError("vgId:%d failed to insert data to table %s uid %" PRId64 " tid %d since %s", REPO_ID(pRepo), + TABLE_CHAR_NAME(pTable), TABLE_UID(pTable), TABLE_TID(pTable), tstrerror(terrno)); + return -1; + } + + pRepo->mem->tData[TABLE_TID(pTable)] = pTableData; + } + + ASSERT((pTableData != NULL) && pTableData->uid == TABLE_UID(pTable)); + + SMemRow lastRow = NULL; + int64_t osize = SL_SIZE(pTableData->pData); + tsdbSetupSkipListHookFns(pTableData->pData, pRepo, pTable, &points, &lastRow); + tSkipListPutBatchByIter(pTableData->pData, &blkIter, (iter_next_fn_t)tsdbGetSubmitBlkNext); + int64_t dsize = SL_SIZE(pTableData->pData) - osize; + (*pAffectedRows) += points; + + if(lastRow != NULL) { + TSKEY lastRowKey = memRowKey(lastRow); + if (pMemTable->keyFirst > firstRowKey) pMemTable->keyFirst = firstRowKey; + pMemTable->numOfRows += dsize; + + if (pTableData->keyFirst > firstRowKey) pTableData->keyFirst = firstRowKey; + pTableData->numOfRows += dsize; + if (pMemTable->keyLast < lastRowKey) pMemTable->keyLast = lastRowKey; + if (pTableData->keyLast < lastRowKey) pTableData->keyLast = lastRowKey; + if (tsdbUpdateTableLatestInfo(pRepo, pTable, lastRow) < 0) { + return -1; + } + } + + STSchema *pSchema = tsdbGetTableSchemaByVersion(pTable, pBlock->sversion, -1); + pRepo->stat.pointsWritten += points * schemaNCols(pSchema); + pRepo->stat.totalStorage += points * schemaVLen(pSchema); + + return 0; +} + + +static int tsdbInitSubmitMsgIter(SSubmitMsg *pMsg, SSubmitMsgIter *pIter) { + if (pMsg == NULL) { + terrno = TSDB_CODE_TDB_SUBMIT_MSG_MSSED_UP; + return -1; + } + + pIter->totalLen = pMsg->length; + pIter->len = 0; + pIter->pMsg = pMsg; + if (pMsg->length <= TSDB_SUBMIT_MSG_HEAD_SIZE) { + terrno = TSDB_CODE_TDB_SUBMIT_MSG_MSSED_UP; + return -1; + } + + return 0; +} + +static int tsdbGetSubmitMsgNext(SSubmitMsgIter *pIter, SSubmitBlk **pPBlock) { + if (pIter->len == 0) { + pIter->len += TSDB_SUBMIT_MSG_HEAD_SIZE; + } else { + SSubmitBlk *pSubmitBlk = (SSubmitBlk *)POINTER_SHIFT(pIter->pMsg, pIter->len); + pIter->len += (sizeof(SSubmitBlk) + pSubmitBlk->dataLen + pSubmitBlk->schemaLen); + } + + if (pIter->len > pIter->totalLen) { + terrno = TSDB_CODE_TDB_SUBMIT_MSG_MSSED_UP; + *pPBlock = NULL; + return -1; + } + + *pPBlock = (pIter->len == pIter->totalLen) ? NULL : (SSubmitBlk *)POINTER_SHIFT(pIter->pMsg, pIter->len); + + return 0; +} + +static int tsdbCheckTableSchema(STsdbRepo *pRepo, SSubmitBlk *pBlock, STable *pTable) { + ASSERT(pTable != NULL); + + STSchema *pSchema = tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); + int sversion = schemaVersion(pSchema); + + if (pBlock->sversion == sversion) { + return 0; + } else { + if (TABLE_TYPE(pTable) == TSDB_STREAM_TABLE) { // stream table is not allowed to change schema + terrno = TSDB_CODE_TDB_IVD_TB_SCHEMA_VERSION; + return -1; + } + } + + if (pBlock->sversion > sversion) { // may need to update table schema + if (pBlock->schemaLen > 0) { + tsdbDebug( + "vgId:%d table %s tid %d uid %" PRIu64 " schema version %d is out of data, client version %d, update...", + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable), sversion, pBlock->sversion); + ASSERT(pBlock->schemaLen % sizeof(STColumn) == 0); + int numOfCols = pBlock->schemaLen / sizeof(STColumn); + STColumn *pTCol = (STColumn *)pBlock->data; + + STSchemaBuilder schemaBuilder = {0}; + if (tdInitTSchemaBuilder(&schemaBuilder, pBlock->sversion) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("vgId:%d failed to update schema of table %s since %s", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + tstrerror(terrno)); + return -1; + } + + for (int i = 0; i < numOfCols; i++) { + if (tdAddColToSchema(&schemaBuilder, pTCol[i].type, htons(pTCol[i].colId), htons(pTCol[i].bytes)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("vgId:%d failed to update schema of table %s since %s", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + tstrerror(terrno)); + tdDestroyTSchemaBuilder(&schemaBuilder); + return -1; + } + } + + STSchema *pNSchema = tdGetSchemaFromBuilder(&schemaBuilder); + if (pNSchema == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tdDestroyTSchemaBuilder(&schemaBuilder); + return -1; + } + + tdDestroyTSchemaBuilder(&schemaBuilder); + tsdbUpdateTableSchema(pRepo, pTable, pNSchema, true); + } else { + tsdbDebug( + "vgId:%d table %s tid %d uid %" PRIu64 " schema version %d is out of data, client version %d, reconfigure...", + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable), sversion, pBlock->sversion); + terrno = TSDB_CODE_TDB_TABLE_RECONFIGURE; + return -1; + } + } else { + ASSERT(pBlock->sversion >= 0); + if (tsdbGetTableSchemaImpl(pTable, false, false, pBlock->sversion, -1) == NULL) { + tsdbError("vgId:%d invalid submit schema version %d to table %s tid %d from client", REPO_ID(pRepo), + pBlock->sversion, TABLE_CHAR_NAME(pTable), TABLE_TID(pTable)); + terrno = TSDB_CODE_TDB_IVD_TB_SCHEMA_VERSION; + return -1; + } + } + + return 0; +} + + +static void updateTableLatestColumn(STsdbRepo *pRepo, STable *pTable, SMemRow row) { + tsdbDebug("vgId:%d updateTableLatestColumn, %s row version:%d", REPO_ID(pRepo), pTable->name->data, + memRowVersion(row)); + + STSchema* pSchema = tsdbGetTableLatestSchema(pTable); + if (tsdbUpdateLastColSchema(pTable, pSchema) < 0) { + return; + } + + pSchema = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row), (int8_t)memRowType(row)); + if (pSchema == NULL) { + return; + } + + SDataCol *pLatestCols = pTable->lastCols; + int32_t kvIdx = 0; + + for (int16_t j = 0; j < schemaNCols(pSchema); j++) { + STColumn *pTCol = schemaColAt(pSchema, j); + // ignore not exist colId + int16_t idx = tsdbGetLastColumnsIndexByColId(pTable, pTCol->colId); + if (idx == -1) { + continue; + } + + void *value = NULL; + + value = tdGetMemRowDataOfColEx(row, pTCol->colId, (int8_t)pTCol->type, + TD_DATA_ROW_HEAD_SIZE + pSchema->columns[j].offset, &kvIdx); + + if ((value == NULL) || isNull(value, pTCol->type)) { + continue; + } + // lock + TSDB_WLOCK_TABLE(pTable); + SDataCol *pDataCol = &(pLatestCols[idx]); + if (pDataCol->pData == NULL) { + pDataCol->pData = malloc(pTCol->bytes); + pDataCol->bytes = pTCol->bytes; + } else if (pDataCol->bytes < pTCol->bytes) { + pDataCol->pData = realloc(pDataCol->pData, pTCol->bytes); + pDataCol->bytes = pTCol->bytes; + } + // the actual value size + uint16_t bytes = IS_VAR_DATA_TYPE(pTCol->type) ? varDataTLen(value) : pTCol->bytes; + // the actual data size CANNOT larger than column size + assert(pTCol->bytes >= bytes); + memcpy(pDataCol->pData, value, bytes); + //tsdbInfo("updateTableLatestColumn vgId:%d cache column %d for %d,%s", REPO_ID(pRepo), j, pDataCol->bytes, (char*)pDataCol->pData); + pDataCol->ts = memRowKey(row); + // unlock + TSDB_WUNLOCK_TABLE(pTable); + } +} + +static int tsdbUpdateTableLatestInfo(STsdbRepo *pRepo, STable *pTable, SMemRow row) { + STsdbCfg *pCfg = &pRepo->config; + + // if cacheLastRow config has been reset, free the lastRow + if (!pCfg->cacheLastRow && pTable->lastRow != NULL) { + SMemRow cachedLastRow = pTable->lastRow; + TSDB_WLOCK_TABLE(pTable); + pTable->lastRow = NULL; + TSDB_WUNLOCK_TABLE(pTable); + taosTZfree(cachedLastRow); + } + + if (tsdbGetTableLastKeyImpl(pTable) <= memRowKey(row)) { + if (CACHE_LAST_ROW(pCfg) || pTable->lastRow != NULL) { + SMemRow nrow = pTable->lastRow; + if (taosTSizeof(nrow) < memRowTLen(row)) { + SMemRow orow = nrow; + nrow = taosTMalloc(memRowTLen(row)); + if (nrow == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + memRowCpy(nrow, row); + TSDB_WLOCK_TABLE(pTable); + pTable->lastKey = memRowKey(row); + pTable->lastRow = nrow; + TSDB_WUNLOCK_TABLE(pTable); + taosTZfree(orow); + } else { + TSDB_WLOCK_TABLE(pTable); + pTable->lastKey = memRowKey(row); + memRowCpy(nrow, row); + TSDB_WUNLOCK_TABLE(pTable); + } + } else { + pTable->lastKey = memRowKey(row); + } + + if (CACHE_LAST_NULL_COLUMN(pCfg)) { + updateTableLatestColumn(pRepo, pTable, row); + } + } + + pTable->cacheLastConfigVersion = pRepo->cacheLastConfigVersion; + + return 0; +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbMeta.c b/source/dnode/vnode/tsdb2/src/tsdbMeta.c new file mode 100644 index 0000000000..a095bff61e --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbMeta.c @@ -0,0 +1,1687 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#include "tsdbint.h" +#include "tcompare.h" +#include "tutil.h" + +#define TSDB_SUPER_TABLE_SL_LEVEL 5 +#define DEFAULT_TAG_INDEX_COLUMN 0 + +static char * getTagIndexKey(const void *pData); +static STable *tsdbNewTable(); +static STable *tsdbCreateTableFromCfg(STableCfg *pCfg, bool isSuper, STable *pSTable); +static void tsdbFreeTable(STable *pTable); +static int tsdbAddTableToMeta(STsdbRepo *pRepo, STable *pTable, bool addIdx, bool lock); +static void tsdbRemoveTableFromMeta(STsdbRepo *pRepo, STable *pTable, bool rmFromIdx, bool lock); +static int tsdbAddTableIntoIndex(STsdbMeta *pMeta, STable *pTable, bool refSuper); +static int tsdbRemoveTableFromIndex(STsdbMeta *pMeta, STable *pTable); +static int tsdbInitTableCfg(STableCfg *config, ETableType type, uint64_t uid, int32_t tid); +static int tsdbTableSetSchema(STableCfg *config, STSchema *pSchema, bool dup); +static int tsdbTableSetName(STableCfg *config, char *name, bool dup); +static int tsdbTableSetTagSchema(STableCfg *config, STSchema *pSchema, bool dup); +static int tsdbTableSetSName(STableCfg *config, char *sname, bool dup); +static int tsdbTableSetSuperUid(STableCfg *config, uint64_t uid); +static int tsdbTableSetTagValue(STableCfg *config, SKVRow row, bool dup); +static int tsdbTableSetStreamSql(STableCfg *config, char *sql, bool dup); +static int tsdbEncodeTableName(void **buf, tstr *name); +static void * tsdbDecodeTableName(void *buf, tstr **name); +static int tsdbEncodeTable(void **buf, STable *pTable); +static void * tsdbDecodeTable(void *buf, STable **pRTable); +static int tsdbGetTableEncodeSize(int8_t act, STable *pTable); +static void * tsdbInsertTableAct(STsdbRepo *pRepo, int8_t act, void *buf, STable *pTable); +static int tsdbRemoveTableFromStore(STsdbRepo *pRepo, STable *pTable); +static int tsdbRmTableFromMeta(STsdbRepo *pRepo, STable *pTable); +static int tsdbAdjustMetaTables(STsdbRepo *pRepo, int tid); +static int tsdbCheckTableTagVal(SKVRow *pKVRow, STSchema *pSchema); +static int tsdbInsertNewTableAction(STsdbRepo *pRepo, STable* pTable); +static int tsdbAddSchema(STable *pTable, STSchema *pSchema); +static void tsdbFreeTableSchema(STable *pTable); + +// ------------------ OUTER FUNCTIONS ------------------ +int tsdbCreateTable(STsdbRepo *repo, STableCfg *pCfg) { + STsdbRepo *pRepo = (STsdbRepo *)repo; + STsdbMeta *pMeta = pRepo->tsdbMeta; + STable * super = NULL; + STable * table = NULL; + bool newSuper = false; + bool superChanged = false; + int tid = pCfg->tableId.tid; + STable * pTable = NULL; + + if (tid < 1 || tid > TSDB_MAX_TABLES) { + tsdbError("vgId:%d failed to create table since invalid tid %d", REPO_ID(pRepo), tid); + terrno = TSDB_CODE_TDB_IVD_CREATE_TABLE_INFO; + goto _err; + } + + if (tid < pMeta->maxTables && pMeta->tables[tid] != NULL) { + if (TABLE_UID(pMeta->tables[tid]) == pCfg->tableId.uid) { + tsdbError("vgId:%d table %s already exists, tid %d uid %" PRId64, REPO_ID(pRepo), + TABLE_CHAR_NAME(pMeta->tables[tid]), TABLE_TID(pMeta->tables[tid]), TABLE_UID(pMeta->tables[tid])); + return 0; + } else { + tsdbInfo("vgId:%d table %s at tid %d uid %" PRIu64 + " exists, replace it with new table, this can be not reasonable", + REPO_ID(pRepo), TABLE_CHAR_NAME(pMeta->tables[tid]), TABLE_TID(pMeta->tables[tid]), + TABLE_UID(pMeta->tables[tid])); + tsdbDropTable(pRepo, pMeta->tables[tid]->tableId); + } + } + + pTable = tsdbGetTableByUid(pMeta, pCfg->tableId.uid); + if (pTable != NULL) { + tsdbError("vgId:%d table %s already exists, tid %d uid %" PRId64, REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + TABLE_TID(pTable), TABLE_UID(pTable)); + terrno = TSDB_CODE_TDB_TABLE_ALREADY_EXIST; + goto _err; + } + + if (pCfg->type == TSDB_CHILD_TABLE) { + super = tsdbGetTableByUid(pMeta, pCfg->superUid); + if (super == NULL) { // super table not exists, try to create it + newSuper = true; + super = tsdbCreateTableFromCfg(pCfg, true, NULL); + if (super == NULL) goto _err; + } else { + if (TABLE_TYPE(super) != TSDB_SUPER_TABLE || TABLE_UID(super) != pCfg->superUid) { + terrno = TSDB_CODE_TDB_IVD_CREATE_TABLE_INFO; + goto _err; + } + + if (schemaVersion(pCfg->tagSchema) > schemaVersion(super->tagSchema)) { + // tag schema out of date, need to update super table tag version + STSchema *pOldSchema = super->tagSchema; + TSDB_WLOCK_TABLE(super); + super->tagSchema = tdDupSchema(pCfg->tagSchema); + TSDB_WUNLOCK_TABLE(super); + tdFreeSchema(pOldSchema); + + superChanged = true; + } + } + } + + table = tsdbCreateTableFromCfg(pCfg, false, super); + if (table == NULL) goto _err; + + // Register to meta + tsdbWLockRepoMeta(pRepo); + if (newSuper) { + if (tsdbAddTableToMeta(pRepo, super, true, false) < 0) { + super = NULL; + tsdbUnlockRepoMeta(pRepo); + goto _err; + } + } + if (tsdbAddTableToMeta(pRepo, table, true, false) < 0) { + table = NULL; + tsdbUnlockRepoMeta(pRepo); + goto _err; + } + tsdbUnlockRepoMeta(pRepo); + + // Write to memtable action + if (newSuper || superChanged) { + // add insert new super table action + if (tsdbInsertNewTableAction(pRepo, super) != 0) { + goto _err; + } + } + // add insert new table action + if (tsdbInsertNewTableAction(pRepo, table) != 0) { + goto _err; + } + + if (tsdbCheckCommit(pRepo) < 0) return -1; + + return 0; + +_err: + if (newSuper) { + tsdbFreeTable(super); + } + tsdbFreeTable(table); + return -1; +} + +int tsdbDropTable(STsdbRepo *repo, STableId tableId) { + STsdbRepo *pRepo = (STsdbRepo *)repo; + STsdbMeta *pMeta = pRepo->tsdbMeta; + uint64_t uid = tableId.uid; + int tid = 0; + char * tbname = NULL; + + STable *pTable = tsdbGetTableByUid(pMeta, uid); + if (pTable == NULL) { + tsdbError("vgId:%d failed to drop table since table not exists! tid:%d uid %" PRIu64, REPO_ID(pRepo), tableId.tid, + uid); + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + return -1; + } + + tsdbDebug("vgId:%d try to drop table %s type %d", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TYPE(pTable)); + + tid = TABLE_TID(pTable); + tbname = strdup(TABLE_CHAR_NAME(pTable)); + if (tbname == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + // Write to KV store first + if (tsdbRemoveTableFromStore(pRepo, pTable) < 0) { + tsdbError("vgId:%d failed to drop table %s since %s", REPO_ID(pRepo), tbname, tstrerror(terrno)); + goto _err; + } + + // Remove table from Meta + if (tsdbRmTableFromMeta(pRepo, pTable) < 0) { + tsdbError("vgId:%d failed to drop table %s since %s", REPO_ID(pRepo), tbname, tstrerror(terrno)); + goto _err; + } + + tsdbDebug("vgId:%d, table %s is dropped! tid:%d, uid:%" PRId64, pRepo->config.tsdbId, tbname, tid, uid); + free(tbname); + + if (tsdbCheckCommit(pRepo) < 0) goto _err; + + return 0; + +_err: + tfree(tbname); + return -1; +} + +void *tsdbGetTableTagVal(const void* pTable, int32_t colId, int16_t type) { + // TODO: this function should be changed also + + STSchema *pSchema = tsdbGetTableTagSchema((STable*) pTable); + STColumn *pCol = tdGetColOfID(pSchema, colId); + if (pCol == NULL) { + return NULL; // No matched tag volumn + } + + char *val = NULL; + if (pCol->type == TSDB_DATA_TYPE_JSON){ + val = ((STable*)pTable)->tagVal; + }else{ + val = tdGetKVRowValOfCol(((STable*)pTable)->tagVal, colId); + assert(type == pCol->type); + } + + return val; +} + +char *tsdbGetTableName(void* pTable) { + // TODO: need to change as thread-safe + + if (pTable == NULL) { + return NULL; + } else { + return (char*) (((STable *)pTable)->name); + } +} + +STableCfg *tsdbCreateTableCfgFromMsg(SMDCreateTableMsg *pMsg) { + if (pMsg == NULL) return NULL; + + SSchema *pSchema = (SSchema *)pMsg->data; + int16_t numOfCols = htons(pMsg->numOfColumns); + int16_t numOfTags = htons(pMsg->numOfTags); + + STSchemaBuilder schemaBuilder = {0}; + + STableCfg *pCfg = (STableCfg *)calloc(1, sizeof(STableCfg)); + if (pCfg == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + if (tsdbInitTableCfg(pCfg, pMsg->tableType, htobe64(pMsg->uid), htonl(pMsg->tid)) < 0) goto _err; + if (tdInitTSchemaBuilder(&schemaBuilder, htonl(pMsg->sversion)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + for (int i = 0; i < numOfCols; i++) { + if (tdAddColToSchema(&schemaBuilder, pSchema[i].type, htons(pSchema[i].colId), htons(pSchema[i].bytes)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + } + if (tsdbTableSetSchema(pCfg, tdGetSchemaFromBuilder(&schemaBuilder), false) < 0) goto _err; + if (tsdbTableSetName(pCfg, pMsg->tableFname, true) < 0) goto _err; + + if (numOfTags > 0) { + // Decode tag schema + tdResetTSchemaBuilder(&schemaBuilder, htonl(pMsg->tversion)); + for (int i = numOfCols; i < numOfCols + numOfTags; i++) { + if (tdAddColToSchema(&schemaBuilder, pSchema[i].type, htons(pSchema[i].colId), htons(pSchema[i].bytes)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + } + if (tsdbTableSetTagSchema(pCfg, tdGetSchemaFromBuilder(&schemaBuilder), false) < 0) goto _err; + if (tsdbTableSetSName(pCfg, pMsg->stableFname, true) < 0) goto _err; + if (tsdbTableSetSuperUid(pCfg, htobe64(pMsg->superTableUid)) < 0) goto _err; + + int32_t tagDataLen = htonl(pMsg->tagDataLen); + if (tagDataLen) { + char *pTagData = pMsg->data + (numOfCols + numOfTags) * sizeof(SSchema); + tsdbTableSetTagValue(pCfg, pTagData, true); + } + } + + if (pMsg->tableType == TSDB_STREAM_TABLE) { + char *sql = pMsg->data + (numOfCols + numOfTags) * sizeof(SSchema); + tsdbTableSetStreamSql(pCfg, sql, true); + } + + tdDestroyTSchemaBuilder(&schemaBuilder); + + return pCfg; + +_err: + tdDestroyTSchemaBuilder(&schemaBuilder); + tsdbClearTableCfg(pCfg); + return NULL; +} + +static UNUSED_FUNC int32_t colIdCompar(const void* left, const void* right) { + int16_t colId = *(int16_t*) left; + STColumn* p2 = (STColumn*) right; + + if (colId == p2->colId) { + return 0; + } + + return (colId < p2->colId)? -1:1; +} + +int tsdbUpdateTableTagValue(STsdbRepo *repo, SUpdateTableTagValMsg *pMsg) { + STsdbRepo *pRepo = (STsdbRepo *)repo; + STsdbMeta *pMeta = pRepo->tsdbMeta; + STSchema * pNewSchema = NULL; + + pMsg->uid = htobe64(pMsg->uid); + pMsg->tid = htonl(pMsg->tid); + pMsg->tversion = htons(pMsg->tversion); + pMsg->colId = htons(pMsg->colId); + pMsg->bytes = htons(pMsg->bytes); + pMsg->tagValLen = htonl(pMsg->tagValLen); + pMsg->numOfTags = htons(pMsg->numOfTags); + pMsg->schemaLen = htonl(pMsg->schemaLen); + for (int i = 0; i < pMsg->numOfTags; i++) { + STColumn *pTCol = (STColumn *)pMsg->data + i; + pTCol->bytes = htons(pTCol->bytes); + pTCol->colId = htons(pTCol->colId); + } + + STable *pTable = tsdbGetTableByUid(pMeta, pMsg->uid); + if (pTable == NULL || TABLE_TID(pTable) != pMsg->tid) { + tsdbError("vgId:%d failed to update table tag value since invalid table id %d uid %" PRIu64, REPO_ID(pRepo), + pMsg->tid, pMsg->uid); + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + return -1; + } + + if (TABLE_TYPE(pTable) != TSDB_CHILD_TABLE) { + tsdbError("vgId:%d try to update tag value of a non-child table, invalid action", REPO_ID(pRepo)); + terrno = TSDB_CODE_TDB_INVALID_ACTION; + return -1; + } + + if (schemaVersion(pTable->pSuper->tagSchema) > pMsg->tversion) { + tsdbError( + "vgId:%d failed to update tag value of table %s since version out of date, client tag version %d server tag " + "version %d", + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), pMsg->tversion, schemaVersion(pTable->pSuper->tagSchema)); + terrno = TSDB_CODE_TDB_TAG_VER_OUT_OF_DATE; + return -1; + } + + if (schemaVersion(pTable->pSuper->tagSchema) < pMsg->tversion) { // tag schema out of data, + tsdbDebug("vgId:%d need to update tag schema of table %s tid %d uid %" PRIu64 + " since out of date, current version %d new version %d", + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable), + schemaVersion(pTable->pSuper->tagSchema), pMsg->tversion); + + STSchemaBuilder schemaBuilder = {0}; + + STColumn *pTCol = (STColumn *)pMsg->data; + ASSERT(pMsg->schemaLen % sizeof(STColumn) == 0 && pTCol[0].colId == colColId(schemaColAt(pTable->pSuper->tagSchema, 0))); + if (tdInitTSchemaBuilder(&schemaBuilder, pMsg->tversion) < 0) { + tsdbDebug("vgId:%d failed to update tag schema of table %s tid %d uid %" PRIu64 " since out of memory", + REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), TABLE_UID(pTable)); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + for (int i = 0; i < (pMsg->schemaLen / sizeof(STColumn)); i++) { + if (tdAddColToSchema(&schemaBuilder, pTCol[i].type, pTCol[i].colId, pTCol[i].bytes) < 0) { + tdDestroyTSchemaBuilder(&schemaBuilder); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } + pNewSchema = tdGetSchemaFromBuilder(&schemaBuilder); + if (pNewSchema == NULL) { + tdDestroyTSchemaBuilder(&schemaBuilder); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + tdDestroyTSchemaBuilder(&schemaBuilder); + } + + // Change in memory + if (pNewSchema != NULL) { // change super table tag schema + TSDB_WLOCK_TABLE(pTable->pSuper); + STSchema *pOldSchema = pTable->pSuper->tagSchema; + pTable->pSuper->tagSchema = pNewSchema; + tdFreeSchema(pOldSchema); + TSDB_WUNLOCK_TABLE(pTable->pSuper); + } + + bool isChangeIndexCol = (pMsg->colId == colColId(schemaColAt(pTable->pSuper->tagSchema, 0))) + || pMsg->type == TSDB_DATA_TYPE_JSON; + // STColumn *pCol = bsearch(&(pMsg->colId), pMsg->data, pMsg->numOfTags, sizeof(STColumn), colIdCompar); + // ASSERT(pCol != NULL); + + if (isChangeIndexCol) { + tsdbWLockRepoMeta(pRepo); + tsdbRemoveTableFromIndex(pMeta, pTable); + } + TSDB_WLOCK_TABLE(pTable); + if (pMsg->type == TSDB_DATA_TYPE_JSON){ + kvRowFree(pTable->tagVal); + pTable->tagVal = tdKVRowDup(POINTER_SHIFT(pMsg->data, pMsg->schemaLen)); + }else{ + tdSetKVRowDataOfCol(&(pTable->tagVal), pMsg->colId, pMsg->type, POINTER_SHIFT(pMsg->data, pMsg->schemaLen)); + } + TSDB_WUNLOCK_TABLE(pTable); + if (isChangeIndexCol) { + tsdbAddTableIntoIndex(pMeta, pTable, false); + tsdbUnlockRepoMeta(pRepo); + } + + // Update on file + int tlen1 = (pNewSchema) ? tsdbGetTableEncodeSize(TSDB_UPDATE_META, pTable->pSuper) : 0; + int tlen2 = tsdbGetTableEncodeSize(TSDB_UPDATE_META, pTable); + void *buf = tsdbAllocBytes(pRepo, tlen1+tlen2); + ASSERT(buf != NULL); + if (pNewSchema) { + void *pBuf = tsdbInsertTableAct(pRepo, TSDB_UPDATE_META, buf, pTable->pSuper); + ASSERT(POINTER_DISTANCE(pBuf, buf) == tlen1); + buf = pBuf; + } + tsdbInsertTableAct(pRepo, TSDB_UPDATE_META, buf, pTable); + + if (tsdbCheckCommit(pRepo) < 0) return -1; + + return 0; +} + +// ------------------ INTERNAL FUNCTIONS ------------------ +static int tsdbInsertNewTableAction(STsdbRepo *pRepo, STable* pTable) { + int tlen = 0; + void *pBuf = NULL; + + tlen = tsdbGetTableEncodeSize(TSDB_UPDATE_META, pTable); + pBuf = tsdbAllocBytes(pRepo, tlen); + if (pBuf == NULL) { + return -1; + } + void *tBuf = tsdbInsertTableAct(pRepo, TSDB_UPDATE_META, pBuf, pTable); + ASSERT(POINTER_DISTANCE(tBuf, pBuf) == tlen); + + return 0; +} + +STsdbMeta *tsdbNewMeta(STsdbCfg *pCfg) { + STsdbMeta *pMeta = (STsdbMeta *)calloc(1, sizeof(*pMeta)); + if (pMeta == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + int code = pthread_rwlock_init(&pMeta->rwLock, NULL); + if (code != 0) { + tsdbError("vgId:%d failed to init TSDB meta r/w lock since %s", pCfg->tsdbId, strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + goto _err; + } + + pMeta->maxTables = TSDB_INIT_NTABLES + 1; + pMeta->tables = (STable **)calloc(pMeta->maxTables, sizeof(STable *)); + if (pMeta->tables == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pMeta->superList = tdListNew(sizeof(STable *)); + if (pMeta->superList == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + pMeta->uidMap = taosHashInit((size_t)(TSDB_INIT_NTABLES * 1.1), taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, false); + if (pMeta->uidMap == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + return pMeta; + +_err: + tsdbFreeMeta(pMeta); + return NULL; +} + +void tsdbFreeMeta(STsdbMeta *pMeta) { + if (pMeta) { + taosHashCleanup(pMeta->uidMap); + tdListFree(pMeta->superList); + tfree(pMeta->tables); + pthread_rwlock_destroy(&pMeta->rwLock); + free(pMeta); + } +} + +int tsdbOpenMeta(STsdbRepo *pRepo) { + return 0; +#if 0 + char * fname = NULL; + STsdbMeta *pMeta = pRepo->tsdbMeta; + ASSERT(pMeta != NULL); + + fname = tsdbGetMetaFileName(pRepo->rootDir); + if (fname == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + // pMeta->pStore = tdOpenKVStore(fname, tsdbRestoreTable, tsdbOrgMeta, (void *)pRepo); + // if (pMeta->pStore == NULL) { + // tsdbError("vgId:%d failed to open TSDB meta while open the kv store since %s", REPO_ID(pRepo), tstrerror(terrno)); + // goto _err; + // } + + tsdbDebug("vgId:%d open TSDB meta succeed", REPO_ID(pRepo)); + tfree(fname); + return 0; + +_err: + tfree(fname); + return -1; +#endif +} + +int tsdbCloseMeta(STsdbRepo *pRepo) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + SListNode *pNode = NULL; + STable * pTable = NULL; + + if (pMeta == NULL) return 0; + // tdCloseKVStore(pMeta->pStore); + for (int i = 1; i < pMeta->maxTables; i++) { + tsdbFreeTable(pMeta->tables[i]); + } + + while ((pNode = tdListPopHead(pMeta->superList)) != NULL) { + tdListNodeGetData(pMeta->superList, pNode, (void *)(&pTable)); + tsdbFreeTable(pTable); + listNodeFree(pNode); + } + + tsdbDebug("vgId:%d TSDB meta is closed", REPO_ID(pRepo)); + return 0; +} + +STable *tsdbGetTableByUid(STsdbMeta *pMeta, uint64_t uid) { + void *ptr = taosHashGet(pMeta->uidMap, (char *)(&uid), sizeof(uid)); + + if (ptr == NULL) return NULL; + + return *(STable **)ptr; +} + +STSchema *tsdbGetTableSchemaByVersion(STable *pTable, int16_t _version, int8_t rowType) { + return tsdbGetTableSchemaImpl(pTable, true, false, _version, rowType); +} + +int tsdbWLockRepoMeta(STsdbRepo *pRepo) { + int code = pthread_rwlock_wrlock(&(pRepo->tsdbMeta->rwLock)); + if (code != 0) { + tsdbError("vgId:%d failed to write lock TSDB meta since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + + return 0; +} + +int tsdbRLockRepoMeta(STsdbRepo *pRepo) { + int code = pthread_rwlock_rdlock(&(pRepo->tsdbMeta->rwLock)); + if (code != 0) { + tsdbError("vgId:%d failed to read lock TSDB meta since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + + return 0; +} + +int tsdbUnlockRepoMeta(STsdbRepo *pRepo) { + int code = pthread_rwlock_unlock(&(pRepo->tsdbMeta->rwLock)); + if (code != 0) { + tsdbError("vgId:%d failed to unlock TSDB meta since %s", REPO_ID(pRepo), strerror(code)); + terrno = TAOS_SYSTEM_ERROR(code); + return -1; + } + + return 0; +} + +void tsdbRefTable(STable *pTable) { + int32_t ref = T_REF_INC(pTable); + UNUSED(ref); + tsdbDebug("ref table %s uid %" PRIu64 " tid:%d, refCount:%d", TABLE_CHAR_NAME(pTable), TABLE_UID(pTable), TABLE_TID(pTable), ref); +} + +void tsdbUnRefTable(STable *pTable) { + uint64_t uid = TABLE_UID(pTable); + int32_t tid = TABLE_TID(pTable); + int32_t ref = T_REF_DEC(pTable); + + tsdbDebug("unref table, uid:%" PRIu64 " tid:%d, refCount:%d", uid, tid, ref); + + if (ref == 0) { + if (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE) { + tsdbUnRefTable(pTable->pSuper); + } + tsdbFreeTable(pTable); + } +} + +void tsdbFreeLastColumns(STable* pTable) { + if (pTable->lastCols == NULL) { + return; + } + + for (int i = 0; i < pTable->maxColNum; ++i) { + if (pTable->lastCols[i].bytes == 0) { + continue; + } + tfree(pTable->lastCols[i].pData); + pTable->lastCols[i].bytes = 0; + pTable->lastCols[i].pData = NULL; + } + tfree(pTable->lastCols); + pTable->lastCols = NULL; + pTable->maxColNum = 0; + pTable->lastColSVersion = -1; + pTable->restoreColumnNum = 0; + pTable->hasRestoreLastColumn = false; +} + +int16_t tsdbGetLastColumnsIndexByColId(STable* pTable, int16_t colId) { + if (pTable->lastCols == NULL) { + return -1; + } + // TODO: use binary search instead + for (int16_t i = 0; i < pTable->maxColNum; ++i) { + if (pTable->lastCols[i].colId == colId) { + return i; + } + } + + return -1; +} + +int tsdbInitColIdCacheWithSchema(STable* pTable, STSchema* pSchema) { + TSDB_WLOCK_TABLE(pTable); + if (pTable->lastCols == NULL) { + int16_t numOfColumn = pSchema->numOfCols; + + pTable->lastCols = (SDataCol *)malloc(numOfColumn * sizeof(SDataCol)); + if (pTable->lastCols == NULL) { + TSDB_WUNLOCK_TABLE(pTable); + return -1; + } + + for (int16_t i = 0; i < numOfColumn; ++i) { + STColumn *pCol = schemaColAt(pSchema, i); + SDataCol *pDataCol = &(pTable->lastCols[i]); + pDataCol->bytes = 0; + pDataCol->pData = NULL; + pDataCol->colId = pCol->colId; + } + + pTable->lastColSVersion = schemaVersion(pSchema); + pTable->maxColNum = numOfColumn; + pTable->restoreColumnNum = 0; + pTable->hasRestoreLastColumn = false; + } + TSDB_WUNLOCK_TABLE(pTable); + return 0; +} + +STSchema* tsdbGetTableLatestSchema(STable *pTable) { + return tsdbGetTableSchemaByVersion(pTable, -1, -1); +} + +int tsdbUpdateLastColSchema(STable *pTable, STSchema *pNewSchema) { + if (pTable->lastColSVersion == schemaVersion(pNewSchema)) { + return 0; + } + + tsdbDebug("tsdbUpdateLastColSchema:%s,%d->%d", pTable->name->data, pTable->lastColSVersion, schemaVersion(pNewSchema)); + + int16_t numOfCols = pNewSchema->numOfCols; + SDataCol *lastCols = (SDataCol*)malloc(numOfCols * sizeof(SDataCol)); + if (lastCols == NULL) { + return -1; + } + + TSDB_WLOCK_TABLE(pTable); + + for (int16_t i = 0; i < numOfCols; ++i) { + STColumn *pCol = schemaColAt(pNewSchema, i); + int16_t idx = tsdbGetLastColumnsIndexByColId(pTable, pCol->colId); + + SDataCol* pDataCol = &(lastCols[i]); + if (idx != -1) { + // move col data to new last column array + SDataCol* pOldDataCol = &(pTable->lastCols[idx]); + memcpy(pDataCol, pOldDataCol, sizeof(SDataCol)); + } else { + // init new colid data + pDataCol->colId = pCol->colId; + pDataCol->bytes = 0; + pDataCol->pData = NULL; + } + } + + SDataCol *oldLastCols = pTable->lastCols; + int16_t oldLastColNum = pTable->maxColNum; + + pTable->lastColSVersion = schemaVersion(pNewSchema); + pTable->lastCols = lastCols; + pTable->maxColNum = numOfCols; + + if (oldLastCols == NULL) { + TSDB_WUNLOCK_TABLE(pTable); + return 0; + } + + // free old schema last column datas + for (int16_t i = 0; i < oldLastColNum; ++i) { + SDataCol* pDataCol = &(oldLastCols[i]); + if (pDataCol->bytes == 0) { + continue; + } + int16_t idx = tsdbGetLastColumnsIndexByColId(pTable, pDataCol->colId); + if (idx != -1) { + continue; + } + + // free not exist column data + tfree(pDataCol->pData); + } + TSDB_WUNLOCK_TABLE(pTable); + tfree(oldLastCols); + + return 0; +} + +void tsdbUpdateTableSchema(STsdbRepo *pRepo, STable *pTable, STSchema *pSchema, bool insertAct) { + ASSERT(TABLE_TYPE(pTable) != TSDB_STREAM_TABLE && TABLE_TYPE(pTable) != TSDB_SUPER_TABLE); + STsdbMeta *pMeta = pRepo->tsdbMeta; + + STable *pCTable = (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE) ? pTable->pSuper : pTable; + ASSERT(schemaVersion(pSchema) > schemaVersion(*(STSchema **)taosArrayGetLast(pCTable->schema))); + + TSDB_WLOCK_TABLE(pCTable); + tsdbAddSchema(pCTable, pSchema); + + if (schemaNCols(pSchema) > pMeta->maxCols) pMeta->maxCols = schemaNCols(pSchema); + if (schemaTLen(pSchema) > pMeta->maxRowBytes) pMeta->maxRowBytes = schemaTLen(pSchema); + TSDB_WUNLOCK_TABLE(pCTable); + + if (insertAct) { + if (tsdbInsertNewTableAction(pRepo, pCTable) != 0) { + tsdbError("vgId:%d table %s tid %d uid %" PRIu64 " tsdbInsertNewTableAction fail", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + TABLE_TID(pTable), TABLE_UID(pTable)); + } + } +} + +int tsdbRestoreTable(STsdbRepo *pRepo, void *cont, int contLen) { + STable *pTable = NULL; + + if (!taosCheckChecksumWhole((uint8_t *)cont, contLen)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return -1; + } + + tsdbDecodeTable(cont, &pTable); + + if (tsdbAddTableToMeta(pRepo, pTable, false, false) < 0) { + tsdbFreeTable(pTable); + return -1; + } + + tsdbTrace("vgId:%d table %s tid %d uid %" PRIu64 " is restored from file", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + TABLE_TID(pTable), TABLE_UID(pTable)); + return 0; +} + +void tsdbOrgMeta(STsdbRepo *pRepo) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + + for (int i = 1; i < pMeta->maxTables; i++) { + STable *pTable = pMeta->tables[i]; + if (pTable != NULL && pTable->type == TSDB_CHILD_TABLE) { + tsdbAddTableIntoIndex(pMeta, pTable, true); + } + } +} + +// ------------------ LOCAL FUNCTIONS ------------------ +static char *getTagIndexKey(const void *pData) { + STable *pTable = (STable *)pData; + + STSchema *pSchema = tsdbGetTableTagSchema(pTable); + STColumn *pCol = schemaColAt(pSchema, DEFAULT_TAG_INDEX_COLUMN); + void * res = tdGetKVRowValOfCol(pTable->tagVal, pCol->colId); + if (res == NULL) { + // treat the column as NULL if we cannot find it + res = (char*)getNullValue(pCol->type); + } + return res; +} + +static STable *tsdbNewTable() { + STable *pTable = (STable *)calloc(1, sizeof(*pTable)); + if (pTable == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + pTable->lastKey = TSKEY_INITIAL_VAL; + + pTable->lastCols = NULL; + pTable->restoreColumnNum = 0; + pTable->cacheLastConfigVersion = 0; + pTable->maxColNum = 0; + pTable->hasRestoreLastColumn = false; + pTable->lastColSVersion = -1; + return pTable; +} + +static STable *tsdbCreateTableFromCfg(STableCfg *pCfg, bool isSuper, STable *pSTable) { + STable *pTable = NULL; + size_t tsize = 0; + + pTable = tsdbNewTable(); + if (pTable == NULL) goto _err; + + if (isSuper) { + pTable->type = TSDB_SUPER_TABLE; + tsize = strnlen(pCfg->sname, TSDB_TABLE_NAME_LEN - 1); + pTable->name = calloc(1, tsize + VARSTR_HEADER_SIZE + 1); + if (pTable->name == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + STR_WITH_SIZE_TO_VARSTR(pTable->name, pCfg->sname, (VarDataLenT)tsize); + TABLE_UID(pTable) = pCfg->superUid; + TABLE_TID(pTable) = -1; + TABLE_SUID(pTable) = -1; + pTable->pSuper = NULL; + if (tsdbAddSchema(pTable, tdDupSchema(pCfg->schema)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + pTable->tagSchema = tdDupSchema(pCfg->tagSchema); + if (pTable->tagSchema == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + pTable->tagVal = NULL; + STColumn *pCol = schemaColAt(pTable->tagSchema, DEFAULT_TAG_INDEX_COLUMN); + if(pCol->type == TSDB_DATA_TYPE_JSON){ + assert(pTable->tagSchema->numOfCols == 1); + pTable->jsonKeyMap = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (pTable->jsonKeyMap == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbFreeTable(pTable); + return NULL; + } + taosHashSetFreeFp(pTable->jsonKeyMap, taosArrayDestroyForHash); + }else{ + pTable->pIndex = tSkipListCreate(TSDB_SUPER_TABLE_SL_LEVEL, colType(pCol), (uint8_t)(colBytes(pCol)), NULL, + SL_ALLOW_DUP_KEY, getTagIndexKey); + if (pTable->pIndex == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + } + } else { + pTable->type = pCfg->type; + tsize = strnlen(pCfg->name, TSDB_TABLE_NAME_LEN - 1); + pTable->name = calloc(1, tsize + VARSTR_HEADER_SIZE + 1); + if (pTable->name == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + STR_WITH_SIZE_TO_VARSTR(pTable->name, pCfg->name, (VarDataLenT)tsize); + TABLE_UID(pTable) = pCfg->tableId.uid; + TABLE_TID(pTable) = pCfg->tableId.tid; + + if (pCfg->type == TSDB_CHILD_TABLE) { + TABLE_SUID(pTable) = pCfg->superUid; + if (tsdbCheckTableTagVal(pCfg->tagValues, pSTable->tagSchema) < 0) { + goto _err; + } + pTable->tagVal = tdKVRowDup(pCfg->tagValues); + if (pTable->tagVal == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + } else { + TABLE_SUID(pTable) = -1; + if (tsdbAddSchema(pTable, tdDupSchema(pCfg->schema)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + + if (TABLE_TYPE(pTable) == TSDB_STREAM_TABLE) { + pTable->sql = strdup(pCfg->sql); + if (pTable->sql == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _err; + } + } + } + } + + T_REF_INC(pTable); + + tsdbDebug("table %s tid %d uid %" PRIu64 " is created", TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), + TABLE_UID(pTable)); + + return pTable; + +_err: + tsdbFreeTable(pTable); + return NULL; +} + +static void tsdbFreeTable(STable *pTable) { + if (pTable) { + if (pTable->name != NULL) + tsdbTrace("table %s tid %d uid %" PRIu64 " is freed", TABLE_CHAR_NAME(pTable), TABLE_TID(pTable), + TABLE_UID(pTable)); + tfree(TABLE_NAME(pTable)); + if (TABLE_TYPE(pTable) != TSDB_CHILD_TABLE) { + tsdbFreeTableSchema(pTable); + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + tdFreeSchema(pTable->tagSchema); + } + } + + kvRowFree(pTable->tagVal); + + tSkipListDestroy(pTable->pIndex); + taosHashCleanup(pTable->jsonKeyMap); + taosTZfree(pTable->lastRow); + tfree(pTable->sql); + + tsdbFreeLastColumns(pTable); + free(pTable); + } +} + +static int tsdbAddTableToMeta(STsdbRepo *pRepo, STable *pTable, bool addIdx, bool lock) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + + if (lock && tsdbWLockRepoMeta(pRepo) < 0) { + tsdbError("vgId:%d failed to add table %s to meta since %s", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + tstrerror(terrno)); + return -1; + } + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + if (tdListAppend(pMeta->superList, (void *)(&pTable)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("vgId:%d failed to add table %s to meta since %s", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + tstrerror(terrno)); + goto _err; + } + } else { + if (TABLE_TID(pTable) >= pMeta->maxTables) { + if (tsdbAdjustMetaTables(pRepo, TABLE_TID(pTable)) < 0) goto _err; + } + if (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE && addIdx) { // add STABLE to the index + if (tsdbAddTableIntoIndex(pMeta, pTable, true) < 0) { + tsdbDebug("vgId:%d failed to add table %s to meta while add table to index since %s", REPO_ID(pRepo), + TABLE_CHAR_NAME(pTable), tstrerror(terrno)); + goto _err; + } + } + ASSERT(TABLE_TID(pTable) < pMeta->maxTables); + pMeta->tables[TABLE_TID(pTable)] = pTable; + pMeta->nTables++; + } + + if (taosHashPut(pMeta->uidMap, (char *)(&pTable->tableId.uid), sizeof(pTable->tableId.uid), (void *)(&pTable), + sizeof(pTable)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("vgId:%d failed to add table %s to meta while put into uid map since %s", REPO_ID(pRepo), + TABLE_CHAR_NAME(pTable), tstrerror(terrno)); + goto _err; + } + + if (TABLE_TYPE(pTable) != TSDB_CHILD_TABLE) { + STSchema *pSchema = tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); + if (schemaNCols(pSchema) > pMeta->maxCols) pMeta->maxCols = schemaNCols(pSchema); + if (schemaTLen(pSchema) > pMeta->maxRowBytes) pMeta->maxRowBytes = schemaTLen(pSchema); + } + + if (lock && tsdbUnlockRepoMeta(pRepo) < 0) return -1; + if (TABLE_TYPE(pTable) == TSDB_STREAM_TABLE && addIdx) { + pTable->cqhandle = (*pRepo->appH.cqCreateFunc)(pRepo->appH.cqH, TABLE_UID(pTable), TABLE_TID(pTable), TABLE_NAME(pTable)->data, pTable->sql, + tsdbGetTableSchemaImpl(pTable, false, false, -1, -1), 1); + } + + tsdbDebug("vgId:%d table %s tid %d uid %" PRIu64 " is added to meta", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), + TABLE_TID(pTable), TABLE_UID(pTable)); + return 0; + +_err: + tsdbRemoveTableFromMeta(pRepo, pTable, false, false); + if (lock) tsdbUnlockRepoMeta(pRepo); + return -1; +} + +static void tsdbRemoveTableFromMeta(STsdbRepo *pRepo, STable *pTable, bool rmFromIdx, bool lock) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + SListIter lIter = {0}; + SListNode *pNode = NULL; + STable * tTable = NULL; + + STSchema *pSchema = tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); + int maxCols = schemaNCols(pSchema); + int maxRowBytes = schemaTLen(pSchema); + + if (lock) tsdbWLockRepoMeta(pRepo); + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + tdListInitIter(pMeta->superList, &lIter, TD_LIST_BACKWARD); + + while ((pNode = tdListNext(&lIter)) != NULL) { + tdListNodeGetData(pMeta->superList, pNode, (void *)(&tTable)); + if (pTable == tTable) { + tdListPopNode(pMeta->superList, pNode); + free(pNode); + break; + } + } + } else { + pMeta->tables[pTable->tableId.tid] = NULL; + if (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE && rmFromIdx) { + tsdbRemoveTableFromIndex(pMeta, pTable); + } + + pMeta->nTables--; + } + + taosHashRemove(pMeta->uidMap, (char *)(&(TABLE_UID(pTable))), sizeof(TABLE_UID(pTable))); + + if (maxCols == pMeta->maxCols || maxRowBytes == pMeta->maxRowBytes) { + maxCols = 0; + maxRowBytes = 0; + for (int i = 0; i < pMeta->maxTables; i++) { + STable *_pTable = pMeta->tables[i]; + if (_pTable != NULL) { + pSchema = tsdbGetTableSchemaImpl(_pTable, false, false, -1, -1); + maxCols = MAX(maxCols, schemaNCols(pSchema)); + maxRowBytes = MAX(maxRowBytes, schemaTLen(pSchema)); + } + } + } + pMeta->maxCols = maxCols; + pMeta->maxRowBytes = maxRowBytes; + + if (lock) tsdbUnlockRepoMeta(pRepo); + tsdbDebug("vgId:%d table %s uid %" PRIu64 " is removed from meta", REPO_ID(pRepo), TABLE_CHAR_NAME(pTable), TABLE_UID(pTable)); + tsdbUnRefTable(pTable); +} + +void* tsdbGetJsonTagValue(STable* pTable, char* key, int32_t keyLen, int16_t* retColId){ + assert(TABLE_TYPE(pTable) == TSDB_CHILD_TABLE); + STable* superTable= pTable->pSuper; + SArray** data = (SArray**)taosHashGet(superTable->jsonKeyMap, key, keyLen); + if(data == NULL) return NULL; + JsonMapValue jmvalue = {pTable, 0}; + JsonMapValue* p = taosArraySearch(*data, &jmvalue, tsdbCompareJsonMapValue, TD_EQ); + if (p == NULL) return NULL; + int16_t colId = p->colId + 1; + if(retColId) *retColId = p->colId; + return tdGetKVRowValOfCol(pTable->tagVal, colId); +} + +int tsdbCompareJsonMapValue(const void* a, const void* b) { + const JsonMapValue* x = (const JsonMapValue*)a; + const JsonMapValue* y = (const JsonMapValue*)b; + if (x->table > y->table) return 1; + if (x->table < y->table) return -1; + return 0; +} + +static int tsdbAddTableIntoIndex(STsdbMeta *pMeta, STable *pTable, bool refSuper) { + ASSERT(pTable->type == TSDB_CHILD_TABLE && pTable != NULL); + STable *pSTable = tsdbGetTableByUid(pMeta, TABLE_SUID(pTable)); + ASSERT(pSTable != NULL); + + pTable->pSuper = pSTable; + if (refSuper) T_REF_INC(pSTable); + + if(pSTable->tagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + ASSERT(pSTable->tagSchema->numOfCols == 1); + int16_t nCols = kvRowNCols(pTable->tagVal); + ASSERT(nCols%2 == 1); + // check first + for (int j = 0; j < nCols; ++j) { + if (j != 0 && j % 2 == 0) continue; // jump value + SColIdx *pColIdx = kvRowColIdxAt(pTable->tagVal, j); + void *val = (kvRowColVal(pTable->tagVal, pColIdx)); + if (j == 0) { // json value is the first + int8_t jsonPlaceHolder = *(int8_t *)val; + ASSERT(jsonPlaceHolder == TSDB_DATA_JSON_PLACEHOLDER); + continue; + } + if (j == 1) { + uint32_t jsonNULL = *(uint32_t *)(varDataVal(val)); + ASSERT(jsonNULL == TSDB_DATA_JSON_NULL); + } + + // then insert + char keyMd5[TSDB_MAX_JSON_KEY_MD5_LEN] = {0}; + jsonKeyMd5(varDataVal(val), varDataLen(val), keyMd5); + SArray *tablistNew = NULL; + SArray **tablist = (SArray **)taosHashGet(pSTable->jsonKeyMap, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN); + if (tablist == NULL) { + tablistNew = taosArrayInit(8, sizeof(JsonMapValue)); + if (tablistNew == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("out of memory when alloc json tag array"); + return -1; + } + if (taosHashPut(pSTable->jsonKeyMap, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN, &tablistNew, sizeof(void *)) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbError("out of memory when put json tag array"); + return -1; + } + } else { + tablistNew = *tablist; + } + + JsonMapValue jmvalue = {pTable, pColIdx->colId}; + void* p = taosArraySearch(tablistNew, &jmvalue, tsdbCompareJsonMapValue, TD_EQ); + if (p == NULL) { + p = taosArraySearch(tablistNew, &jmvalue, tsdbCompareJsonMapValue, TD_GE); + if(p == NULL){ + taosArrayPush(tablistNew, &jmvalue); + }else{ + taosArrayInsert(tablistNew, TARRAY_ELEM_IDX(tablistNew, p), &jmvalue); + } + }else{ + tsdbError("insert dumplicate"); + } + } + }else{ + tSkipListPut(pSTable->pIndex, (void *)pTable); + } + + return 0; +} + +static int tsdbRemoveTableFromIndex(STsdbMeta *pMeta, STable *pTable) { + ASSERT(pTable->type == TSDB_CHILD_TABLE && pTable != NULL); + + STable *pSTable = pTable->pSuper; + ASSERT(pSTable != NULL); + + if(pSTable->tagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + ASSERT(pSTable->tagSchema->numOfCols == 1); + int16_t nCols = kvRowNCols(pTable->tagVal); + ASSERT(nCols%2 == 1); + for (int j = 0; j < nCols; ++j) { + if (j != 0 && j%2 == 0) continue; // jump value + SColIdx * pColIdx = kvRowColIdxAt(pTable->tagVal, j); + void* val = (kvRowColVal(pTable->tagVal, pColIdx)); + if (j == 0){ // json value is the first + int8_t jsonPlaceHolder = *(int8_t*)val; + ASSERT(jsonPlaceHolder == TSDB_DATA_JSON_PLACEHOLDER); + continue; + } + if (j == 1){ + uint32_t jsonNULL = *(uint32_t*)(varDataVal(val)); + ASSERT(jsonNULL == TSDB_DATA_JSON_NULL); + } + + char keyMd5[TSDB_MAX_JSON_KEY_MD5_LEN] = {0}; + jsonKeyMd5(varDataVal(val), varDataLen(val), keyMd5); + SArray** tablist = (SArray **)taosHashGet(pSTable->jsonKeyMap, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN); + if(tablist == NULL) { + tsdbError("json tag no key error,%d", j); + continue; + } + + JsonMapValue jmvalue = {pTable, pColIdx->colId}; + void* p = taosArraySearch(*tablist, &jmvalue, tsdbCompareJsonMapValue, TD_EQ); + if (p == NULL) { + tsdbError("json tag no tableid error,%d", j); + continue; + } + taosArrayRemove(*tablist, TARRAY_ELEM_IDX(*tablist, p)); + } + }else { + char * key = getTagIndexKey(pTable); + SArray *res = tSkipListGet(pSTable->pIndex, key); + + size_t size = taosArrayGetSize(res); + ASSERT(size > 0); + + for (int32_t i = 0; i < size; ++i) { + SSkipListNode *pNode = taosArrayGetP(res, i); + + // STableIndexElem* pElem = (STableIndexElem*) SL_GET_NODE_DATA(pNode); + if ((STable *)SL_GET_NODE_DATA(pNode) == pTable) { // this is the exact what we need + tSkipListRemoveNode(pSTable->pIndex, pNode); + } + } + + taosArrayDestroy(&res); + } + return 0; +} + +static int tsdbInitTableCfg(STableCfg *config, ETableType type, uint64_t uid, int32_t tid) { + if (type != TSDB_CHILD_TABLE && type != TSDB_NORMAL_TABLE && type != TSDB_STREAM_TABLE) { + terrno = TSDB_CODE_TDB_INVALID_TABLE_TYPE; + return -1; + } + + memset((void *)config, 0, sizeof(*config)); + + config->type = type; + config->superUid = TSDB_INVALID_SUPER_TABLE_ID; + config->tableId.uid = uid; + config->tableId.tid = tid; + return 0; +} + +static int tsdbTableSetSchema(STableCfg *config, STSchema *pSchema, bool dup) { + if (dup) { + config->schema = tdDupSchema(pSchema); + if (config->schema == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->schema = pSchema; + } + return 0; +} + +static int tsdbTableSetName(STableCfg *config, char *name, bool dup) { + if (dup) { + config->name = strdup(name); + if (config->name == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->name = name; + } + + return 0; +} + +static int tsdbTableSetTagSchema(STableCfg *config, STSchema *pSchema, bool dup) { + if (config->type != TSDB_CHILD_TABLE) { + terrno = TSDB_CODE_TDB_INVALID_CREATE_TB_MSG; + return -1; + } + + if (dup) { + config->tagSchema = tdDupSchema(pSchema); + if (config->tagSchema == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->tagSchema = pSchema; + } + return 0; +} + +static int tsdbTableSetSName(STableCfg *config, char *sname, bool dup) { + if (config->type != TSDB_CHILD_TABLE) { + terrno = TSDB_CODE_TDB_INVALID_CREATE_TB_MSG; + return -1; + } + + if (dup) { + config->sname = strdup(sname); + if (config->sname == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->sname = sname; + } + return 0; +} + +static int tsdbTableSetSuperUid(STableCfg *config, uint64_t uid) { + if (config->type != TSDB_CHILD_TABLE || uid == TSDB_INVALID_SUPER_TABLE_ID) { + terrno = TSDB_CODE_TDB_INVALID_CREATE_TB_MSG; + return -1; + } + + config->superUid = uid; + return 0; +} + +static int tsdbTableSetTagValue(STableCfg *config, SKVRow row, bool dup) { + if (config->type != TSDB_CHILD_TABLE) { + terrno = TSDB_CODE_TDB_INVALID_CREATE_TB_MSG; + return -1; + } + + if (dup) { + config->tagValues = tdKVRowDup(row); + if (config->tagValues == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->tagValues = row; + } + + return 0; +} + +static int tsdbTableSetStreamSql(STableCfg *config, char *sql, bool dup) { + if (config->type != TSDB_STREAM_TABLE) { + terrno = TSDB_CODE_TDB_INVALID_CREATE_TB_MSG; + return -1; + } + + if (dup) { + config->sql = strdup(sql); + if (config->sql == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + } else { + config->sql = sql; + } + + return 0; +} + +void tsdbClearTableCfg(STableCfg *config) { + if (config) { + if (config->schema) tdFreeSchema(config->schema); + if (config->tagSchema) tdFreeSchema(config->tagSchema); + if (config->tagValues) kvRowFree(config->tagValues); + tfree(config->name); + tfree(config->sname); + tfree(config->sql); + free(config); + } +} + +static int tsdbEncodeTableName(void **buf, tstr *name) { + int tlen = 0; + + tlen += taosEncodeFixedI16(buf, name->len); + if (buf != NULL) { + memcpy(*buf, name->data, name->len); + *buf = POINTER_SHIFT(*buf, name->len); + } + tlen += name->len; + + return tlen; +} + +static void *tsdbDecodeTableName(void *buf, tstr **name) { + VarDataLenT len = 0; + + buf = taosDecodeFixedI16(buf, &len); + *name = calloc(1, sizeof(tstr) + len + 1); + if (*name == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + (*name)->len = len; + memcpy((*name)->data, buf, len); + + buf = POINTER_SHIFT(buf, len); + return buf; +} + +static int tsdbEncodeTable(void **buf, STable *pTable) { + ASSERT(pTable != NULL); + int tlen = 0; + + tlen += taosEncodeFixedU8(buf, pTable->type); + tlen += tsdbEncodeTableName(buf, pTable->name); + tlen += taosEncodeFixedU64(buf, TABLE_UID(pTable)); + tlen += taosEncodeFixedI32(buf, TABLE_TID(pTable)); + + if (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE) { + tlen += taosEncodeFixedU64(buf, TABLE_SUID(pTable)); + tlen += tdEncodeKVRow(buf, pTable->tagVal); + } else { + uint32_t arraySize = (uint32_t)taosArrayGetSize(pTable->schema); + if(arraySize > UINT8_MAX) { + tlen += taosEncodeFixedU8(buf, 0); + tlen += taosEncodeFixedU32(buf, arraySize); + } else { + tlen += taosEncodeFixedU8(buf, (uint8_t)arraySize); + } + for (uint32_t i = 0; i < arraySize; i++) { + STSchema *pSchema = taosArrayGetP(pTable->schema, i); + tlen += tdEncodeSchema(buf, pSchema); + } + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + tlen += tdEncodeSchema(buf, pTable->tagSchema); + } + + if (TABLE_TYPE(pTable) == TSDB_STREAM_TABLE) { + tlen += taosEncodeString(buf, pTable->sql); + } + } + + return tlen; +} + +static void *tsdbDecodeTable(void *buf, STable **pRTable) { + STable *pTable = tsdbNewTable(); + if (pTable == NULL) return NULL; + + uint8_t type = 0; + + buf = taosDecodeFixedU8(buf, &type); + pTable->type = type; + buf = tsdbDecodeTableName(buf, &(pTable->name)); + buf = taosDecodeFixedU64(buf, &TABLE_UID(pTable)); + buf = taosDecodeFixedI32(buf, &TABLE_TID(pTable)); + + if (TABLE_TYPE(pTable) == TSDB_CHILD_TABLE) { + buf = taosDecodeFixedU64(buf, &TABLE_SUID(pTable)); + buf = tdDecodeKVRow(buf, &(pTable->tagVal)); + } else { + uint32_t nSchemas = 0; + buf = taosDecodeFixedU8(buf, (uint8_t *)&nSchemas); + if(nSchemas == 0) { + buf = taosDecodeFixedU32(buf, &nSchemas); + } + for (int i = 0; i < nSchemas; i++) { + STSchema *pSchema; + buf = tdDecodeSchema(buf, &pSchema); + tsdbAddSchema(pTable, pSchema); + } + + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + buf = tdDecodeSchema(buf, &(pTable->tagSchema)); + STColumn *pCol = schemaColAt(pTable->tagSchema, DEFAULT_TAG_INDEX_COLUMN); + if(pCol->type == TSDB_DATA_TYPE_JSON){ + assert(pTable->tagSchema->numOfCols == 1); + pTable->jsonKeyMap = taosHashInit(8, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (pTable->jsonKeyMap == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbFreeTable(pTable); + return NULL; + } + taosHashSetFreeFp(pTable->jsonKeyMap, taosArrayDestroyForHash); + }else{ + pTable->pIndex = tSkipListCreate(TSDB_SUPER_TABLE_SL_LEVEL, colType(pCol), (uint8_t)(colBytes(pCol)), NULL, + SL_ALLOW_DUP_KEY, getTagIndexKey); + if (pTable->pIndex == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbFreeTable(pTable); + return NULL; + } + } + } + + if (TABLE_TYPE(pTable) == TSDB_STREAM_TABLE) { + buf = taosDecodeString(buf, &(pTable->sql)); + } + } + + T_REF_INC(pTable); + + *pRTable = pTable; + + return buf; +} + +static SArray* getJsonTagTableList(STable *pTable){ + uint32_t key = TSDB_DATA_JSON_NULL; + char keyMd5[TSDB_MAX_JSON_KEY_MD5_LEN] = {0}; + jsonKeyMd5(&key, INT_BYTES, keyMd5); + SArray** tablist = (SArray**)taosHashGet(pTable->jsonKeyMap, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN); + + return *tablist; +} + +static int tsdbGetTableEncodeSize(int8_t act, STable *pTable) { + int tlen = 0; + if (act == TSDB_UPDATE_META) { + tlen = sizeof(SListNode) + sizeof(SActObj) + sizeof(SActCont) + tsdbEncodeTable(NULL, pTable) + sizeof(TSCKSUM); + } else { + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + size_t tableSize = 0; + if(pTable->tagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + SArray* tablist = getJsonTagTableList(pTable); + tableSize = taosArrayGetSize(tablist); + }else{ + tableSize = SL_SIZE(pTable->pIndex); + } + tlen = (int)((sizeof(SListNode) + sizeof(SActObj)) * (tableSize + 1)); + } else { + tlen = sizeof(SListNode) + sizeof(SActObj); + } + } + + return tlen; +} + +static void *tsdbInsertTableAct(STsdbRepo *pRepo, int8_t act, void *buf, STable *pTable) { + SListNode *pNode = (SListNode *)buf; + SActObj * pAct = (SActObj *)(pNode->data); + SActCont * pCont = (SActCont *)POINTER_SHIFT(pAct, sizeof(*pAct)); + void * pBuf = (void *)pCont; + + pNode->prev = pNode->next = NULL; + pAct->act = act; + pAct->uid = TABLE_UID(pTable); + + if (act == TSDB_UPDATE_META) { + pBuf = (void *)(pCont->cont); + pCont->len = tsdbEncodeTable(&pBuf, pTable) + sizeof(TSCKSUM); + taosCalcChecksumAppend(0, (uint8_t *)pCont->cont, pCont->len); + pBuf = POINTER_SHIFT(pBuf, sizeof(TSCKSUM)); + } + + tdListAppendNode(pRepo->mem->actList, pNode); + + return pBuf; +} + +static int tsdbRemoveTableFromStore(STsdbRepo *pRepo, STable *pTable) { + int tlen = tsdbGetTableEncodeSize(TSDB_DROP_META, pTable); + void *buf = tsdbAllocBytes(pRepo, tlen); + if (buf == NULL) { + return -1; + } + + void *pBuf = buf; + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + if(pTable->tagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + SArray* tablist = getJsonTagTableList(pTable); + for (int i = 0; i < taosArrayGetSize(tablist); ++i) { + JsonMapValue* p = taosArrayGet(tablist, i); + ASSERT(TABLE_TYPE((STable *)(p->table)) == TSDB_CHILD_TABLE); + pBuf = tsdbInsertTableAct(pRepo, TSDB_DROP_META, pBuf, p->table); + } + }else { + SSkipListIterator *pIter = tSkipListCreateIter(pTable->pIndex); + if (pIter == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + while (tSkipListIterNext(pIter)) { + STable *tTable = (STable *)SL_GET_NODE_DATA(tSkipListIterGet(pIter)); + ASSERT(TABLE_TYPE(tTable) == TSDB_CHILD_TABLE); + pBuf = tsdbInsertTableAct(pRepo, TSDB_DROP_META, pBuf, tTable); + } + + tSkipListDestroyIter(pIter); + } + } + pBuf = tsdbInsertTableAct(pRepo, TSDB_DROP_META, pBuf, pTable); + + ASSERT(POINTER_DISTANCE(pBuf, buf) == tlen); + + return 0; +} + +static int tsdbRmTableFromMeta(STsdbRepo *pRepo, STable *pTable) { + if (TABLE_TYPE(pTable) == TSDB_SUPER_TABLE) { + tsdbWLockRepoMeta(pRepo); + if(pTable->tagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + SArray* tablist = getJsonTagTableList(pTable); + for (int i = 0; i < taosArrayGetSize(tablist); ++i) { + JsonMapValue* p = taosArrayGet(tablist, i); + tsdbRemoveTableFromMeta(pRepo, p->table, false, false); + } + }else{ + SSkipListIterator *pIter = tSkipListCreateIter(pTable->pIndex); + if (pIter == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + while (tSkipListIterNext(pIter)) { + STable *tTable = (STable *)SL_GET_NODE_DATA(tSkipListIterGet(pIter)); + tsdbRemoveTableFromMeta(pRepo, tTable, false, false); + } + tSkipListDestroyIter(pIter); + } + tsdbRemoveTableFromMeta(pRepo, pTable, false, false); + tsdbUnlockRepoMeta(pRepo); + } else { + if ((TABLE_TYPE(pTable) == TSDB_STREAM_TABLE) && pTable->cqhandle) pRepo->appH.cqDropFunc(pTable->cqhandle); + tsdbRemoveTableFromMeta(pRepo, pTable, true, true); + } + + return 0; +} + +static int tsdbAdjustMetaTables(STsdbRepo *pRepo, int tid) { + STsdbMeta *pMeta = pRepo->tsdbMeta; + ASSERT(tid >= pMeta->maxTables); + + int maxTables = tsdbGetNextMaxTables(tid); + + STable **tables = (STable **)calloc(maxTables, sizeof(STable *)); + if (tables == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + memcpy((void *)tables, (void *)pMeta->tables, sizeof(STable *) * pMeta->maxTables); + pMeta->maxTables = maxTables; + + STable **tTables = pMeta->tables; + pMeta->tables = tables; + tfree(tTables); + tsdbDebug("vgId:%d tsdb meta maxTables is adjusted as %d", REPO_ID(pRepo), maxTables); + + return 0; +} + +static int tsdbCheckTableTagVal(SKVRow *pKVRow, STSchema *pSchema) { + for (size_t i = 0; i < kvRowNCols(pKVRow); i++) { + SColIdx * pColIdx = kvRowColIdxAt(pKVRow, i); + STColumn *pCol = tdGetColOfID(pSchema, pColIdx->colId); + + if ((pCol == NULL) || (!IS_VAR_DATA_TYPE(pCol->type))) continue; + + void *pValue = tdGetKVRowValOfCol(pKVRow, pCol->colId); + if (varDataTLen(pValue) > pCol->bytes) { + terrno = TSDB_CODE_TDB_IVLD_TAG_VAL; + return -1; + } + } + + return 0; +} + +static int tsdbAddSchema(STable *pTable, STSchema *pSchema) { + ASSERT(TABLE_TYPE(pTable) != TSDB_CHILD_TABLE); + + if (pTable->schema == NULL) { + pTable->schema = taosArrayInit(TSDB_MAX_TABLE_SCHEMAS, sizeof(SSchema *)); + if (pTable->schema == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + } + + ASSERT(taosArrayGetSize(pTable->schema) == 0 || + schemaVersion(pSchema) > schemaVersion(*(STSchema **)taosArrayGetLast(pTable->schema))); + + if (taosArrayPush(pTable->schema, &pSchema) == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + return 0; +} + +static void tsdbFreeTableSchema(STable *pTable) { + ASSERT(pTable != NULL); + + if (pTable->schema) { + for (size_t i = 0; i < taosArrayGetSize(pTable->schema); i++) { + STSchema *pSchema = taosArrayGetP(pTable->schema, i); + tdFreeSchema(pSchema); + } + + taosArrayDestroy(&pTable->schema); + } +} + diff --git a/source/dnode/vnode/tsdb2/src/tsdbRead.c b/source/dnode/vnode/tsdb2/src/tsdbRead.c new file mode 100644 index 0000000000..6d9030bf7f --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbRead.c @@ -0,0 +1,4578 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "os.h" +#include "tdataformat.h" +#include "tskiplist.h" +#include "tulog.h" +#include "talgo.h" +#include "tcompare.h" +#include "exception.h" + +#include "taosdef.h" +#include "tlosertree.h" +#include "tsdbint.h" +#include "texpr.h" +#include "qFilter.h" +#include "cJSON.h" + +#define EXTRA_BYTES 2 +#define ASCENDING_TRAVERSE(o) (o == TSDB_ORDER_ASC) +#define QH_GET_NUM_OF_COLS(handle) ((size_t)(taosArrayGetSize((handle)->pColumns))) + +#define GET_FILE_DATA_BLOCK_INFO(_checkInfo, _block) \ + ((SDataBlockInfo){.window = {.skey = (_block)->keyFirst, .ekey = (_block)->keyLast}, \ + .numOfCols = (_block)->numOfCols, \ + .rows = (_block)->numOfRows, \ + .tid = (_checkInfo)->tableId.tid, \ + .uid = (_checkInfo)->tableId.uid}) + +// limit offset start optimization for rows read over this value +#define OFFSET_SKIP_THRESHOLD 5000 + +enum { + TSDB_QUERY_TYPE_ALL = 1, + TSDB_QUERY_TYPE_LAST = 2, +}; + +enum { + TSDB_CACHED_TYPE_NONE = 0, + TSDB_CACHED_TYPE_LASTROW = 1, + TSDB_CACHED_TYPE_LAST = 2, +}; + +typedef struct SQueryFilePos { + int32_t fid; + int32_t slot; + int32_t pos; + int64_t lastKey; + int32_t rows; + bool mixBlock; + bool blockCompleted; + STimeWindow win; +} SQueryFilePos; + +typedef struct SDataBlockLoadInfo { + SDFileSet* fileGroup; + int32_t slot; + int32_t tid; + SArray* pLoadedCols; +} SDataBlockLoadInfo; + +typedef struct SLoadCompBlockInfo { + int32_t tid; /* table tid */ + int32_t fileId; +} SLoadCompBlockInfo; + +enum { + CHECKINFO_CHOSEN_MEM = 0, + CHECKINFO_CHOSEN_IMEM = 1, + CHECKINFO_CHOSEN_BOTH = 2 //for update=2(merge case) +}; + + +typedef struct STableCheckInfo { + STableId tableId; + TSKEY lastKey; + STable* pTableObj; + SBlockInfo* pCompInfo; + int32_t compSize; + int32_t numOfBlocks:29; // number of qualified data blocks not the original blocks + uint8_t chosen:2; // indicate which iterator should move forward + bool initBuf; // whether to initialize the in-memory skip list iterator or not + SSkipListIterator* iter; // mem buffer skip list iterator + SSkipListIterator* iiter; // imem buffer skip list iterator +} STableCheckInfo; + +typedef struct STableBlockInfo { + SBlock *compBlock; + STableCheckInfo *pTableCheckInfo; +} STableBlockInfo; + +typedef struct SBlockOrderSupporter { + int32_t numOfTables; + STableBlockInfo** pDataBlockInfo; + int32_t* blockIndexArray; + int32_t* numOfBlocksPerTable; +} SBlockOrderSupporter; + +typedef struct SIOCostSummary { + int64_t blockLoadTime; + int64_t statisInfoLoadTime; + int64_t checkForNextTime; + int64_t headFileLoad; + int64_t headFileLoadTime; +} SIOCostSummary; + +typedef struct STsdbQueryHandle { + STsdbRepo* pTsdb; + SQueryFilePos cur; // current position + int16_t order; + int64_t offset; // limit offset + int64_t srows; // skip offset rows + int64_t frows; // forbid skip offset rows + STimeWindow window; // the primary query time window that applies to all queries + SDataStatis* statis; // query level statistics, only one table block statistics info exists at any time + int32_t numOfBlocks; + SArray* pColumns; // column list, SColumnInfoData array list + bool locateStart; + int32_t outputCapacity; + int32_t realNumOfRows; + SArray* pTableCheckInfo; // SArray + int32_t activeIndex; + bool checkFiles; // check file stage + int8_t cachelastrow; // check if last row cached + bool loadExternalRow; // load time window external data rows + bool currentLoadExternalRows; // current load external rows + int32_t loadType; // block load type + uint64_t qId; // query info handle, for debug purpose + int32_t type; // query type: retrieve all data blocks, 2. retrieve only last row, 3. retrieve direct prev|next rows + SDFileSet* pFileGroup; + SFSIter fileIter; + SReadH rhelper; + STableBlockInfo* pDataBlockInfo; + SDataCols *pDataCols; // in order to hold current file data block + int32_t allocSize; // allocated data block size + SMemRef *pMemRef; + SArray *defaultLoadColumn;// default load column + SDataBlockLoadInfo dataBlockLoadInfo; /* record current block load information */ + SLoadCompBlockInfo compBlockLoadInfo; /* record current compblock information in SQueryAttr */ + + SArray *prev; // previous row which is before than time window + SArray *next; // next row which is after the query time window + SIOCostSummary cost; +} STsdbQueryHandle; + +typedef struct STableGroupSupporter { + int32_t numOfCols; + SColIndex* pCols; + STSchema* pTagSchema; +} STableGroupSupporter; + +typedef struct SRange { + int32_t from; + int32_t to; +} SRange; + +static STimeWindow updateLastrowForEachGroup(STableGroupInfo *groupList); +static int32_t checkForCachedLastRow(STsdbQueryHandle* pQueryHandle, STableGroupInfo *groupList); +static int32_t checkForCachedLast(STsdbQueryHandle* pQueryHandle); +static int32_t lazyLoadCacheLast(STsdbQueryHandle* pQueryHandle); +static int32_t tsdbGetCachedLastRow(STable* pTable, SMemRow* pRes, TSKEY* lastKey); + +static void changeQueryHandleForInterpQuery(TsdbQueryHandleT pHandle); +static void doMergeTwoLevelData(STsdbQueryHandle* pQueryHandle, STableCheckInfo* pCheckInfo, SBlock* pBlock); +static int32_t binarySearchForKey(char* pValue, int num, TSKEY key, int order); +static int32_t tsdbReadRowsFromCache(STableCheckInfo* pCheckInfo, TSKEY maxKey, int maxRowsToRead, STimeWindow* win, STsdbQueryHandle* pQueryHandle); +static int32_t tsdbCheckInfoCompar(const void* key1, const void* key2); +static int32_t doGetExternalRow(STsdbQueryHandle* pQueryHandle, int16_t type, SMemRef* pMemRef); +static void* doFreeColumnInfoData(SArray* pColumnInfoData); +static void* destroyTableCheckInfo(SArray* pTableCheckInfo); +static bool tsdbGetExternalRow(TsdbQueryHandleT pHandle); +static int32_t tsdbQueryTableList(STable* pTable, SArray* pRes, void* filterInfo); + +static void tsdbInitDataBlockLoadInfo(SDataBlockLoadInfo* pBlockLoadInfo) { + pBlockLoadInfo->slot = -1; + pBlockLoadInfo->tid = -1; + pBlockLoadInfo->fileGroup = NULL; +} + +static void tsdbInitCompBlockLoadInfo(SLoadCompBlockInfo* pCompBlockLoadInfo) { + pCompBlockLoadInfo->tid = -1; + pCompBlockLoadInfo->fileId = -1; +} + +static SArray* getColumnIdList(STsdbQueryHandle* pQueryHandle) { + size_t numOfCols = QH_GET_NUM_OF_COLS(pQueryHandle); + assert(numOfCols <= TSDB_MAX_COLUMNS); + + SArray* pIdList = taosArrayInit(numOfCols, sizeof(int16_t)); + for (int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pCol = taosArrayGet(pQueryHandle->pColumns, i); + taosArrayPush(pIdList, &pCol->info.colId); + } + + return pIdList; +} + +static SArray* getDefaultLoadColumns(STsdbQueryHandle* pQueryHandle, bool loadTS) { + SArray* pLocalIdList = getColumnIdList(pQueryHandle); + + // check if the primary time stamp column needs to load + int16_t colId = *(int16_t*)taosArrayGet(pLocalIdList, 0); + + // the primary timestamp column does not be included in the the specified load column list, add it + if (loadTS && colId != 0) { + int16_t columnId = 0; + taosArrayInsert(pLocalIdList, 0, &columnId); + } + + return pLocalIdList; +} + +static void tsdbMayTakeMemSnapshot(STsdbQueryHandle* pQueryHandle, SArray* psTable) { + assert(pQueryHandle != NULL && pQueryHandle->pMemRef != NULL); + + SMemRef* pMemRef = pQueryHandle->pMemRef; + if (pQueryHandle->pMemRef->ref++ == 0) { + tsdbTakeMemSnapshot(pQueryHandle->pTsdb, &(pMemRef->snapshot), psTable); + } + + taosArrayDestroy(&psTable); +} + +static void tsdbMayUnTakeMemSnapshot(STsdbQueryHandle* pQueryHandle) { + assert(pQueryHandle != NULL); + SMemRef* pMemRef = pQueryHandle->pMemRef; + if (pMemRef == NULL) { // it has been freed + return; + } + + if (--pMemRef->ref == 0) { + tsdbUnTakeMemSnapShot(pQueryHandle->pTsdb, &(pMemRef->snapshot)); + } + + pQueryHandle->pMemRef = NULL; +} + +int64_t tsdbGetNumOfRowsInMemTable(TsdbQueryHandleT* pHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) pHandle; + + int64_t rows = 0; + SMemRef* pMemRef = pQueryHandle->pMemRef; + if (pMemRef == NULL) { return rows; } + + STableData* pMem = NULL; + STableData* pIMem = NULL; + + SMemTable* pMemT = pMemRef->snapshot.mem; + SMemTable* pIMemT = pMemRef->snapshot.imem; + + size_t size = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + for (int32_t i = 0; i < size; ++i) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, i); + + if (pMemT && pCheckInfo->tableId.tid < pMemT->maxTables) { + pMem = pMemT->tData[pCheckInfo->tableId.tid]; + rows += (pMem && pMem->uid == pCheckInfo->tableId.uid) ? pMem->numOfRows : 0; + } + if (pIMemT && pCheckInfo->tableId.tid < pIMemT->maxTables) { + pIMem = pIMemT->tData[pCheckInfo->tableId.tid]; + rows += (pIMem && pIMem->uid == pCheckInfo->tableId.uid) ? pIMem->numOfRows : 0; + } + } + return rows; +} + +static SArray* createCheckInfoFromTableGroup(STsdbQueryHandle* pQueryHandle, STableGroupInfo* pGroupList, STsdbMeta* pMeta, SArray** psTable) { + size_t sizeOfGroup = taosArrayGetSize(pGroupList->pGroupList); + assert(sizeOfGroup >= 1 && pMeta != NULL); + + // allocate buffer in order to load data blocks from file + SArray* pTableCheckInfo = taosArrayInit(pGroupList->numOfTables, sizeof(STableCheckInfo)); + if (pTableCheckInfo == NULL) { + return NULL; + } + + SArray* pTable = taosArrayInit(4, sizeof(STable*)); + if (pTable == NULL) { + taosArrayDestroy(&pTableCheckInfo); + return NULL; + } + + // todo apply the lastkey of table check to avoid to load header file + for (int32_t i = 0; i < sizeOfGroup; ++i) { + SArray* group = *(SArray**) taosArrayGet(pGroupList->pGroupList, i); + + size_t gsize = taosArrayGetSize(group); + assert(gsize > 0); + + for (int32_t j = 0; j < gsize; ++j) { + STableKeyInfo* pKeyInfo = (STableKeyInfo*) taosArrayGet(group, j); + + STableCheckInfo info = { .lastKey = pKeyInfo->lastKey, .pTableObj = pKeyInfo->pTable }; + assert(info.pTableObj != NULL && (info.pTableObj->type == TSDB_NORMAL_TABLE || + info.pTableObj->type == TSDB_CHILD_TABLE || info.pTableObj->type == TSDB_STREAM_TABLE)); + + info.tableId.tid = info.pTableObj->tableId.tid; + info.tableId.uid = info.pTableObj->tableId.uid; + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + if (info.lastKey == INT64_MIN || info.lastKey < pQueryHandle->window.skey) { + info.lastKey = pQueryHandle->window.skey; + } + + assert(info.lastKey >= pQueryHandle->window.skey && info.lastKey <= pQueryHandle->window.ekey); + } else { + assert(info.lastKey >= pQueryHandle->window.ekey && info.lastKey <= pQueryHandle->window.skey); + } + + taosArrayPush(pTableCheckInfo, &info); + tsdbDebug("%p check table uid:%"PRId64", tid:%d from lastKey:%"PRId64" 0x%"PRIx64, pQueryHandle, info.tableId.uid, + info.tableId.tid, info.lastKey, pQueryHandle->qId); + } + } + + taosArraySort(pTableCheckInfo, tsdbCheckInfoCompar); + + size_t gsize = taosArrayGetSize(pTableCheckInfo); + + for (int32_t i = 0; i < gsize; ++i) { + STableCheckInfo* pInfo = (STableCheckInfo*) taosArrayGet(pTableCheckInfo, i); + taosArrayPush(pTable, &pInfo->pTableObj); + } + + *psTable = pTable; + return pTableCheckInfo; +} + +static void resetCheckInfo(STsdbQueryHandle* pQueryHandle) { + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + assert(numOfTables >= 1); + + // todo apply the lastkey of table check to avoid to load header file + for (int32_t i = 0; i < numOfTables; ++i) { + STableCheckInfo* pCheckInfo = (STableCheckInfo*) taosArrayGet(pQueryHandle->pTableCheckInfo, i); + pCheckInfo->lastKey = pQueryHandle->window.skey; + pCheckInfo->iter = tSkipListDestroyIter(pCheckInfo->iter); + pCheckInfo->iiter = tSkipListDestroyIter(pCheckInfo->iiter); + pCheckInfo->initBuf = false; + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + assert(pCheckInfo->lastKey >= pQueryHandle->window.skey); + } else { + assert(pCheckInfo->lastKey <= pQueryHandle->window.skey); + } + } +} + +// only one table, not need to sort again +static SArray* createCheckInfoFromCheckInfo(STableCheckInfo* pCheckInfo, TSKEY skey, SArray** psTable) { + SArray* pNew = taosArrayInit(1, sizeof(STableCheckInfo)); + SArray* pTable = taosArrayInit(1, sizeof(STable*)); + + STableCheckInfo info = { .lastKey = skey, .pTableObj = pCheckInfo->pTableObj}; + + info.tableId = pCheckInfo->tableId; + taosArrayPush(pNew, &info); + taosArrayPush(pTable, &pCheckInfo->pTableObj); + + *psTable = pTable; + return pNew; +} + +static bool emptyQueryTimewindow(STsdbQueryHandle* pQueryHandle) { + assert(pQueryHandle != NULL); + + STimeWindow* w = &pQueryHandle->window; + bool asc = ASCENDING_TRAVERSE(pQueryHandle->order); + + return ((asc && w->skey > w->ekey) || (!asc && w->ekey > w->skey)); +} + +// Update the query time window according to the data time to live(TTL) information, in order to avoid to return +// the expired data to client, even it is queried already. +static int64_t getEarliestValidTimestamp(STsdbRepo* pTsdb) { + STsdbCfg* pCfg = &pTsdb->config; + + int64_t now = taosGetTimestamp(pCfg->precision); + return now - (tsTickPerDay[pCfg->precision] * pCfg->keep) + 1; // needs to add one tick +} + +static void setQueryTimewindow(STsdbQueryHandle* pQueryHandle, STsdbQueryCond* pCond) { + pQueryHandle->window = pCond->twindow; + + bool updateTs = false; + int64_t startTs = getEarliestValidTimestamp(pQueryHandle->pTsdb); + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + if (startTs > pQueryHandle->window.skey) { + pQueryHandle->window.skey = startTs; + pCond->twindow.skey = startTs; + updateTs = true; + } + } else { + if (startTs > pQueryHandle->window.ekey) { + pQueryHandle->window.ekey = startTs; + pCond->twindow.ekey = startTs; + updateTs = true; + } + } + + if (updateTs) { + tsdbDebug("%p update the query time window, old:%" PRId64 " - %" PRId64 ", new:%" PRId64 " - %" PRId64 + ", 0x%" PRIx64, pQueryHandle, pCond->twindow.skey, pCond->twindow.ekey, pQueryHandle->window.skey, + pQueryHandle->window.ekey, pQueryHandle->qId); + } +} + +static STsdbQueryHandle* tsdbQueryTablesImpl(STsdbRepo* tsdb, STsdbQueryCond* pCond, uint64_t qId, SMemRef* pMemRef) { + STsdbQueryHandle* pQueryHandle = calloc(1, sizeof(STsdbQueryHandle)); + if (pQueryHandle == NULL) { + goto _end; + } + + pQueryHandle->order = pCond->order; + pQueryHandle->offset = pCond->offset; + pQueryHandle->srows = 0; + pQueryHandle->frows = 0; + pQueryHandle->pTsdb = tsdb; + pQueryHandle->type = TSDB_QUERY_TYPE_ALL; + pQueryHandle->cur.fid = INT32_MIN; + pQueryHandle->cur.win = TSWINDOW_INITIALIZER; + pQueryHandle->checkFiles = true; + pQueryHandle->activeIndex = 0; // current active table index + pQueryHandle->qId = qId; + pQueryHandle->allocSize = 0; + pQueryHandle->locateStart = false; + pQueryHandle->pMemRef = pMemRef; + pQueryHandle->loadType = pCond->type; + + pQueryHandle->outputCapacity = ((STsdbRepo*)tsdb)->config.maxRowsPerFileBlock; + pQueryHandle->loadExternalRow = pCond->loadExternalRows; + pQueryHandle->currentLoadExternalRows = pCond->loadExternalRows; + + if (tsdbInitReadH(&pQueryHandle->rhelper, (STsdbRepo*)tsdb) != 0) { + goto _end; + } + + assert(pCond != NULL && pMemRef != NULL); + setQueryTimewindow(pQueryHandle, pCond); + + if (pCond->numOfCols > 0) { + // allocate buffer in order to load data blocks from file + pQueryHandle->statis = calloc(pCond->numOfCols, sizeof(SDataStatis)); + if (pQueryHandle->statis == NULL) { + goto _end; + } + + // todo: use list instead of array? + pQueryHandle->pColumns = taosArrayInit(pCond->numOfCols, sizeof(SColumnInfoData)); + if (pQueryHandle->pColumns == NULL) { + goto _end; + } + + for (int32_t i = 0; i < pCond->numOfCols; ++i) { + SColumnInfoData colInfo = {{0}, 0}; + + colInfo.info = pCond->colList[i]; + colInfo.pData = calloc(1, EXTRA_BYTES + pQueryHandle->outputCapacity * pCond->colList[i].bytes); + if (colInfo.pData == NULL) { + goto _end; + } + + taosArrayPush(pQueryHandle->pColumns, &colInfo); + pQueryHandle->statis[i].colId = colInfo.info.colId; + } + + pQueryHandle->defaultLoadColumn = getDefaultLoadColumns(pQueryHandle, true); + } + + STsdbMeta* pMeta = tsdbGetMeta(tsdb); + assert(pMeta != NULL); + + pQueryHandle->pDataCols = tdNewDataCols(pMeta->maxCols, pQueryHandle->pTsdb->config.maxRowsPerFileBlock); + if (pQueryHandle->pDataCols == NULL) { + tsdbError("%p failed to malloc buf for pDataCols, %"PRIu64, pQueryHandle, pQueryHandle->qId); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _end; + } + + tsdbInitDataBlockLoadInfo(&pQueryHandle->dataBlockLoadInfo); + tsdbInitCompBlockLoadInfo(&pQueryHandle->compBlockLoadInfo); + + return (TsdbQueryHandleT) pQueryHandle; + + _end: + tsdbCleanupQueryHandle(pQueryHandle); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; +} + +TsdbQueryHandleT* tsdbQueryTables(STsdbRepo* tsdb, STsdbQueryCond* pCond, STableGroupInfo* groupList, uint64_t qId, SMemRef* pRef) { + STsdbQueryHandle* pQueryHandle = tsdbQueryTablesImpl(tsdb, pCond, qId, pRef); + if (pQueryHandle == NULL) { + return NULL; + } + + if (emptyQueryTimewindow(pQueryHandle)) { + return (TsdbQueryHandleT*) pQueryHandle; + } + + STsdbMeta* pMeta = tsdbGetMeta(tsdb); + assert(pMeta != NULL); + + SArray* psTable = NULL; + + // todo apply the lastkey of table check to avoid to load header file + pQueryHandle->pTableCheckInfo = createCheckInfoFromTableGroup(pQueryHandle, groupList, pMeta, &psTable); + if (pQueryHandle->pTableCheckInfo == NULL) { + tsdbCleanupQueryHandle(pQueryHandle); + taosArrayDestroy(&psTable); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return NULL; + } + + tsdbMayTakeMemSnapshot(pQueryHandle, psTable); + + tsdbDebug("%p total numOfTable:%" PRIzu " in query, 0x%"PRIx64, pQueryHandle, taosArrayGetSize(pQueryHandle->pTableCheckInfo), pQueryHandle->qId); + return (TsdbQueryHandleT) pQueryHandle; +} + +void tsdbResetQueryHandle(TsdbQueryHandleT queryHandle, STsdbQueryCond *pCond) { + STsdbQueryHandle* pQueryHandle = queryHandle; + + if (emptyQueryTimewindow(pQueryHandle)) { + if (pCond->order != pQueryHandle->order) { + pQueryHandle->order = pCond->order; + SWAP(pQueryHandle->window.skey, pQueryHandle->window.ekey, int64_t); + } + + return; + } + + pQueryHandle->order = pCond->order; + pQueryHandle->offset = pCond->offset; + pQueryHandle->srows = 0; + pQueryHandle->frows = 0; + pQueryHandle->window = pCond->twindow; + pQueryHandle->type = TSDB_QUERY_TYPE_ALL; + pQueryHandle->cur.fid = -1; + pQueryHandle->cur.win = TSWINDOW_INITIALIZER; + pQueryHandle->checkFiles = true; + pQueryHandle->activeIndex = 0; // current active table index + pQueryHandle->locateStart = false; + pQueryHandle->loadExternalRow = pCond->loadExternalRows; + + if (ASCENDING_TRAVERSE(pCond->order)) { + assert(pQueryHandle->window.skey <= pQueryHandle->window.ekey); + } else { + assert(pQueryHandle->window.skey >= pQueryHandle->window.ekey); + } + + // allocate buffer in order to load data blocks from file + memset(pQueryHandle->statis, 0, sizeof(SDataStatis)); + + tsdbInitDataBlockLoadInfo(&pQueryHandle->dataBlockLoadInfo); + tsdbInitCompBlockLoadInfo(&pQueryHandle->compBlockLoadInfo); + + resetCheckInfo(pQueryHandle); +} + +void tsdbResetQueryHandleForNewTable(TsdbQueryHandleT queryHandle, STsdbQueryCond *pCond, STableGroupInfo* groupList) { + STsdbQueryHandle* pQueryHandle = queryHandle; + + pQueryHandle->order = pCond->order; + pQueryHandle->window = pCond->twindow; + pQueryHandle->type = TSDB_QUERY_TYPE_ALL; + pQueryHandle->cur.fid = -1; + pQueryHandle->cur.win = TSWINDOW_INITIALIZER; + pQueryHandle->checkFiles = true; + pQueryHandle->activeIndex = 0; // current active table index + pQueryHandle->locateStart = false; + pQueryHandle->loadExternalRow = pCond->loadExternalRows; + + if (ASCENDING_TRAVERSE(pCond->order)) { + assert(pQueryHandle->window.skey <= pQueryHandle->window.ekey); + } else { + assert(pQueryHandle->window.skey >= pQueryHandle->window.ekey); + } + + // allocate buffer in order to load data blocks from file + memset(pQueryHandle->statis, 0, sizeof(SDataStatis)); + + tsdbInitDataBlockLoadInfo(&pQueryHandle->dataBlockLoadInfo); + tsdbInitCompBlockLoadInfo(&pQueryHandle->compBlockLoadInfo); + + SArray* pTable = NULL; + STsdbMeta* pMeta = tsdbGetMeta(pQueryHandle->pTsdb); + + pQueryHandle->pTableCheckInfo = destroyTableCheckInfo(pQueryHandle->pTableCheckInfo); + + pQueryHandle->pTableCheckInfo = createCheckInfoFromTableGroup(pQueryHandle, groupList, pMeta, &pTable); + if (pQueryHandle->pTableCheckInfo == NULL) { + tsdbCleanupQueryHandle(pQueryHandle); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + } + + pQueryHandle->prev = doFreeColumnInfoData(pQueryHandle->prev); + pQueryHandle->next = doFreeColumnInfoData(pQueryHandle->next); +} + +static int32_t lazyLoadCacheLast(STsdbQueryHandle* pQueryHandle) { + STsdbRepo* pRepo = pQueryHandle->pTsdb; + + if (!pQueryHandle->pTableCheckInfo) { + tsdbError("%p table check info is NULL", pQueryHandle); + terrno = TSDB_CODE_QRY_APP_ERROR; + return -1; + } + + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + int32_t code = 0; + for (size_t i = 0; i < numOfTables; ++i) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, i); + STable* pTable = pCheckInfo->pTableObj; + if (pTable->cacheLastConfigVersion == pRepo->cacheLastConfigVersion) { + continue; + } + code = tsdbLoadLastCache(pRepo, pTable); + if (code != 0) { + tsdbError("%p uid:%" PRId64 ", tid:%d, failed to load last cache since %s", pQueryHandle, pTable->tableId.uid, + pTable->tableId.tid, tstrerror(terrno)); + break; + } + } + + return code; +} + +TsdbQueryHandleT tsdbQueryLastRow(STsdbRepo *tsdb, STsdbQueryCond *pCond, STableGroupInfo *groupList, uint64_t qId, SMemRef* pMemRef) { + pCond->twindow = updateLastrowForEachGroup(groupList); + + // no qualified table + if (groupList->numOfTables == 0) { + return NULL; + } + + STsdbQueryHandle *pQueryHandle = (STsdbQueryHandle*) tsdbQueryTables(tsdb, pCond, groupList, qId, pMemRef); + if (pQueryHandle == NULL) { + return NULL; + } + + if (lazyLoadCacheLast(pQueryHandle) != TSDB_CODE_SUCCESS) { + return NULL; + } + + int32_t code = checkForCachedLastRow(pQueryHandle, groupList); + if (code != TSDB_CODE_SUCCESS) { // set the numOfTables to be 0 + terrno = code; + return NULL; + } + + assert(pCond->order == TSDB_ORDER_ASC && pCond->twindow.skey <= pCond->twindow.ekey); + if (pQueryHandle->cachelastrow) { + pQueryHandle->type = TSDB_QUERY_TYPE_LAST; + } + + return pQueryHandle; +} + +TsdbQueryHandleT tsdbQueryCacheLast(STsdbRepo *tsdb, STsdbQueryCond *pCond, STableGroupInfo *groupList, uint64_t qId, SMemRef* pMemRef) { + STsdbQueryHandle *pQueryHandle = (STsdbQueryHandle*) tsdbQueryTables(tsdb, pCond, groupList, qId, pMemRef); + if (pQueryHandle == NULL) { + return NULL; + } + + if (lazyLoadCacheLast(pQueryHandle) != TSDB_CODE_SUCCESS) { + return NULL; + } + + int32_t code = checkForCachedLast(pQueryHandle); + if (code != TSDB_CODE_SUCCESS) { // set the numOfTables to be 0 + terrno = code; + return NULL; + } + + if (pQueryHandle->cachelastrow) { + pQueryHandle->type = TSDB_QUERY_TYPE_LAST; + } + + return pQueryHandle; +} + + +SArray* tsdbGetQueriedTableList(TsdbQueryHandleT *pHandle) { + assert(pHandle != NULL); + + STsdbQueryHandle *pQueryHandle = (STsdbQueryHandle*) pHandle; + + size_t size = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + SArray* res = taosArrayInit(size, POINTER_BYTES); + + for(int32_t i = 0; i < size; ++i) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, i); + taosArrayPush(res, &pCheckInfo->pTableObj); + } + + return res; +} + +TsdbQueryHandleT tsdbQueryRowsInExternalWindow(STsdbRepo *tsdb, STsdbQueryCond* pCond, STableGroupInfo *groupList, uint64_t qId, SMemRef* pRef) { + STsdbQueryHandle *pQueryHandle = (STsdbQueryHandle*) tsdbQueryTables(tsdb, pCond, groupList, qId, pRef); + //pQueryHandle->loadExternalRow = true; + //pQueryHandle->currentLoadExternalRows = true; + + return pQueryHandle; +} + +static bool initTableMemIterator(STsdbQueryHandle* pHandle, STableCheckInfo* pCheckInfo) { + STable* pTable = pCheckInfo->pTableObj; + assert(pTable != NULL); + + if (pCheckInfo->initBuf) { + return true; + } + + pCheckInfo->initBuf = true; + int32_t order = pHandle->order; + + // no data in buffer, abort + if (pHandle->pMemRef->snapshot.mem == NULL && pHandle->pMemRef->snapshot.imem == NULL) { + return false; + } + + assert(pCheckInfo->iter == NULL && pCheckInfo->iiter == NULL); + + STableData* pMem = NULL; + STableData* pIMem = NULL; + + SMemTable* pMemT = pHandle->pMemRef->snapshot.mem; + SMemTable* pIMemT = pHandle->pMemRef->snapshot.imem; + + if (pMemT && pCheckInfo->tableId.tid < pMemT->maxTables) { + pMem = pMemT->tData[pCheckInfo->tableId.tid]; + if (pMem != NULL && pMem->uid == pCheckInfo->tableId.uid) { // check uid + TKEY tLastKey = keyToTkey(pCheckInfo->lastKey); + pCheckInfo->iter = + tSkipListCreateIterFromVal(pMem->pData, (const char*)&tLastKey, TSDB_DATA_TYPE_TIMESTAMP, order); + } + } + + if (pIMemT && pCheckInfo->tableId.tid < pIMemT->maxTables) { + pIMem = pIMemT->tData[pCheckInfo->tableId.tid]; + if (pIMem != NULL && pIMem->uid == pCheckInfo->tableId.uid) { // check uid + TKEY tLastKey = keyToTkey(pCheckInfo->lastKey); + pCheckInfo->iiter = + tSkipListCreateIterFromVal(pIMem->pData, (const char*)&tLastKey, TSDB_DATA_TYPE_TIMESTAMP, order); + } + } + + // both iterators are NULL, no data in buffer right now + if (pCheckInfo->iter == NULL && pCheckInfo->iiter == NULL) { + return false; + } + + bool memEmpty = (pCheckInfo->iter == NULL) || (pCheckInfo->iter != NULL && !tSkipListIterNext(pCheckInfo->iter)); + bool imemEmpty = (pCheckInfo->iiter == NULL) || (pCheckInfo->iiter != NULL && !tSkipListIterNext(pCheckInfo->iiter)); + if (memEmpty && imemEmpty) { // buffer is empty + return false; + } + + if (!memEmpty) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iter); + assert(node != NULL); + + SMemRow row = (SMemRow)SL_GET_NODE_DATA(node); + TSKEY key = memRowKey(row); // first timestamp in buffer + tsdbDebug("%p uid:%" PRId64 ", tid:%d check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64 + "-%" PRId64 ", lastKey:%" PRId64 ", numOfRows:%"PRId64", 0x%"PRIx64, + pHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, key, order, pMem->keyFirst, pMem->keyLast, + pCheckInfo->lastKey, pMem->numOfRows, pHandle->qId); + + if (ASCENDING_TRAVERSE(order)) { + assert(pCheckInfo->lastKey <= key); + } else { + assert(pCheckInfo->lastKey >= key); + } + + } else { + tsdbDebug("%p uid:%"PRId64", tid:%d no data in mem, 0x%"PRIx64, pHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, + pHandle->qId); + } + + if (!imemEmpty) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iiter); + assert(node != NULL); + + SMemRow row = (SMemRow)SL_GET_NODE_DATA(node); + TSKEY key = memRowKey(row); // first timestamp in buffer + tsdbDebug("%p uid:%" PRId64 ", tid:%d check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64 + "-%" PRId64 ", lastKey:%" PRId64 ", numOfRows:%"PRId64", 0x%"PRIx64, + pHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, key, order, pIMem->keyFirst, pIMem->keyLast, + pCheckInfo->lastKey, pIMem->numOfRows, pHandle->qId); + + if (ASCENDING_TRAVERSE(order)) { + assert(pCheckInfo->lastKey <= key); + } else { + assert(pCheckInfo->lastKey >= key); + } + } else { + tsdbDebug("%p uid:%"PRId64", tid:%d no data in imem, 0x%"PRIx64, pHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, + pHandle->qId); + } + + return true; +} + +static void destroyTableMemIterator(STableCheckInfo* pCheckInfo) { + tSkipListDestroyIter(pCheckInfo->iter); + tSkipListDestroyIter(pCheckInfo->iiter); +} + +static TSKEY extractFirstTraverseKey(STableCheckInfo* pCheckInfo, int32_t order, int32_t update) { + SMemRow rmem = NULL, rimem = NULL; + if (pCheckInfo->iter) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iter); + if (node != NULL) { + rmem = (SMemRow)SL_GET_NODE_DATA(node); + } + } + + if (pCheckInfo->iiter) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iiter); + if (node != NULL) { + rimem = (SMemRow)SL_GET_NODE_DATA(node); + } + } + + if (rmem == NULL && rimem == NULL) { + return TSKEY_INITIAL_VAL; + } + + if (rmem != NULL && rimem == NULL) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return memRowKey(rmem); + } + + if (rmem == NULL && rimem != NULL) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return memRowKey(rimem); + } + + TSKEY r1 = memRowKey(rmem); + TSKEY r2 = memRowKey(rimem); + + if (r1 == r2) { + if(update == TD_ROW_DISCARD_UPDATE){ + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + tSkipListIterNext(pCheckInfo->iter); + return r2; + } + else if(update == TD_ROW_OVERWRITE_UPDATE) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + tSkipListIterNext(pCheckInfo->iiter); + return r1; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_BOTH; + return r1; + } + } else { + if (ASCENDING_TRAVERSE(order)) { + if (r1 < r2) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return r1; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return r2; + } + } else { + if (r1 < r2) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return r2; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return r1; + } + } + } +} + +static SMemRow getSMemRowInTableMem(STableCheckInfo* pCheckInfo, int32_t order, int32_t update, SMemRow* extraRow) { + SMemRow rmem = NULL, rimem = NULL; + if (pCheckInfo->iter) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iter); + if (node != NULL) { + rmem = (SMemRow)SL_GET_NODE_DATA(node); + } + } + + if (pCheckInfo->iiter) { + SSkipListNode* node = tSkipListIterGet(pCheckInfo->iiter); + if (node != NULL) { + rimem = (SMemRow)SL_GET_NODE_DATA(node); + } + } + + if (rmem == NULL && rimem == NULL) { + return NULL; + } + + if (rmem != NULL && rimem == NULL) { + pCheckInfo->chosen = 0; + return rmem; + } + + if (rmem == NULL && rimem != NULL) { + pCheckInfo->chosen = 1; + return rimem; + } + + TSKEY r1 = memRowKey(rmem); + TSKEY r2 = memRowKey(rimem); + + if (r1 == r2) { + if (update == TD_ROW_DISCARD_UPDATE) { + tSkipListIterNext(pCheckInfo->iter); + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return rimem; + } else if(update == TD_ROW_OVERWRITE_UPDATE){ + tSkipListIterNext(pCheckInfo->iiter); + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return rmem; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_BOTH; + extraRow = rimem; + return rmem; + } + } else { + if (ASCENDING_TRAVERSE(order)) { + if (r1 < r2) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return rmem; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return rimem; + } + } else { + if (r1 < r2) { + pCheckInfo->chosen = CHECKINFO_CHOSEN_IMEM; + return rimem; + } else { + pCheckInfo->chosen = CHECKINFO_CHOSEN_MEM; + return rmem; + } + } + } +} + +static bool moveToNextRowInMem(STableCheckInfo* pCheckInfo) { + bool hasNext = false; + if (pCheckInfo->chosen == CHECKINFO_CHOSEN_MEM) { + if (pCheckInfo->iter != NULL) { + hasNext = tSkipListIterNext(pCheckInfo->iter); + } + + if (hasNext) { + return hasNext; + } + + if (pCheckInfo->iiter != NULL) { + return tSkipListIterGet(pCheckInfo->iiter) != NULL; + } + } else if (pCheckInfo->chosen == CHECKINFO_CHOSEN_IMEM){ + if (pCheckInfo->iiter != NULL) { + hasNext = tSkipListIterNext(pCheckInfo->iiter); + } + + if (hasNext) { + return hasNext; + } + + if (pCheckInfo->iter != NULL) { + return tSkipListIterGet(pCheckInfo->iter) != NULL; + } + } else { + if (pCheckInfo->iter != NULL) { + hasNext = tSkipListIterNext(pCheckInfo->iter); + } + if (pCheckInfo->iiter != NULL) { + hasNext = tSkipListIterNext(pCheckInfo->iiter) || hasNext; + } + } + + return hasNext; +} + +static bool hasMoreDataInCache(STsdbQueryHandle* pHandle) { + STsdbCfg *pCfg = &pHandle->pTsdb->config; + size_t size = taosArrayGetSize(pHandle->pTableCheckInfo); + assert(pHandle->activeIndex < size && pHandle->activeIndex >= 0 && size >= 1); + pHandle->cur.fid = INT32_MIN; + + STableCheckInfo* pCheckInfo = taosArrayGet(pHandle->pTableCheckInfo, pHandle->activeIndex); + + STable* pTable = pCheckInfo->pTableObj; + assert(pTable != NULL); + + if (!pCheckInfo->initBuf) { + initTableMemIterator(pHandle, pCheckInfo); + } + + SMemRow row = getSMemRowInTableMem(pCheckInfo, pHandle->order, pCfg->update, NULL); + if (row == NULL) { + return false; + } + + pCheckInfo->lastKey = memRowKey(row); // first timestamp in buffer + tsdbDebug("%p uid:%" PRId64", tid:%d check data in buffer from skey:%" PRId64 ", order:%d, 0x%"PRIx64, pHandle, + pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, pCheckInfo->lastKey, pHandle->order, pHandle->qId); + + // all data in mem are checked already. + if ((pCheckInfo->lastKey > pHandle->window.ekey && ASCENDING_TRAVERSE(pHandle->order)) || + (pCheckInfo->lastKey < pHandle->window.ekey && !ASCENDING_TRAVERSE(pHandle->order))) { + return false; + } + + int32_t step = ASCENDING_TRAVERSE(pHandle->order)? 1:-1; + STimeWindow* win = &pHandle->cur.win; + pHandle->cur.rows = tsdbReadRowsFromCache(pCheckInfo, pHandle->window.ekey, pHandle->outputCapacity, win, pHandle); + + // update the last key value + pCheckInfo->lastKey = win->ekey + step; + pHandle->cur.lastKey = win->ekey + step; + pHandle->cur.mixBlock = true; + + if (!ASCENDING_TRAVERSE(pHandle->order)) { + SWAP(win->skey, win->ekey, TSKEY); + } + + return true; +} + +static int32_t getFileIdFromKey(TSKEY key, int32_t daysPerFile, int32_t precision) { + assert(precision >= TSDB_TIME_PRECISION_MICRO || precision <= TSDB_TIME_PRECISION_NANO); + if (key == TSKEY_INITIAL_VAL) { + return INT32_MIN; + } + + if (key < 0) { + key -= (daysPerFile * tsTickPerDay[precision]); + } + + int64_t fid = (int64_t)(key / (daysPerFile * tsTickPerDay[precision])); // set the starting fileId + if (fid < 0L && llabs(fid) > INT32_MAX) { // data value overflow for INT32 + fid = INT32_MIN; + } + + if (fid > 0L && fid > INT32_MAX) { + fid = INT32_MAX; + } + + return (int32_t)fid; +} + +static int32_t binarySearchForBlock(SBlock* pBlock, int32_t numOfBlocks, TSKEY skey, int32_t order) { + int32_t firstSlot = 0; + int32_t lastSlot = numOfBlocks - 1; + + int32_t midSlot = firstSlot; + + while (1) { + numOfBlocks = lastSlot - firstSlot + 1; + midSlot = (firstSlot + (numOfBlocks >> 1)); + + if (numOfBlocks == 1) break; + + if (skey > pBlock[midSlot].keyLast) { + if (numOfBlocks == 2) break; + if ((order == TSDB_ORDER_DESC) && (skey < pBlock[midSlot + 1].keyFirst)) break; + firstSlot = midSlot + 1; + } else if (skey < pBlock[midSlot].keyFirst) { + if ((order == TSDB_ORDER_ASC) && (skey > pBlock[midSlot - 1].keyLast)) break; + lastSlot = midSlot - 1; + } else { + break; // got the slot + } + } + + return midSlot; +} + +// array :1 2 3 5 7 -2 (8 9) skip 4 and 6 +int32_t memMoveByArray(SBlock *blocks, SArray *pArray) { + // pArray is NULL or size is zero , no need block to move + if(pArray == NULL) + return 0; + size_t count = taosArrayGetSize(pArray); + if(count == 0) + return 0; + + // memmove + int32_t num = 0; + SRange* ranges = (SRange*)TARRAY_GET_START(pArray); + for(size_t i = 0; i < count; i++) { + int32_t step = ranges[i].to - ranges[i].from + 1; + memmove(blocks + num, blocks + ranges[i].from, sizeof(SBlock) * step); + num += step; + } + + return num; +} + +// if block data in memory return false else true +bool blockNoItemInMem(STsdbQueryHandle* q, SBlock* pBlock) { + if(q->pMemRef == NULL) { + return false; + } + + // mem + if(q->pMemRef->snapshot.mem) { + SMemTable* mem = q->pMemRef->snapshot.mem; + if(timeIntersect(mem->keyFirst, mem->keyLast, pBlock->keyFirst, pBlock->keyLast)) + return false; + } + // imem + if(q->pMemRef->snapshot.imem) { + SMemTable* imem = q->pMemRef->snapshot.imem; + if(timeIntersect(imem->keyFirst, imem->keyLast, pBlock->keyFirst, pBlock->keyLast)) + return false; + } + + return true; +} + +#define MAYBE_IN_MEMORY_ROWS 4000 // approximately the capacity of one block +// skip blocks . return value is skip blocks number, skip rows reduce from *pOffset +static int32_t offsetSkipBlock(STsdbQueryHandle* q, SBlockInfo* pBlockInfo, int64_t skey, int64_t ekey, + int32_t sblock, int32_t eblock, SArray** ppArray, bool order) { + int32_t num = 0; + SBlock* blocks = pBlockInfo->blocks; + SArray* pArray = NULL; + SRange range; + range.from = -1; + + // + // ASC + // + if(order) { + for(int32_t i = sblock; i < eblock; i++) { + bool skip = false; + SBlock* pBlock = &blocks[i]; + if(i == sblock && skey > pBlock->keyFirst) { + q->frows += pBlock->numOfRows; // some rows time < s + } else { + // check can skip + if(q->srows + q->frows + pBlock->numOfRows + MAYBE_IN_MEMORY_ROWS < q->offset) { // approximately calculate + if(blockNoItemInMem(q, pBlock)) { + // can skip + q->srows += pBlock->numOfRows; + skip = true; + } else { + q->frows += pBlock->numOfRows; // maybe have some row in memroy + } + } else { + // the remainder be put to pArray + if(pArray == NULL) + pArray = taosArrayInit(1, sizeof(SRange)); + if(range.from == -1) { + range.from = i; + } else { + if(range.to + 1 != i) { + // add the previous + taosArrayPush(pArray, &range); + range.from = i; + } + } + range.to = eblock - 1; + taosArrayPush(pArray, &range); + range.from = -1; + break; + } + } + + if(skip) { + num ++; + } else { + // can't skip, append block index to pArray + if(pArray == NULL) + pArray = taosArrayInit(10, sizeof(SRange)); + if(range.from == -1) { + range.from = i; + } else { + if(range.to + 1 != i) { + // add the previous + taosArrayPush(pArray, &range); + range.from = i; + } + } + range.to = i; + } + } + // end append + if(range.from != -1) { + if(pArray == NULL) + pArray = taosArrayInit(1, sizeof(SRange)); + taosArrayPush(pArray, &range); + } + + // ASC return + *ppArray = pArray; + return num; + } + + // DES + for(int32_t i = eblock - 1; i >= sblock; i--) { + bool skip = false; + SBlock* pBlock = &blocks[i]; + if(i == eblock - 1 && ekey < pBlock->keyLast) { + q->frows += pBlock->numOfRows; // some rows time > e + } else { + // check can skip + if(q->srows + q->frows + pBlock->numOfRows + MAYBE_IN_MEMORY_ROWS < q->offset) { // approximately calculate + if(blockNoItemInMem(q, pBlock)) { + // can skip + q->srows += pBlock->numOfRows; + skip = true; + } else { + q->frows += pBlock->numOfRows; // maybe have some row in memroy + } + } else { + // the remainder be put to pArray + if(pArray == NULL) + pArray = taosArrayInit(1, sizeof(SRange)); + if(range.from == -1) { + range.from = i; + } else { + if(range.to - 1 != i) { + // add the previous + taosArrayPush(pArray, &range); + range.from = i; + } + } + range.to = 0; + taosArrayPush(pArray, &range); + range.from = -1; + break; + } + } + + if(skip) { + num ++; + } else { + // can't skip, append block index to pArray + if(pArray == NULL) + pArray = taosArrayInit(10, sizeof(SRange)); + if(range.from == -1) { + range.from = i; + } else { + if(range.to + 1 != i) { + // add the previous + taosArrayPush(pArray, &range); + range.from = i; + } + } + range.to = i; + } + } + + // end append + if(range.from != -1) { + if(pArray == NULL) + pArray = taosArrayInit(1, sizeof(SRange)); + taosArrayPush(pArray, &range); + } + if(pArray == NULL) + return num; + + // reverse array + size_t count = taosArrayGetSize(pArray); + SRange* ranges = TARRAY_GET_START(pArray); + SArray* pArray1 = taosArrayInit(count, sizeof(SRange)); + + size_t i = count - 1; + while(i >= 0) { + range.from = ranges[i].to; + range.to = ranges[i].from; + taosArrayPush(pArray1, &range); + if(i == 0) + break; + i --; + } + + *ppArray = pArray1; + taosArrayDestroy(&pArray); + return num; +} + +// shrink blocks by condition of query +static void shrinkBlocksByQuery(STsdbQueryHandle *pQueryHandle, STableCheckInfo *pCheckInfo) { + SBlockInfo *pCompInfo = pCheckInfo->pCompInfo; + SBlockIdx *compIndex = pQueryHandle->rhelper.pBlkIdx; + bool order = ASCENDING_TRAVERSE(pQueryHandle->order); + + if (order) { + assert(pCheckInfo->lastKey <= pQueryHandle->window.ekey && pQueryHandle->window.skey <= pQueryHandle->window.ekey); + } else { + assert(pCheckInfo->lastKey >= pQueryHandle->window.ekey && pQueryHandle->window.skey >= pQueryHandle->window.ekey); + } + + TSKEY s = TSKEY_INITIAL_VAL, e = TSKEY_INITIAL_VAL; + s = MIN(pCheckInfo->lastKey, pQueryHandle->window.ekey); + e = MAX(pCheckInfo->lastKey, pQueryHandle->window.ekey); + + // discard the unqualified data block based on the query time window + int32_t start = binarySearchForBlock(pCompInfo->blocks, compIndex->numOfBlocks, s, TSDB_ORDER_ASC); + if (s > pCompInfo->blocks[start].keyLast) { + return ; + } + + int32_t end = start; + // locate e index of blocks -> end + while (end < (int32_t)compIndex->numOfBlocks && (pCompInfo->blocks[end].keyFirst <= e)) { + end += 1; + } + + // calc offset can skip blocks number + int32_t nSkip = 0; + SArray *pArray = NULL; + if(pQueryHandle->offset > 0) { + nSkip = offsetSkipBlock(pQueryHandle, pCompInfo, s, e, start, end, &pArray, order); + } + + if(nSkip > 0) { // have offset and can skip + pCheckInfo->numOfBlocks = memMoveByArray(pCompInfo->blocks, pArray); + } else { // no offset + pCheckInfo->numOfBlocks = end - start; + if(start > 0) + memmove(pCompInfo->blocks, &pCompInfo->blocks[start], pCheckInfo->numOfBlocks * sizeof(SBlock)); + } + + if(pArray) + taosArrayDestroy(&pArray); +} + +// load one table (tsd_index point to) need load blocks info and put into pCheckInfo->pCompInfo->blocks +static int32_t loadBlockInfo(STsdbQueryHandle * pQueryHandle, int32_t tsd_index, int32_t* numOfBlocks) { + // + // ONE PART. Load all blocks info from one table of tsd_index + // + int32_t code = 0; + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, tsd_index); + pCheckInfo->numOfBlocks = 0; + if (tsdbSetReadTable(&pQueryHandle->rhelper, pCheckInfo->pTableObj) != TSDB_CODE_SUCCESS) { + code = terrno; + return code; + } + + SBlockIdx* compIndex = pQueryHandle->rhelper.pBlkIdx; + // no data block in this file, try next file + if (compIndex == NULL || compIndex->uid != pCheckInfo->tableId.uid) { + return 0; // no data blocks in the file belongs to pCheckInfo->pTable + } + + if (pCheckInfo->compSize < (int32_t)compIndex->len) { + assert(compIndex->len > 0); + char* t = realloc(pCheckInfo->pCompInfo, compIndex->len); + if (t == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + code = TSDB_CODE_TDB_OUT_OF_MEMORY; + return code; + } + + pCheckInfo->pCompInfo = (SBlockInfo*)t; + pCheckInfo->compSize = compIndex->len; + } + + if (tsdbLoadBlockInfo(&(pQueryHandle->rhelper), (void**)(&pCheckInfo->pCompInfo), + (uint32_t*)(&pCheckInfo->compSize)) < 0) { + return terrno; + } + + // + // TWO PART. shrink no need blocks from all blocks by condition of query + // + shrinkBlocksByQuery(pQueryHandle, pCheckInfo); + (*numOfBlocks) += pCheckInfo->numOfBlocks; + + return 0; +} + +static int32_t getFileCompInfo(STsdbQueryHandle* pQueryHandle, int32_t* numOfBlocks) { + // load all the comp offset value for all tables in this file + int32_t code = TSDB_CODE_SUCCESS; + *numOfBlocks = 0; + + pQueryHandle->cost.headFileLoad += 1; + int64_t s = taosGetTimestampUs(); + + size_t numOfTables = 0; + if (pQueryHandle->loadType == BLOCK_LOAD_TABLE_SEQ_ORDER) { + code = loadBlockInfo(pQueryHandle, pQueryHandle->activeIndex, numOfBlocks); + } else if (pQueryHandle->loadType == BLOCK_LOAD_OFFSET_SEQ_ORDER) { + numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + + for (int32_t i = 0; i < numOfTables; ++i) { + code = loadBlockInfo(pQueryHandle, i, numOfBlocks); + if (code != TSDB_CODE_SUCCESS) { + int64_t e = taosGetTimestampUs(); + + pQueryHandle->cost.headFileLoadTime += (e - s); + return code; + } + } + } else { + assert(0); + } + + int64_t e = taosGetTimestampUs(); + pQueryHandle->cost.headFileLoadTime += (e - s); + return code; +} + +static int32_t doLoadFileDataBlock(STsdbQueryHandle* pQueryHandle, SBlock* pBlock, STableCheckInfo* pCheckInfo, int32_t slotIndex) { + int64_t st = taosGetTimestampUs(); + + STSchema *pSchema = tsdbGetTableSchema(pCheckInfo->pTableObj); + int32_t code = tdInitDataCols(pQueryHandle->pDataCols, pSchema); + if (code != TSDB_CODE_SUCCESS) { + tsdbError("%p failed to malloc buf for pDataCols, 0x%"PRIx64, pQueryHandle, pQueryHandle->qId); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _error; + } + + code = tdInitDataCols(pQueryHandle->rhelper.pDCols[0], pSchema); + if (code != TSDB_CODE_SUCCESS) { + tsdbError("%p failed to malloc buf for rhelper.pDataCols[0], 0x%"PRIx64, pQueryHandle, pQueryHandle->qId); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _error; + } + + code = tdInitDataCols(pQueryHandle->rhelper.pDCols[1], pSchema); + if (code != TSDB_CODE_SUCCESS) { + tsdbError("%p failed to malloc buf for rhelper.pDataCols[1], 0x%"PRIx64, pQueryHandle, pQueryHandle->qId); + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + goto _error; + } + + int16_t* colIds = pQueryHandle->defaultLoadColumn->pData; + + int32_t ret = tsdbLoadBlockDataCols(&(pQueryHandle->rhelper), pBlock, pCheckInfo->pCompInfo, colIds, (int)(QH_GET_NUM_OF_COLS(pQueryHandle))); + if (ret != TSDB_CODE_SUCCESS) { + int32_t c = terrno; + assert(c != TSDB_CODE_SUCCESS); + goto _error; + } + + SDataBlockLoadInfo* pBlockLoadInfo = &pQueryHandle->dataBlockLoadInfo; + + pBlockLoadInfo->fileGroup = pQueryHandle->pFileGroup; + pBlockLoadInfo->slot = pQueryHandle->cur.slot; + pBlockLoadInfo->tid = pCheckInfo->pTableObj->tableId.tid; + + SDataCols* pCols = pQueryHandle->rhelper.pDCols[0]; + assert(pCols->numOfRows != 0 && pCols->numOfRows <= pBlock->numOfRows); + + pBlock->numOfRows = pCols->numOfRows; + + // Convert from TKEY to TSKEY for primary timestamp column if current block has timestamp before 1970-01-01T00:00:00Z + if(pBlock->keyFirst < 0 && colIds[0] == PRIMARYKEY_TIMESTAMP_COL_INDEX) { + int64_t* src = pCols->cols[0].pData; + for(int32_t i = 0; i < pBlock->numOfRows; ++i) { + src[i] = tdGetKey(src[i]); + } + } + + int64_t elapsedTime = (taosGetTimestampUs() - st); + pQueryHandle->cost.blockLoadTime += elapsedTime; + + tsdbDebug("%p load file block into buffer, index:%d, brange:%"PRId64"-%"PRId64", rows:%d, elapsed time:%"PRId64 " us, 0x%"PRIx64, + pQueryHandle, slotIndex, pBlock->keyFirst, pBlock->keyLast, pBlock->numOfRows, elapsedTime, pQueryHandle->qId); + return TSDB_CODE_SUCCESS; + +_error: + pBlock->numOfRows = 0; + + tsdbError("%p error occurs in loading file block, index:%d, brange:%"PRId64"-%"PRId64", rows:%d, 0x%"PRIx64, + pQueryHandle, slotIndex, pBlock->keyFirst, pBlock->keyLast, pBlock->numOfRows, pQueryHandle->qId); + return terrno; +} + +static int32_t getEndPosInDataBlock(STsdbQueryHandle* pQueryHandle, SDataBlockInfo* pBlockInfo); +static int32_t doCopyRowsFromFileBlock(STsdbQueryHandle* pQueryHandle, int32_t capacity, int32_t numOfRows, int32_t start, int32_t end); +static void moveDataToFront(STsdbQueryHandle* pQueryHandle, int32_t numOfRows, int32_t numOfCols); +static void doCheckGeneratedBlockRange(STsdbQueryHandle* pQueryHandle); +static void copyAllRemainRowsFromFileBlock(STsdbQueryHandle* pQueryHandle, STableCheckInfo* pCheckInfo, SDataBlockInfo* pBlockInfo, int32_t endPos); + +static int32_t handleDataMergeIfNeeded(STsdbQueryHandle* pQueryHandle, SBlock* pBlock, STableCheckInfo* pCheckInfo){ + SQueryFilePos* cur = &pQueryHandle->cur; + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + SDataBlockInfo binfo = GET_FILE_DATA_BLOCK_INFO(pCheckInfo, pBlock); + TSKEY key; + int32_t code = TSDB_CODE_SUCCESS; + + /*bool hasData = */ initTableMemIterator(pQueryHandle, pCheckInfo); + assert(cur->pos >= 0 && cur->pos <= binfo.rows); + + key = extractFirstTraverseKey(pCheckInfo, pQueryHandle->order, pCfg->update); + + if (key != TSKEY_INITIAL_VAL) { + tsdbDebug("%p key in mem:%"PRId64", 0x%"PRIx64, pQueryHandle, key, pQueryHandle->qId); + } else { + tsdbDebug("%p no data in mem, 0x%"PRIx64, pQueryHandle, pQueryHandle->qId); + } + + if ((ASCENDING_TRAVERSE(pQueryHandle->order) && (key != TSKEY_INITIAL_VAL && key <= binfo.window.ekey)) || + (!ASCENDING_TRAVERSE(pQueryHandle->order) && (key != TSKEY_INITIAL_VAL && key >= binfo.window.skey))) { + + if ((ASCENDING_TRAVERSE(pQueryHandle->order) && (key != TSKEY_INITIAL_VAL && key < binfo.window.skey)) || + (!ASCENDING_TRAVERSE(pQueryHandle->order) && (key != TSKEY_INITIAL_VAL && key > binfo.window.ekey))) { + + // do not load file block into buffer + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order) ? 1 : -1; + + TSKEY maxKey = ASCENDING_TRAVERSE(pQueryHandle->order)? (binfo.window.skey - step):(binfo.window.ekey - step); + cur->rows = tsdbReadRowsFromCache(pCheckInfo, maxKey, pQueryHandle->outputCapacity, &cur->win, pQueryHandle); + pQueryHandle->realNumOfRows = cur->rows; + + // update the last key value + pCheckInfo->lastKey = cur->win.ekey + step; + if (!ASCENDING_TRAVERSE(pQueryHandle->order)) { + SWAP(cur->win.skey, cur->win.ekey, TSKEY); + } + + cur->mixBlock = true; + cur->blockCompleted = false; + return code; + } + + + // return error, add test cases + if ((code = doLoadFileDataBlock(pQueryHandle, pBlock, pCheckInfo, cur->slot)) != TSDB_CODE_SUCCESS) { + return code; + } + + doMergeTwoLevelData(pQueryHandle, pCheckInfo, pBlock); + } else { + /* + * no data in cache, only load data from file + * during the query processing, data in cache will not be checked anymore. + * + * Here the buffer is not enough, so only part of file block can be loaded into memory buffer + */ + assert(pQueryHandle->outputCapacity >= binfo.rows); + int32_t endPos = getEndPosInDataBlock(pQueryHandle, &binfo); + + if ((cur->pos == 0 && endPos == binfo.rows -1 && ASCENDING_TRAVERSE(pQueryHandle->order)) || + (cur->pos == (binfo.rows - 1) && endPos == 0 && (!ASCENDING_TRAVERSE(pQueryHandle->order)))) { + pQueryHandle->realNumOfRows = binfo.rows; + + cur->rows = binfo.rows; + cur->win = binfo.window; + cur->mixBlock = false; + cur->blockCompleted = true; + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + cur->lastKey = binfo.window.ekey + 1; + cur->pos = binfo.rows; + } else { + cur->lastKey = binfo.window.skey - 1; + cur->pos = -1; + } + } else { // partially copy to dest buffer + copyAllRemainRowsFromFileBlock(pQueryHandle, pCheckInfo, &binfo, endPos); + cur->mixBlock = true; + } + + assert(cur->blockCompleted); + if (cur->rows == binfo.rows) { + tsdbDebug("%p whole file block qualified, brange:%"PRId64"-%"PRId64", rows:%d, lastKey:%"PRId64", tid:%d, %"PRIx64, + pQueryHandle, cur->win.skey, cur->win.ekey, cur->rows, cur->lastKey, binfo.tid, pQueryHandle->qId); + } else { + tsdbDebug("%p create data block from remain file block, brange:%"PRId64"-%"PRId64", rows:%d, total:%d, lastKey:%"PRId64", tid:%d, %"PRIx64, + pQueryHandle, cur->win.skey, cur->win.ekey, cur->rows, binfo.rows, cur->lastKey, binfo.tid, pQueryHandle->qId); + } + + } + + return code; +} + +static int32_t loadFileDataBlock(STsdbQueryHandle* pQueryHandle, SBlock* pBlock, STableCheckInfo* pCheckInfo, bool* exists) { + SQueryFilePos* cur = &pQueryHandle->cur; + int32_t code = TSDB_CODE_SUCCESS; + bool asc = ASCENDING_TRAVERSE(pQueryHandle->order); + + if (asc) { + // query ended in/started from current block + if (pQueryHandle->window.ekey < pBlock->keyLast || pCheckInfo->lastKey > pBlock->keyFirst) { + if ((code = doLoadFileDataBlock(pQueryHandle, pBlock, pCheckInfo, cur->slot)) != TSDB_CODE_SUCCESS) { + *exists = false; + return code; + } + + SDataCols* pTSCol = pQueryHandle->rhelper.pDCols[0]; + assert(pTSCol->cols->type == TSDB_DATA_TYPE_TIMESTAMP && pTSCol->numOfRows == pBlock->numOfRows); + + if (pCheckInfo->lastKey > pBlock->keyFirst) { + cur->pos = + binarySearchForKey(pTSCol->cols[0].pData, pBlock->numOfRows, pCheckInfo->lastKey, pQueryHandle->order); + } else { + cur->pos = 0; + } + + assert(pCheckInfo->lastKey <= pBlock->keyLast); + doMergeTwoLevelData(pQueryHandle, pCheckInfo, pBlock); + } else { // the whole block is loaded in to buffer + cur->pos = asc? 0:(pBlock->numOfRows - 1); + code = handleDataMergeIfNeeded(pQueryHandle, pBlock, pCheckInfo); + } + } else { //desc order, query ended in current block + if (pQueryHandle->window.ekey > pBlock->keyFirst || pCheckInfo->lastKey < pBlock->keyLast) { + if ((code = doLoadFileDataBlock(pQueryHandle, pBlock, pCheckInfo, cur->slot)) != TSDB_CODE_SUCCESS) { + *exists = false; + return code; + } + + SDataCols* pTsCol = pQueryHandle->rhelper.pDCols[0]; + if (pCheckInfo->lastKey < pBlock->keyLast) { + cur->pos = binarySearchForKey(pTsCol->cols[0].pData, pBlock->numOfRows, pCheckInfo->lastKey, pQueryHandle->order); + } else { + cur->pos = pBlock->numOfRows - 1; + } + + assert(pCheckInfo->lastKey >= pBlock->keyFirst); + doMergeTwoLevelData(pQueryHandle, pCheckInfo, pBlock); + } else { + cur->pos = asc? 0:(pBlock->numOfRows-1); + code = handleDataMergeIfNeeded(pQueryHandle, pBlock, pCheckInfo); + } + } + + *exists = pQueryHandle->realNumOfRows > 0; + return code; +} + +// search last keyList[ret] < key order asc and keyList[ret] > key order desc +static int doBinarySearchKey(TSKEY* keyList, int num, int pos, TSKEY key, int order) { + // start end posistion + int s, e; + s = pos; + + // check + assert(pos >=0 && pos < num); + assert(num > 0); + + if (order == TSDB_ORDER_ASC) { + // find the first position which is smaller than the key + e = num - 1; + if (key < keyList[pos]) + return -1; + while (1) { + // check can return + if (key >= keyList[e]) + return e; + if (key <= keyList[s]) + return s; + if (e - s <= 1) + return s; + + // change start or end position + int mid = s + (e - s + 1)/2; + if (keyList[mid] > key) + e = mid; + else if(keyList[mid] < key) + s = mid; + else + return mid; + } + } else { // DESC + // find the first position which is bigger than the key + e = 0; + if (key > keyList[pos]) + return -1; + while (1) { + // check can return + if (key <= keyList[e]) + return e; + if (key >= keyList[s]) + return s; + if (s - e <= 1) + return s; + + // change start or end position + int mid = s - (s - e + 1)/2; + if (keyList[mid] < key) + e = mid; + else if(keyList[mid] > key) + s = mid; + else + return mid; + } + } +} + +static int32_t doCopyRowsFromFileBlock(STsdbQueryHandle* pQueryHandle, int32_t capacity, int32_t numOfRows, int32_t start, int32_t end) { + char* pData = NULL; + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1 : -1; + + SDataCols* pCols = pQueryHandle->rhelper.pDCols[0]; + TSKEY* tsArray = pCols->cols[0].pData; + + int32_t num = end - start + 1; + assert(num >= 0); + + if (num == 0) { + return numOfRows; + } + + int32_t requiredNumOfCols = (int32_t)taosArrayGetSize(pQueryHandle->pColumns); + + //data in buffer has greater timestamp, copy data in file block + int32_t i = 0, j = 0; + while(i < requiredNumOfCols && j < pCols->numOfCols) { + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + + SDataCol* src = &pCols->cols[j]; + if (src->colId < pColInfo->info.colId) { + j++; + continue; + } + + int32_t bytes = pColInfo->info.bytes; + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + } else { + pData = (char*)pColInfo->pData + (capacity - numOfRows - num) * pColInfo->info.bytes; + } + + if (!isAllRowsNull(src) && pColInfo->info.colId == src->colId) { + if (pColInfo->info.type != TSDB_DATA_TYPE_BINARY && pColInfo->info.type != TSDB_DATA_TYPE_NCHAR) { + memmove(pData, (char*)src->pData + bytes * start, bytes * num); + } else { // handle the var-string + char* dst = pData; + + // todo refactor, only copy one-by-one + for (int32_t k = start; k < num + start; ++k) { + const char* p = tdGetColDataOfRow(src, k); + memcpy(dst, p, varDataTLen(p)); + dst += bytes; + } + } + + j++; + i++; + } else { // pColInfo->info.colId < src->colId, it is a NULL data + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + char* dst = pData; + + for(int32_t k = start; k < num + start; ++k) { + setVardataNull(dst, pColInfo->info.type); + dst += bytes; + } + } else { + setNullN(pData, pColInfo->info.type, pColInfo->info.bytes, num); + } + i++; + } + } + + while (i < requiredNumOfCols) { // the remain columns are all null data + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + } else { + pData = (char*)pColInfo->pData + (capacity - numOfRows - num) * pColInfo->info.bytes; + } + + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + char* dst = pData; + + for(int32_t k = start; k < num + start; ++k) { + setVardataNull(dst, pColInfo->info.type); + dst += pColInfo->info.bytes; + } + } else { + setNullN(pData, pColInfo->info.type, pColInfo->info.bytes, num); + } + + i++; + } + + pQueryHandle->cur.win.ekey = tsArray[end]; + pQueryHandle->cur.lastKey = tsArray[end] + step; + + return numOfRows + num; +} + +// Note: row1 always has high priority +static void mergeTwoRowFromMem(STsdbQueryHandle* pQueryHandle, int32_t capacity, int32_t numOfRows, + SMemRow row1, SMemRow row2, int32_t numOfCols, STable* pTable, + STSchema* pSchema1, STSchema* pSchema2, bool forceSetNull) { + char* pData = NULL; + STSchema* pSchema; + SMemRow row; + int16_t colId; + int16_t offset; + + bool isRow1DataRow = isDataRow(row1); + bool isRow2DataRow = false; + bool isChosenRowDataRow; + int32_t chosen_itr; + void *value; + + // the schema version info is embeded in SDataRow + int32_t numOfColsOfRow1 = 0; + + if (pSchema1 == NULL) { + pSchema1 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row1), (int8_t)memRowType(row1)); + } + if(isRow1DataRow) { + numOfColsOfRow1 = schemaNCols(pSchema1); + } else { + numOfColsOfRow1 = kvRowNCols(memRowKvBody(row1)); + } + + int32_t numOfColsOfRow2 = 0; + if(row2) { + isRow2DataRow = isDataRow(row2); + if (pSchema2 == NULL) { + pSchema2 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row2), (int8_t)memRowType(row2)); + } + if(isRow2DataRow) { + numOfColsOfRow2 = schemaNCols(pSchema2); + } else { + numOfColsOfRow2 = kvRowNCols(memRowKvBody(row2)); + } + } + + + int32_t i = 0, j = 0, k = 0; + while(i < numOfCols && (j < numOfColsOfRow1 || k < numOfColsOfRow2)) { + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + } else { + pData = (char*)pColInfo->pData + (capacity - numOfRows - 1) * pColInfo->info.bytes; + } + + int32_t colIdOfRow1; + if(j >= numOfColsOfRow1) { + colIdOfRow1 = INT32_MAX; + } else if(isRow1DataRow) { + colIdOfRow1 = pSchema1->columns[j].colId; + } else { + void *rowBody = memRowKvBody(row1); + SColIdx *pColIdx = kvRowColIdxAt(rowBody, j); + colIdOfRow1 = pColIdx->colId; + } + + int32_t colIdOfRow2; + if(k >= numOfColsOfRow2) { + colIdOfRow2 = INT32_MAX; + } else if(isRow2DataRow) { + colIdOfRow2 = pSchema2->columns[k].colId; + } else { + void *rowBody = memRowKvBody(row2); + SColIdx *pColIdx = kvRowColIdxAt(rowBody, k); + colIdOfRow2 = pColIdx->colId; + } + + if(colIdOfRow1 == colIdOfRow2) { + if(colIdOfRow1 < pColInfo->info.colId) { + j++; + k++; + continue; + } + row = row1; + pSchema = pSchema1; + isChosenRowDataRow = isRow1DataRow; + chosen_itr = j; + } else if(colIdOfRow1 < colIdOfRow2) { + if(colIdOfRow1 < pColInfo->info.colId) { + j++; + continue; + } + row = row1; + pSchema = pSchema1; + isChosenRowDataRow = isRow1DataRow; + chosen_itr = j; + } else { + if(colIdOfRow2 < pColInfo->info.colId) { + k++; + continue; + } + row = row2; + pSchema = pSchema2; + chosen_itr = k; + isChosenRowDataRow = isRow2DataRow; + } + if(isChosenRowDataRow) { + colId = pSchema->columns[chosen_itr].colId; + offset = pSchema->columns[chosen_itr].offset; + void *rowBody = memRowDataBody(row); + value = tdGetRowDataOfCol(rowBody, (int8_t)pColInfo->info.type, TD_DATA_ROW_HEAD_SIZE + offset); + } else { + void *rowBody = memRowKvBody(row); + SColIdx *pColIdx = kvRowColIdxAt(rowBody, chosen_itr); + colId = pColIdx->colId; + offset = pColIdx->offset; + value = tdGetKvRowDataOfCol(rowBody, offset); + } + + + if (colId == pColInfo->info.colId) { + if(forceSetNull || (!isNull(value, (int8_t)pColInfo->info.type))) { + switch (pColInfo->info.type) { + case TSDB_DATA_TYPE_BINARY: + case TSDB_DATA_TYPE_NCHAR: + memcpy(pData, value, varDataTLen(value)); + break; + case TSDB_DATA_TYPE_NULL: + case TSDB_DATA_TYPE_BOOL: + case TSDB_DATA_TYPE_TINYINT: + case TSDB_DATA_TYPE_UTINYINT: + *(uint8_t *)pData = *(uint8_t *)value; + break; + case TSDB_DATA_TYPE_SMALLINT: + case TSDB_DATA_TYPE_USMALLINT: + *(uint16_t *)pData = *(uint16_t *)value; + break; + case TSDB_DATA_TYPE_INT: + case TSDB_DATA_TYPE_UINT: + *(uint32_t *)pData = *(uint32_t *)value; + break; + case TSDB_DATA_TYPE_BIGINT: + case TSDB_DATA_TYPE_UBIGINT: + *(uint64_t *)pData = *(uint64_t *)value; + break; + case TSDB_DATA_TYPE_FLOAT: + SET_FLOAT_PTR(pData, value); + break; + case TSDB_DATA_TYPE_DOUBLE: + SET_DOUBLE_PTR(pData, value); + break; + case TSDB_DATA_TYPE_TIMESTAMP: + if (pColInfo->info.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) { + *(TSKEY *)pData = tdGetKey(*(TKEY *)value); + } else { + *(TSKEY *)pData = *(TSKEY *)value; + } + break; + default: + memcpy(pData, value, pColInfo->info.bytes); + } + } + i++; + + if(row == row1) { + j++; + } else { + k++; + } + } else { + if(forceSetNull) { + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + setVardataNull(pData, pColInfo->info.type); + } else { + setNull(pData, pColInfo->info.type, pColInfo->info.bytes); + } + } + i++; + } + } + + if(forceSetNull) { + while (i < numOfCols) { // the remain columns are all null data + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + } else { + pData = (char*)pColInfo->pData + (capacity - numOfRows - 1) * pColInfo->info.bytes; + } + + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + setVardataNull(pData, pColInfo->info.type); + } else { + setNull(pData, pColInfo->info.type, pColInfo->info.bytes); + } + + i++; + } + } +} + +static void moveDataToFront(STsdbQueryHandle* pQueryHandle, int32_t numOfRows, int32_t numOfCols) { + if (numOfRows == 0 || ASCENDING_TRAVERSE(pQueryHandle->order)) { + return; + } + + // if the buffer is not full in case of descending order query, move the data in the front of the buffer + if (numOfRows < pQueryHandle->outputCapacity) { + int32_t emptySize = pQueryHandle->outputCapacity - numOfRows; + for(int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + memmove((char*)pColInfo->pData, (char*)pColInfo->pData + emptySize * pColInfo->info.bytes, numOfRows * pColInfo->info.bytes); + } + } +} + +static void getQualifiedRowsPos(STsdbQueryHandle* pQueryHandle, int32_t startPos, int32_t endPos, int32_t numOfExisted, + int32_t* start, int32_t* end) { + *start = -1; + + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + int32_t remain = endPos - startPos + 1; + if (remain + numOfExisted > pQueryHandle->outputCapacity) { + *end = (pQueryHandle->outputCapacity - numOfExisted) + startPos - 1; + } else { + *end = endPos; + } + + *start = startPos; + } else { + int32_t remain = (startPos - endPos) + 1; + if (remain + numOfExisted > pQueryHandle->outputCapacity) { + *end = startPos + 1 - (pQueryHandle->outputCapacity - numOfExisted); + } else { + *end = endPos; + } + + *start = *end; + *end = startPos; + } +} + +static void updateInfoAfterMerge(STsdbQueryHandle* pQueryHandle, STableCheckInfo* pCheckInfo, int32_t numOfRows, int32_t endPos) { + SQueryFilePos* cur = &pQueryHandle->cur; + + pCheckInfo->lastKey = cur->lastKey; + pQueryHandle->realNumOfRows = numOfRows; + cur->rows = numOfRows; + cur->pos = endPos; +} + +static void doCheckGeneratedBlockRange(STsdbQueryHandle* pQueryHandle) { + SQueryFilePos* cur = &pQueryHandle->cur; + + if (cur->rows > 0) { + if (ASCENDING_TRAVERSE(pQueryHandle->order)) { + assert(cur->win.skey >= pQueryHandle->window.skey && cur->win.ekey <= pQueryHandle->window.ekey); + } else { + assert(cur->win.skey >= pQueryHandle->window.ekey && cur->win.ekey <= pQueryHandle->window.skey); + } + + SColumnInfoData* pColInfoData = taosArrayGet(pQueryHandle->pColumns, 0); + assert(cur->win.skey == ((TSKEY*)pColInfoData->pData)[0] && cur->win.ekey == ((TSKEY*)pColInfoData->pData)[cur->rows-1]); + } else { + cur->win = pQueryHandle->window; + + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1:-1; + cur->lastKey = pQueryHandle->window.ekey + step; + } +} + +static void copyAllRemainRowsFromFileBlock(STsdbQueryHandle* pQueryHandle, STableCheckInfo* pCheckInfo, SDataBlockInfo* pBlockInfo, int32_t endPos) { + SQueryFilePos* cur = &pQueryHandle->cur; + + SDataCols* pCols = pQueryHandle->rhelper.pDCols[0]; + TSKEY* tsArray = pCols->cols[0].pData; + + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1:-1; + int32_t numOfCols = (int32_t)(QH_GET_NUM_OF_COLS(pQueryHandle)); + + int32_t pos = cur->pos; + + int32_t start = cur->pos; + int32_t end = endPos; + + if (!ASCENDING_TRAVERSE(pQueryHandle->order)) { + SWAP(start, end, int32_t); + } + + assert(pQueryHandle->outputCapacity >= (end - start + 1)); + int32_t numOfRows = doCopyRowsFromFileBlock(pQueryHandle, pQueryHandle->outputCapacity, 0, start, end); + + // the time window should always be ascending order: skey <= ekey + cur->win = (STimeWindow) {.skey = tsArray[start], .ekey = tsArray[end]}; + cur->mixBlock = (numOfRows != pBlockInfo->rows); + cur->lastKey = tsArray[endPos] + step; + cur->blockCompleted = true; + + // if the buffer is not full in case of descending order query, move the data in the front of the buffer + moveDataToFront(pQueryHandle, numOfRows, numOfCols); + + // The value of pos may be -1 or pBlockInfo->rows, and it is invalid in both cases. + pos = endPos + step; + updateInfoAfterMerge(pQueryHandle, pCheckInfo, numOfRows, pos); + doCheckGeneratedBlockRange(pQueryHandle); + + tsdbDebug("%p uid:%" PRIu64",tid:%d data block created, mixblock:%d, brange:%"PRIu64"-%"PRIu64" rows:%d, 0x%"PRIx64, + pQueryHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, cur->mixBlock, cur->win.skey, + cur->win.ekey, cur->rows, pQueryHandle->qId); +} + +int32_t getEndPosInDataBlock(STsdbQueryHandle* pQueryHandle, SDataBlockInfo* pBlockInfo) { + // NOTE: reverse the order to find the end position in data block + int32_t endPos = -1; + + SQueryFilePos* cur = &pQueryHandle->cur; + SDataCols* pCols = pQueryHandle->rhelper.pDCols[0]; + + if (ASCENDING_TRAVERSE(pQueryHandle->order) && pQueryHandle->window.ekey >= pBlockInfo->window.ekey) { + endPos = pBlockInfo->rows - 1; + cur->mixBlock = (cur->pos != 0); + } else if (!ASCENDING_TRAVERSE(pQueryHandle->order) && pQueryHandle->window.ekey <= pBlockInfo->window.skey) { + endPos = 0; + cur->mixBlock = (cur->pos != pBlockInfo->rows - 1); + } else { + assert(pCols->numOfRows > 0); + int pos = ASCENDING_TRAVERSE(pQueryHandle->order)? 0 : pBlockInfo->rows - 1; + endPos = doBinarySearchKey(pCols->cols[0].pData, pCols->numOfRows, pos, pQueryHandle->window.ekey, pQueryHandle->order); + assert(endPos != -1); + cur->mixBlock = true; + } + + return endPos; +} + +// only return the qualified data to client in terms of query time window, data rows in the same block but do not +// be included in the query time window will be discarded +static void doMergeTwoLevelData(STsdbQueryHandle* pQueryHandle, STableCheckInfo* pCheckInfo, SBlock* pBlock) { + SQueryFilePos* cur = &pQueryHandle->cur; + SDataBlockInfo blockInfo = GET_FILE_DATA_BLOCK_INFO(pCheckInfo, pBlock); + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + + initTableMemIterator(pQueryHandle, pCheckInfo); + + SDataCols* pCols = pQueryHandle->rhelper.pDCols[0]; + assert(pCols->cols[0].type == TSDB_DATA_TYPE_TIMESTAMP && pCols->cols[0].colId == PRIMARYKEY_TIMESTAMP_COL_INDEX && + cur->pos >= 0 && cur->pos < pBlock->numOfRows); + + // key read from file + TSKEY* keyFile = pCols->cols[0].pData; + assert(pCols->numOfRows == pBlock->numOfRows && keyFile[0] == pBlock->keyFirst && keyFile[pBlock->numOfRows-1] == pBlock->keyLast); + + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1:-1; + int32_t numOfCols = (int32_t)(QH_GET_NUM_OF_COLS(pQueryHandle)); + + STable* pTable = pCheckInfo->pTableObj; + int32_t endPos = getEndPosInDataBlock(pQueryHandle, &blockInfo); + + + tsdbDebug("%p uid:%" PRIu64",tid:%d start merge data block, file block range:%"PRIu64"-%"PRIu64" rows:%d, start:%d," + "end:%d, 0x%"PRIx64, + pQueryHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, blockInfo.window.skey, blockInfo.window.ekey, + blockInfo.rows, cur->pos, endPos, pQueryHandle->qId); + + // compared with the data from in-memory buffer, to generate the correct timestamp array list + int32_t numOfRows = 0; + + int16_t rv1 = -1; + int16_t rv2 = -1; + STSchema* pSchema1 = NULL; + STSchema* pSchema2 = NULL; + + // position in file ->fpos + int32_t pos = cur->pos; + cur->win = TSWINDOW_INITIALIZER; + + // no data in buffer, load data from file directly + if (pCheckInfo->iiter == NULL && pCheckInfo->iter == NULL) { + copyAllRemainRowsFromFileBlock(pQueryHandle, pCheckInfo, &blockInfo, endPos); + return; + } else if (pCheckInfo->iter != NULL || pCheckInfo->iiter != NULL) { + SSkipListNode* node = NULL; + do { + SMemRow row2 = NULL; + SMemRow row1 = getSMemRowInTableMem(pCheckInfo, pQueryHandle->order, pCfg->update, &row2); + if (row1 == NULL) { + break; + } + + TSKEY keyMem = memRowKey(row1); + if ((keyMem > pQueryHandle->window.ekey && ASCENDING_TRAVERSE(pQueryHandle->order)) || + (keyMem < pQueryHandle->window.ekey && !ASCENDING_TRAVERSE(pQueryHandle->order))) { + break; + } + + // break if pos not in this block endPos range. note old code when pos is -1 can crash. + if(ASCENDING_TRAVERSE(pQueryHandle->order)) { //ASC + if(pos > endPos || keyFile[pos] > pQueryHandle->window.ekey) + break; + } else { //DESC + if(pos < endPos || keyFile[pos] < pQueryHandle->window.ekey) + break; + } + + if ((keyMem < keyFile[pos] && ASCENDING_TRAVERSE(pQueryHandle->order)) || + (keyMem > keyFile[pos] && !ASCENDING_TRAVERSE(pQueryHandle->order))) { + if (rv1 != memRowVersion(row1)) { + pSchema1 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row1), (int8_t)memRowType(row1)); + rv1 = memRowVersion(row1); + } + if(row2 && rv2 != memRowVersion(row2)) { + pSchema2 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row2), (int8_t)memRowType(row2)); + rv2 = memRowVersion(row2); + } + + mergeTwoRowFromMem(pQueryHandle, pQueryHandle->outputCapacity, numOfRows, row1, row2, numOfCols, pTable, pSchema1, pSchema2, true); + numOfRows += 1; + // record start key with memory key if not + if (cur->win.skey == TSKEY_INITIAL_VAL) { + cur->win.skey = keyMem; + } + + cur->win.ekey = keyMem; + cur->lastKey = keyMem + step; + cur->mixBlock = true; + + moveToNextRowInMem(pCheckInfo); + // same select mem key if update is true + } else if (keyMem == keyFile[pos]) { + if (pCfg->update) { + if(pCfg->update == TD_ROW_PARTIAL_UPDATE) { + doCopyRowsFromFileBlock(pQueryHandle, pQueryHandle->outputCapacity, numOfRows, pos, pos); + } + if (rv1 != memRowVersion(row1)) { + pSchema1 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row1), (int8_t)memRowType(row1)); + rv1 = memRowVersion(row1); + } + if(row2 && rv2 != memRowVersion(row2)) { + pSchema2 = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row2), (int8_t)memRowType(row2)); + rv2 = memRowVersion(row2); + } + + bool forceSetNull = pCfg->update != TD_ROW_PARTIAL_UPDATE; + mergeTwoRowFromMem(pQueryHandle, pQueryHandle->outputCapacity, numOfRows, row1, row2, numOfCols, pTable, pSchema1, pSchema2, forceSetNull); + numOfRows += 1; + if (cur->win.skey == TSKEY_INITIAL_VAL) { + cur->win.skey = keyMem; + } + + cur->win.ekey = keyMem; + cur->lastKey = keyMem + step; + cur->mixBlock = true; + + //mem move next + moveToNextRowInMem(pCheckInfo); + //file move next, discard file row + pos += step; + } else { + // not update, only mem move to next, discard mem row + moveToNextRowInMem(pCheckInfo); + } + // put file row + } else if ((keyMem > keyFile[pos] && ASCENDING_TRAVERSE(pQueryHandle->order)) || + (keyMem < keyFile[pos] && !ASCENDING_TRAVERSE(pQueryHandle->order))) { + if (cur->win.skey == TSKEY_INITIAL_VAL) { + cur->win.skey = keyFile[pos]; + } + + int32_t end = doBinarySearchKey(pCols->cols[0].pData, pCols->numOfRows, pos, keyMem, pQueryHandle->order); + assert(end != -1); + + if (keyFile[end] == keyMem) { // the value of key in cache equals to the end timestamp value, ignore it + if (pCfg->update == TD_ROW_DISCARD_UPDATE) { + moveToNextRowInMem(pCheckInfo); + } else { + // can update, don't copy then deal on next loop with keyMem == keyFile[pos] + end -= step; + } + } + + int32_t qstart = 0, qend = 0; + getQualifiedRowsPos(pQueryHandle, pos, end, numOfRows, &qstart, &qend); + + if(qend >= qstart) { + // copy qend - qstart + 1 rows from file + numOfRows = doCopyRowsFromFileBlock(pQueryHandle, pQueryHandle->outputCapacity, numOfRows, qstart, qend); + int32_t num = qend - qstart + 1; + pos += num * step; + } else { + // nothing copy from file + pos += step; + } + + cur->win.ekey = ASCENDING_TRAVERSE(pQueryHandle->order)? keyFile[qend] : keyFile[qstart]; + cur->lastKey = cur->win.ekey + step; + } + } while (numOfRows < pQueryHandle->outputCapacity); + + if (numOfRows < pQueryHandle->outputCapacity) { + /** + * if cache is empty, load remain file block data. In contrast, if there are remain data in cache, do NOT + * copy them all to result buffer, since it may be overlapped with file data block. + */ + if (node == NULL || + ((memRowKey((SMemRow)SL_GET_NODE_DATA(node)) > pQueryHandle->window.ekey) && + ASCENDING_TRAVERSE(pQueryHandle->order)) || + ((memRowKey((SMemRow)SL_GET_NODE_DATA(node)) < pQueryHandle->window.ekey) && + !ASCENDING_TRAVERSE(pQueryHandle->order))) { + // no data in cache or data in cache is greater than the ekey of time window, load data from file block + if (cur->win.skey == TSKEY_INITIAL_VAL) { + cur->win.skey = keyFile[pos]; + } + + int32_t start = -1, end = -1; + getQualifiedRowsPos(pQueryHandle, pos, endPos, numOfRows, &start, &end); + + numOfRows = doCopyRowsFromFileBlock(pQueryHandle, pQueryHandle->outputCapacity, numOfRows, start, end); + pos += (end - start + 1) * step; + + cur->win.ekey = ASCENDING_TRAVERSE(pQueryHandle->order)? keyFile[end] : keyFile[start]; + cur->lastKey = cur->win.ekey + step; + cur->mixBlock = true; + } + } + } + + cur->blockCompleted = + (((pos > endPos || cur->lastKey > pQueryHandle->window.ekey) && ASCENDING_TRAVERSE(pQueryHandle->order)) || + ((pos < endPos || cur->lastKey < pQueryHandle->window.ekey) && !ASCENDING_TRAVERSE(pQueryHandle->order))); + + if (!ASCENDING_TRAVERSE(pQueryHandle->order)) { + SWAP(cur->win.skey, cur->win.ekey, TSKEY); + } + + moveDataToFront(pQueryHandle, numOfRows, numOfCols); + updateInfoAfterMerge(pQueryHandle, pCheckInfo, numOfRows, pos); + doCheckGeneratedBlockRange(pQueryHandle); + + tsdbDebug("%p uid:%" PRIu64",tid:%d data block created, mixblock:%d, brange:%"PRIu64"-%"PRIu64" rows:%d, 0x%"PRIx64, + pQueryHandle, pCheckInfo->tableId.uid, pCheckInfo->tableId.tid, cur->mixBlock, cur->win.skey, + cur->win.ekey, cur->rows, pQueryHandle->qId); +} + +int32_t binarySearchForKey(char* pValue, int num, TSKEY key, int order) { + int firstPos, lastPos, midPos = -1; + int numOfRows; + TSKEY* keyList; + + if (num <= 0) return -1; + + keyList = (TSKEY*)pValue; + firstPos = 0; + lastPos = num - 1; + + if (order == TSDB_ORDER_DESC) { + // find the first position which is smaller than the key + while (1) { + if (key >= keyList[lastPos]) return lastPos; + if (key == keyList[firstPos]) return firstPos; + if (key < keyList[firstPos]) return firstPos - 1; + + numOfRows = lastPos - firstPos + 1; + midPos = (numOfRows >> 1) + firstPos; + + if (key < keyList[midPos]) { + lastPos = midPos - 1; + } else if (key > keyList[midPos]) { + firstPos = midPos + 1; + } else { + break; + } + } + + } else { + // find the first position which is bigger than the key + while (1) { + if (key <= keyList[firstPos]) return firstPos; + if (key == keyList[lastPos]) return lastPos; + + if (key > keyList[lastPos]) { + lastPos = lastPos + 1; + if (lastPos >= num) + return -1; + else + return lastPos; + } + + numOfRows = lastPos - firstPos + 1; + midPos = (numOfRows >> 1) + firstPos; + + if (key < keyList[midPos]) { + lastPos = midPos - 1; + } else if (key > keyList[midPos]) { + firstPos = midPos + 1; + } else { + break; + } + } + } + + return midPos; +} + +static void cleanBlockOrderSupporter(SBlockOrderSupporter* pSupporter, int32_t numOfTables) { + tfree(pSupporter->numOfBlocksPerTable); + tfree(pSupporter->blockIndexArray); + + for (int32_t i = 0; i < numOfTables; ++i) { + STableBlockInfo* pBlockInfo = pSupporter->pDataBlockInfo[i]; + tfree(pBlockInfo); + } + + tfree(pSupporter->pDataBlockInfo); +} + +static int32_t dataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) { + int32_t leftTableIndex = *(int32_t*)pLeft; + int32_t rightTableIndex = *(int32_t*)pRight; + + SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param; + + int32_t leftTableBlockIndex = pSupporter->blockIndexArray[leftTableIndex]; + int32_t rightTableBlockIndex = pSupporter->blockIndexArray[rightTableIndex]; + + if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftTableIndex]) { + /* left block is empty */ + return 1; + } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightTableIndex]) { + /* right block is empty */ + return -1; + } + + STableBlockInfo* pLeftBlockInfoEx = &pSupporter->pDataBlockInfo[leftTableIndex][leftTableBlockIndex]; + STableBlockInfo* pRightBlockInfoEx = &pSupporter->pDataBlockInfo[rightTableIndex][rightTableBlockIndex]; + + // assert(pLeftBlockInfoEx->compBlock->offset != pRightBlockInfoEx->compBlock->offset); +#if 0 // TODO: temporarily comment off requested by Dr. Liao + if (pLeftBlockInfoEx->compBlock->offset == pRightBlockInfoEx->compBlock->offset && + pLeftBlockInfoEx->compBlock->last == pRightBlockInfoEx->compBlock->last) { + tsdbError("error in header file, two block with same offset:%" PRId64, (int64_t)pLeftBlockInfoEx->compBlock->offset); + } +#endif + + return pLeftBlockInfoEx->compBlock->offset > pRightBlockInfoEx->compBlock->offset ? 1 : -1; +} + +static int32_t createDataBlocksInfo(STsdbQueryHandle* pQueryHandle, int32_t numOfBlocks, int32_t* numOfAllocBlocks) { + size_t size = sizeof(STableBlockInfo) * numOfBlocks; + + if (pQueryHandle->allocSize < size) { + pQueryHandle->allocSize = (int32_t)size; + char* tmp = realloc(pQueryHandle->pDataBlockInfo, pQueryHandle->allocSize); + if (tmp == NULL) { + return TSDB_CODE_TDB_OUT_OF_MEMORY; + } + + pQueryHandle->pDataBlockInfo = (STableBlockInfo*) tmp; + } + + memset(pQueryHandle->pDataBlockInfo, 0, size); + *numOfAllocBlocks = numOfBlocks; + + // access data blocks according to the offset of each block in asc/desc order. + int32_t numOfTables = (int32_t)taosArrayGetSize(pQueryHandle->pTableCheckInfo); + + SBlockOrderSupporter sup = {0}; + sup.numOfTables = numOfTables; + sup.numOfBlocksPerTable = calloc(1, sizeof(int32_t) * numOfTables); + sup.blockIndexArray = calloc(1, sizeof(int32_t) * numOfTables); + sup.pDataBlockInfo = calloc(1, POINTER_BYTES * numOfTables); + + if (sup.numOfBlocksPerTable == NULL || sup.blockIndexArray == NULL || sup.pDataBlockInfo == NULL) { + cleanBlockOrderSupporter(&sup, 0); + return TSDB_CODE_TDB_OUT_OF_MEMORY; + } + + int32_t cnt = 0; + int32_t numOfQualTables = 0; + + for (int32_t j = 0; j < numOfTables; ++j) { + STableCheckInfo* pTableCheck = (STableCheckInfo*)taosArrayGet(pQueryHandle->pTableCheckInfo, j); + if (pTableCheck->numOfBlocks <= 0) { + continue; + } + + SBlock* pBlock = pTableCheck->pCompInfo->blocks; + sup.numOfBlocksPerTable[numOfQualTables] = pTableCheck->numOfBlocks; + + char* buf = malloc(sizeof(STableBlockInfo) * pTableCheck->numOfBlocks); + if (buf == NULL) { + cleanBlockOrderSupporter(&sup, numOfQualTables); + return TSDB_CODE_TDB_OUT_OF_MEMORY; + } + + sup.pDataBlockInfo[numOfQualTables] = (STableBlockInfo*)buf; + + for (int32_t k = 0; k < pTableCheck->numOfBlocks; ++k) { + STableBlockInfo* pBlockInfo = &sup.pDataBlockInfo[numOfQualTables][k]; + + pBlockInfo->compBlock = &pBlock[k]; + pBlockInfo->pTableCheckInfo = pTableCheck; + cnt++; + } + + numOfQualTables++; + } + + assert(numOfBlocks == cnt); + + // since there is only one table qualified, blocks are not sorted + if (numOfQualTables == 1) { + memcpy(pQueryHandle->pDataBlockInfo, sup.pDataBlockInfo[0], sizeof(STableBlockInfo) * numOfBlocks); + cleanBlockOrderSupporter(&sup, numOfQualTables); + + tsdbDebug("%p create data blocks info struct completed for 1 table, %d blocks not sorted 0x%"PRIx64, pQueryHandle, cnt, + pQueryHandle->qId); + return TSDB_CODE_SUCCESS; + } + + tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables 0x%"PRIx64, pQueryHandle, cnt, + numOfQualTables, pQueryHandle->qId); + + assert(cnt <= numOfBlocks && numOfQualTables <= numOfTables); // the pTableQueryInfo[j]->numOfBlocks may be 0 + sup.numOfTables = numOfQualTables; + + SLoserTreeInfo* pTree = NULL; + uint8_t ret = tLoserTreeCreate(&pTree, sup.numOfTables, &sup, dataBlockOrderCompar); + if (ret != TSDB_CODE_SUCCESS) { + cleanBlockOrderSupporter(&sup, numOfTables); + return TSDB_CODE_TDB_OUT_OF_MEMORY; + } + + int32_t numOfTotal = 0; + + while (numOfTotal < cnt) { + int32_t pos = pTree->pNode[0].index; + int32_t index = sup.blockIndexArray[pos]++; + + STableBlockInfo* pBlocksInfo = sup.pDataBlockInfo[pos]; + pQueryHandle->pDataBlockInfo[numOfTotal++] = pBlocksInfo[index]; + + // set data block index overflow, in order to disable the offset comparator + if (sup.blockIndexArray[pos] >= sup.numOfBlocksPerTable[pos]) { + sup.blockIndexArray[pos] = sup.numOfBlocksPerTable[pos] + 1; + } + + tLoserTreeAdjust(pTree, pos + sup.numOfTables); + } + + /* + * available when no import exists + * for(int32_t i = 0; i < cnt - 1; ++i) { + * assert((*pDataBlockInfo)[i].compBlock->offset < (*pDataBlockInfo)[i+1].compBlock->offset); + * } + */ + + tsdbDebug("%p %d data blocks sort completed, 0x%"PRIx64, pQueryHandle, cnt, pQueryHandle->qId); + cleanBlockOrderSupporter(&sup, numOfTables); + free(pTree); + + return TSDB_CODE_SUCCESS; +} + +static int32_t getFirstFileDataBlock(STsdbQueryHandle* pQueryHandle, bool* exists); + +static int32_t getDataBlockRv(STsdbQueryHandle* pQueryHandle, STableBlockInfo* pNext, bool *exists) { + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1 : -1; + SQueryFilePos* cur = &pQueryHandle->cur; + + while(1) { + int32_t code = loadFileDataBlock(pQueryHandle, pNext->compBlock, pNext->pTableCheckInfo, exists); + if (code != TSDB_CODE_SUCCESS || *exists) { + return code; + } + + if ((cur->slot == pQueryHandle->numOfBlocks - 1 && ASCENDING_TRAVERSE(pQueryHandle->order)) || + (cur->slot == 0 && !ASCENDING_TRAVERSE(pQueryHandle->order))) { + // all data blocks in current file has been checked already, try next file if exists + return getFirstFileDataBlock(pQueryHandle, exists); + } else { // next block of the same file + cur->slot += step; + cur->mixBlock = false; + cur->blockCompleted = false; + pNext = &pQueryHandle->pDataBlockInfo[cur->slot]; + } + } +} + +static int32_t getFirstFileDataBlock(STsdbQueryHandle* pQueryHandle, bool* exists) { + pQueryHandle->numOfBlocks = 0; + SQueryFilePos* cur = &pQueryHandle->cur; + + int32_t code = TSDB_CODE_SUCCESS; + + int32_t numOfBlocks = 0; + int32_t numOfTables = (int32_t)taosArrayGetSize(pQueryHandle->pTableCheckInfo); + + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + STimeWindow win = TSWINDOW_INITIALIZER; + + while (true) { + tsdbRLockFS(REPO_FS(pQueryHandle->pTsdb)); + + if ((pQueryHandle->pFileGroup = tsdbFSIterNext(&pQueryHandle->fileIter)) == NULL) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + break; + } + + tsdbGetFidKeyRange(pCfg->daysPerFile, pCfg->precision, pQueryHandle->pFileGroup->fid, &win.skey, &win.ekey); + + // current file are not overlapped with query time window, ignore remain files + if ((ASCENDING_TRAVERSE(pQueryHandle->order) && win.skey > pQueryHandle->window.ekey) || + (!ASCENDING_TRAVERSE(pQueryHandle->order) && win.ekey < pQueryHandle->window.ekey)) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, 0x%"PRIx64, pQueryHandle, + pQueryHandle->window.skey, pQueryHandle->window.ekey, pQueryHandle->qId); + pQueryHandle->pFileGroup = NULL; + assert(pQueryHandle->numOfBlocks == 0); + break; + } + + if (tsdbSetAndOpenReadFSet(&pQueryHandle->rhelper, pQueryHandle->pFileGroup) < 0) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + code = terrno; + break; + } + + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + + if (tsdbLoadBlockIdx(&pQueryHandle->rhelper) < 0) { + code = terrno; + break; + } + + if ((code = getFileCompInfo(pQueryHandle, &numOfBlocks)) != TSDB_CODE_SUCCESS) { + break; + } + + tsdbDebug("%p %d blocks found in file for %d table(s), fid:%d, 0x%"PRIx64, pQueryHandle, numOfBlocks, numOfTables, + pQueryHandle->pFileGroup->fid, pQueryHandle->qId); + + assert(numOfBlocks >= 0); + if (numOfBlocks == 0) { + continue; + } + + // todo return error code to query engine + if ((code = createDataBlocksInfo(pQueryHandle, numOfBlocks, &pQueryHandle->numOfBlocks)) != TSDB_CODE_SUCCESS) { + break; + } + + assert(numOfBlocks >= pQueryHandle->numOfBlocks); + if (pQueryHandle->numOfBlocks > 0) { + break; + } + } + + // no data in file anymore + if (pQueryHandle->numOfBlocks <= 0 || code != TSDB_CODE_SUCCESS) { + if (code == TSDB_CODE_SUCCESS) { + assert(pQueryHandle->pFileGroup == NULL); + } + + cur->fid = INT32_MIN; // denote that there are no data in file anymore + *exists = false; + return code; + } + + assert(pQueryHandle->pFileGroup != NULL && pQueryHandle->numOfBlocks > 0); + cur->slot = ASCENDING_TRAVERSE(pQueryHandle->order)? 0:pQueryHandle->numOfBlocks-1; + cur->fid = pQueryHandle->pFileGroup->fid; + + STableBlockInfo* pBlockInfo = &pQueryHandle->pDataBlockInfo[cur->slot]; + return getDataBlockRv(pQueryHandle, pBlockInfo, exists); +} + +static bool isEndFileDataBlock(SQueryFilePos* cur, int32_t numOfBlocks, bool ascTrav) { + assert(cur != NULL && numOfBlocks > 0); + return (cur->slot == numOfBlocks - 1 && ascTrav) || (cur->slot == 0 && !ascTrav); +} + +static void moveToNextDataBlockInCurrentFile(STsdbQueryHandle* pQueryHandle) { + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1 : -1; + + SQueryFilePos* cur = &pQueryHandle->cur; + assert(cur->slot < pQueryHandle->numOfBlocks && cur->slot >= 0); + + cur->slot += step; + cur->mixBlock = false; + cur->blockCompleted = false; +} + +int32_t tsdbGetFileBlocksDistInfo(TsdbQueryHandleT* queryHandle, STableBlockDist* pTableBlockInfo) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) queryHandle; + + pTableBlockInfo->totalSize = 0; + pTableBlockInfo->totalRows = 0; + STsdbFS* pFileHandle = REPO_FS(pQueryHandle->pTsdb); + + // find the start data block in file + pQueryHandle->locateStart = true; + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + int32_t fid = getFileIdFromKey(pQueryHandle->window.skey, pCfg->daysPerFile, pCfg->precision); + + tsdbRLockFS(pFileHandle); + tsdbFSIterInit(&pQueryHandle->fileIter, pFileHandle, pQueryHandle->order); + tsdbFSIterSeek(&pQueryHandle->fileIter, fid); + tsdbUnLockFS(pFileHandle); + + pTableBlockInfo->numOfFiles += 1; + + int32_t code = TSDB_CODE_SUCCESS; + int32_t numOfBlocks = 0; + int32_t numOfTables = (int32_t)taosArrayGetSize(pQueryHandle->pTableCheckInfo); + int defaultRows = TSDB_DEFAULT_BLOCK_ROWS(pCfg->maxRowsPerFileBlock); + STimeWindow win = TSWINDOW_INITIALIZER; + + while (true) { + numOfBlocks = 0; + tsdbRLockFS(REPO_FS(pQueryHandle->pTsdb)); + + if ((pQueryHandle->pFileGroup = tsdbFSIterNext(&pQueryHandle->fileIter)) == NULL) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + break; + } + + tsdbGetFidKeyRange(pCfg->daysPerFile, pCfg->precision, pQueryHandle->pFileGroup->fid, &win.skey, &win.ekey); + + // current file are not overlapped with query time window, ignore remain files + if ((ASCENDING_TRAVERSE(pQueryHandle->order) && win.skey > pQueryHandle->window.ekey) || + (!ASCENDING_TRAVERSE(pQueryHandle->order) && win.ekey < pQueryHandle->window.ekey)) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, 0x%"PRIx64, pQueryHandle, + pQueryHandle->window.skey, pQueryHandle->window.ekey, pQueryHandle->qId); + pQueryHandle->pFileGroup = NULL; + break; + } + + pTableBlockInfo->numOfFiles += 1; + if (tsdbSetAndOpenReadFSet(&pQueryHandle->rhelper, pQueryHandle->pFileGroup) < 0) { + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + code = terrno; + break; + } + + tsdbUnLockFS(REPO_FS(pQueryHandle->pTsdb)); + + if (tsdbLoadBlockIdx(&pQueryHandle->rhelper) < 0) { + code = terrno; + break; + } + + if ((code = getFileCompInfo(pQueryHandle, &numOfBlocks)) != TSDB_CODE_SUCCESS) { + break; + } + + tsdbDebug("%p %d blocks found in file for %d table(s), fid:%d, 0x%"PRIx64, pQueryHandle, numOfBlocks, numOfTables, + pQueryHandle->pFileGroup->fid, pQueryHandle->qId); + + if (numOfBlocks == 0) { + continue; + } + + for (int32_t i = 0; i < numOfTables; ++i) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, i); + + SBlock* pBlock = pCheckInfo->pCompInfo->blocks; + for (int32_t j = 0; j < pCheckInfo->numOfBlocks; ++j) { + pTableBlockInfo->totalSize += pBlock[j].len; + + int32_t numOfRows = pBlock[j].numOfRows; + pTableBlockInfo->totalRows += numOfRows; + if (numOfRows > pTableBlockInfo->maxRows) pTableBlockInfo->maxRows = numOfRows; + if (numOfRows < pTableBlockInfo->minRows) pTableBlockInfo->minRows = numOfRows; + if (numOfRows < defaultRows) pTableBlockInfo->numOfSmallBlocks+=1; + int32_t stepIndex = (numOfRows-1)/TSDB_BLOCK_DIST_STEP_ROWS; + SFileBlockInfo *blockInfo = (SFileBlockInfo*)taosArrayGet(pTableBlockInfo->dataBlockInfos, stepIndex); + blockInfo->numBlocksOfStep++; + } + } + } + + return code; +} + +static int32_t getDataBlocksInFiles(STsdbQueryHandle* pQueryHandle, bool* exists) { + STsdbFS* pFileHandle = REPO_FS(pQueryHandle->pTsdb); + SQueryFilePos* cur = &pQueryHandle->cur; + + // find the start data block in file + if (!pQueryHandle->locateStart) { + pQueryHandle->locateStart = true; + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + int32_t fid = getFileIdFromKey(pQueryHandle->window.skey, pCfg->daysPerFile, pCfg->precision); + + tsdbRLockFS(pFileHandle); + tsdbFSIterInit(&pQueryHandle->fileIter, pFileHandle, pQueryHandle->order); + tsdbFSIterSeek(&pQueryHandle->fileIter, fid); + tsdbUnLockFS(pFileHandle); + + return getFirstFileDataBlock(pQueryHandle, exists); + } else { + // check if current file block is all consumed + STableBlockInfo* pBlockInfo = &pQueryHandle->pDataBlockInfo[cur->slot]; + STableCheckInfo* pCheckInfo = pBlockInfo->pTableCheckInfo; + + // current block is done, try next + if ((!cur->mixBlock) || cur->blockCompleted) { + // all data blocks in current file has been checked already, try next file if exists + } else { + tsdbDebug("%p continue in current data block, index:%d, pos:%d, 0x%"PRIx64, pQueryHandle, cur->slot, cur->pos, + pQueryHandle->qId); + int32_t code = handleDataMergeIfNeeded(pQueryHandle, pBlockInfo->compBlock, pCheckInfo); + *exists = (pQueryHandle->realNumOfRows > 0); + + if (code != TSDB_CODE_SUCCESS || *exists) { + return code; + } + } + + // current block is empty, try next block in file + // all data blocks in current file has been checked already, try next file if exists + if (isEndFileDataBlock(cur, pQueryHandle->numOfBlocks, ASCENDING_TRAVERSE(pQueryHandle->order))) { + return getFirstFileDataBlock(pQueryHandle, exists); + } else { + moveToNextDataBlockInCurrentFile(pQueryHandle); + STableBlockInfo* pNext = &pQueryHandle->pDataBlockInfo[cur->slot]; + return getDataBlockRv(pQueryHandle, pNext, exists); + } + } +} + +static bool doHasDataInBuffer(STsdbQueryHandle* pQueryHandle) { + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + + while (pQueryHandle->activeIndex < numOfTables) { + if (hasMoreDataInCache(pQueryHandle)) { + return true; + } + + pQueryHandle->activeIndex += 1; + } + + // no data in memtable or imemtable, decrease the memory reference. + // TODO !! +// tsdbMayUnTakeMemSnapshot(pQueryHandle); + return false; +} + +//todo not unref yet, since it is not support multi-group interpolation query +static UNUSED_FUNC void changeQueryHandleForInterpQuery(TsdbQueryHandleT pHandle) { + // filter the queried time stamp in the first place + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) pHandle; + + // starts from the buffer in case of descending timestamp order check data blocks + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + + int32_t i = 0; + while(i < numOfTables) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, i); + + // the first qualified table for interpolation query + if ((pQueryHandle->window.skey <= pCheckInfo->pTableObj->lastKey) && + (pCheckInfo->pTableObj->lastKey != TSKEY_INITIAL_VAL)) { + break; + } + + i++; + } + + // there are no data in all the tables + if (i == numOfTables) { + return; + } + + STableCheckInfo info = *(STableCheckInfo*) taosArrayGet(pQueryHandle->pTableCheckInfo, i); + taosArrayClear(pQueryHandle->pTableCheckInfo); + + info.lastKey = pQueryHandle->window.skey; + taosArrayPush(pQueryHandle->pTableCheckInfo, &info); +} + +static int tsdbReadRowsFromCache(STableCheckInfo* pCheckInfo, TSKEY maxKey, int maxRowsToRead, STimeWindow* win, + STsdbQueryHandle* pQueryHandle) { + int numOfRows = 0; + int32_t numOfCols = (int32_t)taosArrayGetSize(pQueryHandle->pColumns); + STsdbCfg* pCfg = &pQueryHandle->pTsdb->config; + win->skey = TSKEY_INITIAL_VAL; + + int64_t st = taosGetTimestampUs(); + STable* pTable = pCheckInfo->pTableObj; + int16_t rv = -1; + STSchema* pSchema = NULL; + + do { + SMemRow row = getSMemRowInTableMem(pCheckInfo, pQueryHandle->order, pCfg->update, NULL); + if (row == NULL) { + break; + } + + TSKEY key = memRowKey(row); + if ((key > maxKey && ASCENDING_TRAVERSE(pQueryHandle->order)) || (key < maxKey && !ASCENDING_TRAVERSE(pQueryHandle->order))) { + tsdbDebug("%p key:%"PRIu64" beyond qrange:%"PRId64" - %"PRId64", no more data in buffer", pQueryHandle, key, pQueryHandle->window.skey, + pQueryHandle->window.ekey); + + break; + } + + if (win->skey == INT64_MIN) { + win->skey = key; + } + + win->ekey = key; + if (rv != memRowVersion(row)) { + pSchema = tsdbGetTableSchemaByVersion(pTable, memRowVersion(row), (int8_t)memRowType(row)); + rv = memRowVersion(row); + } + mergeTwoRowFromMem(pQueryHandle, maxRowsToRead, numOfRows, row, NULL, numOfCols, pTable, pSchema, NULL, true); + + if (++numOfRows >= maxRowsToRead) { + moveToNextRowInMem(pCheckInfo); + break; + } + + } while(moveToNextRowInMem(pCheckInfo)); + + assert(numOfRows <= maxRowsToRead); + + // if the buffer is not full in case of descending order query, move the data in the front of the buffer + if (!ASCENDING_TRAVERSE(pQueryHandle->order) && numOfRows < maxRowsToRead) { + int32_t emptySize = maxRowsToRead - numOfRows; + + for(int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + memmove((char*)pColInfo->pData, (char*)pColInfo->pData + emptySize * pColInfo->info.bytes, numOfRows * pColInfo->info.bytes); + } + } + + int64_t elapsedTime = taosGetTimestampUs() - st; + tsdbDebug("%p build data block from cache completed, elapsed time:%"PRId64" us, numOfRows:%d, numOfCols:%d, 0x%"PRIx64, pQueryHandle, + elapsedTime, numOfRows, numOfCols, pQueryHandle->qId); + + return numOfRows; +} + +static int32_t getAllTableList(STable* pSuperTable, SArray* list) { + STSchema* pTagSchema = tsdbGetTableTagSchema(pSuperTable); + if(pTagSchema && pTagSchema->numOfCols == 1 && pTagSchema->columns[0].type == TSDB_DATA_TYPE_JSON){ + uint32_t key = TSDB_DATA_JSON_NULL; + char keyMd5[TSDB_MAX_JSON_KEY_MD5_LEN] = {0}; + jsonKeyMd5(&key, INT_BYTES, keyMd5); + SArray** tablist = (SArray**)taosHashGet(pSuperTable->jsonKeyMap, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN); + + for (int i = 0; i < taosArrayGetSize(*tablist); ++i) { + JsonMapValue* p = taosArrayGet(*tablist, i); + STableKeyInfo info = {.pTable = p->table, .lastKey = TSKEY_INITIAL_VAL}; + taosArrayPush(list, &info); + } + }else{ + SSkipListIterator* iter = tSkipListCreateIter(pSuperTable->pIndex); + while (tSkipListIterNext(iter)) { + SSkipListNode* pNode = tSkipListIterGet(iter); + + STable* pTable = (STable*) SL_GET_NODE_DATA((SSkipListNode*) pNode); + + STableKeyInfo info = {.pTable = pTable, .lastKey = TSKEY_INITIAL_VAL}; + taosArrayPush(list, &info); + } + + tSkipListDestroyIter(iter); + } + return TSDB_CODE_SUCCESS; +} + +static bool loadBlockOfActiveTable(STsdbQueryHandle* pQueryHandle) { + if (pQueryHandle->checkFiles) { + // check if the query range overlaps with the file data block + bool exists = true; + + int32_t code = getDataBlocksInFiles(pQueryHandle, &exists); + if (code != TSDB_CODE_SUCCESS) { + pQueryHandle->checkFiles = false; + return false; + } + + if (exists) { + tsdbRetrieveDataBlock((TsdbQueryHandleT*) pQueryHandle, NULL); + if (pQueryHandle->currentLoadExternalRows && pQueryHandle->window.skey == pQueryHandle->window.ekey) { + SColumnInfoData* pColInfo = taosArrayGet(pQueryHandle->pColumns, 0); + assert(*(int64_t*)pColInfo->pData == pQueryHandle->window.skey); + } + + pQueryHandle->currentLoadExternalRows = false; // clear the flag, since the exact matched row is found. + return exists; + } + + pQueryHandle->checkFiles = false; + } + + if (hasMoreDataInCache(pQueryHandle)) { + pQueryHandle->currentLoadExternalRows = false; + return true; + } + + // current result is empty + if (pQueryHandle->currentLoadExternalRows && pQueryHandle->window.skey == pQueryHandle->window.ekey && pQueryHandle->cur.rows == 0) { + SMemRef* pMemRef = pQueryHandle->pMemRef; + + doGetExternalRow(pQueryHandle, TSDB_PREV_ROW, pMemRef); + doGetExternalRow(pQueryHandle, TSDB_NEXT_ROW, pMemRef); + + bool result = tsdbGetExternalRow(pQueryHandle); + + pQueryHandle->prev = doFreeColumnInfoData(pQueryHandle->prev); + pQueryHandle->next = doFreeColumnInfoData(pQueryHandle->next); + pQueryHandle->currentLoadExternalRows = false; + + return result; + } + + return false; +} + +static bool loadCachedLastRow(STsdbQueryHandle* pQueryHandle) { + // the last row is cached in buffer, return it directly. + // here note that the pQueryHandle->window must be the TS_INITIALIZER + int32_t numOfCols = (int32_t)(QH_GET_NUM_OF_COLS(pQueryHandle)); + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + assert(numOfTables > 0 && numOfCols > 0); + + SQueryFilePos* cur = &pQueryHandle->cur; + + SMemRow pRow = NULL; + TSKEY key = TSKEY_INITIAL_VAL; + int32_t step = ASCENDING_TRAVERSE(pQueryHandle->order)? 1:-1; + + if (++pQueryHandle->activeIndex < numOfTables) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, pQueryHandle->activeIndex); + int32_t ret = tsdbGetCachedLastRow(pCheckInfo->pTableObj, &pRow, &key); + if (ret != TSDB_CODE_SUCCESS) { + return false; + } + mergeTwoRowFromMem(pQueryHandle, pQueryHandle->outputCapacity, 0, pRow, NULL, numOfCols, pCheckInfo->pTableObj, NULL, NULL, true); + tfree(pRow); + + // update the last key value + pCheckInfo->lastKey = key + step; + + cur->rows = 1; // only one row + cur->lastKey = key + step; + cur->mixBlock = true; + cur->win.skey = key; + cur->win.ekey = key; + + return true; + } + + return false; +} + + + +static bool loadCachedLast(STsdbQueryHandle* pQueryHandle) { + // the last row is cached in buffer, return it directly. + // here note that the pQueryHandle->window must be the TS_INITIALIZER + int32_t tgNumOfCols = (int32_t)QH_GET_NUM_OF_COLS(pQueryHandle); + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + int32_t numOfRows = 0; + assert(numOfTables > 0 && tgNumOfCols > 0); + SQueryFilePos* cur = &pQueryHandle->cur; + TSKEY priKey = TSKEY_INITIAL_VAL; + int32_t priIdx = -1; + SColumnInfoData* pColInfo = NULL; + + while (++pQueryHandle->activeIndex < numOfTables) { + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, pQueryHandle->activeIndex); + STable* pTable = pCheckInfo->pTableObj; + char* pData = NULL; + + int32_t numOfCols = pTable->maxColNum; + + if (pTable->lastCols == NULL || pTable->maxColNum <= 0) { + tsdbWarn("no last cached for table %s, uid:%" PRIu64 ",tid:%d", pTable->name->data, pTable->tableId.uid, pTable->tableId.tid); + continue; + } + + int32_t i = 0, j = 0; + + // lock pTable->lastCols[i] as it would be released when schema update(tsdbUpdateLastColSchema) + TSDB_RLOCK_TABLE(pTable); + while(i < tgNumOfCols && j < numOfCols) { + pColInfo = taosArrayGet(pQueryHandle->pColumns, i); + if (pTable->lastCols[j].colId < pColInfo->info.colId) { + j++; + continue; + } else if (pTable->lastCols[j].colId > pColInfo->info.colId) { + i++; + continue; + } + + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + + if (pTable->lastCols[j].bytes > 0) { + void* value = pTable->lastCols[j].pData; + switch (pColInfo->info.type) { + case TSDB_DATA_TYPE_BINARY: + case TSDB_DATA_TYPE_NCHAR: + memcpy(pData, value, varDataTLen(value)); + break; + case TSDB_DATA_TYPE_NULL: + case TSDB_DATA_TYPE_BOOL: + case TSDB_DATA_TYPE_TINYINT: + case TSDB_DATA_TYPE_UTINYINT: + *(uint8_t *)pData = *(uint8_t *)value; + break; + case TSDB_DATA_TYPE_SMALLINT: + case TSDB_DATA_TYPE_USMALLINT: + *(uint16_t *)pData = *(uint16_t *)value; + break; + case TSDB_DATA_TYPE_INT: + case TSDB_DATA_TYPE_UINT: + *(uint32_t *)pData = *(uint32_t *)value; + break; + case TSDB_DATA_TYPE_BIGINT: + case TSDB_DATA_TYPE_UBIGINT: + *(uint64_t *)pData = *(uint64_t *)value; + break; + case TSDB_DATA_TYPE_FLOAT: + SET_FLOAT_PTR(pData, value); + break; + case TSDB_DATA_TYPE_DOUBLE: + SET_DOUBLE_PTR(pData, value); + break; + case TSDB_DATA_TYPE_TIMESTAMP: + if (pColInfo->info.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) { + priKey = tdGetKey(*(TKEY *)value); + priIdx = i; + + i++; + j++; + continue; + } else { + *(TSKEY *)pData = *(TSKEY *)value; + } + break; + default: + memcpy(pData, value, pColInfo->info.bytes); + } + + for (int32_t n = 0; n < tgNumOfCols; ++n) { + if (n == i) { + continue; + } + + pColInfo = taosArrayGet(pQueryHandle->pColumns, n); + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes;; + + if (pColInfo->info.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) { + *(TSKEY *)pData = pTable->lastCols[j].ts; + continue; + } + + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + setVardataNull(pData, pColInfo->info.type); + } else { + setNull(pData, pColInfo->info.type, pColInfo->info.bytes); + } + } + + numOfRows++; + assert(numOfRows < pQueryHandle->outputCapacity); + } + + i++; + j++; + } + TSDB_RUNLOCK_TABLE(pTable); + + // leave the real ts column as the last row, because last function only (not stable) use the last row as res + if (priKey != TSKEY_INITIAL_VAL) { + pColInfo = taosArrayGet(pQueryHandle->pColumns, priIdx); + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes; + + *(TSKEY *)pData = priKey; + + for (int32_t n = 0; n < tgNumOfCols; ++n) { + if (n == priIdx) { + continue; + } + + pColInfo = taosArrayGet(pQueryHandle->pColumns, n); + pData = (char*)pColInfo->pData + numOfRows * pColInfo->info.bytes;; + + assert (pColInfo->info.colId != PRIMARYKEY_TIMESTAMP_COL_INDEX); + + if (pColInfo->info.type == TSDB_DATA_TYPE_BINARY || pColInfo->info.type == TSDB_DATA_TYPE_NCHAR) { + setVardataNull(pData, pColInfo->info.type); + } else { + setNull(pData, pColInfo->info.type, pColInfo->info.bytes); + } + } + + numOfRows++; + } + + if (numOfRows > 0) { + cur->rows = numOfRows; + cur->mixBlock = true; + + return true; + } + } + + return false; +} + +void tsdbSwitchTable(TsdbQueryHandleT queryHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) queryHandle; + + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, pQueryHandle->activeIndex); + pCheckInfo->numOfBlocks = 0; + + pQueryHandle->locateStart = false; + pQueryHandle->checkFiles = true; + pQueryHandle->cur.rows = 0; + pQueryHandle->currentLoadExternalRows = pQueryHandle->loadExternalRow; + + terrno = TSDB_CODE_SUCCESS; + + ++pQueryHandle->activeIndex; +} + + +static bool loadDataBlockFromTableSeq(STsdbQueryHandle* pQueryHandle) { + size_t numOfTables = taosArrayGetSize(pQueryHandle->pTableCheckInfo); + assert(numOfTables > 0); + + int64_t stime = taosGetTimestampUs(); + + while(pQueryHandle->activeIndex < numOfTables) { + if (loadBlockOfActiveTable(pQueryHandle)) { + return true; + } + + STableCheckInfo* pCheckInfo = taosArrayGet(pQueryHandle->pTableCheckInfo, pQueryHandle->activeIndex); + pCheckInfo->numOfBlocks = 0; + + pQueryHandle->activeIndex += 1; + pQueryHandle->locateStart = false; + pQueryHandle->checkFiles = true; + pQueryHandle->cur.rows = 0; + pQueryHandle->currentLoadExternalRows = pQueryHandle->loadExternalRow; + + terrno = TSDB_CODE_SUCCESS; + + int64_t elapsedTime = taosGetTimestampUs() - stime; + pQueryHandle->cost.checkForNextTime += elapsedTime; + } + + return false; +} + +// handle data in cache situation +bool tsdbNextDataBlock(TsdbQueryHandleT pHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) pHandle; + if (pQueryHandle == NULL) { + return false; + } + + if (emptyQueryTimewindow(pQueryHandle)) { + tsdbDebug("%p query window not overlaps with the data set, no result returned, 0x%"PRIx64, pQueryHandle, pQueryHandle->qId); + return false; + } + + int64_t stime = taosGetTimestampUs(); + int64_t elapsedTime = stime; + + // TODO refactor: remove "type" + if (pQueryHandle->type == TSDB_QUERY_TYPE_LAST) { + if (pQueryHandle->cachelastrow == TSDB_CACHED_TYPE_LASTROW) { + return loadCachedLastRow(pQueryHandle); + } else if (pQueryHandle->cachelastrow == TSDB_CACHED_TYPE_LAST) { + return loadCachedLast(pQueryHandle); + } + } + + if (pQueryHandle->loadType == BLOCK_LOAD_TABLE_SEQ_ORDER) { + return loadDataBlockFromTableSeq(pQueryHandle); + } else { // loadType == RR and Offset Order + if (pQueryHandle->checkFiles) { + // check if the query range overlaps with the file data block + bool exists = true; + + int32_t code = getDataBlocksInFiles(pQueryHandle, &exists); + if (code != TSDB_CODE_SUCCESS) { + pQueryHandle->activeIndex = 0; + pQueryHandle->checkFiles = false; + + return false; + } + + if (exists) { + pQueryHandle->cost.checkForNextTime += (taosGetTimestampUs() - stime); + return exists; + } + + pQueryHandle->activeIndex = 0; + pQueryHandle->checkFiles = false; + } + + // TODO: opt by consider the scan order + bool ret = doHasDataInBuffer(pQueryHandle); + terrno = TSDB_CODE_SUCCESS; + + elapsedTime = taosGetTimestampUs() - stime; + pQueryHandle->cost.checkForNextTime += elapsedTime; + return ret; + } +} + +static int32_t doGetExternalRow(STsdbQueryHandle* pQueryHandle, int16_t type, SMemRef* pMemRef) { + STsdbQueryHandle* pSecQueryHandle = NULL; + + if (type == TSDB_PREV_ROW && pQueryHandle->prev) { + return TSDB_CODE_SUCCESS; + } + + if (type == TSDB_NEXT_ROW && pQueryHandle->next) { + return TSDB_CODE_SUCCESS; + } + + // prepare the structure + int32_t numOfCols = (int32_t) QH_GET_NUM_OF_COLS(pQueryHandle); + + if (type == TSDB_PREV_ROW) { + pQueryHandle->prev = taosArrayInit(numOfCols, sizeof(SColumnInfoData)); + if (pQueryHandle->prev == NULL) { + terrno = TSDB_CODE_QRY_OUT_OF_MEMORY; + goto out_of_memory; + } + } else { + pQueryHandle->next = taosArrayInit(numOfCols, sizeof(SColumnInfoData)); + if (pQueryHandle->next == NULL) { + terrno = TSDB_CODE_QRY_OUT_OF_MEMORY; + goto out_of_memory; + } + } + + SArray* row = (type == TSDB_PREV_ROW)? pQueryHandle->prev : pQueryHandle->next; + + for (int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pCol = taosArrayGet(pQueryHandle->pColumns, i); + + SColumnInfoData colInfo = {{0}, 0}; + colInfo.info = pCol->info; + colInfo.pData = calloc(1, pCol->info.bytes); + if (colInfo.pData == NULL) { + terrno = TSDB_CODE_QRY_OUT_OF_MEMORY; + goto out_of_memory; + } + + taosArrayPush(row, &colInfo); + } + + // load the previous row + STsdbQueryCond cond = {.numOfCols = numOfCols, .loadExternalRows = false, .type = BLOCK_LOAD_OFFSET_SEQ_ORDER}; + if (type == TSDB_PREV_ROW) { + cond.order = TSDB_ORDER_DESC; + cond.twindow = (STimeWindow){pQueryHandle->window.skey, INT64_MIN}; + } else { + cond.order = TSDB_ORDER_ASC; + cond.twindow = (STimeWindow){pQueryHandle->window.skey, INT64_MAX}; + } + + cond.colList = calloc(cond.numOfCols, sizeof(SColumnInfo)); + if (cond.colList == NULL) { + terrno = TSDB_CODE_QRY_OUT_OF_MEMORY; + goto out_of_memory; + } + + for (int32_t i = 0; i < cond.numOfCols; ++i) { + SColumnInfoData* pColInfoData = taosArrayGet(pQueryHandle->pColumns, i); + memcpy(&cond.colList[i], &pColInfoData->info, sizeof(SColumnInfo)); + } + + pSecQueryHandle = tsdbQueryTablesImpl(pQueryHandle->pTsdb, &cond, pQueryHandle->qId, pMemRef); + tfree(cond.colList); + if (pSecQueryHandle == NULL) { + goto out_of_memory; + } + + // current table, only one table + STableCheckInfo* pCurrent = taosArrayGet(pQueryHandle->pTableCheckInfo, pQueryHandle->activeIndex); + + SArray* psTable = NULL; + pSecQueryHandle->pTableCheckInfo = createCheckInfoFromCheckInfo(pCurrent, pSecQueryHandle->window.skey, &psTable); + if (pSecQueryHandle->pTableCheckInfo == NULL) { + taosArrayDestroy(&psTable); + terrno = TSDB_CODE_QRY_OUT_OF_MEMORY; + goto out_of_memory; + } + + + tsdbMayTakeMemSnapshot(pSecQueryHandle, psTable); + if (!tsdbNextDataBlock((void*)pSecQueryHandle)) { + // no result in current query, free the corresponding result rows structure + if (type == TSDB_PREV_ROW) { + pQueryHandle->prev = doFreeColumnInfoData(pQueryHandle->prev); + } else { + pQueryHandle->next = doFreeColumnInfoData(pQueryHandle->next); + } + + goto out_of_memory; + } + + SDataBlockInfo blockInfo = {{0}, 0}; + tsdbRetrieveDataBlockInfo((void*)pSecQueryHandle, &blockInfo); + tsdbRetrieveDataBlock((void*)pSecQueryHandle, pSecQueryHandle->defaultLoadColumn); + + row = (type == TSDB_PREV_ROW)? pQueryHandle->prev:pQueryHandle->next; + int32_t pos = (type == TSDB_PREV_ROW)?pSecQueryHandle->cur.rows - 1:0; + + for (int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pCol = taosArrayGet(row, i); + SColumnInfoData* s = taosArrayGet(pSecQueryHandle->pColumns, i); + memcpy((char*)pCol->pData, (char*)s->pData + s->info.bytes * pos, pCol->info.bytes); + } + +out_of_memory: + tsdbCleanupQueryHandle(pSecQueryHandle); + return terrno; +} + +bool tsdbGetExternalRow(TsdbQueryHandleT pHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*) pHandle; + SQueryFilePos* cur = &pQueryHandle->cur; + + cur->fid = INT32_MIN; + cur->mixBlock = true; + if (pQueryHandle->prev == NULL || pQueryHandle->next == NULL) { + cur->rows = 0; + return false; + } + + int32_t numOfCols = (int32_t) QH_GET_NUM_OF_COLS(pQueryHandle); + for (int32_t i = 0; i < numOfCols; ++i) { + SColumnInfoData* pColInfoData = taosArrayGet(pQueryHandle->pColumns, i); + SColumnInfoData* first = taosArrayGet(pQueryHandle->prev, i); + + memcpy(pColInfoData->pData, first->pData, pColInfoData->info.bytes); + + SColumnInfoData* sec = taosArrayGet(pQueryHandle->next, i); + memcpy(((char*)pColInfoData->pData) + pColInfoData->info.bytes, sec->pData, pColInfoData->info.bytes); + + if (i == 0 && pColInfoData->info.type == TSDB_DATA_TYPE_TIMESTAMP) { + cur->win.skey = *(TSKEY*)pColInfoData->pData; + cur->win.ekey = *(TSKEY*)(((char*)pColInfoData->pData) + TSDB_KEYSIZE); + } + } + + cur->rows = 2; + return true; +} + +/* + * if lastRow == NULL, return TSDB_CODE_TDB_NO_CACHE_LAST_ROW + * else set pRes and return TSDB_CODE_SUCCESS and save lastKey + */ +int32_t tsdbGetCachedLastRow(STable* pTable, SMemRow* pRes, TSKEY* lastKey) { + int32_t code = TSDB_CODE_SUCCESS; + + TSDB_RLOCK_TABLE(pTable); + + if (!pTable->lastRow) { + code = TSDB_CODE_TDB_NO_CACHE_LAST_ROW; + goto out; + } + + if (pRes) { + *pRes = tdMemRowDup(pTable->lastRow); + if (*pRes == NULL) { + code = TSDB_CODE_TDB_OUT_OF_MEMORY; + } + } + +out: + TSDB_RUNLOCK_TABLE(pTable); + return code; +} + +bool isTsdbCacheLastRow(TsdbQueryHandleT* pQueryHandle) { + return ((STsdbQueryHandle *)pQueryHandle)->cachelastrow > TSDB_CACHED_TYPE_NONE; +} + +int32_t checkForCachedLastRow(STsdbQueryHandle* pQueryHandle, STableGroupInfo *groupList) { + assert(pQueryHandle != NULL && groupList != NULL); + + TSKEY key = TSKEY_INITIAL_VAL; + + SArray* group = taosArrayGetP(groupList->pGroupList, 0); + assert(group != NULL); + + STableKeyInfo* pInfo = (STableKeyInfo*)taosArrayGet(group, 0); + + int32_t code = 0; + + if (((STable*)pInfo->pTable)->lastRow) { + code = tsdbGetCachedLastRow(pInfo->pTable, NULL, &key); + if (code != TSDB_CODE_SUCCESS) { + pQueryHandle->cachelastrow = TSDB_CACHED_TYPE_NONE; + } else { + pQueryHandle->cachelastrow = TSDB_CACHED_TYPE_LASTROW; + } + } + + // update the tsdb query time range + if (pQueryHandle->cachelastrow != TSDB_CACHED_TYPE_NONE) { + pQueryHandle->window = TSWINDOW_INITIALIZER; + pQueryHandle->checkFiles = false; + pQueryHandle->activeIndex = -1; // start from -1 + } + + return code; +} + +int32_t checkForCachedLast(STsdbQueryHandle* pQueryHandle) { + assert(pQueryHandle != NULL); + + int32_t code = 0; + + STsdbRepo* pRepo = pQueryHandle->pTsdb; + + if (pRepo && CACHE_LAST_NULL_COLUMN(&(pRepo->config))) { + pQueryHandle->cachelastrow = TSDB_CACHED_TYPE_LAST; + } + + // update the tsdb query time range + if (pQueryHandle->cachelastrow) { + pQueryHandle->checkFiles = false; + pQueryHandle->activeIndex = -1; // start from -1 + } + + return code; +} + + +STimeWindow updateLastrowForEachGroup(STableGroupInfo *groupList) { + STimeWindow window = {INT64_MAX, INT64_MIN}; + + int32_t totalNumOfTable = 0; + SArray* emptyGroup = taosArrayInit(16, sizeof(int32_t)); + + // NOTE: starts from the buffer in case of descending timestamp order check data blocks + size_t numOfGroups = taosArrayGetSize(groupList->pGroupList); + for(int32_t j = 0; j < numOfGroups; ++j) { + SArray* pGroup = taosArrayGetP(groupList->pGroupList, j); + TSKEY key = TSKEY_INITIAL_VAL; + + STableKeyInfo keyInfo = {0}; + + size_t numOfTables = taosArrayGetSize(pGroup); + for(int32_t i = 0; i < numOfTables; ++i) { + STableKeyInfo* pInfo = (STableKeyInfo*) taosArrayGet(pGroup, i); + + // if the lastKey equals to INT64_MIN, there is no data in this table + TSKEY lastKey = ((STable*)(pInfo->pTable))->lastKey; + if (key < lastKey) { + key = lastKey; + + keyInfo.pTable = pInfo->pTable; + keyInfo.lastKey = key; + pInfo->lastKey = key; + + if (key < window.skey) { + window.skey = key; + } + + if (key > window.ekey) { + window.ekey = key; + } + } + } + + // clear current group, unref unused table + for (int32_t i = 0; i < numOfTables; ++i) { + STableKeyInfo* pInfo = (STableKeyInfo*)taosArrayGet(pGroup, i); + + // keyInfo.pTable may be NULL here. + if (pInfo->pTable != keyInfo.pTable) { + tsdbUnRefTable(pInfo->pTable); + } + } + + // more than one table in each group, only one table left for each group + if (keyInfo.pTable != NULL) { + totalNumOfTable++; + if (taosArrayGetSize(pGroup) == 1) { + // do nothing + } else { + taosArrayClear(pGroup); + taosArrayPush(pGroup, &keyInfo); + } + } else { // mark all the empty groups, and remove it later + taosArrayDestroy(&pGroup); + taosArrayPush(emptyGroup, &j); + } + } + + // window does not being updated, so set the original + if (window.skey == INT64_MAX && window.ekey == INT64_MIN) { + window = TSWINDOW_INITIALIZER; + assert(totalNumOfTable == 0 && taosArrayGetSize(groupList->pGroupList) == numOfGroups); + } + + taosArrayRemoveBatch(groupList->pGroupList, TARRAY_GET_START(emptyGroup), (int32_t) taosArrayGetSize(emptyGroup)); + taosArrayDestroy(&emptyGroup); + + groupList->numOfTables = totalNumOfTable; + return window; +} + +void tsdbRetrieveDataBlockInfo(TsdbQueryHandleT* pQueryHandle, SDataBlockInfo* pDataBlockInfo) { + STsdbQueryHandle* pHandle = (STsdbQueryHandle*)pQueryHandle; + SQueryFilePos* cur = &pHandle->cur; + STable* pTable = NULL; + + // there are data in file + if (pHandle->cur.fid != INT32_MIN) { + STableBlockInfo* pBlockInfo = &pHandle->pDataBlockInfo[cur->slot]; + pTable = pBlockInfo->pTableCheckInfo->pTableObj; + } else { + STableCheckInfo* pCheckInfo = taosArrayGet(pHandle->pTableCheckInfo, pHandle->activeIndex); + pTable = pCheckInfo->pTableObj; + } + + pDataBlockInfo->uid = pTable->tableId.uid; + pDataBlockInfo->tid = pTable->tableId.tid; + pDataBlockInfo->rows = cur->rows; + pDataBlockInfo->window = cur->win; + pDataBlockInfo->numOfCols = (int32_t)(QH_GET_NUM_OF_COLS(pHandle)); +} + +/* + * return null for mixed data block, if not a complete file data block, the statistics value will always return NULL + */ +int32_t tsdbRetrieveDataBlockStatisInfo(TsdbQueryHandleT* pQueryHandle, SDataStatis** pBlockStatis) { + STsdbQueryHandle* pHandle = (STsdbQueryHandle*) pQueryHandle; + + SQueryFilePos* c = &pHandle->cur; + if (c->mixBlock) { + *pBlockStatis = NULL; + return TSDB_CODE_SUCCESS; + } + + STableBlockInfo* pBlockInfo = &pHandle->pDataBlockInfo[c->slot]; + assert((c->slot >= 0 && c->slot < pHandle->numOfBlocks) || ((c->slot == pHandle->numOfBlocks) && (c->slot == 0))); + + // file block with sub-blocks has no statistics data + if (pBlockInfo->compBlock->numOfSubBlocks > 1) { + *pBlockStatis = NULL; + return TSDB_CODE_SUCCESS; + } + + int64_t stime = taosGetTimestampUs(); + int statisStatus = tsdbLoadBlockStatis(&pHandle->rhelper, pBlockInfo->compBlock); + if (statisStatus < TSDB_STATIS_OK) { + return terrno; + } else if (statisStatus > TSDB_STATIS_OK) { + *pBlockStatis = NULL; + return TSDB_CODE_SUCCESS; + } + + int16_t* colIds = pHandle->defaultLoadColumn->pData; + + size_t numOfCols = QH_GET_NUM_OF_COLS(pHandle); + memset(pHandle->statis, 0, numOfCols * sizeof(SDataStatis)); + for(int32_t i = 0; i < numOfCols; ++i) { + pHandle->statis[i].colId = colIds[i]; + } + + tsdbGetBlockStatis(&pHandle->rhelper, pHandle->statis, (int)numOfCols, pBlockInfo->compBlock); + + // always load the first primary timestamp column data + SDataStatis* pPrimaryColStatis = &pHandle->statis[0]; + assert(pPrimaryColStatis->colId == PRIMARYKEY_TIMESTAMP_COL_INDEX); + + pPrimaryColStatis->numOfNull = 0; + pPrimaryColStatis->min = pBlockInfo->compBlock->keyFirst; + pPrimaryColStatis->max = pBlockInfo->compBlock->keyLast; + + //update the number of NULL data rows + for(int32_t i = 1; i < numOfCols; ++i) { + if (pHandle->statis[i].numOfNull == -1) { // set the column data are all NULL + pHandle->statis[i].numOfNull = pBlockInfo->compBlock->numOfRows; + } + } + + int64_t elapsed = taosGetTimestampUs() - stime; + pHandle->cost.statisInfoLoadTime += elapsed; + + *pBlockStatis = pHandle->statis; + return TSDB_CODE_SUCCESS; +} + +SArray* tsdbRetrieveDataBlock(TsdbQueryHandleT* pQueryHandle, SArray* pIdList) { + /** + * In the following two cases, the data has been loaded to SColumnInfoData. + * 1. data is from cache, 2. data block is not completed qualified to query time range + */ + STsdbQueryHandle* pHandle = (STsdbQueryHandle*)pQueryHandle; + + if (pHandle->cur.fid == INT32_MIN) { + return pHandle->pColumns; + } else { + STableBlockInfo* pBlockInfo = &pHandle->pDataBlockInfo[pHandle->cur.slot]; + STableCheckInfo* pCheckInfo = pBlockInfo->pTableCheckInfo; + + if (pHandle->cur.mixBlock) { + return pHandle->pColumns; + } else { + SDataBlockInfo binfo = GET_FILE_DATA_BLOCK_INFO(pCheckInfo, pBlockInfo->compBlock); + assert(pHandle->realNumOfRows <= binfo.rows); + + // data block has been loaded, todo extract method + SDataBlockLoadInfo* pBlockLoadInfo = &pHandle->dataBlockLoadInfo; + + if (pBlockLoadInfo->slot == pHandle->cur.slot && pBlockLoadInfo->fileGroup->fid == pHandle->cur.fid && + pBlockLoadInfo->tid == pCheckInfo->pTableObj->tableId.tid) { + return pHandle->pColumns; + } else { // only load the file block + SBlock* pBlock = pBlockInfo->compBlock; + if (doLoadFileDataBlock(pHandle, pBlock, pCheckInfo, pHandle->cur.slot) != TSDB_CODE_SUCCESS) { + return NULL; + } + + // todo refactor + int32_t numOfRows = doCopyRowsFromFileBlock(pHandle, pHandle->outputCapacity, 0, 0, pBlock->numOfRows - 1); + + // if the buffer is not full in case of descending order query, move the data in the front of the buffer + if (!ASCENDING_TRAVERSE(pHandle->order) && numOfRows < pHandle->outputCapacity) { + int32_t emptySize = pHandle->outputCapacity - numOfRows; + int32_t reqNumOfCols = (int32_t)taosArrayGetSize(pHandle->pColumns); + + for(int32_t i = 0; i < reqNumOfCols; ++i) { + SColumnInfoData* pColInfo = taosArrayGet(pHandle->pColumns, i); + memmove((char*)pColInfo->pData, (char*)pColInfo->pData + emptySize * pColInfo->info.bytes, numOfRows * pColInfo->info.bytes); + } + } + + return pHandle->pColumns; + } + } + } +} + +void filterPrepare(void* expr, void* param) { + tExprNode* pExpr = (tExprNode*)expr; + if (pExpr->_node.info != NULL) { + return; + } + + pExpr->_node.info = calloc(1, sizeof(tQueryInfo)); + + STSchema* pTSSchema = (STSchema*) param; + tQueryInfo* pInfo = pExpr->_node.info; + tVariant* pCond = pExpr->_node.pRight->pVal; + SSchema* pSchema = pExpr->_node.pLeft->pSchema; + + pInfo->sch = *pSchema; + pInfo->optr = pExpr->_node.optr; + pInfo->compare = getComparFunc(pInfo->sch.type, pInfo->optr); + pInfo->indexed = pTSSchema->columns->colId == pInfo->sch.colId; + + if (pInfo->optr == TSDB_RELATION_IN) { + int dummy = -1; + SHashObj *pObj = NULL; + if (pInfo->sch.colId == TSDB_TBNAME_COLUMN_INDEX) { + SArray *arr = (SArray *)(pCond->arr); + + size_t size = taosArrayGetSize(arr); + pObj = taosHashInit(size * 2, taosGetDefaultHashFunction(pInfo->sch.type), true, false); + + for (size_t i = 0; i < size; i++) { + char* p = taosArrayGetP(arr, i); + strntolower_s(varDataVal(p), varDataVal(p), varDataLen(p)); + taosHashPut(pObj, varDataVal(p), varDataLen(p), &dummy, sizeof(dummy)); + } + } else { + buildFilterSetFromBinary((void **)&pObj, pCond->pz, pCond->nLen); + } + + pInfo->q = (char *)pObj; + } else if (pCond != NULL) { + uint32_t size = pCond->nLen * TSDB_NCHAR_SIZE; + if (size < (uint32_t)pSchema->bytes) { + size = pSchema->bytes; + } + + // to make sure tonchar does not cause invalid write, since the '\0' needs at least sizeof(wchar_t) space. + pInfo->q = calloc(1, size + TSDB_NCHAR_SIZE + VARSTR_HEADER_SIZE); + tVariantDump(pCond, pInfo->q, pSchema->type, true); + } +} + +static int32_t tableGroupComparFn(const void *p1, const void *p2, const void *param) { + STableGroupSupporter* pTableGroupSupp = (STableGroupSupporter*) param; + STable* pTable1 = ((STableKeyInfo*) p1)->pTable; + STable* pTable2 = ((STableKeyInfo*) p2)->pTable; + + for (int32_t i = 0; i < pTableGroupSupp->numOfCols; ++i) { + SColIndex* pColIndex = &pTableGroupSupp->pCols[i]; + int32_t colIndex = pColIndex->colIndex; + + assert(colIndex >= TSDB_TBNAME_COLUMN_INDEX); + + char * f1 = NULL; + char * f2 = NULL; + int32_t type = 0; + int32_t bytes = 0; + + if (colIndex == TSDB_TBNAME_COLUMN_INDEX) { + f1 = (char*) TABLE_NAME(pTable1); + f2 = (char*) TABLE_NAME(pTable2); + type = TSDB_DATA_TYPE_BINARY; + bytes = tGetTbnameColumnSchema()->bytes; + } else { + if (pTableGroupSupp->pTagSchema && colIndex < pTableGroupSupp->pTagSchema->numOfCols) { + STColumn* pCol = schemaColAt(pTableGroupSupp->pTagSchema, colIndex); + bytes = pCol->bytes; + type = pCol->type; + if (type == TSDB_DATA_TYPE_JSON){ + f1 = getJsonTagValueElment(pTable1, pColIndex->name, (int32_t)strlen(pColIndex->name), NULL, TSDB_MAX_JSON_TAGS_LEN); + f2 = getJsonTagValueElment(pTable2, pColIndex->name, (int32_t)strlen(pColIndex->name), NULL, TSDB_MAX_JSON_TAGS_LEN); + }else{ + f1 = tdGetKVRowValOfCol(pTable1->tagVal, pCol->colId); + f2 = tdGetKVRowValOfCol(pTable2->tagVal, pCol->colId); + } + } + } + + // this tags value may be NULL + if (f1 == NULL && f2 == NULL) { + continue; + } + + if (f1 == NULL) { + return -1; + } + + if (f2 == NULL) { + return 1; + } + + int32_t ret = doCompare(f1, f2, type, bytes); + if (ret == 0) { + continue; + } else { + return ret; + } + } + + return 0; +} + +static int tsdbCheckInfoCompar(const void* key1, const void* key2) { + if (((STableCheckInfo*)key1)->tableId.tid < ((STableCheckInfo*)key2)->tableId.tid) { + return -1; + } else if (((STableCheckInfo*)key1)->tableId.tid > ((STableCheckInfo*)key2)->tableId.tid) { + return 1; + } else { + ASSERT(false); + return 0; + } +} + +void createTableGroupImpl(SArray* pGroups, SArray* pTableList, size_t numOfTables, TSKEY skey, + STableGroupSupporter* pSupp, __ext_compar_fn_t compareFn) { + STable* pTable = taosArrayGetP(pTableList, 0); + + SArray* g = taosArrayInit(16, sizeof(STableKeyInfo)); + + STableKeyInfo info = {.pTable = pTable, .lastKey = skey}; + taosArrayPush(g, &info); + tsdbRefTable(pTable); + + for (int32_t i = 1; i < numOfTables; ++i) { + STable** prev = taosArrayGet(pTableList, i - 1); + STable** p = taosArrayGet(pTableList, i); + + int32_t ret = compareFn(prev, p, pSupp); + assert(ret == 0 || ret == -1); + + tsdbRefTable(*p); + assert((*p)->type == TSDB_CHILD_TABLE); + + if (ret == 0) { + STableKeyInfo info1 = {.pTable = *p, .lastKey = skey}; + taosArrayPush(g, &info1); + } else { + taosArrayPush(pGroups, &g); // current group is ended, start a new group + g = taosArrayInit(16, sizeof(STableKeyInfo)); + + STableKeyInfo info1 = {.pTable = *p, .lastKey = skey}; + taosArrayPush(g, &info1); + } + } + + taosArrayPush(pGroups, &g); +} + +SArray* createTableGroup(SArray* pTableList, STSchema* pTagSchema, SColIndex* pCols, int32_t numOfOrderCols, TSKEY skey) { + assert(pTableList != NULL); + SArray* pTableGroup = taosArrayInit(1, POINTER_BYTES); + + size_t size = taosArrayGetSize(pTableList); + if (size == 0) { + tsdbDebug("no qualified tables"); + return pTableGroup; + } + + if (numOfOrderCols == 0 || size == 1) { // no group by tags clause or only one table + SArray* sa = taosArrayInit(size, sizeof(STableKeyInfo)); + if (sa == NULL) { + taosArrayDestroy(&pTableGroup); + return NULL; + } + + for(int32_t i = 0; i < size; ++i) { + STableKeyInfo *pKeyInfo = taosArrayGet(pTableList, i); + tsdbRefTable(pKeyInfo->pTable); + + STableKeyInfo info = {.pTable = pKeyInfo->pTable, .lastKey = skey}; + taosArrayPush(sa, &info); + } + + taosArrayPush(pTableGroup, &sa); + tsdbDebug("all %" PRIzu " tables belong to one group", size); + } else { + STableGroupSupporter sup = {0}; + sup.numOfCols = numOfOrderCols; + sup.pTagSchema = pTagSchema; + sup.pCols = pCols; + + taosqsort(pTableList->pData, size, sizeof(STableKeyInfo), &sup, tableGroupComparFn); + createTableGroupImpl(pTableGroup, pTableList, size, skey, &sup, tableGroupComparFn); + } + + return pTableGroup; +} + +int32_t tsdbQuerySTableByTagCond(STsdbRepo* tsdb, uint64_t uid, TSKEY skey, const char* pTagCond, size_t len, + STableGroupInfo* pGroupInfo, SColIndex* pColIndex, int32_t numOfCols) { + SArray* res = NULL; + if (tsdbRLockRepoMeta(tsdb) < 0) goto _error; + + STable* pTable = tsdbGetTableByUid(tsdbGetMeta(tsdb), uid); + if (pTable == NULL) { + tsdbError("%p failed to get stable, uid:%" PRIu64, tsdb, uid); + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + tsdbUnlockRepoMeta(tsdb); + + goto _error; + } + + if (pTable->type != TSDB_SUPER_TABLE) { + tsdbError("%p query normal tag not allowed, uid:%" PRIu64 ", tid:%d, name:%s", tsdb, uid, pTable->tableId.tid, + pTable->name->data); + terrno = TSDB_CODE_COM_OPS_NOT_SUPPORT; //basically, this error is caused by invalid sql issued by client + + tsdbUnlockRepoMeta(tsdb); + goto _error; + } + + //NOTE: not add ref count for super table + res = taosArrayInit(8, sizeof(STableKeyInfo)); + STSchema* pTagSchema = tsdbGetTableTagSchema(pTable); + + // no tags and tbname condition, all child tables of this stable are involved + if (pTagCond == NULL || len == 0) { + int32_t ret = getAllTableList(pTable, res); + if (ret != TSDB_CODE_SUCCESS) { + tsdbUnlockRepoMeta(tsdb); + goto _error; + } + + pGroupInfo->numOfTables = (uint32_t) taosArrayGetSize(res); + pGroupInfo->pGroupList = createTableGroup(res, pTagSchema, pColIndex, numOfCols, skey); + + tsdbDebug("%p no table name/tag condition, all tables qualified, numOfTables:%u, group:%zu", tsdb, + pGroupInfo->numOfTables, taosArrayGetSize(pGroupInfo->pGroupList)); + + taosArrayDestroy(&res); + if (tsdbUnlockRepoMeta(tsdb) < 0) goto _error; + return ret; + } + + int32_t ret = TSDB_CODE_SUCCESS; + tExprNode* expr = NULL; + + TRY(TSDB_MAX_TAG_CONDITIONS) { + expr = exprTreeFromBinary(pTagCond, len); + CLEANUP_EXECUTE(); + + } CATCH( code ) { + CLEANUP_EXECUTE(); + terrno = code; + tsdbUnlockRepoMeta(tsdb); // unlock tsdb in any cases + + goto _error; + // TODO: more error handling + } END_TRY + + void *filterInfo = calloc(1, sizeof(SFilterInfo)); + ((SFilterInfo*)filterInfo)->pTable = pTable; + ret = filterInitFromTree(expr, &filterInfo, 0); + tExprTreeDestroy(expr, NULL); + + if (ret != TSDB_CODE_SUCCESS) { + terrno = ret; + tsdbUnlockRepoMeta(tsdb); + filterFreeInfo(filterInfo); + goto _error; + } + + ret = tsdbQueryTableList(pTable, res, filterInfo); + if (ret != TSDB_CODE_SUCCESS) { + terrno = ret; + tsdbUnlockRepoMeta(tsdb); + filterFreeInfo(filterInfo); + goto _error; + } + + filterFreeInfo(filterInfo); + + pGroupInfo->numOfTables = (uint32_t)taosArrayGetSize(res); + pGroupInfo->pGroupList = createTableGroup(res, pTagSchema, pColIndex, numOfCols, skey); + + tsdbDebug("%p stable tid:%d, uid:%"PRIu64" query, numOfTables:%u, belong to %" PRIzu " groups", tsdb, pTable->tableId.tid, + pTable->tableId.uid, pGroupInfo->numOfTables, taosArrayGetSize(pGroupInfo->pGroupList)); + + taosArrayDestroy(&res); + + if (tsdbUnlockRepoMeta(tsdb) < 0) goto _error; + return ret; + + _error: + + taosArrayDestroy(&res); + return terrno; +} + +int32_t tsdbGetOneTableGroup(STsdbRepo* tsdb, uint64_t uid, TSKEY startKey, STableGroupInfo* pGroupInfo) { + if (tsdbRLockRepoMeta(tsdb) < 0) goto _error; + + STable* pTable = tsdbGetTableByUid(tsdbGetMeta(tsdb), uid); + if (pTable == NULL) { + terrno = TSDB_CODE_TDB_INVALID_TABLE_ID; + tsdbUnlockRepoMeta(tsdb); + goto _error; + } + + assert(pTable->type == TSDB_CHILD_TABLE || pTable->type == TSDB_NORMAL_TABLE || pTable->type == TSDB_STREAM_TABLE); + tsdbRefTable(pTable); + if (tsdbUnlockRepoMeta(tsdb) < 0) goto _error; + + pGroupInfo->numOfTables = 1; + pGroupInfo->pGroupList = taosArrayInit(1, POINTER_BYTES); + + SArray* group = taosArrayInit(1, sizeof(STableKeyInfo)); + + STableKeyInfo info = {.pTable = pTable, .lastKey = startKey}; + taosArrayPush(group, &info); + + taosArrayPush(pGroupInfo->pGroupList, &group); + return TSDB_CODE_SUCCESS; + + _error: + return terrno; +} + +int32_t tsdbGetTableGroupFromIdList(STsdbRepo* tsdb, SArray* pTableIdList, STableGroupInfo* pGroupInfo) { + if (tsdbRLockRepoMeta(tsdb) < 0) { + return terrno; + } + + assert(pTableIdList != NULL); + size_t size = taosArrayGetSize(pTableIdList); + pGroupInfo->pGroupList = taosArrayInit(1, POINTER_BYTES); + SArray* group = taosArrayInit(1, sizeof(STableKeyInfo)); + + for(int32_t i = 0; i < size; ++i) { + STableIdInfo *id = taosArrayGet(pTableIdList, i); + + STable* pTable = tsdbGetTableByUid(tsdbGetMeta(tsdb), id->uid); + if (pTable == NULL) { + tsdbWarn("table uid:%"PRIu64", tid:%d has been drop already", id->uid, id->tid); + continue; + } + + if (pTable->type == TSDB_SUPER_TABLE) { + tsdbError("direct query on super tale is not allowed, table uid:%"PRIu64", tid:%d", id->uid, id->tid); + terrno = TSDB_CODE_QRY_INVALID_MSG; + tsdbUnlockRepoMeta(tsdb); + taosArrayDestroy(&group); + return terrno; + } + + tsdbRefTable(pTable); + + STableKeyInfo info = {.pTable = pTable, .lastKey = id->key}; + taosArrayPush(group, &info); + } + + if (tsdbUnlockRepoMeta(tsdb) < 0) { + taosArrayDestroy(&group); + return terrno; + } + + pGroupInfo->numOfTables = (uint32_t) taosArrayGetSize(group); + if (pGroupInfo->numOfTables > 0) { + taosArrayPush(pGroupInfo->pGroupList, &group); + } else { + taosArrayDestroy(&group); + } + + return TSDB_CODE_SUCCESS; +} + +static void* doFreeColumnInfoData(SArray* pColumnInfoData) { + if (pColumnInfoData == NULL) { + return NULL; + } + + size_t cols = taosArrayGetSize(pColumnInfoData); + for (int32_t i = 0; i < cols; ++i) { + SColumnInfoData* pColInfo = taosArrayGet(pColumnInfoData, i); + tfree(pColInfo->pData); + } + + taosArrayDestroy(&pColumnInfoData); + return NULL; +} + +static void* destroyTableCheckInfo(SArray* pTableCheckInfo) { + size_t size = taosArrayGetSize(pTableCheckInfo); + for (int32_t i = 0; i < size; ++i) { + STableCheckInfo* p = taosArrayGet(pTableCheckInfo, i); + destroyTableMemIterator(p); + + tfree(p->pCompInfo); + } + + taosArrayDestroy(&pTableCheckInfo); + return NULL; +} + +void tsdbCleanupQueryHandle(TsdbQueryHandleT queryHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*)queryHandle; + if (pQueryHandle == NULL) { + return; + } + + pQueryHandle->pColumns = doFreeColumnInfoData(pQueryHandle->pColumns); + + taosArrayDestroy(&pQueryHandle->defaultLoadColumn); + tfree(pQueryHandle->pDataBlockInfo); + tfree(pQueryHandle->statis); + + if (!emptyQueryTimewindow(pQueryHandle)) { + tsdbMayUnTakeMemSnapshot(pQueryHandle); + } else { + assert(pQueryHandle->pTableCheckInfo == NULL); + } + + if (pQueryHandle->pTableCheckInfo != NULL) { + pQueryHandle->pTableCheckInfo = destroyTableCheckInfo(pQueryHandle->pTableCheckInfo); + } + + tsdbDestroyReadH(&pQueryHandle->rhelper); + + tdFreeDataCols(pQueryHandle->pDataCols); + pQueryHandle->pDataCols = NULL; + + pQueryHandle->prev = doFreeColumnInfoData(pQueryHandle->prev); + pQueryHandle->next = doFreeColumnInfoData(pQueryHandle->next); + + SIOCostSummary* pCost = &pQueryHandle->cost; + + tsdbDebug("%p :io-cost summary: head-file read cnt:%"PRIu64", head-file time:%"PRIu64" us, statis-info:%"PRId64" us, datablock:%" PRId64" us, check data:%"PRId64" us, 0x%"PRIx64, + pQueryHandle, pCost->headFileLoad, pCost->headFileLoadTime, pCost->statisInfoLoadTime, pCost->blockLoadTime, pCost->checkForNextTime, pQueryHandle->qId); + + tfree(pQueryHandle); +} + +void tsdbDestroyTableGroup(STableGroupInfo *pGroupList) { + assert(pGroupList != NULL); + + size_t numOfGroup = taosArrayGetSize(pGroupList->pGroupList); + + for(int32_t i = 0; i < numOfGroup; ++i) { + SArray* p = taosArrayGetP(pGroupList->pGroupList, i); + + size_t numOfTables = taosArrayGetSize(p); + for(int32_t j = 0; j < numOfTables; ++j) { + STable* pTable = taosArrayGetP(p, j); + if (pTable != NULL) { // in case of handling retrieve data from tsdb + tsdbUnRefTable(pTable); + } + //assert(pTable != NULL); + } + + taosArrayDestroy(&p); + } + + taosHashCleanup(pGroupList->map); + taosArrayDestroy(&pGroupList->pGroupList); + pGroupList->numOfTables = 0; +} + + +static FORCE_INLINE int32_t tsdbGetTagDataFromId(void *param, int32_t id, void **data) { + STable* pTable = (STable*)(SL_GET_NODE_DATA((SSkipListNode *)param)); + + if (id == TSDB_TBNAME_COLUMN_INDEX) { + *data = TABLE_NAME(pTable); + } else { + *data = tdGetKVRowValOfCol(pTable->tagVal, id); + } + + return TSDB_CODE_SUCCESS; +} + + + +static void queryIndexedColumn(SSkipList* pSkipList, void* filterInfo, SArray* res) { + SSkipListIterator* iter = NULL; + char *startVal = NULL; + int32_t order = 0; + int32_t inRange = 0; + int32_t flag = 0; + bool all = false; + int8_t *addToResult = NULL; + + filterGetIndexedColumnInfo(filterInfo, &startVal, &order, &flag); + + tsdbDebug("filter index column start, order:%d, flag:%d", order, flag); + + while (order) { + if (FILTER_GET_FLAG(order, TSDB_ORDER_ASC)) { + iter = tSkipListCreateIterFromVal(pSkipList, startVal, pSkipList->type, TSDB_ORDER_ASC); + FILTER_CLR_FLAG(order, TSDB_ORDER_ASC); + } else { + iter = tSkipListCreateIterFromVal(pSkipList, startVal, pSkipList->type, TSDB_ORDER_DESC); + FILTER_CLR_FLAG(order, TSDB_ORDER_DESC); + } + + while (tSkipListIterNext(iter)) { + SSkipListNode *pNode = tSkipListIterGet(iter); + + if (inRange == 0 || !FILTER_GET_FLAG(flag, FI_ACTION_NO_NEED)) { + tsdbDebug("filter index column, filter it"); + filterSetColFieldData(filterInfo, pNode, tsdbGetTagDataFromId); + all = filterExecute(filterInfo, 1, &addToResult, NULL, 0); + } + + char *pData = SL_GET_NODE_DATA(pNode); + + tsdbDebug("filter index column, table:%s, result:%d", ((STable *)pData)->name->data, all); + + if (all || (addToResult && *addToResult)) { + STableKeyInfo info = {.pTable = (void*)pData, .lastKey = TSKEY_INITIAL_VAL}; + taosArrayPush(res, &info); + inRange = 1; + } else if (inRange){ + break; + } + } + + inRange = 0; + + tfree(addToResult); + tSkipListDestroyIter(iter); + } + + tsdbDebug("filter index column end"); +} + +static void queryIndexlessColumn(SSkipList* pSkipList, void* filterInfo, SArray* res) { + SSkipListIterator* iter = tSkipListCreateIter(pSkipList); + int8_t *addToResult = NULL; + + while (tSkipListIterNext(iter)) { + + SSkipListNode *pNode = tSkipListIterGet(iter); + + filterSetColFieldData(filterInfo, pNode, tsdbGetTagDataFromId); + + char *pData = SL_GET_NODE_DATA(pNode); + + bool all = filterExecute(filterInfo, 1, &addToResult, NULL, 0); + + if (all || (addToResult && *addToResult)) { + STableKeyInfo info = {.pTable = (void*)pData, .lastKey = TSKEY_INITIAL_VAL}; + taosArrayPush(res, &info); + } + } + + tfree(addToResult); + + tSkipListDestroyIter(iter); +} + +static FORCE_INLINE int32_t tsdbGetJsonTagDataFromId(void *param, int32_t id, char* name, void **data) { + JsonMapValue* jsonMapV = (JsonMapValue*)(param); + STable* pTable = (STable*)(jsonMapV->table); + + if (id == TSDB_TBNAME_COLUMN_INDEX) { + *data = TABLE_NAME(pTable); + } else { + void* jsonData = tsdbGetJsonTagValue(pTable, name, TSDB_MAX_JSON_KEY_MD5_LEN, NULL); + // jsonData == NULL for ? operation + // if(jsonData != NULL) jsonData += CHAR_BYTES; // jump type + *data = jsonData; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t queryByJsonTag(STable* pTable, void* filterInfo, SArray* res){ + // get all table in fields, and dumplicate it + SArray* tabList = NULL; + bool needQueryAll = false; + SFilterInfo* info = (SFilterInfo*)filterInfo; + for (uint16_t i = 0; i < info->fields[FLD_TYPE_COLUMN].num; ++i) { + SFilterField* fi = &info->fields[FLD_TYPE_COLUMN].fields[i]; + SSchema* sch = fi->desc; + if (sch->colId == TSDB_TBNAME_COLUMN_INDEX) { + tabList = taosArrayInit(32, sizeof(JsonMapValue)); + getAllTableList(pTable, tabList); // query all table + needQueryAll = true; + break; + } + } + for (uint16_t i = 0; i < info->unitNum; ++i) { // is null operation need query all table + SFilterUnit* unit = &info->units[i]; + if (unit->compare.optr == TSDB_RELATION_ISNULL) { + tabList = taosArrayInit(32, sizeof(JsonMapValue)); + getAllTableList(pTable, tabList); // query all table + needQueryAll = true; + break; + } + } + + for (uint16_t i = 0; i < info->fields[FLD_TYPE_COLUMN].num; ++i) { + if (needQueryAll) break; // query all table + SFilterField* fi = &info->fields[FLD_TYPE_COLUMN].fields[i]; + SSchema* sch = fi->desc; + char* key = sch->name; + + SArray** data = (SArray**)taosHashGet(pTable->jsonKeyMap, key, TSDB_MAX_JSON_KEY_MD5_LEN); + if(data == NULL) continue; + if(tabList == NULL) { + tabList = taosArrayDup(*data); + }else{ + for(int j = 0; j < taosArrayGetSize(*data); j++){ + void* element = taosArrayGet(*data, j); + void* p = taosArraySearch(tabList, element, tsdbCompareJsonMapValue, TD_EQ); + if (p == NULL) { + p = taosArraySearch(tabList, element, tsdbCompareJsonMapValue, TD_GE); + if(p == NULL){ + taosArrayPush(tabList, element); + }else{ + taosArrayInsert(tabList, TARRAY_ELEM_IDX(tabList, p), element); + } + } + } + } + } + if(tabList == NULL){ + tsdbError("json key not exist, no candidate table"); + return TSDB_CODE_SUCCESS; + } + size_t size = taosArrayGetSize(tabList); + int8_t *addToResult = NULL; + for(int i = 0; i < size; i++){ + JsonMapValue* data = taosArrayGet(tabList, i); + filterSetJsonColFieldData(filterInfo, data, tsdbGetJsonTagDataFromId); + bool all = filterExecute(filterInfo, 1, &addToResult, NULL, 0); + + if (all || (addToResult && *addToResult)) { + STableKeyInfo kInfo = {.pTable = (void*)(data->table), .lastKey = TSKEY_INITIAL_VAL}; + taosArrayPush(res, &kInfo); + } + } + tfree(addToResult); + taosArrayDestroy(&tabList); + return TSDB_CODE_SUCCESS; +} + +static int32_t tsdbQueryTableList(STable* pTable, SArray* pRes, void* filterInfo) { + STSchema* pTSSchema = pTable->tagSchema; + + if(pTSSchema->columns->type == TSDB_DATA_TYPE_JSON){ + return queryByJsonTag(pTable, filterInfo, pRes); + }else{ + bool indexQuery = false; + SSkipList *pSkipList = pTable->pIndex; + + filterIsIndexedColumnQuery(filterInfo, pTSSchema->columns->colId, &indexQuery); + + if (indexQuery) { + queryIndexedColumn(pSkipList, filterInfo, pRes); + } else { + queryIndexlessColumn(pSkipList, filterInfo, pRes); + } + } + + return TSDB_CODE_SUCCESS; +} + +void* getJsonTagValueElment(void* data, char* key, int32_t keyLen, char* dst, int16_t bytes){ + char keyMd5[TSDB_MAX_JSON_KEY_MD5_LEN] = {0}; + jsonKeyMd5(key, keyLen, keyMd5); + + void* result = tsdbGetJsonTagValue(data, keyMd5, TSDB_MAX_JSON_KEY_MD5_LEN, NULL); + if (result == NULL){ // json key no result + if(!dst) return NULL; + *dst = TSDB_DATA_TYPE_JSON; + setNull(dst + CHAR_BYTES, TSDB_DATA_TYPE_JSON, 0); + return dst; + } + + char* realData = POINTER_SHIFT(result, CHAR_BYTES); + if(*(char*)result == TSDB_DATA_TYPE_NCHAR || *(char*)result == TSDB_DATA_TYPE_BINARY) { + assert(varDataTLen(realData) < bytes); + if(!dst) return result; + memcpy(dst, result, CHAR_BYTES + varDataTLen(realData)); + return dst; + }else if (*(char*)result == TSDB_DATA_TYPE_DOUBLE || *(char*)result == TSDB_DATA_TYPE_BIGINT) { + if(!dst) return result; + memcpy(dst, result, CHAR_BYTES + LONG_BYTES); + return dst; + }else if (*(char*)result == TSDB_DATA_TYPE_BOOL) { + if(!dst) return result; + memcpy(dst, result, CHAR_BYTES + CHAR_BYTES); + return dst; + }else { + assert(0); + } + return result; +} + +void getJsonTagValueAll(void* data, void* dst, int16_t bytes) { + char* json = parseTagDatatoJson(data); + char* tagData = POINTER_SHIFT(dst, CHAR_BYTES); + *(char*)dst = TSDB_DATA_TYPE_JSON; + if(json == NULL){ + setNull(tagData, TSDB_DATA_TYPE_JSON, 0); + return; + } + + int32_t length = 0; + if(!taosMbsToUcs4(json, strlen(json), varDataVal(tagData), bytes - VARSTR_HEADER_SIZE - CHAR_BYTES, &length)){ + tsdbError("getJsonTagValueAll mbstoucs4 error! length:%d", length); + } + varDataSetLen(tagData, length); + assert(varDataTLen(tagData) <= bytes); + tfree(json); +} + +char* parseTagDatatoJson(void *p){ + char* string = NULL; + cJSON *json = cJSON_CreateObject(); + if (json == NULL) + { + goto end; + } + + int16_t nCols = kvRowNCols(p); + ASSERT(nCols%2 == 1); + char tagJsonKey[TSDB_MAX_JSON_KEY_LEN + 1] = {0}; + for (int j = 0; j < nCols; ++j) { + SColIdx * pColIdx = kvRowColIdxAt(p, j); + void* val = (kvRowColVal(p, pColIdx)); + if (j == 0){ + int8_t jsonPlaceHolder = *(int8_t*)val; + ASSERT(jsonPlaceHolder == TSDB_DATA_JSON_PLACEHOLDER); + continue; + } + if(j == 1){ + uint32_t jsonNULL = *(uint32_t*)(varDataVal(val)); + ASSERT(jsonNULL == TSDB_DATA_JSON_NULL); + continue; + } + if (j == 2){ + if(*(uint32_t*)(varDataVal(val + CHAR_BYTES)) == TSDB_DATA_JSON_NULL) goto end; + continue; + } + if (j%2 == 1) { // json key encode by binary + ASSERT(varDataLen(val) <= TSDB_MAX_JSON_KEY_LEN); + memset(tagJsonKey, 0, sizeof(tagJsonKey)); + memcpy(tagJsonKey, varDataVal(val), varDataLen(val)); + }else{ // json value + char* realData = POINTER_SHIFT(val, CHAR_BYTES); + char type = *(char*)val; + if(type == TSDB_DATA_TYPE_BINARY) { + assert(*(uint32_t*)varDataVal(realData) == TSDB_DATA_JSON_null); // json null value + assert(varDataLen(realData) == INT_BYTES); + cJSON* value = cJSON_CreateNull(); + if (value == NULL) + { + goto end; + } + cJSON_AddItemToObject(json, tagJsonKey, value); + }else if(type == TSDB_DATA_TYPE_NCHAR) { + cJSON* value = NULL; + if (varDataLen(realData) > 0){ + char *tagJsonValue = calloc(varDataLen(realData), 1); + int32_t length = taosUcs4ToMbs(varDataVal(realData), varDataLen(realData), tagJsonValue); + if (length < 0) { + tsdbError("charset:%s to %s. val:%s convert json value failed.", DEFAULT_UNICODE_ENCODEC, tsCharset, + (char*)val); + free(tagJsonValue); + goto end; + } + value = cJSON_CreateString(tagJsonValue); + free(tagJsonValue); + if (value == NULL) + { + goto end; + } + }else if(varDataLen(realData) == 0){ + value = cJSON_CreateString(""); + }else{ + assert(0); + } + + cJSON_AddItemToObject(json, tagJsonKey, value); + }else if(type == TSDB_DATA_TYPE_DOUBLE){ + double jsonVd = *(double*)(realData); + cJSON* value = cJSON_CreateNumber(jsonVd); + if (value == NULL) + { + goto end; + } + cJSON_AddItemToObject(json, tagJsonKey, value); + }else if(type == TSDB_DATA_TYPE_BIGINT){ + int64_t jsonVd = *(int64_t*)(realData); + cJSON* value = cJSON_CreateNumber((double)jsonVd); + if (value == NULL) + { + goto end; + } + cJSON_AddItemToObject(json, tagJsonKey, value); + }else if (type == TSDB_DATA_TYPE_BOOL) { + char jsonVd = *(char*)(realData); + cJSON* value = cJSON_CreateBool(jsonVd); + if (value == NULL) + { + goto end; + } + cJSON_AddItemToObject(json, tagJsonKey, value); + } + else{ + tsdbError("unsupportted json value"); + } + } + } + string = cJSON_PrintUnformatted(json); +end: + cJSON_Delete(json); + return string; +} + +// obtain queryHandle attribute +int64_t tsdbSkipOffset(TsdbQueryHandleT queryHandle) { + STsdbQueryHandle* pQueryHandle = (STsdbQueryHandle*)queryHandle; + if (pQueryHandle) { + return pQueryHandle->srows; + } + return 0; +} \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/src/tsdbReadImpl.c b/source/dnode/vnode/tsdb2/src/tsdbReadImpl.c new file mode 100644 index 0000000000..f2678c627f --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbReadImpl.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbint.h" + +#define TSDB_KEY_COL_OFFSET 0 + +static void tsdbResetReadTable(SReadH *pReadh); +static void tsdbResetReadFile(SReadH *pReadh); +static int tsdbLoadBlockDataImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDataCols); +static int tsdbCheckAndDecodeColumnData(SDataCol *pDataCol, void *content, int32_t len, int8_t comp, int numOfRows, + int maxPoints, char *buffer, int bufferSize); +static int tsdbLoadBlockDataColsImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDataCols, int16_t *colIds, + int numOfColIds); +static int tsdbLoadColData(SReadH *pReadh, SDFile *pDFile, SBlock *pBlock, SBlockCol *pBlockCol, SDataCol *pDataCol); +static int tsdbLoadBlockStatisFromDFile(SReadH *pReadh, SBlock *pBlock); +static int tsdbLoadBlockStatisFromAggr(SReadH *pReadh, SBlock *pBlock); + +int tsdbInitReadH(SReadH *pReadh, STsdbRepo *pRepo) { + ASSERT(pReadh != NULL && pRepo != NULL); + + STsdbCfg *pCfg = REPO_CFG(pRepo); + + memset((void *)pReadh, 0, sizeof(*pReadh)); + pReadh->pRepo = pRepo; + + TSDB_FSET_SET_CLOSED(TSDB_READ_FSET(pReadh)); + + pReadh->aBlkIdx = taosArrayInit(1024, sizeof(SBlockIdx)); + if (pReadh->aBlkIdx == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + pReadh->pDCols[0] = tdNewDataCols(0, pCfg->maxRowsPerFileBlock); + if (pReadh->pDCols[0] == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyReadH(pReadh); + return -1; + } + + pReadh->pDCols[1] = tdNewDataCols(0, pCfg->maxRowsPerFileBlock); + if (pReadh->pDCols[1] == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + tsdbDestroyReadH(pReadh); + return -1; + } + + return 0; +} + +void tsdbDestroyReadH(SReadH *pReadh) { + if (pReadh == NULL) return; + pReadh->pExBuf = taosTZfree(pReadh->pExBuf); + pReadh->pCBuf = taosTZfree(pReadh->pCBuf); + pReadh->pBuf = taosTZfree(pReadh->pBuf); + pReadh->pDCols[0] = tdFreeDataCols(pReadh->pDCols[0]); + pReadh->pDCols[1] = tdFreeDataCols(pReadh->pDCols[1]); + pReadh->pAggrBlkData = taosTZfree(pReadh->pAggrBlkData); + pReadh->pBlkData = taosTZfree(pReadh->pBlkData); + pReadh->pBlkInfo = taosTZfree(pReadh->pBlkInfo); + pReadh->cidx = 0; + pReadh->pBlkIdx = NULL; + pReadh->pTable = NULL; + pReadh->aBlkIdx = taosArrayDestroy(&pReadh->aBlkIdx); + tsdbCloseDFileSet(TSDB_READ_FSET(pReadh)); + pReadh->pRepo = NULL; +} + +int tsdbSetAndOpenReadFSet(SReadH *pReadh, SDFileSet *pSet) { + ASSERT(pSet != NULL); + tsdbResetReadFile(pReadh); + + pReadh->rSet = *pSet; + TSDB_FSET_SET_CLOSED(TSDB_READ_FSET(pReadh)); + if (tsdbOpenDFileSet(TSDB_READ_FSET(pReadh), O_RDONLY) < 0) { + tsdbError("vgId:%d failed to open file set %d since %s", TSDB_READ_REPO_ID(pReadh), TSDB_FSET_FID(pSet), + tstrerror(terrno)); + return -1; + } + + return 0; +} + +void tsdbCloseAndUnsetFSet(SReadH *pReadh) { tsdbResetReadFile(pReadh); } + +int tsdbLoadBlockIdx(SReadH *pReadh) { + SDFile * pHeadf = TSDB_READ_HEAD_FILE(pReadh); + SBlockIdx blkIdx; + + ASSERT(taosArrayGetSize(pReadh->aBlkIdx) == 0); + + // No data at all, just return + if (pHeadf->info.offset <= 0) return 0; + + if (tsdbSeekDFile(pHeadf, pHeadf->info.offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load SBlockIdx part while seek file %s since %s, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pHeadf->info.offset, + pHeadf->info.len); + return -1; + } + + if (tsdbMakeRoom((void **)(&TSDB_READ_BUF(pReadh)), pHeadf->info.len) < 0) return -1; + + int64_t nread = tsdbReadDFile(pHeadf, TSDB_READ_BUF(pReadh), pHeadf->info.len); + if (nread < 0) { + tsdbError("vgId:%d failed to load SBlockIdx part while read file %s since %s, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pHeadf->info.offset, + pHeadf->info.len); + return -1; + } + + if (nread < pHeadf->info.len) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockIdx part in file %s is corrupted, offset:%u expected bytes:%u read bytes: %" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pHeadf->info.offset, pHeadf->info.len, nread); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)TSDB_READ_BUF(pReadh), pHeadf->info.len)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockIdx part in file %s is corrupted since wrong checksum, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pHeadf->info.offset, pHeadf->info.len); + return -1; + } + + void *ptr = TSDB_READ_BUF(pReadh); + int tsize = 0; + while (POINTER_DISTANCE(ptr, TSDB_READ_BUF(pReadh)) < (pHeadf->info.len - sizeof(TSCKSUM))) { + ptr = tsdbDecodeSBlockIdx(ptr, &blkIdx); + ASSERT(ptr != NULL); + + if (taosArrayPush(pReadh->aBlkIdx, (void *)(&blkIdx)) == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + tsize++; + ASSERT(tsize == 1 || ((SBlockIdx *)taosArrayGet(pReadh->aBlkIdx, tsize - 2))->tid < + ((SBlockIdx *)taosArrayGet(pReadh->aBlkIdx, tsize - 1))->tid); + } + + return 0; +} + +int tsdbSetReadTable(SReadH *pReadh, STable *pTable) { + STSchema *pSchema = tsdbGetTableSchemaImpl(pTable, false, false, -1, -1); + + pReadh->pTable = pTable; + + if (tdInitDataCols(pReadh->pDCols[0], pSchema) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + if (tdInitDataCols(pReadh->pDCols[1], pSchema) < 0) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + + size_t size = taosArrayGetSize(pReadh->aBlkIdx); + if (size > 0) { + while (true) { + if (pReadh->cidx >= size) { + pReadh->pBlkIdx = NULL; + break; + } + + SBlockIdx *pBlkIdx = taosArrayGet(pReadh->aBlkIdx, pReadh->cidx); + if (pBlkIdx->tid == TABLE_TID(pTable)) { + if (pBlkIdx->uid == TABLE_UID(pTable)) { + pReadh->pBlkIdx = pBlkIdx; + } else { + pReadh->pBlkIdx = NULL; + } + pReadh->cidx++; + break; + } else if (pBlkIdx->tid > TABLE_TID(pTable)) { + pReadh->pBlkIdx = NULL; + break; + } else { + pReadh->cidx++; + } + } + } else { + pReadh->pBlkIdx = NULL; + } + + return 0; +} + +#if 0 +int tsdbLoadBlockInfo(SReadH *pReadh, void *pTarget) { + ASSERT(pReadh->pBlkIdx != NULL); + + SDFile * pHeadf = TSDB_READ_HEAD_FILE(pReadh); + SBlockIdx *pBlkIdx = pReadh->pBlkIdx; + + if (tsdbSeekDFile(pHeadf, pBlkIdx->offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load SBlockInfo part while seek file %s since %s, offset:%u len:%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + if (tsdbMakeRoom((void **)(&(pReadh->pBlkInfo)), pBlkIdx->len) < 0) return -1; + + int64_t nread = tsdbReadDFile(pHeadf, (void *)(pReadh->pBlkInfo), pBlkIdx->len); + if (nread < 0) { + tsdbError("vgId:%d failed to load SBlockInfo part while read file %s since %s, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + if (nread < pBlkIdx->len) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockInfo part in file %s is corrupted, offset:%u expected bytes:%u read bytes:%" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pBlkIdx->offset, pBlkIdx->len, nread); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)(pReadh->pBlkInfo), pBlkIdx->len)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockInfo part in file %s is corrupted since wrong checksum, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + ASSERT(pBlkIdx->tid == pReadh->pBlkInfo->tid && pBlkIdx->uid == pReadh->pBlkInfo->uid); + + if (pTarget) { + memcpy(pTarget, (void *)(pReadh->pBlkInfo), pBlkIdx->len); + } + + return 0; +} +#endif + +static FORCE_INLINE int32_t tsdbGetSBlockVer(int32_t fver) { + switch (fver) { + case TSDB_FS_VER_0: + return TSDB_SBLK_VER_0; + case TSDB_FS_VER_1: + return TSDB_SBLK_VER_1; + default: + return SBlockVerLatest; + } +} + +static FORCE_INLINE size_t tsdbSizeOfSBlock(int32_t sBlkVer) { + switch (sBlkVer) { + case TSDB_SBLK_VER_0: + return sizeof(SBlockV0); + case TSDB_SBLK_VER_1: + return sizeof(SBlockV1); + default: + return sizeof(SBlock); + } +} + +static int tsdbSBlkInfoRefactor(SDFile *pHeadf, SBlockInfo **pDstBlkInfo, SBlockIdx *pBlkIdx, uint32_t *dstBlkInfoLen) { + int sBlkVer = tsdbGetSBlockVer(pHeadf->info.fver); + if (sBlkVer > TSDB_SBLK_VER_0) { + *dstBlkInfoLen = pBlkIdx->len; + return TSDB_CODE_SUCCESS; + } + size_t originBlkSize = tsdbSizeOfSBlock(sBlkVer); + size_t nBlks = (pBlkIdx->len - sizeof(SBlockInfo)) / originBlkSize; + + *dstBlkInfoLen = (uint32_t)(sizeof(SBlockInfo) + nBlks * sizeof(SBlock)); + + if (pBlkIdx->len == *dstBlkInfoLen) { + return TSDB_CODE_SUCCESS; + } + + ASSERT(*dstBlkInfoLen >= pBlkIdx->len); + + SBlockInfo *tmpBlkInfo = NULL; + if (tsdbMakeRoom((void **)(&tmpBlkInfo), *dstBlkInfoLen) < 0) return -1; + memset(tmpBlkInfo, 0, *dstBlkInfoLen); // the blkVer is set to 0 + memcpy(tmpBlkInfo, *pDstBlkInfo, sizeof(SBlockInfo)); // copy header + uint32_t nSubBlks = 0; + for (int i = 0; i < nBlks; ++i) { + SBlock *tmpBlk = tmpBlkInfo->blocks + i; + memcpy(tmpBlk, POINTER_SHIFT((*pDstBlkInfo)->blocks, i * originBlkSize), originBlkSize); + if (i < pBlkIdx->numOfBlocks) { // super blocks + if (tmpBlk->numOfSubBlocks > 1) { // has sub blocks + tmpBlk->offset = sizeof(SBlockInfo) + (pBlkIdx->numOfBlocks + nSubBlks) * sizeof(SBlock); + nSubBlks += tmpBlk->numOfSubBlocks; + } + } + // TODO: update the fields if the SBlock definition change later + } + + taosTZfree(*pDstBlkInfo); + *pDstBlkInfo = tmpBlkInfo; + + return TSDB_CODE_SUCCESS; +} + +int tsdbLoadBlockInfo(SReadH *pReadh, void **pTarget, uint32_t *extendedLen) { + ASSERT(pReadh->pBlkIdx != NULL); + + SDFile * pHeadf = TSDB_READ_HEAD_FILE(pReadh); + SBlockIdx * pBlkIdx = pReadh->pBlkIdx; + + if (tsdbSeekDFile(pHeadf, pBlkIdx->offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load SBlockInfo part while seek file %s since %s, offset:%u len:%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + if (tsdbMakeRoom((void **)(&pReadh->pBlkInfo), pBlkIdx->len) < 0) return -1; + + int64_t nread = tsdbReadDFile(pHeadf, (void *)(pReadh->pBlkInfo), pBlkIdx->len); + if (nread < 0) { + tsdbError("vgId:%d failed to load SBlockInfo part while read file %s since %s, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), tstrerror(terrno), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + if (nread < pBlkIdx->len) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockInfo part in file %s is corrupted, offset:%u expected bytes:%u read bytes:%" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pBlkIdx->offset, pBlkIdx->len, nread); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)(pReadh->pBlkInfo), pBlkIdx->len)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d SBlockInfo part in file %s is corrupted since wrong checksum, offset:%u len :%u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pHeadf), pBlkIdx->offset, pBlkIdx->len); + return -1; + } + + ASSERT(pBlkIdx->tid == pReadh->pBlkInfo->tid && pBlkIdx->uid == pReadh->pBlkInfo->uid); + + uint32_t dstBlkInfoLen = 0; + if (tsdbSBlkInfoRefactor(pHeadf, &(pReadh->pBlkInfo), pBlkIdx, &dstBlkInfoLen) < 0) { + return -1; + } + + if (extendedLen != NULL) { + if (pTarget != NULL) { + if (*extendedLen < dstBlkInfoLen) { + char *t = realloc(*pTarget, dstBlkInfoLen); + if (t == NULL) { + terrno = TSDB_CODE_TDB_OUT_OF_MEMORY; + return -1; + } + *pTarget = t; + } + memcpy(*pTarget, (void *)(pReadh->pBlkInfo), dstBlkInfoLen); + } + *extendedLen = dstBlkInfoLen; + } + + return TSDB_CODE_SUCCESS; +} + +int tsdbLoadBlockData(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo) { + ASSERT(pBlock->numOfSubBlocks > 0); + int8_t update = pReadh->pRepo->config.update; + + SBlock *iBlock = pBlock; + if (pBlock->numOfSubBlocks > 1) { + if (pBlkInfo) { + iBlock = (SBlock *)POINTER_SHIFT(pBlkInfo, pBlock->offset); + } else { + iBlock = (SBlock *)POINTER_SHIFT(pReadh->pBlkInfo, pBlock->offset); + } + } + + if (tsdbLoadBlockDataImpl(pReadh, iBlock, pReadh->pDCols[0]) < 0) return -1; + for (int i = 1; i < pBlock->numOfSubBlocks; i++) { + iBlock++; + if (tsdbLoadBlockDataImpl(pReadh, iBlock, pReadh->pDCols[1]) < 0) return -1; + if (tdMergeDataCols(pReadh->pDCols[0], pReadh->pDCols[1], pReadh->pDCols[1]->numOfRows, NULL, update != TD_ROW_PARTIAL_UPDATE) < 0) return -1; + } + + ASSERT(pReadh->pDCols[0]->numOfRows == pBlock->numOfRows); + ASSERT(dataColsKeyFirst(pReadh->pDCols[0]) == pBlock->keyFirst); + ASSERT(dataColsKeyLast(pReadh->pDCols[0]) == pBlock->keyLast); + + return 0; +} + +int tsdbLoadBlockDataCols(SReadH *pReadh, SBlock *pBlock, SBlockInfo *pBlkInfo, int16_t *colIds, int numOfColsIds) { + ASSERT(pBlock->numOfSubBlocks > 0); + int8_t update = pReadh->pRepo->config.update; + + SBlock *iBlock = pBlock; + if (pBlock->numOfSubBlocks > 1) { + if (pBlkInfo) { + iBlock = POINTER_SHIFT(pBlkInfo, pBlock->offset); + } else { + iBlock = POINTER_SHIFT(pReadh->pBlkInfo, pBlock->offset); + } + } + + if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[0], colIds, numOfColsIds) < 0) return -1; + for (int i = 1; i < pBlock->numOfSubBlocks; i++) { + iBlock++; + if (tsdbLoadBlockDataColsImpl(pReadh, iBlock, pReadh->pDCols[1], colIds, numOfColsIds) < 0) return -1; + if (tdMergeDataCols(pReadh->pDCols[0], pReadh->pDCols[1], pReadh->pDCols[1]->numOfRows, NULL, update != TD_ROW_PARTIAL_UPDATE) < 0) return -1; + } + + ASSERT(pReadh->pDCols[0]->numOfRows == pBlock->numOfRows); + ASSERT(dataColsKeyFirst(pReadh->pDCols[0]) == pBlock->keyFirst); + ASSERT(dataColsKeyLast(pReadh->pDCols[0]) == pBlock->keyLast); + + return 0; +} + +static int tsdbLoadBlockStatisFromDFile(SReadH *pReadh, SBlock *pBlock) { + SDFile *pDFile = (pBlock->last) ? TSDB_READ_LAST_FILE(pReadh) : TSDB_READ_DATA_FILE(pReadh); + if (tsdbSeekDFile(pDFile, pBlock->offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load block statis part while seek file %s to offset %" PRId64 " since %s", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, tstrerror(terrno)); + return -1; + } + + size_t size = tsdbBlockStatisSize(pBlock->numOfCols, (uint32_t)pBlock->blkVer); + if (tsdbMakeRoom((void **)(&(pReadh->pBlkData)), size) < 0) return -1; + + int64_t nread = tsdbReadDFile(pDFile, (void *)(pReadh->pBlkData), size); + if (nread < 0) { + tsdbError("vgId:%d failed to load block statis part while read file %s since %s, offset:%" PRId64 " len :%" PRIzu, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), tstrerror(terrno), (int64_t)pBlock->offset, size); + return -1; + } + + if (nread < size) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block statis part in file %s is corrupted, offset:%" PRId64 " expected bytes:%" PRIzu + " read bytes: %" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, size, nread); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)(pReadh->pBlkData), (uint32_t)size)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block statis part in file %s is corrupted since wrong checksum, offset:%" PRId64 " len :%" PRIzu, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, size); + return -1; + } + return 0; +} + +static int tsdbLoadBlockStatisFromAggr(SReadH *pReadh, SBlock *pBlock) { + ASSERT((pBlock->blkVer > TSDB_SBLK_VER_0) && (pBlock->aggrStat)); // TODO: remove after pass all the test + SDFile *pDFileAggr = pBlock->last ? TSDB_READ_SMAL_FILE(pReadh) : TSDB_READ_SMAD_FILE(pReadh); + + if (tsdbSeekDFile(pDFileAggr, pBlock->aggrOffset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load block aggr part while seek file %s to offset %" PRIu64 " since %s", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFileAggr), (uint64_t)pBlock->aggrOffset, + tstrerror(terrno)); + return -1; + } + + size_t sizeAggr = tsdbBlockAggrSize(pBlock->numOfCols, (uint32_t)pBlock->blkVer); + if (tsdbMakeRoom((void **)(&(pReadh->pAggrBlkData)), sizeAggr) < 0) return -1; + + int64_t nreadAggr = tsdbReadDFile(pDFileAggr, (void *)(pReadh->pAggrBlkData), sizeAggr); + if (nreadAggr < 0) { + tsdbError("vgId:%d failed to load block aggr part while read file %s since %s, offset:%" PRIu64 " len :%" PRIzu, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFileAggr), tstrerror(terrno), + (uint64_t)pBlock->aggrOffset, sizeAggr); + return -1; + } + + if (nreadAggr < sizeAggr) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block aggr part in file %s is corrupted, offset:%" PRIu64 " expected bytes:%" PRIzu + " read bytes: %" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFileAggr), (uint64_t)pBlock->aggrOffset, sizeAggr, + nreadAggr); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)(pReadh->pAggrBlkData), (uint32_t)sizeAggr)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block aggr part in file %s is corrupted since wrong checksum, offset:%" PRIu64 " len :%" PRIzu, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFileAggr), (uint64_t)pBlock->aggrOffset, sizeAggr); + return -1; + } + return 0; +} + +int tsdbLoadBlockStatis(SReadH *pReadh, SBlock *pBlock) { + ASSERT(pBlock->numOfSubBlocks <= 1); + + if (pBlock->blkVer > TSDB_SBLK_VER_0) { + if (pBlock->aggrStat) { + return tsdbLoadBlockStatisFromAggr(pReadh, pBlock); + } + return TSDB_STATIS_NONE; + } + return tsdbLoadBlockStatisFromDFile(pReadh, pBlock); +} + +int tsdbLoadBlockOffset(SReadH *pReadh, SBlock *pBlock) { + ASSERT(pBlock->numOfSubBlocks <= 1); + return tsdbLoadBlockStatisFromDFile(pReadh, pBlock); +} + +int tsdbEncodeSBlockIdx(void **buf, SBlockIdx *pIdx) { + int tlen = 0; + + tlen += taosEncodeVariantI32(buf, pIdx->tid); + tlen += taosEncodeVariantU32(buf, pIdx->len); + tlen += taosEncodeVariantU32(buf, pIdx->offset); + tlen += taosEncodeFixedU8(buf, pIdx->hasLast); + tlen += taosEncodeVariantU32(buf, pIdx->numOfBlocks); + tlen += taosEncodeFixedU64(buf, pIdx->uid); + tlen += taosEncodeFixedU64(buf, pIdx->maxKey); + + return tlen; +} + +void *tsdbDecodeSBlockIdx(void *buf, SBlockIdx *pIdx) { + uint8_t hasLast = 0; + uint32_t numOfBlocks = 0; + uint64_t value = 0; + + if ((buf = taosDecodeVariantI32(buf, &(pIdx->tid))) == NULL) return NULL; + if ((buf = taosDecodeVariantU32(buf, &(pIdx->len))) == NULL) return NULL; + if ((buf = taosDecodeVariantU32(buf, &(pIdx->offset))) == NULL) return NULL; + if ((buf = taosDecodeFixedU8(buf, &(hasLast))) == NULL) return NULL; + pIdx->hasLast = hasLast; + if ((buf = taosDecodeVariantU32(buf, &(numOfBlocks))) == NULL) return NULL; + pIdx->numOfBlocks = numOfBlocks; + if ((buf = taosDecodeFixedU64(buf, &value)) == NULL) return NULL; + pIdx->uid = (int64_t)value; + if ((buf = taosDecodeFixedU64(buf, &value)) == NULL) return NULL; + pIdx->maxKey = (TSKEY)value; + + return buf; +} + +void tsdbGetBlockStatis(SReadH *pReadh, SDataStatis *pStatis, int numOfCols, SBlock *pBlock) { + if (pBlock->blkVer == TSDB_SBLK_VER_0) { + SBlockData *pBlockData = pReadh->pBlkData; + + for (int i = 0, j = 0; i < numOfCols;) { + if (j >= pBlockData->numOfCols) { + pStatis[i].numOfNull = -1; + i++; + continue; + } + SBlockColV0 *pSBlkCol = ((SBlockColV0 *)(pBlockData->cols)) + j; + if (pStatis[i].colId == pSBlkCol->colId) { + pStatis[i].sum = pSBlkCol->sum; + pStatis[i].max = pSBlkCol->max; + pStatis[i].min = pSBlkCol->min; + pStatis[i].maxIndex = pSBlkCol->maxIndex; + pStatis[i].minIndex = pSBlkCol->minIndex; + pStatis[i].numOfNull = pSBlkCol->numOfNull; + i++; + j++; + } else if (pStatis[i].colId < pSBlkCol->colId) { + pStatis[i].numOfNull = -1; + i++; + } else { + j++; + } + } + } else if (pBlock->aggrStat) { + SAggrBlkData *pAggrBlkData = pReadh->pAggrBlkData; + + for (int i = 0, j = 0; i < numOfCols;) { + if (j >= pBlock->numOfCols) { + pStatis[i].numOfNull = -1; + i++; + continue; + } + SAggrBlkCol *pAggrBlkCol = ((SAggrBlkCol *)(pAggrBlkData)) + j; + if (pStatis[i].colId == pAggrBlkCol->colId) { + pStatis[i].sum = pAggrBlkCol->sum; + pStatis[i].max = pAggrBlkCol->max; + pStatis[i].min = pAggrBlkCol->min; + pStatis[i].maxIndex = pAggrBlkCol->maxIndex; + pStatis[i].minIndex = pAggrBlkCol->minIndex; + pStatis[i].numOfNull = pAggrBlkCol->numOfNull; + i++; + j++; + } else if (pStatis[i].colId < pAggrBlkCol->colId) { + pStatis[i].numOfNull = -1; + i++; + } else { + j++; + } + } + } +} + +static void tsdbResetReadTable(SReadH *pReadh) { + tdResetDataCols(pReadh->pDCols[0]); + tdResetDataCols(pReadh->pDCols[1]); + pReadh->cidx = 0; + pReadh->pBlkIdx = NULL; + pReadh->pTable = NULL; +} + +static void tsdbResetReadFile(SReadH *pReadh) { + tsdbResetReadTable(pReadh); + taosArrayClear(pReadh->aBlkIdx); + tsdbCloseDFileSet(TSDB_READ_FSET(pReadh)); +} + +static int tsdbLoadBlockDataImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDataCols) { + ASSERT(pBlock->numOfSubBlocks == 0 || pBlock->numOfSubBlocks == 1); + + SDFile *pDFile = (pBlock->last) ? TSDB_READ_LAST_FILE(pReadh) : TSDB_READ_DATA_FILE(pReadh); + + tdResetDataCols(pDataCols); + if (tsdbMakeRoom((void **)(&TSDB_READ_BUF(pReadh)), pBlock->len) < 0) return -1; + + SBlockData *pBlockData = (SBlockData *)TSDB_READ_BUF(pReadh); + + if (tsdbSeekDFile(pDFile, pBlock->offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load block data part while seek file %s to offset %" PRId64 " since %s", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, tstrerror(terrno)); + return -1; + } + + int64_t nread = tsdbReadDFile(pDFile, TSDB_READ_BUF(pReadh), pBlock->len); + if (nread < 0) { + tsdbError("vgId:%d failed to load block data part while read file %s since %s, offset:%" PRId64 " len :%d", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), tstrerror(terrno), (int64_t)pBlock->offset, + pBlock->len); + return -1; + } + + if (nread < pBlock->len) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block data part in file %s is corrupted, offset:%" PRId64 + " expected bytes:%d read bytes: %" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, pBlock->len, nread); + return -1; + } + + int32_t tsize = (int32_t)tsdbBlockStatisSize(pBlock->numOfCols, (uint32_t)pBlock->blkVer); + if (!taosCheckChecksumWhole((uint8_t *)TSDB_READ_BUF(pReadh), tsize)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block statis part in file %s is corrupted since wrong checksum, offset:%" PRId64 " len :%d", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), (int64_t)pBlock->offset, tsize); + return -1; + } + + ASSERT(tsize < pBlock->len); + ASSERT(pBlockData->numOfCols == pBlock->numOfCols); + + pDataCols->numOfRows = pBlock->numOfRows; + + // Recover the data + int ccol = 0; // loop iter for SBlockCol object + int dcol = 0; // loop iter for SDataCols object + SBlockCol blockCol = {0}; + SBlockCol *pBlockCol = &blockCol; + while (dcol < pDataCols->numOfCols) { + SDataCol *pDataCol = &(pDataCols->cols[dcol]); + if (dcol != 0 && ccol >= pBlockData->numOfCols) { + // Set current column as NULL and forward + dataColReset(pDataCol); + dcol++; + continue; + } + + int16_t tcolId = 0; + uint32_t toffset = TSDB_KEY_COL_OFFSET; + int32_t tlen = pBlock->keyLen; + + + if (dcol != 0) { + tsdbGetSBlockCol(pBlock, &pBlockCol, pBlockData->cols, ccol); + tcolId = pBlockCol->colId; + toffset = tsdbGetBlockColOffset(pBlockCol); + tlen = pBlockCol->len; + } else { + ASSERT(pDataCol->colId == tcolId); + } + + if (tcolId == pDataCol->colId) { + if (pBlock->algorithm == TWO_STAGE_COMP) { + int zsize = pDataCol->bytes * pBlock->numOfRows + COMP_OVERFLOW_BYTES; + if (tsdbMakeRoom((void **)(&TSDB_READ_COMP_BUF(pReadh)), zsize) < 0) return -1; + } + + if (tsdbCheckAndDecodeColumnData(pDataCol, POINTER_SHIFT(pBlockData, tsize + toffset), tlen, pBlock->algorithm, + pBlock->numOfRows, pDataCols->maxPoints, TSDB_READ_COMP_BUF(pReadh), + (int)taosTSizeof(TSDB_READ_COMP_BUF(pReadh))) < 0) { + tsdbError("vgId:%d file %s is broken at column %d block offset %" PRId64 " column offset %u", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), tcolId, (int64_t)pBlock->offset, toffset); + return -1; + } + + if (dcol != 0) { + ccol++; + } + dcol++; + } else if (tcolId < pDataCol->colId) { + ccol++; + } else { + // Set current column as NULL and forward + dataColReset(pDataCol); + dcol++; + } + } + + return 0; +} + +static int tsdbCheckAndDecodeColumnData(SDataCol *pDataCol, void *content, int32_t len, int8_t comp, int numOfRows, + int maxPoints, char *buffer, int bufferSize) { + if (!taosCheckChecksumWhole((uint8_t *)content, len)) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return -1; + } + + tdAllocMemForCol(pDataCol, maxPoints); + + // Decode the data + if (comp) { + // Need to decompress + int tlen = (*(tDataTypes[pDataCol->type].decompFunc))(content, len - sizeof(TSCKSUM), numOfRows, pDataCol->pData, + pDataCol->spaceSize, comp, buffer, bufferSize); + if (tlen <= 0) { + tsdbError("Failed to decompress column, file corrupted, len:%d comp:%d numOfRows:%d maxPoints:%d bufferSize:%d", + len, comp, numOfRows, maxPoints, bufferSize); + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + return -1; + } + pDataCol->len = tlen; + } else { + // No need to decompress, just memcpy it + pDataCol->len = len - sizeof(TSCKSUM); + memcpy(pDataCol->pData, content, pDataCol->len); + } + + if (IS_VAR_DATA_TYPE(pDataCol->type)) { + dataColSetOffset(pDataCol, numOfRows); + } + return 0; +} + +static int tsdbLoadBlockDataColsImpl(SReadH *pReadh, SBlock *pBlock, SDataCols *pDataCols, int16_t *colIds, + int numOfColIds) { + ASSERT(pBlock->numOfSubBlocks == 0 || pBlock->numOfSubBlocks == 1); + ASSERT(colIds[0] == 0); + + SDFile * pDFile = (pBlock->last) ? TSDB_READ_LAST_FILE(pReadh) : TSDB_READ_DATA_FILE(pReadh); + SBlockCol blockCol = {0}; + + tdResetDataCols(pDataCols); + + // If only load timestamp column, no need to load SBlockData part + if (numOfColIds > 1 && tsdbLoadBlockOffset(pReadh, pBlock) < 0) return -1; + + pDataCols->numOfRows = pBlock->numOfRows; + + int dcol = 0; + int ccol = 0; + for (int i = 0; i < numOfColIds; i++) { + int16_t colId = colIds[i]; + SDataCol * pDataCol = NULL; + SBlockCol *pBlockCol = NULL; + + while (true) { + if (dcol >= pDataCols->numOfCols) { + pDataCol = NULL; + break; + } + pDataCol = &pDataCols->cols[dcol]; + if (pDataCol->colId > colId) { + pDataCol = NULL; + break; + } else { + dcol++; + if (pDataCol->colId == colId) break; + } + } + + if (pDataCol == NULL) continue; + ASSERT(pDataCol->colId == colId); + + if (colId == 0) { // load the key row + blockCol.colId = colId; + blockCol.len = pBlock->keyLen; + blockCol.type = pDataCol->type; + blockCol.offset = TSDB_KEY_COL_OFFSET; + pBlockCol = &blockCol; + } else { // load non-key rows + while (true) { + if (ccol >= pBlock->numOfCols) { + pBlockCol = NULL; + break; + } + + pBlockCol = &blockCol; + tsdbGetSBlockCol(pBlock, &pBlockCol, pReadh->pBlkData->cols, ccol); + + if (pBlockCol->colId > colId) { + pBlockCol = NULL; + break; + } else { + ccol++; + if (pBlockCol->colId == colId) break; + } + } + + if (pBlockCol == NULL) { + dataColReset(pDataCol); + continue; + } + + ASSERT(pBlockCol->colId == pDataCol->colId); + } + + if (tsdbLoadColData(pReadh, pDFile, pBlock, pBlockCol, pDataCol) < 0) return -1; + } + + return 0; +} + +static int tsdbLoadColData(SReadH *pReadh, SDFile *pDFile, SBlock *pBlock, SBlockCol *pBlockCol, SDataCol *pDataCol) { + ASSERT(pDataCol->colId == pBlockCol->colId); + + STsdbRepo *pRepo = TSDB_READ_REPO(pReadh); + STsdbCfg * pCfg = REPO_CFG(pRepo); + int tsize = pDataCol->bytes * pBlock->numOfRows + COMP_OVERFLOW_BYTES; + + if (tsdbMakeRoom((void **)(&TSDB_READ_BUF(pReadh)), pBlockCol->len) < 0) return -1; + if (tsdbMakeRoom((void **)(&TSDB_READ_COMP_BUF(pReadh)), tsize) < 0) return -1; + + int64_t offset = pBlock->offset + tsdbBlockStatisSize(pBlock->numOfCols, (uint32_t)pBlock->blkVer) + + tsdbGetBlockColOffset(pBlockCol); + if (tsdbSeekDFile(pDFile, offset, SEEK_SET) < 0) { + tsdbError("vgId:%d failed to load block column data while seek file %s to offset %" PRId64 " since %s", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), offset, tstrerror(terrno)); + return -1; + } + + int64_t nread = tsdbReadDFile(pDFile, TSDB_READ_BUF(pReadh), pBlockCol->len); + if (nread < 0) { + tsdbError("vgId:%d failed to load block column data while read file %s since %s, offset:%" PRId64 " len :%d", + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), tstrerror(terrno), offset, pBlockCol->len); + return -1; + } + + if (nread < pBlockCol->len) { + terrno = TSDB_CODE_TDB_FILE_CORRUPTED; + tsdbError("vgId:%d block column data in file %s is corrupted, offset:%" PRId64 " expected bytes:%d" PRIzu + " read bytes: %" PRId64, + TSDB_READ_REPO_ID(pReadh), TSDB_FILE_FULL_NAME(pDFile), offset, pBlockCol->len, nread); + return -1; + } + + if (tsdbCheckAndDecodeColumnData(pDataCol, pReadh->pBuf, pBlockCol->len, pBlock->algorithm, pBlock->numOfRows, + pCfg->maxRowsPerFileBlock, pReadh->pCBuf, (int32_t)taosTSizeof(pReadh->pCBuf)) < 0) { + tsdbError("vgId:%d file %s is broken at column %d offset %" PRId64, REPO_ID(pRepo), TSDB_FILE_FULL_NAME(pDFile), + pBlockCol->colId, offset); + return -1; + } + + return 0; +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbRecover.c b/source/dnode/vnode/tsdb2/src/tsdbRecover.c new file mode 100644 index 0000000000..6dea4a4e57 --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbRecover.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/src/tsdbRowMergeBuf.c b/source/dnode/vnode/tsdb2/src/tsdbRowMergeBuf.c new file mode 100644 index 0000000000..5ce580f70f --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbRowMergeBuf.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbRowMergeBuf.h" +#include "tdataformat.h" + +// row1 has higher priority +SMemRow tsdbMergeTwoRows(SMergeBuf *pBuf, SMemRow row1, SMemRow row2, STSchema *pSchema1, STSchema *pSchema2) { + if(row2 == NULL) return row1; + if(row1 == NULL) return row2; + ASSERT(pSchema1->version == memRowVersion(row1)); + ASSERT(pSchema2->version == memRowVersion(row2)); + + if(tsdbMergeBufMakeSureRoom(pBuf, pSchema1, pSchema2) < 0) { + return NULL; + } + return mergeTwoMemRows(*pBuf, row1, row2, pSchema1, pSchema2); +} diff --git a/source/dnode/vnode/tsdb2/src/tsdbScan.c b/source/dnode/vnode/tsdb2/src/tsdbScan.c new file mode 100644 index 0000000000..382f7b11ae --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbScan.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "tsdbint.h" + +#if 0 +#ifndef _TSDB_PLUGINS + +int tsdbScanFGroup(STsdbScanHandle* pScanHandle, char* rootDir, int fid) { return 0; } + +STsdbScanHandle* tsdbNewScanHandle() { return NULL; } + +void tsdbSetScanLogStream(STsdbScanHandle* pScanHandle, FILE* fLogStream) {} + +int tsdbSetAndOpenScanFile(STsdbScanHandle* pScanHandle, char* rootDir, int fid) { return 0; } + +int tsdbScanSBlockIdx(STsdbScanHandle* pScanHandle) { return 0; } + +int tsdbScanSBlock(STsdbScanHandle* pScanHandle, int idx) { return 0; } + +int tsdbCloseScanFile(STsdbScanHandle* pScanHandle) { return 0; } + +void tsdbFreeScanHandle(STsdbScanHandle* pScanHandle) {} + +#endif +#endif \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/src/tsdbSync.c b/source/dnode/vnode/tsdb2/src/tsdbSync.c new file mode 100644 index 0000000000..0e01cf37bb --- /dev/null +++ b/source/dnode/vnode/tsdb2/src/tsdbSync.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#define _DEFAULT_SOURCE +#include "os.h" +#include "taoserror.h" +#include "tsdbint.h" + +// Sync handle +typedef struct { + STsdbRepo *pRepo; + SRtn rtn; + SOCKET socketFd; + void * pBuf; + bool mfChanged; + SMFile * pmf; + SMFile mf; + SDFileSet df; + SDFileSet *pdf; +} SSyncH; + +#define SYNC_BUFFER(sh) ((sh)->pBuf) + +static void tsdbInitSyncH(SSyncH *pSyncH, STsdbRepo *pRepo, SOCKET socketFd); +static void tsdbDestroySyncH(SSyncH *pSyncH); +static int32_t tsdbSyncSendMeta(SSyncH *pSynch); +static int32_t tsdbSyncRecvMeta(SSyncH *pSynch); +static int32_t tsdbSendMetaInfo(SSyncH *pSynch); +static int32_t tsdbRecvMetaInfo(SSyncH *pSynch); +static int32_t tsdbSendDecision(SSyncH *pSynch, bool toSend); +static int32_t tsdbRecvDecision(SSyncH *pSynch, bool *toSend); +static int32_t tsdbSyncSendDFileSetArray(SSyncH *pSynch); +static int32_t tsdbSyncRecvDFileSetArray(SSyncH *pSynch); +static bool tsdbIsTowFSetSame(SDFileSet *pSet1, SDFileSet *pSet2); +static int32_t tsdbSyncSendDFileSet(SSyncH *pSynch, SDFileSet *pSet); +static int32_t tsdbSendDFileSetInfo(SSyncH *pSynch, SDFileSet *pSet); +static int32_t tsdbRecvDFileSetInfo(SSyncH *pSynch); +static int tsdbReload(STsdbRepo *pRepo, bool isMfChanged); + +int32_t tsdbSyncSend(void *tsdb, SOCKET socketFd) { + STsdbRepo *pRepo = (STsdbRepo *)tsdb; + SSyncH synch = {0}; + + tsdbInitSyncH(&synch, pRepo, socketFd); + // Disable TSDB commit + tsem_wait(&(pRepo->readyToCommit)); + + if (tsdbSyncSendMeta(&synch) < 0) { + tsdbError("vgId:%d, failed to send metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + if (tsdbSyncSendDFileSetArray(&synch) < 0) { + tsdbError("vgId:%d, failed to send filesets since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + // Enable TSDB commit + tsem_post(&(pRepo->readyToCommit)); + tsdbDestroySyncH(&synch); + return 0; + +_err: + tsem_post(&(pRepo->readyToCommit)); + tsdbDestroySyncH(&synch); + return -1; +} + +int32_t tsdbSyncRecv(void *tsdb, SOCKET socketFd) { + STsdbRepo *pRepo = (STsdbRepo *)tsdb; + SSyncH synch = {0}; + + pRepo->state = TSDB_STATE_OK; + + tsdbInitSyncH(&synch, pRepo, socketFd); + tsem_wait(&(pRepo->readyToCommit)); + tsdbStartFSTxn(pRepo, 0, 0); + + if (tsdbSyncRecvMeta(&synch) < 0) { + tsdbError("vgId:%d, failed to recv metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + if (tsdbSyncRecvDFileSetArray(&synch) < 0) { + tsdbError("vgId:%d, failed to recv filesets since %s", REPO_ID(pRepo), tstrerror(terrno)); + goto _err; + } + + tsdbEndFSTxn(pRepo); + tsem_post(&(pRepo->readyToCommit)); + tsdbDestroySyncH(&synch); + + // Reload file change + tsdbReload(pRepo, synch.mfChanged); + + return 0; + +_err: + tsdbEndFSTxnWithError(REPO_FS(pRepo)); + tsem_post(&(pRepo->readyToCommit)); + tsdbDestroySyncH(&synch); + return -1; +} + +static void tsdbInitSyncH(SSyncH *pSyncH, STsdbRepo *pRepo, SOCKET socketFd) { + pSyncH->pRepo = pRepo; + pSyncH->socketFd = socketFd; + tsdbGetRtnSnap(pRepo, &(pSyncH->rtn)); +} + +static void tsdbDestroySyncH(SSyncH *pSyncH) { taosTZfree(pSyncH->pBuf); } + +static int32_t tsdbSyncSendMeta(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + bool toSendMeta = false; + SMFile mf; + + // Send meta info to remote + tsdbInfo("vgId:%d, metainfo will be sent", REPO_ID(pRepo)); + if (tsdbSendMetaInfo(pSynch) < 0) { + tsdbError("vgId:%d, failed to send metainfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (pRepo->fs->cstatus->pmf == NULL) { + // No meta file, not need to wait to retrieve meta file + tsdbInfo("vgId:%d, metafile not exist, no need to send", REPO_ID(pRepo)); + return 0; + } + + if (tsdbRecvDecision(pSynch, &toSendMeta) < 0) { + tsdbError("vgId:%d, failed to recv decision while send meta since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (toSendMeta) { + tsdbInitMFileEx(&mf, pRepo->fs->cstatus->pmf); + if (tsdbOpenMFile(&mf, O_RDONLY) < 0) { + tsdbError("vgId:%d, failed to open file while send metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + int64_t writeLen = mf.info.size; + tsdbInfo("vgId:%d, metafile:%s will be sent, size:%" PRId64, REPO_ID(pRepo), mf.f.aname, writeLen); + + int64_t ret = taosSendFile(pSynch->socketFd, TSDB_FILE_FD(&mf), 0, writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to send metafile since %s, ret:%" PRId64 " writeLen:%" PRId64, REPO_ID(pRepo), + tstrerror(terrno), ret, writeLen); + tsdbCloseMFile(&mf); + return -1; + } + + tsdbCloseMFile(&mf); + tsdbInfo("vgId:%d, metafile is sent", REPO_ID(pRepo)); + } else { + tsdbInfo("vgId:%d, metafile is same, no need to send", REPO_ID(pRepo)); + } + + return 0; +} + +static int32_t tsdbSyncRecvMeta(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + SMFile * pLMFile = pRepo->fs->cstatus->pmf; + + // Recv meta info from remote + if (tsdbRecvMetaInfo(pSynch) < 0) { + tsdbError("vgId:%d, failed to recv metainfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + // No meta file, do nothing (rm local meta file) + if (pSynch->pmf == NULL) { + if (pLMFile == NULL) { + pSynch->mfChanged = false; + } else { + pSynch->mfChanged = true; + } + tsdbInfo("vgId:%d, metafile not exist in remote, no need to recv", REPO_ID(pRepo)); + return 0; + } + + if (pLMFile == NULL || pSynch->pmf->info.size != pLMFile->info.size || + pSynch->pmf->info.magic != pLMFile->info.magic || TSDB_FILE_IS_BAD(pLMFile)) { + // Local has no meta file or has a different meta file, need to copy from remote + pSynch->mfChanged = true; + + if (tsdbSendDecision(pSynch, true) < 0) { + tsdbError("vgId:%d, failed to send decision while recv metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d, metafile will be received", REPO_ID(pRepo)); + + // Recv from remote + SMFile mf; + SDiskID did = {.level = TFS_PRIMARY_LEVEL, .id = TFS_PRIMARY_ID}; + tsdbInitMFile(&mf, did, REPO_ID(pRepo), FS_TXN_VERSION(REPO_FS(pRepo))); + if (tsdbCreateMFile(&mf, false) < 0) { + tsdbError("vgId:%d, failed to create file while recv metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d, metafile:%s is created", REPO_ID(pRepo), mf.f.aname); + + int64_t readLen = pSynch->pmf->info.size; + int64_t ret = taosCopyFds(pSynch->socketFd, TSDB_FILE_FD(&mf), readLen); + if (ret != readLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv metafile since %s, ret:%" PRId64 " readLen:%" PRId64, REPO_ID(pRepo), + tstrerror(terrno), ret, readLen); + tsdbCloseMFile(&mf); + tsdbRemoveMFile(&mf); + return -1; + } + + tsdbInfo("vgId:%d, metafile is received, size:%" PRId64, REPO_ID(pRepo), readLen); + + mf.info = pSynch->pmf->info; + tsdbCloseMFile(&mf); + tsdbUpdateMFile(REPO_FS(pRepo), &mf); + } else { + pSynch->mfChanged = false; + tsdbInfo("vgId:%d, metafile is same, no need to recv", REPO_ID(pRepo)); + if (tsdbSendDecision(pSynch, false) < 0) { + tsdbError("vgId:%d, failed to send decision while recv metafile since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + tsdbUpdateMFile(REPO_FS(pRepo), pLMFile); + } + + return 0; +} + +static int32_t tsdbSendMetaInfo(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + uint32_t tlen = 0; + SMFile * pMFile = pRepo->fs->cstatus->pmf; + + if (pMFile) { + tlen = tlen + tsdbEncodeSMFileEx(NULL, pMFile) + sizeof(TSCKSUM); + } + + if (tsdbMakeRoom((void **)(&SYNC_BUFFER(pSynch)), tlen + sizeof(tlen)) < 0) { + tsdbError("vgId:%d, failed to makeroom while send metainfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + void *ptr = SYNC_BUFFER(pSynch); + taosEncodeFixedU32(&ptr, tlen); + void *tptr = ptr; + if (pMFile) { + tsdbEncodeSMFileEx(&ptr, pMFile); + taosCalcChecksumAppend(0, (uint8_t *)tptr, tlen); + } + + int32_t writeLen = tlen + sizeof(uint32_t); + int32_t ret = taosWriteMsg(pSynch->socketFd, SYNC_BUFFER(pSynch), writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to send metainfo since %s, ret:%d writeLen:%d", REPO_ID(pRepo), tstrerror(terrno), ret, + writeLen); + return -1; + } + + tsdbInfo("vgId:%d, metainfo is sent, tlen:%d, writeLen:%d", REPO_ID(pRepo), tlen, writeLen); + return 0; +} + +static int32_t tsdbRecvMetaInfo(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + uint32_t tlen = 0; + char buf[64] = {0}; + + int32_t readLen = sizeof(uint32_t); + int32_t ret = taosReadMsg(pSynch->socketFd, buf, readLen); + if (ret != readLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv metalen, ret:%d readLen:%d", REPO_ID(pRepo), ret, readLen); + return -1; + } + + taosDecodeFixedU32(buf, &tlen); + + tsdbInfo("vgId:%d, metalen is received, readLen:%d, tlen:%d", REPO_ID(pRepo), readLen, tlen); + if (tlen == 0) { + pSynch->pmf = NULL; + return 0; + } + + if (tsdbMakeRoom((void **)(&SYNC_BUFFER(pSynch)), tlen) < 0) { + tsdbError("vgId:%d, failed to makeroom while recv metainfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + ret = taosReadMsg(pSynch->socketFd, SYNC_BUFFER(pSynch), tlen); + if (ret != tlen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv metainfo, ret:%d tlen:%d", REPO_ID(pRepo), ret, tlen); + return -1; + } + + tsdbInfo("vgId:%d, metainfo is received, tlen:%d", REPO_ID(pRepo), tlen); + if (!taosCheckChecksumWhole((uint8_t *)SYNC_BUFFER(pSynch), tlen)) { + terrno = TSDB_CODE_TDB_MESSED_MSG; + tsdbError("vgId:%d, failed to checksum while recv metainfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + pSynch->pmf = &(pSynch->mf); + tsdbDecodeSMFileEx(SYNC_BUFFER(pSynch), pSynch->pmf); + + return 0; +} + +static int32_t tsdbSendDecision(SSyncH *pSynch, bool toSend) { + STsdbRepo *pRepo = pSynch->pRepo; + uint8_t decision = toSend; + + int32_t writeLen = sizeof(uint8_t); + int32_t ret = taosWriteMsg(pSynch->socketFd, (void *)(&decision), writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to send decison, ret:%d writeLen:%d", REPO_ID(pRepo), ret, writeLen); + return -1; + } + + return 0; +} + +static int32_t tsdbRecvDecision(SSyncH *pSynch, bool *toSend) { + STsdbRepo *pRepo = pSynch->pRepo; + uint8_t decision = 0; + + int32_t readLen = sizeof(uint8_t); + int32_t ret = taosReadMsg(pSynch->socketFd, (void *)(&decision), readLen); + if (ret != readLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv decison, ret:%d readLen:%d", REPO_ID(pRepo), ret, readLen); + return -1; + } + + *toSend = decision; + return 0; +} + +static int32_t tsdbSyncSendDFileSetArray(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + STsdbFS * pfs = REPO_FS(pRepo); + SFSIter fsiter; + SDFileSet *pSet; + + tsdbFSIterInit(&fsiter, pfs, TSDB_FS_ITER_FORWARD); + + do { + pSet = tsdbFSIterNext(&fsiter); + if (tsdbSyncSendDFileSet(pSynch, pSet) < 0) { + tsdbError("vgId:%d, failed to send fileset:%d since %s", REPO_ID(pRepo), pSet ? pSet->fid : -1, + tstrerror(terrno)); + return -1; + } + + // No more file set to send, jut break + if (pSet == NULL) { + tsdbInfo("vgId:%d, no filesets any more", REPO_ID(pRepo)); + break; + } + } while (true); + + return 0; +} + +static int32_t tsdbSyncRecvDFileSetArray(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + STsdbFS * pfs = REPO_FS(pRepo); + SFSIter fsiter; + SDFileSet *pLSet; // Local file set + + tsdbFSIterInit(&fsiter, pfs, TSDB_FS_ITER_FORWARD); + + pLSet = tsdbFSIterNext(&fsiter); + if (tsdbRecvDFileSetInfo(pSynch) < 0) { + tsdbError("vgId:%d, failed to recv fileset since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + while (true) { + if (pLSet == NULL && pSynch->pdf == NULL) { + tsdbInfo("vgId:%d, all filesets is disposed", REPO_ID(pRepo)); + break; + } else { + tsdbInfo("vgId:%d, fileset local:%d remote:%d, will be disposed", REPO_ID(pRepo), pLSet != NULL ? pLSet->fid : -1, + pSynch->pdf != NULL ? pSynch->pdf->fid : -1); + } + + if (pLSet && (pSynch->pdf == NULL || pLSet->fid < pSynch->pdf->fid)) { + // remote not has pLSet->fid set, just remove local (do nothing to remote the fset) + tsdbInfo("vgId:%d, fileset:%d smaller than remote:%d, remove it", REPO_ID(pRepo), pLSet->fid, + pSynch->pdf != NULL ? pSynch->pdf->fid : -1); + pLSet = tsdbFSIterNext(&fsiter); + } else { + if (pLSet && pSynch->pdf && pLSet->fid == pSynch->pdf->fid && tsdbIsTowFSetSame(pLSet, pSynch->pdf) && + tsdbFSetIsOk(pLSet)) { + // Just keep local files and notify remote not to send + tsdbInfo("vgId:%d, fileset:%d is same and no need to recv", REPO_ID(pRepo), pLSet->fid); + + if (tsdbUpdateDFileSet(pfs, pLSet) < 0) { + tsdbError("vgId:%d, failed to update fileset since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (tsdbSendDecision(pSynch, false) < 0) { + tsdbError("vgId:%d, failed to send decision since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } else { + // Need to copy from remote + int fidLevel = tsdbGetFidLevel(pSynch->pdf->fid, &(pSynch->rtn)); + if (fidLevel < 0) { // expired fileset + tsdbInfo("vgId:%d, fileset:%d will be skipped as expired", REPO_ID(pRepo), pSynch->pdf->fid); + if (tsdbSendDecision(pSynch, false) < 0) { + tsdbError("vgId:%d, failed to send decision since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + // Move forward + if (tsdbRecvDFileSetInfo(pSynch) < 0) { + tsdbError("vgId:%d, failed to recv fileset since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + if (pLSet) { + pLSet = tsdbFSIterNext(&fsiter); + } + // Next loop + continue; + } else { + tsdbInfo("vgId:%d, fileset:%d will be received", REPO_ID(pRepo), pSynch->pdf->fid); + // Notify remote to send there file here + if (tsdbSendDecision(pSynch, true) < 0) { + tsdbError("vgId:%d, failed to send decision since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + } + + // Create local files and copy from remote + SDiskID did; + SDFileSet fset; + + tfsAllocDisk(fidLevel, &(did.level), &(did.id)); + if (did.level == TFS_UNDECIDED_LEVEL) { + terrno = TSDB_CODE_TDB_NO_AVAIL_DISK; + tsdbError("vgId:%d, failed allc disk since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + tsdbInitDFileSet(&fset, did, REPO_ID(pRepo), pSynch->pdf->fid, FS_TXN_VERSION(pfs), pSynch->pdf->ver); + + // Create new FSET + if (tsdbCreateDFileSet(&fset, false) < 0) { + tsdbError("vgId:%d, failed to create fileset since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSynch->pdf); ftype++) { + SDFile *pDFile = TSDB_DFILE_IN_SET(&fset, ftype); // local file + SDFile *pRDFile = TSDB_DFILE_IN_SET(pSynch->pdf, ftype); // remote file + + tsdbInfo("vgId:%d, file:%s will be received, osize:%" PRIu64 " rsize:%" PRIu64, REPO_ID(pRepo), + pDFile->f.aname, pDFile->info.size, pRDFile->info.size); + + int64_t writeLen = pRDFile->info.size; + int64_t ret = taosCopyFds(pSynch->socketFd, pDFile->fd, writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv file:%s since %s, ret:%" PRId64 " writeLen:%" PRId64, REPO_ID(pRepo), + pDFile->f.aname, tstrerror(terrno), ret, writeLen); + tsdbCloseDFileSet(&fset); + tsdbRemoveDFileSet(&fset); + return -1; + } + + // Update new file info + pDFile->info = pRDFile->info; + tsdbInfo("vgId:%d, file:%s is received, size:%" PRId64, REPO_ID(pRepo), pDFile->f.aname, writeLen); + } + + tsdbCloseDFileSet(&fset); + if (tsdbUpdateDFileSet(pfs, &fset) < 0) { + tsdbInfo("vgId:%d, fileset:%d failed to update since %s", REPO_ID(pRepo), fset.fid, tstrerror(terrno)); + return -1; + } + + tsdbInfo("vgId:%d, fileset:%d is received", REPO_ID(pRepo), pSynch->pdf->fid); + } + + // Move forward + if (tsdbRecvDFileSetInfo(pSynch) < 0) { + tsdbError("vgId:%d, failed to recv fileset since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + if (pLSet) { + pLSet = tsdbFSIterNext(&fsiter); + } + } + +#if 0 + if (pLSet == NULL) { + // Copy from remote >>>>>>>>>>> + } else { + if (pSynch->pdf == NULL) { + // Remove local file, just ignore ++++++++++++++ + pLSet = tsdbFSIterNext(&fsiter); + } else { + if (pLSet->fid < pSynch->pdf->fid) { + // Remove local file, just ignore ++++++++++++ + pLSet = tsdbFSIterNext(&fsiter); + } else if (pLSet->fid > pSynch->pdf->fid){ + // Copy from remote >>>>>>>>>>>>>> + if (tsdbRecvDFileSetInfo(pSynch) < 0) { + // TODO + return -1; + } + } else { + if (true/*TODO: is same fset*/) { + // No need to copy --------------------- + } else { + // copy from remote >>>>>>>>>>>>>. + } + } + } + } +#endif + } + + return 0; +} + +static bool tsdbIsTowFSetSame(SDFileSet *pSet1, SDFileSet *pSet2) { + if (pSet1->ver != pSet2->ver) { + return false; + } + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet1); ftype++) { + SDFile *pDFile1 = TSDB_DFILE_IN_SET(pSet1, ftype); + SDFile *pDFile2 = TSDB_DFILE_IN_SET(pSet2, ftype); + + if (pDFile1->info.size != pDFile2->info.size || pDFile1->info.magic != pDFile2->info.magic) { + return false; + } + } + + return true; +} + +static int32_t tsdbSyncSendDFileSet(SSyncH *pSynch, SDFileSet *pSet) { + STsdbRepo *pRepo = pSynch->pRepo; + bool toSend = false; + + // skip expired fileset + if (pSet && tsdbGetFidLevel(pSet->fid, &(pSynch->rtn)) < 0) { + tsdbInfo("vgId:%d, don't sync send since fileset:%d smaller than minFid:%d", REPO_ID(pRepo), pSet->fid, + pSynch->rtn.minFid); + return 0; + } + + if (tsdbSendDFileSetInfo(pSynch, pSet) < 0) { + tsdbError("vgId:%d, failed to send fileset:%d info since %s", REPO_ID(pRepo), pSet ? pSet->fid : -1, tstrerror(terrno)); + return -1; + } + + // No file any more, no need to send file, just return + if (pSet == NULL) { + return 0; + } + + if (tsdbRecvDecision(pSynch, &toSend) < 0) { + tsdbError("vgId:%d, failed to recv decision while send fileset:%d since %s", REPO_ID(pRepo), pSet->fid, + tstrerror(terrno)); + return -1; + } + + if (toSend) { + tsdbInfo("vgId:%d, fileset:%d will be sent", REPO_ID(pRepo), pSet->fid); + + for (TSDB_FILE_T ftype = 0; ftype < tsdbGetNFiles(pSet); ftype++) { + SDFile df = *TSDB_DFILE_IN_SET(pSet, ftype); + + if (tsdbOpenDFile(&df, O_RDONLY) < 0) { + tsdbError("vgId:%d, failed to file:%s since %s", REPO_ID(pRepo), df.f.aname, tstrerror(terrno)); + return -1; + } + + int64_t writeLen = df.info.size; + tsdbInfo("vgId:%d, file:%s will be sent, size:%" PRId64, REPO_ID(pRepo), df.f.aname, writeLen); + + int64_t ret = taosSendFile(pSynch->socketFd, TSDB_FILE_FD(&df), 0, writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to send file:%s since %s, ret:%" PRId64 " writeLen:%" PRId64, REPO_ID(pRepo), + df.f.aname, tstrerror(terrno), ret, writeLen); + tsdbCloseDFile(&df); + return -1; + } + + tsdbInfo("vgId:%d, file:%s is sent", REPO_ID(pRepo), df.f.aname); + tsdbCloseDFile(&df); + } + + tsdbInfo("vgId:%d, fileset:%d is sent", REPO_ID(pRepo), pSet->fid); + } else { + tsdbInfo("vgId:%d, fileset:%d is same, no need to send", REPO_ID(pRepo), pSet->fid); + } + + return 0; +} + +static int32_t tsdbSendDFileSetInfo(SSyncH *pSynch, SDFileSet *pSet) { + STsdbRepo *pRepo = pSynch->pRepo; + uint32_t tlen = 0; + + if (pSet) { + tlen = tsdbEncodeDFileSetEx(NULL, pSet) + sizeof(TSCKSUM); + } + + if (tsdbMakeRoom((void **)(&SYNC_BUFFER(pSynch)), tlen + sizeof(tlen)) < 0) { + tsdbError("vgId:%d, failed to makeroom while send fileinfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + void *ptr = SYNC_BUFFER(pSynch); + taosEncodeFixedU32(&ptr, tlen); + void *tptr = ptr; + if (pSet) { + tsdbEncodeDFileSetEx(&ptr, pSet); + taosCalcChecksumAppend(0, (uint8_t *)tptr, tlen); + } + + int32_t writeLen = tlen + sizeof(uint32_t); + int32_t ret = taosWriteMsg(pSynch->socketFd, SYNC_BUFFER(pSynch), writeLen); + if (ret != writeLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to send fileinfo, ret:%d writeLen:%d", REPO_ID(pRepo), ret, writeLen); + return -1; + } + + return 0; +} + +static int32_t tsdbRecvDFileSetInfo(SSyncH *pSynch) { + STsdbRepo *pRepo = pSynch->pRepo; + uint32_t tlen; + char buf[64] = {0}; + + int32_t readLen = sizeof(uint32_t); + int32_t ret = taosReadMsg(pSynch->socketFd, buf, readLen); + if (ret != readLen) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + taosDecodeFixedU32(buf, &tlen); + + tsdbInfo("vgId:%d, fileinfo len:%d is received", REPO_ID(pRepo), tlen); + if (tlen == 0) { + pSynch->pdf = NULL; + return 0; + } + + if (tsdbMakeRoom((void **)(&SYNC_BUFFER(pSynch)), tlen) < 0) { + tsdbError("vgId:%d, failed to makeroom while recv fileinfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + ret = taosReadMsg(pSynch->socketFd, SYNC_BUFFER(pSynch), tlen); + if (ret != tlen) { + terrno = TAOS_SYSTEM_ERROR(errno); + tsdbError("vgId:%d, failed to recv fileinfo, ret:%d readLen:%d", REPO_ID(pRepo), ret, tlen); + return -1; + } + + if (!taosCheckChecksumWhole((uint8_t *)SYNC_BUFFER(pSynch), tlen)) { + terrno = TSDB_CODE_TDB_MESSED_MSG; + tsdbError("vgId:%d, failed to checksum while recv fileinfo since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + pSynch->pdf = &(pSynch->df); + tsdbDecodeDFileSetEx(SYNC_BUFFER(pSynch), pSynch->pdf); + + return 0; +} + +static int tsdbReload(STsdbRepo *pRepo, bool isMfChanged) { + // TODO: may need to stop and restart stream + // if (isMfChanged) { + tsdbCloseMeta(pRepo); + tsdbFreeMeta(pRepo->tsdbMeta); + pRepo->tsdbMeta = tsdbNewMeta(REPO_CFG(pRepo)); + tsdbOpenMeta(pRepo); + tsdbLoadMetaCache(pRepo, true); + // } + + tsdbUnRefMemTable(pRepo, pRepo->mem); + tsdbUnRefMemTable(pRepo, pRepo->imem); + pRepo->mem = NULL; + pRepo->imem = NULL; + + if (tsdbRestoreInfo(pRepo) < 0) { + tsdbError("vgId:%d failed to restore info from file since %s", REPO_ID(pRepo), tstrerror(terrno)); + return -1; + } + + return 0; +} \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/tests/CMakeLists.txt b/source/dnode/vnode/tsdb2/tests/CMakeLists.txt new file mode 100644 index 0000000000..a3477aef95 --- /dev/null +++ b/source/dnode/vnode/tsdb2/tests/CMakeLists.txt @@ -0,0 +1,6 @@ +AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_LIST) + +add_executable(tsdbTests ${SOURCE_LIST}) +target_link_libraries(tsdbTests gtest gtest_main pthread common tsdb tutil trpc) + +add_test(NAME unit COMMAND ${CMAKE_CURRENT_BINARY_DIR}/tsdbTests) \ No newline at end of file diff --git a/source/dnode/vnode/tsdb2/tests/tsdbTests.cpp b/source/dnode/vnode/tsdb2/tests/tsdbTests.cpp new file mode 100644 index 0000000000..dc804856fd --- /dev/null +++ b/source/dnode/vnode/tsdb2/tests/tsdbTests.cpp @@ -0,0 +1,163 @@ +#include +#include +#include + +#include "tsdb.h" +#include "tsdbMain.h" + +static double getCurTime() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1E-6; +} + +typedef struct { + STsdbRepo *pRepo; + bool isAscend; + int tid; + uint64_t uid; + int sversion; + TSKEY startTime; + TSKEY interval; + int totalRows; + int rowsPerSubmit; + STSchema * pSchema; +} SInsertInfo; + +static int insertData(SInsertInfo *pInfo) { + SSubmitMsg *pMsg = + (SSubmitMsg *)malloc(sizeof(SSubmitMsg) + sizeof(SSubmitBlk) + dataRowMaxBytesFromSchema(pInfo->pSchema) * pInfo->rowsPerSubmit); + if (pMsg == NULL) return -1; + TSKEY start_time = pInfo->startTime; + + // Loop to write data + double stime = getCurTime(); + + for (int k = 0; k < pInfo->totalRows/pInfo->rowsPerSubmit; k++) { + memset((void *)pMsg, 0, sizeof(SSubmitMsg)); + SSubmitBlk *pBlock = (SSubmitBlk *)pMsg->blocks; + pBlock->uid = pInfo->uid; + pBlock->tid = pInfo->tid; + pBlock->sversion = pInfo->sversion; + pBlock->dataLen = 0; + pBlock->schemaLen = 0; + pBlock->numOfRows = 0; + for (int i = 0; i < pInfo->rowsPerSubmit; i++) { + // start_time += 1000; + if (pInfo->isAscend) { + start_time += pInfo->interval; + } else { + start_time -= pInfo->interval; + } + SDataRow row = (SDataRow)(pBlock->data + pBlock->dataLen); + tdInitDataRow(row, pInfo->pSchema); + + for (int j = 0; j < schemaNCols(pInfo->pSchema); j++) { + STColumn *pTCol = schemaColAt(pInfo->pSchema, j); + if (j == 0) { // Just for timestamp + tdAppendColVal(row, (void *)(&start_time), pTCol->type, pTCol->offset); + } else { // For int + int val = 10; + tdAppendColVal(row, (void *)(&val), pTCol->type, pTCol->offset); + } + } + pBlock->dataLen += dataRowLen(row); + pBlock->numOfRows++; + } + pMsg->length = sizeof(SSubmitMsg) + sizeof(SSubmitBlk) + pBlock->dataLen; + pMsg->numOfBlocks = 1; + + pBlock->dataLen = htonl(pBlock->dataLen); + pBlock->numOfRows = htonl(pBlock->numOfRows); + pBlock->schemaLen = htonl(pBlock->schemaLen); + pBlock->uid = htobe64(pBlock->uid); + pBlock->tid = htonl(pBlock->tid); + + pBlock->sversion = htonl(pBlock->sversion); + pBlock->padding = htonl(pBlock->padding); + + pMsg->length = htonl(pMsg->length); + pMsg->numOfBlocks = htonl(pMsg->numOfBlocks); + + if (tsdbInsertData(pInfo->pRepo, pMsg, NULL) < 0) { + tfree(pMsg); + return -1; + } + } + + double etime = getCurTime(); + + printf("Spent %f seconds to write %d records\n", etime - stime, pInfo->totalRows); + tfree(pMsg); + return 0; +} + +static void tsdbSetCfg(STsdbCfg *pCfg, int32_t tsdbId, int32_t cacheBlockSize, int32_t totalBlocks, int32_t maxTables, + int32_t daysPerFile, int32_t keep, int32_t minRows, int32_t maxRows, int8_t precision, + int8_t compression) { + pCfg->tsdbId = tsdbId; + pCfg->cacheBlockSize = cacheBlockSize; + pCfg->totalBlocks = totalBlocks; + // pCfg->maxTables = maxTables; + pCfg->daysPerFile = daysPerFile; + pCfg->keep = keep; + pCfg->minRowsPerFileBlock = minRows; + pCfg->maxRowsPerFileBlock = maxRows; + pCfg->precision = precision; + pCfg->compression = compression; +} + +static void tsdbSetTableCfg(STableCfg *pCfg) { + STSchemaBuilder schemaBuilder = {0}; + + pCfg->type = TSDB_NORMAL_TABLE; + pCfg->superUid = TSDB_INVALID_SUPER_TABLE_ID; + pCfg->tableId.tid = 1; + pCfg->tableId.uid = 5849583783847394; + tdInitTSchemaBuilder(&schemaBuilder, 0); + + int colId = 0; + for (int i = 0; i < 5; i++) { + tdAddColToSchema(&schemaBuilder, (colId == 0) ? TSDB_DATA_TYPE_TIMESTAMP : TSDB_DATA_TYPE_INT, colId, 0); + colId++; + } + + pCfg->schema = tdGetSchemaFromBuilder(&schemaBuilder); + pCfg->name = strdup("t1"); + + tdDestroyTSchemaBuilder(&schemaBuilder); +} + +TEST(TsdbTest, testInsertSpeed) { + int vnode = 1; + int ret = 0; + STsdbCfg tsdbCfg; + STableCfg tableCfg; + std::string testDir = "./test"; + char * rootDir = strdup((testDir + "/vnode" + std::to_string(vnode)).c_str()); + + tsdbDebugFlag = 131; //NOTE: you must set the flag + + taosRemoveDir(rootDir); + + // Create and open repository + tsdbSetCfg(&tsdbCfg, 1, 16, 4, -1, -1, -1, -1, -1, -1, -1); + tsdbCreateRepo(rootDir, &tsdbCfg); + STsdbRepo *repo = tsdbOpenRepo(rootDir, NULL); + ASSERT_NE(repo, nullptr); + + // Create table + tsdbSetTableCfg(&tableCfg); + tsdbCreateTable(repo, &tableCfg); + + // Insert data + SInsertInfo iInfo = {repo, true, 1, 5849583783847394, 0, 1590000000000, 10, 10000000, 100, tableCfg.schema}; + + insertData(&iInfo); + + tsdbCloseRepo(repo, 1); +} + +static char *getTKey(const void *data) { + return (char *)data; +} \ No newline at end of file