homework-jianmu/source/dnode/vnode/src/tsdb/tsdbRead2.c

6059 lines
200 KiB
C

/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "osDef.h"
#include "tsdb.h"
#include "tsdbDataFileRW.h"
#include "tsdbFS2.h"
#include "tsdbMerge.h"
#include "tsdbReadUtil.h"
#include "tsdbUtil2.h"
#include "tsimplehash.h"
#define ASCENDING_TRAVERSE(o) (o == TSDB_ORDER_ASC)
#define getCurrentKeyInSttBlock(_r) (&((_r)->currentKey))
#define tColRowGetKeyDeepCopy(_pBlock, _irow, _slotId, _pKey) \
do { \
(_pKey)->ts = (_pBlock)->aTSKEY[(_irow)]; \
(_pKey)->numOfPKs = 0; \
if ((_slotId) != -1) { \
tColRowGetPriamyKeyDeepCopy(_pBlock, _irow, _slotId, _pKey); \
} \
} while (0)
#define outOfTimeWindow(_ts, _window) (((_ts) > (_window)->ekey) || ((_ts) < (_window)->skey))
typedef struct {
bool overlapWithNeighborBlock;
bool hasDupTs;
bool overlapWithDelInfo;
bool overlapWithSttBlock;
bool overlapWithKeyInBuf;
bool partiallyRequired;
bool moreThanCapcity;
} SDataBlockToLoadInfo;
static int32_t getCurrentBlockInfo(SDataBlockIter* pBlockIter, SFileDataBlockInfo** pInfo, const char* idStr);
static int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
STsdbReader* pReader);
static void getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader, TSDBROW** pRes);
static int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, SRowKey* pKey,
STsdbReader* pReader);
static int32_t doMergeRowsInSttBlock(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo,
SRowMerger* pMerger, int32_t pkSrcSlot, SVersionRange* pVerRange, const char* id);
static int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, SRowKey* pCurKey, SArray* pDelList,
STsdbReader* pReader);
static int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow,
STableBlockScanInfo* pScanInfo);
static int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
int32_t rowIndex);
static void setComposedBlockFlag(STsdbReader* pReader, bool composed);
static bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
SVersionRange* pVerRange, bool hasPk);
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, SRowKey* pKey, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
TSDBROW* pResRow, STsdbReader* pReader, bool* freeTSRow);
static int32_t doMergeMemIMemRows(TSDBROW* pRow, SRowKey* pRowKey, TSDBROW* piRow, SRowKey* piRowKey,
STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, SRow** pTSRow);
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, SRowKey* pKey,
STsdbReader* pReader);
static int32_t mergeRowsInSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo,
STsdbReader* pReader);
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, int32_t order, SReadCostSummary* pCost);
static void getTsdbByRetentions(SVnode* pVnode, SQueryTableDataCond* pCond, SRetention* retentions, const char* idstr,
int8_t* pLevel, STsdb** pTsdb);
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
static int32_t doBuildDataBlock(STsdbReader* pReader);
static TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
static bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo);
static bool hasDataInSttBlock(STableBlockScanInfo* pInfo);
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter);
static int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order);
static int32_t resetTableListIndex(SReaderStatus* pStatus, const char* id);
static void getMemTableTimeRange(STsdbReader* pReader, int64_t* pMaxKey, int64_t* pMinKey);
static void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo);
static int32_t buildFromPreFilesetBuffer(STsdbReader* pReader);
static void resetPreFilesetMemTableListIndex(SReaderStatus* pStatus);
FORCE_INLINE int32_t pkCompEx(SRowKey* p1, SRowKey* p2) {
if (p2 == NULL) {
return 1;
}
if (p1 == NULL) {
return -1;
}
if (p1->ts < p2->ts) {
return -1;
} else if (p1->ts > p2->ts) {
return 1;
}
if (p1->numOfPKs == 0) {
return 0;
} else {
return tRowKeyCompare(p1, p2);
}
}
static void tColRowGetPriamyKeyDeepCopy(SBlockData* pBlock, int32_t irow, int32_t slotId, SRowKey* pKey) {
SColData* pColData = &pBlock->aColData[slotId];
SColVal cv;
tColDataGetValue(pColData, irow, &cv);
pKey->numOfPKs = 1;
pKey->pks[0].type = cv.value.type;
if (IS_NUMERIC_TYPE(cv.value.type)) {
pKey->pks[0].val = cv.value.val;
} else {
pKey->pks[0].nData = cv.value.nData;
(void)memcpy(pKey->pks[0].pData, cv.value.pData, cv.value.nData);
}
}
// for test purpose, todo remove it
static int32_t tGetPrimaryKeyIndex(uint8_t* p, SPrimaryKeyIndex* index) {
int32_t n = 0;
n += tGetI8(p + n, &index->type);
n += tGetU32v(p + n, &index->offset);
return n;
}
static void tRowGetPrimaryKeyDeepCopy(SRow* pRow, SRowKey* pKey) {
SPrimaryKeyIndex indices[TD_MAX_PK_COLS];
uint8_t* data = pRow->data;
for (int32_t i = 0; i < pRow->numOfPKs; i++) {
data += tGetPrimaryKeyIndex(data, &indices[i]);
}
// primary keys
for (int32_t i = 0; i < pRow->numOfPKs; i++) {
pKey->pks[i].type = indices[i].type;
uint8_t* tdata = data + indices[i].offset;
if (pRow->flag >> 4) {
tdata += tGetI16v(tdata, NULL);
}
if (IS_VAR_DATA_TYPE(indices[i].type)) {
tdata += tGetU32v(tdata, &pKey->pks[i].nData);
(void)memcpy(pKey->pks[i].pData, tdata, pKey->pks[i].nData);
} else {
(void)memcpy(&pKey->pks[i].val, data + indices[i].offset, tDataTypes[pKey->pks[i].type].bytes);
}
}
}
static int32_t setColumnIdSlotList(SBlockLoadSuppInfo* pSupInfo, SColumnInfo* pCols, const int32_t* pSlotIdList,
int32_t numOfCols) {
bool initSucc = true;
pSupInfo->pk.pk = 0;
pSupInfo->numOfPks = 0;
pSupInfo->pkSrcSlot = -1;
pSupInfo->pkDstSlot = -1;
pSupInfo->smaValid = true;
pSupInfo->numOfCols = numOfCols;
pSupInfo->colId = taosMemoryMalloc(numOfCols * (sizeof(int16_t) * 2 + POINTER_BYTES));
if (pSupInfo->colId == NULL) {
taosMemoryFree(pSupInfo->colId);
return terrno;
}
pSupInfo->slotId = (int16_t*)((char*)pSupInfo->colId + (sizeof(int16_t) * numOfCols));
pSupInfo->buildBuf = (char**)((char*)pSupInfo->slotId + (sizeof(int16_t) * numOfCols));
for (int32_t i = 0; i < numOfCols; ++i) {
pSupInfo->colId[i] = pCols[i].colId;
pSupInfo->slotId[i] = pSlotIdList[i];
if (IS_VAR_DATA_TYPE(pCols[i].type)) {
pSupInfo->buildBuf[i] = taosMemoryMalloc(pCols[i].bytes);
if (pSupInfo->buildBuf[i] == NULL) {
tsdbError("failed to prepare memory for set columnId slot list, size:%d, code:out of memory", pCols[i].bytes);
initSucc = false;
}
} else {
pSupInfo->buildBuf[i] = NULL;
}
if (pCols[i].pk) {
pSupInfo->pk = pCols[i];
pSupInfo->pkSrcSlot = i - 1;
pSupInfo->pkDstSlot = pSlotIdList[i];
pSupInfo->numOfPks += 1;
}
}
return (initSucc)? TSDB_CODE_SUCCESS:TSDB_CODE_OUT_OF_MEMORY;
}
static int32_t updateBlockSMAInfo(STSchema* pSchema, SBlockLoadSuppInfo* pSupInfo) {
int32_t i = 0, j = 0;
while (i < pSchema->numOfCols && j < pSupInfo->numOfCols) {
STColumn* pTCol = &pSchema->columns[i];
if (pTCol->colId == pSupInfo->colId[j]) {
if (!IS_BSMA_ON(pTCol) && (PRIMARYKEY_TIMESTAMP_COL_ID != pTCol->colId)) {
pSupInfo->smaValid = false;
return TSDB_CODE_SUCCESS;
}
i += 1;
j += 1;
} else if (pTCol->colId < pSupInfo->colId[j]) { // do nothing
i += 1;
} else {
return TSDB_CODE_TDB_INVALID_TABLE_SCHEMA_VER;
}
}
return TSDB_CODE_SUCCESS;
}
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) { return pWindow->skey > pWindow->ekey; }
// Update the query time window according to the data time to live(TTL) information, in order to avoid to return
// the expired data to client, even it is queried already.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
int64_t earlyTs = tsdbGetEarliestTs(pTsdb);
STimeWindow win = *pWindow;
if (win.skey < earlyTs) {
win.skey = earlyTs;
}
return win;
}
// init file iterator
static int32_t initFilesetIterator(SFilesetIter* pIter, TFileSetArray* pFileSetArray, STsdbReader* pReader) {
SBlockLoadSuppInfo* pInfo = &pReader->suppInfo;
size_t numOfFileset = TARRAY2_SIZE(pFileSetArray);
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int32_t code = TSDB_CODE_SUCCESS;
pIter->index = asc ? -1 : numOfFileset;
pIter->order = pReader->info.order;
pIter->pFilesetList = pFileSetArray;
pIter->numOfFiles = numOfFileset;
if (pIter->pSttBlockReader == NULL) {
pIter->pSttBlockReader = taosMemoryCalloc(1, sizeof(struct SSttBlockReader));
if (pIter->pSttBlockReader == NULL) {
tsdbError("failed to prepare the last block iterator, since:%s %s", tstrerror(terrno), pReader->idStr);
return terrno;
}
}
SSttBlockReader* pSttReader = pIter->pSttBlockReader;
pSttReader->order = pReader->info.order;
pSttReader->window = pReader->info.window;
pSttReader->verRange = pReader->info.verRange;
pSttReader->numOfPks = pReader->suppInfo.numOfPks;
pSttReader->uid = 0;
tMergeTreeClose(&pSttReader->mergeTree);
code = initRowKey(&pSttReader->currentKey, INT64_MIN, pInfo->numOfPks, pInfo->pk.type, pInfo->pk.bytes, asc);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("failed init row key, %s", pReader->idStr);
} else {
tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
}
return code;
}
static int32_t filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader, bool* hasNext) {
bool asc = ASCENDING_TRAVERSE(pIter->order);
int32_t step = asc ? 1 : -1;
int32_t code = 0;
pIter->index += step;
if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
*hasNext = false;
return TSDB_CODE_SUCCESS;
}
SReadCostSummary* pCost = &pReader->cost;
pIter->pSttBlockReader->uid = 0;
tMergeTreeClose(&pIter->pSttBlockReader->mergeTree);
destroySttBlockReader(pReader->status.pLDataIterArray, &pCost->sttCost);
pReader->status.pLDataIterArray = taosArrayInit(4, POINTER_BYTES);
if (pReader->status.pLDataIterArray == NULL) {
return terrno;
}
// check file the time range of coverage
STimeWindow win = {0};
while (1) {
if (pReader->pFileReader != NULL) {
tsdbDataFileReaderClose(&pReader->pFileReader);
}
pReader->status.pCurrentFileset = pIter->pFilesetList->data[pIter->index];
STFileObj** pFileObj = pReader->status.pCurrentFileset->farr;
if (pFileObj[0] != NULL || pFileObj[3] != NULL) {
SDataFileReaderConfig conf = {.tsdb = pReader->pTsdb, .szPage = pReader->pTsdb->pVnode->config.tsdbPageSize};
const char* filesName[4] = {0};
if (pFileObj[0] != NULL) {
conf.files[0].file = *pFileObj[0]->f;
conf.files[0].exist = true;
filesName[0] = pFileObj[0]->fname;
conf.files[1].file = *pFileObj[1]->f;
conf.files[1].exist = true;
filesName[1] = pFileObj[1]->fname;
conf.files[2].file = *pFileObj[2]->f;
conf.files[2].exist = true;
filesName[2] = pFileObj[2]->fname;
}
if (pFileObj[3] != NULL) {
conf.files[3].exist = true;
conf.files[3].file = *pFileObj[3]->f;
filesName[3] = pFileObj[3]->fname;
}
code = tsdbDataFileReaderOpen(filesName, &conf, &pReader->pFileReader);
if (code != TSDB_CODE_SUCCESS) {
goto _err;
}
pReader->cost.headFileLoad += 1;
}
int32_t fid = pReader->status.pCurrentFileset->fid;
tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);
// current file are no longer overlapped with query time window, ignore remain files
if ((asc && win.skey > pReader->info.window.ekey) || (!asc && win.ekey < pReader->info.window.skey)) {
tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
pReader->info.window.skey, pReader->info.window.ekey, pReader->idStr);
*hasNext = false;
return TSDB_CODE_SUCCESS;
}
if ((asc && (win.ekey < pReader->info.window.skey)) || ((!asc) && (win.skey > pReader->info.window.ekey))) {
pIter->index += step;
if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
*hasNext = false;
return TSDB_CODE_SUCCESS;
}
continue;
}
tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->info.window.skey,
pReader->info.window.ekey, pReader->idStr);
*hasNext = true;
return TSDB_CODE_SUCCESS;
}
_err:
*hasNext = false;
return code;
}
bool shouldFreePkBuf(SBlockLoadSuppInfo* pSupp) { return (pSupp->numOfPks > 0) && IS_VAR_DATA_TYPE(pSupp->pk.type); }
int32_t resetDataBlockIterator(SDataBlockIter* pIter, int32_t order, bool needFree, const char* id) {
pIter->order = order;
pIter->index = -1;
pIter->numOfBlocks = 0;
if (pIter->blockList == NULL) {
pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
if (pIter->blockList == NULL) {
tsdbError("%s failed to reset block iter, func:%s at line:%d code:%s", id, __func__, __LINE__, tstrerror(terrno));
return terrno;
}
} else {
clearDataBlockIterator(pIter, needFree);
}
return TSDB_CODE_SUCCESS;
}
static void initReaderStatus(SReaderStatus* pStatus) {
pStatus->pTableIter = NULL;
pStatus->loadFromFile = true;
}
static int32_t createResBlock(SQueryTableDataCond* pCond, int32_t capacity, SSDataBlock** pResBlock) {
QRY_PARAM_CHECK(pResBlock);
SSDataBlock* pBlock = NULL;
int32_t code = createDataBlock(&pBlock);
if (code != 0) {
return code;
}
for (int32_t i = 0; i < pCond->numOfCols; ++i) {
SColumnInfoData colInfo = {0};
colInfo.info = pCond->colList[i];
code = blockDataAppendColInfo(pBlock, &colInfo);
if (code != TSDB_CODE_SUCCESS) {
taosMemoryFree(pBlock);
return code;
}
}
code = blockDataEnsureCapacity(pBlock, capacity);
if (code != TSDB_CODE_SUCCESS) {
taosMemoryFree(pBlock);
}
*pResBlock = pBlock;
return code;
}
static int32_t tsdbInitReaderLock(STsdbReader* pReader) {
int32_t code = taosThreadMutexInit(&pReader->readerMutex, NULL);
tsdbTrace("tsdb/read: %p, post-init read mutex: %p, code: %d", pReader, &pReader->readerMutex, code);
return code;
}
static int32_t tsdbUninitReaderLock(STsdbReader* pReader) {
int32_t code = TSDB_CODE_SUCCESS;
tsdbTrace("tsdb/read: %p, pre-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, code);
code = taosThreadMutexDestroy(&pReader->readerMutex);
tsdbTrace("tsdb/read: %p, post-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, code);
return code;
}
static int32_t tsdbAcquireReader(STsdbReader* pReader) {
int32_t code = -1;
tsdbTrace("tsdb/read: %s, pre-take read mutex: %p, code: %d", pReader->idStr, &pReader->readerMutex, code);
code = taosThreadMutexLock(&pReader->readerMutex);
if (code != 0) {
tsdbError("tsdb/read:%p, failed to lock reader mutex, code:%s", pReader->idStr, tstrerror(code));
} else {
tsdbTrace("tsdb/read: %s, post-take read mutex: %p, code: %d", pReader->idStr, &pReader->readerMutex, code);
}
return code;
}
static int32_t tsdbTryAcquireReader(STsdbReader* pReader) {
int32_t code = taosThreadMutexTryLock(&pReader->readerMutex);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("tsdb/read: %p, post-trytake read mutex: %p, code: %d", pReader, &pReader->readerMutex, code);
} else {
tsdbTrace("tsdb/read: %p, post-trytask read mutex: %p", pReader, &pReader->readerMutex);
}
return code;
}
static int32_t tsdbReleaseReader(STsdbReader* pReader) {
int32_t code = taosThreadMutexUnlock(&pReader->readerMutex);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("tsdb/read: %p post-untake read mutex:%p failed, code:%d", pReader, &pReader->readerMutex, code);
} else {
tsdbTrace("tsdb/read: %p, post-untake read mutex: %p", pReader, &pReader->readerMutex);
}
return code;
}
void tsdbReleaseDataBlock2(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
if (!pStatus->composedDataBlock) {
(void) tsdbReleaseReader(pReader);
}
}
static int32_t initResBlockInfo(SResultBlockInfo* pResBlockInfo, int64_t capacity, SSDataBlock* pResBlock,
SQueryTableDataCond* pCond, SBlockLoadSuppInfo* pSup) {
pResBlockInfo->capacity = capacity;
pResBlockInfo->pResBlock = pResBlock;
int32_t code = 0;
if (pResBlockInfo->pResBlock == NULL) {
pResBlockInfo->freeBlock = true;
pResBlockInfo->pResBlock = NULL;
code = createResBlock(pCond, pResBlockInfo->capacity, &pResBlockInfo->pResBlock);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pSup->numOfPks > 0) {
SSDataBlock* p = pResBlockInfo->pResBlock;
p->info.pks[0].type = pSup->pk.type;
p->info.pks[1].type = pSup->pk.type;
if (IS_VAR_DATA_TYPE(pSup->pk.type)) {
p->info.pks[0].pData = taosMemoryCalloc(1, pSup->pk.bytes);
if (p->info.pks[0].pData == NULL) {
return terrno;
}
p->info.pks[1].pData = taosMemoryCalloc(1, pSup->pk.bytes);
if (p->info.pks[1].pData == NULL) {
taosMemoryFreeClear(p->info.pks[0].pData);
return terrno;
}
p->info.pks[0].nData = pSup->pk.bytes;
p->info.pks[1].nData = pSup->pk.bytes;
}
}
} else {
pResBlockInfo->freeBlock = false;
}
return code;
}
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, void** ppReader, int32_t capacity,
SSDataBlock* pResBlock, const char* idstr) {
int32_t code = 0;
int8_t level = 0;
STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
if (pReader == NULL) {
code = terrno;
goto _end;
}
if (VND_IS_TSMA(pVnode)) {
tsdbDebug("vgId:%d, tsma is selected to query, %s", TD_VID(pVnode), idstr);
}
initReaderStatus(&pReader->status);
getTsdbByRetentions(pVnode, pCond, pVnode->config.tsdbCfg.retentions, idstr, &level, &pReader->pTsdb);
pReader->info.suid = pCond->suid;
pReader->info.order = pCond->order;
pReader->info.verRange = getQueryVerRange(pVnode, pCond, level);
pReader->info.window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
pReader->idStr = (idstr != NULL) ? taosStrdup(idstr) : NULL;
if (idstr != NULL && pReader->idStr == NULL) {
code = terrno;
goto _end;
}
pReader->type = pCond->type;
pReader->bFilesetDelimited = false;
pReader->blockInfoBuf.numPerBucket = 1000; // 1000 tables per bucket
if (pCond->numOfCols <= 0) {
tsdbError("vgId:%d, invalid column number %d in query cond, %s", TD_VID(pVnode), pCond->numOfCols, idstr);
code = TSDB_CODE_INVALID_PARA;
goto _end;
}
// allocate buffer in order to load data blocks from file
SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
code = setColumnIdSlotList(pSup, pCond->colList, pCond->pSlotList, pCond->numOfCols);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
code = initResBlockInfo(&pReader->resBlockInfo, capacity, pResBlock, pCond, pSup);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
code = tBlockDataCreate(&pReader->status.fileBlockData);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
if (pReader->suppInfo.colId[0] != PRIMARYKEY_TIMESTAMP_COL_ID) {
tsdbError("the first column isn't primary timestamp, %d, %s", pReader->suppInfo.colId[0], pReader->idStr);
code = TSDB_CODE_INVALID_PARA;
goto _end;
}
pReader->status.pPrimaryTsCol = taosArrayGet(pReader->resBlockInfo.pResBlock->pDataBlock, pSup->slotId[0]);
if (pReader->status.pPrimaryTsCol == NULL) {
code = TSDB_CODE_INVALID_PARA;
goto _end;
}
int32_t type = pReader->status.pPrimaryTsCol->info.type;
if (type != TSDB_DATA_TYPE_TIMESTAMP) {
tsdbError("the first column isn't primary timestamp in result block, actual: %s, %s", tDataTypes[type].name,
pReader->idStr);
code = TSDB_CODE_INVALID_PARA;
goto _end;
}
code = tsdbInitReaderLock(pReader);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
code = tsem_init(&pReader->resumeAfterSuspend, 0, 0);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
*ppReader = pReader;
return code;
_end:
tsdbReaderClose2(pReader);
*ppReader = NULL;
return code;
}
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFileReader* pFileReader, SArray* pIndexList) {
int64_t st = taosGetTimestampUs();
int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
if (pFileReader == NULL) {
return TSDB_CODE_SUCCESS;
}
const TBrinBlkArray* pBlkArray = NULL;
int32_t code = tsdbDataFileReadBrinBlk(pFileReader, &pBlkArray);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
#if 0
LRUHandle* handle = NULL;
int32_t code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
if (code != TSDB_CODE_SUCCESS || handle == NULL) {
goto _end;
}
SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
size_t num = taosArrayGetSize(aBlockIdx);
if (num == 0) {
tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
return TSDB_CODE_SUCCESS;
}
#endif
// todo binary search to the start position
int64_t et1 = taosGetTimestampUs();
SBrinBlk* pBrinBlk = NULL;
STableUidList* pList = &pReader->status.uidList;
int32_t i = 0;
while (i < TARRAY2_SIZE(pBlkArray)) {
pBrinBlk = &pBlkArray->data[i];
if (pBrinBlk->maxTbid.suid < pReader->info.suid) {
i += 1;
continue;
}
if (pBrinBlk->minTbid.suid > pReader->info.suid) { // not include the queried table/super table, quit the loop
break;
}
if (!(pBrinBlk->minTbid.suid <= pReader->info.suid && pBrinBlk->maxTbid.suid >= pReader->info.suid)) {
tsdbError("tsdb failed at: %s %d", __func__, __LINE__);
return TSDB_CODE_INTERNAL_ERROR;
}
if (pBrinBlk->maxTbid.suid == pReader->info.suid && pBrinBlk->maxTbid.uid < pList->tableUidList[0]) {
i += 1;
continue;
}
if (pBrinBlk->minTbid.suid == pReader->info.suid && pBrinBlk->minTbid.uid > pList->tableUidList[numOfTables - 1]) {
break;
}
void* p1 = taosArrayPush(pIndexList, pBrinBlk);
if (p1 == NULL) {
return terrno;
}
i += 1;
}
int64_t et2 = taosGetTimestampUs();
tsdbDebug("load block index for %d/%d tables completed, elapsed time:%.2f ms, set BrinBlk:%.2f ms, size:%.2f Kb %s",
numOfTables, (int32_t)pBlkArray->size, (et1 - st) / 1000.0, (et2 - et1) / 1000.0,
pBlkArray->size * sizeof(SBrinBlk) / 1024.0, pReader->idStr);
pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;
_end:
// tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
return code;
}
static int32_t loadFileBlockBrinInfo(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum,
SArray* pTableScanInfoList) {
int32_t k = 0;
size_t sizeInDisk = 0;
int64_t st = taosGetTimestampUs();
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
STimeWindow w = pReader->info.window;
SBrinRecord* pRecord = NULL;
int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
SBrinRecordIter iter = {0};
// clear info for the new file
cleanupInfoForNextFileset(pReader->status.pTableMap);
initBrinRecordIter(&iter, pReader->pFileReader, pIndexList);
while (1) {
int32_t code = getNextBrinRecord(&iter, &pRecord);
if (code != TSDB_CODE_SUCCESS) {
clearBrinBlockIter(&iter);
return code;
}
if (pRecord == NULL) {
break;
}
if (pRecord->suid > pReader->info.suid) {
break;
}
uint64_t uid = pReader->status.uidList.tableUidList[k];
if (pRecord->suid < pReader->info.suid) {
continue;
}
if (uid < pRecord->uid) { // forward the table uid index
while (k < numOfTables && pReader->status.uidList.tableUidList[k] < pRecord->uid) {
k += 1;
}
if (k >= numOfTables) {
break;
}
uid = pReader->status.uidList.tableUidList[k];
}
if (pRecord->uid < uid) {
continue;
}
if (!(pRecord->suid == pReader->info.suid && uid == pRecord->uid)) {
tsdbError("tsdb failed at: %s:%d", __func__, __LINE__);
clearBrinBlockIter(&iter);
return TSDB_CODE_INTERNAL_ERROR;
}
STableBlockScanInfo* pScanInfo = NULL;
code = getTableBlockScanInfo(pReader->status.pTableMap, uid, &pScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
clearBrinBlockIter(&iter);
return code;
}
// here we should find the first timestamp that is greater than the lastProcKey
// the window is an open interval NOW.
if (asc) {
w.skey = pScanInfo->lastProcKey.ts;
} else {
w.ekey = pScanInfo->lastProcKey.ts;
}
// NOTE: specialized for open interval
if (((w.skey < INT64_MAX) && ((w.skey + 1) > w.ekey)) || (w.skey == INT64_MAX)) {
k += 1;
if (k >= numOfTables) {
break;
} else {
continue;
}
}
// 1. time range check
if (pRecord->firstKey.key.ts > w.ekey || pRecord->lastKey.key.ts < w.skey) {
continue;
}
if (asc) {
if (pkCompEx(&pRecord->lastKey.key, &pScanInfo->lastProcKey) <= 0) {
continue;
}
} else {
if (pkCompEx(&pRecord->firstKey.key, &pScanInfo->lastProcKey) >= 0) {
continue;
}
}
// 2. version range check, version range is an CLOSED interval
if (pRecord->minVer > pReader->info.verRange.maxVer || pRecord->maxVer < pReader->info.verRange.minVer) {
continue;
}
if (pScanInfo->pBlockList == NULL) {
pScanInfo->pBlockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
if (pScanInfo->pBlockList == NULL) {
clearBrinBlockIter(&iter);
return terrno;
}
}
if (pScanInfo->pBlockIdxList == NULL) {
pScanInfo->pBlockIdxList = taosArrayInit(4, sizeof(STableDataBlockIdx));
if (pScanInfo->pBlockIdxList == NULL) {
clearBrinBlockIter(&iter);
return terrno;
}
}
SFileDataBlockInfo blockInfo = {.tbBlockIdx = TARRAY_SIZE(pScanInfo->pBlockList)};
code = recordToBlockInfo(&blockInfo, pRecord);
if (code != TSDB_CODE_SUCCESS) {
clearBrinBlockIter(&iter);
return code;
}
void* p1 = taosArrayPush(pScanInfo->pBlockList, &blockInfo);
if (p1 == NULL) {
clearBrinBlockIter(&iter);
return terrno;
}
// todo: refactor to record the fileset skey/ekey
if (pScanInfo->filesetWindow.skey > pRecord->firstKey.key.ts) {
pScanInfo->filesetWindow.skey = pRecord->firstKey.key.ts;
}
if (pScanInfo->filesetWindow.ekey < pRecord->lastKey.key.ts) {
pScanInfo->filesetWindow.ekey = pRecord->lastKey.key.ts;
}
pBlockNum->numOfBlocks += 1;
if (taosArrayGetSize(pTableScanInfoList) == 0) {
p1 = taosArrayPush(pTableScanInfoList, &pScanInfo);
} else {
STableBlockScanInfo** p = taosArrayGetLast(pTableScanInfoList);
if (p == NULL) {
clearBrinBlockIter(&iter);
return TSDB_CODE_INVALID_PARA;
}
if ((*p)->uid != uid) {
p1 = taosArrayPush(pTableScanInfoList, &pScanInfo);
}
}
if (p1 == NULL) {
clearBrinBlockIter(&iter);
return terrno;
}
}
clearBrinBlockIter(&iter);
pBlockNum->numOfSttFiles = pReader->status.pCurrentFileset->lvlArr->size;
int32_t total = pBlockNum->numOfSttFiles + pBlockNum->numOfBlocks;
double el = (taosGetTimestampUs() - st) / 1000.0;
tsdbDebug(
"load block of %d tables completed, blocks:%d in %d tables, stt-files:%d, block-info-size:%.2f Kb, elapsed "
"time:%.2f ms %s",
numOfTables, pBlockNum->numOfBlocks, (int32_t)taosArrayGetSize(pTableScanInfoList), pBlockNum->numOfSttFiles,
sizeInDisk / 1000.0, el, pReader->idStr);
pReader->cost.numOfBlocks += total;
pReader->cost.headFileLoadTime += el;
return TSDB_CODE_SUCCESS;
}
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
pDumpInfo->allDumped = true;
}
static void updateLastKeyInfo(SRowKey* pKey, SFileDataBlockInfo* pBlockInfo, SDataBlockInfo* pInfo, int32_t numOfPks,
bool asc) {
pKey->ts = asc ? pInfo->window.ekey : pInfo->window.skey;
pKey->numOfPKs = numOfPks;
if (pKey->numOfPKs <= 0) {
return;
}
if (IS_NUMERIC_TYPE(pKey->pks[0].type)) {
pKey->pks[0].val = asc ? pBlockInfo->lastPk.val : pBlockInfo->firstPk.val;
} else {
uint8_t* p = asc ? pBlockInfo->lastPk.pData : pBlockInfo->firstPk.pData;
pKey->pks[0].nData = asc ? varDataLen(pBlockInfo->lastPk.pData) : varDataLen(pBlockInfo->firstPk.pData);
(void)memcpy(pKey->pks[0].pData, p, pKey->pks[0].nData);
}
}
static int32_t doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
SBlockLoadSuppInfo* pSup) {
int32_t code = 0;
if (IS_VAR_DATA_TYPE(pColVal->value.type)) {
if (!COL_VAL_IS_VALUE(pColVal)) {
colDataSetNULL(pColInfoData, rowIndex);
} else {
varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
if ((pColVal->value.nData + VARSTR_HEADER_SIZE) > pColInfoData->info.bytes) {
tsdbWarn("column cid:%d actual data len %d is bigger than schema len %d", pColVal->cid, pColVal->value.nData,
pColInfoData->info.bytes);
return TSDB_CODE_TDB_INVALID_TABLE_SCHEMA_VER;
}
if (pColVal->value.nData > 0) { // pData may be null, if nData is 0
(void)memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
}
code = colDataSetVal(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
}
} else {
code = colDataSetVal(pColInfoData, rowIndex, (const char*)&pColVal->value.val, !COL_VAL_IS_VALUE(pColVal));
}
return code;
}
static int32_t getCurrentBlockInfo(SDataBlockIter* pBlockIter, SFileDataBlockInfo** pInfo, const char* id) {
*pInfo = NULL;
if (pBlockIter->blockList == NULL) {
return TSDB_CODE_FAILED;
}
size_t num = TARRAY_SIZE(pBlockIter->blockList);
if (num == 0) {
if (num != pBlockIter->numOfBlocks) {
tsdbError("tsdb read failed at: %s:%d %s", __func__, __LINE__, id);
}
return TSDB_CODE_FAILED;
}
*pInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
return (*pInfo) != NULL? TSDB_CODE_SUCCESS:TSDB_CODE_FAILED;
}
static int32_t doBinarySearchKey(const TSKEY* keyList, int num, int pos, TSKEY key, int order) {
// start end position
int s, e;
s = pos;
// check
if (!(pos >= 0 && pos < num && num > 0)) {
return -1;
}
if (order == TSDB_ORDER_ASC) {
// find the first position which is smaller than the key
e = num - 1;
if (key < keyList[pos]) return -1;
while (1) {
// check can return
if (key >= keyList[e]) return e;
if (key <= keyList[s]) return s;
if (e - s <= 1) return s;
// change start or end position
int mid = s + (e - s + 1) / 2;
if (keyList[mid] > key)
e = mid;
else if (keyList[mid] < key)
s = mid;
else
return mid;
}
} else { // DESC
// find the first position which is bigger than the key
e = 0;
if (key > keyList[pos]) return -1;
while (1) {
// check can return
if (key <= keyList[e]) return e;
if (key >= keyList[s]) return s;
if (s - e <= 1) return s;
// change start or end position
int mid = s - (s - e + 1) / 2;
if (keyList[mid] < key)
e = mid;
else if (keyList[mid] > key)
s = mid;
else
return mid;
}
}
}
// handle the repeat ts cases.
static int32_t findFirstPos(const int64_t* pTsList, int32_t num, int32_t startPos, bool asc) {
int32_t i = startPos;
int64_t startTs = pTsList[startPos];
if (asc) {
while (i < num && (pTsList[i] == startTs)) {
i++;
}
return i - 1;
} else {
while (i >= 0 && (pTsList[i] == startTs)) {
i--;
}
return i + 1;
}
}
static int32_t getEndPosInDataBlock(STsdbReader* pReader, SBlockData* pBlockData, SBrinRecord* pRecord, int32_t pos) {
// NOTE: reverse the order to find the end position in data block
int32_t endPos = -1;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
if (asc && pReader->info.window.ekey >= pRecord->lastKey.key.ts) {
endPos = pRecord->numRow - 1;
} else if (!asc && pReader->info.window.skey <= pRecord->firstKey.key.ts) {
endPos = 0;
} else {
int64_t key = asc ? pReader->info.window.ekey : pReader->info.window.skey;
endPos = doBinarySearchKey(pBlockData->aTSKEY, pRecord->numRow, pos, key, pReader->info.order);
if (endPos == -1) {
return endPos;
}
endPos = findFirstPos(pBlockData->aTSKEY, pRecord->numRow, endPos, asc);
}
if ((pReader->info.verRange.maxVer >= pRecord->minVer && pReader->info.verRange.maxVer < pRecord->maxVer) ||
(pReader->info.verRange.minVer <= pRecord->maxVer && pReader->info.verRange.minVer > pRecord->minVer)) {
int32_t i = endPos;
if (asc) {
for (; i >= 0; --i) {
if (pBlockData->aVersion[i] <= pReader->info.verRange.maxVer) {
break;
}
}
} else {
for (; i < pRecord->numRow; ++i) {
if (pBlockData->aVersion[i] >= pReader->info.verRange.minVer) {
break;
}
}
}
endPos = i;
}
return endPos;
}
static void copyPrimaryTsCol(const SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
int32_t dumpedRows, bool asc) {
if (asc) {
(void)memcpy(pColData->pData, &pBlockData->aTSKEY[pDumpInfo->rowIndex], dumpedRows * sizeof(int64_t));
} else {
int32_t startIndex = pDumpInfo->rowIndex - dumpedRows + 1;
(void)memcpy(pColData->pData, &pBlockData->aTSKEY[startIndex], dumpedRows * sizeof(int64_t));
// todo: opt perf by extract the loop
// reverse the array list
int32_t mid = dumpedRows >> 1u;
int64_t* pts = (int64_t*)pColData->pData;
for (int32_t j = 0; j < mid; ++j) {
int64_t t = pts[j];
pts[j] = pts[dumpedRows - j - 1];
pts[dumpedRows - j - 1] = t;
}
}
}
// a faster version of copy procedure.
static void copyNumericCols(const SColData* pData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
int32_t dumpedRows, bool asc) {
uint8_t* p = NULL;
if (asc) {
p = pData->pData + tDataTypes[pData->type].bytes * pDumpInfo->rowIndex;
} else {
int32_t startIndex = pDumpInfo->rowIndex - dumpedRows + 1;
p = pData->pData + tDataTypes[pData->type].bytes * startIndex;
}
int32_t step = asc ? 1 : -1;
// make sure it is aligned to 8bit, the allocated memory address is aligned to 256bit
// 1. copy data in a batch model
(void)memcpy(pColData->pData, p, dumpedRows * tDataTypes[pData->type].bytes);
// 2. reverse the array list in case of descending order scan data block
if (!asc) {
switch (pColData->info.type) {
case TSDB_DATA_TYPE_TIMESTAMP:
case TSDB_DATA_TYPE_DOUBLE:
case TSDB_DATA_TYPE_BIGINT:
case TSDB_DATA_TYPE_UBIGINT: {
int32_t mid = dumpedRows >> 1u;
int64_t* pts = (int64_t*)pColData->pData;
for (int32_t j = 0; j < mid; ++j) {
int64_t t = pts[j];
pts[j] = pts[dumpedRows - j - 1];
pts[dumpedRows - j - 1] = t;
}
break;
}
case TSDB_DATA_TYPE_BOOL:
case TSDB_DATA_TYPE_TINYINT:
case TSDB_DATA_TYPE_UTINYINT: {
int32_t mid = dumpedRows >> 1u;
int8_t* pts = (int8_t*)pColData->pData;
for (int32_t j = 0; j < mid; ++j) {
int8_t t = pts[j];
pts[j] = pts[dumpedRows - j - 1];
pts[dumpedRows - j - 1] = t;
}
break;
}
case TSDB_DATA_TYPE_SMALLINT:
case TSDB_DATA_TYPE_USMALLINT: {
int32_t mid = dumpedRows >> 1u;
int16_t* pts = (int16_t*)pColData->pData;
for (int32_t j = 0; j < mid; ++j) {
int64_t t = pts[j];
pts[j] = pts[dumpedRows - j - 1];
pts[dumpedRows - j - 1] = t;
}
break;
}
case TSDB_DATA_TYPE_FLOAT:
case TSDB_DATA_TYPE_INT:
case TSDB_DATA_TYPE_UINT: {
int32_t mid = dumpedRows >> 1u;
int32_t* pts = (int32_t*)pColData->pData;
for (int32_t j = 0; j < mid; ++j) {
int32_t t = pts[j];
pts[j] = pts[dumpedRows - j - 1];
pts[dumpedRows - j - 1] = t;
}
break;
}
}
}
// 3. if the null value exists, check items one-by-one
if (pData->flag != HAS_VALUE) {
int32_t rowIndex = 0;
for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step, rowIndex++) {
uint8_t v = tColDataGetBitValue(pData, j);
if (v == 0 || v == 1) {
colDataSetNull_f(pColData->nullbitmap, rowIndex);
pColData->hasNull = true;
}
}
}
}
static void blockInfoToRecord(SBrinRecord* record, SFileDataBlockInfo* pBlockInfo, SBlockLoadSuppInfo* pSupp) {
record->uid = pBlockInfo->uid;
record->firstKey = (STsdbRowKey){.key = {.ts = pBlockInfo->firstKey, .numOfPKs = pSupp->numOfPks}};
record->lastKey = (STsdbRowKey){.key = {.ts = pBlockInfo->lastKey, .numOfPKs = pSupp->numOfPks}};
if (pSupp->numOfPks > 0) {
SValue* pFirst = &record->firstKey.key.pks[0];
SValue* pLast = &record->lastKey.key.pks[0];
pFirst->type = pSupp->pk.type;
pLast->type = pSupp->pk.type;
if (IS_VAR_DATA_TYPE(pFirst->type)) {
pFirst->pData = (uint8_t*)varDataVal(pBlockInfo->firstPk.pData);
pFirst->nData = varDataLen(pBlockInfo->firstPk.pData);
pLast->pData = (uint8_t*)varDataVal(pBlockInfo->lastPk.pData);
pLast->nData = varDataLen(pBlockInfo->lastPk.pData);
} else {
pFirst->val = pBlockInfo->firstPk.val;
pLast->val = pBlockInfo->lastPk.val;
}
}
record->minVer = pBlockInfo->minVer;
record->maxVer = pBlockInfo->maxVer;
record->blockOffset = pBlockInfo->blockOffset;
record->smaOffset = pBlockInfo->smaOffset;
record->blockSize = pBlockInfo->blockSize;
record->blockKeySize = pBlockInfo->blockKeySize;
record->smaSize = pBlockInfo->smaSize;
record->numRow = pBlockInfo->numRow;
record->count = pBlockInfo->count;
}
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, SRowKey* pLastProcKey) {
SReaderStatus* pStatus = &pReader->status;
SDataBlockIter* pBlockIter = &pStatus->blockIter;
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SBlockData* pBlockData = &pStatus->fileBlockData;
SFileDataBlockInfo* pBlockInfo = NULL;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
int32_t numOfOutputCols = pSupInfo->numOfCols;
int32_t code = TSDB_CODE_SUCCESS;
int64_t st = taosGetTimestampUs();
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int32_t step = asc ? 1 : -1;
code = getCurrentBlockInfo(pBlockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
SColVal cv = {0};
SBrinRecord tmp;
blockInfoToRecord(&tmp, pBlockInfo, pSupInfo);
SBrinRecord* pRecord = &tmp;
// no data exists, return directly.
if (pBlockData->nRow == 0 || pBlockData->aTSKEY == 0) {
tsdbWarn("%p no need to copy since no data in blockData, table uid:%" PRIu64 " has been dropped, %s", pReader,
pBlockInfo->uid, pReader->idStr);
pResBlock->info.rows = 0;
return 0;
}
// row index of dump info remain the initial position, let's find the appropriate start position.
if (((pDumpInfo->rowIndex == 0) && asc) || ((pDumpInfo->rowIndex == (pRecord->numRow - 1)) && (!asc))) {
if (asc && pReader->info.window.skey <= pRecord->firstKey.key.ts &&
pReader->info.verRange.minVer <= pRecord->minVer) {
// pDumpInfo->rowIndex = 0;
} else if (!asc && pReader->info.window.ekey >= pRecord->lastKey.key.ts &&
pReader->info.verRange.maxVer >= pRecord->maxVer) {
// pDumpInfo->rowIndex = pRecord->numRow - 1;
} else { // find the appropriate the start position in current block, and set it to be the current rowIndex
int32_t pos = asc ? pRecord->numRow - 1 : 0;
int32_t order = asc ? TSDB_ORDER_DESC : TSDB_ORDER_ASC;
int64_t key = asc ? pReader->info.window.skey : pReader->info.window.ekey;
pDumpInfo->rowIndex = doBinarySearchKey(pBlockData->aTSKEY, pRecord->numRow, pos, key, order);
if (pDumpInfo->rowIndex < 0) {
tsdbError(
"%p failed to locate the start position in current block, global index:%d, table index:%d, brange:%" PRId64
"-%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 " %s",
pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pRecord->firstKey.key.ts, pRecord->lastKey.key.ts,
pRecord->minVer, pRecord->maxVer, pReader->idStr);
return TSDB_CODE_INVALID_PARA;
}
pDumpInfo->rowIndex = findFirstPos(pBlockData->aTSKEY, pRecord->numRow, pDumpInfo->rowIndex, (!asc));
if (!(pReader->info.verRange.minVer <= pRecord->maxVer && pReader->info.verRange.maxVer >= pRecord->minVer)) {
tsdbError("tsdb failed at: %s:%d", __func__, __LINE__);
return TSDB_CODE_INVALID_PARA;
}
// find the appropriate start position that satisfies the version requirement.
if ((pReader->info.verRange.maxVer >= pRecord->minVer && pReader->info.verRange.maxVer < pRecord->maxVer) ||
(pReader->info.verRange.minVer <= pRecord->maxVer && pReader->info.verRange.minVer > pRecord->minVer)) {
int32_t i = pDumpInfo->rowIndex;
if (asc) {
for (; i < pRecord->numRow; ++i) {
if (pBlockData->aVersion[i] >= pReader->info.verRange.minVer) {
break;
}
}
} else {
for (; i >= 0; --i) {
if (pBlockData->aVersion[i] <= pReader->info.verRange.maxVer) {
break;
}
}
}
pDumpInfo->rowIndex = i;
}
}
}
// time window check
int32_t endIndex = getEndPosInDataBlock(pReader, pBlockData, pRecord, pDumpInfo->rowIndex);
if (endIndex == -1) {
setBlockAllDumped(pDumpInfo, pReader->info.window.ekey, pReader->info.order);
return TSDB_CODE_SUCCESS;
}
endIndex += step;
int32_t dumpedRows = asc ? (endIndex - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex - endIndex);
if (dumpedRows > pReader->resBlockInfo.capacity) { // output buffer check
dumpedRows = pReader->resBlockInfo.capacity;
} else if (dumpedRows <= 0) { // no qualified rows in current data block, quit directly.
setBlockAllDumped(pDumpInfo, pReader->info.window.ekey, pReader->info.order);
return TSDB_CODE_SUCCESS;
}
int32_t i = 0;
int32_t rowIndex = 0;
SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
copyPrimaryTsCol(pBlockData, pDumpInfo, pColData, dumpedRows, asc);
i += 1;
}
int32_t colIndex = 0;
int32_t num = pBlockData->nColData;
while (i < numOfOutputCols && colIndex < num) {
rowIndex = 0;
SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
if (pData->cid < pSupInfo->colId[i]) {
colIndex += 1;
} else if (pData->cid == pSupInfo->colId[i]) {
pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
if (pData->flag == HAS_NONE || pData->flag == HAS_NULL || pData->flag == (HAS_NULL | HAS_NONE)) {
colDataSetNNULL(pColData, 0, dumpedRows);
} else {
if (IS_MATHABLE_TYPE(pColData->info.type)) {
copyNumericCols(pData, pDumpInfo, pColData, dumpedRows, asc);
} else { // varchar/nchar type
for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step) {
tColDataGetValue(pData, j, &cv);
code = doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
if (code) {
return code;
}
}
}
}
colIndex += 1;
i += 1;
} else { // the specified column does not exist in file block, fill with null data
pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
colDataSetNNULL(pColData, 0, dumpedRows);
i += 1;
}
}
// fill the mis-matched columns with null value
while (i < numOfOutputCols) {
pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
colDataSetNNULL(pColData, 0, dumpedRows);
i += 1;
}
pResBlock->info.dataLoad = 1;
pResBlock->info.rows = dumpedRows;
pDumpInfo->rowIndex += step * dumpedRows;
tColRowGetKeyDeepCopy(pBlockData, pDumpInfo->rowIndex - step, pSupInfo->pkSrcSlot, pLastProcKey);
// check if current block are all handled
if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pRecord->numRow) {
int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
// the remain data has out of query time window, ignore current block
if (outOfTimeWindow(ts, &pReader->info.window)) {
setBlockAllDumped(pDumpInfo, ts, pReader->info.order);
}
} else {
int64_t ts = asc ? pRecord->lastKey.key.ts : pRecord->firstKey.key.ts;
setBlockAllDumped(pDumpInfo, ts, pReader->info.order);
}
double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
pReader->cost.blockLoadTime += elapsedTime;
int32_t unDumpedRows = asc ? pRecord->numRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", uid:%" PRIu64 " elapsed time:%.2f ms, %s",
pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pRecord->firstKey.key.ts, pRecord->lastKey.key.ts,
dumpedRows, unDumpedRows, pRecord->minVer, pRecord->maxVer, pBlockInfo->uid, elapsedTime, pReader->idStr);
return TSDB_CODE_SUCCESS;
}
static FORCE_INLINE STSchema* getTableSchemaImpl(STsdbReader* pReader, uint64_t uid) {
if (pReader->info.pSchema != NULL) {
terrno = TSDB_CODE_INVALID_PARA;
tsdbError("tsdb invalid input param at: %s:%d", __func__, __LINE__);
return NULL;
}
int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->info.suid, uid, -1, &pReader->info.pSchema);
if (code != TSDB_CODE_SUCCESS || pReader->info.pSchema == NULL) {
terrno = code;
tsdbError("failed to get table schema, uid:%" PRIu64 ", it may have been dropped, ver:-1, %s", uid, pReader->idStr);
return NULL;
}
code = tsdbRowMergerInit(&pReader->status.merger, pReader->info.pSchema);
if (code != TSDB_CODE_SUCCESS) {
terrno = code;
tsdbError("failed to init merger, code:%s, %s", tstrerror(code), pReader->idStr);
return NULL;
}
return pReader->info.pSchema;
}
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData,
uint64_t uid) {
int32_t code = 0;
STSchema* pSchema = pReader->info.pSchema;
int64_t st = taosGetTimestampUs();
SFileDataBlockInfo* pBlockInfo = NULL;
SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
tBlockDataReset(pBlockData);
if (pReader->info.pSchema == NULL) {
pSchema = getTableSchemaImpl(pReader, uid);
if (pSchema == NULL) {
code = terrno;
tsdbError("%p table uid:%" PRIu64 " failed to get tableschema, code:%s, %s", pReader, uid, tstrerror(code),
pReader->idStr);
return code;
}
}
code = getCurrentBlockInfo(pBlockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SBrinRecord tmp;
blockInfoToRecord(&tmp, pBlockInfo, pSup);
SBrinRecord* pRecord = &tmp;
code = tsdbDataFileReadBlockDataByColumn(pReader->pFileReader, pRecord, pBlockData, pSchema, &pSup->colId[1],
pSup->numOfCols - 1);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
", rows:%d, code:%s %s",
pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlockInfo->firstKey, pBlockInfo->lastKey,
pBlockInfo->numRow, tstrerror(code), pReader->idStr);
return code;
}
double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pRecord->firstKey.key.ts, pRecord->lastKey.key.ts,
pRecord->numRow, pRecord->minVer, pRecord->maxVer, elapsedTime, pReader->idStr);
pReader->cost.blockLoadTime += elapsedTime;
pDumpInfo->allDumped = false;
return code;
}
/**
* This is an two rectangles overlap cases.
*/
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SFileDataBlockInfo* pBlock) {
return (pWindow->ekey < pBlock->lastKey && pWindow->ekey >= pBlock->firstKey) ||
(pWindow->skey > pBlock->firstKey && pWindow->skey <= pBlock->lastKey) ||
(pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
(pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
}
static bool getNeighborBlockOfTable(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pBlockInfo,
STableBlockScanInfo* pScanInfo, int32_t* nextIndex, int32_t order,
SBrinRecord* pRecord, SBlockLoadSuppInfo* pSupInfo) {
bool asc = ASCENDING_TRAVERSE(order);
int32_t step = asc ? 1 : -1;
if (asc && pBlockInfo->tbBlockIdx >= taosArrayGetSize(pScanInfo->pBlockIdxList) - 1) {
return false;
}
if (!asc && pBlockInfo->tbBlockIdx == 0) {
return false;
}
STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx + step);
if (pTableDataBlockIdx == NULL) {
return TSDB_CODE_INVALID_PARA;
}
SFileDataBlockInfo* p = taosArrayGet(pBlockIter->blockList, pTableDataBlockIdx->globalIndex);
if (p == NULL) {
return TSDB_CODE_INVALID_PARA;
}
blockInfoToRecord(pRecord, p, pSupInfo);
*nextIndex = pBlockInfo->tbBlockIdx + step;
return true;
}
static int32_t setFileBlockActiveInBlockIter(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t index,
int32_t step) {
int32_t code = TSDB_CODE_SUCCESS;
if (index < 0 || index >= pBlockIter->numOfBlocks) {
return TSDB_CODE_FAILED;
}
void* p = taosArrayGet(pBlockIter->blockList, index);
if (p == NULL) {
return TSDB_CODE_INVALID_PARA;
}
SFileDataBlockInfo fblock = *(SFileDataBlockInfo*) p;
pBlockIter->index += step;
if (index != pBlockIter->index) {
if (index > pBlockIter->index) {
for (int32_t i = index - 1; i >= pBlockIter->index; --i) {
SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, i);
if (pBlockInfo == NULL) {
return TSDB_CODE_INVALID_PARA;
}
STableBlockScanInfo* pBlockScanInfo = NULL;
code = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, &pBlockScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx);
if (pTableDataBlockIdx == NULL) {
return TSDB_CODE_INVALID_PARA;
}
pTableDataBlockIdx->globalIndex = i + 1;
taosArraySet(pBlockIter->blockList, i + 1, pBlockInfo);
}
} else if (index < pBlockIter->index) {
for (int32_t i = index + 1; i <= pBlockIter->index; ++i) {
SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, i);
if (pBlockInfo == NULL) {
return TSDB_CODE_INVALID_PARA;
}
STableBlockScanInfo* pBlockScanInfo = NULL;
code = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, &pBlockScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, pBlockInfo->tbBlockIdx);
if (pTableDataBlockIdx == NULL) {
return TSDB_CODE_INVALID_PARA;
}
pTableDataBlockIdx->globalIndex = i - 1;
taosArraySet(pBlockIter->blockList, i - 1, pBlockInfo);
}
}
taosArraySet(pBlockIter->blockList, pBlockIter->index, &fblock);
STableBlockScanInfo* pBlockScanInfo = NULL;
code = getTableBlockScanInfo(pReader->status.pTableMap, fblock.uid, &pBlockScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
STableDataBlockIdx* pTableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, fblock.tbBlockIdx);
if (pTableDataBlockIdx == NULL) {
return TSDB_CODE_INVALID_PARA;
}
pTableDataBlockIdx->globalIndex = pBlockIter->index;
}
return TSDB_CODE_SUCCESS;
}
// todo: this attribute could be acquired during extractin the global ordered block list.
static bool overlapWithNeighborBlock2(SFileDataBlockInfo* pBlock, SBrinRecord* pRec, int32_t order, int32_t pkType,
int32_t numOfPk) {
// it is the last block in current file, no chance to overlap with neighbor blocks.
if (ASCENDING_TRAVERSE(order)) {
if (pBlock->lastKey == pRec->firstKey.key.ts) {
if (numOfPk > 0) {
SValue v1 = {.type = pkType};
if (IS_VAR_DATA_TYPE(pkType)) {
v1.pData = (uint8_t*)varDataVal(pBlock->lastPk.pData), v1.nData = varDataLen(pBlock->lastPk.pData);
} else {
v1.val = pBlock->lastPk.val;
}
return (tValueCompare(&v1, &pRec->firstKey.key.pks[0]) == 0);
} else { // no pk
return true;
}
} else {
return false;
}
} else {
if (pBlock->firstKey == pRec->lastKey.key.ts) {
if (numOfPk > 0) {
SValue v1 = {.type = pkType};
if (IS_VAR_DATA_TYPE(pkType)) {
v1.pData = (uint8_t*)varDataVal(pBlock->firstPk.pData), v1.nData = varDataLen(pBlock->firstPk.pData);
} else {
v1.val = pBlock->firstPk.val;
}
return (tValueCompare(&v1, &pRec->lastKey.key.pks[0]) == 0);
} else { // no pk
return true;
}
} else {
return false;
}
}
}
static int64_t getBoarderKeyInFiles(SFileDataBlockInfo* pBlock, STableBlockScanInfo* pScanInfo, int32_t order) {
bool ascScan = ASCENDING_TRAVERSE(order);
int64_t key = 0;
if (pScanInfo->sttKeyInfo.status == STT_FILE_HAS_DATA) {
int64_t keyInStt = pScanInfo->sttKeyInfo.nextProcKey.ts;
key = ascScan ? TMIN(pBlock->firstKey, keyInStt) : TMAX(pBlock->lastKey, keyInStt);
} else {
key = ascScan ? pBlock->firstKey : pBlock->lastKey;
}
return key;
}
static bool bufferDataInFileBlockGap(TSDBKEY keyInBuf, SFileDataBlockInfo* pBlock, STableBlockScanInfo* pScanInfo,
int32_t order) {
bool ascScan = ASCENDING_TRAVERSE(order);
int64_t key = getBoarderKeyInFiles(pBlock, pScanInfo, order);
return (ascScan && (keyInBuf.ts != TSKEY_INITIAL_VAL && keyInBuf.ts < key)) ||
(!ascScan && (keyInBuf.ts != TSKEY_INITIAL_VAL && keyInBuf.ts > key));
}
static bool keyOverlapFileBlock(TSDBKEY key, SFileDataBlockInfo* pBlock, SVersionRange* pVerRange) {
return (key.ts >= pBlock->firstKey && key.ts <= pBlock->lastKey) && (pBlock->maxVer >= pVerRange->minVer) &&
(pBlock->minVer <= pVerRange->maxVer);
}
static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* pBlockInfo,
STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, STsdbReader* pReader) {
SBrinRecord rec = {0};
int32_t neighborIndex = 0;
int32_t order = pReader->info.order;
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
bool hasNeighbor =
getNeighborBlockOfTable(&pReader->status.blockIter, pBlockInfo, pScanInfo, &neighborIndex, order, &rec, pSupInfo);
// overlap with neighbor
if (hasNeighbor) {
pInfo->overlapWithNeighborBlock =
overlapWithNeighborBlock2(pBlockInfo, &rec, order, pSupInfo->pk.type, pSupInfo->numOfPks);
}
SBrinRecord pRecord;
blockInfoToRecord(&pRecord, pBlockInfo, pSupInfo);
// has duplicated ts of different version in this block
pInfo->hasDupTs = (pBlockInfo->numRow > pBlockInfo->count) || (pBlockInfo->count <= 0);
pInfo->overlapWithDelInfo = overlapWithDelSkyline(pScanInfo, &pRecord, order);
// todo handle the primary key overlap case
if (pScanInfo->sttKeyInfo.status == STT_FILE_HAS_DATA) {
int64_t nextProcKeyInStt = pScanInfo->sttKeyInfo.nextProcKey.ts;
pInfo->overlapWithSttBlock = !(pBlockInfo->lastKey < nextProcKeyInStt || pBlockInfo->firstKey > nextProcKeyInStt);
}
pInfo->moreThanCapcity = pBlockInfo->numRow > pReader->resBlockInfo.capacity;
pInfo->partiallyRequired = dataBlockPartiallyRequired(&pReader->info.window, &pReader->info.verRange, pBlockInfo);
pInfo->overlapWithKeyInBuf = keyOverlapFileBlock(keyInBuf, pBlockInfo, &pReader->info.verRange);
}
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
// 5. delete info should not overlap with current block data
// 6. current block should not contain the duplicated ts
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pScanInfo,
TSDBKEY keyInBuf) {
SDataBlockToLoadInfo info = {0};
getBlockToLoadInfo(&info, pBlockInfo, pScanInfo, keyInBuf, pReader);
bool loadDataBlock =
(info.overlapWithNeighborBlock || info.hasDupTs || info.partiallyRequired || info.overlapWithKeyInBuf ||
info.moreThanCapcity || info.overlapWithDelInfo || info.overlapWithSttBlock);
// log the reason why load the datablock for profile
if (loadDataBlock) {
tsdbDebug("%p uid:%" PRIu64
" need to load the datablock, overlapneighbor:%d, hasDup:%d, partiallyRequired:%d, "
"overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithSttBlock:%d, %s",
pReader, pBlockInfo->uid, info.overlapWithNeighborBlock, info.hasDupTs, info.partiallyRequired,
info.overlapWithKeyInBuf, info.moreThanCapcity, info.overlapWithDelInfo, info.overlapWithSttBlock,
pReader->idStr);
}
return loadDataBlock;
}
static bool isCleanFileDataBlock(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pScanInfo,
TSDBKEY keyInBuf) {
SDataBlockToLoadInfo info = {0};
getBlockToLoadInfo(&info, pBlockInfo, pScanInfo, keyInBuf, pReader);
bool isCleanFileBlock = !(info.overlapWithNeighborBlock || info.hasDupTs || info.overlapWithKeyInBuf ||
info.overlapWithDelInfo || info.overlapWithSttBlock);
return isCleanFileBlock;
}
static int32_t initRowMergeIfNeeded(STsdbReader* pReader, int64_t uid) {
SRowMerger* pMerger = &pReader->status.merger;
int32_t code = 0;
if (pMerger->pArray == NULL) {
STSchema* ps = getTableSchemaImpl(pReader, uid);
if (ps == NULL) {
return terrno;
}
}
return code;
}
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
if (!(pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal)) {
return TSDB_CODE_SUCCESS;
}
int32_t code = initRowMergeIfNeeded(pReader, pBlockScanInfo->uid);
if (code != 0) {
return code;
}
int64_t st = taosGetTimestampUs();
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->resBlockInfo.capacity, pReader);
double el = (taosGetTimestampUs() - st) / 1000.0;
updateComposedBlockInfo(pReader, el, pBlockScanInfo);
tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%" PRId64 ", brange:%" PRId64
" - %" PRId64 ", uid:%" PRIu64 ", %s",
pReader, el, pBlock->info.rows, pBlock->info.window.skey, pBlock->info.window.ekey, pBlockScanInfo->uid,
pReader->idStr);
pReader->cost.buildmemBlock += el;
return code;
}
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, SRowKey* pKey,
SFileBlockDumpInfo* pDumpInfo, bool* copied) {
// opt version
// 1. it is not a border point
// 2. the direct next point is not an duplicated timestamp
int32_t code = TSDB_CODE_SUCCESS;
*copied = false;
bool asc = (pReader->info.order == TSDB_ORDER_ASC);
if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && asc) || (pDumpInfo->rowIndex > 0 && (!asc))) {
int32_t step = ASCENDING_TRAVERSE(pReader->info.order) ? 1 : -1;
SRowKey nextRowKey;
tColRowGetKey(pBlockData, pDumpInfo->rowIndex + step, &nextRowKey);
if (pkCompEx(pKey, &nextRowKey) != 0) { // merge is not needed
code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
if (code) {
return code;
}
pDumpInfo->rowIndex += step;
*copied = true;
}
}
return code;
}
static int32_t nextRowFromSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo, int32_t pkSrcSlot,
SVersionRange* pVerRange) {
int32_t code = 0;
int32_t order = pSttBlockReader->order;
int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
SRowKey* pNextProc = &pScanInfo->sttKeyInfo.nextProcKey;
while (1) {
bool hasVal = false;
code = tMergeTreeNext(&pSttBlockReader->mergeTree, &hasVal);
if (code) {
tsdbError("failed to iter the next row in stt-file merge tree, code:%s, %s", tstrerror(code),
pSttBlockReader->mergeTree.idStr);
return code;
}
if (!hasVal) { // the next value will be the accessed key in stt
pScanInfo->sttKeyInfo.status = STT_FILE_NO_DATA;
// next file, the timestamps in the next file must be greater than those in current
pNextProc->ts += step;
if (pSttBlockReader->numOfPks > 0) {
if (IS_NUMERIC_TYPE(pNextProc->pks[0].type)) {
pNextProc->pks[0].val = INT64_MIN;
} else {
memset(pNextProc->pks[0].pData, 0, pNextProc->pks[0].nData);
}
}
return code;
}
TSDBROW* pRow = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
int64_t key = pRow->pBlockData->aTSKEY[pRow->iRow];
int64_t ver = pRow->pBlockData->aVersion[pRow->iRow];
if (pSttBlockReader->numOfPks == 0) {
pSttBlockReader->currentKey.ts = key;
} else {
tColRowGetKeyDeepCopy(pRow->pBlockData, pRow->iRow, pkSrcSlot, &pSttBlockReader->currentKey);
}
tColRowGetKeyDeepCopy(pRow->pBlockData, pRow->iRow, pkSrcSlot, pNextProc);
if (pScanInfo->delSkyline != NULL && TARRAY_SIZE(pScanInfo->delSkyline) > 0) {
if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->sttBlockDelIndex, key, ver, order, pVerRange,
pSttBlockReader->numOfPks > 0)) {
pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA;
return code;
}
} else {
pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA;
return code;
}
}
return code;
}
static void doPinSttBlock(SSttBlockReader* pSttBlockReader) { tMergeTreePinSttBlock(&pSttBlockReader->mergeTree); }
static void doUnpinSttBlock(SSttBlockReader* pSttBlockReader) { tMergeTreeUnpinSttBlock(&pSttBlockReader->mergeTree); }
static bool tryCopyDistinctRowFromSttBlock(TSDBROW* fRow, SSttBlockReader* pSttBlockReader,
STableBlockScanInfo* pScanInfo, SRowKey* pSttKey, STsdbReader* pReader,
bool* copied) {
int32_t code = TSDB_CODE_SUCCESS;
*copied = false;
// avoid the fetch next row replace the referenced stt block in buffer
doPinSttBlock(pSttBlockReader);
code = nextRowFromSttBlocks(pSttBlockReader, pScanInfo, pReader->suppInfo.pkSrcSlot, &pReader->info.verRange);
doUnpinSttBlock(pSttBlockReader);
if (code) {
return code;
}
if (hasDataInSttBlock(pScanInfo)) {
SRowKey* pNext = getCurrentKeyInSttBlock(pSttBlockReader);
if (pkCompEx(pSttKey, pNext) != 0) {
code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
*copied = (code == TSDB_CODE_SUCCESS);
return code;
}
} else {
code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
*copied = (code == TSDB_CODE_SUCCESS);
return code;
}
return code;
}
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
// always set the newest schema version in pReader->info.pSchema
if (pReader->info.pSchema == NULL) {
STSchema* ps = getTableSchemaImpl(pReader, uid);
if (ps == NULL) {
return NULL;
}
}
if (pReader->info.pSchema && sversion == pReader->info.pSchema->version) {
return pReader->info.pSchema;
}
void** p = tSimpleHashGet(pReader->pSchemaMap, &sversion, sizeof(sversion));
if (p != NULL) {
return *(STSchema**)p;
}
STSchema* ptr = NULL;
int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->info.suid, uid, sversion, &ptr);
if (code != TSDB_CODE_SUCCESS) {
terrno = code;
return NULL;
} else {
code = tSimpleHashPut(pReader->pSchemaMap, &sversion, sizeof(sversion), &ptr, POINTER_BYTES);
if (code != TSDB_CODE_SUCCESS) {
terrno = code;
return NULL;
}
return ptr;
}
}
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
SIterInfo* pIter, SSttBlockReader* pSttBlockReader) {
SRowMerger* pMerger = &pReader->status.merger;
SRow* pTSRow = NULL;
SBlockData* pBlockData = &pReader->status.fileBlockData;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
int32_t pkSrcSlot = pReader->suppInfo.pkSrcSlot;
SRowKey* pSttKey = NULL;
if (hasDataInSttBlock(pBlockScanInfo) && (!pBlockScanInfo->cleanSttBlocks)) {
pSttKey = getCurrentKeyInSttBlock(pSttBlockReader);
}
SRowKey k = {0};
tRowGetKeyEx(pRow, &k);
STSchema* pSchema = NULL;
if (pRow->type == TSDBROW_ROW_FMT) {
pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
if (pSchema == NULL) {
return terrno;
}
}
SRowKey* pfKey = &(SRowKey){0};
if (hasDataInFileBlock(pBlockData, pDumpInfo)) {
tColRowGetKey(pBlockData, pDumpInfo->rowIndex, pfKey);
} else {
pfKey = NULL;
}
TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
// merge is not initialized yet, due to the fact that the pReader->info.pSchema is not initialized
int32_t code = initRowMergeIfNeeded(pReader, pBlockScanInfo->uid);
if (code != 0) {
return code;
}
SRowKey minKey = k;
if (pReader->info.order == TSDB_ORDER_ASC) {
if (pfKey != NULL && pkCompEx(pfKey, &minKey) < 0) {
minKey = *pfKey;
}
if (pSttKey != NULL && pkCompEx(pSttKey, &minKey) < 0) {
minKey = *pSttKey;
}
} else {
if (pfKey != NULL && pkCompEx(pfKey, &minKey) > 0) {
minKey = *pfKey;
}
if (pSttKey != NULL && pkCompEx(pSttKey, &minKey) > 0) {
minKey = *pSttKey;
}
}
// copy the last key before the time of stt reader loading the next stt block, in which the underlying data block may
// be changed, resulting in the corresponding changing of the value of sttRowKey
tRowKeyAssign(&pBlockScanInfo->lastProcKey, &minKey);
// file block ---> stt block -----> mem
if (pkCompEx(&minKey, pfKey) == 0) {
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pfKey, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
if (pkCompEx(&minKey, pSttKey) == 0) {
TSDBROW* fRow1 = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
code = tsdbRowMergerAdd(pMerger, fRow1, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInSttBlock(pSttBlockReader, pBlockScanInfo, pMerger, pkSrcSlot, &pReader->info.verRange, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
if (pkCompEx(&minKey, &k) == 0) {
code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, &k, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
code = tsdbRowMergerGetRow(pMerger, &pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
taosMemoryFree(pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
static int32_t mergeFileBlockAndSttBlock(STsdbReader* pReader, SSttBlockReader* pSttBlockReader, SRowKey* pKey,
STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SRowMerger* pMerger = &pReader->status.merger;
int32_t code = TSDB_CODE_SUCCESS;
int32_t pkSrcSlot = pReader->suppInfo.pkSrcSlot;
// merge is not initialized yet, due to the fact that the pReader->info.pSchema is not initialized
code = initRowMergeIfNeeded(pReader, pBlockScanInfo->uid);
if (code != 0) {
return code;
}
bool dataInDataFile = hasDataInFileBlock(pBlockData, pDumpInfo);
bool dataInSttFile = hasDataInSttBlock(pBlockScanInfo);
if (dataInDataFile && (!dataInSttFile)) {
// no stt file block available, only data block exists
return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
} else if ((!dataInDataFile) && dataInSttFile) {
// no data in data file exists
return mergeRowsInSttBlocks(pSttBlockReader, pBlockScanInfo, pReader);
} else if (pBlockScanInfo->cleanSttBlocks && pReader->info.execMode == READER_EXEC_ROWS) {
// opt model for count data in stt file, which is not overlap with data blocks in files.
return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
} else {
// row in both stt file blocks and data file blocks
TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
SRowKey* pSttKey = getCurrentKeyInSttBlock(pSttBlockReader);
int32_t ret = pkCompEx(pKey, pSttKey);
if (ASCENDING_TRAVERSE(pReader->info.order)) {
if (ret < 0) { // asc
return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
} else if (ret > 0) {
return mergeRowsInSttBlocks(pSttBlockReader, pBlockScanInfo, pReader);
}
} else { // desc
if (ret > 0) {
return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
} else if (ret < 0) {
return mergeRowsInSttBlocks(pSttBlockReader, pBlockScanInfo, pReader);
}
}
// pKey == pSttKey
tRowKeyAssign(&pBlockScanInfo->lastProcKey, pKey);
// the following for key == sttKey->key.ts
// file block ------> stt block
SRow* pTSRow = NULL;
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
TSDBROW* pRow1 = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
// pSttKey will be changed when sttBlockReader iterates to the next row, so use pKey instead.
code = doMergeRowsInSttBlock(pSttBlockReader, pBlockScanInfo, pMerger, pkSrcSlot, &pReader->info.verRange, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerGetRow(pMerger, &pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
taosMemoryFree(pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
}
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
SSttBlockReader* pSttBlockReader) {
SRowMerger* pMerger = &pReader->status.merger;
SRow* pTSRow = NULL;
int32_t code = TSDB_CODE_SUCCESS;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SArray* pDelList = pBlockScanInfo->delSkyline;
int32_t pkSrcSlot = pReader->suppInfo.pkSrcSlot;
TSDBROW* pRow = NULL;
TSDBROW* piRow = NULL;
getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader, &pRow);
getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader, &piRow);
SRowKey* pSttKey = NULL;
if (hasDataInSttBlock(pBlockScanInfo) && (!pBlockScanInfo->cleanSttBlocks)) {
pSttKey = getCurrentKeyInSttBlock(pSttBlockReader);
}
SRowKey* pfKey = &(SRowKey){0};
if (hasDataInFileBlock(pBlockData, pDumpInfo)) {
tColRowGetKey(pBlockData, pDumpInfo->rowIndex, pfKey);
} else {
pfKey = NULL;
}
SRowKey k = {0}, ik = {0};
tRowGetKeyEx(pRow, &k);
tRowGetKeyEx(piRow, &ik);
STSchema* pSchema = NULL;
if (pRow->type == TSDBROW_ROW_FMT) {
pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
if (pSchema == NULL) {
return terrno;
}
}
STSchema* piSchema = NULL;
if (piRow->type == TSDBROW_ROW_FMT) {
piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
if (piSchema == NULL) {
return terrno;
}
}
// merge is not initialized yet, due to the fact that the pReader->info.pSchema is not initialized
code = initRowMergeIfNeeded(pReader, pBlockScanInfo->uid);
if (code != 0) {
return code;
}
SRowKey minKey = k;
if (ASCENDING_TRAVERSE(pReader->info.order)) {
if (pkCompEx(&ik, &minKey) < 0) { // minKey > ik.key.ts) {
minKey = ik;
}
if ((pfKey != NULL) && (pkCompEx(pfKey, &minKey) < 0)) {
minKey = *pfKey;
}
if ((pSttKey != NULL) && (pkCompEx(pSttKey, &minKey) < 0)) {
minKey = *pSttKey;
}
} else {
if (pkCompEx(&ik, &minKey) > 0) {
minKey = ik;
}
if ((pfKey != NULL) && (pkCompEx(pfKey, &minKey) > 0)) {
minKey = *pfKey;
}
if ((pSttKey != NULL) && (pkCompEx(pSttKey, &minKey) > 0)) {
minKey = *pSttKey;
}
}
tRowKeyAssign(&pBlockScanInfo->lastProcKey, &minKey);
// file block -----> stt block -----> imem -----> mem
if (pkCompEx(&minKey, pfKey) == 0) {
TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pfKey, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
if (pkCompEx(&minKey, pSttKey) == 0) {
TSDBROW* pRow1 = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInSttBlock(pSttBlockReader, pBlockScanInfo, pMerger, pkSrcSlot, &pReader->info.verRange, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
if (pkCompEx(&minKey, &ik) == 0) {
code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, &ik, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
if (pkCompEx(&minKey, &k) == 0) {
code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, &k, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
code = tsdbRowMergerGetRow(pMerger, &pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
taosMemoryFree(pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
int32_t doInitMemDataIter(STsdbReader* pReader, STbData** pData, STableBlockScanInfo* pBlockScanInfo, STsdbRowKey* pKey,
SMemTable* pMem, SIterInfo* pIter, const char* type) {
int32_t code = TSDB_CODE_SUCCESS;
int32_t backward = (!ASCENDING_TRAVERSE(pReader->info.order));
pIter->hasVal = false;
if (pMem != NULL) {
*pData = tsdbGetTbDataFromMemTable(pMem, pReader->info.suid, pBlockScanInfo->uid);
if ((*pData) != NULL) {
code = tsdbTbDataIterCreate((*pData), pKey, backward, &pIter->iter);
if (code == TSDB_CODE_SUCCESS) {
pIter->hasVal = (tsdbTbDataIterGet(pIter->iter) != NULL);
tsdbDebug("%p uid:%" PRIu64 ", check data in %s from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
"-%" PRId64 " %s",
pReader, pBlockScanInfo->uid, type, pKey->key.ts, pReader->info.order, (*pData)->minKey,
(*pData)->maxKey, pReader->idStr);
} else {
tsdbError("%p uid:%" PRIu64 ", failed to create iterator for %s, code:%s, %s", pReader, pBlockScanInfo->uid,
type, tstrerror(code), pReader->idStr);
return code;
}
}
} else {
tsdbDebug("%p uid:%" PRIu64 ", no data in %s, %s", pReader, pBlockScanInfo->uid, type, pReader->idStr);
}
return code;
}
static void doForwardDataIter(SRowKey* pKey, SIterInfo* pIter, STableBlockScanInfo* pBlockScanInfo,
STsdbReader* pReader) {
SRowKey rowKey = {0};
TSDBROW* pRow = NULL;
while (1) {
getValidMemRow(pIter, pBlockScanInfo->delSkyline, pReader, &pRow);
if (!pIter->hasVal) {
break;
}
tRowGetKeyEx(pRow, &rowKey);
int32_t ret = pkCompEx(pKey, &rowKey);
if (ret == 0) {
pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
} else {
break;
}
}
}
// handle the open interval issue. Find the first row key that is greater than the given one.
static void forwardDataIter(SRowKey* pKey, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
doForwardDataIter(pKey, &pBlockScanInfo->iter, pBlockScanInfo, pReader);
doForwardDataIter(pKey, &pBlockScanInfo->iiter, pBlockScanInfo, pReader);
}
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
STbData* d = NULL;
STbData* di = NULL;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
bool forward = true;
STsdbReadSnap* pSnap = pReader->pReadSnap;
STimeWindow* pWindow = &pReader->info.window;
if (pBlockScanInfo->iterInit) {
return TSDB_CODE_SUCCESS;
}
STsdbRowKey startKey;
startKey.key = pBlockScanInfo->lastProcKey;
startKey.version = asc ? pReader->info.verRange.minVer : pReader->info.verRange.maxVer;
if ((asc && (startKey.key.ts < pWindow->skey)) || ((!asc) && startKey.key.ts > pWindow->ekey)) {
startKey.key.ts = asc ? pWindow->skey : pWindow->ekey;
forward = false;
}
int32_t code = doInitMemDataIter(pReader, &d, pBlockScanInfo, &startKey, pSnap->pMem, &pBlockScanInfo->iter, "mem");
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doInitMemDataIter(pReader, &di, pBlockScanInfo, &startKey, pSnap->pIMem, &pBlockScanInfo->iiter, "imem");
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = loadMemTombData(&pBlockScanInfo->pMemDelData, d, di, pReader->info.verRange.maxVer);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (forward) {
forwardDataIter(&startKey.key, pBlockScanInfo, pReader);
}
pBlockScanInfo->iterInit = true;
return TSDB_CODE_SUCCESS;
}
static bool isValidFileBlockRow(SBlockData* pBlockData, int32_t rowIndex, STableBlockScanInfo* pBlockScanInfo, bool asc,
STsdbReaderInfo* pInfo, STsdbReader* pReader) {
// it is an multi-table data block
if (pBlockData->aUid != NULL) {
uint64_t uid = pBlockData->aUid[rowIndex];
if (uid != pBlockScanInfo->uid) { // move to next row
return false;
}
}
// check for version and time range
int64_t ver = pBlockData->aVersion[rowIndex];
if (ver > pInfo->verRange.maxVer || ver < pInfo->verRange.minVer) {
return false;
}
int64_t ts = pBlockData->aTSKEY[rowIndex];
if (ts > pInfo->window.ekey || ts < pInfo->window.skey) {
return false;
}
if ((asc && (ts < pBlockScanInfo->lastProcKey.ts)) || ((!asc) && (ts > pBlockScanInfo->lastProcKey.ts))) {
return false;
}
if (ts == pBlockScanInfo->lastProcKey.ts) { // todo opt perf
SRowKey nextRowKey; // lazy eval
tColRowGetKey(pBlockData, rowIndex, &nextRowKey);
if (pkCompEx(&pBlockScanInfo->lastProcKey, &nextRowKey) == 0) {
return false;
}
}
if (pBlockScanInfo->delSkyline != NULL && TARRAY_SIZE(pBlockScanInfo->delSkyline) > 0) {
bool dropped = hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, pInfo->order,
&pInfo->verRange, pReader->suppInfo.numOfPks > 0);
if (dropped) {
return false;
}
}
return true;
}
static void initSttBlockReader(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
int32_t order = pReader->info.order;
bool asc = ASCENDING_TRAVERSE(order);
// the stt block reader has been initialized for this table.
if (pSttBlockReader->uid == pScanInfo->uid) {
return;
}
if (pSttBlockReader->uid != 0) {
tMergeTreeClose(&pSttBlockReader->mergeTree);
}
pSttBlockReader->uid = pScanInfo->uid;
// second or third time init stt block reader
if (pScanInfo->cleanSttBlocks && (pReader->info.execMode == READER_EXEC_ROWS)) {
// only allowed to retrieve clean stt blocks for count once
if (pScanInfo->sttBlockReturned) {
pScanInfo->sttKeyInfo.status = STT_FILE_NO_DATA;
tsdbDebug("uid:%" PRIu64 " set no stt-file data after stt-block retrieved, %s", pScanInfo->uid, pReader->idStr);
}
return;
}
STimeWindow w = pSttBlockReader->window;
if (asc) {
w.skey = pScanInfo->sttKeyInfo.nextProcKey.ts;
} else {
w.ekey = pScanInfo->sttKeyInfo.nextProcKey.ts;
}
int64_t st = taosGetTimestampUs();
tsdbDebug("init stt block reader, window:%" PRId64 "-%" PRId64 ", uid:%" PRIu64 ", %s", w.skey, w.ekey,
pScanInfo->uid, pReader->idStr);
SMergeTreeConf conf = {
.uid = pScanInfo->uid,
.suid = pReader->info.suid,
.pTsdb = pReader->pTsdb,
.timewindow = w,
.verRange = pSttBlockReader->verRange,
.strictTimeRange = false,
.pSchema = pReader->info.pSchema,
.pCurrentFileset = pReader->status.pCurrentFileset,
.backward = (pSttBlockReader->order == TSDB_ORDER_DESC),
.pSttFileBlockIterArray = pReader->status.pLDataIterArray,
.pCols = pReader->suppInfo.colId,
.numOfCols = pReader->suppInfo.numOfCols,
.loadTombFn = loadSttTombDataForAll,
.pCurRowKey = &pScanInfo->sttKeyInfo.nextProcKey,
.pReader = pReader,
.idstr = pReader->idStr,
.rspRows = (pReader->info.execMode == READER_EXEC_ROWS),
};
SSttDataInfoForTable info = {.pKeyRangeList = taosArrayInit(4, sizeof(SSttKeyRange))};
if (info.pKeyRangeList == NULL) {
pReader->code = terrno;
return;
}
int32_t code = tMergeTreeOpen2(&pSttBlockReader->mergeTree, &conf, &info);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(info.pKeyRangeList);
pReader->code = code;
return;
}
code = initMemDataIterator(pScanInfo, pReader);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(info.pKeyRangeList);
pReader->code = code;
return;
}
code = initDelSkylineIterator(pScanInfo, pReader->info.order, &pReader->cost);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(info.pKeyRangeList);
pReader->code = code;
return;
}
if (conf.rspRows) {
pScanInfo->cleanSttBlocks = isCleanSttBlock(info.pKeyRangeList, &pReader->info.window, pScanInfo, order);
if (pScanInfo->cleanSttBlocks) {
pScanInfo->numOfRowsInStt = info.numOfRows;
// calculate the time window for data in stt files
for (int32_t i = 0; i < taosArrayGetSize(info.pKeyRangeList); ++i) {
SSttKeyRange* pKeyRange = taosArrayGet(info.pKeyRangeList, i);
if (pKeyRange == NULL) {
continue;
}
if (pkCompEx(&pScanInfo->sttRange.skey, &pKeyRange->skey) > 0) {
tRowKeyAssign(&pScanInfo->sttRange.skey, &pKeyRange->skey);
}
if (pkCompEx(&pScanInfo->sttRange.ekey, &pKeyRange->ekey) < 0) {
tRowKeyAssign(&pScanInfo->sttRange.ekey, &pKeyRange->ekey);
}
}
pScanInfo->sttKeyInfo.status = taosArrayGetSize(info.pKeyRangeList) ? STT_FILE_HAS_DATA : STT_FILE_NO_DATA;
SRowKey* p = asc ? &pScanInfo->sttRange.skey : &pScanInfo->sttRange.ekey;
tRowKeyAssign(&pScanInfo->sttKeyInfo.nextProcKey, p);
} else { // not clean stt blocks
INIT_KEYRANGE(&pScanInfo->sttRange); // reset the time window
code = nextRowFromSttBlocks(pSttBlockReader, pScanInfo, pReader->suppInfo.pkSrcSlot, &pReader->info.verRange);
}
} else {
pScanInfo->cleanSttBlocks = false;
INIT_KEYRANGE(&pScanInfo->sttRange); // reset the time window
code = nextRowFromSttBlocks(pSttBlockReader, pScanInfo, pReader->suppInfo.pkSrcSlot, &pReader->info.verRange);
}
pScanInfo->sttBlockReturned = false;
taosArrayDestroy(info.pKeyRangeList);
int64_t el = taosGetTimestampUs() - st;
pReader->cost.initSttBlockReader += (el / 1000.0);
tsdbDebug("init stt block reader completed, elapsed time:%" PRId64 "us %s", el, pReader->idStr);
if (code != 0) {
pReader->code = code;
}
}
static bool hasDataInSttBlock(STableBlockScanInfo* pInfo) { return pInfo->sttKeyInfo.status == STT_FILE_HAS_DATA; }
bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo) {
if ((pBlockData->nRow > 0) && (pBlockData->nRow != pDumpInfo->totalRows)) {
return false; // this is an invalid result.
}
return pBlockData->nRow > 0 && (!pDumpInfo->allDumped);
}
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, SRowKey* pKey,
STsdbReader* pReader) {
SRowMerger* pMerger = &pReader->status.merger;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
bool copied = false;
int32_t code = tryCopyDistinctRowFromFileBlock(pReader, pBlockData, pKey, pDumpInfo, &copied);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
// merge is not initialized yet, due to the fact that the pReader->info.pSchema is not initialized
code = initRowMergeIfNeeded(pReader, pBlockScanInfo->uid);
if (code != 0) {
return code;
}
tRowKeyAssign(&pBlockScanInfo->lastProcKey, pKey);
if (copied) {
return TSDB_CODE_SUCCESS;
} else {
TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
SRow* pTSRow = NULL;
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pKey, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerGetRow(pMerger, &pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
taosMemoryFree(pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
}
int32_t mergeRowsInSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
bool copied = false;
SRow* pTSRow = NULL;
int32_t pkSrcSlot = pReader->suppInfo.pkSrcSlot;
SRowMerger* pMerger = &pReader->status.merger;
// let's record the last processed key
tRowKeyAssign(&pScanInfo->lastProcKey, getCurrentKeyInSttBlock(pSttBlockReader));
TSDBROW* pRow = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
TSDBROW fRow = {.iRow = pRow->iRow, .type = TSDBROW_COL_FMT, .pBlockData = pRow->pBlockData};
if (IS_VAR_DATA_TYPE(pScanInfo->lastProcKey.pks[0].type)) {
tsdbTrace("fRow ptr:%p, %d, uid:%" PRIu64 ", ts:%" PRId64 " pk:%s %s", pRow->pBlockData, pRow->iRow,
pSttBlockReader->uid, fRow.pBlockData->aTSKEY[fRow.iRow], pScanInfo->lastProcKey.pks[0].pData,
pReader->idStr);
}
int32_t code =
tryCopyDistinctRowFromSttBlock(&fRow, pSttBlockReader, pScanInfo, &pScanInfo->lastProcKey, pReader, &copied);
if (code) {
return code;
}
if (copied) {
return TSDB_CODE_SUCCESS;
} else {
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
TSDBROW* pRow1 = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInSttBlock(pSttBlockReader, pScanInfo, pMerger, pkSrcSlot, &pReader->info.verRange, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerGetRow(pMerger, &pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pScanInfo);
taosMemoryFree(pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
}
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
SBlockData* pBlockData, SSttBlockReader* pSttBlockReader) {
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
TSDBROW *pRow = NULL, *piRow = NULL;
SRowKey* pKey = &(SRowKey){0};
if (hasDataInFileBlock(pBlockData, pDumpInfo)) {
tColRowGetKey(pBlockData, pDumpInfo->rowIndex, pKey);
} else {
pKey = NULL;
}
if (pBlockScanInfo->iter.hasVal) {
getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader, &pRow);
}
if (pBlockScanInfo->iiter.hasVal) {
getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader, &piRow);
}
// two levels of mem-table does contain the valid rows
if (pRow != NULL && piRow != NULL) {
return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pSttBlockReader);
}
// imem + file + stt block
if (pBlockScanInfo->iiter.hasVal) {
return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, pSttBlockReader);
}
// mem + file + stt block
if (pBlockScanInfo->iter.hasVal) {
return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, pSttBlockReader);
}
// files data blocks + stt block
return mergeFileBlockAndSttBlock(pReader, pSttBlockReader, pKey, pBlockScanInfo, pBlockData);
}
static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pBlockScanInfo,
STsdbReader* pReader, bool* loadNeighbor) {
int32_t code = TSDB_CODE_SUCCESS;
int32_t order = pReader->info.order;
SDataBlockIter* pIter = &pReader->status.blockIter;
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
int32_t nextIndex = -1;
SBrinRecord rec = {0};
*loadNeighbor = false;
bool hasNeighbor = getNeighborBlockOfTable(pIter, pBlockInfo, pBlockScanInfo, &nextIndex, order, &rec, pSupInfo);
if (!hasNeighbor) { // do nothing
return code;
}
// load next block
if (overlapWithNeighborBlock2(pBlockInfo, &rec, order, pReader->suppInfo.pk.type, pReader->suppInfo.numOfPks)) {
SReaderStatus* pStatus = &pReader->status;
SDataBlockIter* pBlockIter = &pStatus->blockIter;
// 1. find the next neighbor block in the scan block list
STableDataBlockIdx* tableDataBlockIdx = taosArrayGet(pBlockScanInfo->pBlockIdxList, nextIndex);
if (tableDataBlockIdx == NULL) {
return TSDB_CODE_INVALID_PARA;
}
// 2. remove it from the scan block list
int32_t neighborIndex = tableDataBlockIdx->globalIndex;
code = setFileBlockActiveInBlockIter(pReader, pBlockIter, neighborIndex, step);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
// 3. load the neighbor block, and set it to be the currently accessed file data block
code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pBlockInfo->uid);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
// 4. check the data values
initBlockDumpInfo(pReader, pBlockIter);
*loadNeighbor = true;
}
return code;
}
void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo) {
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
pResBlock->info.id.uid = (pBlockScanInfo != NULL) ? pBlockScanInfo->uid : 0;
pResBlock->info.dataLoad = 1;
pResBlock->info.version = pReader->info.verRange.maxVer;
int32_t code = blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);
code = blockDataUpdatePkRange(pResBlock, pReader->suppInfo.pkDstSlot, ASCENDING_TRAVERSE(pReader->info.order));
setComposedBlockFlag(pReader, true);
pReader->cost.composedBlocks += 1;
pReader->cost.buildComposedBlockTime += el;
}
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
int32_t code = TSDB_CODE_SUCCESS;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int64_t st = taosGetTimestampUs();
int32_t step = asc ? 1 : -1;
double el = 0;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
SFileDataBlockInfo* pBlockInfo = NULL;
SSttBlockReader* pSttBlockReader = pReader->status.fileIter.pSttBlockReader;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
code = getCurrentBlockInfo(&pReader->status.blockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return 0;
}
if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
setBlockAllDumped(pDumpInfo, pBlockInfo->lastKey, pReader->info.order);
return code;
}
STableBlockScanInfo* pBlockScanInfo = NULL;
code = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, &pBlockScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
TSDBKEY keyInBuf = getCurrentKeyInBuf(pBlockScanInfo, pReader);
// it is a clean block, load it directly
int64_t cap = pReader->resBlockInfo.capacity;
bool directCopy = isCleanFileDataBlock(pReader, pBlockInfo, pBlockScanInfo, keyInBuf) &&
(pBlockInfo->numRow <= cap) && (pBlockScanInfo->sttKeyInfo.status == STT_FILE_NO_DATA) &&
((asc && ((pBlockInfo->lastKey < keyInBuf.ts) || (keyInBuf.ts == INT64_MIN))) ||
(!asc && (pBlockInfo->lastKey > keyInBuf.ts)));
if (directCopy) {
code = copyBlockDataToSDataBlock(pReader, &pBlockScanInfo->lastProcKey);
goto _end;
}
SBlockData* pBlockData = &pReader->status.fileBlockData;
initSttBlockReader(pSttBlockReader, pBlockScanInfo, pReader);
if (pReader->code != 0) {
code = pReader->code;
goto _end;
}
while (1) {
bool hasBlockData = false;
{
while (pBlockData->nRow > 0 && pBlockData->uid == pBlockScanInfo->uid) {
// find the first qualified row in data block
if (isValidFileBlockRow(pBlockData, pDumpInfo->rowIndex, pBlockScanInfo, asc, &pReader->info, pReader)) {
hasBlockData = true;
break;
}
pDumpInfo->rowIndex += step;
if (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0) {
// NOTE: get the new block info
code = getCurrentBlockInfo(&pReader->status.blockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
goto _end;
}
// continue check for the next file block if the last ts in the current block
// is overlapped with the next neighbor block
bool loadNeighbor = false;
code = loadNeighborIfOverlap(pBlockInfo, pBlockScanInfo, pReader, &loadNeighbor);
if ((!loadNeighbor) || (code != 0)) {
setBlockAllDumped(pDumpInfo, pBlockInfo->lastKey, pReader->info.order);
break;
}
}
}
}
// no data in last block and block, no need to proceed.
if (hasBlockData == false) {
break;
}
code = buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pSttBlockReader);
if (code) {
goto _end;
}
// currently loaded file data block is consumed
if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
setBlockAllDumped(pDumpInfo, pBlockInfo->lastKey, pReader->info.order);
break;
}
if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
break;
}
}
_end:
el = (taosGetTimestampUs() - st) / 1000.0;
updateComposedBlockInfo(pReader, el, pBlockScanInfo);
if (pResBlock->info.rows > 0) {
tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
", elapsed time:%.2f ms %s",
pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
pResBlock->info.rows, el, pReader->idStr);
}
return code;
}
void setComposedBlockFlag(STsdbReader* pReader, bool composed) { pReader->status.composedDataBlock = composed; }
int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order) {
if (pDelSkyline == NULL) {
return 0;
}
return ASCENDING_TRAVERSE(order) ? 0 : taosArrayGetSize(pDelSkyline) - 1;
}
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, int32_t order, SReadCostSummary* pCost) {
int32_t code = 0;
int32_t newDelDataInFile = taosArrayGetSize(pBlockScanInfo->pFileDelData);
if (newDelDataInFile == 0 &&
((pBlockScanInfo->delSkyline != NULL) || (TARRAY_SIZE(pBlockScanInfo->pMemDelData) == 0))) {
return code;
}
int64_t st = taosGetTimestampUs();
if (pBlockScanInfo->delSkyline != NULL) {
taosArrayClear(pBlockScanInfo->delSkyline);
} else {
pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
if (pBlockScanInfo->delSkyline == NULL) {
return terrno;
}
}
SArray* pSource = pBlockScanInfo->pFileDelData;
if (pSource == NULL) {
pSource = pBlockScanInfo->pMemDelData;
} else {
void* p1 = taosArrayAddAll(pSource, pBlockScanInfo->pMemDelData);
if (p1 == NULL) {
return terrno;
}
}
code = tsdbBuildDeleteSkyline(pSource, 0, taosArrayGetSize(pSource) - 1, pBlockScanInfo->delSkyline);
taosArrayClear(pBlockScanInfo->pFileDelData);
int32_t index = getInitialDelIndex(pBlockScanInfo->delSkyline, order);
pBlockScanInfo->iter.index = index;
pBlockScanInfo->iiter.index = index;
pBlockScanInfo->fileDelIndex = index;
pBlockScanInfo->sttBlockDelIndex = index;
double el = taosGetTimestampUs() - st;
pCost->createSkylineIterTime = el / 1000.0;
return code;
}
TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
TSDBKEY key = {.ts = TSKEY_INITIAL_VAL}, ikey = {.ts = TSKEY_INITIAL_VAL};
bool hasKey = false, hasIKey = false;
TSDBROW* pRow = NULL;
TSDBROW* pIRow = NULL;
getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader, &pRow);
if (pRow != NULL) {
hasKey = true;
key = TSDBROW_KEY(pRow);
}
getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader, &pIRow);
if (pIRow != NULL) {
hasIKey = true;
ikey = TSDBROW_KEY(pIRow);
}
if (hasKey) {
if (hasIKey) { // has data in mem & imem
if (asc) {
return key.ts <= ikey.ts ? key : ikey;
} else {
return key.ts <= ikey.ts ? ikey : key;
}
} else { // no data in imem
return key;
}
} else {
// no data in mem & imem, return the initial value
// only imem has data, return ikey
return ikey;
}
}
static void prepareDurationForNextFileSet(STsdbReader* pReader) {
if (pReader->status.bProcMemFirstFileset) {
pReader->status.prevFilesetStartKey = INT64_MIN;
pReader->status.prevFilesetEndKey = INT64_MAX;
pReader->status.bProcMemFirstFileset = false;
}
int32_t fid = pReader->status.pCurrentFileset->fid;
STimeWindow winFid = {0};
tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &winFid.skey, &winFid.ekey);
if (ASCENDING_TRAVERSE(pReader->info.order)) {
pReader->status.bProcMemPreFileset = !(pReader->status.memTableMaxKey < pReader->status.prevFilesetStartKey ||
(winFid.skey - 1) < pReader->status.memTableMinKey);
} else {
pReader->status.bProcMemPreFileset = !(pReader->status.memTableMaxKey < (winFid.ekey + 1) ||
pReader->status.prevFilesetEndKey < pReader->status.memTableMinKey);
}
if (pReader->status.bProcMemPreFileset) {
tsdbDebug("will start pre-fileset %d buffer processing. %s", fid, pReader->idStr);
pReader->status.procMemUidList.tableUidList = pReader->status.uidList.tableUidList;
resetPreFilesetMemTableListIndex(&pReader->status);
}
if (!pReader->status.bProcMemPreFileset) {
if (pReader->notifyFn) {
STsdReaderNotifyInfo info = {0};
info.duration.filesetId = fid;
pReader->notifyFn(TSD_READER_NOTIFY_DURATION_START, &info, pReader->notifyParam);
tsdbDebug("new duration %d start notification when no buffer preceeding fileset, %s", fid, pReader->idStr);
}
}
pReader->status.prevFilesetStartKey = winFid.skey;
pReader->status.prevFilesetEndKey = winFid.ekey;
}
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum, SArray* pTableList) {
SReaderStatus* pStatus = &pReader->status;
pBlockNum->numOfBlocks = 0;
pBlockNum->numOfSttFiles = 0;
size_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBrinBlk));
if (pIndexList == NULL) {
return terrno;
}
while (1) {
// only check here, since the iterate data in memory is very fast.
if (pReader->code != TSDB_CODE_SUCCESS) {
tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", tstrerror(pReader->code), pReader->idStr);
return pReader->code;
}
bool hasNext = false;
int32_t code = filesetIteratorNext(&pStatus->fileIter, pReader, &hasNext);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(pIndexList);
return code;
}
if (!hasNext) { // no data files on disk
break;
}
taosArrayClear(pIndexList);
code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(pIndexList);
return code;
}
if (taosArrayGetSize(pIndexList) > 0 || pReader->status.pCurrentFileset->lvlArr->size > 0) {
code = loadFileBlockBrinInfo(pReader, pIndexList, pBlockNum, pTableList);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(pIndexList);
return code;
}
if (pBlockNum->numOfBlocks + pBlockNum->numOfSttFiles > 0) {
if (pReader->bFilesetDelimited) {
prepareDurationForNextFileSet(pReader);
}
break;
}
}
// no blocks in current file, try next files
}
taosArrayDestroy(pIndexList);
return loadDataFileTombDataForAll(pReader);
}
// pTableIter can be NULL, no need to handle the return value
static int32_t resetTableListIndex(SReaderStatus* pStatus, const char* id) {
STableUidList* pList = &pStatus->uidList;
pList->currentIndex = 0;
uint64_t uid = pList->tableUidList[0];
pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &uid, sizeof(uid));
if (pStatus->pTableIter == NULL) {
tsdbError("%s failed to load tableBlockScanInfo for uid:%"PRId64", code: internal error", id, uid);
return TSDB_CODE_INTERNAL_ERROR;
}
return 0;
}
static void resetPreFilesetMemTableListIndex(SReaderStatus* pStatus) {
STableUidList* pList = &pStatus->procMemUidList;
pList->currentIndex = 0;
uint64_t uid = pList->tableUidList[0];
pStatus->pProcMemTableIter = tSimpleHashGet(pStatus->pTableMap, &uid, sizeof(uid));
}
static bool moveToNextTable(STableUidList* pOrderedCheckInfo, SReaderStatus* pStatus) {
pOrderedCheckInfo->currentIndex += 1;
if (pOrderedCheckInfo->currentIndex >= tSimpleHashGetSize(pStatus->pTableMap)) {
pStatus->pTableIter = NULL;
return false;
}
uint64_t uid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &uid, sizeof(uid));
return (pStatus->pTableIter != NULL);
}
static bool moveToNextTableForPreFileSetMem(SReaderStatus* pStatus) {
STableUidList* pUidList = &pStatus->procMemUidList;
pUidList->currentIndex += 1;
if (pUidList->currentIndex >= tSimpleHashGetSize(pStatus->pTableMap)) {
pStatus->pProcMemTableIter = NULL;
return false;
}
uint64_t uid = pUidList->tableUidList[pUidList->currentIndex];
pStatus->pProcMemTableIter = tSimpleHashGet(pStatus->pTableMap, &uid, sizeof(uid));
return (pStatus->pProcMemTableIter != NULL);
}
static int32_t buildCleanBlockFromSttFiles(STsdbReader* pReader, STableBlockScanInfo* pScanInfo) {
SReaderStatus* pStatus = &pReader->status;
SSttBlockReader* pSttBlockReader = pStatus->fileIter.pSttBlockReader;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
SDataBlockInfo* pInfo = &pResBlock->info;
int32_t code = blockDataEnsureCapacity(pResBlock, pScanInfo->numOfRowsInStt);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
pInfo->rows = pScanInfo->numOfRowsInStt;
pInfo->id.uid = pScanInfo->uid;
pInfo->dataLoad = 1;
pInfo->window.skey = pScanInfo->sttRange.skey.ts;
pInfo->window.ekey = pScanInfo->sttRange.ekey.ts;
setComposedBlockFlag(pReader, true);
pScanInfo->sttKeyInfo.nextProcKey.ts = asc ? pScanInfo->sttRange.ekey.ts + 1 : pScanInfo->sttRange.skey.ts - 1;
pScanInfo->sttKeyInfo.status = STT_FILE_NO_DATA;
if (asc) {
tRowKeyAssign(&pScanInfo->lastProcKey, &pScanInfo->sttRange.ekey);
} else {
tRowKeyAssign(&pScanInfo->lastProcKey, &pScanInfo->sttRange.skey);
}
pScanInfo->sttBlockReturned = true;
pSttBlockReader->mergeTree.pIter = NULL;
tsdbDebug("%p uid:%" PRId64 " return clean stt block as one, brange:%" PRId64 "-%" PRId64 " rows:%" PRId64 " %s",
pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
pResBlock->info.rows, pReader->idStr);
return code;
}
static void buildCleanBlockFromDataFiles(STsdbReader* pReader, STableBlockScanInfo* pScanInfo,
SFileDataBlockInfo* pBlockInfo, int32_t blockIndex) {
// whole data block is required, return it directly
SReaderStatus* pStatus = &pReader->status;
SDataBlockInfo* pInfo = &pReader->resBlockInfo.pResBlock->info;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
pInfo->rows = pBlockInfo->numRow;
pInfo->id.uid = pScanInfo->uid;
pInfo->dataLoad = 0;
pInfo->version = pReader->info.verRange.maxVer;
pInfo->window = (STimeWindow){.skey = pBlockInfo->firstKey, .ekey = pBlockInfo->lastKey};
if (pReader->suppInfo.numOfPks > 0) {
if (IS_NUMERIC_TYPE(pReader->suppInfo.pk.type)) {
pInfo->pks[0].val = pBlockInfo->firstPk.val;
pInfo->pks[1].val = pBlockInfo->lastPk.val;
} else {
(void)memcpy(pInfo->pks[0].pData, varDataVal(pBlockInfo->firstPk.pData), varDataLen(pBlockInfo->firstPk.pData));
(void)memcpy(pInfo->pks[1].pData, varDataVal(pBlockInfo->lastPk.pData), varDataLen(pBlockInfo->lastPk.pData));
pInfo->pks[0].nData = varDataLen(pBlockInfo->firstPk.pData);
pInfo->pks[1].nData = varDataLen(pBlockInfo->lastPk.pData);
}
}
// update the last key for the corresponding table
setComposedBlockFlag(pReader, false);
setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlockInfo->lastKey, pReader->info.order);
updateLastKeyInfo(&pScanInfo->lastProcKey, pBlockInfo, pInfo, pReader->suppInfo.numOfPks, asc);
tsdbDebug("%p uid:%" PRIu64
" clean file block retrieved from file, global index:%d, "
"table index:%d, rows:%d, brange:%" PRId64 "-%" PRId64 ", %s",
pReader, pScanInfo->uid, blockIndex, pBlockInfo->tbBlockIdx, pBlockInfo->numRow, pBlockInfo->firstKey,
pBlockInfo->lastKey, pReader->idStr);
}
static int32_t doLoadSttBlockSequentially(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
SSttBlockReader* pSttBlockReader = pStatus->fileIter.pSttBlockReader;
STableUidList* pUidList = &pStatus->uidList;
int32_t code = TSDB_CODE_SUCCESS;
if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
return TSDB_CODE_SUCCESS;
}
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
while (1) {
if (pReader->code != TSDB_CODE_SUCCESS) {
tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", tstrerror(pReader->code), pReader->idStr);
return pReader->code;
}
// load the last data block of current table
STableBlockScanInfo* pScanInfo = NULL;
if (pStatus->pTableIter == NULL) {
tsdbError("table Iter is null, invalid pScanInfo, try next table %s", pReader->idStr);
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
continue;
} else {
pScanInfo = *(STableBlockScanInfo**) pStatus->pTableIter;
}
if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pScanInfo->uid, sizeof(pScanInfo->uid))) {
// reset the index in last block when handing a new file
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
continue;
}
initSttBlockReader(pSttBlockReader, pScanInfo, pReader);
if (pReader->code != TSDB_CODE_SUCCESS) {
return pReader->code;
}
if (!hasDataInSttBlock(pScanInfo)) {
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
continue;
}
// if only require the total rows, no need to load data from stt file if it is clean stt blocks
if (pReader->info.execMode == READER_EXEC_ROWS && pScanInfo->cleanSttBlocks) {
code = buildCleanBlockFromSttFiles(pReader, pScanInfo);
return code;
}
int64_t st = taosGetTimestampUs();
while (1) {
// no data in stt block and block, no need to proceed.
if (!hasDataInSttBlock(pScanInfo)) {
break;
}
code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pSttBlockReader);
if (code) {
return code;
}
if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
break;
}
}
double el = (taosGetTimestampUs() - st) / 1000.0;
updateComposedBlockInfo(pReader, el, pScanInfo);
if (pResBlock->info.rows > 0) {
tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
", elapsed time:%.2f ms %s",
pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
pResBlock->info.rows, el, pReader->idStr);
return TSDB_CODE_SUCCESS;
}
// current table is exhausted, let's try next table
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
}
}
// current active data block not overlap with the stt-files/stt-blocks
static bool notOverlapWithFiles(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pScanInfo, bool asc) {
if ((!hasDataInSttBlock(pScanInfo)) || (pScanInfo->cleanSttBlocks == true)) {
return true;
} else {
int64_t keyInStt = pScanInfo->sttKeyInfo.nextProcKey.ts;
return (asc && pBlockInfo->lastKey < keyInStt) || (!asc && pBlockInfo->firstKey > keyInStt);
}
}
static int32_t doBuildDataBlock(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
SDataBlockIter* pBlockIter = &pStatus->blockIter;
STableBlockScanInfo* pScanInfo = NULL;
SFileDataBlockInfo* pBlockInfo = NULL;
SSttBlockReader* pSttBlockReader = pReader->status.fileIter.pSttBlockReader;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int32_t code = TSDB_CODE_SUCCESS;
code = getCurrentBlockInfo(pBlockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlockInfo->lastKey, pReader->info.order);
return code;
}
if (pReader->code != TSDB_CODE_SUCCESS) {
return pReader->code;
}
code = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, &pScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pScanInfo->sttKeyInfo.status == STT_FILE_READER_UNINIT) {
initSttBlockReader(pSttBlockReader, pScanInfo, pReader);
if (pReader->code != 0) {
return pReader->code;
}
}
TSDBKEY keyInBuf = getCurrentKeyInBuf(pScanInfo, pReader);
if (fileBlockShouldLoad(pReader, pBlockInfo, pScanInfo, keyInBuf)) {
code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pScanInfo->uid);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
// build composed data block
code = buildComposedDataBlock(pReader);
} else if (bufferDataInFileBlockGap(keyInBuf, pBlockInfo, pScanInfo, pReader->info.order)) {
// data in memory that are earlier than current file block and stt blocks
// rows in buffer should be less than the file block in asc, greater than file block in desc
int64_t endKey = getBoarderKeyInFiles(pBlockInfo, pScanInfo, pReader->info.order);
code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
} else {
if (notOverlapWithFiles(pBlockInfo, pScanInfo, asc)) {
int64_t keyInStt = pScanInfo->sttKeyInfo.nextProcKey.ts;
if ((!hasDataInSttBlock(pScanInfo)) || (asc && pBlockInfo->lastKey < keyInStt) ||
(!asc && pBlockInfo->firstKey > keyInStt)) {
// the stt blocks may located in the gap of different data block, but the whole sttRange may overlap with the
// data block, so the overlap check is invalid actually.
buildCleanBlockFromDataFiles(pReader, pScanInfo, pBlockInfo, pBlockIter->index);
} else { // clean stt block
if (!(pReader->info.execMode == READER_EXEC_ROWS && pSttBlockReader->mergeTree.pIter == NULL)) {
tsdbError("tsdb reader failed at: %s:%d", __func__, __LINE__);
return TSDB_CODE_INTERNAL_ERROR;
}
code = buildCleanBlockFromSttFiles(pReader, pScanInfo);
return code;
}
} else {
SBlockData* pBData = &pReader->status.fileBlockData;
tBlockDataReset(pBData);
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
tsdbDebug("load data in stt block firstly %s", pReader->idStr);
int64_t st = taosGetTimestampUs();
// let's load data from stt files, make sure clear the cleanStt block flag before load the data from stt files
initSttBlockReader(pSttBlockReader, pScanInfo, pReader);
if (pReader->code != 0) {
return pReader->code;
}
// no data in stt block, no need to proceed.
while (hasDataInSttBlock(pScanInfo)) {
if (pScanInfo->sttKeyInfo.status != STT_FILE_HAS_DATA) {
tsdbError("tsdb reader failed at: %s:%d", __func__, __LINE__);
return TSDB_CODE_INTERNAL_ERROR;
}
code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pSttBlockReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
break;
}
// data in stt now overlaps with current active file data block, need to composed with file data block.
SRowKey* pSttKey = getCurrentKeyInSttBlock(pSttBlockReader);
if ((pSttKey->ts >= pBlockInfo->firstKey && asc) || (pSttKey->ts <= pBlockInfo->lastKey && (!asc))) {
tsdbDebug("%p lastKeyInStt:%" PRId64 ", overlap with file block, brange:%" PRId64 "-%" PRId64 " %s", pReader,
pSttKey->ts, pBlockInfo->firstKey, pBlockInfo->lastKey, pReader->idStr);
break;
}
}
double el = (taosGetTimestampUs() - st) / 1000.0;
updateComposedBlockInfo(pReader, el, pScanInfo);
if (pResBlock->info.rows > 0) {
tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
", elapsed time:%.2f ms %s",
pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
pResBlock->info.rows, el, pReader->idStr);
}
}
}
return (pReader->code != TSDB_CODE_SUCCESS) ? pReader->code : code;
}
static int32_t buildBlockFromBufferSeqForPreFileset(STsdbReader* pReader, int64_t endKey) {
SReaderStatus* pStatus = &pReader->status;
tsdbDebug("seq load data blocks from cache that preceeds fileset %d, %s", pReader->status.pCurrentFileset->fid,
pReader->idStr);
while (1) {
if (pReader->code != TSDB_CODE_SUCCESS) {
tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", tstrerror(pReader->code), pReader->idStr);
return pReader->code;
}
STableBlockScanInfo** pBlockScanInfo = pStatus->pProcMemTableIter;
if (pReader->pIgnoreTables &&
taosHashGet(*pReader->pIgnoreTables, &(*pBlockScanInfo)->uid, sizeof((*pBlockScanInfo)->uid))) {
bool hasNexTable = moveToNextTableForPreFileSetMem(pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
continue;
}
int32_t code = initMemDataIterator(*pBlockScanInfo, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = initDelSkylineIterator(*pBlockScanInfo, pReader->info.order, &pReader->cost);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->resBlockInfo.pResBlock->info.rows > 0) {
return TSDB_CODE_SUCCESS;
}
// current table is exhausted, let's try next table
bool hasNexTable = moveToNextTableForPreFileSetMem(pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
}
}
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader, int64_t endKey) {
SReaderStatus* pStatus = &pReader->status;
STableUidList* pUidList = &pStatus->uidList;
tsdbDebug("seq load data blocks from cache, %s", pReader->idStr);
while (1) {
if (pReader->code != TSDB_CODE_SUCCESS) {
tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", tstrerror(pReader->code), pReader->idStr);
return pReader->code;
}
STableBlockScanInfo** pBlockScanInfo = pStatus->pTableIter;
if (pBlockScanInfo == NULL || *pBlockScanInfo == NULL) {
return TSDB_CODE_SUCCESS;
}
if (pReader->pIgnoreTables &&
taosHashGet(*pReader->pIgnoreTables, &(*pBlockScanInfo)->uid, sizeof((*pBlockScanInfo)->uid))) {
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
continue;
}
int32_t code = initMemDataIterator(*pBlockScanInfo, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = initDelSkylineIterator(*pBlockScanInfo, pReader->info.order, &pReader->cost);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->resBlockInfo.pResBlock->info.rows > 0) {
return TSDB_CODE_SUCCESS;
}
// current table is exhausted, let's try next table
bool hasNexTable = moveToNextTable(pUidList, pStatus);
if (!hasNexTable) {
return TSDB_CODE_SUCCESS;
}
}
}
// set the correct start position in case of the first/last file block, according to the query time window
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
SFileDataBlockInfo* pBlockInfo = NULL;
SReaderStatus* pStatus = &pReader->status;
SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
int32_t code = getCurrentBlockInfo(pBlockIter, &pBlockInfo, pReader->idStr);
if (code == TSDB_CODE_SUCCESS) {
pDumpInfo->totalRows = pBlockInfo->numRow;
pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->info.order) ? 0 : pBlockInfo->numRow - 1;
} else {
pDumpInfo->totalRows = 0;
pDumpInfo->rowIndex = 0;
}
pDumpInfo->allDumped = false;
}
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
SBlockNumber num = {0};
SArray* pTableList = taosArrayInit(40, POINTER_BYTES);
if (pTableList == NULL) {
return terrno;
}
int32_t code = moveToNextFile(pReader, &num, pTableList);
if (code != TSDB_CODE_SUCCESS) {
taosArrayDestroy(pTableList);
return code;
}
// all data files are consumed, try data in buffer
if (num.numOfBlocks + num.numOfSttFiles == 0) {
pReader->status.loadFromFile = false;
taosArrayDestroy(pTableList);
return code;
}
// initialize the block iterator for a new fileset
if (num.numOfBlocks > 0) {
code = initBlockIterator(pReader, pBlockIter, num.numOfBlocks, pTableList);
} else { // no block data, only last block exists
tBlockDataReset(&pReader->status.fileBlockData);
code = resetDataBlockIterator(pBlockIter, pReader->info.order, shouldFreePkBuf(&pReader->suppInfo), pReader->idStr);
if (code) {
return code;
}
code = resetTableListIndex(&pReader->status, pReader->idStr);
if (code) {
return code;
}
}
if (code == TSDB_CODE_SUCCESS) { // set the correct start position according to the query time window
initBlockDumpInfo(pReader, pBlockIter);
}
taosArrayDestroy(pTableList);
return code;
}
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
return (!pDumpInfo->allDumped) &&
((pDumpInfo->rowIndex > 0 && asc) || (pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1) && (!asc)));
}
typedef enum {
TSDB_READ_RETURN = 0x1,
TSDB_READ_CONTINUE = 0x2,
} ERetrieveType;
static int32_t doReadDataFromSttFiles(STsdbReader* pReader, ERetrieveType* pReturnType) {
int32_t code = TSDB_CODE_SUCCESS;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
SDataBlockIter* pBlockIter = &pReader->status.blockIter;
*pReturnType = TSDB_READ_RETURN;
tsdbDebug("seq load data blocks from stt files %s", pReader->idStr);
while (1) {
code = doLoadSttBlockSequentially(pReader);
if (code != TSDB_CODE_SUCCESS) {
*pReturnType = TSDB_READ_RETURN;
return code;
}
if (pResBlock->info.rows > 0) {
*pReturnType = TSDB_READ_RETURN;
return code;
}
// all data blocks are checked in this stt file, now let's try the next file set
if (pReader->status.pTableIter != NULL) {
code = TSDB_CODE_INTERNAL_ERROR;
tsdbError("tsdb reader failed at: %s:%d, code:%s", __func__, __LINE__, tstrerror(code));
return code;
}
code = initForFirstBlockInFile(pReader, pBlockIter);
// error happens or all the data files are completely checked
if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
*pReturnType = TSDB_READ_RETURN;
return code;
}
if (pReader->status.bProcMemPreFileset) {
code = buildFromPreFilesetBuffer(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pResBlock->info.rows > 0) {
pReader->status.processingMemPreFileSet = true;
*pReturnType = TSDB_READ_RETURN;
return code;
}
}
if (pBlockIter->numOfBlocks > 0) { // there are data blocks existed.
*pReturnType = TSDB_READ_CONTINUE;
return code;
} else { // all blocks in data file are checked, let's check the data in stt-files
code = resetTableListIndex(&pReader->status, pReader->idStr);
if (code) {
return code;
}
}
}
}
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
int32_t code = TSDB_CODE_SUCCESS;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
SDataBlockIter* pBlockIter = &pReader->status.blockIter;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SBlockData* pBlockData = &pReader->status.fileBlockData;
const char* id = pReader->idStr;
if (pBlockIter->numOfBlocks == 0) {
// let's try to extract data from stt files.
ERetrieveType type = 0;
code = doReadDataFromSttFiles(pReader, &type);
if (code != 0 || type == TSDB_READ_RETURN) {
return code;
}
code = doBuildDataBlock(pReader);
if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
return code;
}
}
while (1) {
if (fileBlockPartiallyRead(pDumpInfo, asc)) { // file data block is partially loaded
code = buildComposedDataBlock(pReader);
} else {
// current block are exhausted, try the next file block
if (pDumpInfo->allDumped) {
// try next data block in current file
bool hasNext = blockIteratorNext(&pReader->status.blockIter, pReader->idStr);
if (hasNext) { // check for the next block in the block accessed order list
initBlockDumpInfo(pReader, pBlockIter);
} else {
// all data blocks in files are checked, let's check the data in last files.
// data blocks in current file are exhausted, let's try the next file now
if (pBlockData->uid != 0) {
tBlockDataClear(pBlockData);
}
tBlockDataReset(pBlockData);
code = resetDataBlockIterator(pBlockIter, pReader->info.order, shouldFreePkBuf(&pReader->suppInfo), id);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = resetTableListIndex(&pReader->status, id);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
ERetrieveType type = 0;
code = doReadDataFromSttFiles(pReader, &type);
if (code != 0 || type == TSDB_READ_RETURN) {
return code;
}
}
}
code = doBuildDataBlock(pReader);
}
if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
return code;
}
}
}
static void getTsdbByRetentions(SVnode* pVnode, SQueryTableDataCond* pCond, SRetention* retentions, const char* idStr,
int8_t* pLevel, STsdb** pTsdb) {
if (pTsdb == NULL) {
return;
}
*pTsdb = NULL;
if (VND_IS_RSMA(pVnode) && !pCond->skipRollup) {
int8_t level = 0;
int8_t precision = pVnode->config.tsdbCfg.precision;
int64_t now = taosGetTimestamp(precision);
int64_t offset = tsQueryRsmaTolerance * ((precision == TSDB_TIME_PRECISION_MILLI) ? 1L
: (precision == TSDB_TIME_PRECISION_MICRO) ? 1000L
: 1000000L);
for (int32_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
SRetention* pRetention = retentions + level;
if (pRetention->keep <= 0) {
if (level > 0) {
--level;
}
break;
}
if ((now - pRetention->keep) <= (pCond->twindows.skey + offset)) {
break;
}
++level;
}
const char* str = (idStr != NULL) ? idStr : "";
if (level == TSDB_RETENTION_L0) {
*pLevel = TSDB_RETENTION_L0;
tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
*pTsdb = VND_RSMA0(pVnode);
return;
} else if (level == TSDB_RETENTION_L1) {
*pLevel = TSDB_RETENTION_L1;
tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
*pTsdb = VND_RSMA1(pVnode);
return;
} else {
*pLevel = TSDB_RETENTION_L2;
tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
*pTsdb = VND_RSMA2(pVnode);
return;
}
}
*pTsdb = VND_TSDB(pVnode);
}
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
int64_t startVer = (pCond->startVersion == -1) ? 0 : pCond->startVersion;
int64_t endVer = 0;
if (pCond->endVersion == -1) {
// user not specified end version, set current maximum version of vnode as the endVersion
endVer = pVnode->state.applied;
} else {
endVer = (pCond->endVersion > pVnode->state.applied) ? pVnode->state.applied : pCond->endVersion;
}
return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}
static int32_t reverseSearchStartPos(const SArray* pDelList, int32_t index, int64_t key, bool asc) {
size_t num = taosArrayGetSize(pDelList);
int32_t start = index;
if (asc) {
if (start >= num - 1) {
start = num - 1;
}
TSDBKEY* p = taosArrayGet(pDelList, start);
if (p == NULL) {
return TSDB_CODE_INVALID_PARA;
}
while (p->ts >= key && start > 0) {
start -= 1;
}
} else {
if (index <= 0) {
start = 0;
}
TSDBKEY* p = taosArrayGet(pDelList, start);
if (p == NULL) {
return TSDB_CODE_INVALID_PARA;
}
while (p->ts <= key && start < num - 1) {
start += 1;
}
}
return start;
}
bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
SVersionRange* pVerRange, bool hasPk) {
if (pDelList == NULL || (TARRAY_SIZE(pDelList) == 0)) {
return false;
}
size_t num = taosArrayGetSize(pDelList);
bool asc = ASCENDING_TRAVERSE(order);
int32_t step = asc ? 1 : -1;
if (hasPk) { // handle the case where duplicated timestamps existed.
*index = reverseSearchStartPos(pDelList, *index, key, asc);
}
if (asc) {
if (*index >= num - 1) {
TSDBKEY* last = taosArrayGetLast(pDelList);
if (last == NULL) {
return false;
}
if (key > last->ts) {
return false;
} else if (key == last->ts) {
TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
if (prev == NULL) {
return false;
}
return (prev->version >= ver && prev->version <= pVerRange->maxVer && prev->version >= pVerRange->minVer);
}
} else {
TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);
if (pCurrent == NULL || pNext == NULL) {
return false;
}
if (key < pCurrent->ts) {
return false;
}
if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
pVerRange->maxVer >= pCurrent->version) {
return true;
}
while (pNext->ts <= key && (*index) < num - 1) {
(*index) += 1;
if ((*index) < num - 1) {
pCurrent = taosArrayGet(pDelList, *index);
pNext = taosArrayGet(pDelList, (*index) + 1);
if (pCurrent == NULL || pNext == NULL) {
return false;
}
// it is not a consecutive deletion range, ignore it
if (pCurrent->version == 0 && pNext->version > 0) {
continue;
}
if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
pVerRange->maxVer >= pCurrent->version) {
return true;
}
}
}
return false;
}
} else {
if (*index <= 0) {
TSDBKEY* pFirst = taosArrayGet(pDelList, 0);
if (pFirst == NULL) {
return false;
}
if (key < pFirst->ts) {
return false;
} else if (key == pFirst->ts) {
return pFirst->version >= ver;
} else {
tsdbError("unexpected error, key:%" PRId64 ", first:%" PRId64, key, pFirst->ts);
}
} else {
TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);
if (pCurrent == NULL || pPrev == NULL) {
return false;
}
if (key > pCurrent->ts) {
return false;
}
if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
return true;
}
while (pPrev->ts >= key && (*index) > 1) {
(*index) += step;
if ((*index) >= 1) {
pCurrent = taosArrayGet(pDelList, *index);
pPrev = taosArrayGet(pDelList, (*index) - 1);
if (pCurrent == NULL || pPrev == NULL) {
return false;
}
// it is not a consecutive deletion range, ignore it
if (pCurrent->version > 0 && pPrev->version == 0) {
continue;
}
if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
return true;
}
}
}
return false;
}
}
return false;
}
FORCE_INLINE void getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader, TSDBROW** pRes) {
*pRes = NULL;
if (!pIter->hasVal) {
return;
}
int32_t order = pReader->info.order;
TSDBROW* pRow = tsdbTbDataIterGet(pIter->iter);
TSDBKEY key;
TSDBROW_INIT_KEY(pRow, key);
if (outOfTimeWindow(key.ts, &pReader->info.window)) {
pIter->hasVal = false;
return;
}
// it is a valid data version
if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) {
if (pDelList == NULL || TARRAY_SIZE(pDelList) == 0) {
*pRes = pRow;
return;
} else {
bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange,
pReader->suppInfo.numOfPks > 0);
if (!dropped) {
*pRes = pRow;
return;
}
}
}
while (1) {
pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
if (!pIter->hasVal) {
return;
}
pRow = tsdbTbDataIterGet(pIter->iter);
TSDBROW_INIT_KEY(pRow, key);
if (outOfTimeWindow(key.ts, &pReader->info.window)) {
pIter->hasVal = false;
return;
}
if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) {
if (pDelList == NULL || TARRAY_SIZE(pDelList) == 0) {
*pRes = pRow;
return;
} else {
bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange,
pReader->suppInfo.numOfPks > 0);
if (!dropped) {
*pRes = pRow;
return;
}
}
}
}
}
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, SRowKey* pCurKey, SArray* pDelList, STsdbReader* pReader) {
SRowMerger* pMerger = &pReader->status.merger;
int32_t code = 0;
while (1) {
pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
if (!pIter->hasVal) {
break;
}
// data exists but not valid
TSDBROW* pRow = NULL;
getValidMemRow(pIter, pDelList, pReader, &pRow);
if (pRow == NULL) {
break;
}
// ts is not identical, quit
if (TSDBROW_TS(pRow) != pCurKey->ts) {
break;
}
if (pCurKey->numOfPKs > 0) {
SRowKey nextKey = {0};
tRowGetKeyEx(pRow, &nextKey);
if (pkCompEx(pCurKey, &nextKey) != 0) {
break;
}
}
STSchema* pTSchema = NULL;
if (pRow->type == TSDBROW_ROW_FMT) {
pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
if (pTSchema == NULL) {
return terrno;
}
}
code = tsdbRowMergerAdd(pMerger, pRow, pTSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
return code;
}
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t* rowIndex, SRowKey* pKey, SRowMerger* pMerger,
SVersionRange* pVerRange, int32_t step) {
int32_t code = 0;
while ((*rowIndex) < pBlockData->nRow && (*rowIndex) >= 0) {
SRowKey cur;
tColRowGetKey(pBlockData, (*rowIndex), &cur);
if (pkCompEx(&cur, pKey) != 0) {
break;
}
if (pBlockData->aVersion[(*rowIndex)] > pVerRange->maxVer ||
pBlockData->aVersion[(*rowIndex)] < pVerRange->minVer) {
(*rowIndex) += step;
continue;
}
TSDBROW fRow = tsdbRowFromBlockData(pBlockData, (*rowIndex));
code = tsdbRowMergerAdd(pMerger, &fRow, NULL);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
(*rowIndex) += step;
}
return code;
}
typedef enum {
CHECK_FILEBLOCK_CONT = 0x1,
CHECK_FILEBLOCK_QUIT = 0x2,
} CHECK_FILEBLOCK_STATE;
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo,
SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, SRowKey* pKey,
CHECK_FILEBLOCK_STATE* state) {
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SBlockData* pBlockData = &pReader->status.fileBlockData;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
SVersionRange* pVerRange = &pReader->info.verRange;
bool loadNeighbor = true;
int32_t step = ASCENDING_TRAVERSE(pReader->info.order) ? 1 : -1;
int32_t code = loadNeighborIfOverlap(pFBlock, pScanInfo, pReader, &loadNeighbor);
*state = CHECK_FILEBLOCK_QUIT;
if (loadNeighbor && (code == TSDB_CODE_SUCCESS)) {
code = doMergeRowsInFileBlockImpl(pBlockData, &pDumpInfo->rowIndex, pKey, pMerger, pVerRange, step);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if ((pDumpInfo->rowIndex >= pDumpInfo->totalRows && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
*state = CHECK_FILEBLOCK_CONT;
}
}
return code;
}
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, SRowKey* pKey,
STsdbReader* pReader) {
SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
SRowMerger* pMerger = &pReader->status.merger;
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int32_t step = asc ? 1 : -1;
SVersionRange* pRange = &pReader->info.verRange;
int32_t code = 0;
pDumpInfo->rowIndex += step;
if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
code = doMergeRowsInFileBlockImpl(pBlockData, &pDumpInfo->rowIndex, pKey, pMerger, pRange, step);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
// all rows are consumed, let's try next file block
if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
while (1) {
CHECK_FILEBLOCK_STATE st;
SFileDataBlockInfo* pFileBlockInfo = NULL;
code = getCurrentBlockInfo(&pReader->status.blockIter, &pFileBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pFileBlockInfo == NULL) {
st = CHECK_FILEBLOCK_QUIT;
break;
}
code = checkForNeighborFileBlock(pReader, pScanInfo, pFileBlockInfo, pMerger, pKey, &st);
if (st == CHECK_FILEBLOCK_QUIT || code != TSDB_CODE_SUCCESS) {
break;
}
}
}
return code;
}
int32_t doMergeRowsInSttBlock(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo, SRowMerger* pMerger,
int32_t pkSrcSlot, SVersionRange* pVerRange, const char* idStr) {
SRowKey* pRowKey = &pScanInfo->lastProcKey;
int32_t code = TSDB_CODE_SUCCESS;
while (1) {
code = nextRowFromSttBlocks(pSttBlockReader, pScanInfo, pkSrcSlot, pVerRange);
if (code || (!hasDataInSttBlock(pScanInfo))) {
return code;
}
SRowKey* pNextKey = getCurrentKeyInSttBlock(pSttBlockReader);
int32_t ret = pkCompEx(pRowKey, pNextKey);
if (ret == 0) {
TSDBROW* pRow1 = tMergeTreeGetRow(&pSttBlockReader->mergeTree);
code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
if (code != TSDB_CODE_SUCCESS) {
break;
}
} else {
tsdbTrace("uid:%" PRIu64 " last del index:%d, del range:%d, lastKeyInStt:%" PRId64 ", %s", pScanInfo->uid,
pScanInfo->sttBlockDelIndex, (int32_t)taosArrayGetSize(pScanInfo->delSkyline),
pScanInfo->sttKeyInfo.nextProcKey.ts, idStr);
break;
}
}
return code;
}
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, SRowKey* pKey, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
TSDBROW* pResRow, STsdbReader* pReader, bool* freeTSRow) {
SRowMerger* pMerger = &pReader->status.merger;
TSDBROW* pNextRow = NULL;
TSDBROW current = *pRow;
{ // if the timestamp of the next valid row has a different ts, return current row directly
pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
if (!pIter->hasVal) {
*pResRow = *pRow;
*freeTSRow = false;
return TSDB_CODE_SUCCESS;
} else { // has next point in mem/imem
getValidMemRow(pIter, pDelList, pReader, &pNextRow);
if (pNextRow == NULL) {
*pResRow = current;
*freeTSRow = false;
return TSDB_CODE_SUCCESS;
}
if (TSDBROW_TS(&current) != TSDBROW_TS(pNextRow)) {
*pResRow = current;
*freeTSRow = false;
return TSDB_CODE_SUCCESS;
}
if (pKey->numOfPKs > 0) {
SRowKey nextRowKey = {0};
tRowGetKeyEx(pNextRow, &nextRowKey);
if (pkCompEx(pKey, &nextRowKey) != 0) {
*pResRow = current;
*freeTSRow = false;
return TSDB_CODE_SUCCESS;
}
}
}
}
terrno = 0;
int32_t code = 0;
// start to merge duplicated rows
STSchema* pTSchema = NULL;
if (current.type == TSDBROW_ROW_FMT) { // get the correct schema for row-wise data in memory
pTSchema = doGetSchemaForTSRow(current.pTSRow->sver, pReader, uid);
if (pTSchema == NULL) {
return terrno;
}
}
code = tsdbRowMergerAdd(pMerger, &current, pTSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
STSchema* pTSchema1 = NULL;
if (pNextRow->type == TSDBROW_ROW_FMT) { // get the correct schema for row-wise data in memory
pTSchema1 = doGetSchemaForTSRow(pNextRow->pTSRow->sver, pReader, uid);
if (pTSchema1 == NULL) {
return terrno;
}
}
code = tsdbRowMergerAdd(pMerger, pNextRow, pTSchema1);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(pIter, uid, pKey, pDelList, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerGetRow(pMerger, &pResRow->pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
pResRow->type = TSDBROW_ROW_FMT;
tsdbRowMergerClear(pMerger);
*freeTSRow = true;
return TSDB_CODE_SUCCESS;
}
int32_t doMergeMemIMemRows(TSDBROW* pRow, SRowKey* pRowKey, TSDBROW* piRow, SRowKey* piRowKey,
STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, SRow** pTSRow) {
SRowMerger* pMerger = &pReader->status.merger;
int32_t code = TSDB_CODE_SUCCESS;
STSchema* pSchema = NULL;
if (pRow->type == TSDBROW_ROW_FMT) {
pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
if (pSchema == NULL) {
return terrno;
}
}
STSchema* piSchema = NULL;
if (piRow->type == TSDBROW_ROW_FMT) {
piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
if (piSchema == NULL) {
return terrno;
}
}
if (ASCENDING_TRAVERSE(pReader->info.order)) { // ascending order imem --> mem
code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, piRowKey, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, pRowKey, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
} else {
code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, pRowKey, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, piRowKey, pBlockScanInfo->delSkyline, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
tRowKeyAssign(&pBlockScanInfo->lastProcKey, pRowKey);
code = tsdbRowMergerGetRow(pMerger, pTSRow);
tsdbRowMergerClear(pMerger);
return code;
}
static int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, TSDBROW* pResRow,
int64_t endKey, bool* freeTSRow) {
TSDBROW* pRow = NULL;
TSDBROW* piRow = NULL;
getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader, &pRow);
getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader, &piRow);
SArray* pDelList = pBlockScanInfo->delSkyline;
uint64_t uid = pBlockScanInfo->uid;
SIterInfo* piter = &pBlockScanInfo->iter;
SIterInfo* piiter = &pBlockScanInfo->iiter;
SRowKey rowKey = {0}, irowKey = {0};
// todo refactor
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
if (piter->hasVal) {
tRowGetKeyEx(pRow, &rowKey);
if ((rowKey.ts >= endKey && asc) || (rowKey.ts <= endKey && !asc)) {
pRow = NULL;
}
}
if (piiter->hasVal) {
tRowGetKeyEx(piRow, &irowKey);
if ((irowKey.ts >= endKey && asc) || (irowKey.ts <= endKey && !asc)) {
piRow = NULL;
}
}
if (pRow != NULL && piRow != NULL) {
int32_t code = TSDB_CODE_SUCCESS;
if (rowKey.numOfPKs == 0) {
if ((rowKey.ts > irowKey.ts && asc) || (rowKey.ts < irowKey.ts && (!asc))) { // ik.ts < k.ts
code = doMergeMemTableMultiRows(piRow, &irowKey, uid, piiter, pDelList, pResRow, pReader, freeTSRow);
} else if ((rowKey.ts < irowKey.ts && asc) || (rowKey.ts > irowKey.ts && (!asc))) {
code = doMergeMemTableMultiRows(pRow, &rowKey, uid, piter, pDelList, pResRow, pReader, freeTSRow);
} else { // ik.ts == k.ts
*freeTSRow = true;
pResRow->type = TSDBROW_ROW_FMT;
code = doMergeMemIMemRows(pRow, &rowKey, piRow, &irowKey, pBlockScanInfo, pReader, &pResRow->pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
} else {
int32_t ret = pkCompEx(&rowKey, &irowKey);
if (ret != 0) {
if ((ret > 0 && asc) || (ret < 0 && (!asc))) { // ik.ts < k.ts
code = doMergeMemTableMultiRows(piRow, &irowKey, uid, piiter, pDelList, pResRow, pReader, freeTSRow);
} else if ((ret < 0 && asc) || (ret > 0 && (!asc))) {
code = doMergeMemTableMultiRows(pRow, &rowKey, uid, piter, pDelList, pResRow, pReader, freeTSRow);
}
} else { // ik.ts == k.ts
*freeTSRow = true;
pResRow->type = TSDBROW_ROW_FMT;
code = doMergeMemIMemRows(pRow, &rowKey, piRow, &irowKey, pBlockScanInfo, pReader, &pResRow->pTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
}
return code;
}
if (piter->hasVal && pRow != NULL) {
return doMergeMemTableMultiRows(pRow, &rowKey, uid, piter, pDelList, pResRow, pReader, freeTSRow);
}
if (piiter->hasVal && piRow != NULL) {
return doMergeMemTableMultiRows(piRow, &irowKey, uid, piiter, pDelList, pResRow, pReader, freeTSRow);
}
return TSDB_CODE_SUCCESS;
}
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow, STableBlockScanInfo* pScanInfo) {
int32_t outputRowIndex = pBlock->info.rows;
int64_t uid = pScanInfo->uid;
int32_t code = TSDB_CODE_SUCCESS;
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
STSchema* pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
if (pSchema == NULL) {
return terrno;
}
SColVal colVal = {0};
int32_t i = 0, j = 0;
if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
SColumnInfoData* pColData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
((int64_t*)pColData->pData)[outputRowIndex] = pTSRow->ts;
i += 1;
}
while (i < pSupInfo->numOfCols && j < pSchema->numOfCols) {
col_id_t colId = pSupInfo->colId[i];
if (colId == pSchema->columns[j].colId) {
SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColInfoData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
code = tRowGet(pTSRow, pSchema, j, &colVal);
if (code) {
return code;
}
code = doCopyColVal(pColInfoData, outputRowIndex, i, &colVal, pSupInfo);
if (code) {
return code;
}
i += 1;
j += 1;
} else if (colId < pSchema->columns[j].colId) {
SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColInfoData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
colDataSetNULL(pColInfoData, outputRowIndex);
i += 1;
} else if (colId > pSchema->columns[j].colId) {
j += 1;
}
}
// set null value since current column does not exist in the "pSchema"
while (i < pSupInfo->numOfCols) {
SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
if (pColInfoData == NULL) {
return TSDB_CODE_INVALID_PARA;
}
colDataSetNULL(pColInfoData, outputRowIndex);
i += 1;
}
pBlock->info.dataLoad = 1;
pBlock->info.rows += 1;
return TSDB_CODE_SUCCESS;
}
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
int32_t rowIndex) {
int32_t i = 0, j = 0;
int32_t outputRowIndex = pResBlock->info.rows;
int32_t code = TSDB_CODE_SUCCESS;
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
((int64_t*)pReader->status.pPrimaryTsCol->pData)[outputRowIndex] = pBlockData->aTSKEY[rowIndex];
i += 1;
SColVal cv = {0};
int32_t numOfInputCols = pBlockData->nColData;
int32_t numOfOutputCols = pSupInfo->numOfCols;
while (i < numOfOutputCols && j < numOfInputCols) {
SColData* pData = tBlockDataGetColDataByIdx(pBlockData, j);
if (pData->cid < pSupInfo->colId[i]) {
j += 1;
continue;
}
SColumnInfoData* pCol = TARRAY_GET_ELEM(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pData->cid == pSupInfo->colId[i]) {
tColDataGetValue(pData, rowIndex, &cv);
code = doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
if (code) {
return code;
}
j += 1;
} else if (pData->cid > pCol->info.colId) {
// the specified column does not exist in file block, fill with null data
colDataSetNULL(pCol, outputRowIndex);
}
i += 1;
}
while (i < numOfOutputCols) {
SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
if (pCol == NULL) {
return TSDB_CODE_INVALID_PARA;
}
colDataSetNULL(pCol, outputRowIndex);
i += 1;
}
pResBlock->info.dataLoad = 1;
pResBlock->info.rows += 1;
return TSDB_CODE_SUCCESS;
}
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
STsdbReader* pReader) {
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
int32_t code = TSDB_CODE_SUCCESS;
do {
TSDBROW row = {.type = -1};
bool freeTSRow = false;
code = tsdbGetNextRowInMem(pBlockScanInfo, pReader, &row, endKey, &freeTSRow);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (row.type == -1) {
break;
}
if (row.type == TSDBROW_ROW_FMT) {
code = doAppendRowFromTSRow(pBlock, pReader, row.pTSRow, pBlockScanInfo);
if (code == TSDB_CODE_SUCCESS) {
pBlockScanInfo->lastProcKey.ts = row.pTSRow->ts;
pBlockScanInfo->lastProcKey.numOfPKs = row.pTSRow->numOfPKs;
if (row.pTSRow->numOfPKs > 0) {
tRowGetPrimaryKeyDeepCopy(row.pTSRow, &pBlockScanInfo->lastProcKey);
}
}
if (freeTSRow) {
taosMemoryFree(row.pTSRow);
}
if (code) {
return code;
}
} else {
code = doAppendRowFromFileBlock(pBlock, pReader, row.pBlockData, row.iRow);
if (code) {
return code;
}
tColRowGetKeyDeepCopy(row.pBlockData, row.iRow, pReader->suppInfo.pkSrcSlot, &pBlockScanInfo->lastProcKey);
}
// no data in buffer, return immediately
if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
break;
}
if (pBlock->info.rows >= capacity) {
break;
}
} while (1);
return code;
}
// TODO refactor: with createDataBlockScanInfo
int32_t tsdbSetTableList2(STsdbReader* pReader, const void* pTableList, int32_t num) {
int32_t code = TSDB_CODE_SUCCESS;
int32_t size = tSimpleHashGetSize(pReader->status.pTableMap);
STableBlockScanInfo** p = NULL;
int32_t iter = 0;
code = tsdbAcquireReader(pReader);
if (code) {
return code;
}
while ((p = tSimpleHashIterate(pReader->status.pTableMap, p, &iter)) != NULL) {
clearBlockScanInfo(*p);
}
tSimpleHashClear(pReader->status.pTableMap);
if (size < num) {
code = ensureBlockScanInfoBuf(&pReader->blockInfoBuf, num);
if (code) {
(void) tsdbReleaseReader(pReader);
return code;
}
char* p1 = taosMemoryRealloc(pReader->status.uidList.tableUidList, sizeof(uint64_t) * num);
if (p1 == NULL) {
(void) tsdbReleaseReader(pReader);
return terrno;
}
pReader->status.uidList.tableUidList = (uint64_t*)p1;
}
STableUidList* pUidList = &pReader->status.uidList;
pUidList->currentIndex = 0;
STableKeyInfo* pList = (STableKeyInfo*)pTableList;
for (int32_t i = 0; i < num; ++i) {
pUidList->tableUidList[i] = pList[i].uid;
STableBlockScanInfo* pInfo = NULL;
code = getPosInBlockInfoBuf(&pReader->blockInfoBuf, i, &pInfo);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
code = initTableBlockScanInfo(pInfo, pList[i].uid, pReader->status.pTableMap, pReader);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
}
(void) tsdbReleaseReader(pReader);
return code;
}
uint64_t tsdbGetReaderMaxVersion2(STsdbReader* pReader) { return pReader->info.verRange.maxVer; }
static int32_t doOpenReaderImpl(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
SDataBlockIter* pBlockIter = &pStatus->blockIter;
int32_t code = TSDB_CODE_SUCCESS;
if (pReader->bFilesetDelimited) {
getMemTableTimeRange(pReader, &pReader->status.memTableMaxKey, &pReader->status.memTableMinKey);
pReader->status.bProcMemFirstFileset = true;
}
code = initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->pfSetArray, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = resetDataBlockIterator(&pStatus->blockIter, pReader->info.order, shouldFreePkBuf(&pReader->suppInfo),
pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pStatus->fileIter.numOfFiles == 0) {
pStatus->loadFromFile = false;
} else {
code = initForFirstBlockInFile(pReader, pBlockIter);
}
if (!pStatus->loadFromFile) {
code = resetTableListIndex(pStatus, pReader->idStr);
}
return code;
}
static void freeSchemaFunc(void* param) {
void** p = (void**)param;
taosMemoryFreeClear(*p);
}
static void clearSharedPtr(STsdbReader* p) {
if (p) {
p->status.pTableMap = NULL;
p->status.uidList.tableUidList = NULL;
p->info.pSchema = NULL;
p->pReadSnap = NULL;
p->pSchemaMap = NULL;
}
}
static int32_t setSharedPtr(STsdbReader* pDst, const STsdbReader* pSrc) {
pDst->status.pTableMap = pSrc->status.pTableMap;
pDst->status.uidList = pSrc->status.uidList;
pDst->info.pSchema = pSrc->info.pSchema;
pDst->pSchemaMap = pSrc->pSchemaMap;
pDst->pReadSnap = pSrc->pReadSnap;
pDst->pReadSnap->pfSetArray = pSrc->pReadSnap->pfSetArray;
if (pDst->info.pSchema) {
return tsdbRowMergerInit(&pDst->status.merger, pDst->info.pSchema);
}
return TSDB_CODE_SUCCESS;
}
// ====================================== EXPOSED APIs ======================================
int32_t tsdbReaderOpen2(void* pVnode, SQueryTableDataCond* pCond, void* pTableList, int32_t numOfTables,
SSDataBlock* pResBlock, void** ppReader, const char* idstr, SHashObj** pIgnoreTables) {
STimeWindow window = pCond->twindows;
SVnodeCfg* pConf = &(((SVnode*)pVnode)->config);
int32_t code = 0;
int32_t lino = 0;
int32_t capacity = pConf->tsdbCfg.maxRows;
if (pResBlock != NULL) {
code = blockDataEnsureCapacity(pResBlock, capacity);
TSDB_CHECK_CODE(code, lino, _err);
}
code = tsdbReaderCreate(pVnode, pCond, ppReader, capacity, pResBlock, idstr);
TSDB_CHECK_CODE(code, lino, _err);
// check for query time window
STsdbReader* pReader = *ppReader;
if (isEmptyQueryTimeWindow(&pReader->info.window) && pCond->type == TIMEWINDOW_RANGE_CONTAINED) {
tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
return TSDB_CODE_SUCCESS;
}
if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
// update the SQueryTableDataCond to create inner reader
int32_t order = pCond->order;
if (order == TSDB_ORDER_ASC) {
pCond->twindows.ekey = window.skey - 1;
pCond->twindows.skey = INT64_MIN;
pCond->order = TSDB_ORDER_DESC;
} else {
pCond->twindows.skey = window.ekey + 1;
pCond->twindows.ekey = INT64_MAX;
pCond->order = TSDB_ORDER_ASC;
}
// here we only need one more row, so the capacity is set to be ONE.
code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[0], 1, pResBlock, idstr);
TSDB_CHECK_CODE(code, lino, _err);
if (order == TSDB_ORDER_ASC) {
pCond->twindows.skey = window.ekey + 1;
pCond->twindows.ekey = INT64_MAX;
} else {
pCond->twindows.skey = INT64_MIN;
pCond->twindows.ekey = window.ekey - 1;
}
pCond->order = order;
code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[1], 1, pResBlock, idstr);
TSDB_CHECK_CODE(code, lino, _err);
}
// NOTE: the endVersion in pCond is the data version not schema version, so pCond->endVersion is not correct here.
// no valid error code set in metaGetTbTSchema, so let's set the error code here.
// we should proceed in case of tmq processing.
if (pCond->suid != 0) {
code = metaGetTbTSchemaMaybeNull(pReader->pTsdb->pVnode->pMeta, pReader->info.suid, -1, 1, &pReader->info.pSchema);
TSDB_CHECK_CODE(code, lino, _err);
if (pReader->info.pSchema == NULL) {
tsdbWarn("failed to get table schema, suid:%" PRIu64 ", ver:-1, %s", pReader->info.suid, pReader->idStr);
}
} else if (numOfTables > 0) {
STableKeyInfo* pKey = pTableList;
code = metaGetTbTSchemaMaybeNull(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1, 1, &pReader->info.pSchema);
TSDB_CHECK_CODE(code, lino, _err);
if (pReader->info.pSchema == NULL) {
tsdbWarn("failed to get table schema, uid:%" PRIu64 ", ver:-1, %s", pKey->uid, pReader->idStr);
}
}
if (pReader->info.pSchema != NULL) {
code = tsdbRowMergerInit(&pReader->status.merger, pReader->info.pSchema);
TSDB_CHECK_CODE(code, lino, _err);
}
pReader->pSchemaMap = tSimpleHashInit(8, taosFastHash);
if (pReader->pSchemaMap == NULL) {
tsdbError("failed init schema hash for reader %s", pReader->idStr);
TSDB_CHECK_NULL(pReader->pSchemaMap, code, lino, _err, terrno);
}
tSimpleHashSetFreeFp(pReader->pSchemaMap, freeSchemaFunc);
if (pReader->info.pSchema != NULL) {
code = updateBlockSMAInfo(pReader->info.pSchema, &pReader->suppInfo);
TSDB_CHECK_CODE(code, lino, _err);
}
STsdbReader* p = (pReader->innerReader[0] != NULL) ? pReader->innerReader[0] : pReader;
code = createDataBlockScanInfo(p, &pReader->blockInfoBuf, pTableList, &pReader->status.uidList, numOfTables, &pReader->status.pTableMap);
TSDB_CHECK_CODE(code, lino, _err);
pReader->status.pLDataIterArray = taosArrayInit(4, POINTER_BYTES);
TSDB_CHECK_NULL(pReader->status.pLDataIterArray, code, lino, _err, terrno);
pReader->flag = READER_STATUS_SUSPEND;
pReader->info.execMode = pCond->notLoadData ? READER_EXEC_ROWS : READER_EXEC_DATA;
pReader->pIgnoreTables = pIgnoreTables;
tsdbDebug("%p total numOfTable:%d, window:%" PRId64 " - %" PRId64 ", verRange:%" PRId64 " - %" PRId64
" in this query %s",
pReader, numOfTables, pReader->info.window.skey, pReader->info.window.ekey, pReader->info.verRange.minVer,
pReader->info.verRange.maxVer, pReader->idStr);
return code;
_err:
tsdbError("failed to create data reader, error at:%d code:%s %s", lino, tstrerror(code), idstr);
tsdbReaderClose2(*ppReader);
*ppReader = NULL; // reset the pointer value.
return code;
}
void tsdbReaderClose2(STsdbReader* pReader) {
if (pReader == NULL) {
return;
}
int32_t code = tsdbAcquireReader(pReader);
if (code) {
return;
}
{
if (pReader->innerReader[0] != NULL || pReader->innerReader[1] != NULL) {
STsdbReader* p = pReader->innerReader[0];
clearSharedPtr(p);
p = pReader->innerReader[1];
clearSharedPtr(p);
tsdbReaderClose2(pReader->innerReader[0]);
tsdbReaderClose2(pReader->innerReader[1]);
}
}
SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
TARRAY2_DESTROY(&pSupInfo->colAggArray, NULL);
if (pSupInfo->buildBuf) {
for (int32_t i = 0; i < pSupInfo->numOfCols; ++i) {
if (pSupInfo->buildBuf[i] != NULL) {
taosMemoryFreeClear(pSupInfo->buildBuf[i]);
}
}
}
if (pReader->resBlockInfo.freeBlock) {
blockDataDestroy(pReader->resBlockInfo.pResBlock);
pReader->resBlockInfo.pResBlock = NULL;
}
taosMemoryFree(pSupInfo->colId);
tBlockDataDestroy(&pReader->status.fileBlockData);
cleanupDataBlockIterator(&pReader->status.blockIter, shouldFreePkBuf(&pReader->suppInfo));
size_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
if (pReader->status.pTableMap != NULL) {
destroyAllBlockScanInfo(pReader->status.pTableMap);
pReader->status.pTableMap = NULL;
}
clearBlockScanInfoBuf(&pReader->blockInfoBuf);
if (pReader->pFileReader != NULL) {
tsdbDataFileReaderClose(&pReader->pFileReader);
}
SReadCostSummary* pCost = &pReader->cost;
SFilesetIter* pFilesetIter = &pReader->status.fileIter;
if (pFilesetIter->pSttBlockReader != NULL) {
SSttBlockReader* pSttBlockReader = pFilesetIter->pSttBlockReader;
tMergeTreeClose(&pSttBlockReader->mergeTree);
clearRowKey(&pSttBlockReader->currentKey);
taosMemoryFree(pSttBlockReader);
}
destroySttBlockReader(pReader->status.pLDataIterArray, &pCost->sttCost);
taosMemoryFreeClear(pReader->status.uidList.tableUidList);
tsdbTrace("tsdb/reader-close: %p, untake snapshot", pReader);
void* p = pReader->pReadSnap;
if ((p == atomic_val_compare_exchange_ptr((void**)&pReader->pReadSnap, p, NULL)) && (p != NULL)) {
tsdbUntakeReadSnap2(pReader, p, true);
pReader->pReadSnap = NULL;
}
(void) tsem_destroy(&pReader->resumeAfterSuspend);
(void) tsdbReleaseReader(pReader);
(void) tsdbUninitReaderLock(pReader);
tsdbDebug(
"%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
" SMA-time:%.2f ms, fileBlocks:%" PRId64
", fileBlocks-load-time:%.2f ms, "
"build in-memory-block-time:%.2f ms, sttBlocks:%" PRId64 ", sttBlocks-time:%.2f ms, sttStatisBlock:%" PRId64
", stt-statis-Block-time:%.2f ms, composed-blocks:%" PRId64
", composed-blocks-time:%.2fms, STableBlockScanInfo size:%.2f Kb, createTime:%.2f ms,createSkylineIterTime:%.2f "
"ms, initSttBlockReader:%.2fms, %s",
pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime, pCost->numOfBlocks,
pCost->blockLoadTime, pCost->buildmemBlock, pCost->sttCost.loadBlocks, pCost->sttCost.blockElapsedTime,
pCost->sttCost.loadStatisBlocks, pCost->sttCost.statisElapsedTime, pCost->composedBlocks,
pCost->buildComposedBlockTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pCost->createScanInfoList,
pCost->createSkylineIterTime, pCost->initSttBlockReader, pReader->idStr);
taosMemoryFree(pReader->idStr);
tsdbRowMergerCleanup(&pReader->status.merger);
taosMemoryFree(pReader->info.pSchema);
tSimpleHashCleanup(pReader->pSchemaMap);
taosMemoryFreeClear(pReader);
}
static int32_t doSuspendCurrentReader(STsdbReader* pCurrentReader) {
SReaderStatus* pStatus = &pCurrentReader->status;
if (pStatus->loadFromFile) {
tsdbDataFileReaderClose(&pCurrentReader->pFileReader);
SReadCostSummary* pCost = &pCurrentReader->cost;
destroySttBlockReader(pStatus->pLDataIterArray, &pCost->sttCost);
pStatus->pLDataIterArray = taosArrayInit(4, POINTER_BYTES);
if (pStatus->pLDataIterArray == NULL) {
return terrno;
}
}
// resetDataBlockScanInfo excluding lastKey
STableBlockScanInfo** p = NULL;
int32_t step = ASCENDING_TRAVERSE(pCurrentReader->info.order) ? 1 : -1;
int32_t iter = 0;
while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;
clearBlockScanInfo(pInfo);
// pInfo->sttKeyInfo.nextProcKey = pInfo->lastProcKey.ts + step;
// pInfo->sttKeyInfo.nextProcKey = pInfo->lastProcKey + step;
}
pStatus->uidList.currentIndex = 0;
initReaderStatus(pStatus);
return TSDB_CODE_SUCCESS;
}
int32_t tsdbReaderSuspend2(STsdbReader* pReader) {
// save reader's base state & reset top state to be reconstructed from base state
int32_t code = 0;
pReader->status.suspendInvoked = true; // record the suspend status
if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
if (pReader->step == EXTERNAL_ROWS_PREV) {
code = doSuspendCurrentReader(pReader->innerReader[0]);
} else if (pReader->step == EXTERNAL_ROWS_MAIN) {
code = doSuspendCurrentReader(pReader);
} else {
code = doSuspendCurrentReader(pReader->innerReader[1]);
}
} else {
code = doSuspendCurrentReader(pReader);
}
// make sure only release once
void* p = pReader->pReadSnap;
if ((p == atomic_val_compare_exchange_ptr((void**)&pReader->pReadSnap, p, NULL)) && (p != NULL)) {
tsdbUntakeReadSnap2(pReader, p, false);
pReader->pReadSnap = NULL;
}
if (pReader->bFilesetDelimited) {
pReader->status.memTableMinKey = INT64_MAX;
pReader->status.memTableMaxKey = INT64_MIN;
}
pReader->flag = READER_STATUS_SUSPEND;
if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
clearSharedPtr(pReader->innerReader[0]);
clearSharedPtr(pReader->innerReader[1]);
}
#if SUSPEND_RESUME_TEST
tsem_post(&pReader->resumeAfterSuspend);
#endif
tsdbDebug("reader: %p suspended in this query %s, step:%d", pReader, pReader->idStr, pReader->step);
return code;
}
static int32_t tsdbSetQueryReseek(void* pQHandle) {
int32_t code = 0;
STsdbReader* pReader = pQHandle;
code = tsdbTryAcquireReader(pReader);
if (code == 0) {
if (pReader->flag == READER_STATUS_SUSPEND) {
code = tsdbReleaseReader(pReader);
return code;
}
code = tsdbReaderSuspend2(pReader);
(void) tsdbReleaseReader(pReader);
return code;
} else if (code == EBUSY) {
return TSDB_CODE_VND_QUERY_BUSY;
} else {
terrno = TAOS_SYSTEM_ERROR(code);
return TSDB_CODE_FAILED;
}
}
int32_t tsdbReaderResume2(STsdbReader* pReader) {
int32_t code = 0;
STableBlockScanInfo** pBlockScanInfo = pReader->status.pTableIter;
// restore reader's state, task snapshot
int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
if (numOfTables > 0) {
tsdbTrace("tsdb/reader: %p, take snapshot", pReader);
code = tsdbTakeReadSnap2(pReader, tsdbSetQueryReseek, &pReader->pReadSnap, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
goto _err;
}
// open reader failure may cause the flag still to be READER_STATUS_SUSPEND, which may cause suspend reader failure.
// So we need to set it A.S.A.P
pReader->flag = READER_STATUS_NORMAL;
if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
code = doOpenReaderImpl(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
} else {
STsdbReader* pPrevReader = pReader->innerReader[0];
STsdbReader* pNextReader = pReader->innerReader[1];
// we need only one row
pPrevReader->resBlockInfo.capacity = 1;
code = setSharedPtr(pPrevReader, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
pNextReader->resBlockInfo.capacity = 1;
code = setSharedPtr(pNextReader, pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->step == 0 || pReader->step == EXTERNAL_ROWS_PREV) {
code = doOpenReaderImpl(pPrevReader);
} else if (pReader->step == EXTERNAL_ROWS_MAIN) {
code = doOpenReaderImpl(pReader);
} else {
code = doOpenReaderImpl(pNextReader);
}
if (code != TSDB_CODE_SUCCESS) {
return code;
}
}
}
tsdbDebug("reader: %p resumed uid %" PRIu64 ", numOfTable:%" PRId32 ", in this query %s", pReader,
pBlockScanInfo ? (*pBlockScanInfo)->uid : 0, numOfTables, pReader->idStr);
return code;
_err:
tsdbError("failed to resume data reader, code:%s %s", tstrerror(code), pReader->idStr);
return code;
}
static int32_t buildFromPreFilesetBuffer(STsdbReader* pReader) {
int32_t code = TSDB_CODE_SUCCESS;
SReaderStatus* pStatus = &pReader->status;
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
int32_t fid = pReader->status.pCurrentFileset->fid;
STimeWindow win = {0};
tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);
int64_t endKey = (ASCENDING_TRAVERSE(pReader->info.order)) ? win.skey : win.ekey;
code = buildBlockFromBufferSeqForPreFileset(pReader, endKey);
if (code != TSDB_CODE_SUCCESS || pBlock->info.rows > 0) {
return code;
} else {
tsdbDebug("finished pre-fileset %d buffer processing. %s", fid, pReader->idStr);
pStatus->bProcMemPreFileset = false;
pStatus->processingMemPreFileSet = false;
if (pReader->notifyFn) {
STsdReaderNotifyInfo info = {0};
info.duration.filesetId = fid;
pReader->notifyFn(TSD_READER_NOTIFY_DURATION_START, &info, pReader->notifyParam);
tsdbDebug("new duration %d start notification when buffer pre-fileset, %s", fid, pReader->idStr);
}
}
return code;
}
static int32_t doTsdbNextDataBlockFilesetDelimited(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
int32_t code = TSDB_CODE_SUCCESS;
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
if (pStatus->loadFromFile) {
if (pStatus->bProcMemPreFileset) {
code = buildFromPreFilesetBuffer(pReader);
if (code != TSDB_CODE_SUCCESS || pBlock->info.rows > 0) {
return code;
}
}
code = buildBlockFromFiles(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
tsdbTrace("block from file rows: %" PRId64 ", will process pre-file set buffer: %d. %s", pBlock->info.rows,
pStatus->bProcMemFirstFileset, pReader->idStr);
if (pStatus->bProcMemPreFileset) {
if (pBlock->info.rows > 0) {
if (pReader->notifyFn && !pReader->status.processingMemPreFileSet) {
int32_t fid = pReader->status.pCurrentFileset->fid;
STsdReaderNotifyInfo info = {0};
info.duration.filesetId = fid;
pReader->notifyFn(TSD_READER_NOTIFY_NEXT_DURATION_BLOCK, &info, pReader->notifyParam);
}
} else {
pStatus->bProcMemPreFileset = false;
}
}
if (pBlock->info.rows <= 0) {
code = resetTableListIndex(&pReader->status, pReader->idStr);
if (code) {
return code;
}
int64_t endKey = (ASCENDING_TRAVERSE(pReader->info.order)) ? INT64_MAX : INT64_MIN;
code = buildBlockFromBufferSequentially(pReader, endKey);
}
} else { // no data in files, let's try the buffer
int64_t endKey = (ASCENDING_TRAVERSE(pReader->info.order)) ? INT64_MAX : INT64_MIN;
code = buildBlockFromBufferSequentially(pReader, endKey);
}
return code;
}
static int32_t doTsdbNextDataBlockFilesFirst(STsdbReader* pReader) {
SReaderStatus* pStatus = &pReader->status;
int32_t code = TSDB_CODE_SUCCESS;
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
if (pStatus->loadFromFile) {
code = buildBlockFromFiles(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pBlock->info.rows <= 0) {
code = resetTableListIndex(&pReader->status, pReader->idStr);
if (code) {
return code;
}
int64_t endKey = (ASCENDING_TRAVERSE(pReader->info.order)) ? INT64_MAX : INT64_MIN;
code = buildBlockFromBufferSequentially(pReader, endKey);
}
} else { // no data in files, let's try the buffer
int64_t endKey = (ASCENDING_TRAVERSE(pReader->info.order)) ? INT64_MAX : INT64_MIN;
code = buildBlockFromBufferSequentially(pReader, endKey);
}
return code;
}
static int32_t doTsdbNextDataBlock2(STsdbReader* pReader, bool* hasNext) {
int32_t code = TSDB_CODE_SUCCESS;
// cleanup the data that belongs to the previous data block
SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
blockDataCleanup(pBlock);
*hasNext = false;
SReaderStatus* pStatus = &pReader->status;
if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
return code;
}
if (!pReader->bFilesetDelimited) {
code = doTsdbNextDataBlockFilesFirst(pReader);
} else {
code = doTsdbNextDataBlockFilesetDelimited(pReader);
}
*hasNext = pBlock->info.rows > 0;
return code;
}
int32_t tsdbNextDataBlock2(STsdbReader* pReader, bool* hasNext) {
int32_t code = TSDB_CODE_SUCCESS;
*hasNext = false;
if (isEmptyQueryTimeWindow(&pReader->info.window) || pReader->step == EXTERNAL_ROWS_NEXT ||
pReader->code != TSDB_CODE_SUCCESS) {
return (pReader->code != TSDB_CODE_SUCCESS) ? pReader->code : code;
}
SReaderStatus* pStatus = &pReader->status;
// NOTE: the following codes is used to perform test for suspend/resume for tsdbReader when it blocks the commit
// the data should be ingested in round-robin and all the child tables should be createted before ingesting data
// the version range of query will be used to identify the correctness of suspend/resume functions.
// this function will blocked before loading the SECOND block from vnode-buffer, and restart itself from sst-files
#if SUSPEND_RESUME_TEST
if (!pReader->status.suspendInvoked && !pReader->status.loadFromFile) {
tsem_wait(&pReader->resumeAfterSuspend);
}
#endif
code = tsdbAcquireReader(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
tsdbTrace("tsdb/read: %p, take read mutex, code: %d", pReader, code);
if (pReader->flag == READER_STATUS_SUSPEND) {
code = tsdbReaderResume2(pReader);
if (code != TSDB_CODE_SUCCESS) {
// release reader failure should be suppressed here, to avoid over-write the original error code
(void) tsdbReleaseReader(pReader);
return code;
}
}
if (pReader->innerReader[0] != NULL && pReader->step == 0) {
code = doTsdbNextDataBlock2(pReader->innerReader[0], hasNext);
if (code) {
(void) tsdbReleaseReader(pReader);
return code;
}
pReader->step = EXTERNAL_ROWS_PREV;
if (*hasNext) {
pStatus = &pReader->innerReader[0]->status;
if (pStatus->composedDataBlock) {
tsdbTrace("tsdb/read: %p, unlock read mutex", pReader);
code = tsdbReleaseReader(pReader);
}
return code;
}
}
if (pReader->step == EXTERNAL_ROWS_PREV) {
// prepare for the main scan
if (tSimpleHashGetSize(pReader->status.pTableMap) > 0) {
code = doOpenReaderImpl(pReader);
}
int32_t step = 1;
resetAllDataBlockScanInfo(pReader->status.pTableMap, pReader->innerReader[0]->info.window.ekey, step);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
pReader->step = EXTERNAL_ROWS_MAIN;
}
code = doTsdbNextDataBlock2(pReader, hasNext);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
if (*hasNext) {
if (pStatus->composedDataBlock) {
tsdbTrace("tsdb/read: %p, unlock read mutex", pReader);
code = tsdbReleaseReader(pReader);
}
return code;
}
if (pReader->step == EXTERNAL_ROWS_MAIN && pReader->innerReader[1] != NULL) {
// prepare for the next row scan
if (tSimpleHashGetSize(pReader->status.pTableMap) > 0) {
code = doOpenReaderImpl(pReader->innerReader[1]);
}
int32_t step = -1;
resetAllDataBlockScanInfo(pReader->innerReader[1]->status.pTableMap, pReader->info.window.ekey, step);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
code = doTsdbNextDataBlock2(pReader->innerReader[1], hasNext);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
pReader->step = EXTERNAL_ROWS_NEXT;
if (*hasNext) {
pStatus = &pReader->innerReader[1]->status;
if (pStatus->composedDataBlock) {
tsdbTrace("tsdb/read: %p, unlock read mutex", pReader);
code = tsdbReleaseReader(pReader);
}
return code;
}
}
tsdbTrace("tsdb/read: %p, unlock read mutex", pReader);
code = tsdbReleaseReader(pReader);
return code;
}
static void doFillNullColSMA(SBlockLoadSuppInfo* pSup, int32_t numOfRows, int32_t numOfCols, SColumnDataAgg* pTsAgg) {
// do fill all null column value SMA info
int32_t i = 0, j = 0;
int32_t size = (int32_t)TARRAY2_SIZE(&pSup->colAggArray);
int32_t code = TARRAY2_INSERT_PTR(&pSup->colAggArray, 0, pTsAgg);
if (code != TSDB_CODE_SUCCESS) {
return;
}
size++;
while (j < numOfCols && i < size) {
SColumnDataAgg* pAgg = &pSup->colAggArray.data[i];
if (pAgg->colId == pSup->colId[j]) {
i += 1;
j += 1;
} else if (pAgg->colId < pSup->colId[j]) {
i += 1;
} else if (pSup->colId[j] < pAgg->colId) {
if (pSup->colId[j] != PRIMARYKEY_TIMESTAMP_COL_ID) {
SColumnDataAgg nullColAgg = {.colId = pSup->colId[j], .numOfNull = numOfRows};
code = TARRAY2_INSERT_PTR(&pSup->colAggArray, i, &nullColAgg);
if (code != TSDB_CODE_SUCCESS) {
return;
}
i += 1;
size++;
}
j += 1;
}
}
while (j < numOfCols) {
if (pSup->colId[j] != PRIMARYKEY_TIMESTAMP_COL_ID) {
SColumnDataAgg nullColAgg = {.colId = pSup->colId[j], .numOfNull = numOfRows};
code = TARRAY2_INSERT_PTR(&pSup->colAggArray, i, &nullColAgg);
if (code != TSDB_CODE_SUCCESS) {
return;
}
i += 1;
}
j++;
}
}
int32_t tsdbRetrieveDatablockSMA2(STsdbReader* pReader, SSDataBlock* pDataBlock, bool* allHave, bool* hasNullSMA) {
SColumnDataAgg** pBlockSMA = &pDataBlock->pBlockAgg;
SFileDataBlockInfo* pBlockInfo = NULL;
int32_t code = 0;
*allHave = false;
*pBlockSMA = NULL;
if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
return TSDB_CODE_SUCCESS;
}
// there is no statistics data for composed block
if (pReader->status.composedDataBlock || (!pReader->suppInfo.smaValid)) {
return TSDB_CODE_SUCCESS;
}
code = getCurrentBlockInfo(&pReader->status.blockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
if (pResBlock->info.id.uid != pBlockInfo->uid) {
return TSDB_CODE_SUCCESS;
}
// int64_t st = taosGetTimestampUs();
TARRAY2_CLEAR(&pSup->colAggArray, 0);
SBrinRecord pRecord;
blockInfoToRecord(&pRecord, pBlockInfo, pSup);
code = tsdbDataFileReadBlockSma(pReader->pFileReader, &pRecord, &pSup->colAggArray);
if (code != TSDB_CODE_SUCCESS) {
tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pBlockInfo->uid, tstrerror(code),
pReader->idStr);
return code;
}
if (pSup->colAggArray.size > 0) {
*allHave = true;
} else {
*pBlockSMA = NULL;
return TSDB_CODE_SUCCESS;
}
// always load the first primary timestamp column data
SColumnDataAgg* pTsAgg = &pSup->tsColAgg;
pTsAgg->numOfNull = 0;
pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
pTsAgg->min = pResBlock->info.window.skey;
pTsAgg->max = pResBlock->info.window.ekey;
// update the number of NULL data rows
size_t numOfCols = pSup->numOfCols;
if (pResBlock->pBlockAgg == NULL) {
size_t num = taosArrayGetSize(pResBlock->pDataBlock);
pResBlock->pBlockAgg = taosMemoryCalloc(num, sizeof(SColumnDataAgg));
if (pResBlock->pBlockAgg == NULL) {
return terrno;
}
for (int i = 0; i < num; ++i) {
pResBlock->pBlockAgg[i].colId = -1;
}
}
// do fill all null column value SMA info
doFillNullColSMA(pSup, pBlockInfo->numRow, numOfCols, pTsAgg);
size_t size = pSup->colAggArray.size;
int32_t i = 0, j = 0;
while (j < numOfCols && i < size) {
SColumnDataAgg* pAgg = &pSup->colAggArray.data[i];
if (pAgg->colId == pSup->colId[j]) {
pResBlock->pBlockAgg[pSup->slotId[j]] = *pAgg;
i += 1;
j += 1;
} else if (pAgg->colId < pSup->colId[j]) {
i += 1;
} else if (pSup->colId[j] < pAgg->colId) {
pResBlock->pBlockAgg[pSup->slotId[j]].colId = -1;
*allHave = false;
j += 1;
}
}
*pBlockSMA = pResBlock->pBlockAgg;
pReader->cost.smaDataLoad += 1;
// double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
pReader->cost.smaLoadTime += 0; // elapsedTime;
tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", %s", 0, pBlockInfo->uid, pReader->idStr);
return code;
}
static int32_t doRetrieveDataBlock(STsdbReader* pReader, SSDataBlock** pBlock) {
SReaderStatus* pStatus = &pReader->status;
int32_t code = TSDB_CODE_SUCCESS;
SFileDataBlockInfo* pBlockInfo = NULL;
*pBlock = NULL;
code = getCurrentBlockInfo(&pStatus->blockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->code != TSDB_CODE_SUCCESS) {
return pReader->code;
}
STableBlockScanInfo* pBlockScanInfo = NULL;
code = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, &pBlockScanInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData, pBlockScanInfo->uid);
if (code != TSDB_CODE_SUCCESS) {
tBlockDataReset(&pStatus->fileBlockData);
return code;
}
code = copyBlockDataToSDataBlock(pReader, &pBlockScanInfo->lastProcKey);
if (code != TSDB_CODE_SUCCESS) {
tBlockDataReset(&pStatus->fileBlockData);
}
*pBlock = pReader->resBlockInfo.pResBlock;
return code;
}
int32_t tsdbRetrieveDataBlock2(STsdbReader* pReader, SSDataBlock** pBlock, SArray* pIdList) {
*pBlock = NULL;
STsdbReader* pTReader = pReader;
if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
if (pReader->step == EXTERNAL_ROWS_PREV) {
pTReader = pReader->innerReader[0];
} else if (pReader->step == EXTERNAL_ROWS_NEXT) {
pTReader = pReader->innerReader[1];
}
}
SReaderStatus* pStatus = &pTReader->status;
if (pStatus->composedDataBlock || pReader->info.execMode == READER_EXEC_ROWS) {
// tsdbReaderSuspend2(pReader);
// tsdbReaderResume2(pReader);
*pBlock = pTReader->resBlockInfo.pResBlock;
return TSDB_CODE_SUCCESS;
}
int32_t code = doRetrieveDataBlock(pTReader, pBlock);
tsdbTrace("tsdb/read-retrieve: %p, unlock read mutex", pReader);
(void) tsdbReleaseReader(pReader);
// tsdbReaderSuspend2(pReader);
// tsdbReaderResume2(pReader);
return code;
}
int32_t tsdbReaderReset2(STsdbReader* pReader, SQueryTableDataCond* pCond) {
int32_t code = TSDB_CODE_SUCCESS;
tsdbTrace("tsdb/reader-reset: %p, take read mutex", pReader);
code = tsdbAcquireReader(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->flag == READER_STATUS_SUSPEND) {
code = tsdbReaderResume2(pReader);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
}
if (isEmptyQueryTimeWindow(&pReader->info.window) || pReader->pReadSnap == NULL) {
tsdbDebug("tsdb reader reset return %p, %s", pReader->pReadSnap, pReader->idStr);
return tsdbReleaseReader(pReader);
}
SReaderStatus* pStatus = &pReader->status;
SDataBlockIter* pBlockIter = &pStatus->blockIter;
pReader->info.order = pCond->order;
pReader->type = TIMEWINDOW_RANGE_CONTAINED;
pReader->info.window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
pStatus->loadFromFile = true;
pStatus->pTableIter = NULL;
// allocate buffer in order to load data blocks from file
memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
tsdbDataFileReaderClose(&pReader->pFileReader);
int32_t numOfTables = tSimpleHashGetSize(pStatus->pTableMap);
code = initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->pfSetArray, pReader);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
code = resetDataBlockIterator(pBlockIter, pReader->info.order, shouldFreePkBuf(&pReader->suppInfo), pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
code = resetTableListIndex(&pReader->status, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
bool asc = ASCENDING_TRAVERSE(pReader->info.order);
int32_t step = asc ? 1 : -1;
int64_t ts = 0;
if (asc) {
ts = (pReader->info.window.skey > INT64_MIN) ? pReader->info.window.skey - 1 : pReader->info.window.skey;
} else {
ts = (pReader->info.window.ekey < INT64_MAX) ? pReader->info.window.ekey + 1 : pReader->info.window.ekey;
}
resetAllDataBlockScanInfo(pStatus->pTableMap, ts, step);
// no data in files, let's try buffer in memory
if (pStatus->fileIter.numOfFiles == 0) {
pStatus->loadFromFile = false;
code = resetTableListIndex(pStatus, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
} else {
code = initForFirstBlockInFile(pReader, pBlockIter);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
numOfTables, pReader->info.window.skey, pReader->info.window.ekey, pReader->idStr);
(void) tsdbReleaseReader(pReader);
return code;
}
}
tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, skey:%" PRId64 ", query range:%" PRId64 " - %" PRId64
" in query %s",
pReader, pReader->info.suid, numOfTables, pCond->twindows.skey, pReader->info.window.skey,
pReader->info.window.ekey, pReader->idStr);
code = tsdbReleaseReader(pReader);
return code;
}
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows, int32_t numOfBucket) {
if (numOfRows < startRow) {
return 0;
}
int32_t bucketIndex = ((numOfRows - startRow) / bucketRange);
if (bucketIndex == numOfBucket) {
bucketIndex -= 1;
}
return bucketIndex;
}
int32_t tsdbGetFileBlocksDistInfo2(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
int32_t code = TSDB_CODE_SUCCESS;
const int32_t numOfBuckets = 20.0;
pTableBlockInfo->totalSize = 0;
pTableBlockInfo->totalRows = 0;
pTableBlockInfo->numOfVgroups = 1;
// find the start data block in file
code = tsdbAcquireReader(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->flag == READER_STATUS_SUSPEND) {
code = tsdbReaderResume2(pReader);
if (code != TSDB_CODE_SUCCESS) {
return tsdbReleaseReader(pReader);
}
}
SMergeTreeConf conf = {
.pReader = pReader,
.pSchema = pReader->info.pSchema,
.pCols = pReader->suppInfo.colId,
.numOfCols = pReader->suppInfo.numOfCols,
.suid = pReader->info.suid,
};
SReaderStatus* pStatus = &pReader->status;
if (pStatus->pCurrentFileset != NULL) {
pTableBlockInfo->numOfSttRows += tsdbGetRowsInSttFiles(pStatus->pCurrentFileset, pStatus->pLDataIterArray,
pReader->pTsdb, &conf, pReader->idStr);
}
STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
pTableBlockInfo->defMinRows = pc->minRows;
pTableBlockInfo->defMaxRows = pc->maxRows;
int32_t bucketRange = ceil(((double)(pc->maxRows - pc->minRows)) / numOfBuckets);
pTableBlockInfo->numOfFiles += 1;
int32_t numOfTables = (int32_t)tSimpleHashGetSize(pStatus->pTableMap);
SDataBlockIter* pBlockIter = &pStatus->blockIter;
pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
if (pBlockIter->numOfBlocks > 0) {
pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
}
pTableBlockInfo->numOfTables = numOfTables;
bool hasNext = (pBlockIter->numOfBlocks > 0);
while (true) {
if (hasNext) {
SFileDataBlockInfo* pBlockInfo = NULL;
code = getCurrentBlockInfo(pBlockIter, &pBlockInfo, pReader->idStr);
if (code != TSDB_CODE_SUCCESS) {
break;
}
int32_t numOfRows = pBlockInfo->numRow;
pTableBlockInfo->totalRows += numOfRows;
if (numOfRows > pTableBlockInfo->maxRows) {
pTableBlockInfo->maxRows = numOfRows;
}
if (numOfRows < pTableBlockInfo->minRows) {
pTableBlockInfo->minRows = numOfRows;
}
pTableBlockInfo->totalSize += pBlockInfo->blockSize;
int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows, numOfBuckets);
pTableBlockInfo->blockRowsHisto[bucketIndex]++;
hasNext = blockIteratorNext(&pStatus->blockIter, pReader->idStr);
} else {
code = initForFirstBlockInFile(pReader, pBlockIter);
if ((code != TSDB_CODE_SUCCESS) || (pStatus->loadFromFile == false)) {
break;
}
// add the data in stt files of new fileset
if (pStatus->pCurrentFileset != NULL) {
pTableBlockInfo->numOfSttRows += tsdbGetRowsInSttFiles(pStatus->pCurrentFileset, pStatus->pLDataIterArray,
pReader->pTsdb, &conf, pReader->idStr);
}
pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
hasNext = (pBlockIter->numOfBlocks > 0);
}
}
// record the data in stt files
(void) tsdbReleaseReader(pReader);
return code;
}
static void getMemTableTimeRange(STsdbReader* pReader, int64_t* pMaxKey, int64_t* pMinKey) {
int32_t code = TSDB_CODE_SUCCESS;
int64_t rows = 0;
SReaderStatus* pStatus = &pReader->status;
int32_t iter = 0;
int64_t maxKey = INT64_MIN;
int64_t minKey = INT64_MAX;
void* pHashIter = tSimpleHashIterate(pStatus->pTableMap, NULL, &iter);
while (pHashIter != NULL) {
STableBlockScanInfo* pBlockScanInfo = *(STableBlockScanInfo**)pHashIter;
STbData* d = NULL;
if (pReader->pReadSnap->pMem != NULL) {
d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->info.suid, pBlockScanInfo->uid);
if (d != NULL) {
if (d->maxKey > maxKey) {
maxKey = d->maxKey;
}
if (d->minKey < minKey) {
minKey = d->minKey;
}
}
}
STbData* di = NULL;
if (pReader->pReadSnap->pIMem != NULL) {
di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->info.suid, pBlockScanInfo->uid);
if (di != NULL) {
if (di->maxKey > maxKey) {
maxKey = di->maxKey;
}
if (di->minKey < minKey) {
minKey = di->minKey;
}
}
}
// current table is exhausted, let's try the next table
pHashIter = tSimpleHashIterate(pStatus->pTableMap, pHashIter, &iter);
}
*pMaxKey = maxKey;
*pMinKey = minKey;
}
int64_t tsdbGetNumOfRowsInMemTable2(STsdbReader* pReader) {
int32_t code = TSDB_CODE_SUCCESS;
int64_t rows = 0;
SReaderStatus* pStatus = &pReader->status;
code = tsdbAcquireReader(pReader);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
if (pReader->flag == READER_STATUS_SUSPEND) {
code = tsdbReaderResume2(pReader);
if (code != TSDB_CODE_SUCCESS) {
(void) tsdbReleaseReader(pReader);
return code;
}
}
int32_t iter = 0;
pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, NULL, &iter);
while (pStatus->pTableIter != NULL) {
STableBlockScanInfo* pBlockScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
STbData* d = NULL;
if (pReader->pReadSnap->pMem != NULL) {
d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->info.suid, pBlockScanInfo->uid);
if (d != NULL) {
rows += tsdbGetNRowsInTbData(d);
}
}
STbData* di = NULL;
if (pReader->pReadSnap->pIMem != NULL) {
di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->info.suid, pBlockScanInfo->uid);
if (di != NULL) {
rows += tsdbGetNRowsInTbData(di);
}
}
// current table is exhausted, let's try the next table
pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, pStatus->pTableIter, &iter);
}
(void) tsdbReleaseReader(pReader);
return rows;
}
int32_t tsdbGetTableSchema(SMeta* pMeta, int64_t uid, STSchema** pSchema, int64_t* suid) {
SMetaReader mr = {0};
metaReaderDoInit(&mr, pMeta, META_READER_LOCK);
int32_t code = metaReaderGetTableEntryByUidCache(&mr, uid);
if (code != TSDB_CODE_SUCCESS) {
code = TSDB_CODE_TDB_INVALID_TABLE_ID;
metaReaderClear(&mr);
return code;
}
*suid = 0;
// only child table and ordinary table is allowed, super table is not allowed.
if (mr.me.type == TSDB_CHILD_TABLE) {
tDecoderClear(&mr.coder);
*suid = mr.me.ctbEntry.suid;
code = metaReaderGetTableEntryByUidCache(&mr, *suid);
if (code != TSDB_CODE_SUCCESS) {
code = TSDB_CODE_TDB_INVALID_TABLE_ID;
metaReaderClear(&mr);
return code;
}
} else if (mr.me.type == TSDB_NORMAL_TABLE) { // do nothing
} else {
code = TSDB_CODE_INVALID_PARA;
metaReaderClear(&mr);
return code;
}
metaReaderClear(&mr);
// get the newest table schema version
code = metaGetTbTSchemaEx(pMeta, *suid, uid, -1, pSchema);
return code;
}
int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STsdbReadSnap** ppSnap, const char* id) {
int32_t code = 0;
STsdb* pTsdb = pReader->pTsdb;
SVersionRange* pRange = &pReader->info.verRange;
int32_t lino = 0;
*ppSnap = NULL;
// lock
code = taosThreadMutexLock(&pTsdb->mutex);
if (code != TSDB_CODE_SUCCESS) {
tsdbError("%s failed to lock tsdb, code:%s", id, tstrerror(code));
return code;
}
// alloc
STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap));
if (pSnap == NULL) {
(void) taosThreadMutexUnlock(&pTsdb->mutex);
TSDB_CHECK_NULL(pSnap, code, lino, _exit, terrno);
}
// take snapshot
if (pTsdb->mem && (pRange->minVer <= pTsdb->mem->maxVer && pRange->maxVer >= pTsdb->mem->minVer)) {
pSnap->pMem = pTsdb->mem;
pSnap->pNode = taosMemoryMalloc(sizeof(*pSnap->pNode));
if (pSnap->pNode == NULL) {
(void) taosThreadMutexUnlock(&pTsdb->mutex);
TSDB_CHECK_NULL(pSnap->pNode, code, lino, _exit, terrno);
}
pSnap->pNode->pQHandle = pReader;
pSnap->pNode->reseek = reseek;
code = tsdbRefMemTable(pTsdb->mem, pSnap->pNode);
TSDB_CHECK_CODE(code, lino, _exit);
}
if (pTsdb->imem && (pRange->minVer <= pTsdb->imem->maxVer && pRange->maxVer >= pTsdb->imem->minVer)) {
pSnap->pIMem = pTsdb->imem;
pSnap->pINode = taosMemoryMalloc(sizeof(*pSnap->pINode));
if (pSnap->pINode == NULL) {
code = terrno;
if (pTsdb->mem && pSnap->pNode) {
tsdbUnrefMemTable(pTsdb->mem, pSnap->pNode, true); // unref the previous refed mem
}
(void) taosThreadMutexUnlock(&pTsdb->mutex);
goto _exit;
}
pSnap->pINode->pQHandle = pReader;
pSnap->pINode->reseek = reseek;
code = tsdbRefMemTable(pTsdb->imem, pSnap->pINode);
TSDB_CHECK_CODE(code, lino, _exit);
}
// fs
code = tsdbFSCreateRefSnapshotWithoutLock(pTsdb->pFS, &pSnap->pfSetArray);
if (code) {
if (pSnap->pNode) {
tsdbUnrefMemTable(pTsdb->mem, pSnap->pNode, true); // unref the previous refed mem
}
if (pSnap->pINode) {
tsdbUnrefMemTable(pTsdb->imem, pSnap->pINode, true);
}
(void) taosThreadMutexUnlock(&pTsdb->mutex);
TSDB_CHECK_CODE(code, lino, _exit);
}
// unlock
(void) taosThreadMutexUnlock(&pTsdb->mutex);
*ppSnap = pSnap;
tsdbTrace("%s vgId:%d, take read snapshot", id, TD_VID(pTsdb->pVnode));
return code;
_exit:
tsdbError("%s vgId:%d take read snapshot failed, line:%d code:%s", id, TD_VID(pTsdb->pVnode), lino, tstrerror(code));
if (pSnap) {
if (pSnap->pNode) taosMemoryFree(pSnap->pNode);
if (pSnap->pINode) taosMemoryFree(pSnap->pINode);
taosMemoryFree(pSnap);
}
return code;
}
void tsdbUntakeReadSnap2(STsdbReader* pReader, STsdbReadSnap* pSnap, bool proactive) {
STsdb* pTsdb = pReader->pTsdb;
if (pSnap) {
if (pSnap->pMem) {
tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, proactive);
}
if (pSnap->pIMem) {
tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, proactive);
}
if (pSnap->pNode) taosMemoryFree(pSnap->pNode);
if (pSnap->pINode) taosMemoryFree(pSnap->pINode);
tsdbFSDestroyRefSnapshot(&pSnap->pfSetArray);
taosMemoryFree(pSnap);
}
tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}
// if failed, do nothing
int32_t tsdbReaderSetId(void* p, const char* idstr) {
STsdbReader* pReader = (STsdbReader*) p;
taosMemoryFreeClear(pReader->idStr);
pReader->idStr = taosStrdup(idstr);
if (pReader->idStr == NULL) {
tsdbError("%s failed to build reader id, code:%s", idstr, tstrerror(terrno));
return terrno;
}
pReader->status.fileIter.pSttBlockReader->mergeTree.idStr = pReader->idStr;
return 0;
}
void tsdbReaderSetCloseFlag(STsdbReader* pReader) { /*pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;*/ }
void tsdbSetFilesetDelimited(STsdbReader* pReader) { pReader->bFilesetDelimited = true; }
void tsdbReaderSetNotifyCb(STsdbReader* pReader, TsdReaderNotifyCbFn notifyFn, void* param) {
pReader->notifyFn = notifyFn;
pReader->notifyParam = param;
}