fix crash when inserting big csv

Bob Liu 2023-11-29 01:53:13 +08:00
parent 4439c1e4f5
commit 5c5efa34eb
9 changed files with 238 additions and 10 deletions
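For context: judging from the batching and pCreateTbReq changes below, the crash occurs when a CSV import is large enough to be parsed and submitted in multiple batches. A minimal sketch of the statement shape involved, using the schema from the ts-4272 test case added in this commit (the file path is illustrative):

CREATE STABLE meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT);
-- A file with more than tsMaxInsertBatchRows rows is processed in batches;
-- auto-create (USING ... TAGS) combined with FILE exercises the fixed path.
INSERT INTO d2001 USING meters(groupId) TAGS(5) FILE '/tmp/b.csv';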

View File

@ -449,6 +449,7 @@ typedef struct SVnodeModifyOpStmt {
SHashObj* pSubTableHashObj; // SHashObj<table_name, STableMeta*>
SHashObj* pTableNameHashObj; // set of table names for refreshing meta, sync mode
SHashObj* pDbFNameHashObj; // set of db names for refreshing meta, sync mode
+ SHashObj* pTableCxtHashObj; // temp SHashObj<tuid, STableDataCxt*> for single request
SArray* pVgDataBlocks; // SArray<SVgroupDataCxt*>
SVCreateTbReq* pCreateTblReq;
TdFilePtr fp;

View File

@ -887,6 +887,7 @@ void nodesDestroyNode(SNode* pNode) {
taosHashCleanup(pStmt->pSubTableHashObj);
taosHashCleanup(pStmt->pTableNameHashObj);
taosHashCleanup(pStmt->pDbFNameHashObj);
+ taosHashCleanup(pStmt->pTableCxtHashObj);
if (pStmt->freeHashFunc) {
pStmt->freeHashFunc(pStmt->pTableBlockHashObj);
}

View File

@ -50,7 +50,7 @@ void insCheckTableDataOrder(STableDataCxt *pTableCxt, TSKEY tsKey);
int32_t insGetTableDataCxt(SHashObj *pHash, void *id, int32_t idLen, STableMeta *pTableMeta,
SVCreateTbReq **pCreateTbReq, STableDataCxt **pTableCxt, bool colMode, bool ignoreColVals);
int32_t initTableColSubmitData(STableDataCxt *pTableCxt);
- int32_t insMergeTableDataCxt(SHashObj *pTableHash, SArray **pVgDataBlocks);
+ int32_t insMergeTableDataCxt(SHashObj *pTableHash, SArray **pVgDataBlocks, bool isRebuild);
int32_t insBuildVgDataBlocks(SHashObj *pVgroupsHashObj, SArray *pVgDataBlocks, SArray **pDataBlocks);
void insDestroyTableDataCxtHashMap(SHashObj *pTableCxtHash);
void insDestroyVgroupDataCxt(SVgroupDataCxt *pVgCxt);

View File

@ -425,7 +425,7 @@ SQuery* smlInitHandle() {
int32_t smlBuildOutput(SQuery* handle, SHashObj* pVgHash) {
SVnodeModifyOpStmt* pStmt = (SVnodeModifyOpStmt*)(handle)->pRoot;
// merge according to vgId
- int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks);
+ int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks, true);
if (code != TSDB_CODE_SUCCESS) {
uError("insMergeTableDataCxt failed");
return code;

View File

@ -55,6 +55,7 @@ typedef struct SInsertParseContext {
bool usingDuplicateTable;
bool forceUpdate;
bool needTableTagVal;
+ bool needRequest; // whether to send a request to the server
} SInsertParseContext;
typedef int32_t (*_row_append_fn_t)(SMsgBuf* pMsgBuf, const void* value, int32_t len, void* param);
@ -652,6 +653,10 @@ static int32_t parseTagValue(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStm
}
static int32_t buildCreateTbReq(SVnodeModifyOpStmt* pStmt, STag* pTag, SArray* pTagName) {
+ if (pStmt->pCreateTblReq) {
+   tdDestroySVCreateTbReq(pStmt->pCreateTblReq);
+   taosMemoryFreeClear(pStmt->pCreateTblReq);
+ }
pStmt->pCreateTblReq = taosMemoryCalloc(1, sizeof(SVCreateTbReq));
if (NULL == pStmt->pCreateTblReq) {
return TSDB_CODE_OUT_OF_MEMORY;
@ -1992,7 +1997,7 @@ static int32_t parseCsvFile(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStmt
(*pNumOfRows)++;
}
- if (TSDB_CODE_SUCCESS == code && (*pNumOfRows) > tsMaxInsertBatchRows) {
+ if (TSDB_CODE_SUCCESS == code && (*pNumOfRows) >= tsMaxInsertBatchRows) {
pStmt->fileProcessing = true;
break;
}
@ -2003,7 +2008,7 @@ static int32_t parseCsvFile(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStmt
parserDebug("0x%" PRIx64 " %d rows have been parsed", pCxt->pComCxt->requestId, *pNumOfRows);
- if (TSDB_CODE_SUCCESS == code && 0 == (*pNumOfRows) &&
+ if (TSDB_CODE_SUCCESS == code && 0 == (*pNumOfRows) && 0 == pStmt->totalRowsNum &&
(!TSDB_QUERY_HAS_TYPE(pStmt->insertType, TSDB_QUERY_TYPE_STMT_INSERT)) && !pStmt->fileProcessing) {
code = buildSyntaxErrMsg(&pCxt->msg, "no any data points", NULL);
}
@ -2022,7 +2027,22 @@ static int32_t parseDataFromFileImpl(SInsertParseContext* pCxt, SVnodeModifyOpSt
} else {
parserDebug("0x%" PRIx64 " insert from csv. File is too large, do it in batches.", pCxt->pComCxt->requestId);
}
+ if (pStmt->insertType != TSDB_QUERY_TYPE_FILE_INSERT) {
+   return buildSyntaxErrMsg(&pCxt->msg, "keyword VALUES or FILE is exclusive", NULL);
+ }
}
+ // only record the pTableCxt whose data comes from a file
+ if (numOfRows > 0) {
+   if (NULL == pStmt->pTableCxtHashObj) {
+     pStmt->pTableCxtHashObj =
+         taosHashInit(128, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_NO_LOCK);
+   }
+   void* pData = rowsDataCxt.pTableDataCxt;
+   taosHashPut(pStmt->pTableCxtHashObj, &pStmt->pTableMeta->uid, sizeof(pStmt->pTableMeta->uid), &pData,
+               POINTER_BYTES);
+ }
return code;
}
@ -2061,6 +2081,9 @@ static int32_t parseDataClause(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pS
NEXT_TOKEN(pStmt->pSql, token);
switch (token.type) {
case TK_VALUES:
+ if (TSDB_QUERY_HAS_TYPE(pStmt->insertType, TSDB_QUERY_TYPE_FILE_INSERT)) {
+   return buildSyntaxErrMsg(&pCxt->msg, "keyword VALUES or FILE is exclusive", token.z);
+ }
return parseValuesClause(pCxt, pStmt, rowsDataCxt, &token);
case TK_FILE:
return parseFileClause(pCxt, pStmt, rowsDataCxt, &token);
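With this hunk and its counterpart in parseDataFromFileImpl, VALUES and FILE may no longer be mixed in a single INSERT statement. A sketch of the now-rejected shape, using the tables from the test_mix case added below (the CSV path is illustrative):

-- Fails with: keyword VALUES or FILE is exclusive
INSERT INTO d001 FILE '/tmp/2k.csv'
            d002 VALUES ('2021-07-13 14:06:34.630', 10.2, 219, 0.32);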
@ -2275,8 +2298,25 @@ static int32_t parseInsertBodyBottom(SInsertParseContext* pCxt, SVnodeModifyOpSt
return setStmtInfo(pCxt, pStmt);
}
+ // release the old array allocated by merge
+ pStmt->freeArrayFunc(pStmt->pVgDataBlocks);
+ pStmt->pVgDataBlocks = NULL;
+ bool fileOnly = (pStmt->insertType == TSDB_QUERY_TYPE_FILE_INSERT);
+ if (fileOnly) {
+   // no data, skip merge & build vgdata
+   if (0 == taosHashGetSize(pStmt->pTableCxtHashObj)) {
+     pCxt->needRequest = false;
+     return TSDB_CODE_SUCCESS;
+   }
+ }
// merge according to vgId
- int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks);
+ int32_t code = insMergeTableDataCxt(fileOnly ? pStmt->pTableCxtHashObj : pStmt->pTableBlockHashObj,
+                                     &pStmt->pVgDataBlocks, pStmt->fileProcessing);
+ // clear the temporary hash object only
+ taosHashClear(pStmt->pTableCxtHashObj);
if (TSDB_CODE_SUCCESS == code) {
code = insBuildVgDataBlocks(pStmt->pVgroupsHashObj, pStmt->pVgDataBlocks, &pStmt->pDataBlocks);
}
@ -2718,6 +2758,7 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal
.msg = {.buf = pCxt->pMsg, .len = pCxt->msgLen},
.missCache = false,
.usingDuplicateTable = false,
+ .needRequest = true,
.forceUpdate = (NULL != pCatalogReq ? pCatalogReq->forceUpdate : false)};
int32_t code = initInsertQuery(&context, pCatalogReq, pMetaData, pQuery);
@ -2732,5 +2773,10 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal
code = setRefreshMeta(*pQuery);
}
insDestroyBoundColInfo(&context.tags);
+ // if there is no data to insert, set empty mode to avoid requesting the server
+ if (!context.needRequest) {
+   (*pQuery)->execMode = QUERY_EXEC_MODE_EMPTY_RESULT;
+ }
return code;
}
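Together with the totalRowsNum check in parseCsvFile above, this covers the trailing batch of a batched file import: if a FILE-only statement stages no rows while earlier batches already inserted data, the query is completed locally as an empty result instead of sending an empty submit to the server, and "no any data points" is raised only when the whole statement produced zero rows. A hedged illustration of the user-visible behavior, inferred from the diff rather than verified against a server:

-- c.csv exceeds tsMaxInsertBatchRows rows and is parsed in batches; a final
-- pass that stages no further rows now completes locally as
-- QUERY_EXEC_MODE_EMPTY_RESULT instead of issuing an empty server request.
INSERT INTO d2002 FILE '/tmp/c.csv';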

View File

@ -58,7 +58,7 @@ int32_t qBuildStmtOutput(SQuery* pQuery, SHashObj* pVgHash, SHashObj* pBlockHash
// merge according to vgId
if (taosHashGetSize(pBlockHash) > 0) {
-   code = insMergeTableDataCxt(pBlockHash, &pVgDataBlocks);
+   code = insMergeTableDataCxt(pBlockHash, &pVgDataBlocks, true);
}
if (TSDB_CODE_SUCCESS == code) {
code = insBuildVgDataBlocks(pVgHash, pVgDataBlocks, &pStmt->pDataBlocks);

View File

@ -289,6 +289,14 @@ static int32_t rebuildTableData(SSubmitTbData* pSrc, SSubmitTbData** pDst) {
pTmp->uid = pSrc->uid;
pTmp->sver = pSrc->sver;
pTmp->pCreateTbReq = NULL;
+ if (pTmp->flags & SUBMIT_REQ_AUTO_CREATE_TABLE) {
+   if (pSrc->pCreateTbReq) {
+     cloneSVreateTbReq(pSrc->pCreateTbReq, &pTmp->pCreateTbReq);
+   } else {
+     pTmp->flags &= ~SUBMIT_REQ_AUTO_CREATE_TABLE;
+   }
+ }
if (pTmp->flags & SUBMIT_REQ_COLUMN_DATA_FORMAT) {
pTmp->aCol = taosArrayInit(128, sizeof(SColData));
if (NULL == pTmp->aCol) {
@ -416,15 +424,21 @@ void insDestroyTableDataCxtHashMap(SHashObj* pTableCxtHash) {
taosHashCleanup(pTableCxtHash);
}
- static int32_t fillVgroupDataCxt(STableDataCxt* pTableCxt, SVgroupDataCxt* pVgCxt) {
+ static int32_t fillVgroupDataCxt(STableDataCxt* pTableCxt, SVgroupDataCxt* pVgCxt, bool isRebuild) {
if (NULL == pVgCxt->pData->aSubmitTbData) {
pVgCxt->pData->aSubmitTbData = taosArrayInit(128, sizeof(SSubmitTbData));
if (NULL == pVgCxt->pData->aSubmitTbData) {
return TSDB_CODE_OUT_OF_MEMORY;
}
}
// push data to submit, rebuild empty data for next submit
taosArrayPush(pVgCxt->pData->aSubmitTbData, pTableCxt->pData);
- rebuildTableData(pTableCxt->pData, &pTableCxt->pData);
+ if (isRebuild) {
+   rebuildTableData(pTableCxt->pData, &pTableCxt->pData);
+ } else {
+   taosMemoryFreeClear(pTableCxt->pData);
+ }
qDebug("add tableDataCxt uid:%" PRId64 " to vgId:%d", pTableCxt->pMeta->uid, pVgCxt->vgId);
@ -467,7 +481,7 @@ int insColDataComp(const void* lp, const void* rp) {
return 0;
}
- int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
+ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks, bool isRebuild) {
SHashObj* pVgroupHash = taosHashInit(128, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, false);
SArray* pVgroupList = taosArrayInit(8, POINTER_BYTES);
if (NULL == pVgroupHash || NULL == pVgroupList) {
@ -502,6 +516,13 @@ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
tColDataSortMerge(pTableCxt->pData->aCol);
} else {
+ // skip tables that have no data to insert
+ // e.g. importing a csv without valid data
+ // if (0 == taosArrayGetSize(pTableCxt->pData->aRowP)) {
+ //   qWarn("no row in tableDataCxt uid:%" PRId64 " ", pTableCxt->pMeta->uid);
+ //   p = taosHashIterate(pTableHash, p);
+ //   continue;
+ // }
if (!pTableCxt->ordered) {
code = tRowSort(pTableCxt->pData->aRowP);
}
@ -520,7 +541,7 @@ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
pVgCxt = *(SVgroupDataCxt**)pp;
}
if (TSDB_CODE_SUCCESS == code) {
-     code = fillVgroupDataCxt(pTableCxt, pVgCxt);
+     code = fillVgroupDataCxt(pTableCxt, pVgCxt, isRebuild);
}
}
if (TSDB_CODE_SUCCESS == code) {

View File

@ -300,6 +300,7 @@ e
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/precisionUS.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/precisionNS.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_ts4219.py
+ ,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/ts-4272.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_ts4295.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_td27388.py
,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/show.py

View File

@ -0,0 +1,158 @@
import csv
import os
from datetime import datetime

import taos
from util.log import *
from util.sql import *
from util.cases import *
from util.dnodes import *
from util.common import *


class TDTestCase:
    def init(self, conn, logSql, replicaVar=1):
        self.replicaVar = int(replicaVar)
        self.testcasePath = os.path.split(__file__)[0]
        self.testcaseFilename = os.path.split(__file__)[-1]
        self.ts = 1700638570000  # 2023-11-22T07:36:10.000Z
        self.tb1 = 'd001'
        self.tb2 = 'd002'
        self.tag1 = 'using meters(groupId) tags(1)'
        self.tag2 = 'using meters(groupId) tags(2)'
        self.file1 = f"{self.testcasePath}/b.csv"
        self.file2 = f"{self.testcasePath}/c.csv"
        os.system("rm -rf %s/b.csv" % self.testcasePath)
        tdLog.debug(f"start to execute {__file__}")
        tdSql.init(conn.cursor(), logSql)

    def check_count(self, rows, records):
        tdSql.execute("use d1;")
        tdSql.query("select tbname,count(*) from meters group by tbname order by tbname;")
        tdSql.checkRows(rows)
        for i in range(rows):
            tdSql.checkData(i, 1, records[i])

    def reset_tb(self):
        # create database and tables
        # os.system("taos -s 'drop database if exists d1;'")
        # os.system("taos -s 'create database d1;use d1;CREATE STABLE meters (ts timestamp, current float, voltage int, phase float) TAGS (location binary(64), groupId int);'")
        # os.system(f"taos -s 'use d1;Create table d2001 using meters(groupId) tags(5);'")
        # res = os.system(f"taos -s 'use d1;Create table d2002 using meters(groupId) tags(6);'")
        # if 0 != res:
        #     tdLog.exit("create tb error")
        tdSql.execute("drop database if exists d1;")
        tdSql.execute("create database d1;")
        tdSql.execute("use d1;")
        tdSql.execute("CREATE STABLE meters (ts timestamp, current float, voltage int, phase float) TAGS (location binary(64), groupId int);")
        tdSql.execute("Create table d2001 using meters(groupId) tags(5);")
        tdSql.execute("Create table d2002 using meters(groupId) tags(6);")

    def test(self, sql):
        sql = "use d1;" + sql
        res = os.system(f'taos -s "{sql}"')
        # if 0 != res:
        #     tdLog.exit("taos sql error")

    def check(self):
        # same table, auto create + create
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb1} {self.tag1} file '{self.file2}';"
        self.test(sql)
        # same table, create + insert
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb1} file '{self.file2}';"
        self.test(sql)
        # same table, insert + create
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb1} {self.tag1} file '{self.file2}';"
        self.test(sql)
        # same table, insert + insert
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb1} file '{self.file2}';"
        self.test(sql)
        # diff table, auto create + create
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb2} {self.tag2} file '{self.file2}';"
        self.test(sql)
        # diff table, create + insert
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb2} file '{self.file2}';"
        self.test(sql)
        # diff table, insert + create
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb2} {self.tag2} file '{self.file2}';"
        self.test(sql)
        # diff table, insert + insert
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb2} file '{self.file2}';"
        self.test(sql)
        # bigNum = 1010000
        # self.check_count(5, [2100, 2100, bigNum, bigNum, bigNum])
        result = os.popen("taos -s 'select count(*) from d1.%s'" % self.tb1)
        res = result.read()
        if "OK" in res:
            tdLog.info("check count success")

    def make_csv(self, filepath, once, qtime, startts):
        with open(filepath, 'w') as f:
            writer = csv.writer(f)
            for j in range(qtime):
                ts = startts + j * once
                rows = []
                for i in range(once):
                    rows.append([ts + i, 0.3 + (i % 10) / 100.0, 210 + i % 10, 10.0 + (i % 20) / 20.0])
                writer.writerows(rows)
        print(datetime.now(), filepath, " ready!")

    def test_mix(self):
        # using both VALUES and FILE in one insert statement is forbidden
        # (run through the taos shell; the original popen of bare SQL could not execute it)
        sql = f"insert into {self.tb1} file '{self.testcasePath}/csv/2k.csv' {self.tb2} values('2021-07-13 14:06:34.630', 10.2, 219, 0.32);"
        result = os.popen(f'taos -s "use d1;{sql}"')
        res = result.read()
        if "error" in res:
            tdLog.info("forbid success")

    def test_bigcsv(self):
        # prepare csv
        print("start csv data prepare")
        once = 10000
        qtime1 = 101
        qtime2 = 100
        rowNum1 = qtime1 * once
        rowNum2 = qtime2 * once
        self.make_csv(self.file1, once, qtime1, self.ts - 86400000)
        self.make_csv(self.file2, once, qtime2, self.ts)
        print("end csv data prepare")

        # auto create + insert
        sql = f"INSERT INTO d2001 using meters(groupId) tags(5) FILE '{self.file1}';"
        self.test(sql)
        # only insert
        sql = f"INSERT INTO d2002 FILE '{self.file2}';"
        self.test(sql)

        # tdSql.execute("use d1;")
        tdSql.query("select tbname,count(*) from meters group by tbname order by tbname;")
        tdSql.checkRows(2)
        tdSql.checkData(0, 1, rowNum1)
        tdSql.checkData(1, 1, rowNum2)

    def run(self):
        tdSql.prepare()
        self.reset_tb()
        self.test_bigcsv()
        self.test_mix()
        self.check()

    def stop(self):
        tdSql.close()
        tdLog.success(f"{__file__} successfully executed")


tdCases.addLinux(__file__, TDTestCase())
tdCases.addWindows(__file__, TDTestCase())