fix crash when inserting big csv

Bob Liu 2023-11-29 01:53:13 +08:00
parent 4439c1e4f5
commit 5c5efa34eb
9 changed files with 238 additions and 10 deletions
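For context: judging from the batching and pCreateTbReq changes below, the crash occurs when a CSV import is large enough to be parsed and submitted in multiple batches. A minimal sketch of the statement shape involved, using the schema from the ts-4272 test case added in this commit (the file path is illustrative):

CREATE STABLE meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT);
-- A file with more than tsMaxInsertBatchRows rows is processed in batches;
-- auto-create (USING ... TAGS) combined with FILE exercises the fixed path.
INSERT INTO d2001 USING meters(groupId) TAGS(5) FILE '/tmp/b.csv';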

View File

@ -449,6 +449,7 @@ typedef struct SVnodeModifyOpStmt {
SHashObj* pSubTableHashObj; // SHashObj<table_name, STableMeta*>
SHashObj* pTableNameHashObj; // set of table names for refreshing meta, sync mode
SHashObj* pDbFNameHashObj; // set of db names for refreshing meta, sync mode
+ SHashObj* pTableCxtHashObj; // temp SHashObj<tuid, STableDataCxt*> for single request
SArray* pVgDataBlocks; // SArray<SVgroupDataCxt*>
SVCreateTbReq* pCreateTblReq;
TdFilePtr fp;

View File

@ -887,6 +887,7 @@ void nodesDestroyNode(SNode* pNode) {
taosHashCleanup(pStmt->pSubTableHashObj);
taosHashCleanup(pStmt->pTableNameHashObj);
taosHashCleanup(pStmt->pDbFNameHashObj);
+ taosHashCleanup(pStmt->pTableCxtHashObj);
if (pStmt->freeHashFunc) {
pStmt->freeHashFunc(pStmt->pTableBlockHashObj);
}

View File

@ -50,7 +50,7 @@ void insCheckTableDataOrder(STableDataCxt *pTableCxt, TSKEY tsKey);
int32_t insGetTableDataCxt(SHashObj *pHash, void *id, int32_t idLen, STableMeta *pTableMeta,
SVCreateTbReq **pCreateTbReq, STableDataCxt **pTableCxt, bool colMode, bool ignoreColVals);
int32_t initTableColSubmitData(STableDataCxt *pTableCxt);
- int32_t insMergeTableDataCxt(SHashObj *pTableHash, SArray **pVgDataBlocks);
+ int32_t insMergeTableDataCxt(SHashObj *pTableHash, SArray **pVgDataBlocks, bool isRebuild);
int32_t insBuildVgDataBlocks(SHashObj *pVgroupsHashObj, SArray *pVgDataBlocks, SArray **pDataBlocks);
void insDestroyTableDataCxtHashMap(SHashObj *pTableCxtHash);
void insDestroyVgroupDataCxt(SVgroupDataCxt *pVgCxt);

View File

@ -425,7 +425,7 @@ SQuery* smlInitHandle() {
int32_t smlBuildOutput(SQuery* handle, SHashObj* pVgHash) {
SVnodeModifyOpStmt* pStmt = (SVnodeModifyOpStmt*)(handle)->pRoot;
// merge according to vgId
- int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks);
+ int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks, true);
if (code != TSDB_CODE_SUCCESS) {
uError("insMergeTableDataCxt failed");
return code;

View File

@ -55,6 +55,7 @@ typedef struct SInsertParseContext {
bool usingDuplicateTable;
bool forceUpdate;
bool needTableTagVal;
+ bool needRequest; // whether to send a request to the server
} SInsertParseContext;
typedef int32_t (*_row_append_fn_t)(SMsgBuf* pMsgBuf, const void* value, int32_t len, void* param);
@ -652,6 +653,10 @@ static int32_t parseTagValue(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStm
}
static int32_t buildCreateTbReq(SVnodeModifyOpStmt* pStmt, STag* pTag, SArray* pTagName) {
+ if (pStmt->pCreateTblReq) {
+   tdDestroySVCreateTbReq(pStmt->pCreateTblReq);
+   taosMemoryFreeClear(pStmt->pCreateTblReq);
+ }
pStmt->pCreateTblReq = taosMemoryCalloc(1, sizeof(SVCreateTbReq));
if (NULL == pStmt->pCreateTblReq) {
return TSDB_CODE_OUT_OF_MEMORY;
@ -1992,7 +1997,7 @@ static int32_t parseCsvFile(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStmt
(*pNumOfRows)++;
}
- if (TSDB_CODE_SUCCESS == code && (*pNumOfRows) > tsMaxInsertBatchRows) {
+ if (TSDB_CODE_SUCCESS == code && (*pNumOfRows) >= tsMaxInsertBatchRows) {
pStmt->fileProcessing = true;
break;
}
@ -2003,7 +2008,7 @@ static int32_t parseCsvFile(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pStmt
parserDebug("0x%" PRIx64 " %d rows have been parsed", pCxt->pComCxt->requestId, *pNumOfRows);
- if (TSDB_CODE_SUCCESS == code && 0 == (*pNumOfRows) &&
+ if (TSDB_CODE_SUCCESS == code && 0 == (*pNumOfRows) && 0 == pStmt->totalRowsNum &&
(!TSDB_QUERY_HAS_TYPE(pStmt->insertType, TSDB_QUERY_TYPE_STMT_INSERT)) && !pStmt->fileProcessing) {
code = buildSyntaxErrMsg(&pCxt->msg, "no any data points", NULL);
}
@ -2022,7 +2027,22 @@ static int32_t parseDataFromFileImpl(SInsertParseContext* pCxt, SVnodeModifyOpSt
} else {
parserDebug("0x%" PRIx64 " insert from csv. File is too large, do it in batches.", pCxt->pComCxt->requestId);
}
+ if (pStmt->insertType != TSDB_QUERY_TYPE_FILE_INSERT) {
+   return buildSyntaxErrMsg(&pCxt->msg, "keyword VALUES or FILE is exclusive", NULL);
+ }
}
+ // only record the pTableCxt whose data comes from a file
+ if (numOfRows > 0) {
+   if (NULL == pStmt->pTableCxtHashObj) {
+     pStmt->pTableCxtHashObj =
+         taosHashInit(128, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_NO_LOCK);
+   }
+   void* pData = rowsDataCxt.pTableDataCxt;
+   taosHashPut(pStmt->pTableCxtHashObj, &pStmt->pTableMeta->uid, sizeof(pStmt->pTableMeta->uid), &pData,
+               POINTER_BYTES);
+ }
return code;
}
@ -2061,6 +2081,9 @@ static int32_t parseDataClause(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pS
NEXT_TOKEN(pStmt->pSql, token);
switch (token.type) {
case TK_VALUES:
+ if (TSDB_QUERY_HAS_TYPE(pStmt->insertType, TSDB_QUERY_TYPE_FILE_INSERT)) {
+   return buildSyntaxErrMsg(&pCxt->msg, "keyword VALUES or FILE is exclusive", token.z);
+ }
return parseValuesClause(pCxt, pStmt, rowsDataCxt, &token);
case TK_FILE:
return parseFileClause(pCxt, pStmt, rowsDataCxt, &token);
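With this hunk and its counterpart in parseDataFromFileImpl, VALUES and FILE may no longer be mixed in a single INSERT statement. A sketch of the now-rejected shape, using the tables from the test_mix case added below (the CSV path is illustrative):

-- Fails with: keyword VALUES or FILE is exclusive
INSERT INTO d001 FILE '/tmp/2k.csv'
            d002 VALUES ('2021-07-13 14:06:34.630', 10.2, 219, 0.32);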
@ -2275,8 +2298,25 @@ static int32_t parseInsertBodyBottom(SInsertParseContext* pCxt, SVnodeModifyOpSt
return setStmtInfo(pCxt, pStmt);
}
+ // release the old array allocated by merge
+ pStmt->freeArrayFunc(pStmt->pVgDataBlocks);
+ pStmt->pVgDataBlocks = NULL;
+ bool fileOnly = (pStmt->insertType == TSDB_QUERY_TYPE_FILE_INSERT);
+ if (fileOnly) {
+   // no data, skip merge & build vgdata
+   if (0 == taosHashGetSize(pStmt->pTableCxtHashObj)) {
+     pCxt->needRequest = false;
+     return TSDB_CODE_SUCCESS;
+   }
+ }
// merge according to vgId
- int32_t code = insMergeTableDataCxt(pStmt->pTableBlockHashObj, &pStmt->pVgDataBlocks);
+ int32_t code = insMergeTableDataCxt(fileOnly ? pStmt->pTableCxtHashObj : pStmt->pTableBlockHashObj,
+                                     &pStmt->pVgDataBlocks, pStmt->fileProcessing);
+ // clear the temporary hash object only
+ taosHashClear(pStmt->pTableCxtHashObj);
if (TSDB_CODE_SUCCESS == code) {
code = insBuildVgDataBlocks(pStmt->pVgroupsHashObj, pStmt->pVgDataBlocks, &pStmt->pDataBlocks);
}
@ -2718,6 +2758,7 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal
.msg = {.buf = pCxt->pMsg, .len = pCxt->msgLen},
.missCache = false,
.usingDuplicateTable = false,
+ .needRequest = true,
.forceUpdate = (NULL != pCatalogReq ? pCatalogReq->forceUpdate : false)};
int32_t code = initInsertQuery(&context, pCatalogReq, pMetaData, pQuery);
@ -2732,5 +2773,10 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal
code = setRefreshMeta(*pQuery);
}
insDestroyBoundColInfo(&context.tags);
+ // if there is no data to insert, set empty mode to avoid requesting the server
+ if (!context.needRequest) {
+   (*pQuery)->execMode = QUERY_EXEC_MODE_EMPTY_RESULT;
+ }
return code;
}
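Together with the totalRowsNum check in parseCsvFile above, this covers the trailing batch of a batched file import: if a FILE-only statement stages no rows while earlier batches already inserted data, the query is completed locally as an empty result instead of sending an empty submit to the server, and "no any data points" is raised only when the whole statement produced zero rows. A hedged illustration of the user-visible behavior, inferred from the diff rather than verified against a server:

-- c.csv exceeds tsMaxInsertBatchRows rows and is parsed in batches; a final
-- pass that stages no further rows now completes locally as
-- QUERY_EXEC_MODE_EMPTY_RESULT instead of issuing an empty server request.
INSERT INTO d2002 FILE '/tmp/c.csv';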

View File

@ -58,7 +58,7 @@ int32_t qBuildStmtOutput(SQuery* pQuery, SHashObj* pVgHash, SHashObj* pBlockHash
// merge according to vgId
if (taosHashGetSize(pBlockHash) > 0) {
-   code = insMergeTableDataCxt(pBlockHash, &pVgDataBlocks);
+   code = insMergeTableDataCxt(pBlockHash, &pVgDataBlocks, true);
}
if (TSDB_CODE_SUCCESS == code) {
code = insBuildVgDataBlocks(pVgHash, pVgDataBlocks, &pStmt->pDataBlocks);

View File

@ -289,6 +289,14 @@ static int32_t rebuildTableData(SSubmitTbData* pSrc, SSubmitTbData** pDst) {
pTmp->uid = pSrc->uid;
pTmp->sver = pSrc->sver;
pTmp->pCreateTbReq = NULL;
+ if (pTmp->flags & SUBMIT_REQ_AUTO_CREATE_TABLE) {
+   if (pSrc->pCreateTbReq) {
+     cloneSVreateTbReq(pSrc->pCreateTbReq, &pTmp->pCreateTbReq);
+   } else {
+     pTmp->flags &= ~SUBMIT_REQ_AUTO_CREATE_TABLE;
+   }
+ }
if (pTmp->flags & SUBMIT_REQ_COLUMN_DATA_FORMAT) {
pTmp->aCol = taosArrayInit(128, sizeof(SColData));
if (NULL == pTmp->aCol) {
@ -416,15 +424,21 @@ void insDestroyTableDataCxtHashMap(SHashObj* pTableCxtHash) {
taosHashCleanup(pTableCxtHash);
}
- static int32_t fillVgroupDataCxt(STableDataCxt* pTableCxt, SVgroupDataCxt* pVgCxt) {
+ static int32_t fillVgroupDataCxt(STableDataCxt* pTableCxt, SVgroupDataCxt* pVgCxt, bool isRebuild) {
if (NULL == pVgCxt->pData->aSubmitTbData) {
pVgCxt->pData->aSubmitTbData = taosArrayInit(128, sizeof(SSubmitTbData));
if (NULL == pVgCxt->pData->aSubmitTbData) {
return TSDB_CODE_OUT_OF_MEMORY;
}
}
// push data to submit, rebuild empty data for next submit
taosArrayPush(pVgCxt->pData->aSubmitTbData, pTableCxt->pData);
- rebuildTableData(pTableCxt->pData, &pTableCxt->pData);
+ if (isRebuild) {
+   rebuildTableData(pTableCxt->pData, &pTableCxt->pData);
+ } else {
+   taosMemoryFreeClear(pTableCxt->pData);
+ }
qDebug("add tableDataCxt uid:%" PRId64 " to vgId:%d", pTableCxt->pMeta->uid, pVgCxt->vgId);
@ -467,7 +481,7 @@ int insColDataComp(const void* lp, const void* rp) {
return 0;
}
- int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
+ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks, bool isRebuild) {
SHashObj* pVgroupHash = taosHashInit(128, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, false);
SArray* pVgroupList = taosArrayInit(8, POINTER_BYTES);
if (NULL == pVgroupHash || NULL == pVgroupList) {
@ -502,6 +516,13 @@ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
tColDataSortMerge(pTableCxt->pData->aCol);
} else {
+ // skip tables that have no data to insert
+ // e.g. importing a csv without valid data
+ // if (0 == taosArrayGetSize(pTableCxt->pData->aRowP)) {
+ //   qWarn("no row in tableDataCxt uid:%" PRId64 " ", pTableCxt->pMeta->uid);
+ //   p = taosHashIterate(pTableHash, p);
+ //   continue;
+ // }
if (!pTableCxt->ordered) {
code = tRowSort(pTableCxt->pData->aRowP);
}
@ -520,7 +541,7 @@ int32_t insMergeTableDataCxt(SHashObj* pTableHash, SArray** pVgDataBlocks) {
pVgCxt = *(SVgroupDataCxt**)pp;
}
if (TSDB_CODE_SUCCESS == code) {
-     code = fillVgroupDataCxt(pTableCxt, pVgCxt);
+     code = fillVgroupDataCxt(pTableCxt, pVgCxt, isRebuild);
}
}
if (TSDB_CODE_SUCCESS == code) {

View File

@ -300,6 +300,7 @@ e
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/precisionUS.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/precisionNS.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_ts4219.py
+ ,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/ts-4272.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_ts4295.py
,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/test_td27388.py
,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/show.py

View File

@ -0,0 +1,158 @@
import csv
import os
from datetime import datetime

import taos
from util.log import *
from util.sql import *
from util.cases import *
from util.dnodes import *
from util.common import *


class TDTestCase:
    def init(self, conn, logSql, replicaVar=1):
        self.replicaVar = int(replicaVar)
        self.testcasePath = os.path.split(__file__)[0]
        self.testcaseFilename = os.path.split(__file__)[-1]
        self.ts = 1700638570000  # 2023-11-22T07:36:10.000Z
        self.tb1 = 'd001'
        self.tb2 = 'd002'
        self.tag1 = 'using meters(groupId) tags(1)'
        self.tag2 = 'using meters(groupId) tags(2)'
        self.file1 = f"{self.testcasePath}/b.csv"
        self.file2 = f"{self.testcasePath}/c.csv"
        os.system("rm -rf %s/b.csv" % self.testcasePath)
        tdLog.debug(f"start to execute {__file__}")
        tdSql.init(conn.cursor(), logSql)

    def check_count(self, rows, records):
        tdSql.execute("use d1;")
        tdSql.query("select tbname,count(*) from meters group by tbname order by tbname;")
        tdSql.checkRows(rows)
        for i in range(rows):
            tdSql.checkData(i, 1, records[i])

    def reset_tb(self):
        # create database and tables
        # os.system("taos -s 'drop database if exists d1;'")
        # os.system("taos -s 'create database d1;use d1;CREATE STABLE meters (ts timestamp, current float, voltage int, phase float) TAGS (location binary(64), groupId int);'")
        # os.system(f"taos -s 'use d1;Create table d2001 using meters(groupId) tags(5);'")
        # res = os.system(f"taos -s 'use d1;Create table d2002 using meters(groupId) tags(6);'")
        # if 0 != res:
        #     tdLog.exit("create tb error")
        tdSql.execute("drop database if exists d1;")
        tdSql.execute("create database d1;")
        tdSql.execute("use d1;")
        tdSql.execute("CREATE STABLE meters (ts timestamp, current float, voltage int, phase float) TAGS (location binary(64), groupId int);")
        tdSql.execute("Create table d2001 using meters(groupId) tags(5);")
        tdSql.execute("Create table d2002 using meters(groupId) tags(6);")

    def test(self, sql):
        sql = "use d1;" + sql
        res = os.system(f'taos -s "{sql}"')
        # if 0 != res:
        #     tdLog.exit("taos sql error")

    def check(self):
        # same table, auto create + create
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb1} {self.tag1} file '{self.file2}';"
        self.test(sql)
        # same table, create + insert
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb1} file '{self.file2}';"
        self.test(sql)
        # same table, insert + create
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb1} {self.tag1} file '{self.file2}';"
        self.test(sql)
        # same table, insert + insert
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb1} file '{self.file2}';"
        self.test(sql)
        # diff table, auto create + create
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb2} {self.tag2} file '{self.file2}';"
        self.test(sql)
        # diff table, create + insert
        sql = f"INSERT INTO {self.tb1} {self.tag1} file '{self.file1}' {self.tb2} file '{self.file2}';"
        self.test(sql)
        # diff table, insert + create
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb2} {self.tag2} file '{self.file2}';"
        self.test(sql)
        # diff table, insert + insert
        sql = f"INSERT INTO {self.tb1} file '{self.file1}' {self.tb2} file '{self.file2}';"
        self.test(sql)
        # bigNum = 1010000
        # self.check_count(5, [2100, 2100, bigNum, bigNum, bigNum])
        result = os.popen("taos -s 'select count(*) from d1.%s'" % self.tb1)
        res = result.read()
        if "OK" in res:
            tdLog.info("check count success")

    def make_csv(self, filepath, once, qtime, startts):
        with open(filepath, 'w') as f:
            writer = csv.writer(f)
            for j in range(qtime):
                ts = startts + j * once
                rows = []
                for i in range(once):
                    rows.append([ts + i, 0.3 + (i % 10) / 100.0, 210 + i % 10, 10.0 + (i % 20) / 20.0])
                writer.writerows(rows)
        print(datetime.now(), filepath, " ready!")

    def test_mix(self):
        # using both VALUES and FILE in one insert statement is forbidden
        # (run through the taos shell; the original popen of bare SQL could not execute it)
        sql = f"insert into {self.tb1} file '{self.testcasePath}/csv/2k.csv' {self.tb2} values('2021-07-13 14:06:34.630', 10.2, 219, 0.32);"
        result = os.popen(f'taos -s "use d1;{sql}"')
        res = result.read()
        if "error" in res:
            tdLog.info("forbid success")

    def test_bigcsv(self):
        # prepare csv
        print("start csv data prepare")
        once = 10000
        qtime1 = 101
        qtime2 = 100
        rowNum1 = qtime1 * once
        rowNum2 = qtime2 * once
        self.make_csv(self.file1, once, qtime1, self.ts - 86400000)
        self.make_csv(self.file2, once, qtime2, self.ts)
        print("end csv data prepare")

        # auto create + insert
        sql = f"INSERT INTO d2001 using meters(groupId) tags(5) FILE '{self.file1}';"
        self.test(sql)
        # only insert
        sql = f"INSERT INTO d2002 FILE '{self.file2}';"
        self.test(sql)

        # tdSql.execute("use d1;")
        tdSql.query("select tbname,count(*) from meters group by tbname order by tbname;")
        tdSql.checkRows(2)
        tdSql.checkData(0, 1, rowNum1)
        tdSql.checkData(1, 1, rowNum2)

    def run(self):
        tdSql.prepare()
        self.reset_tb()
        self.test_bigcsv()
        self.test_mix()
        self.check()

    def stop(self):
        tdSql.close()
        tdLog.success(f"{__file__} successfully executed")


tdCases.addLinux(__file__, TDTestCase())
tdCases.addWindows(__file__, TDTestCase())