diff --git a/include/common/tglobal.h b/include/common/tglobal.h index d445fc26e8..cd9667843b 100644 --- a/include/common/tglobal.h +++ b/include/common/tglobal.h @@ -82,6 +82,10 @@ extern bool tsEnableTelem; extern int32_t tsTelemInterval; extern char tsTelemServer[]; extern uint16_t tsTelemPort; +extern bool tsEnableCrashReport; +extern char* tsTelemUri; +extern char* tsClientCrashReportUri; +extern char* tsSvrCrashReportUri; // query buffer management extern int32_t tsQueryBufferSize; // maximum allowed usage buffer size in MB for each data node during query processing diff --git a/include/libs/transport/thttp.h b/include/libs/transport/thttp.h index 7d8c588bfc..9a6aee4187 100644 --- a/include/libs/transport/thttp.h +++ b/include/libs/transport/thttp.h @@ -24,7 +24,7 @@ extern "C" { typedef enum { HTTP_GZIP, HTTP_FLAT } EHttpCompFlag; -int32_t taosSendHttpReport(const char* server, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag); +int32_t taosSendHttpReport(const char* server, const char* uri, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag); #ifdef __cplusplus } diff --git a/include/os/osSystem.h b/include/os/osSystem.h index 58f34d26f0..5154c56e4b 100644 --- a/include/os/osSystem.h +++ b/include/os/osSystem.h @@ -46,27 +46,73 @@ void taosSetTerminalMode(); int32_t taosGetOldTerminalMode(); void taosResetTerminalMode(); +#define STACKSIZE 100 + #if !defined(WINDOWS) -#define taosPrintTrace(flags, level, dflag) \ - { \ - void* array[100]; \ - int32_t size = backtrace(array, 100); \ - char** strings = backtrace_symbols(array, size); \ - if (strings != NULL) { \ - taosPrintLog(flags, level, dflag, "obtained %d stack frames", size); \ - for (int32_t i = 0; i < size; i++) { \ - taosPrintLog(flags, level, dflag, "frame:%d, %s", i, strings[i]); \ - } \ - } \ - \ - taosMemoryFree(strings); \ +#define taosLogTraceToBuf(buf, bufSize, ignoreNum) { \ + void* array[STACKSIZE]; \ + int32_t size = backtrace(array, STACKSIZE); \ + char** 
strings = backtrace_symbols(array, size); \ + int32_t offset = 0; \ + if (strings != NULL) { \ + offset = snprintf(buf, bufSize - 1, "obtained %d stack frames\n", (ignoreNum > 0) ? size - ignoreNum : size); \ + for (int32_t i = (ignoreNum > 0) ? ignoreNum : 0; i < size && offset >= 0 && offset < (int32_t)(bufSize) - 1; i++) { /* stop once buf is full: snprintf returns the untruncated length */ \ + offset += snprintf(buf + offset, bufSize - 1 - offset, "frame:%d, %s\n", (ignoreNum > 0) ? i - ignoreNum : i, strings[i]); \ + } \ + } \ + \ + taosMemoryFree(strings); \ +} + +#define taosPrintTrace(flags, level, dflag, ignoreNum) \ + { \ + void* array[STACKSIZE]; \ + int32_t size = backtrace(array, STACKSIZE); \ + char** strings = backtrace_symbols(array, size); \ + if (strings != NULL) { \ + taosPrintLog(flags, level, dflag, "obtained %d stack frames", (ignoreNum > 0) ? size - ignoreNum : size); \ + for (int32_t i = (ignoreNum > 0) ? ignoreNum : 0; i < size; i++) { \ + taosPrintLog(flags, level, dflag, "frame:%d, %s", (ignoreNum > 0) ? i - ignoreNum : i, strings[i]); \ + } \ + } \ + \ + taosMemoryFree(strings); \ } #else + #include #include -#define STACKSIZE 64 -#define taosPrintTrace(flags, level, dflag) \ +/* Windows variant: formats up to STACKSIZE captured frames into buf, skipping the first ignoreNum frames when ignoreNum > 0; buf is left untouched when no frame is captured or the symbol buffer cannot be allocated. */ +#define taosLogTraceToBuf(buf, bufSize, ignoreNum) { \ + unsigned int i; \ + void* stack[STACKSIZE]; \ + unsigned short frames; \ + SYMBOL_INFO* symbol; \ + HANDLE process; \ + int32_t offset = 0; \ + \ + process = GetCurrentProcess(); \ + \ + SymInitialize(process, NULL, TRUE); \ + \ + frames = CaptureStackBackTrace(0, STACKSIZE, stack, NULL); \ + symbol = (SYMBOL_INFO*)calloc(sizeof(SYMBOL_INFO) + 256 * sizeof(char), 1); \ + if (symbol != NULL) { \ + symbol->MaxNameLen = 255; \ + symbol->SizeOfStruct = sizeof(SYMBOL_INFO); \ + \ + if (frames > 0) { \ + offset = snprintf(buf, bufSize - 1, "obtained %d stack frames\n", (ignoreNum > 0) ? frames - ignoreNum : frames); \ + for (i = (ignoreNum > 0) ? 
ignoreNum : 0; i < frames && offset >= 0 && offset < (int32_t)(bufSize) - 1; i++) { /* stop once buf is full: snprintf returns the untruncated length */ \ + SymFromAddr(process, (DWORD64)(stack[i]), 0, symbol); \ + offset += snprintf(buf + offset, bufSize - 1 - offset, "frame:%i, %s - 0x%0X\n", (ignoreNum > 0) ? i - ignoreNum : i, symbol->Name, symbol->Address); \ + } \ + } \ + free(symbol); \ + } \ + } + +#define taosPrintTrace(flags, level, dflag, ignoreNum) \ { \ unsigned int i; \ void* stack[STACKSIZE]; \ @@ -85,10 +131,10 @@ void taosResetTerminalMode(); symbol->SizeOfStruct = sizeof(SYMBOL_INFO); \ \ if (frames > 0) { \ - taosPrintLog(flags, level, dflag, "obtained %d stack frames", frames); \ - for (i = 0; i < frames; i++) { \ + taosPrintLog(flags, level, dflag, "obtained %d stack frames\n", (ignoreNum > 0) ? frames - ignoreNum : frames); \ + for (i = (ignoreNum > 0) ? ignoreNum : 0; i < frames; i++) { \ SymFromAddr(process, (DWORD64)(stack[i]), 0, symbol); \ - taosPrintLog(flags, level, dflag, "frame:%i: %s - 0x%0X", frames - i - 1, symbol->Name, symbol->Address); \ + taosPrintLog(flags, level, dflag, "frame:%i, %s - 0x%0X\n", (ignoreNum > 0) ? i - ignoreNum : i, symbol->Name, symbol->Address); \ } \ } \ free(symbol); \ diff --git a/include/util/tlog.h b/include/util/tlog.h index 6e9b304e1d..808377fa77 100644 --- a/include/util/tlog.h +++ b/include/util/tlog.h @@ -99,6 +99,11 @@ bool taosAssertRelease(bool condition); #endif #endif +void taosLogCrashInfo(char* nodeType, char* pMsg, int64_t msgLen, int signum, void *sigInfo); +void taosReadCrashInfo(char* filepath, char** pMsg, int64_t* pMsgLen, TdFilePtr* pFd); +void taosReleaseCrashLogFile(TdFilePtr pFile, bool truncateFile); +int32_t taosGenCrashJsonMsg(int signum, char** pMsg, int64_t clusterId, int64_t startTime); + // clang-format off #define uFatal(...) { if (uDebugFlag & DEBUG_FATAL) { taosPrintLog("UTL FATAL", DEBUG_FATAL, tsLogEmbedded ? 255 : uDebugFlag, __VA_ARGS__); }} #define uError(...) 
{ if (uDebugFlag & DEBUG_ERROR) { taosPrintLog("UTL ERROR ", DEBUG_ERROR, tsLogEmbedded ? 
255 : uDebugFlag, __VA_ARGS__); }} diff --git a/packaging/cfg/taos.cfg b/packaging/cfg/taos.cfg index e22aa85c97..3d3dfc8e73 100644 --- a/packaging/cfg/taos.cfg +++ b/packaging/cfg/taos.cfg @@ -43,6 +43,9 @@ # Switch for allowing TDengine to collect and report service usage information # telemetryReporting 1 +# Switch for allowing TDengine to collect and report crash information +# crashReporting 1 + # The maximum number of vnodes supported by this dnode # supportVnodes 0 diff --git a/source/client/inc/clientInt.h b/source/client/inc/clientInt.h index ea76f726ea..903a6a22ca 100644 --- a/source/client/inc/clientInt.h +++ b/source/client/inc/clientInt.h @@ -313,6 +313,8 @@ extern SAppInfo appInfo; extern int32_t clientReqRefPool; extern int32_t clientConnRefPool; extern int32_t timestampDeltaLimit; +extern int64_t lastClusterId; + __async_send_cb_fn_t getMsgRspHandle(int32_t msgType); @@ -340,6 +342,7 @@ void resetConnectDB(STscObj* pTscObj); int taos_options_imp(TSDB_OPTION option, const char* str); void* openTransporter(const char* user, const char* auth, int32_t numOfThreads); +void tscStopCrashReport(); typedef struct AsyncArg { SRpcMsg msg; diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index 64e1fd908a..2ecade58f9 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -28,13 +28,16 @@ #include "trpc.h" #include "tsched.h" #include "ttime.h" +#include "thttp.h" #define TSC_VAR_NOT_RELEASE 1 #define TSC_VAR_RELEASED 0 SAppInfo appInfo; +int64_t lastClusterId = 0; int32_t clientReqRefPool = -1; int32_t clientConnRefPool = -1; +int32_t clientStop = 0; int32_t timestampDeltaLimit = 900; // s @@ -385,6 +388,146 @@ void destroyRequest(SRequestObj *pRequest) { removeRequest(pRequest->self); } +void taosClientCrash(int signum, void *sigInfo, void *context) { + taosIgnSignal(SIGTERM); + taosIgnSignal(SIGHUP); + taosIgnSignal(SIGINT); + taosIgnSignal(SIGBREAK); + +#if !defined(WINDOWS) + taosIgnSignal(SIGBUS); +#endif 
+ taosIgnSignal(SIGABRT); + taosIgnSignal(SIGFPE); + taosIgnSignal(SIGSEGV); + + char *pMsg = NULL; + const char *flags = "UTL FATAL "; + ELogLevel level = DEBUG_FATAL; + int32_t dflag = 255; + int64_t msgLen= -1; + + if (tsEnableCrashReport) { + if (taosGenCrashJsonMsg(signum, &pMsg, lastClusterId, appInfo.startTime)) { + taosPrintLog(flags, level, dflag, "failed to generate crash json msg"); + goto _return; + } else { + msgLen = strlen(pMsg); + } + } + +_return: + + taosLogCrashInfo("taos", pMsg, msgLen, signum, sigInfo); + + exit(signum); +} + +void crashReportThreadFuncUnexpectedStopped(void) { atomic_store_32(&clientStop, -1); } + +/* Client crash-report thread. Once per report period (reportPeriodNum * sleepTime ms = one hour, the first check being immediate) it reads the pending crash records from <tsLogDir>/.taosCrashLog and posts each one to tsTelemServer. It leaves the loop when clientStop becomes non-zero and then stores -1 in clientStop. */ +static void *tscCrashReportThreadFp(void *param) { + setThreadName("client-crashReport"); + char filepath[PATH_MAX] = {0}; + snprintf(filepath, sizeof(filepath), "%s%s.taosCrashLog", tsLogDir, TD_DIRSEP); + char *pMsg = NULL; + int64_t msgLen = 0; + TdFilePtr pFile = NULL; + bool truncateFile = false; + int32_t sleepTime = 200; + int32_t reportPeriodNum = 3600 * 1000 / sleepTime; + int32_t loopTimes = reportPeriodNum; + +#ifdef WINDOWS + if (taosCheckCurrentInDll()) { + atexit(crashReportThreadFuncUnexpectedStopped); + } +#endif + + while (1) { + if (atomic_load_32(&clientStop)) break; + if (loopTimes++ < reportPeriodNum) { + taosMsleep(sleepTime); + continue; + } + + taosReadCrashInfo(filepath, &pMsg, &msgLen, &pFile); + if (pMsg && msgLen > 0) { + if (taosSendHttpReport(tsTelemServer, tsClientCrashReportUri, tsTelemPort, pMsg, msgLen, HTTP_FLAT) != 0) { + tscError("failed to send crash report"); + if (pFile) { + /* Keep the records for the next report period: release the file without truncating it, free this copy of the message and restart the period counter so the retry does not spin. The handle is passed by value, so the local copy is cleared here. NOTE(review): assumes taosReleaseCrashLogFile closes the handle - confirm. */ + taosReleaseCrashLogFile(pFile, false); + pFile = NULL; + taosMemoryFree(pMsg); + pMsg = NULL; + loopTimes = 0; + continue; + } + } else { + tscInfo("succeed to send crash report"); + truncateFile = true; + } + } else { + tscDebug("no crash info"); + } + + taosMemoryFree(pMsg); + + /* a record was handled: go straight back for the next one in the same file */ + if (pMsg && msgLen > 0) { + pMsg = NULL; + continue; + } + + if (pFile) { + taosReleaseCrashLogFile(pFile, truncateFile); + pFile = NULL; + truncateFile = false; + } + + taosMsleep(sleepTime); + loopTimes = 0; + } + + atomic_store_32(&clientStop, -1); + return NULL; 
+} + +/* Starts the client crash-report thread. Returns 0 on success or when crashReporting is disabled, -1 when the thread cannot be created. Whenever no thread is running, clientStop is set to -1 so that tscStopCrashReport() does not wait for an acknowledgement that can never arrive. */ +int32_t tscCrashReportInit() { + if (!tsEnableCrashReport) { + atomic_store_32(&clientStop, -1); + return 0; + } + + TdThreadAttr thAttr; + taosThreadAttrInit(&thAttr); + taosThreadAttrSetDetachState(&thAttr, PTHREAD_CREATE_JOINABLE); + TdThread crashReportThread; + if (taosThreadCreate(&crashReportThread, &thAttr, tscCrashReportThreadFp, NULL) != 0) { + tscError("failed to create crashReport thread since %s", strerror(errno)); + taosThreadAttrDestroy(&thAttr); + atomic_store_32(&clientStop, -1); + return -1; + } + + taosThreadAttrDestroy(&thAttr); + return 0; +} + +/* Asks the crash-report thread to exit by moving clientStop from 0 to 1, then polls every 100 ms until clientStop is no longer positive. Returns at once when crashReporting is disabled or clientStop was already non-zero. */ +void tscStopCrashReport() { + if (!tsEnableCrashReport) { + return; + } + + if (atomic_val_compare_exchange_32(&clientStop, 0, 1)) { + tscDebug("crash report thread already stopped"); + return; + } + + while (atomic_load_32(&clientStop) > 0) { + taosMsleep(100); + } +} + +/* Routes the fatal signals (SIGBUS on non-Windows builds, SIGABRT, SIGFPE, SIGSEGV) to taosClientCrash. */ +static void tscSetSignalHandle() { +#if !defined(WINDOWS) + taosSetSignal(SIGBUS, taosClientCrash); +#endif + taosSetSignal(SIGABRT, taosClientCrash); + taosSetSignal(SIGFPE, taosClientCrash); + taosSetSignal(SIGSEGV, taosClientCrash); +} + void taos_init_imp(void) { // In the APIs of other program language, taos_cleanup is not available yet. // So, to make sure taos_cleanup will be invoked to clean up the allocated resource to suppress the valgrind warning. 
@@ -392,6 +535,10 @@ void taos_init_imp(void) { errno = TSDB_CODE_SUCCESS; taosSeedRand(taosGetTimestampSec()); + appInfo.pid = taosGetPId(); + appInfo.startTime = taosGetTimestampMs(); + appInfo.pInstMap = taosHashInit(4, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + deltaToUtcInitOnce(); if (taosCreateLog("taoslog", 10, configDir, NULL, NULL, NULL, NULL, 1) != 0) { @@ -404,6 +551,8 @@ void taos_init_imp(void) { return; } + tscSetSignalHandle(); + initQueryModuleMsgHandle(); if (taosConvInit() != 0) { @@ -433,9 +582,8 @@ void taos_init_imp(void) { taosGetAppName(appInfo.appName, NULL); taosThreadMutexInit(&appInfo.mutex, NULL); - appInfo.pid = taosGetPId(); - appInfo.startTime = taosGetTimestampMs(); - appInfo.pInstMap = taosHashInit(4, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + tscCrashReportInit(); + tscDebug("client is initialized successfully"); } diff --git a/source/client/src/clientImpl.c b/source/client/src/clientImpl.c index f36036fd0a..53acafeeaa 100644 --- a/source/client/src/clientImpl.c +++ b/source/client/src/clientImpl.c @@ -1253,7 +1253,7 @@ STscObj* taosConnectImpl(const char* user, const char* auth, const char* db, __t int64_t transporterId = 0; asyncSendMsgToServer(pTscObj->pAppInfo->pTransporter, &pTscObj->pAppInfo->mgmtEp.epSet, &transporterId, body); - + tsem_wait(&pRequest->body.rspSem); if (pRequest->code != TSDB_CODE_SUCCESS) { const char* errorMsg = diff --git a/source/client/src/clientMain.c b/source/client/src/clientMain.c index 7f79323c4c..15c1d65162 100644 --- a/source/client/src/clientMain.c +++ b/source/client/src/clientMain.c @@ -55,6 +55,8 @@ void taos_cleanup(void) { return; } + tscStopCrashReport(); + int32_t id = clientReqRefPool; clientReqRefPool = -1; taosCloseRef(id); @@ -106,7 +108,7 @@ TAOS *taos_connect(const char *ip, const char *user, const char *pass, const cha if (pass == NULL) { pass = TSDB_DEFAULT_PASS; } - + STscObj *pObj = taos_connect_internal(ip, 
user, pass, NULL, db, port, CONN_TYPE__QUERY); if (pObj) { int64_t *rid = taosMemoryCalloc(1, sizeof(int64_t)); diff --git a/source/client/src/clientMsgHandler.c b/source/client/src/clientMsgHandler.c index 85027ff371..f414c7e92f 100644 --- a/source/client/src/clientMsgHandler.c +++ b/source/client/src/clientMsgHandler.c @@ -119,6 +119,7 @@ int32_t processConnectRsp(void* param, SDataBuf* pMsg, int32_t code) { // update the appInstInfo pTscObj->pAppInfo->clusterId = connectRsp.clusterId; + lastClusterId = connectRsp.clusterId; pTscObj->connType = connectRsp.connType; diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index deefa65595..29785aa666 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -73,6 +73,11 @@ bool tsEnableTelem = true; int32_t tsTelemInterval = 43200; char tsTelemServer[TSDB_FQDN_LEN] = "telemetry.taosdata.com"; uint16_t tsTelemPort = 80; +char* tsTelemUri = "/report"; + +bool tsEnableCrashReport = true; +char* tsClientCrashReportUri = "/ccrashreport"; +char* tsSvrCrashReportUri = "/dcrashreport"; // schemaless char tsSmlTagName[TSDB_COL_NAME_LEN] = "_tag_null"; @@ -314,6 +319,7 @@ static int32_t taosAddClientCfg(SConfig *pCfg) { if (cfgAddInt32(pCfg, "maxMemUsedByInsert", tsMaxMemUsedByInsert, 1, INT32_MAX, true) != 0) return -1; if (cfgAddInt32(pCfg, "maxRetryWaitTime", tsMaxRetryWaitTime, 0, 86400000, 0) != 0) return -1; if (cfgAddBool(pCfg, "useAdapter", tsUseAdapter, true) != 0) return -1; + if (cfgAddBool(pCfg, "crashReporting", tsEnableCrashReport, true) != 0) return -1; tsNumOfTaskQueueThreads = tsNumOfCores / 2; tsNumOfTaskQueueThreads = TMAX(tsNumOfTaskQueueThreads, 4); @@ -434,6 +440,7 @@ static int32_t taosAddServerCfg(SConfig *pCfg) { if (cfgAddInt32(pCfg, "monitorMaxLogs", tsMonitorMaxLogs, 1, 1000000, 0) != 0) return -1; if (cfgAddBool(pCfg, "monitorComp", tsMonitorComp, 0) != 0) return -1; + if (cfgAddBool(pCfg, "crashReporting", tsEnableCrashReport, 0) != 0) return -1; if 
(cfgAddBool(pCfg, "telemetryReporting", tsEnableTelem, 0) != 0) return -1; if (cfgAddInt32(pCfg, "telemetryInterval", tsTelemInterval, 1, 200000, 0) != 0) return -1; if (cfgAddString(pCfg, "telemetryServer", tsTelemServer, 0) != 0) return -1; @@ -665,6 +672,7 @@ static int32_t taosSetClientCfg(SConfig *pCfg) { tsQueryUseNodeAllocator = cfgGetItem(pCfg, "queryUseNodeAllocator")->bval; tsKeepColumnName = cfgGetItem(pCfg, "keepColumnName")->bval; tsUseAdapter = cfgGetItem(pCfg, "useAdapter")->bval; + tsEnableCrashReport = cfgGetItem(pCfg, "crashReporting")->bval; tsMaxRetryWaitTime = cfgGetItem(pCfg, "maxRetryWaitTime")->i32; return 0; @@ -726,6 +734,7 @@ static int32_t taosSetServerCfg(SConfig *pCfg) { tsQueryRspPolicy = cfgGetItem(pCfg, "queryRspPolicy")->i32; tsEnableTelem = cfgGetItem(pCfg, "telemetryReporting")->bval; + tsEnableCrashReport = cfgGetItem(pCfg, "crashReporting")->bval; tsTelemInterval = cfgGetItem(pCfg, "telemetryInterval")->i32; tstrncpy(tsTelemServer, cfgGetItem(pCfg, "telemetryServer")->str, TSDB_FQDN_LEN); tsTelemPort = (uint16_t)cfgGetItem(pCfg, "telemetryPort")->i32; @@ -795,6 +804,8 @@ int32_t taosSetCfg(SConfig *pCfg, char *name) { tsCountAlwaysReturnValue = cfgGetItem(pCfg, "countAlwaysReturnValue")->i32; } else if (strcasecmp("cDebugFlag", name) == 0) { cDebugFlag = cfgGetItem(pCfg, "cDebugFlag")->i32; + } else if (strcasecmp("crashReporting", name) == 0) { + tsEnableCrashReport = cfgGetItem(pCfg, "crashReporting")->bval; } break; } diff --git a/source/dnode/mgmt/exe/dmMain.c b/source/dnode/mgmt/exe/dmMain.c index d308d3e618..711280ea58 100644 --- a/source/dnode/mgmt/exe/dmMain.c +++ b/source/dnode/mgmt/exe/dmMain.c @@ -44,6 +44,7 @@ static struct { char apolloUrl[PATH_MAX]; const char **envCmd; SArray *pArgs; // SConfigPair + int64_t startTime; } global = {0}; static void dmSetDebugFlag(int32_t signum, void *sigInfo, void *context) { taosSetAllDebugFlag(143, true); } @@ -67,23 +68,67 @@ static void dmStopDnode(int signum, void *sigInfo, 
void *context) { dmStop(); } +void dmLogCrash(int signum, void *sigInfo, void *context) { + taosIgnSignal(SIGTERM); + taosIgnSignal(SIGHUP); + taosIgnSignal(SIGINT); + taosIgnSignal(SIGBREAK); + +#ifndef WINDOWS + taosIgnSignal(SIGBUS); +#endif + taosIgnSignal(SIGABRT); + taosIgnSignal(SIGFPE); + taosIgnSignal(SIGSEGV); + + char *pMsg = NULL; + const char *flags = "UTL FATAL "; + ELogLevel level = DEBUG_FATAL; + int32_t dflag = 255; + int64_t msgLen= -1; + + if (tsEnableCrashReport) { + if (taosGenCrashJsonMsg(signum, &pMsg, dmGetClusterId(), global.startTime)) { + taosPrintLog(flags, level, dflag, "failed to generate crash json msg"); + goto _return; + } else { + msgLen = strlen(pMsg); + } + } + +_return: + + taosLogCrashInfo("taosd", pMsg, msgLen, signum, sigInfo); + + exit(signum); +} + static void dmSetSignalHandle() { taosSetSignal(SIGUSR1, dmSetDebugFlag); taosSetSignal(SIGUSR2, dmSetAssert); taosSetSignal(SIGTERM, dmStopDnode); taosSetSignal(SIGHUP, dmStopDnode); taosSetSignal(SIGINT, dmStopDnode); - taosSetSignal(SIGABRT, dmStopDnode); taosSetSignal(SIGBREAK, dmStopDnode); #ifndef WINDOWS taosSetSignal(SIGTSTP, dmStopDnode); taosSetSignal(SIGQUIT, dmStopDnode); #endif + +#ifndef WINDOWS + taosSetSignal(SIGBUS, dmLogCrash); +#endif + taosSetSignal(SIGABRT, dmLogCrash); + taosSetSignal(SIGFPE, dmLogCrash); + taosSetSignal(SIGSEGV, dmLogCrash); } static int32_t dmParseArgs(int32_t argc, char const *argv[]) { + global.startTime = taosGetTimestampMs(); + int32_t cmdEnvIndex = 0; if (argc < 2) return 0; + global.envCmd = taosMemoryMalloc((argc - 1) * sizeof(char *)); memset(global.envCmd, 0, (argc - 1) * sizeof(char *)); for (int32_t i = 1; i < argc; ++i) { diff --git a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h index c776beb3f0..ff32cbcb08 100644 --- a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h +++ b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h @@ -29,6 +29,7 @@ typedef struct SDnodeMgmt { const char *name; TdThread 
statusThread; TdThread monitorThread; + TdThread crashReportThread; SSingleWorker mgmtWorker; ProcessCreateNodeFp processCreateNodeFp; ProcessDropNodeFp processDropNodeFp; @@ -55,6 +56,8 @@ int32_t dmStartStatusThread(SDnodeMgmt *pMgmt); void dmStopStatusThread(SDnodeMgmt *pMgmt); int32_t dmStartMonitorThread(SDnodeMgmt *pMgmt); void dmStopMonitorThread(SDnodeMgmt *pMgmt); +int32_t dmStartCrashReportThread(SDnodeMgmt *pMgmt); +void dmStopCrashReportThread(SDnodeMgmt *pMgmt); int32_t dmStartWorker(SDnodeMgmt *pMgmt); void dmStopWorker(SDnodeMgmt *pMgmt); diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c index d2db1a4a62..51df293ba7 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c @@ -23,6 +23,9 @@ static int32_t dmStartMgmt(SDnodeMgmt *pMgmt) { if (dmStartMonitorThread(pMgmt) != 0) { return -1; } + if (dmStartCrashReportThread(pMgmt) != 0) { + return -1; + } return 0; } @@ -30,6 +33,7 @@ static void dmStopMgmt(SDnodeMgmt *pMgmt) { pMgmt->pData->stopped = true; dmStopMonitorThread(pMgmt); dmStopStatusThread(pMgmt); + dmStopCrashReportThread(pMgmt); } static int32_t dmOpenMgmt(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c index 80c040a5e8..76c8e09b70 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c @@ -15,6 +15,7 @@ #define _DEFAULT_SOURCE #include "dmInt.h" +#include "thttp.h" static void *dmStatusThreadFp(void *param) { SDnodeMgmt *pMgmt = param; @@ -63,6 +64,63 @@ static void *dmMonitorThreadFp(void *param) { return NULL; } +static void *dmCrashReportThreadFp(void *param) { + SDnodeMgmt *pMgmt = param; + int64_t lastTime = taosGetTimestampMs(); + setThreadName("dnode-crashReport"); + char filepath[PATH_MAX] = {0}; + snprintf(filepath, sizeof(filepath), "%s%s.taosdCrashLog", tsLogDir, TD_DIRSEP); + char 
*pMsg = NULL; + int64_t msgLen = 0; + TdFilePtr pFile = NULL; + bool truncateFile = false; + int32_t sleepTime = 200; + int32_t reportPeriodNum = 3600 * 1000 / sleepTime; + int32_t loopTimes = reportPeriodNum; + + /* one pass per report period (reportPeriodNum * sleepTime ms = one hour); the first pass runs immediately */ + while (1) { + if (pMgmt->pData->dropped || pMgmt->pData->stopped) break; + if (loopTimes++ < reportPeriodNum) { + taosMsleep(sleepTime); + continue; + } + + taosReadCrashInfo(filepath, &pMsg, &msgLen, &pFile); + if (pMsg && msgLen > 0) { + if (taosSendHttpReport(tsTelemServer, tsSvrCrashReportUri, tsTelemPort, pMsg, msgLen, HTTP_FLAT) != 0) { + dError("failed to send crash report"); + if (pFile) { + /* Keep the records for the next report period: release the file without truncating it, free this copy of the message and restart the period counter so the retry does not spin. The handle is passed by value, so the local copy is cleared here. NOTE(review): assumes taosReleaseCrashLogFile closes the handle - confirm. */ + taosReleaseCrashLogFile(pFile, false); + pFile = NULL; + taosMemoryFree(pMsg); + pMsg = NULL; + loopTimes = 0; + continue; + } + } else { + dInfo("succeed to send crash report"); + truncateFile = true; + } + } else { + dDebug("no crash info"); + } + + taosMemoryFree(pMsg); + + /* a record was handled: go straight back for the next one in the same file */ + if (pMsg && msgLen > 0) { + pMsg = NULL; + continue; + } + + if (pFile) { + taosReleaseCrashLogFile(pFile, truncateFile); + pFile = NULL; + truncateFile = false; + } + + taosMsleep(sleepTime); + loopTimes = 0; + } + + return NULL; +} + + int32_t dmStartStatusThread(SDnodeMgmt *pMgmt) { TdThreadAttr thAttr; taosThreadAttrInit(&thAttr); @@ -105,6 +163,36 @@ void dmStopMonitorThread(SDnodeMgmt *pMgmt) { } } +/* Starts the dnode crash-report thread and stores its handle in pMgmt->crashReportThread. Returns 0 on success or when crashReporting is disabled, -1 when the thread cannot be created. */ +int32_t dmStartCrashReportThread(SDnodeMgmt *pMgmt) { + if (!tsEnableCrashReport) { + return 0; + } + + TdThreadAttr thAttr; + taosThreadAttrInit(&thAttr); + taosThreadAttrSetDetachState(&thAttr, PTHREAD_CREATE_JOINABLE); + if (taosThreadCreate(&pMgmt->crashReportThread, &thAttr, dmCrashReportThreadFp, pMgmt) != 0) { + dError("failed to create crashReport thread since %s", strerror(errno)); + taosThreadAttrDestroy(&thAttr); + return -1; + } + + taosThreadAttrDestroy(&thAttr); + tmsgReportStartup("dnode-crashReport", "initialized"); + return 0; +} + +/* Joins the crash-report thread when crashReporting is enabled and the stored handle is valid, then clears the handle. */ +void dmStopCrashReportThread(SDnodeMgmt *pMgmt) { + if (!tsEnableCrashReport) { + return; + } + + if (taosCheckPthreadValid(pMgmt->crashReportThread)) { + taosThreadJoin(pMgmt->crashReportThread, NULL); + taosThreadClear(&pMgmt->crashReportThread); + } +} + + 
static void dmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) { SDnodeMgmt *pMgmt = pInfo->ahandle; int32_t code = -1; diff --git a/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h b/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h index 7e85e6b722..02cd678433 100644 --- a/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h +++ b/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h @@ -85,6 +85,7 @@ typedef struct SDnode { // dmEnv.c SDnode *dmInstance(); void dmReportStartup(const char *pName, const char *pDesc); +int64_t dmGetClusterId(); // dmMgmt.c int32_t dmInitDnode(SDnode *pDnode); diff --git a/source/dnode/mgmt/node_mgmt/src/dmEnv.c b/source/dnode/mgmt/node_mgmt/src/dmEnv.c index e3bda5a3f0..1d0236c0c5 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmEnv.c +++ b/source/dnode/mgmt/node_mgmt/src/dmEnv.c @@ -268,3 +268,8 @@ void dmReportStartup(const char *pName, const char *pDesc) { tstrncpy(pStartup->desc, pDesc, TSDB_STEP_DESC_LEN); dDebug("step:%s, %s", pStartup->name, pStartup->desc); } + +int64_t dmGetClusterId() { + return global.data.clusterId; +} + diff --git a/source/dnode/mgmt/node_mgmt/src/dmNodes.c b/source/dnode/mgmt/node_mgmt/src/dmNodes.c index 981797834a..08330e025f 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmNodes.c +++ b/source/dnode/mgmt/node_mgmt/src/dmNodes.c @@ -111,6 +111,7 @@ static int32_t dmStartNodes(SDnode *pDnode) { dInfo("TDengine initialized successfully"); dmReportStartup("TDengine", "initialized successfully"); + return 0; } diff --git a/source/dnode/mnode/impl/src/mndTelem.c b/source/dnode/mnode/impl/src/mndTelem.c index b0b49b42dc..679fafa28d 100644 --- a/source/dnode/mnode/impl/src/mndTelem.c +++ b/source/dnode/mnode/impl/src/mndTelem.c @@ -131,7 +131,7 @@ static int32_t mndProcessTelemTimer(SRpcMsg* pReq) { taosThreadMutexUnlock(&pMgmt->lock); if (pCont != NULL) { - if (taosSendHttpReport(tsTelemServer, tsTelemPort, pCont, strlen(pCont), HTTP_FLAT) != 0) { + if (taosSendHttpReport(tsTelemServer, tsTelemUri, tsTelemPort, pCont, strlen(pCont), HTTP_FLAT) != 
0) { mError("failed to send telemetry report"); } else { mInfo("succeed to send telemetry report"); diff --git a/source/libs/monitor/src/monMain.c b/source/libs/monitor/src/monMain.c index b3ca0fa452..b23a36d4df 100644 --- a/source/libs/monitor/src/monMain.c +++ b/source/libs/monitor/src/monMain.c @@ -20,6 +20,7 @@ #include "ttime.h" static SMonitor tsMonitor = {0}; +static char* tsMonUri = "/report"; void monRecordLog(int64_t ts, ELogLevel level, const char *content) { taosThreadMutexLock(&tsMonitor.lock); @@ -550,7 +551,7 @@ void monSendReport() { // uDebugL("report cont:%s\n", pCont); if (pCont != NULL) { EHttpCompFlag flag = tsMonitor.cfg.comp ? HTTP_GZIP : HTTP_FLAT; - if (taosSendHttpReport(tsMonitor.cfg.server, tsMonitor.cfg.port, pCont, strlen(pCont), flag) != 0) { + if (taosSendHttpReport(tsMonitor.cfg.server, tsMonUri, tsMonitor.cfg.port, pCont, strlen(pCont), flag) != 0) { uError("failed to send monitor msg"); } taosMemoryFree(pCont); diff --git a/source/libs/transport/src/thttp.c b/source/libs/transport/src/thttp.c index 00854b5ee5..cd508f6fe9 100644 --- a/source/libs/transport/src/thttp.c +++ b/source/libs/transport/src/thttp.c @@ -35,6 +35,7 @@ typedef struct SHttpModule { typedef struct SHttpMsg { queue q; char* server; + char* uri; int32_t port; char* cont; int32_t len; @@ -63,26 +64,26 @@ static void httpHandleReq(SHttpMsg* msg); static void httpHandleQuit(SHttpMsg* msg); static int32_t httpSendQuit(); -static int32_t taosSendHttpReportImpl(const char* server, uint16_t port, char* pCont, int32_t contLen, +static int32_t taosSendHttpReportImpl(const char* server, const char* uri, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag); -static int32_t taosBuildHttpHeader(const char* server, int32_t contLen, char* pHead, int32_t headLen, +static int32_t taosBuildHttpHeader(const char* server, const char* uri, int32_t contLen, char* pHead, int32_t headLen, EHttpCompFlag flag) { if (flag == HTTP_FLAT) { return snprintf(pHead, headLen, - "POST 
/report HTTP/1.1\n" + "POST %s HTTP/1.1\n" "Host: %s\n" "Content-Type: application/json\n" "Content-Length: %d\n\n", - server, contLen); + uri, server, contLen); } else if (flag == HTTP_GZIP) { return snprintf(pHead, headLen, - "POST /report HTTP/1.1\n" + "POST %s HTTP/1.1\n" "Host: %s\n" "Content-Type: application/json\n" "Content-Encoding: gzip\n" "Content-Length: %d\n\n", - server, contLen); + uri, server, contLen); } else { terrno = TSDB_CODE_INVALID_CFG; return -1; @@ -181,6 +182,7 @@ static void httpDestroyMsg(SHttpMsg* msg) { if (msg == NULL) return; taosMemoryFree(msg->server); + taosMemoryFree(msg->uri); taosMemoryFree(msg->cont); taosMemoryFree(msg); } @@ -293,10 +295,11 @@ int32_t httpSendQuit() { return 0; } -static int32_t taosSendHttpReportImpl(const char* server, uint16_t port, char* pCont, int32_t contLen, +static int32_t taosSendHttpReportImpl(const char* server, const char* uri, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag) { SHttpMsg* msg = taosMemoryMalloc(sizeof(SHttpMsg)); msg->server = strdup(server); + msg->uri = strdup(uri); msg->port = port; msg->cont = taosMemoryMalloc(contLen); memcpy(msg->cont, pCont, contLen); @@ -309,12 +312,10 @@ static int32_t taosSendHttpReportImpl(const char* server, uint16_t port, char* p httpDestroyMsg(msg); tError("http-report already released"); return -1; - } else { - msg->http = load; - transAsyncSend(load->asyncPool, &(msg->q)); } - - return 0; + + msg->http = load; + return transAsyncSend(load->asyncPool, &(msg->q)); } static void httpDestroyClientCb(uv_handle_t* handle) { @@ -360,7 +361,7 @@ static void httpHandleReq(SHttpMsg* msg) { int32_t len = 2048; char* header = taosMemoryCalloc(1, len); - int32_t headLen = taosBuildHttpHeader(msg->server, msg->len, header, len, msg->flag); + int32_t headLen = taosBuildHttpHeader(msg->server, msg->uri, msg->len, header, len, msg->flag); if (headLen < 0) { taosMemoryFree(header); goto END; @@ -380,6 +381,7 @@ static void httpHandleReq(SHttpMsg* 
msg) { cli->port = msg->port; cli->dest = dest; + taosMemoryFree(msg->uri); taosMemoryFree(msg); uv_tcp_init(http->loop, &cli->tcp); @@ -406,9 +408,9 @@ END: httpDestroyMsg(msg); } -int32_t taosSendHttpReport(const char* server, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag) { +int32_t taosSendHttpReport(const char* server, const char* uri, uint16_t port, char* pCont, int32_t contLen, EHttpCompFlag flag) { taosThreadOnce(&transHttpInit, transHttpEnvInit); - return taosSendHttpReportImpl(server, port, pCont, contLen, flag); + return taosSendHttpReportImpl(server, uri, port, pCont, contLen, flag); } static void transHttpEnvInit() { diff --git a/source/os/test/osTests.cpp b/source/os/test/osTests.cpp index f831f457f9..2e24bb0526 100644 --- a/source/os/test/osTests.cpp +++ b/source/os/test/osTests.cpp @@ -33,7 +33,7 @@ TEST(osTest, osSystem) { const char *flags = "UTL FATAL "; ELogLevel level = DEBUG_FATAL; int32_t dflag = 255; // tsLogEmbedded ? 255 : uDebugFlag - taosPrintTrace(flags, level, dflag); + taosPrintTrace(flags, level, dflag, 0); } void fileOperateOnFree(void *param) { diff --git a/source/util/src/tlog.c b/source/util/src/tlog.c index 53d0cad5ea..d9cbde5714 100644 --- a/source/util/src/tlog.c +++ b/source/util/src/tlog.c @@ -18,6 +18,8 @@ #include "os.h" #include "tconfig.h" #include "tutil.h" +#include "tjson.h" +#include "tglobal.h" #define LOG_MAX_LINE_SIZE (1024) #define LOG_MAX_LINE_BUFFER_SIZE (LOG_MAX_LINE_SIZE + 3) @@ -808,7 +810,7 @@ bool taosAssertDebug(bool condition, const char *file, int32_t line, const char taosPrintLogImp(1, 255, buffer, len); taosPrintLog(flags, level, dflag, "tAssert at file %s:%d exit:%d", file, line, tsAssert); - taosPrintTrace(flags, level, dflag); + taosPrintTrace(flags, level, dflag, -1); if (tsAssert) { // taosCloseLog(); @@ -824,6 +826,216 @@ bool taosAssertDebug(bool condition, const char *file, int32_t line, const char return true; } +int32_t taosGenCrashJsonMsg(int signum, char** pMsg, int64_t 
clusterId, int64_t startTime) { + SJson* pJson = tjsonCreateObject(); + if (pJson == NULL) return -1; + char tmp[4096] = {0}; + + tjsonAddDoubleToObject(pJson, "reportVersion", 1); + + tjsonAddIntegerToObject(pJson, "clusterId", clusterId); + tjsonAddIntegerToObject(pJson, "startTime", startTime); + + taosGetFqdn(tmp); + tjsonAddStringToObject(pJson, "fqdn", tmp); + + tjsonAddIntegerToObject(pJson, "pid", taosGetPId()); + + taosGetAppName(tmp, NULL); + tjsonAddStringToObject(pJson, "appName", tmp); + + if (taosGetOsReleaseName(tmp, sizeof(tmp)) == 0) { + tjsonAddStringToObject(pJson, "os", tmp); + } + + float numOfCores = 0; + if (taosGetCpuInfo(tmp, sizeof(tmp), &numOfCores) == 0) { + tjsonAddStringToObject(pJson, "cpuModel", tmp); + tjsonAddDoubleToObject(pJson, "numOfCpu", numOfCores); + } else { + tjsonAddDoubleToObject(pJson, "numOfCpu", tsNumOfCores); + } + + snprintf(tmp, sizeof(tmp), "%" PRId64 " kB", tsTotalMemoryKB); + tjsonAddStringToObject(pJson, "memory", tmp); + + tjsonAddStringToObject(pJson, "version", version); + tjsonAddStringToObject(pJson, "buildInfo", buildinfo); + tjsonAddStringToObject(pJson, "gitInfo", gitinfo); + + tjsonAddIntegerToObject(pJson, "crashSig", signum); + tjsonAddIntegerToObject(pJson, "crashTs", taosGetTimestampUs()); + + /* tmp still holds the "memory" text; clear it so that a trace which captures nothing is reported as an empty stackInfo instead of stale data */ + tmp[0] = '\0'; +#ifdef _TD_DARWIN_64 + taosLogTraceToBuf(tmp, sizeof(tmp), 4); +#elif !defined(WINDOWS) + taosLogTraceToBuf(tmp, sizeof(tmp), 3); +#else + taosLogTraceToBuf(tmp, sizeof(tmp), 8); +#endif + + tjsonAddStringToObject(pJson, "stackInfo", tmp); + + char* pCont = tjsonToString(pJson); + tjsonDelete(pJson); + + /* callers run strlen() on *pMsg when 0 is returned, so a missing string must be reported as a failure */ + if (pCont == NULL) return -1; + + *pMsg = pCont; + + return TSDB_CODE_SUCCESS; +} + + +void taosLogCrashInfo(char* nodeType, char* pMsg, int64_t msgLen, int signum, void *sigInfo) { + const char *flags = "UTL FATAL "; + ELogLevel level = DEBUG_FATAL; + int32_t dflag = 255; + char filepath[PATH_MAX] = {0}; + TdFilePtr pFile = NULL; + + if (pMsg && msgLen > 0) { + snprintf(filepath, sizeof(filepath), "%s%s.%sCrashLog", tsLogDir, TD_DIRSEP, 
nodeType); + + pFile = taosOpenFile(filepath, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); + if (pFile == NULL) { + taosPrintLog(flags, level, dflag, "failed to open file:%s since %s", filepath, terrstr()); + goto _return; + } + + taosLockFile(pFile); + + int64_t writeSize = taosWriteFile(pFile, &msgLen, sizeof(msgLen)); + if (sizeof(msgLen) != writeSize) { + taosUnLockFile(pFile); + taosPrintLog(flags, level, dflag, "failed to write len to file:%s,%p wlen:%" PRId64 " tlen:%lu since %s", + filepath, pFile, writeSize, sizeof(msgLen), terrstr()); + goto _return; + } + + writeSize = taosWriteFile(pFile, pMsg, msgLen); + if (msgLen != writeSize) { + taosUnLockFile(pFile); + taosPrintLog(flags, level, dflag, "failed to write file:%s,%p wlen:%" PRId64 " tlen:%" PRId64 " since %s", + filepath, pFile, writeSize, msgLen, terrstr()); + goto _return; + } + + taosUnLockFile(pFile); + } + +_return: + + if (pFile) taosCloseFile(&pFile); + + terrno = TAOS_SYSTEM_ERROR(errno); + taosPrintLog(flags, level, dflag, "crash signal is %d", signum); + +#ifdef _TD_DARWIN_64 + taosPrintTrace(flags, level, dflag, 4); +#elif !defined(WINDOWS) + taosPrintLog(flags, level, dflag, "sender PID:%d cmdline:%s", ((siginfo_t *)sigInfo)->si_pid, + taosGetCmdlineByPID(((siginfo_t *)sigInfo)->si_pid)); + taosPrintTrace(flags, level, dflag, 3); +#else + taosPrintTrace(flags, level, dflag, 8); +#endif + + taosMemoryFree(pMsg); +} + +void taosReadCrashInfo(char* filepath, char** pMsg, int64_t* pMsgLen, TdFilePtr* pFd) { + const char *flags = "UTL FATAL "; + ELogLevel level = DEBUG_FATAL; + int32_t dflag = 255; + TdFilePtr pFile = NULL; + bool truncateFile = false; + char* buf = NULL; + + if (NULL == *pFd) { + int64_t filesize = 0; + if (taosStatFile(filepath, &filesize, NULL) < 0) { + if (ENOENT == errno) { + return; + } + + terrno = TAOS_SYSTEM_ERROR(errno); + taosPrintLog(flags, level, dflag, "failed to stat file:%s since %s", filepath, terrstr()); + return; + } + + if (filesize <= 0) { + return; + 
} + + pFile = taosOpenFile(filepath, TD_FILE_READ|TD_FILE_WRITE); + if (pFile == NULL) { + if (ENOENT == errno) { + return; + } + + terrno = TAOS_SYSTEM_ERROR(errno); + taosPrintLog(flags, level, dflag, "failed to open file:%s since %s", filepath, terrstr()); + return; + } + + taosLockFile(pFile); + } else { + pFile = *pFd; + } + + int64_t msgLen = 0; + int64_t readSize = taosReadFile(pFile, &msgLen, sizeof(msgLen)); + if (sizeof(msgLen) != readSize) { + truncateFile = true; + if (readSize < 0) { + taosPrintLog(flags, level, dflag, "failed to read len from file:%s,%p wlen:%" PRId64 " tlen:%lu since %s", + filepath, pFile, readSize, sizeof(msgLen), terrstr()); + } + goto _return; + } + + buf = taosMemoryMalloc(msgLen); + if (NULL == buf) { + taosPrintLog(flags, level, dflag, "failed to malloc buf, size:%" PRId64, msgLen); + goto _return; + } + + readSize = taosReadFile(pFile, buf, msgLen); + if (msgLen != readSize) { + truncateFile = true; + taosPrintLog(flags, level, dflag, "failed to read file:%s,%p wlen:%" PRId64 " tlen:%" PRId64 " since %s", + filepath, pFile, readSize, msgLen, terrstr()); + goto _return; + } + + *pMsg = buf; + *pMsgLen = msgLen; + *pFd = pFile; + + return; + +_return: + + if (truncateFile) { + taosFtruncateFile(pFile, 0); + } + taosUnLockFile(pFile); + taosCloseFile(&pFile); + taosMemoryFree(buf); + + *pMsg = NULL; + *pMsgLen = 0; + *pFd = NULL; +} + +void taosReleaseCrashLogFile(TdFilePtr pFile, bool truncateFile) { + if (truncateFile) { + taosFtruncateFile(pFile, 0); + } + + taosUnLockFile(pFile); + taosCloseFile(&pFile); +} + #ifdef NDEBUG bool taosAssertRelease(bool condition) { if (condition) return false; @@ -842,4 +1054,4 @@ bool taosAssertRelease(bool condition) { return true; } -#endif \ No newline at end of file +#endif