955 lines
38 KiB
Python
955 lines
38 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
import io
|
|
import sys
|
|
from enum import Enum
|
|
import threading
|
|
import signal
|
|
import logging
|
|
import time
|
|
from subprocess import PIPE, Popen, TimeoutExpired
|
|
from typing import BinaryIO, Generator, IO, List, NewType, Optional
|
|
import typing
|
|
|
|
try:
|
|
import psutil
|
|
except:
|
|
print("Psutil module needed, please install: sudo pip3 install psutil")
|
|
sys.exit(-1)
|
|
from queue import Queue, Empty
|
|
|
|
from .shared.config import Config
|
|
from .shared.db import DbTarget, DbConn
|
|
from .shared.misc import Logging, Helper, CrashGenError, Status, Progress, Dice
|
|
from .shared.types import DirPath, IpcStream
|
|
|
|
# from crash_gen.misc import CrashGenError, Dice, Helper, Logging, Progress, Status
|
|
# from crash_gen.db import DbConn, DbTarget
|
|
# from crash_gen.settings import Config
|
|
# from crash_gen.types import DirPath
|
|
|
|
class TdeInstance():
|
|
"""
|
|
A class to capture the *static* information of a TDengine instance,
|
|
including the location of the various files/directories, and basica
|
|
configuration.
|
|
"""
|
|
|
|
@classmethod
|
|
def _getBuildPath(cls):
|
|
selfPath = os.path.dirname(os.path.realpath(__file__))
|
|
if ("community" in selfPath):
|
|
projPath = selfPath[:selfPath.find("communit")]
|
|
else:
|
|
projPath = selfPath[:selfPath.find("tests")]
|
|
|
|
buildPath = None
|
|
for root, dirs, files in os.walk(projPath):
|
|
if ("taosd" in files):
|
|
rootRealPath = os.path.dirname(os.path.realpath(root))
|
|
if ("packaging" not in rootRealPath):
|
|
buildPath = root[:len(root) - len("/build/bin")]
|
|
break
|
|
if buildPath == None:
|
|
raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}"
|
|
.format(selfPath, projPath))
|
|
return buildPath
|
|
|
|
@classmethod
|
|
def prepareGcovEnv(cls, env):
|
|
# Ref: https://gcc.gnu.org/onlinedocs/gcc/Cross-profiling.html
|
|
bPath = cls._getBuildPath() # build PATH
|
|
numSegments = len(bPath.split('/')) # "/x/TDengine/build" should yield 3
|
|
# numSegments += 2 # cover "/src" after build
|
|
# numSegments = numSegments - 1 # DEBUG only
|
|
env['GCOV_PREFIX'] = bPath + '/src_s' # Server side source
|
|
env['GCOV_PREFIX_STRIP'] = str(numSegments) # Strip every element, plus, ENV needs strings
|
|
# VERY VERY important note: GCOV data collection NOT effective upon SIG_KILL
|
|
Logging.info("Preparing GCOV environement to strip {} elements and use path: {}".format(
|
|
numSegments, env['GCOV_PREFIX'] ))
|
|
|
|
def __init__(self, subdir='test', tInstNum=0, port=6030, fepPort=6030):
|
|
self._buildDir = self._getBuildPath()
|
|
self._subdir = '/' + subdir # TODO: tolerate "/"
|
|
self._port = port # TODO: support different IP address too
|
|
self._fepPort = fepPort
|
|
|
|
self._tInstNum = tInstNum
|
|
|
|
# An "Tde Instance" will *contain* a "sub process" object, with will/may use a thread internally
|
|
# self._smThread = ServiceManagerThread()
|
|
self._subProcess = None # type: Optional[TdeSubProcess]
|
|
|
|
def getDbTarget(self):
|
|
return DbTarget(self.getCfgDir(), self.getHostAddr(), self._port)
|
|
|
|
def getPort(self):
|
|
return self._port
|
|
|
|
def __repr__(self):
|
|
return "[TdeInstance: {}, subdir={}]".format(
|
|
self._buildDir, Helper.getFriendlyPath(self._subdir))
|
|
|
|
def generateCfgFile(self):
|
|
# print("Logger = {}".format(logger))
|
|
# buildPath = self.getBuildPath()
|
|
# taosdPath = self._buildPath + "/build/bin/taosd"
|
|
|
|
cfgDir = self.getCfgDir()
|
|
cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed
|
|
if os.path.exists(cfgFile):
|
|
if os.path.isfile(cfgFile):
|
|
Logging.warning("Config file exists already, skip creation: {}".format(cfgFile))
|
|
return # cfg file already exists, nothing to do
|
|
else:
|
|
raise CrashGenError("Invalid config file: {}".format(cfgFile))
|
|
# Now that the cfg file doesn't exist
|
|
if os.path.exists(cfgDir):
|
|
if not os.path.isdir(cfgDir):
|
|
raise CrashGenError("Invalid config dir: {}".format(cfgDir))
|
|
# else: good path
|
|
else:
|
|
os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p"
|
|
# Now we have a good cfg dir
|
|
cfgValues = {
|
|
'runDir': self.getRunDir(),
|
|
'ip': '127.0.0.1', # TODO: change to a network addressable ip
|
|
'port': self._port,
|
|
'fepPort': self._fepPort,
|
|
}
|
|
cfgTemplate = """
|
|
dataDir {runDir}/data
|
|
logDir {runDir}/log
|
|
|
|
charset UTF-8
|
|
|
|
firstEp {ip}:{fepPort}
|
|
fqdn {ip}
|
|
serverPort {port}
|
|
|
|
# was all 135 below
|
|
dDebugFlag 135
|
|
cDebugFlag 135
|
|
rpcDebugFlag 135
|
|
qDebugFlag 135
|
|
# httpDebugFlag 143
|
|
# asyncLog 0
|
|
# tables 10
|
|
maxtablesPerVnode 10
|
|
rpcMaxTime 101
|
|
# cache 2
|
|
keep 36500
|
|
# walLevel 2
|
|
walLevel 1
|
|
#
|
|
# maxConnections 100
|
|
quorum 2
|
|
"""
|
|
cfgContent = cfgTemplate.format_map(cfgValues)
|
|
f = open(cfgFile, "w")
|
|
f.write(cfgContent)
|
|
f.close()
|
|
|
|
def rotateLogs(self):
|
|
logPath = self.getLogDir()
|
|
# ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397
|
|
if os.path.exists(logPath):
|
|
logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S')
|
|
Logging.info("Saving old log files to: {}".format(logPathSaved))
|
|
os.rename(logPath, logPathSaved)
|
|
# os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms
|
|
|
|
|
|
def getExecFile(self): # .../taosd
|
|
return self._buildDir + "/build/bin/taosd"
|
|
|
|
def getRunDir(self) -> DirPath : # TODO: rename to "root dir" ?!
|
|
return DirPath(self._buildDir + self._subdir)
|
|
|
|
def getCfgDir(self) -> DirPath : # path, not file
|
|
return DirPath(self.getRunDir() + "/cfg")
|
|
|
|
def getLogDir(self) -> DirPath :
|
|
return DirPath(self.getRunDir() + "/log")
|
|
|
|
def getHostAddr(self):
|
|
return "127.0.0.1"
|
|
|
|
def getServiceCmdLine(self): # to start the instance
|
|
if Config.getConfig().track_memory_leaks:
|
|
Logging.info("Invoking VALGRIND on service...")
|
|
return ['exec valgrind', '--leak-check=yes', self.getExecFile(), '-c', self.getCfgDir()]
|
|
else:
|
|
# TODO: move "exec -c" into Popen(), we can both "use shell" and NOT fork so ask to lose kill control
|
|
return ["exec " + self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen()
|
|
|
|
def _getDnodes(self, dbc):
|
|
dbc.query("show dnodes")
|
|
cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason
|
|
return {c[1]:c[4] for c in cols} # {'xxx:6030':'ready', 'xxx:6130':'ready'}
|
|
|
|
def createDnode(self, dbt: DbTarget):
|
|
"""
|
|
With a connection to the "first" EP, let's create a dnode for someone else who
|
|
wants to join.
|
|
"""
|
|
dbc = DbConn.createNative(self.getDbTarget())
|
|
dbc.open()
|
|
|
|
if dbt.getEp() in self._getDnodes(dbc):
|
|
Logging.info("Skipping DNode creation for: {}".format(dbt))
|
|
dbc.close()
|
|
return
|
|
|
|
sql = "CREATE DNODE \"{}\"".format(dbt.getEp())
|
|
dbc.execute(sql)
|
|
dbc.close()
|
|
|
|
def getStatus(self):
|
|
# return self._smThread.getStatus()
|
|
if self._subProcess is None:
|
|
return Status(Status.STATUS_EMPTY)
|
|
return self._subProcess.getStatus()
|
|
|
|
# def getSmThread(self):
|
|
# return self._smThread
|
|
|
|
def start(self):
|
|
if self.getStatus().isActive():
|
|
raise CrashGenError("Cannot start instance from status: {}".format(self.getStatus()))
|
|
|
|
Logging.info("Starting TDengine instance: {}".format(self))
|
|
self.generateCfgFile() # service side generates config file, client does not
|
|
self.rotateLogs()
|
|
|
|
# self._smThread.start(self.getServiceCmdLine(), self.getLogDir()) # May raise exceptions
|
|
self._subProcess = TdeSubProcess(self.getServiceCmdLine(), self.getLogDir())
|
|
|
|
def stop(self):
|
|
self._subProcess.stop()
|
|
self._subProcess = None
|
|
|
|
def isFirst(self):
|
|
return self._tInstNum == 0
|
|
|
|
def printFirst10Lines(self):
|
|
if self._subProcess is None:
|
|
Logging.warning("Incorrect TI status for procIpcBatch-10 operation")
|
|
return
|
|
self._subProcess.procIpcBatch(trimToTarget=10, forceOutput=True)
|
|
|
|
def procIpcBatch(self):
|
|
if self._subProcess is None:
|
|
Logging.warning("Incorrect TI status for procIpcBatch operation")
|
|
return
|
|
self._subProcess.procIpcBatch() # may enounter EOF and change status to STOPPED
|
|
if self._subProcess.getStatus().isStopped():
|
|
self._subProcess.stop()
|
|
self._subProcess = None
|
|
|
|
class TdeSubProcess:
|
|
"""
|
|
A class to to represent the actual sub process that is the run-time
|
|
of a TDengine instance.
|
|
|
|
It takes a TdeInstance object as its parameter, with the rationale being
|
|
"a sub process runs an instance".
|
|
|
|
We aim to ensure that this object has exactly the same life-cycle as the
|
|
underlying sub process.
|
|
"""
|
|
|
|
# RET_ALREADY_STOPPED = -1
|
|
# RET_TIME_OUT = -3
|
|
# RET_SUCCESS = -4
|
|
|
|
def __init__(self, cmdLine: List[str], logDir: DirPath):
|
|
# Create the process + managing thread immediately
|
|
|
|
Logging.info("Attempting to start TAOS sub process...")
|
|
self._popen = self._start(cmdLine) # the actual sub process
|
|
self._smThread = ServiceManagerThread(self, logDir) # A thread to manage the sub process, mostly to process the IO
|
|
Logging.info("Successfully started TAOS process: {}".format(self))
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
# if self.subProcess is None:
|
|
# return '[TdeSubProc: Empty]'
|
|
return '[TdeSubProc: pid = {}, status = {}]'.format(
|
|
self.getPid(), self.getStatus() )
|
|
|
|
def getIpcStdOut(self) -> IpcStream :
|
|
if self._popen.universal_newlines : # alias of text_mode
|
|
raise CrashGenError("We need binary mode for STDOUT IPC")
|
|
# Logging.info("Type of stdout is: {}".format(type(self._popen.stdout)))
|
|
return typing.cast(IpcStream, self._popen.stdout)
|
|
|
|
def getIpcStdErr(self) -> IpcStream :
|
|
if self._popen.universal_newlines : # alias of text_mode
|
|
raise CrashGenError("We need binary mode for STDERR IPC")
|
|
return typing.cast(IpcStream, self._popen.stderr)
|
|
|
|
# Now it's always running, since we matched the life cycle
|
|
# def isRunning(self):
|
|
# return self.subProcess is not None
|
|
|
|
def getPid(self):
|
|
return self._popen.pid
|
|
|
|
def _start(self, cmdLine) -> Popen :
|
|
ON_POSIX = 'posix' in sys.builtin_module_names
|
|
|
|
# Prepare environment variables for coverage information
|
|
# Ref: https://stackoverflow.com/questions/2231227/python-subprocess-popen-with-a-modified-environment
|
|
myEnv = os.environ.copy()
|
|
TdeInstance.prepareGcovEnv(myEnv)
|
|
|
|
# print(myEnv)
|
|
# print("Starting TDengine with env: ", myEnv.items())
|
|
print("Starting TDengine: {}".format(cmdLine))
|
|
|
|
ret = Popen(
|
|
' '.join(cmdLine), # ' '.join(cmdLine) if useShell else cmdLine,
|
|
shell=True, # Always use shell, since we need to pass ENV vars
|
|
stdout=PIPE,
|
|
stderr=PIPE,
|
|
close_fds=ON_POSIX,
|
|
env=myEnv
|
|
) # had text=True, which interferred with reading EOF
|
|
time.sleep(0.01) # very brief wait, then let's check if sub process started successfully.
|
|
if ret.poll():
|
|
raise CrashGenError("Sub process failed to start with command line: {}".format(cmdLine))
|
|
return ret
|
|
|
|
STOP_SIGNAL = signal.SIGINT # signal.SIGKILL/SIGINT # What signal to use (in kill) to stop a taosd process?
|
|
SIG_KILL_RETCODE = 137 # ref: https://stackoverflow.com/questions/43268156/process-finished-with-exit-code-137-in-pycharm
|
|
|
|
def stop(self):
|
|
"""
|
|
Stop a sub process, DO NOT return anything, process all conditions INSIDE.
|
|
|
|
Calling function should immediately delete/unreference the object
|
|
|
|
Common POSIX signal values (from man -7 signal):
|
|
SIGHUP 1
|
|
SIGINT 2
|
|
SIGQUIT 3
|
|
SIGILL 4
|
|
SIGTRAP 5
|
|
SIGABRT 6
|
|
SIGIOT 6
|
|
SIGBUS 7
|
|
SIGEMT -
|
|
SIGFPE 8
|
|
SIGKILL 9
|
|
SIGUSR1 10
|
|
SIGSEGV 11
|
|
SIGUSR2 12
|
|
"""
|
|
# self._popen should always be valid.
|
|
|
|
Logging.info("Terminating TDengine service running as the sub process...")
|
|
if self.getStatus().isStopped():
|
|
Logging.info("Service already stopped")
|
|
return
|
|
if self.getStatus().isStopping():
|
|
Logging.info("Service is already being stopped, pid: {}".format(self.getPid()))
|
|
return
|
|
|
|
self.setStatus(Status.STATUS_STOPPING)
|
|
|
|
retCode = self._popen.poll() # ret -N means killed with signal N, otherwise it's from exit(N)
|
|
if retCode: # valid return code, process ended
|
|
# retCode = -retCode # only if valid
|
|
Logging.warning("TSP.stop(): process ended itself")
|
|
# self.subProcess = None
|
|
return
|
|
|
|
# process still alive, let's interrupt it
|
|
self._stopForSure(self._popen, self.STOP_SIGNAL) # success if no exception
|
|
|
|
# sub process should end, then IPC queue should end, causing IO thread to end
|
|
self._smThread.stop() # stop for sure too
|
|
|
|
self.setStatus(Status.STATUS_STOPPED)
|
|
|
|
@classmethod
|
|
def _stopForSure(cls, proc: Popen, sig: int):
|
|
'''
|
|
Stop a process and all sub processes with a signal, and SIGKILL if necessary
|
|
'''
|
|
def doKillTdService(proc: Popen, sig: int):
|
|
Logging.info("Killing sub-sub process {} with signal {}".format(proc.pid, sig))
|
|
proc.send_signal(sig)
|
|
try:
|
|
retCode = proc.wait(20)
|
|
if (- retCode) == signal.SIGSEGV: # Crashed
|
|
Logging.warning("Process {} CRASHED, please check CORE file!".format(proc.pid))
|
|
elif (- retCode) == sig :
|
|
Logging.info("TD service terminated with expected return code {}".format(sig))
|
|
else:
|
|
Logging.warning("TD service terminated, EXPECTING ret code {}, got {}".format(sig, -retCode))
|
|
return True # terminated successfully
|
|
except TimeoutExpired as err:
|
|
Logging.warning("Failed to kill sub-sub process {} with signal {}".format(proc.pid, sig))
|
|
return False # failed to terminate
|
|
|
|
|
|
def doKillChild(child: psutil.Process, sig: int):
|
|
Logging.info("Killing sub-sub process {} with signal {}".format(child.pid, sig))
|
|
child.send_signal(sig)
|
|
try:
|
|
retCode = child.wait(20) # type: ignore
|
|
if (- retCode) == signal.SIGSEGV: # type: ignore # Crashed
|
|
Logging.warning("Process {} CRASHED, please check CORE file!".format(child.pid))
|
|
elif (- retCode) == sig : # type: ignore
|
|
Logging.info("Sub-sub process terminated with expected return code {}".format(sig))
|
|
else:
|
|
Logging.warning("Process terminated, EXPECTING ret code {}, got {}".format(sig, -retCode)) # type: ignore
|
|
return True # terminated successfully
|
|
except psutil.TimeoutExpired as err:
|
|
Logging.warning("Failed to kill sub-sub process {} with signal {}".format(child.pid, sig))
|
|
return False # did not terminate
|
|
|
|
def doKill(proc: Popen, sig: int):
|
|
pid = proc.pid
|
|
try:
|
|
topSubProc = psutil.Process(pid) # Now that we are doing "exec -c", should not have children any more
|
|
for child in topSubProc.children(recursive=True): # or parent.children() for recursive=False
|
|
Logging.warning("Unexpected child to be killed")
|
|
doKillChild(child, sig)
|
|
except psutil.NoSuchProcess as err:
|
|
Logging.info("Process not found, can't kill, pid = {}".format(pid))
|
|
|
|
return doKillTdService(proc, sig)
|
|
# TODO: re-examine if we need to kill the top process, which is always the SHELL for now
|
|
# try:
|
|
# proc.wait(1) # SHELL process here, may throw subprocess.TimeoutExpired exception
|
|
# # expRetCode = self.SIG_KILL_RETCODE if sig==signal.SIGKILL else (-sig)
|
|
# # if retCode == expRetCode:
|
|
# # Logging.info("Process terminated with expected return code {}".format(retCode))
|
|
# # else:
|
|
# # Logging.warning("Process terminated, EXPECTING ret code {}, got {}".format(expRetCode, retCode))
|
|
# # return True # success
|
|
# except subprocess.TimeoutExpired as err:
|
|
# Logging.warning("Failed to kill process {} with signal {}".format(pid, sig))
|
|
# return False # failed to kill
|
|
|
|
def softKill(proc, sig):
|
|
return doKill(proc, sig)
|
|
|
|
def hardKill(proc):
|
|
return doKill(proc, signal.SIGKILL)
|
|
|
|
pid = proc.pid
|
|
Logging.info("Terminate running processes under {}, with SIG #{} and wait...".format(pid, sig))
|
|
if softKill(proc, sig):
|
|
return # success
|
|
if sig != signal.SIGKILL: # really was soft above
|
|
if hardKill(proc):
|
|
return
|
|
raise CrashGenError("Failed to stop process, pid={}".format(pid))
|
|
|
|
def getStatus(self):
|
|
return self._smThread.getStatus()
|
|
|
|
def setStatus(self, status):
|
|
self._smThread.setStatus(status)
|
|
|
|
def procIpcBatch(self, trimToTarget=0, forceOutput=False):
|
|
self._smThread.procIpcBatch(trimToTarget, forceOutput)
|
|
|
|
class ServiceManager:
|
|
PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process
|
|
|
|
def __init__(self, numDnodes): # >1 when we run a cluster
|
|
Logging.info("TDengine Service Manager (TSM) created")
|
|
self._numDnodes = numDnodes # >1 means we have a cluster
|
|
self._lock = threading.Lock()
|
|
# signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec
|
|
# signal.signal(signal.SIGINT, self.sigIntHandler)
|
|
# signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler!
|
|
|
|
self.inSigHandler = False
|
|
# self._status = MainExec.STATUS_RUNNING # set inside
|
|
# _startTaosService()
|
|
self._runCluster = (numDnodes > 1)
|
|
self._tInsts : List[TdeInstance] = []
|
|
for i in range(0, numDnodes):
|
|
ti = self._createTdeInstance(i) # construct tInst
|
|
self._tInsts.append(ti)
|
|
|
|
# self.svcMgrThreads : List[ServiceManagerThread] = []
|
|
# for i in range(0, numDnodes):
|
|
# thread = self._createThread(i) # construct tInst
|
|
# self.svcMgrThreads.append(thread)
|
|
|
|
def _createTdeInstance(self, dnIndex):
|
|
if not self._runCluster: # single instance
|
|
subdir = 'test'
|
|
else: # Create all threads in a cluster
|
|
subdir = 'cluster_dnode_{}'.format(dnIndex)
|
|
fepPort= 6030 # firstEP Port
|
|
port = fepPort + dnIndex * 100
|
|
return TdeInstance(subdir, dnIndex, port, fepPort)
|
|
# return ServiceManagerThread(dnIndex, ti)
|
|
|
|
def _doMenu(self):
|
|
choice = ""
|
|
while True:
|
|
print("\nInterrupting Service Program, Choose an Action: ")
|
|
print("1: Resume")
|
|
print("2: Terminate")
|
|
print("3: Restart")
|
|
# Remember to update the if range below
|
|
# print("Enter Choice: ", end="", flush=True)
|
|
while choice == "":
|
|
choice = input("Enter Choice: ")
|
|
if choice != "":
|
|
break # done with reading repeated input
|
|
if choice in ["1", "2", "3"]:
|
|
break # we are done with whole method
|
|
print("Invalid choice, please try again.")
|
|
choice = "" # reset
|
|
return choice
|
|
|
|
def sigUsrHandler(self, signalNumber, frame):
|
|
print("Interrupting main thread execution upon SIGUSR1")
|
|
if self.inSigHandler: # already
|
|
print("Ignoring repeated SIG...")
|
|
return # do nothing if it's already not running
|
|
self.inSigHandler = True
|
|
|
|
choice = self._doMenu()
|
|
if choice == "1":
|
|
self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue?
|
|
elif choice == "2":
|
|
self.stopTaosServices()
|
|
elif choice == "3": # Restart
|
|
self.restart()
|
|
else:
|
|
raise RuntimeError("Invalid menu choice: {}".format(choice))
|
|
|
|
self.inSigHandler = False
|
|
|
|
def sigIntHandler(self, signalNumber, frame):
|
|
print("ServiceManager: INT Signal Handler starting...")
|
|
if self.inSigHandler:
|
|
print("Ignoring repeated SIG_INT...")
|
|
return
|
|
self.inSigHandler = True
|
|
|
|
self.stopTaosServices()
|
|
print("ServiceManager: INT Signal Handler returning...")
|
|
self.inSigHandler = False
|
|
|
|
def sigHandlerResume(self):
|
|
print("Resuming TDengine service manager (main thread)...\n\n")
|
|
|
|
# def _updateThreadStatus(self):
|
|
# if self.svcMgrThread: # valid svc mgr thread
|
|
# if self.svcMgrThread.isStopped(): # done?
|
|
# self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate?
|
|
# self.svcMgrThread = None # no more
|
|
|
|
def isActive(self):
|
|
"""
|
|
Determine if the service/cluster is active at all, i.e. at least
|
|
one instance is active
|
|
"""
|
|
for ti in self._tInsts:
|
|
if ti.getStatus().isActive():
|
|
return True
|
|
return False
|
|
|
|
def isRunning(self):
|
|
for ti in self._tInsts:
|
|
if not ti.getStatus().isRunning():
|
|
return False
|
|
return True
|
|
|
|
|
|
# def isRestarting(self):
|
|
# """
|
|
# Determine if the service/cluster is being "restarted", i.e., at least
|
|
# one thread is in "restarting" status
|
|
# """
|
|
# for thread in self.svcMgrThreads:
|
|
# if thread.isRestarting():
|
|
# return True
|
|
# return False
|
|
|
|
def isStable(self):
|
|
"""
|
|
Determine if the service/cluster is "stable", i.e. all of the
|
|
threads are in "stable" status.
|
|
"""
|
|
for ti in self._tInsts:
|
|
if not ti.getStatus().isStable():
|
|
return False
|
|
return True
|
|
|
|
def _procIpcAll(self):
|
|
while self.isActive():
|
|
Progress.emit(Progress.SERVICE_HEART_BEAT)
|
|
for ti in self._tInsts: # all thread objects should always be valid
|
|
# while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here
|
|
status = ti.getStatus()
|
|
if status.isRunning():
|
|
# th = ti.getSmThread()
|
|
ti.procIpcBatch() # regular processing,
|
|
if status.isStopped():
|
|
ti.procIpcBatch() # one last time?
|
|
# self._updateThreadStatus()
|
|
|
|
time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round
|
|
# raise CrashGenError("dummy")
|
|
Logging.info("Service Manager Thread (with subprocess) ended, main thread exiting...")
|
|
|
|
def _getFirstInstance(self):
|
|
return self._tInsts[0]
|
|
|
|
def startTaosServices(self):
|
|
with self._lock:
|
|
if self.isActive():
|
|
raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running")
|
|
|
|
# Find if there's already a taosd service, and then kill it
|
|
for proc in psutil.process_iter():
|
|
if proc.name() == 'taosd' or proc.name() == 'memcheck-amd64-': # Regular or under Valgrind
|
|
Logging.info("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupt")
|
|
time.sleep(2.0)
|
|
proc.kill()
|
|
# print("Process: {}".format(proc.name()))
|
|
|
|
# self.svcMgrThread = ServiceManagerThread() # create the object
|
|
|
|
for ti in self._tInsts:
|
|
ti.start()
|
|
if not ti.isFirst():
|
|
tFirst = self._getFirstInstance()
|
|
tFirst.createDnode(ti.getDbTarget())
|
|
ti.printFirst10Lines()
|
|
# ti.getSmThread().procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines
|
|
|
|
def stopTaosServices(self):
|
|
with self._lock:
|
|
if not self.isActive():
|
|
Logging.warning("Cannot stop TAOS service(s), already not active")
|
|
return
|
|
|
|
for ti in self._tInsts:
|
|
ti.stop()
|
|
|
|
def run(self):
|
|
self.startTaosServices()
|
|
self._procIpcAll() # pump/process all the messages, may encounter SIG + restart
|
|
if self.isActive(): # if sig handler hasn't destroyed it by now
|
|
self.stopTaosServices() # should have started already
|
|
|
|
def restart(self):
|
|
if not self.isStable():
|
|
Logging.warning("Cannot restart service/cluster, when not stable")
|
|
return
|
|
|
|
# self._isRestarting = True
|
|
if self.isActive():
|
|
self.stopTaosServices()
|
|
else:
|
|
Logging.warning("Service not active when restart requested")
|
|
|
|
self.startTaosServices()
|
|
# self._isRestarting = False
|
|
|
|
# def isRunning(self):
|
|
# return self.svcMgrThread != None
|
|
|
|
# def isRestarting(self):
|
|
# return self._isRestarting
|
|
|
|
class ServiceManagerThread:
|
|
"""
|
|
A class representing a dedicated thread which manages the "sub process"
|
|
of the TDengine service, interacting with its STDOUT/ERR.
|
|
|
|
It takes a TdeInstance parameter at creation time, or create a default
|
|
"""
|
|
MAX_QUEUE_SIZE = 10000
|
|
|
|
def __init__(self, subProc: TdeSubProcess, logDir: str):
|
|
# Set the sub process
|
|
# self._tdeSubProcess = None # type: TdeSubProcess
|
|
|
|
# Arrange the TDengine instance
|
|
# self._tInstNum = tInstNum # instance serial number in cluster, ZERO based
|
|
# self._tInst = tInst or TdeInstance() # Need an instance
|
|
|
|
# self._thread = None # type: Optional[threading.Thread] # The actual thread, # type: threading.Thread
|
|
# self._thread2 = None # type: Optional[threading.Thread] Thread # watching stderr
|
|
self._status = Status(Status.STATUS_STOPPED) # The status of the underlying service, actually.
|
|
|
|
self._start(subProc, logDir)
|
|
|
|
def __repr__(self):
|
|
raise CrashGenError("SMT status moved to TdeSubProcess")
|
|
# return "[SvcMgrThread: status={}, subProc={}]".format(
|
|
# self.getStatus(), self._tdeSubProcess)
|
|
|
|
def getStatus(self):
|
|
'''
|
|
Get the status of the process being managed. (misnomer alert!)
|
|
'''
|
|
return self._status
|
|
|
|
def setStatus(self, statusVal: int):
|
|
self._status.set(statusVal)
|
|
|
|
# Start the thread (with sub process), and wait for the sub service
|
|
# to become fully operational
|
|
def _start(self, subProc :TdeSubProcess, logDir: str):
|
|
'''
|
|
Request the manager thread to start a new sub process, and manage it.
|
|
|
|
:param cmdLine: the command line to invoke
|
|
:param logDir: the logging directory, to hold stdout/stderr files
|
|
'''
|
|
# if self._thread:
|
|
# raise RuntimeError("Unexpected _thread")
|
|
# if self._tdeSubProcess:
|
|
# raise RuntimeError("TDengine sub process already created/running")
|
|
|
|
# Moved to TdeSubProcess
|
|
# Logging.info("Attempting to start TAOS service: {}".format(self))
|
|
|
|
self._status.set(Status.STATUS_STARTING)
|
|
# self._tdeSubProcess = TdeSubProcess.start(cmdLine) # TODO: verify process is running
|
|
|
|
self._ipcQueue = Queue() # type: Queue
|
|
self._thread = threading.Thread( # First thread captures server OUTPUT
|
|
target=self.svcOutputReader,
|
|
args=(subProc.getIpcStdOut(), self._ipcQueue, logDir))
|
|
self._thread.daemon = True # thread dies with the program
|
|
self._thread.start()
|
|
time.sleep(0.01)
|
|
if not self._thread.is_alive(): # What happened?
|
|
Logging.info("Failed to start process to monitor STDOUT")
|
|
self.stop()
|
|
raise CrashGenError("Failed to start thread to monitor STDOUT")
|
|
Logging.info("Successfully started process to monitor STDOUT")
|
|
|
|
self._thread2 = threading.Thread( # 2nd thread captures server ERRORs
|
|
target=self.svcErrorReader,
|
|
args=(subProc.getIpcStdErr(), self._ipcQueue, logDir))
|
|
self._thread2.daemon = True # thread dies with the program
|
|
self._thread2.start()
|
|
time.sleep(0.01)
|
|
if not self._thread2.is_alive():
|
|
self.stop()
|
|
raise CrashGenError("Failed to start thread to monitor STDERR")
|
|
|
|
# wait for service to start
|
|
for i in range(0, 100):
|
|
time.sleep(1.0)
|
|
# self.procIpcBatch() # don't pump message during start up
|
|
Progress.emit(Progress.SERVICE_START_NAP)
|
|
# print("_zz_", end="", flush=True)
|
|
if self._status.isRunning():
|
|
Logging.info("[] TDengine service READY to process requests: pid={}".format(subProc.getPid()))
|
|
# Logging.info("[] TAOS service started: {}".format(self))
|
|
# self._verifyDnode(self._tInst) # query and ensure dnode is ready
|
|
# Logging.debug("[] TAOS Dnode verified: {}".format(self))
|
|
return # now we've started
|
|
# TODO: handle failure-to-start better?
|
|
self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output
|
|
raise RuntimeError("TDengine service DID NOT achieve READY status: pid={}".format(subProc.getPid()))
|
|
|
|
def _verifyDnode(self, tInst: TdeInstance):
|
|
dbc = DbConn.createNative(tInst.getDbTarget())
|
|
dbc.open()
|
|
dbc.query("show dnodes")
|
|
# dbc.query("DESCRIBE {}.{}".format(dbName, self._stName))
|
|
cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason
|
|
# ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type
|
|
isValid = False
|
|
for col in cols:
|
|
# print("col = {}".format(col))
|
|
ep = col[1].split(':') # 10.1.30.2:6030
|
|
print("Found ep={}".format(ep))
|
|
if tInst.getPort() == int(ep[1]): # That's us
|
|
# print("Valid Dnode matched!")
|
|
isValid = True # now we are valid
|
|
break
|
|
if not isValid:
|
|
print("Failed to start dnode, sleep for a while")
|
|
time.sleep(10.0)
|
|
raise RuntimeError("Failed to start Dnode, expected port not found: {}".
|
|
format(tInst.getPort()))
|
|
dbc.close()
|
|
|
|
def stop(self):
|
|
# can be called from both main thread or signal handler
|
|
|
|
# Linux will send Control-C generated SIGINT to the TDengine process already, ref:
|
|
# https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes
|
|
|
|
self.join() # stop the thread, status change moved to TdeSubProcess
|
|
|
|
# Check if it's really stopped
|
|
outputLines = 10 # for last output
|
|
if self.getStatus().isStopped():
|
|
self.procIpcBatch(outputLines) # one last time
|
|
Logging.debug("End of TDengine Service Output")
|
|
Logging.info("----- TDengine Service (managed by SMT) is now terminated -----\n")
|
|
else:
|
|
print("WARNING: SMT did not terminate as expected")
|
|
|
|
def join(self):
|
|
# TODO: sanity check
|
|
s = self.getStatus()
|
|
if s.isStopping() or s.isStopped(): # we may be stopping ourselves, or have been stopped/killed by others
|
|
if self._thread or self._thread2 :
|
|
if self._thread:
|
|
self._thread.join()
|
|
self._thread = None
|
|
if self._thread2: # STD ERR thread
|
|
self._thread2.join()
|
|
self._thread2 = None
|
|
else:
|
|
Logging.warning("Joining empty thread, doing nothing")
|
|
else:
|
|
raise RuntimeError(
|
|
"SMT.Join(): Unexpected status: {}".format(self._status))
|
|
|
|
def _trimQueue(self, targetSize):
|
|
if targetSize <= 0:
|
|
return # do nothing
|
|
q = self._ipcQueue
|
|
if (q.qsize() <= targetSize): # no need to trim
|
|
return
|
|
|
|
Logging.debug("Triming IPC queue to target size: {}".format(targetSize))
|
|
itemsToTrim = q.qsize() - targetSize
|
|
for i in range(0, itemsToTrim):
|
|
try:
|
|
q.get_nowait()
|
|
except Empty:
|
|
break # break out of for loop, no more trimming
|
|
|
|
TD_READY_MSG = "TDengine is initialized successfully"
|
|
|
|
def procIpcBatch(self, trimToTarget=0, forceOutput=False):
|
|
'''
|
|
Process a batch of STDOUT/STDERR data, until we read EMPTY from
|
|
the queue.
|
|
'''
|
|
self._trimQueue(trimToTarget) # trim if necessary
|
|
# Process all the output generated by the underlying sub process,
|
|
# managed by IO thread
|
|
print("<", end="", flush=True)
|
|
while True:
|
|
try:
|
|
line = self._ipcQueue.get_nowait() # getting output at fast speed
|
|
self._printProgress("_o")
|
|
except Empty:
|
|
# time.sleep(2.3) # wait only if there's no output
|
|
# no more output
|
|
print(".>", end="", flush=True)
|
|
return # we are done with THIS BATCH
|
|
else: # got line, printing out
|
|
if forceOutput:
|
|
Logging.info('[TAOSD] ' + line)
|
|
else:
|
|
Logging.debug('[TAOSD] ' + line)
|
|
print(">", end="", flush=True)
|
|
|
|
_ProgressBars = ["--", "//", "||", "\\\\"]
|
|
|
|
def _printProgress(self, msg): # TODO: assuming 2 chars
|
|
print(msg, end="", flush=True)
|
|
pBar = self._ProgressBars[Dice.throw(4)]
|
|
print(pBar, end="", flush=True)
|
|
print('\b\b\b\b', end="", flush=True)
|
|
|
|
BinaryChunk = NewType('BinaryChunk', bytes) # line with binary data, directly from STDOUT, etc.
|
|
TextChunk = NewType('TextChunk', str) # properly decoded, suitable for printing, etc.
|
|
|
|
@classmethod
|
|
def _decodeBinaryChunk(cls, bChunk: bytes) -> Optional[TextChunk] :
|
|
try:
|
|
tChunk = bChunk.decode("utf-8").rstrip()
|
|
return cls.TextChunk(tChunk)
|
|
except UnicodeError:
|
|
print("\nNon-UTF8 server output: {}\n".format(bChunk.decode('cp437')))
|
|
return None
|
|
|
|
def _textChunkGenerator(self, streamIn: IpcStream, logDir: str, logFile: str
|
|
) -> Generator[TextChunk, None, None]:
|
|
'''
|
|
Take an input stream with binary data (likely from Popen), produced a generator of decoded
|
|
"text chunks".
|
|
|
|
Side effect: it also save the original binary data in a log file.
|
|
'''
|
|
os.makedirs(logDir, exist_ok=True)
|
|
logF = open(os.path.join(logDir, logFile), 'wb')
|
|
if logF is None:
|
|
Logging.error("Failed to open log file (binary write): {}/{}".format(logDir, logFile))
|
|
return
|
|
for bChunk in iter(streamIn.readline, b''):
|
|
logF.write(bChunk) # Write to log file immediately
|
|
tChunk = self._decodeBinaryChunk(bChunk) # decode
|
|
if tChunk is not None:
|
|
yield tChunk # TODO: split into actual text lines
|
|
|
|
# At the end...
|
|
streamIn.close() # Close the incoming stream
|
|
logF.close() # Close the log file
|
|
|
|
def svcOutputReader(self, ipcStdOut: IpcStream, queue, logDir: str):
|
|
'''
|
|
The infinite routine that processes the STDOUT stream for the sub process being managed.
|
|
|
|
:param ipcStdOut: the IO stream object used to fetch the data from
|
|
:param queue: the queue where we dump the roughly parsed chunk-by-chunk text data
|
|
:param logDir: where we should dump a verbatim output file
|
|
'''
|
|
|
|
# Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
|
|
# print("This is the svcOutput Reader...")
|
|
# stdOut.readline() # Skip the first output? TODO: remove?
|
|
for tChunk in self._textChunkGenerator(ipcStdOut, logDir, 'stdout.log') :
|
|
queue.put(tChunk) # tChunk garanteed not to be None
|
|
self._printProgress("_i")
|
|
|
|
if self._status.isStarting(): # we are starting, let's see if we have started
|
|
if tChunk.find(self.TD_READY_MSG) != -1: # found
|
|
Logging.info("Waiting for the service to become FULLY READY")
|
|
time.sleep(1.0) # wait for the server to truly start. TODO: remove this
|
|
Logging.info("Service is now FULLY READY") # TODO: more ID info here?
|
|
self._status.set(Status.STATUS_RUNNING)
|
|
|
|
# Trim the queue if necessary: TODO: try this 1 out of 10 times
|
|
self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size
|
|
|
|
if self._status.isStopping(): # TODO: use thread status instead
|
|
# WAITING for stopping sub process to finish its outptu
|
|
print("_w", end="", flush=True)
|
|
|
|
# queue.put(line)
|
|
# stdOut has no more data, meaning sub process must have died
|
|
Logging.info("EOF found TDengine STDOUT, marking the process as terminated")
|
|
self.setStatus(Status.STATUS_STOPPED)
|
|
|
|
def svcErrorReader(self, ipcStdErr: IpcStream, queue, logDir: str):
|
|
# os.makedirs(logDir, exist_ok=True)
|
|
# logFile = os.path.join(logDir,'stderr.log')
|
|
# fErr = open(logFile, 'wb')
|
|
# for line in iter(err.readline, b''):
|
|
for tChunk in self._textChunkGenerator(ipcStdErr, logDir, 'stderr.log') :
|
|
queue.put(tChunk) # tChunk garanteed not to be None
|
|
# fErr.write(line)
|
|
Logging.info("TDengine STDERR: {}".format(tChunk))
|
|
Logging.info("EOF for TDengine STDERR")
|