Now locking create table operation, resolving TD-1471

This commit is contained in:
Steven Li 2020-10-30 04:42:03 +00:00
parent 54b34680d6
commit 3060fafd9f
2 changed files with 84 additions and 28 deletions

View File

@ -243,7 +243,7 @@ class WorkerThread:
class ThreadCoordinator: class ThreadCoordinator:
WORKER_THREAD_TIMEOUT = 180 # one minute WORKER_THREAD_TIMEOUT = 120 # Normal: 120
def __init__(self, pool: ThreadPool, dbManager: DbManager): def __init__(self, pool: ThreadPool, dbManager: DbManager):
self._curStep = -1 # first step is 0 self._curStep = -1 # first step is 0
@ -1177,6 +1177,8 @@ class Task():
instead. But a task is always associated with a DB instead. But a task is always associated with a DB
''' '''
taskSn = 100 taskSn = 100
_lock = threading.Lock()
_tableLocks: Dict[str, threading.Lock] = {}
@classmethod @classmethod
def allocTaskNum(cls): def allocTaskNum(cls):
@ -1198,6 +1200,8 @@ class Task():
self._execStats = execStats self._execStats = execStats
self._db = db # A task is always associated/for a specific DB self._db = db # A task is always associated/for a specific DB
def isSuccess(self): def isSuccess(self):
return self._err is None return self._err is None
@ -1351,6 +1355,24 @@ class Task():
def getQueryResult(self, wt: WorkerThread): # execute an SQL on the worker thread def getQueryResult(self, wt: WorkerThread): # execute an SQL on the worker thread
return wt.getQueryResult() return wt.getQueryResult()
def lockTable(self, ftName):  # full table name
    """Acquire the process-wide lock associated with one table.

    A lock object is created on first use for each distinct table name and
    kept in the class-level ``Task._tableLocks`` registry, so every task
    that names the same table contends on the same lock.

    :param ftName: full table name, used as the registry key.
    """
    # print(" <<" + ftName + '_', end="", flush=True)
    # Only the registry lookup/creation is done under the global guard.
    with Task._lock:
        if ftName not in Task._tableLocks:
            Task._tableLocks[ftName] = threading.Lock()
        tableLock = Task._tableLocks[ftName]
    # The (possibly blocking) acquire must happen after the guard is released:
    # unlockTable() needs Task._lock to release the table lock, so waiting
    # here while still holding Task._lock would deadlock both threads.
    tableLock.acquire()
def unlockTable(self, ftName):
    """Release the per-table lock registered under ``ftName``.

    :param ftName: full table name, the same key that was passed to
        lockTable().
    :raises RuntimeError: if no lock was ever registered for ``ftName``, or
        if the registered lock is not currently held.
    """
    # print('_' + ftName + ">> ", end="", flush=True)
    # The state check and the release are done together under the global
    # guard so the registry cannot change between them.
    with Task._lock:
        lock = Task._tableLocks.get(ftName)
        if lock is None:
            raise RuntimeError("Corrupt state, no such lock")
        if not lock.locked():
            raise RuntimeError("Corrupt state, already unlocked")
        lock.release()
class ExecutionStats: class ExecutionStats:
def __init__(self): def __init__(self):
@ -1461,7 +1483,7 @@ class StateTransitionTask(Task):
_baseTableNumber = None _baseTableNumber = None
_endState = None _endState = None # TODO: no longer used?
@classmethod @classmethod
def getInfo(cls): # each sub class should supply their own information def getInfo(cls): # each sub class should supply their own information
@ -1486,7 +1508,7 @@ class StateTransitionTask(Task):
@classmethod @classmethod
def getRegTableName(cls, i): def getRegTableName(cls, i):
if ( StateTransitionTask._baseTableNumber is None): if ( StateTransitionTask._baseTableNumber is None): # Set it one time
StateTransitionTask._baseTableNumber = Dice.throw( StateTransitionTask._baseTableNumber = Dice.throw(
999) if gConfig.dynamic_db_table_names else 0 999) if gConfig.dynamic_db_table_names else 0
return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i) return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i)
@ -1583,14 +1605,23 @@ class TdSuperTable:
def hasRegTables(self, dbc: DbConn, dbName: str): def hasRegTables(self, dbc: DbConn, dbName: str):
return dbc.query("SELECT * FROM {}.{}".format(dbName, self._stName)) > 0 return dbc.query("SELECT * FROM {}.{}".format(dbName, self._stName)) > 0
def ensureTable(self, dbc: DbConn, dbName: str, regTableName: str): def ensureTable(self, task: Task, dbc: DbConn, dbName: str, regTableName: str):
sql = "select tbname from {}.{} where tbname in ('{}')".format(dbName, self._stName, regTableName) sql = "select tbname from {}.{} where tbname in ('{}')".format(dbName, self._stName, regTableName)
if dbc.query(sql) >= 1 : # reg table exists already if dbc.query(sql) >= 1 : # reg table exists already
return return
sql = "CREATE TABLE {}.{} USING {}.{} tags ({})".format(
dbName, regTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName) # acquire a lock first, so as to be able to *verify*. More details in TD-1471
) fullTableName = dbName + '.' + regTableName
dbc.execute(sql) task.lockTable(fullTableName)
Progress.emit(Progress.CREATE_TABLE_ATTEMPT) # ATTEMPT to create a new table
print("(" + fullTableName[-3:] + ")", end="", flush=True)
try:
sql = "CREATE TABLE {} USING {}.{} tags ({})".format(
fullTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName)
)
dbc.execute(sql)
finally:
task.unlockTable(fullTableName) # no matter what
def _getTagStrForSql(self, dbc, dbName: str) : def _getTagStrForSql(self, dbc, dbName: str) :
tags = self._getTags(dbc, dbName) tags = self._getTags(dbc, dbName)
@ -1862,7 +1893,12 @@ class TaskAddData(StateTransitionTask):
sTable = db.getFixedSuperTable() sTable = db.getFixedSuperTable()
regTableName = self.getRegTableName(i) # "db.reg_table_{}".format(i) regTableName = self.getRegTableName(i) # "db.reg_table_{}".format(i)
sTable.ensureTable(wt.getDbConn(), db.getName(), regTableName) # Ensure the table exists
fullTableName = db.getName() + '.' + regTableName
# self._lockTable(fullTableName) # "create table" below. Stop it if the table is "locked"
sTable.ensureTable(self, wt.getDbConn(), db.getName(), regTableName) # Ensure the table exists
# self._unlockTable(fullTableName)
for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS): # number of records per table for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS): # number of records per table
nextInt = db.getNextInt() nextInt = db.getNextInt()
@ -1872,27 +1908,29 @@ class TaskAddData(StateTransitionTask):
self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName)) self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName))
self.fAddLogReady.flush() self.fAddLogReady.flush()
os.fsync(self.fAddLogReady) os.fsync(self.fAddLogReady)
sql = "insert into {}.{} values ('{}', {});".format( # removed: tags ('{}', {})
db.getName(), # TODO: too ugly trying to lock the table reliably, refactor...
regTableName, fullTableName = db.getName() + '.' + regTableName
# ds.getFixedSuperTableName(), if gConfig.verify_data:
# ds.getNextBinary(), ds.getNextFloat(), self.lockTable(fullTableName)
nextTick, nextInt) # print("_w" + str(nextInt % 100), end="", flush=True) # Trace what was written
dbc.execute(sql)
# Successfully wrote the data into the DB, let's record it try:
# somehow sql = "insert into {} values ('{}', {});".format( # removed: tags ('{}', {})
te.recordDataMark(nextInt) fullTableName,
if gConfig.record_ops: # ds.getFixedSuperTableName(),
self.fAddLogDone.write( # ds.getNextBinary(), ds.getNextFloat(),
"Wrote {} to {}\n".format( nextTick, nextInt)
nextInt, regTableName)) dbc.execute(sql)
self.fAddLogDone.flush() except: # Any exception at all
os.fsync(self.fAddLogDone) if gConfig.verify_data:
self.unlockTable(fullTableName)
raise
# Now read it back and verify, we might encounter an error if table is dropped # Now read it back and verify, we might encounter an error if table is dropped
if gConfig.verify_data: # only if command line asks for it if gConfig.verify_data: # only if command line asks for it
try: try:
readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts= '{}'". readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts='{}'".
format(db.getName(), regTableName, nextTick)) format(db.getName(), regTableName, nextTick))
if readBack != nextInt : if readBack != nextInt :
raise taos.error.ProgrammingError( raise taos.error.ProgrammingError(
@ -1905,8 +1943,23 @@ class TaskAddData(StateTransitionTask):
"Failed to read back same data for tick: {}, wrote: {}, read: {}" "Failed to read back same data for tick: {}, wrote: {}, read: {}"
.format(nextTick, nextInt, "Empty Result" if errno==0x991 else "Multiple Result"), .format(nextTick, nextInt, "Empty Result" if errno==0x991 else "Multiple Result"),
errno) errno)
# Re-throw no matter what elif errno in [0x218, 0x362]: # table doesn't exist
raise # do nothing
dummy = 0
else:
# Re-throw otherwise
raise
finally:
self.unlockTable(fullTableName) # Unlock the table no matter what
# Successfully wrote the data into the DB, let's record it somehow
te.recordDataMark(nextInt)
if gConfig.record_ops:
self.fAddLogDone.write("Wrote {} to {}\n".format(nextInt, regTableName))
self.fAddLogDone.flush()
os.fsync(self.fAddLogDone)
self.activeTable.discard(i) # not raising an error, unlike remove self.activeTable.discard(i) # not raising an error, unlike remove

View File

@ -166,6 +166,8 @@ class Progress:
SERVICE_RECONNECT_START = 4 SERVICE_RECONNECT_START = 4
SERVICE_RECONNECT_SUCCESS = 5 SERVICE_RECONNECT_SUCCESS = 5
SERVICE_RECONNECT_FAILURE = 6 SERVICE_RECONNECT_FAILURE = 6
CREATE_TABLE_ATTEMPT = 7
tokens = { tokens = {
STEP_BOUNDARY: '.', STEP_BOUNDARY: '.',
BEGIN_THREAD_STEP: '[', BEGIN_THREAD_STEP: '[',
@ -174,6 +176,7 @@ class Progress:
SERVICE_RECONNECT_START: '<r.', SERVICE_RECONNECT_START: '<r.',
SERVICE_RECONNECT_SUCCESS: '.r>', SERVICE_RECONNECT_SUCCESS: '.r>',
SERVICE_RECONNECT_FAILURE: '.xr>', SERVICE_RECONNECT_FAILURE: '.xr>',
CREATE_TABLE_ATTEMPT: '_c',
} }
@classmethod @classmethod