homework-jianmu/source/libs/sync/src/syncRaftLog.c

398 lines
13 KiB
C

/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "syncRaftLog.h"
#include "syncRaftCfg.h"
#include "syncRaftStore.h"
#include "syncUtil.h"
// log[m .. n]
// public function
static int32_t raftLogRestoreFromSnapshot(struct SSyncLogStore* pLogStore, SyncIndex snapshotIndex);
static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, bool forceSync);
static int32_t raftLogTruncate(struct SSyncLogStore* pLogStore, SyncIndex fromIndex);
static bool raftLogExist(struct SSyncLogStore* pLogStore, SyncIndex index);
static int32_t raftLogUpdateCommitIndex(SSyncLogStore* pLogStore, SyncIndex index);
static SyncIndex raftlogCommitIndex(SSyncLogStore* pLogStore);
static int32_t raftLogGetLastEntry(SSyncLogStore* pLogStore, SSyncRaftEntry** ppLastEntry);
SSyncLogStore* logStoreCreate(SSyncNode* pSyncNode) {
SSyncLogStore* pLogStore = taosMemoryCalloc(1, sizeof(SSyncLogStore));
if (pLogStore == NULL) {
terrno = TSDB_CODE_OUT_OF_MEMORY;
return NULL;
}
// pLogStore->pCache = taosLRUCacheInit(10 * 1024 * 1024, 1, .5);
pLogStore->pCache = taosLRUCacheInit(30 * 1024 * 1024, 1, .5);
if (pLogStore->pCache == NULL) {
taosMemoryFree(pLogStore);
terrno = TSDB_CODE_OUT_OF_MEMORY;
return NULL;
}
pLogStore->cacheHit = 0;
pLogStore->cacheMiss = 0;
taosLRUCacheSetStrictCapacity(pLogStore->pCache, false);
pLogStore->data = taosMemoryMalloc(sizeof(SSyncLogStoreData));
ASSERT(pLogStore->data != NULL);
SSyncLogStoreData* pData = pLogStore->data;
pData->pSyncNode = pSyncNode;
pData->pWal = pSyncNode->pWal;
ASSERT(pData->pWal != NULL);
taosThreadMutexInit(&(pData->mutex), NULL);
pData->pWalHandle = walOpenReader(pData->pWal, NULL);
ASSERT(pData->pWalHandle != NULL);
pLogStore->syncLogUpdateCommitIndex = raftLogUpdateCommitIndex;
pLogStore->syncLogCommitIndex = raftlogCommitIndex;
pLogStore->syncLogRestoreFromSnapshot = raftLogRestoreFromSnapshot;
pLogStore->syncLogBeginIndex = raftLogBeginIndex;
pLogStore->syncLogEndIndex = raftLogEndIndex;
pLogStore->syncLogIsEmpty = raftLogIsEmpty;
pLogStore->syncLogEntryCount = raftLogEntryCount;
pLogStore->syncLogLastIndex = raftLogLastIndex;
pLogStore->syncLogLastTerm = raftLogLastTerm;
pLogStore->syncLogAppendEntry = raftLogAppendEntry;
pLogStore->syncLogGetEntry = raftLogGetEntry;
pLogStore->syncLogTruncate = raftLogTruncate;
pLogStore->syncLogWriteIndex = raftLogWriteIndex;
pLogStore->syncLogExist = raftLogExist;
return pLogStore;
}
void logStoreDestory(SSyncLogStore* pLogStore) {
if (pLogStore != NULL) {
SSyncLogStoreData* pData = pLogStore->data;
taosThreadMutexLock(&(pData->mutex));
if (pData->pWalHandle != NULL) {
walCloseReader(pData->pWalHandle);
pData->pWalHandle = NULL;
}
taosThreadMutexUnlock(&(pData->mutex));
taosThreadMutexDestroy(&(pData->mutex));
taosMemoryFree(pLogStore->data);
taosLRUCacheEraseUnrefEntries(pLogStore->pCache);
taosLRUCacheCleanup(pLogStore->pCache);
taosMemoryFree(pLogStore);
}
}
// log[m .. n]
static int32_t raftLogRestoreFromSnapshot(struct SSyncLogStore* pLogStore, SyncIndex snapshotIndex) {
ASSERT(snapshotIndex >= 0);
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
int32_t code = walRestoreFromSnapshot(pWal, snapshotIndex);
if (code != 0) {
int32_t err = terrno;
const char* errStr = tstrerror(err);
int32_t sysErr = errno;
const char* sysErrStr = strerror(errno);
sNError(pData->pSyncNode,
"wal restore from snapshot error, index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s", snapshotIndex,
err, errStr, sysErr, sysErrStr);
return -1;
}
return 0;
}
SyncIndex raftLogBeginIndex(struct SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
SyncIndex firstVer = walGetFirstVer(pWal);
return firstVer;
}
SyncIndex raftLogEndIndex(struct SSyncLogStore* pLogStore) { return raftLogLastIndex(pLogStore); }
bool raftLogIsEmpty(struct SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
return walIsEmpty(pWal);
}
int32_t raftLogEntryCount(struct SSyncLogStore* pLogStore) {
SyncIndex beginIndex = raftLogBeginIndex(pLogStore);
SyncIndex endIndex = raftLogEndIndex(pLogStore);
int32_t count = endIndex - beginIndex + 1;
return count > 0 ? count : 0;
}
SyncIndex raftLogLastIndex(struct SSyncLogStore* pLogStore) {
SyncIndex lastIndex;
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
SyncIndex lastVer = walGetLastVer(pWal);
return lastVer;
}
SyncIndex raftLogWriteIndex(struct SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
SyncIndex lastVer = walGetLastVer(pWal);
return lastVer + 1;
}
static bool raftLogExist(struct SSyncLogStore* pLogStore, SyncIndex index) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
bool b = walLogExist(pWal, index);
return b;
}
// if success, return last term
// if not log, return 0
// if error, return SYNC_TERM_INVALID
SyncTerm raftLogLastTerm(struct SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
if (walIsEmpty(pWal)) {
return 0;
} else {
SSyncRaftEntry* pLastEntry;
int32_t code = raftLogGetLastEntry(pLogStore, &pLastEntry);
if (code == 0 && pLastEntry != NULL) {
SyncTerm lastTerm = pLastEntry->term;
taosMemoryFree(pLastEntry);
return lastTerm;
} else {
return SYNC_TERM_INVALID;
}
}
// can not be here!
return SYNC_TERM_INVALID;
}
static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, bool forceSync) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
SyncIndex index = 0;
SWalSyncInfo syncMeta = {0};
syncMeta.isWeek = pEntry->isWeak;
syncMeta.seqNum = pEntry->seqNum;
syncMeta.term = pEntry->term;
int64_t tsWriteBegin = taosGetTimestampNs();
index = walAppendLog(pWal, pEntry->index, pEntry->originalRpcType, syncMeta, pEntry->data, pEntry->dataLen);
int64_t tsWriteEnd = taosGetTimestampNs();
int64_t tsElapsed = tsWriteEnd - tsWriteBegin;
if (index < 0) {
int32_t err = terrno;
const char* errStr = tstrerror(err);
int32_t sysErr = errno;
const char* sysErrStr = strerror(errno);
sNError(pData->pSyncNode, "wal write error, index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s",
pEntry->index, err, errStr, sysErr, sysErrStr);
return -1;
}
ASSERT(pEntry->index == index);
walFsync(pWal, forceSync);
sNTrace(pData->pSyncNode, "write index:%" PRId64 ", type:%s, origin type:%s, elapsed:%" PRId64, pEntry->index,
TMSG_INFO(pEntry->msgType), TMSG_INFO(pEntry->originalRpcType), tsElapsed);
return 0;
}
// entry found, return 0
// entry not found, return -1, terrno = TSDB_CODE_WAL_LOG_NOT_EXIST
// other error, return -1
int32_t raftLogGetEntry(struct SSyncLogStore* pLogStore, SyncIndex index, SSyncRaftEntry** ppEntry) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
int32_t code = 0;
*ppEntry = NULL;
int64_t ts1 = taosGetTimestampNs();
taosThreadMutexLock(&(pData->mutex));
SWalReader* pWalHandle = pData->pWalHandle;
if (pWalHandle == NULL) {
terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
sError("vgId:%d, wal handle is NULL", pData->pSyncNode->vgId);
taosThreadMutexUnlock(&(pData->mutex));
return -1;
}
int64_t ts2 = taosGetTimestampNs();
code = walReadVer(pWalHandle, index);
walReadReset(pWalHandle);
int64_t ts3 = taosGetTimestampNs();
// code = walReadVerCached(pWalHandle, index);
if (code != 0) {
int32_t err = terrno;
const char* errStr = tstrerror(err);
int32_t sysErr = errno;
const char* sysErrStr = strerror(errno);
if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) {
sNTrace(pData->pSyncNode, "wal read not exist, index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s", index,
err, errStr, sysErr, sysErrStr);
} else {
sNTrace(pData->pSyncNode, "wal read error, index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s", index, err,
errStr, sysErr, sysErrStr);
}
/*
int32_t saveErr = terrno;
walCloseReadHandle(pWalHandle);
terrno = saveErr;
*/
taosThreadMutexUnlock(&(pData->mutex));
return code;
}
*ppEntry = syncEntryBuild(pWalHandle->pHead->head.bodyLen);
ASSERT(*ppEntry != NULL);
(*ppEntry)->msgType = TDMT_SYNC_CLIENT_REQUEST;
(*ppEntry)->originalRpcType = pWalHandle->pHead->head.msgType;
(*ppEntry)->seqNum = pWalHandle->pHead->head.syncMeta.seqNum;
(*ppEntry)->isWeak = pWalHandle->pHead->head.syncMeta.isWeek;
(*ppEntry)->term = pWalHandle->pHead->head.syncMeta.term;
(*ppEntry)->index = index;
ASSERT((*ppEntry)->dataLen == pWalHandle->pHead->head.bodyLen);
memcpy((*ppEntry)->data, pWalHandle->pHead->head.body, pWalHandle->pHead->head.bodyLen);
/*
int32_t saveErr = terrno;
walCloseReadHandle(pWalHandle);
terrno = saveErr;
*/
taosThreadMutexUnlock(&(pData->mutex));
int64_t ts4 = taosGetTimestampNs();
int64_t tsElapsed = ts4 - ts1;
int64_t tsElapsedLock = ts2 - ts1;
int64_t tsElapsedRead = ts3 - ts2;
int64_t tsElapsedBuild = ts4 - ts3;
sNTrace(pData->pSyncNode,
"read index:%" PRId64 ", elapsed:%" PRId64 ", elapsed-lock:%" PRId64 ", elapsed-read:%" PRId64
", elapsed-build:%" PRId64,
index, tsElapsed, tsElapsedLock, tsElapsedRead, tsElapsedBuild);
return code;
}
// truncate semantic
static int32_t raftLogTruncate(struct SSyncLogStore* pLogStore, SyncIndex fromIndex) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
int32_t code = walRollback(pWal, fromIndex);
if (code != 0) {
int32_t err = terrno;
const char* errStr = tstrerror(err);
int32_t sysErr = errno;
const char* sysErrStr = strerror(errno);
sError("vgId:%d, wal truncate error, from-index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s",
pData->pSyncNode->vgId, fromIndex, err, errStr, sysErr, sysErrStr);
}
// event log
sNTrace(pData->pSyncNode, "log truncate, from-index:%" PRId64, fromIndex);
return code;
}
// entry found, return 0
// entry not found, return -1, terrno = TSDB_CODE_WAL_LOG_NOT_EXIST
// other error, return -1
static int32_t raftLogGetLastEntry(SSyncLogStore* pLogStore, SSyncRaftEntry** ppLastEntry) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
ASSERT(ppLastEntry != NULL);
*ppLastEntry = NULL;
if (walIsEmpty(pWal)) {
terrno = TSDB_CODE_WAL_LOG_NOT_EXIST;
return -1;
} else {
SyncIndex lastIndex = raftLogLastIndex(pLogStore);
ASSERT(lastIndex >= SYNC_INDEX_BEGIN);
int32_t code = raftLogGetEntry(pLogStore, lastIndex, ppLastEntry);
return code;
}
return -1;
}
int32_t raftLogUpdateCommitIndex(SSyncLogStore* pLogStore, SyncIndex index) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
// need not update
SyncIndex snapshotVer = walGetSnapshotVer(pWal);
SyncIndex walCommitVer = walGetCommittedVer(pWal);
SyncIndex wallastVer = walGetLastVer(pWal);
if (index < snapshotVer || index > wallastVer) {
// ignore
return 0;
}
int32_t code = walCommit(pWal, index);
if (code != 0) {
int32_t err = terrno;
const char* errStr = tstrerror(err);
int32_t sysErr = errno;
const char* sysErrStr = strerror(errno);
sError("vgId:%d, wal update commit index error, index:%" PRId64 ", err:0x%x, msg:%s, syserr:%d, sysmsg:%s",
pData->pSyncNode->vgId, index, err, errStr, sysErr, sysErrStr);
return -1;
}
return 0;
}
SyncIndex raftlogCommitIndex(SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
return pData->pSyncNode->commitIndex;
}
SyncIndex logStoreFirstIndex(SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
return walGetFirstVer(pWal);
}
SyncIndex logStoreWalCommitVer(SSyncLogStore* pLogStore) {
SSyncLogStoreData* pData = pLogStore->data;
SWal* pWal = pData->pWal;
return walGetCommittedVer(pWal);
}