Merge branch '3.0' into feature/vnode
This commit is contained in:
commit
9188298eb2
|
@ -48,6 +48,7 @@ endif(${BUILD_WITH_ROCKSDB})
|
||||||
## lucene
|
## lucene
|
||||||
if(${BUILD_WITH_LUCENE})
|
if(${BUILD_WITH_LUCENE})
|
||||||
cat("${CMAKE_SUPPORT_DIR}/lucene_CMakeLists.txt.in" ${DEPS_TMP_FILE})
|
cat("${CMAKE_SUPPORT_DIR}/lucene_CMakeLists.txt.in" ${DEPS_TMP_FILE})
|
||||||
|
add_definitions(-DUSE_LUCENE)
|
||||||
endif(${BUILD_WITH_LUCENE})
|
endif(${BUILD_WITH_LUCENE})
|
||||||
|
|
||||||
## NuRaft
|
## NuRaft
|
||||||
|
|
|
@ -22,7 +22,7 @@ option(
|
||||||
option(
|
option(
|
||||||
BUILD_WITH_LUCENE
|
BUILD_WITH_LUCENE
|
||||||
"If build with lucene"
|
"If build with lucene"
|
||||||
OFF
|
off
|
||||||
)
|
)
|
||||||
|
|
||||||
option(
|
option(
|
||||||
|
@ -41,4 +41,4 @@ option(
|
||||||
BUILD_DOCS
|
BUILD_DOCS
|
||||||
"If use doxygen build documents"
|
"If use doxygen build documents"
|
||||||
ON
|
ON
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
|
|
||||||
# lucene
|
# lucene
|
||||||
ExternalProject_Add(lucene
|
ExternalProject_Add(lucene
|
||||||
GIT_REPOSITORY https://github.com/taosdata-contrib/LucenePlusPlus.git
|
GIT_REPOSITORY https://github.com/yihaoDeng/LucenePlusPlus.git
|
||||||
GIT_TAG rel_3.0.8_td
|
|
||||||
SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/lucene"
|
SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/lucene"
|
||||||
BINARY_DIR ""
|
BINARY_DIR ""
|
||||||
#BUILD_IN_SOURCE TRUE
|
#BUILD_IN_SOURCE TRUE
|
||||||
|
@ -10,4 +9,4 @@ ExternalProject_Add(lucene
|
||||||
BUILD_COMMAND ""
|
BUILD_COMMAND ""
|
||||||
INSTALL_COMMAND ""
|
INSTALL_COMMAND ""
|
||||||
TEST_COMMAND ""
|
TEST_COMMAND ""
|
||||||
)
|
)
|
||||||
|
|
|
@ -68,6 +68,11 @@ endif(${BUILD_WITH_ROCKSDB})
|
||||||
if(${BUILD_WITH_LUCENE})
|
if(${BUILD_WITH_LUCENE})
|
||||||
option(ENABLE_TEST "Enable the tests" OFF)
|
option(ENABLE_TEST "Enable the tests" OFF)
|
||||||
add_subdirectory(lucene)
|
add_subdirectory(lucene)
|
||||||
|
target_include_directories(
|
||||||
|
lucene++
|
||||||
|
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/lucene/include>
|
||||||
|
)
|
||||||
|
|
||||||
endif(${BUILD_WITH_LUCENE})
|
endif(${BUILD_WITH_LUCENE})
|
||||||
|
|
||||||
# NuRaft
|
# NuRaft
|
||||||
|
|
|
@ -109,11 +109,10 @@ typedef struct TqTopicVhandle {
|
||||||
|
|
||||||
#define TQ_BUFFER_SIZE 8
|
#define TQ_BUFFER_SIZE 8
|
||||||
|
|
||||||
// TODO: define a serializer and deserializer
|
|
||||||
typedef struct TqBufferItem {
|
typedef struct TqBufferItem {
|
||||||
int64_t offset;
|
int64_t offset;
|
||||||
// executors are identical but not concurrent
|
// executors are identical but not concurrent
|
||||||
// so it must be a copy in each item
|
// so there must be a copy in each item
|
||||||
void* executor;
|
void* executor;
|
||||||
int64_t size;
|
int64_t size;
|
||||||
void* content;
|
void* content;
|
||||||
|
@ -156,23 +155,111 @@ typedef struct TqQueryMsg {
|
||||||
|
|
||||||
typedef struct TqLogReader {
|
typedef struct TqLogReader {
|
||||||
void* logHandle;
|
void* logHandle;
|
||||||
int32_t (*walRead)(void* logHandle, void** data, int64_t ver);
|
int32_t (*logRead)(void* logHandle, void** data, int64_t ver);
|
||||||
int64_t (*walGetFirstVer)(void* logHandle);
|
int64_t (*logGetFirstVer)(void* logHandle);
|
||||||
int64_t (*walGetSnapshotVer)(void* logHandle);
|
int64_t (*logGetSnapshotVer)(void* logHandle);
|
||||||
int64_t (*walGetLastVer)(void* logHandle);
|
int64_t (*logGetLastVer)(void* logHandle);
|
||||||
} TqLogReader;
|
} TqLogReader;
|
||||||
|
|
||||||
typedef struct TqConfig {
|
typedef struct TqConfig {
|
||||||
// TODO
|
// TODO
|
||||||
} TqConfig;
|
} TqConfig;
|
||||||
|
|
||||||
|
// Bundles the allocator factory handed to tqOpen() with the allocator
// instance that tqOpen() creates from it.
typedef struct TqMemRef {
|
||||||
|
// factory passed in by the caller of tqOpen() (field name spelling is part of the API)
SMemAllocatorFactory *pAlloctorFactory;
|
||||||
|
// result of pAlloctorFactory->create(), as assigned in tqOpen()
SMemAllocator *pAllocator;
|
||||||
|
} TqMemRef;
|
||||||
|
|
||||||
|
// Header that prefixes every serialized record; the payload follows in content[].
typedef struct TqSerializedHead {
|
||||||
|
// record format version — presumably TQ_SVER; confirm against the writer
int16_t ver;
|
||||||
|
// one of the TQ_ACTION_* values; the loader branches on INUSE / INTXN / INUSE_CONT
int16_t action;
|
||||||
|
// NOTE(review): checksum algorithm and coverage are not visible in this view
int32_t checksum;
|
||||||
|
// total record size in bytes, this header included; a record whose ssize equals
// sizeof(TqSerializedHead) has no payload and is loaded as TQ_DELETE_TOKEN
int64_t ssize;
|
||||||
|
// flexible array member: serializers write the object bytes starting here
char content[];
|
||||||
|
} TqSerializedHead;
|
||||||
|
|
||||||
|
typedef int (*TqSerializeFun)(const void* pObj, TqSerializedHead** ppHead);
|
||||||
|
typedef const void* (*TqDeserializeFun)(const TqSerializedHead* pHead, void** ppObj);
|
||||||
|
typedef void (*TqDeleteFun)(void*);
|
||||||
|
|
||||||
|
#define TQ_BUCKET_MASK 0xFF
|
||||||
|
#define TQ_BUCKET_SIZE 256
|
||||||
|
|
||||||
|
#define TQ_PAGE_SIZE 4096
|
||||||
|
//key + offset + size
|
||||||
|
#define TQ_IDX_SIZE 24
|
||||||
|
//4096 / 24
|
||||||
|
#define TQ_MAX_IDX_ONE_PAGE 170
|
||||||
|
//24 * 170
|
||||||
|
#define TQ_IDX_PAGE_BODY_SIZE 4080
|
||||||
|
//4096 - 4080
|
||||||
|
#define TQ_IDX_PAGE_HEAD_SIZE 16
|
||||||
|
|
||||||
|
#define TQ_ACTION_CONST 0
|
||||||
|
#define TQ_ACTION_INUSE 1
|
||||||
|
#define TQ_ACTION_INUSE_CONT 2
|
||||||
|
#define TQ_ACTION_INTXN 3
|
||||||
|
|
||||||
|
#define TQ_SVER 0
|
||||||
|
|
||||||
|
//TODO: inplace mode is not implemented
|
||||||
|
#define TQ_UPDATE_INPLACE 0
|
||||||
|
#define TQ_UPDATE_APPEND 1
|
||||||
|
|
||||||
|
#define TQ_DUP_INTXN_REWRITE 0
|
||||||
|
#define TQ_DUP_INTXN_REJECT 2
|
||||||
|
|
||||||
|
static inline bool TqUpdateAppend(int32_t tqConfigFlag) {
|
||||||
|
return tqConfigFlag & TQ_UPDATE_APPEND;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool TqDupIntxnReject(int32_t tqConfigFlag) {
|
||||||
|
return tqConfigFlag & TQ_DUP_INTXN_REJECT;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const int8_t TQ_CONST_DELETE = TQ_ACTION_CONST;
|
||||||
|
#define TQ_DELETE_TOKEN (void*)&TQ_CONST_DELETE
|
||||||
|
|
||||||
|
// One keyed entry of the meta store: a committed value plus an optional
// in-transaction value.
typedef struct TqMetaHandle {
|
||||||
|
// lookup key; compared against the requested key when a bucket chain is walked
int64_t key;
|
||||||
|
// presumably the position of the serialized record in the data file — TODO confirm
int64_t offset;
|
||||||
|
// presumably the byte size of the serialized record — TODO confirm
int64_t serializedSize;
|
||||||
|
// committed value; may be NULL or the TQ_DELETE_TOKEN sentinel
void* valueInUse;
|
||||||
|
// pending value; tqHandleCommit() moves it into valueInUse and resets it to NULL
void* valueInTxn;
|
||||||
|
} TqMetaHandle;
|
||||||
|
|
||||||
|
// Node of the meta store: carries one handle and the links that chain it
// into a hash bucket and into the unpersisted list.
typedef struct TqMetaList {
|
||||||
|
// the entry stored in this node
TqMetaHandle handle;
|
||||||
|
// next node in the same bucket chain (followed when the store is torn down)
struct TqMetaList* next;
|
||||||
|
//struct TqMetaList* inTxnPrev;
|
||||||
|
//struct TqMetaList* inTxnNext;
|
||||||
|
// links for the list of nodes not yet persisted; maintained via tqLinkUnpersist()
struct TqMetaList* unpersistPrev;
|
||||||
|
struct TqMetaList* unpersistNext;
|
||||||
|
} TqMetaList;
|
||||||
|
|
||||||
|
// In-memory state of a meta store: hash buckets of entries, the list of
// entries awaiting persistence, the backing file descriptors and the
// callbacks used to (de)serialize and release stored values.
typedef struct TqMetaStore {
|
||||||
|
// bucket chains, TQ_BUCKET_SIZE slots
TqMetaList* bucket[TQ_BUCKET_SIZE];
|
||||||
|
//a table head
|
||||||
|
TqMetaList* unpersistHead;
|
||||||
|
//TODO: temporary use, to be replaced by unified tfile
|
||||||
|
// data file: tqStorePersist() writes serialized records to this descriptor
int fileFd;
|
||||||
|
//TODO: temporary use, to be replaced by unified tfile
|
||||||
|
// index file descriptor — presumably holds the key/offset/size entries; TODO confirm
int idxFd;
|
||||||
|
char* dirPath;
|
||||||
|
// flag word stored from tqStoreOpen(); see TqUpdateAppend() / TqDupIntxnReject()
int32_t tqConfigFlag;
|
||||||
|
// turns a stored value into a TqSerializedHead record
TqSerializeFun pSerializer;
|
||||||
|
// rebuilds a value from a TqSerializedHead record
TqDeserializeFun pDeserializer;
|
||||||
|
// releases a stored value; never invoked on NULL or TQ_DELETE_TOKEN
TqDeleteFun pDeleter;
|
||||||
|
} TqMetaStore;
|
||||||
|
|
||||||
typedef struct STQ {
|
typedef struct STQ {
|
||||||
// the collection of group handle
|
// the collection of group handle
|
||||||
// the handle of kvstore
|
// the handle of kvstore
|
||||||
const char* path;
|
char* path;
|
||||||
TqConfig* tqConfig;
|
TqConfig* tqConfig;
|
||||||
TqLogReader* tqLogReader;
|
TqLogReader* tqLogReader;
|
||||||
SMemAllocatorFactory* allocFac;
|
TqMemRef tqMemRef;
|
||||||
|
TqMetaStore* tqMeta;
|
||||||
} STQ;
|
} STQ;
|
||||||
|
|
||||||
// open in each vnode
|
// open in each vnode
|
||||||
|
@ -187,7 +274,7 @@ int tqConsume(STQ*, TmqConsumeReq*);
|
||||||
|
|
||||||
TqGroupHandle* tqGetGroupHandle(STQ*, int64_t cId);
|
TqGroupHandle* tqGetGroupHandle(STQ*, int64_t cId);
|
||||||
|
|
||||||
int tqOpenTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId);
|
TqGroupHandle* tqOpenTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId);
|
||||||
int tqCloseTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId);
|
int tqCloseTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId);
|
||||||
int tqMoveOffsetToNext(TqGroupHandle*);
|
int tqMoveOffsetToNext(TqGroupHandle*);
|
||||||
int tqResetOffset(STQ*, int64_t topicId, int64_t cgId, int64_t offset);
|
int tqResetOffset(STQ*, int64_t topicId, int64_t cgId, int64_t offset);
|
||||||
|
@ -195,18 +282,9 @@ int tqRegisterContext(TqGroupHandle*, void* ahandle);
|
||||||
int tqLaunchQuery(TqGroupHandle*);
|
int tqLaunchQuery(TqGroupHandle*);
|
||||||
int tqSendLaunchQuery(TqGroupHandle*);
|
int tqSendLaunchQuery(TqGroupHandle*);
|
||||||
|
|
||||||
int tqSerializeGroupHandle(TqGroupHandle* gHandle, void** ppBytes);
|
int tqSerializeGroupHandle(const TqGroupHandle* gHandle, TqSerializedHead** ppHead);
|
||||||
void* tqSerializeListHandle(TqListHandle* listHandle, void* ptr);
|
|
||||||
void* tqSerializeBufHandle(TqBufferHandle* bufHandle, void* ptr);
|
|
||||||
void* tqSerializeBufItem(TqBufferItem* bufItem, void* ptr);
|
|
||||||
|
|
||||||
const void* tqDeserializeGroupHandle(const void* pBytes, TqGroupHandle* ghandle);
|
const void* tqDeserializeGroupHandle(const TqSerializedHead* pHead, TqGroupHandle** gHandle);
|
||||||
const void* tqDeserializeBufHandle(const void* pBytes, TqBufferHandle* bufHandle);
|
|
||||||
const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem* bufItem);
|
|
||||||
|
|
||||||
int tqGetGHandleSSize(const TqGroupHandle* gHandle);
|
|
||||||
int tqBufHandleSSize();
|
|
||||||
int tqBufItemSSize();
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,12 +16,53 @@
|
||||||
#ifndef _TD_INDEX_H_
|
#ifndef _TD_INDEX_H_
|
||||||
#define _TD_INDEX_H_
|
#define _TD_INDEX_H_
|
||||||
|
|
||||||
|
#include "os.h"
|
||||||
|
#include "tarray.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
typedef struct SIndex SIndex;
|
||||||
|
typedef struct SIndexOpts SIndexOpts;
|
||||||
|
typedef struct SIndexMultiTermQuery SIndexMultiTermQuery;
|
||||||
|
typedef struct SArray SIndexMultiTerm;
|
||||||
|
|
||||||
|
typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType;
|
||||||
|
typedef enum { QUERY_TERM = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType;
|
||||||
|
/*
|
||||||
|
* @param: oper
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType oper);
|
||||||
|
void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery);
|
||||||
|
int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type);
|
||||||
|
/*
|
||||||
|
* @param:
|
||||||
|
* @param:
|
||||||
|
*/
|
||||||
|
SIndex* indexOpen(SIndexOpts *opt, const char *path);
|
||||||
|
void indexClose(SIndex *index);
|
||||||
|
int indexPut(SIndex *index, SIndexMultiTerm *terms, int uid);
|
||||||
|
int indexDelete(SIndex *index, SIndexMultiTermQuery *query);
|
||||||
|
int indexSearch(SIndex *index, SIndexMultiTermQuery *query, SArray *result);
|
||||||
|
int indexRebuild(SIndex *index, SIndexOpts *opt);
|
||||||
|
/*
|
||||||
|
* @param
|
||||||
|
* @param
|
||||||
|
*/
|
||||||
|
SIndexMultiTerm *indexMultiTermCreate();
|
||||||
|
int indexMultiTermAdd(SIndexMultiTerm *terms, const char *field, int32_t nFields, const char *value, int32_t nValue);
|
||||||
|
void indexMultiTermDestroy(SIndexMultiTerm *terms);
|
||||||
|
/*
|
||||||
|
* @param:
|
||||||
|
* @param:
|
||||||
|
*/
|
||||||
|
SIndexOpts *indexOptsCreate();
|
||||||
|
void indexOptsDestroy(SIndexOpts *opts);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /*_TD_INDEX_H_*/
|
#endif /*_TD_INDEX_H_*/
|
||||||
|
|
|
@ -44,8 +44,10 @@ typedef struct {
|
||||||
EWalType walLevel; // wal level
|
EWalType walLevel; // wal level
|
||||||
} SWalCfg;
|
} SWalCfg;
|
||||||
|
|
||||||
struct SWal;
|
typedef struct SWal {
|
||||||
typedef struct SWal SWal; // WAL HANDLE
|
int8_t unused;
|
||||||
|
} SWal; // WAL HANDLE
|
||||||
|
|
||||||
typedef int32_t (*FWalWrite)(void *ahandle, void *pHead, int32_t qtype, void *pMsg);
|
typedef int32_t (*FWalWrite)(void *ahandle, void *pHead, int32_t qtype, void *pMsg);
|
||||||
|
|
||||||
// module initialization
|
// module initialization
|
||||||
|
|
|
@ -17,97 +17,22 @@
|
||||||
#define _TQ_META_STORE_H_
|
#define _TQ_META_STORE_H_
|
||||||
|
|
||||||
#include "os.h"
|
#include "os.h"
|
||||||
|
#include "tq.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define TQ_BUCKET_MASK 0xFF
|
|
||||||
#define TQ_BUCKET_SIZE 256
|
|
||||||
|
|
||||||
#define TQ_PAGE_SIZE 4096
|
|
||||||
//key + offset + size
|
|
||||||
#define TQ_IDX_SIZE 24
|
|
||||||
//4096 / 24
|
|
||||||
#define TQ_MAX_IDX_ONE_PAGE 170
|
|
||||||
//24 * 170
|
|
||||||
#define TQ_IDX_PAGE_BODY_SIZE 4080
|
|
||||||
//4096 - 4080
|
|
||||||
#define TQ_IDX_PAGE_HEAD_SIZE 16
|
|
||||||
|
|
||||||
#define TQ_ACTION_CONST 0
|
|
||||||
#define TQ_ACTION_INUSE 1
|
|
||||||
#define TQ_ACTION_INUSE_CONT 2
|
|
||||||
#define TQ_ACTION_INTXN 3
|
|
||||||
|
|
||||||
#define TQ_SVER 0
|
|
||||||
|
|
||||||
//TODO: inplace mode is not implemented
|
|
||||||
#define TQ_UPDATE_INPLACE 0
|
|
||||||
#define TQ_UPDATE_APPEND 1
|
|
||||||
|
|
||||||
#define TQ_DUP_INTXN_REWRITE 0
|
|
||||||
#define TQ_DUP_INTXN_REJECT 2
|
|
||||||
|
|
||||||
static inline bool TqUpdateAppend(int32_t tqConfigFlag) {
|
|
||||||
return tqConfigFlag & TQ_UPDATE_APPEND;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool TqDupIntxnReject(int32_t tqConfigFlag) {
|
|
||||||
return tqConfigFlag & TQ_DUP_INTXN_REJECT;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const int8_t TQ_CONST_DELETE = TQ_ACTION_CONST;
|
|
||||||
#define TQ_DELETE_TOKEN (void*)&TQ_CONST_DELETE
|
|
||||||
|
|
||||||
typedef struct TqSerializedHead {
|
|
||||||
int16_t ver;
|
|
||||||
int16_t action;
|
|
||||||
int32_t checksum;
|
|
||||||
int64_t ssize;
|
|
||||||
char content[];
|
|
||||||
} TqSerializedHead;
|
|
||||||
|
|
||||||
typedef struct TqMetaHandle {
|
|
||||||
int64_t key;
|
|
||||||
int64_t offset;
|
|
||||||
int64_t serializedSize;
|
|
||||||
void* valueInUse;
|
|
||||||
void* valueInTxn;
|
|
||||||
} TqMetaHandle;
|
|
||||||
|
|
||||||
typedef struct TqMetaList {
|
|
||||||
TqMetaHandle handle;
|
|
||||||
struct TqMetaList* next;
|
|
||||||
//struct TqMetaList* inTxnPrev;
|
|
||||||
//struct TqMetaList* inTxnNext;
|
|
||||||
struct TqMetaList* unpersistPrev;
|
|
||||||
struct TqMetaList* unpersistNext;
|
|
||||||
} TqMetaList;
|
|
||||||
|
|
||||||
typedef struct TqMetaStore {
|
|
||||||
TqMetaList* bucket[TQ_BUCKET_SIZE];
|
|
||||||
//a table head
|
|
||||||
TqMetaList* unpersistHead;
|
|
||||||
int fileFd; //TODO:temporaral use, to be replaced by unified tfile
|
|
||||||
int idxFd; //TODO:temporaral use, to be replaced by unified tfile
|
|
||||||
char* dirPath;
|
|
||||||
int32_t tqConfigFlag;
|
|
||||||
int (*serializer)(const void* pObj, TqSerializedHead** ppHead);
|
|
||||||
const void* (*deserializer)(const TqSerializedHead* pHead, void** ppObj);
|
|
||||||
void (*deleter)(void*);
|
|
||||||
} TqMetaStore;
|
|
||||||
|
|
||||||
TqMetaStore* tqStoreOpen(const char* path,
|
TqMetaStore* tqStoreOpen(const char* path,
|
||||||
int serializer(const void* pObj, TqSerializedHead** ppHead),
|
TqSerializeFun pSerializer,
|
||||||
const void* deserializer(const TqSerializedHead* pHead, void** ppObj),
|
TqDeserializeFun pDeserializer,
|
||||||
void deleter(void* pObj),
|
TqDeleteFun pDeleter,
|
||||||
int32_t tqConfigFlag
|
int32_t tqConfigFlag
|
||||||
);
|
);
|
||||||
int32_t tqStoreClose(TqMetaStore*);
|
int32_t tqStoreClose(TqMetaStore*);
|
||||||
//int32_t tqStoreDelete(TqMetaStore*);
|
//int32_t tqStoreDelete(TqMetaStore*);
|
||||||
//int32_t TqStoreCommitAll(TqMetaStore*);
|
//int32_t tqStoreCommitAll(TqMetaStore*);
|
||||||
int32_t tqStorePersist(TqMetaStore*);
|
int32_t tqStorePersist(TqMetaStore*);
|
||||||
//clean deleted idx and data from persistent file
|
//clean deleted idx and data from persistent file
|
||||||
int32_t tqStoreCompact(TqMetaStore*);
|
int32_t tqStoreCompact(TqMetaStore*);
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "tqInt.h"
|
#include "tqInt.h"
|
||||||
|
#include "tqMetaStore.h"
|
||||||
|
|
||||||
//static
|
//static
|
||||||
//read next version data
|
//read next version data
|
||||||
|
@ -24,6 +25,46 @@
|
||||||
//
|
//
|
||||||
|
|
||||||
int tqGetgHandleSSize(const TqGroupHandle *gHandle);
|
int tqGetgHandleSSize(const TqGroupHandle *gHandle);
|
||||||
|
int tqBufHandleSSize();
|
||||||
|
int tqBufItemSSize();
|
||||||
|
|
||||||
|
/**
 * Look up the group handle for (topicId, cgId, cId).
 *
 * Stub: the lookup is not implemented yet, so this always returns NULL.
 * The unused, uninitialized local of the previous version has been removed.
 */
TqGroupHandle* tqFindHandle(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) {
  // parameters are intentionally unused until the lookup is implemented
  (void)pTq;
  (void)topicId;
  (void)cgId;
  (void)cId;
  //TODO: implement lookup
  return NULL;
}
|
||||||
|
|
||||||
|
void* tqSerializeListHandle(TqListHandle* listHandle, void* ptr);
|
||||||
|
void* tqSerializeBufHandle(TqBufferHandle* bufHandle, void* ptr);
|
||||||
|
void* tqSerializeBufItem(TqBufferItem* bufItem, void* ptr);
|
||||||
|
|
||||||
|
const void* tqDeserializeBufHandle(const void* pBytes, TqBufferHandle* bufHandle);
|
||||||
|
const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem* bufItem);
|
||||||
|
|
||||||
|
/**
 * Create a TQ instance for the store located at `path`.
 *
 * @param path        directory of the meta store; copied, the caller keeps ownership
 * @param tqConfig    configuration, stored by reference
 * @param tqLogReader log reader callbacks, stored by reference
 * @param allocFac    allocator factory; its create() hook is invoked once
 * @return the new STQ, or NULL on invalid arguments, allocation failure,
 *         allocator creation failure or meta store open failure
 *
 * The returned STQ owns a heap copy of `path`, which must be freed together
 * with the STQ itself.
 */
STQ* tqOpen(const char* path, TqConfig* tqConfig, TqLogReader* tqLogReader, SMemAllocatorFactory *allocFac) {
  if(path == NULL || allocFac == NULL) {
    return NULL;
  }
  STQ* pTq = malloc(sizeof(STQ));
  if(pTq == NULL) {
    //TODO: memory error
    return NULL;
  }
  // STQ.path is a bare pointer: give it storage of its own before copying,
  // instead of strcpy()ing through an uninitialized pointer
  size_t pathLen = strlen(path) + 1;
  pTq->path = malloc(pathLen);
  if(pTq->path == NULL) {
    //TODO: memory error
    free(pTq);
    return NULL;
  }
  memcpy(pTq->path, path, pathLen);
  pTq->tqConfig = tqConfig;
  pTq->tqLogReader = tqLogReader;
  pTq->tqMemRef.pAlloctorFactory = allocFac;
  pTq->tqMemRef.pAllocator = allocFac->create();
  if(pTq->tqMemRef.pAllocator == NULL) {
    //TODO: report error
    goto _err;
  }
  pTq->tqMeta = tqStoreOpen(path,
      (TqSerializeFun)tqSerializeGroupHandle,
      (TqDeserializeFun)tqDeserializeGroupHandle,
      free,
      0);
  if(pTq->tqMeta == NULL) {
    //TODO: release pAllocator once the factory's destroy hook is known
    goto _err;
  }
  return pTq;

_err:
  free(pTq->path);
  free(pTq);
  return NULL;
}
|
||||||
|
|
||||||
static int tqProtoCheck(TmqMsgHead *pMsg) {
|
static int tqProtoCheck(TmqMsgHead *pMsg) {
|
||||||
return pMsg->protoVer == 0;
|
return pMsg->protoVer == 0;
|
||||||
|
@ -83,14 +124,29 @@ static int tqCommitTCGroup(TqGroupHandle* handle) {
|
||||||
|
|
||||||
/**
 * Allocate a zero-initialized group handle and hand it back through `handle`.
 *
 * @param handle out parameter; receives the new handle on success and is left
 *               untouched on failure
 * @return 0 on success, -1 when `handle` is NULL or the allocation fails
 *
 * The caller owns the returned handle. Previously the allocation was never
 * stored in *handle, so it leaked and callers saw no handle after a
 * successful return.
 */
int tqCreateTCGroup(STQ *pTq, int64_t topicId, int64_t cgId, int64_t cId, TqGroupHandle** handle) {
  if(handle == NULL) {
    return -1;
  }
  //create in disk
  //TODO: persisting the handle and filling in its ids is not implemented yet
  TqGroupHandle* gHandle = calloc(1, sizeof(TqGroupHandle));
  if(gHandle == NULL) {
    //TODO
    return -1;
  }
  *handle = gHandle;
  return 0;
}
|
||||||
|
|
||||||
int tqOpenTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) {
|
/**
 * Return the group handle stored under `cId`, creating one through
 * tqCreateTCGroup() when the meta store has none.
 *
 * @return the handle, or NULL when creation reports an error
 */
TqGroupHandle* tqOpenTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) {
  TqGroupHandle* found = tqHandleGet(pTq->tqMeta, cId);
  if(found != NULL) {
    return found;
  }
  // nothing stored for this consumer id: create a fresh handle
  if(tqCreateTCGroup(pTq, topicId, cgId, cId, &found) != 0) {
    //TODO
    return NULL;
  }
  return found;
}
|
||||||
|
|
||||||
int tqCloseTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) {
|
int tqCloseTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) {
|
||||||
|
@ -207,16 +263,20 @@ int tqConsume(STQ* pTq, TmqConsumeReq* pMsg) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int tqSerializeGroupHandle(TqGroupHandle *gHandle, void** ppBytes) {
|
int tqSerializeGroupHandle(const TqGroupHandle *gHandle, TqSerializedHead** ppHead) {
|
||||||
//calculate size
|
//calculate size
|
||||||
int sz = tqGetgHandleSSize(gHandle);
|
int sz = tqGetgHandleSSize(gHandle) + sizeof(TqSerializedHead);
|
||||||
void* ptr = realloc(*ppBytes, sz);
|
if(sz > (*ppHead)->ssize) {
|
||||||
if(ptr == NULL) {
|
void* tmpPtr = realloc(*ppHead, sz);
|
||||||
free(ppBytes);
|
if(tmpPtr == NULL) {
|
||||||
//TODO: memory err
|
free(*ppHead);
|
||||||
return -1;
|
//TODO: memory err
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
*ppHead = tmpPtr;
|
||||||
|
(*ppHead)->ssize = sz;
|
||||||
}
|
}
|
||||||
*ppBytes = ptr;
|
void* ptr = (*ppHead)->content;
|
||||||
//do serialization
|
//do serialization
|
||||||
*(int64_t*)ptr = gHandle->cId;
|
*(int64_t*)ptr = gHandle->cId;
|
||||||
ptr = POINTER_SHIFT(ptr, sizeof(int64_t));
|
ptr = POINTER_SHIFT(ptr, sizeof(int64_t));
|
||||||
|
@ -261,8 +321,9 @@ void* tqSerializeBufItem(TqBufferItem *bufItem, void* ptr) {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const void* tqDeserializeGroupHandle(const void* pBytes, TqGroupHandle *gHandle) {
|
const void* tqDeserializeGroupHandle(const TqSerializedHead* pHead, TqGroupHandle **ppGHandle) {
|
||||||
const void* ptr = pBytes;
|
TqGroupHandle *gHandle = *ppGHandle;
|
||||||
|
const void* ptr = pHead->content;
|
||||||
gHandle->cId = *(int64_t*)ptr;
|
gHandle->cId = *(int64_t*)ptr;
|
||||||
ptr = POINTER_SHIFT(ptr, sizeof(int64_t));
|
ptr = POINTER_SHIFT(ptr, sizeof(int64_t));
|
||||||
gHandle->cgId = *(int64_t*)ptr;
|
gHandle->cgId = *(int64_t*)ptr;
|
||||||
|
@ -317,15 +378,15 @@ const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem *bufItem) {
|
||||||
|
|
||||||
//TODO: make this a macro
|
//TODO: make this a macro
|
||||||
int tqGetgHandleSSize(const TqGroupHandle *gHandle) {
|
int tqGetgHandleSSize(const TqGroupHandle *gHandle) {
|
||||||
return sizeof(int64_t) * 2
|
return sizeof(int64_t) * 2 //cId + cgId
|
||||||
+ sizeof(int32_t)
|
+ sizeof(int32_t) //topicNum
|
||||||
+ gHandle->topicNum * tqBufHandleSSize();
|
+ gHandle->topicNum * tqBufHandleSSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO: make this a macro
|
//TODO: make this a macro
|
||||||
int tqBufHandleSSize() {
|
int tqBufHandleSSize() {
|
||||||
return sizeof(int64_t) * 2
|
return sizeof(int64_t) * 2 // nextConsumeOffset + topicId
|
||||||
+ sizeof(int32_t) * 2
|
+ sizeof(int32_t) * 2 // head + tail
|
||||||
+ TQ_BUFFER_SIZE * tqBufItemSSize();
|
+ TQ_BUFFER_SIZE * tqBufItemSSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -69,10 +69,10 @@ static inline int tqReadLastPage(int fd, TqIdxPageBuf* pBuf) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TqMetaStore* tqStoreOpen(const char* path,
|
TqMetaStore* tqStoreOpen(const char* path,
|
||||||
int serializer(const void* pObj, TqSerializedHead** ppHead),
|
TqSerializeFun serializer,
|
||||||
const void* deserializer(const TqSerializedHead* pHead, void** ppObj),
|
TqDeserializeFun deserializer,
|
||||||
void deleter(void* pObj),
|
TqDeleteFun deleter,
|
||||||
int32_t tqConfigFlag
|
int32_t tqConfigFlag
|
||||||
) {
|
) {
|
||||||
TqMetaStore* pMeta = malloc(sizeof(TqMetaStore));
|
TqMetaStore* pMeta = malloc(sizeof(TqMetaStore));
|
||||||
if(pMeta == NULL) {
|
if(pMeta == NULL) {
|
||||||
|
@ -127,9 +127,9 @@ TqMetaStore* tqStoreOpen(const char* path,
|
||||||
|
|
||||||
pMeta->fileFd = fileFd;
|
pMeta->fileFd = fileFd;
|
||||||
|
|
||||||
pMeta->serializer = serializer;
|
pMeta->pSerializer = serializer;
|
||||||
pMeta->deserializer = deserializer;
|
pMeta->pDeserializer = deserializer;
|
||||||
pMeta->deleter = deleter;
|
pMeta->pDeleter = deleter;
|
||||||
pMeta->tqConfigFlag = tqConfigFlag;
|
pMeta->tqConfigFlag = tqConfigFlag;
|
||||||
|
|
||||||
//read idx file and load into memory
|
//read idx file and load into memory
|
||||||
|
@ -171,25 +171,25 @@ TqMetaStore* tqStoreOpen(const char* path,
|
||||||
}
|
}
|
||||||
if(serializedObj->action == TQ_ACTION_INUSE) {
|
if(serializedObj->action == TQ_ACTION_INUSE) {
|
||||||
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
||||||
pMeta->deserializer(serializedObj, &pNode->handle.valueInUse);
|
pMeta->pDeserializer(serializedObj, &pNode->handle.valueInUse);
|
||||||
} else {
|
} else {
|
||||||
pNode->handle.valueInUse = TQ_DELETE_TOKEN;
|
pNode->handle.valueInUse = TQ_DELETE_TOKEN;
|
||||||
}
|
}
|
||||||
} else if(serializedObj->action == TQ_ACTION_INTXN) {
|
} else if(serializedObj->action == TQ_ACTION_INTXN) {
|
||||||
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
||||||
pMeta->deserializer(serializedObj, &pNode->handle.valueInTxn);
|
pMeta->pDeserializer(serializedObj, &pNode->handle.valueInTxn);
|
||||||
} else {
|
} else {
|
||||||
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
||||||
}
|
}
|
||||||
} else if(serializedObj->action == TQ_ACTION_INUSE_CONT) {
|
} else if(serializedObj->action == TQ_ACTION_INUSE_CONT) {
|
||||||
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
if(serializedObj->ssize != sizeof(TqSerializedHead)) {
|
||||||
pMeta->deserializer(serializedObj, &pNode->handle.valueInUse);
|
pMeta->pDeserializer(serializedObj, &pNode->handle.valueInUse);
|
||||||
} else {
|
} else {
|
||||||
pNode->handle.valueInUse = TQ_DELETE_TOKEN;
|
pNode->handle.valueInUse = TQ_DELETE_TOKEN;
|
||||||
}
|
}
|
||||||
TqSerializedHead* ptr = POINTER_SHIFT(serializedObj, serializedObj->ssize);
|
TqSerializedHead* ptr = POINTER_SHIFT(serializedObj, serializedObj->ssize);
|
||||||
if(ptr->ssize != sizeof(TqSerializedHead)) {
|
if(ptr->ssize != sizeof(TqSerializedHead)) {
|
||||||
pMeta->deserializer(ptr, &pNode->handle.valueInTxn);
|
pMeta->pDeserializer(ptr, &pNode->handle.valueInTxn);
|
||||||
} else {
|
} else {
|
||||||
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
||||||
}
|
}
|
||||||
|
@ -225,11 +225,11 @@ TqMetaStore* tqStoreOpen(const char* path,
|
||||||
if(pBucketNode) {
|
if(pBucketNode) {
|
||||||
if(pBucketNode->handle.valueInUse
|
if(pBucketNode->handle.valueInUse
|
||||||
&& pBucketNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
&& pBucketNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pBucketNode->handle.valueInUse);
|
pMeta->pDeleter(pBucketNode->handle.valueInUse);
|
||||||
}
|
}
|
||||||
if(pBucketNode->handle.valueInTxn
|
if(pBucketNode->handle.valueInTxn
|
||||||
&& pBucketNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
&& pBucketNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pBucketNode->handle.valueInTxn);
|
pMeta->pDeleter(pBucketNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
free(pBucketNode);
|
free(pBucketNode);
|
||||||
}
|
}
|
||||||
|
@ -253,11 +253,11 @@ int32_t tqStoreClose(TqMetaStore* pMeta) {
|
||||||
ASSERT(pNode->unpersistPrev == NULL);
|
ASSERT(pNode->unpersistPrev == NULL);
|
||||||
if(pNode->handle.valueInTxn
|
if(pNode->handle.valueInTxn
|
||||||
&& pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInTxn);
|
pMeta->pDeleter(pNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
if(pNode->handle.valueInUse
|
if(pNode->handle.valueInUse
|
||||||
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInUse);
|
pMeta->pDeleter(pNode->handle.valueInUse);
|
||||||
}
|
}
|
||||||
TqMetaList* next = pNode->next;
|
TqMetaList* next = pNode->next;
|
||||||
free(pNode);
|
free(pNode);
|
||||||
|
@ -280,11 +280,11 @@ int32_t tqStoreDelete(TqMetaStore* pMeta) {
|
||||||
while(pNode) {
|
while(pNode) {
|
||||||
if(pNode->handle.valueInTxn
|
if(pNode->handle.valueInTxn
|
||||||
&& pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInTxn);
|
pMeta->pDeleter(pNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
if(pNode->handle.valueInUse
|
if(pNode->handle.valueInUse
|
||||||
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInUse);
|
pMeta->pDeleter(pNode->handle.valueInUse);
|
||||||
}
|
}
|
||||||
TqMetaList* next = pNode->next;
|
TqMetaList* next = pNode->next;
|
||||||
free(pNode);
|
free(pNode);
|
||||||
|
@ -338,7 +338,7 @@ int32_t tqStorePersist(TqMetaStore* pMeta) {
|
||||||
if(pNode->handle.valueInUse == TQ_DELETE_TOKEN) {
|
if(pNode->handle.valueInUse == TQ_DELETE_TOKEN) {
|
||||||
pSHead->ssize = sizeof(TqSerializedHead);
|
pSHead->ssize = sizeof(TqSerializedHead);
|
||||||
} else {
|
} else {
|
||||||
pMeta->serializer(pNode->handle.valueInUse, &pSHead);
|
pMeta->pSerializer(pNode->handle.valueInUse, &pSHead);
|
||||||
}
|
}
|
||||||
nBytes = write(pMeta->fileFd, pSHead, pSHead->ssize);
|
nBytes = write(pMeta->fileFd, pSHead, pSHead->ssize);
|
||||||
ASSERT(nBytes == pSHead->ssize);
|
ASSERT(nBytes == pSHead->ssize);
|
||||||
|
@ -349,7 +349,7 @@ int32_t tqStorePersist(TqMetaStore* pMeta) {
|
||||||
if(pNode->handle.valueInTxn == TQ_DELETE_TOKEN) {
|
if(pNode->handle.valueInTxn == TQ_DELETE_TOKEN) {
|
||||||
pSHead->ssize = sizeof(TqSerializedHead);
|
pSHead->ssize = sizeof(TqSerializedHead);
|
||||||
} else {
|
} else {
|
||||||
pMeta->serializer(pNode->handle.valueInTxn, &pSHead);
|
pMeta->pSerializer(pNode->handle.valueInTxn, &pSHead);
|
||||||
}
|
}
|
||||||
int nBytesTxn = write(pMeta->fileFd, pSHead, pSHead->ssize);
|
int nBytesTxn = write(pMeta->fileFd, pSHead, pSHead->ssize);
|
||||||
ASSERT(nBytesTxn == pSHead->ssize);
|
ASSERT(nBytesTxn == pSHead->ssize);
|
||||||
|
@ -423,7 +423,7 @@ static int32_t tqHandlePutCommitted(TqMetaStore* pMeta, int64_t key, void* value
|
||||||
//TODO: think about thread safety
|
//TODO: think about thread safety
|
||||||
if(pNode->handle.valueInUse
|
if(pNode->handle.valueInUse
|
||||||
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInUse);
|
pMeta->pDeleter(pNode->handle.valueInUse);
|
||||||
}
|
}
|
||||||
//change pointer ownership
|
//change pointer ownership
|
||||||
pNode->handle.valueInUse = value;
|
pNode->handle.valueInUse = value;
|
||||||
|
@ -496,7 +496,7 @@ static inline int32_t tqHandlePutImpl(TqMetaStore* pMeta, int64_t key, void* val
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInTxn);
|
pMeta->pDeleter(pNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pNode->handle.valueInTxn = value;
|
pNode->handle.valueInTxn = value;
|
||||||
|
@ -562,7 +562,7 @@ int32_t tqHandleCommit(TqMetaStore* pMeta, int64_t key) {
|
||||||
}
|
}
|
||||||
if(pNode->handle.valueInUse
|
if(pNode->handle.valueInUse
|
||||||
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
&& pNode->handle.valueInUse != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInUse);
|
pMeta->pDeleter(pNode->handle.valueInUse);
|
||||||
}
|
}
|
||||||
pNode->handle.valueInUse = pNode->handle.valueInTxn;
|
pNode->handle.valueInUse = pNode->handle.valueInTxn;
|
||||||
pNode->handle.valueInTxn = NULL;
|
pNode->handle.valueInTxn = NULL;
|
||||||
|
@ -582,7 +582,7 @@ int32_t tqHandleAbort(TqMetaStore* pMeta, int64_t key) {
|
||||||
if(pNode->handle.key == key) {
|
if(pNode->handle.key == key) {
|
||||||
if(pNode->handle.valueInTxn) {
|
if(pNode->handle.valueInTxn) {
|
||||||
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
pMeta->deleter(pNode->handle.valueInTxn);
|
pMeta->pDeleter(pNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
pNode->handle.valueInTxn = NULL;
|
pNode->handle.valueInTxn = NULL;
|
||||||
tqLinkUnpersist(pMeta, pNode);
|
tqLinkUnpersist(pMeta, pNode);
|
||||||
|
@ -602,7 +602,7 @@ int32_t tqHandleDel(TqMetaStore* pMeta, int64_t key) {
|
||||||
while(pNode) {
|
while(pNode) {
|
||||||
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) {
|
||||||
if(pNode->handle.valueInTxn) {
|
if(pNode->handle.valueInTxn) {
|
||||||
pMeta->deleter(pNode->handle.valueInTxn);
|
pMeta->pDeleter(pNode->handle.valueInTxn);
|
||||||
}
|
}
|
||||||
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
pNode->handle.valueInTxn = TQ_DELETE_TOKEN;
|
||||||
tqLinkUnpersist(pMeta, pNode);
|
tqLinkUnpersist(pMeta, pNode);
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
|
#include "tq.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Placeholder test: only checks that a TqGroupHandle-sized buffer can be
// allocated and released. No serializer function is exercised yet.
TEST(TqSerializerTest, basicTest) {
  TqGroupHandle* gHandle = (TqGroupHandle*)malloc(sizeof(TqGroupHandle));
  ASSERT_NE(gHandle, nullptr);
  free(gHandle);  // the buffer was previously leaked
}
|
|
@ -4,4 +4,27 @@ target_include_directories(
|
||||||
index
|
index
|
||||||
PUBLIC "${CMAKE_SOURCE_DIR}/include/libs/index"
|
PUBLIC "${CMAKE_SOURCE_DIR}/include/libs/index"
|
||||||
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/inc"
|
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/inc"
|
||||||
)
|
)
|
||||||
|
target_link_libraries(
|
||||||
|
index
|
||||||
|
PUBLIC os
|
||||||
|
PUBLIC util
|
||||||
|
)
|
||||||
|
|
||||||
|
if (${BUILD_WITH_LUCENE})
|
||||||
|
target_include_directories(
|
||||||
|
index
|
||||||
|
PUBLIC "${CMAKE_SOURCE_DIR}/deps/lucene/include"
|
||||||
|
)
|
||||||
|
LINK_DIRECTORIES("${CMAKE_SOURCE_DIR}/deps/lucene/debug/src/core")
|
||||||
|
target_link_libraries(
|
||||||
|
index
|
||||||
|
PUBLIC lucene++
|
||||||
|
)
|
||||||
|
|
||||||
|
endif(${BUILD_WITH_LUCENE})
|
||||||
|
|
||||||
|
if (${BUILD_TEST})
|
||||||
|
add_subdirectory(test)
|
||||||
|
endif(${BUILD_TEST})
|
||||||
|
|
||||||
|
|
|
@ -16,12 +16,52 @@
|
||||||
#ifndef _TD_INDEX_INT_H_
|
#ifndef _TD_INDEX_INT_H_
|
||||||
#define _TD_INDEX_INT_H_
|
#define _TD_INDEX_INT_H_
|
||||||
|
|
||||||
|
#include "index.h"
|
||||||
|
|
||||||
|
#ifdef USE_LUCENE
|
||||||
|
#include <lucene++/Lucene_c.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Index handle. The struct only has a member when built with USE_LUCENE.
struct SIndex {
|
||||||
|
#ifdef USE_LUCENE
|
||||||
|
index_t *index;  // backend handle; indexOpen() stores the result of index_open() here
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
// Index options. The struct only has a member when built with USE_LUCENE.
struct SIndexOpts {
|
||||||
|
#ifdef USE_LUCENE
|
||||||
|
void *opts;  // opaque backend options; not read by any code visible here
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
// A compound query: a list of term queries plus the operator that combines them.
struct SIndexMultiTermQuery {
|
||||||
|
EIndexOperatorType opera;  // operator handed to the backend search
|
||||||
|
SArray *query;  // elements are SIndexTermQuery (see indexMultiTermQueryCreate)
|
||||||
|
};
|
||||||
|
|
||||||
|
// field and key;
|
||||||
|
// One field/value pair. indexTermCreate() fills both buffers with private,
// zero-terminated copies; indexTermDestroy() frees them.
typedef struct SIndexTerm {
|
||||||
|
char *key;  // field name
|
||||||
|
int32_t nKey;  // length of key in bytes, excluding the terminator
|
||||||
|
char *val;  // field value
|
||||||
|
int32_t nVal;  // length of val in bytes, excluding the terminator
|
||||||
|
} SIndexTerm;
|
||||||
|
|
||||||
|
// A single-term query: the term to look for and how to match it.
typedef struct SIndexTermQuery {
|
||||||
|
SIndexTerm* field_value;  // released by indexMultiTermQueryDestroy() via indexTermDestroy()
|
||||||
|
EIndexQueryType type;  // passed to the backend as an int in indexSearch()
|
||||||
|
} SIndexTermQuery;
|
||||||
|
|
||||||
|
|
||||||
|
SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, int32_t nVal);
|
||||||
|
void indexTermDestroy(SIndexTerm *p);
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /*_TD_INDEX_INT_H_*/
|
#endif /*_TD_INDEX_INT_H_*/
|
||||||
|
|
|
@ -0,0 +1,164 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __INDEX_FST_H__
|
||||||
|
#define __INDEX_FST_H__
|
||||||
|
|
||||||
|
|
||||||
|
#include "tarray.h"
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
#include "index_fst_registry.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct FstNode FstNode;
|
||||||
|
// Smaller of the two arguments (the shared "prefix" of two output values).
// Each argument is evaluated twice, so do not pass expressions with side effects.
#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a))
|
||||||
|
|
||||||
|
|
||||||
|
// A pair of 64-bit offsets delimiting a range.
// NOTE(review): whether `end` is inclusive is not visible here — confirm.
typedef struct FstRange {
|
||||||
|
uint64_t start;
|
||||||
|
uint64_t end;
|
||||||
|
} FstRange;
|
||||||
|
|
||||||
|
|
||||||
|
// Kind of a compiled node; fstNodeCreate() assigns one of these to FstNode.state.
typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State;
|
||||||
|
// Bound kinds; presumably for range queries — not used by any code visible here.
typedef enum { Included, Excluded, Unbounded} FstBound;
|
||||||
|
|
||||||
|
typedef uint32_t CheckSummer;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* UnFinished node and helper function
|
||||||
|
* TODO: simplify function names
|
||||||
|
*/
|
||||||
|
// Stack of nodes that are still under construction.
typedef struct FstUnFinishedNodes {
|
||||||
|
SArray *stack; // elements are FstBuilderNodeUnfinished
|
||||||
|
} FstUnFinishedNodes;
|
||||||
|
|
||||||
|
// Number of entries on the unfinished-node stack. The argument is parenthesized
// so that expressions such as `p + 1` expand correctly.
#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize((nodes)->stack)
|
||||||
|
|
||||||
|
FstUnFinishedNodes *FstUnFinishedNodesCreate();
|
||||||
|
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes);
|
||||||
|
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out);
|
||||||
|
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr);
|
||||||
|
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out);
|
||||||
|
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs);
|
||||||
|
uint64_t FstUnFinishedNodesFindCommPreifxAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out);
|
||||||
|
|
||||||
|
// Writer wrapper that tracks a byte count and a checksum.
typedef struct FstCountingWriter {
|
||||||
|
void* wtr; // wrap any writer that counts and checksum bytes written
|
||||||
|
uint64_t count;  // presumably the number of bytes written so far — TODO confirm
|
||||||
|
CheckSummer summer;  // checksum state (CheckSummer is a uint32_t)
|
||||||
|
} FstCountingWriter;
|
||||||
|
|
||||||
|
// State of an FST under construction.
typedef struct FstBuilder {
|
||||||
|
FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
|
||||||
|
FstUnFinishedNodes *unfinished; // The stack of unfinished nodes
|
||||||
|
FstRegistry registry; // A map of finished nodes.
|
||||||
|
SArray* last; // The last word added
|
||||||
|
CompiledAddr lastAddr; // The address of the last compiled node
|
||||||
|
uint64_t len; // num of keys added
|
||||||
|
} FstBuilder;
|
||||||
|
|
||||||
|
|
||||||
|
// A node paired with a range; returned by fstNodeTransitionIter() and
// fstNodeTransitions(). NOTE(review): presumably the range selects which of the
// node's transitions to visit — confirm against the implementation.
typedef struct FstTransitions {
|
||||||
|
FstNode *node;
|
||||||
|
FstRange range;
|
||||||
|
} FstTransitions;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// The most recently added transition of an unfinished node. It carries no
// target address field (compare FstTransition).
typedef struct FstLastTransition {
|
||||||
|
uint8_t inp;  // input byte of the transition
|
||||||
|
Output out;  // output value attached to the transition
|
||||||
|
} FstLastTransition;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FstBuilderNodeUnfinished and helper function
|
||||||
|
* TODO: simplify function names
|
||||||
|
*/
|
||||||
|
// Element type of the FstUnFinishedNodes stack: a builder node plus its
// pending last transition.
typedef struct FstBuilderNodeUnfinished {
|
||||||
|
FstBuilderNode *node;  // heap-allocated by the push helpers in index_fst.c
|
||||||
|
FstLastTransition* last;  // NULL when the node has no pending transition
|
||||||
|
} FstBuilderNodeUnfinished;
|
||||||
|
|
||||||
|
void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||||
|
void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FstNode and helper function
|
||||||
|
*/
|
||||||
|
// A node decoded from compiled FST bytes; populated by fstNodeCreate().
typedef struct FstNode {
|
||||||
|
FstSlice data;  // bytes the node was decoded from
|
||||||
|
uint64_t version;  // format version passed to fstNodeCreate()
|
||||||
|
State state;  // node kind
|
||||||
|
CompiledAddr start;  // address of the node (see FST_NODE_ADDR)
|
||||||
|
CompiledAddr end;  // currently set equal to start by fstNodeCreate()
|
||||||
|
bool isFinal;  // true for a final ("match") state
|
||||||
|
uint64_t nTrans;  // number of transitions
|
||||||
|
PackSizes sizes;  // packed sizes: high 4 bits transition address, low 4 bits output
|
||||||
|
Output finalOutput;  // terminal output value; zero when there is none
|
||||||
|
} FstNode;
|
||||||
|
|
||||||
|
// Field accessors for FstNode pointers. Arguments and expansions are fully
// parenthesized so the macros work with pointer expressions (e.g. `p + 1`) and
// inside larger expressions.

// If this node is final and has a terminal output value, it is returned.
// Otherwise, a zero output is returned.
#define FST_NODE_FINAL_OUTPUT(node) ((node)->finalOutput)

// True if and only if this node corresponds to a final, or "match", state in
// the finite state transducer.
#define FST_NODE_IS_FINAL(node) ((node)->isFinal)

// Number of transitions in this node. The maximum number of transitions is 256.
#define FST_NODE_LEN(node) ((node)->nTrans)

// True if and only if this node has zero transitions.
#define FST_NODE_IS_EMPTYE(node) ((node)->nTrans == 0)

// Address of this node.
#define FST_NODE_ADDR(node) ((node)->start)
|
||||||
|
|
||||||
|
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data);
|
||||||
|
FstTransitions fstNodeTransitionIter(FstNode *node);
|
||||||
|
FstTransitions* fstNodeTransitions(FstNode *node);
|
||||||
|
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res);
|
||||||
|
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res);
|
||||||
|
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res);
|
||||||
|
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
|
||||||
|
FstSlice fstNodeAsSlice(FstNode *node);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Metadata describing an FST.
// NOTE(review): field meanings below are inferred from names and types only.
typedef struct FstMeta {
|
||||||
|
uint64_t version;  // format version
|
||||||
|
CompiledAddr rootAddr;  // address of the root node
|
||||||
|
FstType ty;
|
||||||
|
uint64_t len;  // presumably the number of keys — TODO confirm
|
||||||
|
uint32_t checkSum;
|
||||||
|
} FstMeta;
|
||||||
|
|
||||||
|
// An FST: its metadata plus an opaque data pointer.
typedef struct Fst {
|
||||||
|
FstMeta meta;
|
||||||
|
void *data; // presumably the raw FST bytes — TODO confirm
|
||||||
|
} Fst;
|
||||||
|
|
||||||
|
// ops
|
||||||
|
|
||||||
|
// An (index, value) pair of 64-bit integers; not used by any code visible here.
typedef struct FstIndexedValue {
|
||||||
|
uint64_t index;
|
||||||
|
uint64_t value;
|
||||||
|
} FstIndexedValue;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,42 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef __INDEX_FST_AUTAOMATION_H__
|
||||||
|
#define __INDEX_FST_AUTAOMATION_H__
|
||||||
|
|
||||||
|
struct AutomationCtx;
|
||||||
|
|
||||||
|
// "Starts with" automaton wrapper around another automaton context.
// The member is spelled `struct AutomationCtx` because the AutomationCtx
// typedef is only introduced further down; the bare name is not yet a type here.
typedef struct StartWith {
  struct AutomationCtx *autoSelf;
} StartWith;
|
||||||
|
|
||||||
|
// "Complement" automaton wrapper around another automaton context.
// The member is spelled `struct AutomationCtx` because the AutomationCtx
// typedef is only introduced further down; the bare name is not yet a type here.
typedef struct Complement {
  struct AutomationCtx *autoSelf;
} Complement;
|
||||||
|
|
||||||
|
// automation
|
||||||
|
// Automaton context: a single opaque pointer handed to the interface callbacks.
typedef struct AutomationCtx {
|
||||||
|
void *data;  // not interpreted by any code visible here
|
||||||
|
} AutomationCtx;
|
||||||
|
|
||||||
|
// automation interface
|
||||||
|
// Callback slots of the automaton interface.
// The second parameter of `accpetEof` had no type; it is now `void *state`,
// matching `accpet`.
// NOTE(review): these are file-scope function-pointer objects declared in a
// header, so every including translation unit gets its own tentative
// definition. They were presumably meant to be members of an interface
// struct — confirm.
void (*start)(AutomationCtx *ctx);
bool (*isMatch)(AutomationCtx *ctx);
bool (*canMatch)(AutomationCtx *ctx, void *data);
bool (*willAlwaysMatch)(AutomationCtx *ctx, void *state);
void* (*accpet)(AutomationCtx *ctx, void *state, uint8_t byte);
void* (*accpetEof)(AutomationCtx *ctx, void *state);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __INDEX_FST_NODE_H__
|
||||||
|
#define __INDEX_FST_NODE_H__
|
||||||
|
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
|
||||||
|
|
||||||
|
// One outgoing transition of a builder node.
typedef struct FstTransition {
|
||||||
|
uint8_t inp; //The byte input associated with this transition.
|
||||||
|
Output out; //The output associated with this transition
|
||||||
|
CompiledAddr addr; //The address of the node that this transition points to
|
||||||
|
} FstTransition;
|
||||||
|
|
||||||
|
// A node under construction, before it is compiled to bytes.
typedef struct FstBuilderNode {
|
||||||
|
bool isFinal;  // true when the node is a final state
|
||||||
|
Output finalOutput;  // output attached to the final state
|
||||||
|
SArray *trans; // <FstTransition>
|
||||||
|
} FstBuilderNode;
|
||||||
|
|
||||||
|
FstBuilderNode *fstBuilderNodeDefault();
|
||||||
|
|
||||||
|
FstBuilderNode *fstBuilderNodeClone(FstBuilderNode *src);
|
||||||
|
|
||||||
|
void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src);
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,57 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef __FST_REGISTRY_H__
|
||||||
|
#define __FST_REGISTRY_H__
|
||||||
|
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
#include "tarray.h"
|
||||||
|
#include "index_fst_node.h"
|
||||||
|
|
||||||
|
// One registry cell: a builder node paired with a compiled address.
typedef struct FstRegistryCell {
|
||||||
|
CompiledAddr addr;
|
||||||
|
FstBuilderNode *node;
|
||||||
|
} FstRegistryCell;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//typedef struct FstRegistryCache {
|
||||||
|
// SArray *cells;
|
||||||
|
// uint32_t start;
|
||||||
|
// uint32_t end;
|
||||||
|
//} FstRegistryCache;
|
||||||
|
|
||||||
|
// Outcome stored in FstRegistryEntry.state.
typedef enum {FOUND, NOTFOUND, REJECTED} FstRegistryEntryState;
|
||||||
|
|
||||||
|
// Result type of fstRegistryGetEntry().
// NOTE(review): which fields are valid for which state is not visible here.
typedef struct FstRegistryEntry {
|
||||||
|
FstRegistryEntryState state;
|
||||||
|
CompiledAddr addr;
|
||||||
|
FstRegistryCell *cell;
|
||||||
|
} FstRegistryEntry;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Registry-related types and functions
|
||||||
|
// Registry of finished nodes, organised as tableSize rows by mruSize columns.
typedef struct FstRegistry {
|
||||||
|
SArray *table;  // presumably holds FstRegistryCell elements — TODO confirm
|
||||||
|
uint64_t tableSize; // num of rows
|
||||||
|
uint64_t mruSize; // num of columns
|
||||||
|
} FstRegistry;
|
||||||
|
|
||||||
|
//
|
||||||
|
FstRegistry* fstRegistryCreate(uint64_t tableSize, uint64_t mruSize);
|
||||||
|
|
||||||
|
FstRegistryEntry* fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNode);
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef __INDEX_FST_UTIL_H__
|
||||||
|
#define __INDEX_FST_UTIL_H__
|
||||||
|
|
||||||
|
#include "tarray.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef uint64_t FstType;
|
||||||
|
typedef uint64_t CompiledAddr;
|
||||||
|
typedef uint64_t Output;
|
||||||
|
typedef uint8_t PackSizes;
|
||||||
|
|
||||||
|
|
||||||
|
//A sentinel value used to indicate an empty final state
|
||||||
|
extern const CompiledAddr EMPTY_ADDRESS;
|
||||||
|
/// A sentinel value used to indicate an invalid state.
|
||||||
|
extern const CompiledAddr NONE_ADDRESS;
|
||||||
|
|
||||||
|
// This version number is written to every finite state transducer created by
|
||||||
|
// this crate. When a finite state transducer is read, its version number is
|
||||||
|
// checked against this value.
|
||||||
|
extern const uint64_t version;
|
||||||
|
// The threshold (in number of transitions) at which an index is created for
|
||||||
|
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||||
|
|
||||||
|
extern const uint64_t TRANS_INDEX_THRESHOLD;
|
||||||
|
// high 4 bits is transition address packed size.
|
||||||
|
// low 4 bits is output value packed size.
|
||||||
|
//
|
||||||
|
// `0` is a legal value which means there are no transitions/outputs
|
||||||
|
|
||||||
|
// Accessors for a PackSizes byte: the high 4 bits hold the transition-address
// packed size, the low 4 bits hold the output packed size.
// The setters previously did not compile (unbalanced parenthesis / missing
// `;`). All arguments are now parenthesized; the setters evaluate `v` twice.
#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do { (v) = (((v) & 0x0F) | ((sz) << 4)); } while (0)
#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0xF0) >> 4)
#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { (v) = (((v) & 0xF0) | (sz)); } while (0)
#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0x0F)
|
||||||
|
|
||||||
|
// Looks up the byte stored at 1-based position `idx` in COMMON_INPUTS_INV.
#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1]

// Stores in `val` the value (COMMON_INPUTS[v] + 1) % 256, or 0 when that value
// exceeds `max`. Arguments are parenthesized so that compound expressions for
// `max` or `val` keep their intended precedence; `val` is evaluated several times.
#define COMMON_INDEX(v, max, val) do { \
  (val) = ((uint16_t)COMMON_INPUTS[(v)] + 1) % 256; \
  (val) = (val) > (max) ? 0 : (val); \
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
//uint8_t commonInput(uint8_t idx);
|
||||||
|
//uint8_t commonIdx(uint8_t v, uint8_t max);
|
||||||
|
|
||||||
|
uint8_t packSize(uint64_t n);
|
||||||
|
uint64_t unpackUint64(uint8_t *ch, uint8_t sz);
|
||||||
|
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr);
|
||||||
|
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// A view into a byte buffer. The code in index_fst.c treats [start, end] as an
// inclusive range (length is computed as end - start + 1).
typedef struct FstSlice {
|
||||||
|
uint8_t *data;  // underlying buffer
|
||||||
|
uint64_t dLen;  // presumably the length of `data` in bytes — TODO confirm
|
||||||
|
uint32_t start;  // first byte of the view
|
||||||
|
uint32_t end;  // last byte of the view (inclusive)
|
||||||
|
} FstSlice;
|
||||||
|
|
||||||
|
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end);
|
||||||
|
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen);
|
||||||
|
bool fstSliceEmpty(FstSlice *slice);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -13,15 +13,176 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _TD_INDEX_H_
|
#include "index.h"
|
||||||
#define _TD_INDEX_H_
|
#include "indexInt.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef USE_LUCENE
|
||||||
extern "C" {
|
#include "lucene++/Lucene_c.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
static pthread_once_t isInit = PTHREAD_ONCE_INIT;
|
||||||
|
|
||||||
|
static void indexInit();
|
||||||
|
|
||||||
|
/*
 * Open the index stored at `path`.
 *
 * With USE_LUCENE the backend index is opened and wrapped in a heap-allocated
 * SIndex, which the caller releases with indexClose(). Without USE_LUCENE
 * there is no backend and NULL is returned. `opts` is currently unused.
 *
 * Returns NULL if the wrapper cannot be allocated. The wrapper is allocated
 * before the backend is opened, so a failed allocation cannot leak a backend
 * handle.
 */
SIndex *indexOpen(SIndexOpts *opts, const char *path) {
  (void)opts;
  pthread_once(&isInit, indexInit);
#ifdef USE_LUCENE
  SIndex *p = malloc(sizeof(*p));
  if (p == NULL) {
    return NULL;
  }
  p->index = index_open(path);
  return p;
#else
  (void)path;
  return NULL;
#endif
}
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /*_TD_INDEX_H_*/
|
/*
 * Close an index obtained from indexOpen() and free its wrapper.
 * A NULL argument is ignored; this matters because indexOpen() returns NULL
 * when built without USE_LUCENE.
 */
void indexClose(SIndex *index) {
  if (index == NULL) {
    return;
  }
#ifdef USE_LUCENE
  index_close(index->index);
  index->index = NULL;
#endif
  free(index);
}
|
||||||
|
|
||||||
|
#ifdef USE_LUCENE
|
||||||
|
#endif
|
||||||
|
/*
 * Add one document to the index.
 *
 * With USE_LUCENE, every SIndexTerm pointer in `field_vals` becomes a field of
 * a new backend document, followed by one extra field with a NULL key that
 * carries `uid` as decimal text. The document is then written and destroyed.
 * Without USE_LUCENE nothing happens. Always returns 1.
 */
int indexPut(SIndex *index, SArray* field_vals, int uid) {
#ifdef USE_LUCENE
  char uidText[16] = {0};
  sprintf(uidText, "%d", uid);

  index_document_t *document = index_document_create();

  int pos = 0;
  while (pos < taosArrayGetSize(field_vals)) {
    SIndexTerm *term = taosArrayGetP(field_vals, pos);
    index_document_add(document, (const char *)(term->key), term->nKey, (const char *)(term->val), term->nVal, 1);
    pos++;
  }
  index_document_add(document, NULL, 0, uidText, strlen(uidText), 0);

  index_put(index->index, document);
  index_document_destroy(document);
#endif
  return 1;
}
|
||||||
|
/*
 * Run a multi-term query and append every hit to `result`.
 *
 * With USE_LUCENE, each SIndexTermQuery in `multiQuerys` is copied into
 * zero-terminated field/key strings plus an int type, the backend search is
 * run, and each returned int is pushed onto `result`.
 *
 * Returns 1 after the search, or when built without USE_LUCENE. Returns -1 if
 * a temporary buffer cannot be allocated; previously an allocation failure led
 * to a NULL dereference in memcpy.
 *
 * NOTE(review): `tResult` is produced by index_multi_search(); whether this
 * function must release it is not visible here — confirm with the backend API.
 */
int indexSearch(SIndex *index, SIndexMultiTermQuery *multiQuerys, SArray *result) {
  int ret = 1;
#ifdef USE_LUCENE
  EIndexOperatorType opera = multiQuerys->opera;

  int    nQuery = taosArrayGetSize(multiQuerys->query);
  size_t cap = nQuery > 0 ? (size_t)nQuery : 1;  // avoid a zero-sized request
  int   *tResult = NULL;
  int    tsz = 0;

  // calloc keeps unused slots NULL so cleanup can free a half-filled table.
  char **fields = calloc(cap, sizeof(char *));
  char **keys = calloc(cap, sizeof(char *));
  int   *types = calloc(cap, sizeof(int));
  if (fields == NULL || keys == NULL || types == NULL) {
    ret = -1;
    goto cleanup;
  }

  for (int i = 0; i < nQuery; i++) {
    SIndexTermQuery *p = taosArrayGet(multiQuerys->query, i);
    SIndexTerm      *term = p->field_value;

    fields[i] = calloc(1, term->nKey + 1);
    keys[i] = calloc(1, term->nVal + 1);
    if (fields[i] == NULL || keys[i] == NULL) {
      ret = -1;
      goto cleanup;
    }

    memcpy(fields[i], term->key, term->nKey);
    memcpy(keys[i], term->val, term->nVal);
    types[i] = (int)(p->type);
  }

  index_multi_search(index->index, (const char **)fields, (const char **)keys, types, nQuery, opera, &tResult, &tsz);

  for (int i = 0; i < tsz; i++) {
    taosArrayPush(result, &tResult[i]);
  }

cleanup:
  for (int i = 0; i < nQuery; i++) {
    if (fields != NULL) { free(fields[i]); }
    if (keys != NULL) { free(keys[i]); }
  }
  free(fields);
  free(keys);
  free(types);
#endif
  return ret;
}
|
||||||
|
|
||||||
|
/*
 * Placeholder for deleting the documents matched by `query`.
 * Nothing is deleted; the function always returns 1.
 */
int indexDelete(SIndex *index, SIndexMultiTermQuery *query) {
  (void)index;
  (void)query;
  const int rc = 1;
  return rc;
}
|
||||||
|
int indexRebuild(SIndex *index, SIndexOpts *opts);
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Placeholder for creating index options.
 * No options object is built yet; the function always returns NULL.
 */
SIndexOpts *indexOptsCreate() {
  SIndexOpts *created = NULL;
#ifdef USE_LUCENE
  /* backend-specific option setup would go here */
#endif
  return created;
}
|
||||||
|
/*
 * Placeholder for releasing index options. Currently does nothing.
 */
void indexOptsDestroy(SIndexOpts *opts) {
  (void)opts;
#ifdef USE_LUCENE
  /* backend-specific option teardown would go here */
#endif
}
|
||||||
|
/*
|
||||||
|
* @param: oper
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Allocate an empty multi-term query that combines its terms with `opera`.
 * The caller releases it with indexMultiTermQueryDestroy().
 * Returns NULL if either the query or its term array cannot be allocated.
 */
SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType opera) {
  SIndexMultiTermQuery *p = (SIndexMultiTermQuery *)malloc(sizeof(SIndexMultiTermQuery));
  if (p == NULL) { return NULL; }

  p->opera = opera;
  p->query = taosArrayInit(1, sizeof(SIndexTermQuery));
  if (p->query == NULL) {
    // Never hand out a query whose term array is missing.
    free(p);
    return NULL;
  }
  return p;
}
|
||||||
|
/*
 * Destroy a multi-term query: every term held by its entries, the entry
 * array, and the query itself. A NULL argument is ignored.
 */
void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery) {
  if (pQuery == NULL) { return; }

  for (int i = 0; i < taosArrayGetSize(pQuery->query); i++) {
    SIndexTermQuery *p = (SIndexTermQuery *)taosArrayGet(pQuery->query, i);
    indexTermDestroy(p->field_value);
  }
  taosArrayDestroy(pQuery->query);
  free(pQuery);
}
|
||||||
|
/*
 * Append one term query to `pQuery`.
 * A term is created from `field`/`value` and pushed, together with `type`,
 * onto the query's array. Returns 0 on success, or -1 when the term cannot be
 * created.
 */
int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){
  SIndexTerm *term = indexTermCreate(field, nFields, value, nValue);
  if (term != NULL) {
    SIndexTermQuery entry = {.type = type, .field_value = term};
    taosArrayPush(pQuery->query, &entry);
    return 0;
  }
  return -1;
}
|
||||||
|
|
||||||
|
|
||||||
|
SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, int32_t nVal) {
|
||||||
|
SIndexTerm *t = (SIndexTerm *)malloc(sizeof(SIndexTerm));
|
||||||
|
t->key = (char *)calloc(nKey + 1, 1);
|
||||||
|
memcpy(t->key, key, nKey);
|
||||||
|
t->nKey = nKey;
|
||||||
|
|
||||||
|
t->val = (char *)calloc(nVal + 1, 1);
|
||||||
|
memcpy(t->val, val, nVal);
|
||||||
|
t->nVal = nVal;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
/*
 * Free a term created by indexTermCreate(), including its key and value
 * buffers. A NULL argument is ignored.
 */
void indexTermDestroy(SIndexTerm *p) {
  if (p == NULL) { return; }
  free(p->key);
  free(p->val);
  free(p);
}
|
||||||
|
|
||||||
|
SArray *indexMultiTermCreate() {
|
||||||
|
return taosArrayInit(4, sizeof(SIndexTerm *));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Create a term from `field`/`val` and append its pointer to `array`.
 * Returns 0 on success, or -1 when the term cannot be created.
 */
int indexMultiTermAdd(SArray *array, const char *field, int32_t nField, const char *val, int32_t nVal) {
  SIndexTerm *created = indexTermCreate(field, nField, val, nVal);
  if (created != NULL) {
    taosArrayPush(array, &created);
    return 0;
  }
  return -1;
}
|
||||||
|
/*
 * Destroy every term whose pointer is stored in `array`, then the array itself.
 */
void indexMultiTermDestroy(SArray *array) {
  int32_t pos = 0;
  while (pos < taosArrayGetSize(array)) {
    SIndexTerm *term = taosArrayGetP(array, pos);
    indexTermDestroy(term);
    pos++;
  }
  taosArrayDestroy(array);
}
|
||||||
|
/*
 * One-time initialisation hook, run through pthread_once() from indexOpen().
 * There is nothing to initialise at present.
 */
void indexInit() {
  return;
}
|
||||||
|
|
|
@ -0,0 +1,296 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "index_fst.h"
|
||||||
|
|
||||||
|
|
||||||
|
FstUnFinishedNodes *fstUnFinishedNodesCreate() {
|
||||||
|
FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes));
|
||||||
|
if (nodes == NULL) { return NULL; }
|
||||||
|
|
||||||
|
nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished));
|
||||||
|
fstUnFinishedNodesPushEmpty(nodes, false);
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
/*
 * Push a fresh builder node with no transitions and no pending last
 * transition. `isFinal` marks the new node as a final state.
 * If the node cannot be allocated the stack is left unchanged.
 */
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) {
  FstBuilderNode *node = malloc(sizeof(FstBuilderNode));
  if (node == NULL) { return; }

  node->isFinal = isFinal;
  node->finalOutput = 0;
  node->trans = NULL;

  FstBuilderNodeUnfinished un = {.node = node, .last = NULL};
  taosArrayPush(nodes->stack, &un);
}
|
||||||
|
/*
 * Pop the only remaining entry of the stack and return its builder node.
 * Asserts that exactly one entry is left and that it has no pending last
 * transition.
 */
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) {
  assert(taosArrayGetSize(nodes->stack) == 1);

  FstBuilderNodeUnfinished *root = taosArrayPop(nodes->stack);
  assert(root->last == NULL);

  FstBuilderNode *built = root->node;
  return built;
}
|
||||||
|
|
||||||
|
/*
 * Pop the top entry, pass `addr` to fstBuilderNodeUnfinishedLastCompiled()
 * for it, free its last-transition record, and return the builder node.
 */
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  FstBuilderNodeUnfinished *top = taosArrayPop(nodes->stack);

  fstBuilderNodeUnfinishedLastCompiled(top, addr);
  free(top->last); // TODO add func FstLastTransitionFree()

  FstBuilderNode *frozen = top->node;
  return frozen;
}
|
||||||
|
|
||||||
|
/*
 * Pop the top entry, which must have no pending last transition, and return
 * its builder node.
 */
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) {
  FstBuilderNodeUnfinished *top = taosArrayPop(nodes->stack);
  assert(top->last == NULL);

  FstBuilderNode *emptyNode = top->node;
  return emptyNode;
}
|
||||||
|
/*
 * Mark the bottom (root) node of the stack as final and set its final output
 * to `out`. The node's transitions are left untouched.
 */
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) {
  FstBuilderNodeUnfinished *rootEntry = taosArrayGet(nodes->stack, 0);
  FstBuilderNode           *root = rootEntry->node;

  root->isFinal = true;
  root->finalOutput = out;
}
|
||||||
|
/*
 * Pass `addr` to fstBuilderNodeUnfinishedLastCompiled() for the entry on top
 * of the stack, without popping it.
 */
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  size_t topIdx = taosArrayGetSize(nodes->stack) - 1;
  fstBuilderNodeUnfinishedLastCompiled(taosArrayGet(nodes->stack, topIdx), addr);
}
|
||||||
|
/*
 * Extend the stack with the key suffix `bs` (bytes start..end, inclusive).
 *
 * The first suffix byte becomes the pending last transition of the current top
 * node and carries `out`. Every remaining byte gets a new non-final node whose
 * pending transition has zero output. An empty final node is pushed last.
 * This follows `UnfinishedNodes::add_suffix` in the Rust `fst` crate.
 *
 * Fixed relative to the previous version:
 *  - the loop started at `start`, so the first byte was consumed twice;
 *  - `out` was attached to every transition instead of only the first;
 *  - inner locals shadowed `un` and `trn`;
 *  - allocations were not checked.
 *
 * On allocation failure the function returns early, leaving the stack
 * partially extended and without the closing final node.
 */
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) {
  FstSlice *s = &bs;
  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
    return;
  }

  size_t                    top = taosArrayGetSize(nodes->stack) - 1;
  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, top);
  assert(un->last == NULL);

  FstLastTransition *first = malloc(sizeof(FstLastTransition));
  if (first == NULL) { return; }
  first->inp = s->data[s->start];
  first->out = out;
  un->last = first;  // set before any push, since a push may move the array storage

  for (uint64_t i = (uint64_t)s->start + 1; i <= s->end; i++) {
    FstBuilderNode    *n = malloc(sizeof(FstBuilderNode));
    FstLastTransition *trn = malloc(sizeof(FstLastTransition));
    if (n == NULL || trn == NULL) {
      free(n);
      free(trn);
      return;
    }
    n->isFinal = false;
    n->finalOutput = 0;
    n->trans = NULL;

    trn->inp = s->data[i];
    trn->out = 0;

    FstBuilderNodeUnfinished next = {.node = n, .last = trn};
    taosArrayPush(nodes->stack, &next);
  }
  fstUnFinishedNodesPushEmpty(nodes, true);
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Count how many leading bytes of `bs` match the pending last transitions of
 * the stack entries, walking from the bottom of the stack.
 *
 * An entry with no pending transition ends the match. The top entry normally
 * has none, and it was previously dereferenced without a check.
 */
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) {
  FstSlice *s = &bs;

  size_t lsz = (size_t)(s->end - s->start + 1);  // data len
  size_t ssz = taosArrayGetSize(node->stack);    // stack size

  uint64_t count = 0;
  for (size_t i = 0; i < ssz && i < lsz; i++) {
    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
    if (un->last == NULL || un->last->inp != s->data[s->start + i]) {
      break;
    }
    count++;
  }
  return count;
}
|
||||||
|
/*
 * Find the common prefix between `bs` and the stack while redistributing
 * output values.
 *
 * For each matching entry, the transition keeps only the part of its output
 * shared with the incoming output (their minimum). The excess is pushed down
 * to the next stack entry, and the shared part is subtracted from the
 * incoming output. This follows `find_common_prefix_and_set_output` in the
 * Rust `fst` crate.
 *
 * `in` is the output of the key being inserted. The unconsumed remainder is
 * stored in `*out` when `out` is not NULL. Returns the number of matched bytes.
 *
 * Fixed relative to the previous version:
 *  - the result was never incremented, so 0 was always returned;
 *  - `out = out - commPrefix` moved the pointer instead of updating the value;
 *  - the common prefix was not the minimum of the two outputs;
 *  - a NULL last transition was dereferenced.
 */
uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) {
  FstSlice *s = &bs;

  size_t lsz = (size_t)(s->end - s->start + 1);  // data len
  size_t ssz = taosArrayGetSize(node->stack);    // stack size

  Output   remaining = in;
  uint64_t matched = 0;
  while (matched < lsz && matched < ssz) {
    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, matched);
    FstLastTransition        *last = un->last;
    if (last == NULL || last->inp != s->data[s->start + matched]) {
      break;
    }
    matched++;

    Output commPrefix = last->out < remaining ? last->out : remaining;
    Output addPrefix = last->out - commPrefix;
    remaining -= commPrefix;
    last->out = commPrefix;

    // The excess output moves down to the node reached by this transition.
    if (addPrefix != 0 && matched < ssz) {
      FstBuilderNodeUnfinished *next = taosArrayGet(node->stack, matched);
      fstBuilderNodeUnfinishedAddOutputPrefix(next, addPrefix);
    }
  }

  if (out != NULL) {
    *out = remaining;
  }
  return matched;
}
|
||||||
|
|
||||||
|
// fst node function
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Allocate an FstNode describing the node stored at `addr` inside `slice`.
//
// version : format version recorded on the node
// addr    : address of the node; EMPTY_ADDRESS yields the empty final node
// slice   : backing bytes; only read when addr != EMPTY_ADDRESS
//
// The top two bits of the byte at `addr` select the encoding:
//   11 -> OneTransNext, 10 -> OneTrans, anything else -> AnyTrans.
//
// Returns a heap-allocated node, or NULL on allocation failure.
//
// TODO: sizes / nTrans / end / finalOutput are not decoded from the data yet;
// they are set to fixed placeholder values until the state decoders exist.
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) {
  FstNode *n = malloc(sizeof(*n));
  if (n == NULL) { return NULL; }

  if (addr == EMPTY_ADDRESS) {
    n->data = fstSliceCreate(NULL, 0);
    n->version = version;
    n->state = EmptyFinal;
    n->start = EMPTY_ADDRESS;
    n->end = EMPTY_ADDRESS;
    n->isFinal = true;
    n->nTrans = 0;
    n->sizes = 0;
    n->finalOutput = 0;
    // Nothing to decode for the empty node; falling through would read
    // slice->data and overwrite the fields set above.
    return n;
  }

  uint8_t v = slice->data[addr];
  uint8_t s = (v & 0xC0) >> 6;  // two-bit state tag
  if (s == 0x3) {               // oneTransNext
    n->data = fstSliceCopy(slice, 0, addr);
    n->version = version;
    n->state = OneTransNext;
    n->start = addr;
    n->end = addr;  //? s.end_addr(data);
    n->isFinal = false;
    n->sizes = 0;
    // NOTE(review): a one-transition node presumably has nTrans == 1; kept
    // at 0 as in the previous revision - confirm.
    n->nTrans = 0;
    n->finalOutput = 0;
  } else if (s == 0x2) {  // oneTrans (compare the tag, not the raw byte)
    uint64_t sz = 0;      // TODO: fetch sz from addr
    n->data = fstSliceCopy(slice, 0, addr);
    n->version = version;
    n->state = OneTrans;
    n->start = addr;
    n->end = addr;  // s.end_addr(data, sz);
    n->isFinal = false;
    n->nTrans = 1;
    n->sizes = sz;
    n->finalOutput = 0;
  } else {                 // anyTrans
    uint64_t sz = 0;       // TODO: s.sizes(data)
    uint32_t nTrans = 0;   // TODO: s.ntrans(data)
    n->data = *slice;
    n->version = version;
    n->state = AnyTrans;
    n->start = addr;
    n->end = addr;       // s.end_addr(version, data, sz, ntrans);
    n->isFinal = false;  // s.is_final_state();
    n->nTrans = nTrans;
    n->sizes = sz;
    n->finalOutput = 0;  // s.final_output(version, data, sz, ntrans);
  }
  return n;
}
|
||||||
|
// Create a transition iterator over `node`, spanning [0, FST_NODE_LEN(node)].
// Returns NULL when the iterator cannot be allocated; the caller owns the
// returned object.
FstTransitions* fstNodeTransitions(FstNode *node) {
  FstTransitions *iter = malloc(sizeof(FstTransitions));
  if (iter != NULL) {
    FstRange whole = {.start = 0, .end = FST_NODE_LEN(node)};
    iter->node = node;
    iter->range = whole;
  }
  return iter;
}
|
||||||
|
// Fetch the i-th transition of `node` into `res`.
//
// Placeholder: none of the per-state decoders is implemented yet, so `res` is
// never written. The return value only reports whether the node is in one of
// the states that can carry transitions (OneTransNext, OneTrans, AnyTrans).
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) {
  if (node->state == OneTransNext) {
    // TODO: decode the single implicit-next transition
    return true;
  }
  if (node->state == OneTrans) {
    // TODO: decode the single explicit transition
    return true;
  }
  if (node->state == AnyTrans) {
    // TODO: decode transition i
    return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
// Fetch the target address of the i-th transition of `node` into `res`.
//
// Placeholder: the per-state decoders are not implemented, so `res` is never
// written. Returns false only for the EmptyFinal state and true for every
// other state value.
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) {
  bool ok = true;
  if (node->state == OneTransNext) {
    // TODO: address of the implicit-next transition
  } else if (node->state == OneTrans) {
    // TODO: address of the single transition
  } else if (node->state == AnyTrans) {
    // TODO: address of transition i
  } else {
    ok = !(node->state == EmptyFinal);
  }
  return ok;
}
|
||||||
|
|
||||||
|
// Look up the transition of `node` labelled with input byte `b`.
//
// On a match in a single-transition node, *res is set to 0 (the index of the
// only transition) and true is returned. A mismatch or an EmptyFinal node
// returns false.
//
// TODO: the input byte is not decoded from the node data yet; `input` is a
// fixed placeholder (0) instead of an uninitialized read. The AnyTrans lookup
// is likewise unimplemented and still reports true without writing *res.
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) {
  uint8_t input = 0;  // TODO: s.input

  if (node->state == OneTransNext || node->state == OneTrans) {
    if (b != input) {
      return false;  // not found; previously reported as success
    }
    *res = 0;
    return true;
  }
  if (node->state == AnyTrans) {
    // TODO: search the transition table
    return true;
  }
  if (node->state == EmptyFinal) {
    return false;
  }
  return true;
}
|
||||||
|
|
||||||
|
// Serialize `builderNode` to writer `w`, choosing the node encoding.
//
// Placeholder: the encoders themselves are not wired in yet, so every path
// returns true without writing. The selection logic is:
//   - no transitions, final, zero final output -> nothing to write
//   - not exactly one transition, or final     -> AnyTrans encoding
//   - one transition to lastAddr with zero out -> OneTransNext encoding
//   - otherwise                                -> OneTrans encoding
// A node may carry at most 255 transitions (asserted).
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) {
  size_t nTrans = taosArrayGetSize(builderNode->trans);
  assert(nTrans < 256);

  if (nTrans == 0 && builderNode->isFinal && builderNode->finalOutput == 0) {
    return true;
  }

  if (nTrans != 1 || builderNode->isFinal) {
    // AnyTrans->Compile(w, addr, node);
    return true;
  }

  FstTransition *only = taosArrayGet(builderNode->trans, 0);
  if (only->addr == lastAddr && only->out == 0) {
    // OneTransNext::compile(w, lastAddr, tran->inp);
    return true;
  }

  // OneTrans::Compile(w, lastAddr, *tran);
  return true;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Allocate an FstBuilder that writes through `w`.
//
// The builder starts with a zeroed counting writer wrapped around `w` and a
// zero-filled unfinished-node container. `ty` is currently unused.
//
// Returns NULL if either allocation fails (nothing is leaked in that case).
FstBuilder *fstBuilderCreate(void *w, FstType ty) {
  // calloc so that fields not set below are zero rather than indeterminate.
  FstBuilder *b = calloc(1, sizeof(FstBuilder));
  if (NULL == b) { return b; }

  FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0};
  b->wtr = wtr;

  b->unfinished = calloc(1, sizeof(FstUnFinishedNodes));
  if (NULL == b->unfinished) {
    free(b);
    return NULL;
  }
  return b;
}
|
||||||
|
// Return a view of the node's backing data running from the node slice's
// `end` offset to the last byte (dLen - 1). The bytes are shared, not copied.
FstSlice fstNodeAsSlice(FstNode *node) {
  FstSlice *backing = &node->data;
  return fstSliceCopy(backing, backing->end, backing->dLen - 1);
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
*
|
*
|
||||||
* This program is free software: you can use, redistribute, and/or modify
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
* it under the terms of the GNU Affero General Public License, version 3
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
@ -12,16 +12,3 @@
|
||||||
* You should have received a copy of the GNU Affero General Public License
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _TD_LIBS_SYNC_RAFT_CONFIGURATION_H
|
|
||||||
#define _TD_LIBS_SYNC_RAFT_CONFIGURATION_H
|
|
||||||
|
|
||||||
#include "sync.h"
|
|
||||||
#include "sync_type.h"
|
|
||||||
|
|
||||||
// return -1 if cannot find this id
|
|
||||||
int syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id);
|
|
||||||
|
|
||||||
int syncRaftConfigurationVoterCount(SSyncRaft *pRaft);
|
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_CONFIGURATION_H */
|
|
|
@ -0,0 +1,306 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "tutil.h"
|
||||||
|
const uint8_t COMMON_INPUTS[] = {
|
||||||
|
84, // '\x00'
|
||||||
|
85, // '\x01'
|
||||||
|
86, // '\x02'
|
||||||
|
87, // '\x03'
|
||||||
|
88, // '\x04'
|
||||||
|
89, // '\x05'
|
||||||
|
90, // '\x06'
|
||||||
|
91, // '\x07'
|
||||||
|
92, // '\x08'
|
||||||
|
93, // '\t'
|
||||||
|
94, // '\n'
|
||||||
|
95, // '\x0b'
|
||||||
|
96, // '\x0c'
|
||||||
|
97, // '\r'
|
||||||
|
98, // '\x0e'
|
||||||
|
99, // '\x0f'
|
||||||
|
100, // '\x10'
|
||||||
|
101, // '\x11'
|
||||||
|
102, // '\x12'
|
||||||
|
103, // '\x13'
|
||||||
|
104, // '\x14'
|
||||||
|
105, // '\x15'
|
||||||
|
106, // '\x16'
|
||||||
|
107, // '\x17'
|
||||||
|
108, // '\x18'
|
||||||
|
109, // '\x19'
|
||||||
|
110, // '\x1a'
|
||||||
|
111, // '\x1b'
|
||||||
|
112, // '\x1c'
|
||||||
|
113, // '\x1d'
|
||||||
|
114, // '\x1e'
|
||||||
|
115, // '\x1f'
|
||||||
|
116, // ' '
|
||||||
|
80, // '!'
|
||||||
|
117, // '"'
|
||||||
|
118, // '#'
|
||||||
|
79, // '$'
|
||||||
|
39, // '%'
|
||||||
|
30, // '&'
|
||||||
|
81, // "'"
|
||||||
|
75, // '('
|
||||||
|
74, // ')'
|
||||||
|
82, // '*'
|
||||||
|
57, // '+'
|
||||||
|
66, // ','
|
||||||
|
16, // '-'
|
||||||
|
12, // '.'
|
||||||
|
2, // '/'
|
||||||
|
19, // '0'
|
||||||
|
20, // '1'
|
||||||
|
21, // '2'
|
||||||
|
27, // '3'
|
||||||
|
32, // '4'
|
||||||
|
29, // '5'
|
||||||
|
35, // '6'
|
||||||
|
36, // '7'
|
||||||
|
37, // '8'
|
||||||
|
34, // '9'
|
||||||
|
24, // ':'
|
||||||
|
73, // ';'
|
||||||
|
119, // '<'
|
||||||
|
23, // '='
|
||||||
|
120, // '>'
|
||||||
|
40, // '?'
|
||||||
|
83, // '@'
|
||||||
|
44, // 'A'
|
||||||
|
48, // 'B'
|
||||||
|
42, // 'C'
|
||||||
|
43, // 'D'
|
||||||
|
49, // 'E'
|
||||||
|
46, // 'F'
|
||||||
|
62, // 'G'
|
||||||
|
61, // 'H'
|
||||||
|
47, // 'I'
|
||||||
|
69, // 'J'
|
||||||
|
68, // 'K'
|
||||||
|
58, // 'L'
|
||||||
|
56, // 'M'
|
||||||
|
55, // 'N'
|
||||||
|
59, // 'O'
|
||||||
|
51, // 'P'
|
||||||
|
72, // 'Q'
|
||||||
|
54, // 'R'
|
||||||
|
45, // 'S'
|
||||||
|
52, // 'T'
|
||||||
|
64, // 'U'
|
||||||
|
65, // 'V'
|
||||||
|
63, // 'W'
|
||||||
|
71, // 'X'
|
||||||
|
67, // 'Y'
|
||||||
|
70, // 'Z'
|
||||||
|
77, // '['
|
||||||
|
121, // '\\'
|
||||||
|
78, // ']'
|
||||||
|
122, // '^'
|
||||||
|
31, // '_'
|
||||||
|
123, // '`'
|
||||||
|
4, // 'a'
|
||||||
|
25, // 'b'
|
||||||
|
9, // 'c'
|
||||||
|
17, // 'd'
|
||||||
|
1, // 'e'
|
||||||
|
26, // 'f'
|
||||||
|
22, // 'g'
|
||||||
|
13, // 'h'
|
||||||
|
7, // 'i'
|
||||||
|
50, // 'j'
|
||||||
|
38, // 'k'
|
||||||
|
14, // 'l'
|
||||||
|
15, // 'm'
|
||||||
|
10, // 'n'
|
||||||
|
3, // 'o'
|
||||||
|
8, // 'p'
|
||||||
|
60, // 'q'
|
||||||
|
6, // 'r'
|
||||||
|
5, // 's'
|
||||||
|
0, // 't'
|
||||||
|
18, // 'u'
|
||||||
|
33, // 'v'
|
||||||
|
11, // 'w'
|
||||||
|
41, // 'x'
|
||||||
|
28, // 'y'
|
||||||
|
53, // 'z'
|
||||||
|
124, // '{'
|
||||||
|
125, // '|'
|
||||||
|
126, // '}'
|
||||||
|
76, // '~'
|
||||||
|
127, // '\x7f'
|
||||||
|
128, // '\x80'
|
||||||
|
129, // '\x81'
|
||||||
|
130, // '\x82'
|
||||||
|
131, // '\x83'
|
||||||
|
132, // '\x84'
|
||||||
|
133, // '\x85'
|
||||||
|
134, // '\x86'
|
||||||
|
135, // '\x87'
|
||||||
|
136, // '\x88'
|
||||||
|
137, // '\x89'
|
||||||
|
138, // '\x8a'
|
||||||
|
139, // '\x8b'
|
||||||
|
140, // '\x8c'
|
||||||
|
141, // '\x8d'
|
||||||
|
142, // '\x8e'
|
||||||
|
143, // '\x8f'
|
||||||
|
144, // '\x90'
|
||||||
|
145, // '\x91'
|
||||||
|
146, // '\x92'
|
||||||
|
147, // '\x93'
|
||||||
|
148, // '\x94'
|
||||||
|
149, // '\x95'
|
||||||
|
150, // '\x96'
|
||||||
|
151, // '\x97'
|
||||||
|
152, // '\x98'
|
||||||
|
153, // '\x99'
|
||||||
|
154, // '\x9a'
|
||||||
|
155, // '\x9b'
|
||||||
|
156, // '\x9c'
|
||||||
|
157, // '\x9d'
|
||||||
|
158, // '\x9e'
|
||||||
|
159, // '\x9f'
|
||||||
|
160, // '\xa0'
|
||||||
|
161, // '¡'
|
||||||
|
162, // '¢'
|
||||||
|
163, // '£'
|
||||||
|
164, // '¤'
|
||||||
|
165, // '¥'
|
||||||
|
166, // '¦'
|
||||||
|
167, // '§'
|
||||||
|
168, // '¨'
|
||||||
|
169, // '©'
|
||||||
|
170, // 'ª'
|
||||||
|
171, // '«'
|
||||||
|
172, // '¬'
|
||||||
|
173, // '\xad'
|
||||||
|
174, // '®'
|
||||||
|
175, // '¯'
|
||||||
|
176, // '°'
|
||||||
|
177, // '±'
|
||||||
|
178, // '²'
|
||||||
|
179, // '³'
|
||||||
|
180, // '´'
|
||||||
|
181, // 'µ'
|
||||||
|
182, // '¶'
|
||||||
|
183, // '·'
|
||||||
|
184, // '¸'
|
||||||
|
185, // '¹'
|
||||||
|
186, // 'º'
|
||||||
|
187, // '»'
|
||||||
|
188, // '¼'
|
||||||
|
189, // '½'
|
||||||
|
190, // '¾'
|
||||||
|
191, // '¿'
|
||||||
|
192, // 'À'
|
||||||
|
193, // 'Á'
|
||||||
|
194, // 'Â'
|
||||||
|
195, // 'Ã'
|
||||||
|
196, // 'Ä'
|
||||||
|
197, // 'Å'
|
||||||
|
198, // 'Æ'
|
||||||
|
199, // 'Ç'
|
||||||
|
200, // 'È'
|
||||||
|
201, // 'É'
|
||||||
|
202, // 'Ê'
|
||||||
|
203, // 'Ë'
|
||||||
|
204, // 'Ì'
|
||||||
|
205, // 'Í'
|
||||||
|
206, // 'Î'
|
||||||
|
207, // 'Ï'
|
||||||
|
208, // 'Ð'
|
||||||
|
209, // 'Ñ'
|
||||||
|
210, // 'Ò'
|
||||||
|
211, // 'Ó'
|
||||||
|
212, // 'Ô'
|
||||||
|
213, // 'Õ'
|
||||||
|
214, // 'Ö'
|
||||||
|
215, // '×'
|
||||||
|
216, // 'Ø'
|
||||||
|
217, // 'Ù'
|
||||||
|
218, // 'Ú'
|
||||||
|
219, // 'Û'
|
||||||
|
220, // 'Ü'
|
||||||
|
221, // 'Ý'
|
||||||
|
222, // 'Þ'
|
||||||
|
223, // 'ß'
|
||||||
|
224, // 'à'
|
||||||
|
225, // 'á'
|
||||||
|
226, // 'â'
|
||||||
|
227, // 'ã'
|
||||||
|
228, // 'ä'
|
||||||
|
229, // 'å'
|
||||||
|
230, // 'æ'
|
||||||
|
231, // 'ç'
|
||||||
|
232, // 'è'
|
||||||
|
233, // 'é'
|
||||||
|
234, // 'ê'
|
||||||
|
235, // 'ë'
|
||||||
|
236, // 'ì'
|
||||||
|
237, // 'í'
|
||||||
|
238, // 'î'
|
||||||
|
239, // 'ï'
|
||||||
|
240, // 'ð'
|
||||||
|
241, // 'ñ'
|
||||||
|
242, // 'ò'
|
||||||
|
243, // 'ó'
|
||||||
|
244, // 'ô'
|
||||||
|
245, // 'õ'
|
||||||
|
246, // 'ö'
|
||||||
|
247, // '÷'
|
||||||
|
248, // 'ø'
|
||||||
|
249, // 'ù'
|
||||||
|
250, // 'ú'
|
||||||
|
251, // 'û'
|
||||||
|
252, // 'ü'
|
||||||
|
253, // 'ý'
|
||||||
|
254, // 'þ'
|
||||||
|
255, // 'ÿ'
|
||||||
|
};
|
||||||
|
|
||||||
|
char const COMMON_INPUTS_INV[] = {
|
||||||
|
't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
|
||||||
|
'.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
|
||||||
|
':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
|
||||||
|
'7', '8', 'k', '%', '?', 'x', 'C', 'D', 'A', 'S', 'F', 'I',
|
||||||
|
'B', 'E', 'j', 'P', 'T', 'z', 'R', 'N', 'M', '+', 'L', 'O',
|
||||||
|
'q', 'H', 'G', 'W', 'U', 'V', ',', 'Y', 'K', 'J', 'Z', 'X',
|
||||||
|
'Q', ';', ')', '(', '~', '[', ']', '$', '!', '\'', '*', '@',
|
||||||
|
'\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
|
||||||
|
'\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10',
|
||||||
|
'\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18',
|
||||||
|
'\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '"',
|
||||||
|
'#', '<', '>', '\\', '^', '`', '{', '|', '}','\x7f','\x80',
|
||||||
|
'\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88',
|
||||||
|
'\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90',
|
||||||
|
'\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98',
|
||||||
|
'\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0',
|
||||||
|
'\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8',
|
||||||
|
'\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0',
|
||||||
|
'\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8',
|
||||||
|
'\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0',
|
||||||
|
'\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8',
|
||||||
|
'\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0',
|
||||||
|
'\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8',
|
||||||
|
'\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0',
|
||||||
|
'\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8',
|
||||||
|
'\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
|
||||||
|
'\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
|
||||||
|
'\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
|
||||||
|
};
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#include "index_fst_node.h"
|
||||||
|
|
||||||
|
FstBuilderNode *fstBuilderNodeDefault() {
|
||||||
|
FstBuilderNode *bn = malloc(sizeof(FstBuilderNode));
|
||||||
|
bn->isFinal = false;
|
||||||
|
bn->finalOutput = 0;
|
||||||
|
bn->trans = NULL;
|
||||||
|
return bn;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deep-copy `src`: the clone gets its own transition array holding a copy of
// every transition, plus the same isFinal / finalOutput values.
//
// Returns NULL if the node or its transition array cannot be allocated.
FstBuilderNode *fstBuilderNodeClone(FstBuilderNode *src) {
  FstBuilderNode *node = malloc(sizeof(FstBuilderNode));
  if (node == NULL) { return NULL; }

  size_t sz = taosArrayGetSize(src->trans);
  SArray *trans = taosArrayInit(sz, sizeof(FstTransition));
  if (trans == NULL) {
    free(node);  // do not leak the half-built clone
    return NULL;
  }

  for (size_t i = 0; i < sz; i++) {
    FstTransition *tran = taosArrayGet(src->trans, i);
    FstTransition t = *tran;
    taosArrayPush(trans, &t);
  }

  node->trans = trans;
  node->isFinal = src->isFinal;
  node->finalOutput = src->finalOutput;
  return node;
}
|
||||||
|
// not destroy src, User's bussiness
|
||||||
|
void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src) {
|
||||||
|
if (dst == NULL || src == NULL) { return; }
|
||||||
|
|
||||||
|
dst->isFinal = src->isFinal;
|
||||||
|
dst->finalOutput = src->finalOutput ;
|
||||||
|
dst->trans = src->trans;
|
||||||
|
|
||||||
|
src->trans = NULL;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,158 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "index_fst_registry.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Map a builder node to a bucket index in [0, registry->tableSize).
//
// FNV-style mix (xor, then multiply by the prime) over isFinal, finalOutput
// and the inp / out / addr of every transition, reduced modulo the table size.
uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *bNode) {
  //TODO(yihaoDeng): refactor later
  const uint64_t FNV_PRIME = 1099511628211;
  uint64_t hash = 14695981039346656037u;

  hash ^= (uint64_t)bNode->isFinal;
  hash *= FNV_PRIME;
  hash ^= bNode->finalOutput;
  hash *= FNV_PRIME;

  uint32_t nTrans = (uint32_t)taosArrayGetSize(bNode->trans);
  uint32_t idx = 0;
  while (idx < nTrans) {
    FstTransition *cur = taosArrayGet(bNode->trans, idx);

    hash ^= (uint64_t)(cur->inp);
    hash *= FNV_PRIME;
    hash ^= (uint64_t)(cur->out);
    hash *= FNV_PRIME;
    hash ^= (uint64_t)(cur->addr);
    hash *= FNV_PRIME;

    idx++;
  }
  return hash % (registry->tableSize);
}
|
||||||
|
// Exchange the addr and node fields of cells `a` and `b` in `arr`.
// Does nothing when either index is outside the array.
static void fstRegistryCellSwap(SArray *arr, uint32_t a, uint32_t b) {
  size_t count = taosArrayGetSize(arr);
  if (!(a < count && b < count)) {
    return;
  }

  FstRegistryCell *lhs = (FstRegistryCell *)taosArrayGet(arr, a);
  FstRegistryCell *rhs = (FstRegistryCell *)taosArrayGet(arr, b);

  // Park the left cell's payload while the right one is moved over it.
  FstRegistryCell saved = {.addr = lhs->addr, .node = lhs->node};

  lhs->addr = rhs->addr;
  lhs->node = rhs->node;

  rhs->addr = saved.addr;
  rhs->node = saved.node;
}
|
||||||
|
|
||||||
|
// Move the cell at index `start` towards the front until it sits at index
// `end`, shifting the cells in between one slot back (adjacent swaps).
// Requires start >= end (asserted). Does nothing when either index is
// outside the array.
static void fstRegistryCellPromote(SArray *arr, uint32_t start, uint32_t end) {
  size_t sz = taosArrayGetSize(arr);
  // Either index out of range invalidates the move; the previous `&&` only
  // bailed out when both were out of range.
  if (start >= sz || end >= sz) { return; }

  assert(start >= end);

  int32_t s = (int32_t)start;
  int32_t e = (int32_t)end;
  while (s > e) {
    fstRegistryCellSwap(arr, s - 1, s);
    s -= 1;
  }
}
|
||||||
|
// True when the cell (a pointer expression) holds no compiled address.
#define FST_REGISTRY_CELL_IS_EMPTY(cell) ((cell)->addr == NONE_ADDRESS)

// Store `newAddr` in the cell. The parameter must not be named `addr`:
// the preprocessor would also rewrite the member name on the left-hand side.
#define FST_REGISTRY_CELL_INSERT(cell, newAddr) do { (cell)->addr = (newAddr); } while (0)
|
||||||
|
|
||||||
|
FstRegistry* fstRegistryCreate(uint64_t tableSize, uint64_t mruSize) {
|
||||||
|
FstRegistry *registry = malloc(sizeof(FstRegistry));
|
||||||
|
if (registry == NULL) { return NULL ;}
|
||||||
|
|
||||||
|
uint64_t nCells = tableSize * mruSize;
|
||||||
|
SArray* tb = (SArray *)taosArrayInit(nCells, sizeof(FstRegistryCell));
|
||||||
|
for (uint64_t i = 0; i < nCells; i++) {
|
||||||
|
FstRegistryCell *cell = taosArrayGet(tb, i);
|
||||||
|
cell->addr = NONE_ADDRESS;
|
||||||
|
cell->node = fstBuilderNodeDefault();
|
||||||
|
}
|
||||||
|
|
||||||
|
registry->table = tb;
|
||||||
|
registry->tableSize = tableSize;
|
||||||
|
registry->mruSize = mruSize;
|
||||||
|
return registry;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look `bNode` up in the registry bucket selected by its hash.
//
// Returns a heap-allocated entry owned by the caller:
//   - state == FOUND    : entry->addr is the address recorded for the node and
//                         the matching cell has been moved to the bucket front.
//   - state == NOTFOUND : bNode's contents have been moved into the least
//                         recently used cell of the bucket, that cell has been
//                         promoted to the front, and entry->cell points at it.
// Returns NULL when the table is empty or the entry cannot be allocated.
//
// NOTE(review): cells are matched by pointer identity (cell->node == bNode),
// not by node contents - confirm that this is the intended equality.
FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNode) {
  if (taosArrayGetSize(registry->table) <= 0) {
    return NULL;
  }
  uint64_t bucket = fstRegistryHash(registry, bNode);
  uint64_t start = registry->mruSize * bucket;
  uint64_t end = start + registry->mruSize;

  FstRegistryEntry *entry = malloc(sizeof(FstRegistryEntry));
  if (entry == NULL) { return NULL; }
  // Give every field a defined value; each path below overrides what it needs.
  entry->state = NOTFOUND;
  entry->addr = NONE_ADDRESS;
  entry->cell = NULL;

  if (end - start == 1) {
    FstRegistryCell *cell = taosArrayGet(registry->table, start);
    //cell->isNode &&
    if (cell->addr != NONE_ADDRESS && cell->node == bNode) {
      entry->state = FOUND;
      entry->addr = cell->addr;
      return entry;
    } else {
      // clone from bNode, refactor later
      fstBuilderNodeCloneFrom(cell->node, bNode);
      entry->state = NOTFOUND;
      entry->cell = cell;  // copy or not
    }
  } else if (end - start == 2) {
    FstRegistryCell *cell1 = taosArrayGet(registry->table, start);
    if (cell1->addr != NONE_ADDRESS && cell1->node == bNode) {
      entry->state = FOUND;
      entry->addr = cell1->addr;
      return entry;
    }
    FstRegistryCell *cell2 = taosArrayGet(registry->table, start + 1);
    if (cell2->addr != NONE_ADDRESS && cell2->node == bNode) {
      entry->state = FOUND;
      entry->addr = cell2->addr;
      // must swap here
      fstRegistryCellSwap(registry->table, start, start + 1);
      return entry;
    }
    //clone from bNode, refactor later
    fstBuilderNodeCloneFrom(cell2->node, bNode);

    fstRegistryCellSwap(registry->table, start, start + 1);
    FstRegistryCell *cCell = taosArrayGet(registry->table, start);
    entry->state = NOTFOUND;
    entry->cell = cCell;
  } else {
    // 64-bit index: start/end are uint64_t and must not be truncated.
    uint64_t i = start;
    for (; i < end; i++) {
      FstRegistryCell *cell = (FstRegistryCell *)taosArrayGet(registry->table, i);
      if (cell->addr != NONE_ADDRESS && cell->node == bNode) {
        entry->state = FOUND;
        entry->addr = cell->addr;
        fstRegistryCellPromote(registry->table, i, start);
        break;
      }
    }
    if (i >= end) {
      uint64_t last = end - 1;
      FstRegistryCell *cell = (FstRegistryCell *)taosArrayGet(registry->table, last);
      //clone from bNode, refactor later
      fstBuilderNodeCloneFrom(cell->node, bNode);

      fstRegistryCellPromote(registry->table, last, start);
      FstRegistryCell *cCell = taosArrayGet(registry->table, start);
      entry->state = NOTFOUND;
      entry->cell = cCell;
    }
  }
  return entry;
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//A sentinel value used to indicate an empty final state
|
||||||
|
const CompiledAddr EMPTY_ADDRESS = 0;
|
||||||
|
/// A sentinel value used to indicate an invalid state.
|
||||||
|
const CompiledAddr NONE_ADDRESS = 1;
|
||||||
|
|
||||||
|
// This version number is written to every finite state transducer created by
|
||||||
|
// this crate. When a finite state transducer is read, its version number is
|
||||||
|
// checked against this value.
|
||||||
|
const uint64_t version = 3;
|
||||||
|
// The threshold (in number of transitions) at which an index is created for
|
||||||
|
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||||
|
|
||||||
|
const uint64_t TRANS_INDEX_THRESHOLD = 32;
|
||||||
|
|
||||||
|
|
||||||
|
//uint8_t commonInput(uint8_t idx) {
|
||||||
|
// if (idx == 0) { return -1; }
|
||||||
|
// else {
|
||||||
|
// return COMMON_INPUTS_INV[idx - 1];
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
//
|
||||||
|
//uint8_t commonIdx(uint8_t v, uint8_t max) {
|
||||||
|
// uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256;
|
||||||
|
// return v > max ? 0: v;
|
||||||
|
//}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Number of bytes (1..8) needed to store `n` without leading zero bytes.
// Zero still takes one byte.
uint8_t packSize(uint64_t n) {
  uint8_t bytes = 1;
  // Grow while something remains above the low `bytes` bytes.
  while (bytes < 8 && (n >> (8 * bytes)) != 0) {
    bytes++;
  }
  return bytes;
}
|
||||||
|
|
||||||
|
// Decode a little-endian unsigned integer from the first `sz` bytes of `ch`.
//
// ch : source bytes, least significant first
// sz : number of bytes to read; values above 8 are clamped to 8 because a
//      uint64_t cannot hold more
//
// Returns the decoded value (0 when sz == 0).
uint64_t unpackUint64(uint8_t *ch, uint8_t sz) {
  uint64_t n = 0;
  if (sz > 8) {
    sz = 8;
  }
  for (uint8_t i = 0; i < sz; i++) {
    // Widen before shifting: shifting the promoted int overflows for i >= 3
    // with the top bit set and is undefined for i >= 4.
    n |= (uint64_t)ch[i] << (8 * i);
  }
  return n;
}
|
||||||
|
// Byte width needed to store the delta between a node and the target of one
// of its transitions. A transition to EMPTY_ADDRESS is stored as
// EMPTY_ADDRESS itself rather than as a delta.
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) {
  return packSize(transAddr == EMPTY_ADDRESS ? EMPTY_ADDRESS : nodeAddr - transAddr);
}
|
||||||
|
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) {
|
||||||
|
uint64_t delta = unpackUint64(data, len);
|
||||||
|
// delta_add = u64_to_usize
|
||||||
|
if (delta == EMPTY_ADDRESS) {
|
||||||
|
return EMPTY_ADDRESS;
|
||||||
|
} else {
|
||||||
|
return nodeAddr - delta;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fst slice func
|
||||||
|
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) {
|
||||||
|
FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = dLen - 1};
|
||||||
|
return slice;
|
||||||
|
}
|
||||||
|
// Return a sub-view [start, end] (inclusive) of `slice`. The bytes are
// shared with the source slice, not copied.
//
// If either bound lies outside the buffer or start > end, the result is an
// all-zero slice (data == NULL), which fstSliceEmpty() reports as empty.
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) {
  // Zero-initialise so the error path does not hand back indeterminate fields.
  FstSlice t = {0};
  if (start >= slice->dLen || end >= slice->dLen || start > end) {
    return t;
  }

  t.data = slice->data;
  t.dLen = slice->dLen;
  t.start = start;
  t.end = end;
  return t;
}
|
||||||
|
// A slice is empty when it has no backing buffer or a non-positive length.
bool fstSliceEmpty(FstSlice *slice) {
  if (slice->data == NULL) {
    return true;
  }
  return slice->dLen <= 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
add_executable(indexTest "")
|
||||||
|
target_sources(indexTest
|
||||||
|
PRIVATE
|
||||||
|
"../src/index.c"
|
||||||
|
"indexTests.cpp"
|
||||||
|
)
|
||||||
|
target_include_directories ( indexTest
|
||||||
|
PUBLIC
|
||||||
|
"${CMAKE_SOURCE_DIR}/include/libs/index"
|
||||||
|
"${CMAKE_CURRENT_SOURCE_DIR}/../inc"
|
||||||
|
)
|
||||||
|
target_link_libraries (indexTest
|
||||||
|
os
|
||||||
|
util
|
||||||
|
common
|
||||||
|
gtest_main
|
||||||
|
index
|
||||||
|
)
|
||||||
|
|
||||||
|
add_test(
|
||||||
|
NAME index_test
|
||||||
|
COMMAND indexTest
|
||||||
|
)
|
|
@ -0,0 +1,59 @@
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
#include "index.h"
|
||||||
|
#include "indexInt.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Smoke test: open an index, write 100000 documents with four tags each,
// then run a two-term MUST query and print the matching ids.
TEST(IndexTest, index_create_test) {
  SIndexOpts *opts = indexOptsCreate();
  SIndex *index = indexOpen(opts, "./test");
  if (index == NULL) {
    std::cout << "index open failed" << std::endl;
    // Nothing below can work without an index; fail instead of crashing.
    indexOptsDestroy(opts);
    FAIL() << "indexOpen returned NULL";
  }

  // write
  for (int i = 0; i < 100000; i++) {
    SIndexMultiTerm* terms = indexMultiTermCreate();
    std::string val = "field";

    indexMultiTermAdd(terms, "tag1", strlen("tag1"), val.c_str(), val.size());

    val.append(std::to_string(i));
    indexMultiTermAdd(terms, "tag2", strlen("tag2"), val.c_str(), val.size());

    val.insert(0, std::to_string(i));
    indexMultiTermAdd(terms, "tag3", strlen("tag3"), val.c_str(), val.size());

    val.append("const");
    indexMultiTermAdd(terms, "tag4", strlen("tag4"), val.c_str(), val.size());

    indexPut(index, terms, i);
    indexMultiTermDestroy(terms);
  }

  // query
  SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST);

  indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX);
  indexMultiTermQueryAdd(multiQuery, "tag3", strlen("tag3"), "0field0", strlen("0field0"), QUERY_TERM);

  SArray *result = (SArray *)taosArrayInit(10, sizeof(int));
  indexSearch(index, multiQuery, result);

  std::cout << "taos'size : " << taosArrayGetSize(result) << std::endl;
  for (size_t i = 0; i < taosArrayGetSize(result); i++) {
    int *v = (int *)taosArrayGet(result, i);
    std::cout << "value --->" << *v << std::endl;
  }
  // The result array was previously never released.
  taosArrayDestroy(result);
  indexMultiTermQueryDestroy(multiQuery);

  indexOptsDestroy(opts);
  indexClose(index);
}
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
|
#include "thash.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
#include "sync_raft_impl.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "sync_raft_quorum.h"
|
#include "sync_raft_quorum.h"
|
||||||
|
@ -43,9 +44,9 @@ struct SSyncRaft {
|
||||||
// owner sync node
|
// owner sync node
|
||||||
SSyncNode* pNode;
|
SSyncNode* pNode;
|
||||||
|
|
||||||
SSyncCluster cluster;
|
// hash map nodeId -> SNodeInfo*
|
||||||
|
SHashObj* nodeInfoMap;
|
||||||
|
|
||||||
int selfIndex;
|
|
||||||
SyncNodeId selfId;
|
SyncNodeId selfId;
|
||||||
SyncGroupId selfGroupId;
|
SyncGroupId selfGroupId;
|
||||||
|
|
||||||
|
|
|
@ -39,8 +39,6 @@ struct SSyncRaftLog {
|
||||||
SyncIndex commitIndex;
|
SyncIndex commitIndex;
|
||||||
|
|
||||||
SyncIndex appliedIndex;
|
SyncIndex appliedIndex;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
SSyncRaftLog* syncRaftLogOpen();
|
SSyncRaftLog* syncRaftLogOpen();
|
||||||
|
|
|
@ -20,11 +20,11 @@
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
|
|
||||||
// syncRaftReplicate sends an append RPC with new entries to the given peer,
|
// syncRaftMaybeSendAppend sends an append RPC with new entries to the given peer,
|
||||||
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
||||||
// argument controls whether messages with no entries will be sent
|
// argument controls whether messages with no entries will be sent
|
||||||
// ("empty" messages are useful to convey updated Commit indexes, but
|
// ("empty" messages are useful to convey updated Commit indexes, but
|
||||||
// are undesirable when we're sending multiple messages in a batch).
|
// are undesirable when we're sending multiple messages in a batch).
|
||||||
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty);
|
bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty);
|
||||||
|
|
||||||
#endif /* TD_SYNC_RAFT_REPLICATION_H */
|
#endif /* TD_SYNC_RAFT_REPLICATION_H */
|
||||||
|
|
|
@ -13,13 +13,13 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "raft_configuration.h"
|
#ifndef _TD_LIBS_SYNC_CONST_H
|
||||||
#include "raft.h"
|
#define _TD_LIBS_SYNC_CONST_H
|
||||||
|
|
||||||
int syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id) {
|
#include "sync.h"
|
||||||
return (int)(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
int syncRaftConfigurationVoterCount(SSyncRaft *pRaft) {
|
static int kSyncRaftMaxInflghtMsgs = 20;
|
||||||
return pRaft->cluster.replica;
|
|
||||||
}
|
static SyncIndex kMaxCommitIndex = UINT64_MAX;
|
||||||
|
|
||||||
|
#endif /* _TD_LIBS_SYNC_CONST_H */
|
|
@ -33,6 +33,11 @@ struct SSyncRaftChanger {
|
||||||
typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
||||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
|
// Simple carries out a series of configuration changes that (in aggregate)
|
||||||
|
// mutates the incoming majority config Voters[0] by at most one. This method
|
||||||
|
// will return an error if that is not the case, if the resulting quorum is
|
||||||
|
// zero, or if the configuration is in a joint state (i.e. if there is an
|
||||||
|
// outgoing configuration).
|
||||||
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css,
|
||||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,8 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft);
|
||||||
|
|
||||||
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType);
|
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType);
|
||||||
|
|
||||||
|
void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType);
|
||||||
|
|
||||||
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
|
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft);
|
||||||
|
|
||||||
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft);
|
void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft);
|
||||||
|
@ -51,4 +53,6 @@ void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState);
|
||||||
|
|
||||||
void syncRaftBroadcastAppend(SSyncRaft* pRaft);
|
void syncRaftBroadcastAppend(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id);
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */
|
||||||
|
|
|
@ -18,54 +18,47 @@
|
||||||
|
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
|
|
||||||
/**
|
// Inflights limits the number of MsgApp (represented by the largest index
|
||||||
* SSyncRaftInflights limits the number of MsgApp (represented by the largest index
|
// contained within) sent to followers but not yet acknowledged by them. Callers
|
||||||
* contained within) sent to followers but not yet acknowledged by them. Callers
|
// use Full() to check whether more messages can be sent, call Add() whenever
|
||||||
* use syncRaftInflightFull() to check whether more messages can be sent,
|
// they are sending a new append, and release "quota" via FreeLE() whenever an
|
||||||
* call syncRaftInflightAdd() whenever they are sending a new append,
|
// ack is received.
|
||||||
* and release "quota" via FreeLE() whenever an ack is received.
|
|
||||||
**/
|
|
||||||
typedef struct SSyncRaftInflights {
|
typedef struct SSyncRaftInflights {
|
||||||
/* the starting index in the buffer */
|
// the starting index in the buffer
|
||||||
int start;
|
int start;
|
||||||
|
|
||||||
/* number of inflights in the buffer */
|
// number of inflights in the buffer
|
||||||
int count;
|
int count;
|
||||||
|
|
||||||
/* the size of the buffer */
|
// the size of the buffer
|
||||||
int size;
|
int size;
|
||||||
|
|
||||||
/**
|
// buffer contains the index of the last entry
|
||||||
* buffer contains the index of the last entry
|
// inside one message.
|
||||||
* inside one message.
|
|
||||||
**/
|
|
||||||
SyncIndex* buffer;
|
SyncIndex* buffer;
|
||||||
} SSyncRaftInflights;
|
} SSyncRaftInflights;
|
||||||
|
|
||||||
SSyncRaftInflights* syncRaftOpenInflights(int size);
|
SSyncRaftInflights* syncRaftOpenInflights(int size);
|
||||||
void syncRaftCloseInflights(SSyncRaftInflights*);
|
void syncRaftCloseInflights(SSyncRaftInflights*);
|
||||||
|
|
||||||
|
// reset frees all inflights.
|
||||||
static FORCE_INLINE void syncRaftInflightReset(SSyncRaftInflights* inflights) {
|
static FORCE_INLINE void syncRaftInflightReset(SSyncRaftInflights* inflights) {
|
||||||
inflights->count = 0;
|
inflights->count = 0;
|
||||||
inflights->start = 0;
|
inflights->start = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Full returns true if no more messages can be sent at the moment.
|
||||||
static FORCE_INLINE bool syncRaftInflightFull(SSyncRaftInflights* inflights) {
|
static FORCE_INLINE bool syncRaftInflightFull(SSyncRaftInflights* inflights) {
|
||||||
return inflights->count == inflights->size;
|
return inflights->count == inflights->size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Add notifies the Inflights that a new message with the given index is being
|
||||||
* syncRaftInflightAdd notifies the Inflights that a new message with the given index is being
|
// dispatched. Full() must be called prior to Add() to verify that there is room
|
||||||
* dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd()
|
// for one more message, and consecutive calls to Add() must provide a
|
||||||
* to verify that there is room for one more message,
|
// monotonic sequence of indexes.
|
||||||
* and consecutive calls to add syncRaftInflightAdd() must provide a
|
|
||||||
* monotonic sequence of indexes.
|
|
||||||
**/
|
|
||||||
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex);
|
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex);
|
||||||
|
|
||||||
/**
|
// FreeLE frees the inflights smaller than or equal to the given `to` flight.
|
||||||
* syncRaftInflightFreeLE frees the inflights smaller or equal to the given `to` flight.
|
|
||||||
**/
|
|
||||||
void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex);
|
void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _TD_LIBS_SYNC_RAFT_NODE_MAP_H
|
||||||
|
#define _TD_LIBS_SYNC_RAFT_NODE_MAP_H
|
||||||
|
|
||||||
|
#include "thash.h"
|
||||||
|
#include "sync.h"
|
||||||
|
#include "sync_type.h"
|
||||||
|
|
||||||
|
struct SSyncRaftNodeMap {
|
||||||
|
SHashObj* nodeIdMap;
|
||||||
|
};
|
||||||
|
|
||||||
|
void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap);
|
||||||
|
void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap);
|
||||||
|
|
||||||
|
void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap);
|
||||||
|
|
||||||
|
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||||
|
|
||||||
|
void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to);
|
||||||
|
|
||||||
|
void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to);
|
||||||
|
|
||||||
|
void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||||
|
|
||||||
|
void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||||
|
|
||||||
|
int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap);
|
||||||
|
|
||||||
|
// returns true when the end has been reached
|
||||||
|
bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId);
|
||||||
|
|
||||||
|
bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
|
#endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
#include "sync_raft_inflights.h"
|
#include "sync_raft_inflights.h"
|
||||||
|
#include "thash.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* State defines how the leader should interact with the follower.
|
* State defines how the leader should interact with the follower.
|
||||||
|
@ -64,141 +65,123 @@ static const char* kProgressStateString[] = {
|
||||||
"Snapshot",
|
"Snapshot",
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
// Progress represents a follower’s progress in the view of the leader. Leader
|
||||||
* Progress represents a follower’s progress in the view of the leader. Leader maintains
|
// maintains progresses of all followers, and sends entries to the follower
|
||||||
* progresses of all followers, and sends entries to the follower based on its progress.
|
// based on its progress.
|
||||||
**/
|
//
|
||||||
|
// NB(tbg): Progress is basically a state machine whose transitions are mostly
|
||||||
|
// strewn around `*raft.raft`. Additionally, some fields are only used when in a
|
||||||
|
// certain State. All of this isn't ideal.
|
||||||
struct SSyncRaftProgress {
|
struct SSyncRaftProgress {
|
||||||
// index in raft cluster config
|
SyncGroupId groupId;
|
||||||
int selfIndex;
|
|
||||||
|
|
||||||
SyncNodeId id;
|
SyncNodeId id;
|
||||||
|
|
||||||
|
int16_t refCount;
|
||||||
|
|
||||||
SyncIndex nextIndex;
|
SyncIndex nextIndex;
|
||||||
|
|
||||||
SyncIndex matchIndex;
|
SyncIndex matchIndex;
|
||||||
|
|
||||||
/**
|
// State defines how the leader should interact with the follower.
|
||||||
* State defines how the leader should interact with the follower.
|
//
|
||||||
*
|
// When in StateProbe, leader sends at most one replication message
|
||||||
* When in StateProbe, leader sends at most one replication message
|
// per heartbeat interval. It also probes actual progress of the follower.
|
||||||
* per heartbeat interval. It also probes actual progress of the follower.
|
//
|
||||||
*
|
// When in StateReplicate, leader optimistically increases next
|
||||||
* When in StateReplicate, leader optimistically increases next
|
// to the latest entry sent after sending replication message. This is
|
||||||
* to the latest entry sent after sending replication message. This is
|
// an optimized state for fast replicating log entries to the follower.
|
||||||
* an optimized state for fast replicating log entries to the follower.
|
//
|
||||||
*
|
// When in StateSnapshot, leader should have sent out snapshot
|
||||||
* When in StateSnapshot, leader should have sent out snapshot
|
// before and stops sending any replication message.
|
||||||
* before and stops sending any replication message.
|
|
||||||
**/
|
|
||||||
ESyncRaftProgressState state;
|
ESyncRaftProgressState state;
|
||||||
|
|
||||||
/**
|
// PendingSnapshot is used in StateSnapshot.
|
||||||
* pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT.
|
// If there is a pending snapshot, the pendingSnapshot will be set to the
|
||||||
* If there is a pending snapshot, the pendingSnapshotIndex will be set to the
|
// index of the snapshot. If pendingSnapshot is set, the replication process of
|
||||||
* index of the snapshot. If pendingSnapshotIndex is set, the replication process of
|
// this Progress will be paused. raft will not resend snapshot until the pending one
|
||||||
* this Progress will be paused. raft will not resend snapshot until the pending one
|
// is reported to be failed.
|
||||||
* is reported to be failed.
|
|
||||||
**/
|
|
||||||
SyncIndex pendingSnapshotIndex;
|
SyncIndex pendingSnapshotIndex;
|
||||||
|
|
||||||
/**
|
// RecentActive is true if the progress is recently active. Receiving any messages
|
||||||
* recentActive is true if the progress is recently active. Receiving any messages
|
// from the corresponding follower indicates the progress is active.
|
||||||
* from the corresponding follower indicates the progress is active.
|
// RecentActive can be reset to false after an election timeout.
|
||||||
* RecentActive can be reset to false after an election timeout.
|
//
|
||||||
**/
|
// TODO(tbg): the leader should always have this set to true.
|
||||||
bool recentActive;
|
bool recentActive;
|
||||||
|
|
||||||
/**
|
// ProbeSent is used while this follower is in StateProbe. When ProbeSent is
|
||||||
* probeSent is used while this follower is in StateProbe. When probeSent is
|
// true, raft should pause sending replication message to this peer until
|
||||||
* true, raft should pause sending replication message to this peer until
|
// ProbeSent is reset. See ProbeAcked() and IsPaused().
|
||||||
* probeSent is reset. See ProbeAcked() and IsPaused().
|
|
||||||
**/
|
|
||||||
bool probeSent;
|
bool probeSent;
|
||||||
|
|
||||||
/**
|
// Inflights is a sliding window for the inflight messages.
|
||||||
* inflights is a sliding window for the inflight messages.
|
// Each inflight message contains one or more log entries.
|
||||||
* Each inflight message contains one or more log entries.
|
// The max number of entries per message is defined in raft config as MaxSizePerMsg.
|
||||||
* The max number of entries per message is defined in raft config as MaxSizePerMsg.
|
// Thus inflight effectively limits both the number of inflight messages
|
||||||
* Thus inflight effectively limits both the number of inflight messages
|
// and the bandwidth each Progress can use.
|
||||||
* and the bandwidth each Progress can use.
|
// When inflights is Full, no more messages should be sent.
|
||||||
* When inflights is Full, no more message should be sent.
|
// When a leader sends out a message, the index of the last
|
||||||
* When a leader sends out a message, the index of the last
|
// entry should be added to inflights. The index MUST be added
|
||||||
* entry should be added to inflights. The index MUST be added
|
// into inflights in order.
|
||||||
* into inflights in order.
|
// When a leader receives a reply, the previous inflights should
|
||||||
* When a leader receives a reply, the previous inflights should
|
// be freed by calling inflights.FreeLE with the index of the last
|
||||||
* be freed by calling inflights.FreeLE with the index of the last
|
// received entry.
|
||||||
* received entry.
|
|
||||||
**/
|
|
||||||
SSyncRaftInflights* inflights;
|
SSyncRaftInflights* inflights;
|
||||||
|
|
||||||
/**
|
// IsLearner is true if this progress is tracked for a learner.
|
||||||
* IsLearner is true if this progress is tracked for a learner.
|
|
||||||
**/
|
|
||||||
bool isLearner;
|
bool isLearner;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SSyncRaftProgressMap {
|
struct SSyncRaftProgressMap {
|
||||||
SSyncRaftProgress progress[TSDB_MAX_REPLICA];
|
// map nodeId -> SSyncRaftProgress*
|
||||||
|
SHashObj* progressMap;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) {
|
static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) {
|
||||||
return kProgressStateString[progress->state];
|
return kProgressStateString[progress->state];
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||||||
|
|
||||||
/**
|
// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
||||||
* syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
// optionally and if larger, the index of the pending snapshot.
|
||||||
* optionally and if larger, the index of the pending snapshot.
|
|
||||||
**/
|
|
||||||
void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress);
|
void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
/**
|
// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
|
||||||
* syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
|
|
||||||
**/
|
|
||||||
void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress);
|
void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
/**
|
// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
||||||
* syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
// index acked by it. The method returns false if the given n index comes from
|
||||||
* index acked by it. The method returns false if the given n index comes from
|
// an outdated message. Otherwise it updates the progress and returns true.
|
||||||
* an outdated message. Otherwise it updates the progress and returns true.
|
|
||||||
**/
|
|
||||||
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex);
|
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex);
|
||||||
|
|
||||||
/**
|
// OptimisticUpdate signals that appends all the way up to and including index n
|
||||||
* syncRaftProgressOptimisticNextIndex signals that appends all the way up to and including index n
|
// are in-flight. As a result, Next is increased to n+1.
|
||||||
* are in-flight. As a result, Next is increased to n+1.
|
|
||||||
**/
|
|
||||||
static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) {
|
static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) {
|
||||||
progress->nextIndex = nextIndex + 1;
|
progress->nextIndex = nextIndex + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
|
||||||
* syncRaftProgressMaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
|
// arguments are the index of the append message rejected by the follower, and
|
||||||
* arguments are the index of the append message rejected by the follower, and
|
// the hint that we want to decrease to.
|
||||||
* the hint that we want to decrease to.
|
//
|
||||||
*
|
// Rejections can happen spuriously as messages are sent out of order or
|
||||||
* Rejections can happen spuriously as messages are sent out of order or
|
// duplicated. In such cases, the rejection pertains to an index that the
|
||||||
* duplicated. In such cases, the rejection pertains to an index that the
|
// Progress already knows was previously acknowledged, and false is returned
|
||||||
* Progress already knows were previously acknowledged, and false is returned
|
// without changing the Progress.
|
||||||
* without changing the Progress.
|
//
|
||||||
*
|
// If the rejection is genuine, Next is lowered sensibly, and the Progress is
|
||||||
* If the rejection is genuine, Next is lowered sensibly, and the Progress is
|
// cleared for sending log entries.
|
||||||
* cleared for sending log entries.
|
|
||||||
**/
|
|
||||||
bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
||||||
SyncIndex rejected, SyncIndex matchHint);
|
SyncIndex rejected, SyncIndex matchHint);
|
||||||
|
|
||||||
/**
|
// IsPaused returns whether sending log entries to this node has been throttled.
|
||||||
* syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled.
|
// This is done when a node has rejected recent MsgApps, is currently waiting
|
||||||
* This is done when a node has rejected recent MsgApps, is currently waiting
|
// for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
||||||
* for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
// operation, this is false. A throttled node will be contacted less frequently
|
||||||
* operation, this is false. A throttled node will be contacted less frequently
|
// until it has reached a state in which it's able to accept a steady stream of
|
||||||
* until it has reached a state in which it's able to accept a steady stream of
|
// log entries again.
|
||||||
* log entries again.
|
|
||||||
**/
|
|
||||||
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress);
|
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) {
|
static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) {
|
||||||
|
@ -221,22 +204,35 @@ static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progres
|
||||||
return progress->recentActive;
|
return progress->recentActive;
|
||||||
}
|
}
|
||||||
|
|
||||||
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap);
|
||||||
|
void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap);
|
||||||
|
void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to);
|
||||||
|
|
||||||
|
SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||||
|
|
||||||
|
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress);
|
||||||
|
|
||||||
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||||
|
|
||||||
|
bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* return true if progress's log is up-to-date
|
* return true if progress's log is up-to-date
|
||||||
**/
|
**/
|
||||||
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||||||
|
|
||||||
|
// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending
|
||||||
|
// snapshot index.
|
||||||
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex);
|
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex);
|
||||||
|
|
||||||
void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to);
|
void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to);
|
||||||
|
|
||||||
void syncRaftProgressMapCopy(const SSyncRaftProgressMap* from, SSyncRaftProgressMap* to);
|
// returns true when the end has been reached
|
||||||
|
bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress);
|
||||||
|
|
||||||
|
bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg);
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,9 @@
|
||||||
#include "sync_raft_quorum_joint.h"
|
#include "sync_raft_quorum_joint.h"
|
||||||
#include "sync_raft_progress.h"
|
#include "sync_raft_progress.h"
|
||||||
#include "sync_raft_proto.h"
|
#include "sync_raft_proto.h"
|
||||||
|
#include "thash.h"
|
||||||
|
|
||||||
|
// Config reflects the configuration tracked in a ProgressTracker.
|
||||||
struct SSyncRaftProgressTrackerConfig {
|
struct SSyncRaftProgressTrackerConfig {
|
||||||
SSyncRaftQuorumJointConfig voters;
|
SSyncRaftQuorumJointConfig voters;
|
||||||
|
|
||||||
|
@ -83,34 +85,47 @@ struct SSyncRaftProgressTracker {
|
||||||
|
|
||||||
SSyncRaftProgressMap progressMap;
|
SSyncRaftProgressMap progressMap;
|
||||||
|
|
||||||
ESyncRaftVoteType votes[TSDB_MAX_REPLICA];
|
// nodeid -> ESyncRaftVoteType map
|
||||||
|
SHashObj* votesMap;
|
||||||
|
|
||||||
int maxInflightMsgs;
|
int maxInflightMsgs;
|
||||||
|
|
||||||
|
SSyncRaft* pRaft;
|
||||||
};
|
};
|
||||||
|
|
||||||
SSyncRaftProgressTracker* syncRaftOpenProgressTracker();
|
SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft);
|
||||||
|
|
||||||
|
void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config);
|
||||||
|
void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config);
|
||||||
|
|
||||||
|
void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config);
|
||||||
|
|
||||||
|
// ResetVotes prepares for a new round of vote counting via recordVote.
|
||||||
void syncRaftResetVotes(SSyncRaftProgressTracker*);
|
void syncRaftResetVotes(SSyncRaftProgressTracker*);
|
||||||
|
|
||||||
typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg);
|
|
||||||
void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg);
|
void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg);
|
||||||
|
|
||||||
/**
|
// RecordVote records that the node with the given id voted for this Raft
|
||||||
* syncRaftRecordVote records that the node with the given id voted for this Raft
|
// instance if v == true (and declined it otherwise).
|
||||||
* instance if v == true (and declined it otherwise).
|
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant);
|
||||||
**/
|
|
||||||
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant);
|
|
||||||
|
|
||||||
void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressTrackerConfig* result);
|
void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to);
|
||||||
|
|
||||||
int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
/**
|
// TallyVotes returns the number of granted and rejected Votes, and whether the
|
||||||
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
// election outcome is known.
|
||||||
* election outcome is known.
|
|
||||||
**/
|
|
||||||
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
|
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted);
|
||||||
|
|
||||||
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs);
|
void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs);
|
||||||
|
|
||||||
|
// Committed returns the largest log index known to be committed based on what
|
||||||
|
// the voting members of the group have acknowledged.
|
||||||
|
SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker);
|
||||||
|
|
||||||
|
// QuorumActive returns true if the quorum is active from the view of the local
|
||||||
|
// raft state machine. Otherwise, it returns false.
|
||||||
|
bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker);
|
||||||
|
|
||||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#define TD_SYNC_RAFT_PROTO_H
|
#define TD_SYNC_RAFT_PROTO_H
|
||||||
|
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
|
#include "sync_raft_node_map.h"
|
||||||
|
|
||||||
typedef enum ESyncRaftConfChangeType {
|
typedef enum ESyncRaftConfChangeType {
|
||||||
SYNC_RAFT_Conf_AddNode = 0,
|
SYNC_RAFT_Conf_AddNode = 0,
|
||||||
|
@ -58,4 +59,19 @@ typedef struct SSyncConfigState {
|
||||||
bool autoLeave;
|
bool autoLeave;
|
||||||
} SSyncConfigState;
|
} SSyncConfigState;
|
||||||
|
|
||||||
|
static FORCE_INLINE bool syncRaftConfArrayIsEmpty(const SSyncConfChangeSingleArray* ary) {
|
||||||
|
return ary->n == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Puts *ary into the empty state: no change buffer and a zero count.
// The previous contents are overwritten, not freed.
static FORCE_INLINE void syncRaftInitConfArray(SSyncConfChangeSingleArray* ary) {
  const SSyncConfChangeSingleArray kEmpty = {
    .changes = NULL,
    .n = 0,
  };
  *ary = kEmpty;
}
|
||||||
|
|
||||||
|
// Releases the buffer held in ary->changes and returns the array to the
// empty state (changes == NULL, n == 0), so a stale pointer can neither be
// reused nor freed a second time. The array object itself is not freed.
static FORCE_INLINE void syncRaftFreeConfArray(SSyncConfChangeSingleArray* ary) {
  free(ary->changes);  // free(NULL) is a no-op, so no guard is needed
  ary->changes = NULL;
  ary->n = 0;
}
|
||||||
|
|
||||||
#endif /* TD_SYNC_RAFT_PROTO_H */
|
#endif /* TD_SYNC_RAFT_PROTO_H */
|
||||||
|
|
|
@ -19,24 +19,31 @@
|
||||||
#include "taosdef.h"
|
#include "taosdef.h"
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
|
#include "sync_raft_node_map.h"
|
||||||
|
#include "thash.h"
|
||||||
|
|
||||||
/**
|
// JointConfig is a configuration of two groups of (possibly overlapping)
|
||||||
* SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping)
|
// majority configurations. Decisions require the support of both majorities.
|
||||||
* majority configurations. Decisions require the support of both majorities.
|
|
||||||
**/
|
|
||||||
typedef struct SSyncRaftQuorumJointConfig {
|
typedef struct SSyncRaftQuorumJointConfig {
|
||||||
SSyncRaftNodeMap outgoing;
|
SSyncRaftNodeMap outgoing;
|
||||||
SSyncRaftNodeMap incoming;
|
SSyncRaftNodeMap incoming;
|
||||||
} SSyncRaftQuorumJointConfig;
|
} SSyncRaftQuorumJointConfig;
|
||||||
|
|
||||||
/**
|
// IDs returns a newly initialized map representing the set of voters present
|
||||||
* syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
// in the joint configuration.
|
||||||
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap);
|
||||||
* requires both majority quorums to vote in favor.
|
|
||||||
**/
|
|
||||||
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes);
|
|
||||||
|
|
||||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId);
|
// CommittedIndex returns the largest committed index for the given joint
|
||||||
|
// quorum. An index is jointly committed if it is committed in both constituent
|
||||||
|
// majorities.
|
||||||
|
SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg);
|
||||||
|
|
||||||
|
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
// a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||||
|
// requires both majority quorums to vote in favor.
|
||||||
|
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap);
|
||||||
|
|
||||||
|
void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config);
|
||||||
|
|
||||||
static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||||
return syncRaftIsInNodeMap(&config->outgoing, id);
|
return syncRaftIsInNodeMap(&config->outgoing, id);
|
||||||
|
@ -59,7 +66,19 @@ static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SS
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) {
|
static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) {
|
||||||
memset(&config->outgoing, 0, sizeof(SSyncCluster));
|
syncRaftClearNodeMap(&config->outgoing);
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCE_INLINE bool syncRaftJointConfigIsIncomingEmpty(const SSyncRaftQuorumJointConfig* config) {
|
||||||
|
return syncRaftNodeMapSize(&config->incoming) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCE_INLINE bool syncRaftJointConfigIsOutgoingEmpty(const SSyncRaftQuorumJointConfig* config) {
|
||||||
|
return syncRaftNodeMapSize(&config->outgoing) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCE_INLINE bool syncRaftJointConfigIsInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||||
|
return syncRaftIsInNodeMap(&config->outgoing, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
#include "sync_type.h"
|
#include "sync_type.h"
|
||||||
#include "sync_raft_quorum.h"
|
#include "sync_raft_quorum.h"
|
||||||
|
#include "thash.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
|
@ -26,6 +27,10 @@
|
||||||
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||||
* quorum of no has been reached).
|
* quorum of no has been reached).
|
||||||
**/
|
**/
|
||||||
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes);
|
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap);
|
||||||
|
|
||||||
|
// CommittedIndex computes the committed index from those supplied via the
|
||||||
|
// provided AckedIndexer (for the active config).
|
||||||
|
SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg);
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */
|
#endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
||||||
// this can just take LastIndex and MaxInflight directly instead and cook up
|
// this can just take LastIndex and MaxInflight directly instead and cook up
|
||||||
// the results from that alone.
|
// the results from that alone.
|
||||||
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs);
|
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs,
|
||||||
|
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap);
|
||||||
|
|
||||||
#endif /* TD_SYNC_RAFT_RESTORE_H */
|
#endif /* TD_SYNC_RAFT_RESTORE_H */
|
||||||
|
|
|
@ -32,6 +32,8 @@ typedef struct SSyncRaftProgress SSyncRaftProgress;
|
||||||
typedef struct SSyncRaftProgressMap SSyncRaftProgressMap;
|
typedef struct SSyncRaftProgressMap SSyncRaftProgressMap;
|
||||||
typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig;
|
typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig;
|
||||||
|
|
||||||
|
typedef struct SSyncRaftNodeMap SSyncRaftNodeMap;
|
||||||
|
|
||||||
typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker;
|
typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker;
|
||||||
|
|
||||||
typedef struct SSyncRaftChanger SSyncRaftChanger;
|
typedef struct SSyncRaftChanger SSyncRaftChanger;
|
||||||
|
@ -68,11 +70,6 @@ typedef struct SSyncClusterConfig {
|
||||||
const SSyncCluster* cluster;
|
const SSyncCluster* cluster;
|
||||||
} SSyncClusterConfig;
|
} SSyncClusterConfig;
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int32_t replica;
|
|
||||||
SyncNodeId nodeId[TSDB_MAX_REPLICA];
|
|
||||||
} SSyncRaftNodeMap;
|
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0,
|
SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0,
|
||||||
SYNC_RAFT_CAMPAIGN_ELECTION = 1,
|
SYNC_RAFT_CAMPAIGN_ELECTION = 1,
|
||||||
|
@ -80,9 +77,6 @@ typedef enum {
|
||||||
} ESyncRaftElectionType;
|
} ESyncRaftElectionType;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
// the init vote resp status
|
|
||||||
SYNC_RAFT_VOTE_RESP_UNKNOWN = 0,
|
|
||||||
|
|
||||||
// grant the vote request
|
// grant the vote request
|
||||||
SYNC_RAFT_VOTE_RESP_GRANT = 1,
|
SYNC_RAFT_VOTE_RESP_GRANT = 1,
|
||||||
|
|
||||||
|
@ -90,4 +84,8 @@ typedef enum {
|
||||||
SYNC_RAFT_VOTE_RESP_REJECT = 2,
|
SYNC_RAFT_VOTE_RESP_REJECT = 2,
|
||||||
} ESyncRaftVoteType;
|
} ESyncRaftVoteType;
|
||||||
|
|
||||||
|
typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg);
|
||||||
|
|
||||||
|
typedef void (*matchAckIndexerFp)(SyncNodeId id, void* arg, SyncIndex* index);
|
||||||
|
|
||||||
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
#endif /* _TD_LIBS_SYNC_TYPE_H */
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_configuration.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "raft_log.h"
|
#include "raft_log.h"
|
||||||
#include "sync_raft_restore.h"
|
#include "sync_raft_restore.h"
|
||||||
#include "raft_replication.h"
|
#include "raft_replication.h"
|
||||||
|
@ -59,8 +59,13 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
logStore = &(pRaft->logStore);
|
logStore = &(pRaft->logStore);
|
||||||
fsm = &(pRaft->fsm);
|
fsm = &(pRaft->fsm);
|
||||||
|
|
||||||
|
pRaft->nodeInfoMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK);
|
||||||
|
if (pRaft->nodeInfoMap == NULL) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// init progress tracker
|
// init progress tracker
|
||||||
pRaft->tracker = syncRaftOpenProgressTracker();
|
pRaft->tracker = syncRaftOpenProgressTracker(pRaft);
|
||||||
if (pRaft->tracker == NULL) {
|
if (pRaft->tracker == NULL) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -96,11 +101,22 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) {
|
||||||
.tracker = pRaft->tracker,
|
.tracker = pRaft->tracker,
|
||||||
.lastIndex = syncRaftLogLastIndex(pRaft->log),
|
.lastIndex = syncRaftLogLastIndex(pRaft->log),
|
||||||
};
|
};
|
||||||
if (syncRaftRestoreConfig(&changer, &confState) < 0) {
|
SSyncRaftProgressTrackerConfig config;
|
||||||
|
SSyncRaftProgressMap progressMap;
|
||||||
|
|
||||||
|
if (syncRaftRestoreConfig(&changer, &confState, &config, &progressMap) < 0) {
|
||||||
syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId);
|
syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// save restored config and progress map to tracker
|
||||||
|
syncRaftCopyProgressMap(&progressMap, &pRaft->tracker->progressMap);
|
||||||
|
syncRaftCopyTrackerConfig(&config, &pRaft->tracker->config);
|
||||||
|
|
||||||
|
// free progress map and config
|
||||||
|
syncRaftFreeProgressMap(&progressMap);
|
||||||
|
syncRaftFreeTrackConfig(&config);
|
||||||
|
|
||||||
if (!syncRaftIsEmptyServerState(&serverState)) {
|
if (!syncRaftIsEmptyServerState(&serverState)) {
|
||||||
syncRaftLoadState(pRaft, &serverState);
|
syncRaftLoadState(pRaft, &serverState);
|
||||||
}
|
}
|
||||||
|
@ -140,6 +156,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
|
||||||
int32_t syncRaftTick(SSyncRaft* pRaft) {
|
int32_t syncRaftTick(SSyncRaft* pRaft) {
|
||||||
pRaft->currentTick += 1;
|
pRaft->currentTick += 1;
|
||||||
|
pRaft->tickFp(pRaft);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -151,8 +168,8 @@ static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const ch
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void visitProgressMaybeSendAppend(int i, SSyncRaftProgress* progress, void* arg) {
|
static void visitProgressMaybeSendAppend(SSyncRaftProgress* progress, void* arg) {
|
||||||
syncRaftReplicate(arg, progress, false);
|
syncRaftMaybeSendAppend(arg, progress, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// switchToConfig reconfigures this node to use the provided configuration. It
|
// switchToConfig reconfigures this node to use the provided configuration. It
|
||||||
|
@ -169,13 +186,12 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi
|
||||||
SSyncRaftProgress* progress = NULL;
|
SSyncRaftProgress* progress = NULL;
|
||||||
|
|
||||||
syncRaftConfigState(pRaft->tracker, cs);
|
syncRaftConfigState(pRaft->tracker, cs);
|
||||||
i = syncRaftFindProgressIndexByNodeId(&pRaft->tracker->progressMap, selfId);
|
progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, selfId);
|
||||||
exist = (i != -1);
|
exist = (progress != NULL);
|
||||||
|
|
||||||
// Update whether the node itself is a learner, resetting to false when the
|
// Update whether the node itself is a learner, resetting to false when the
|
||||||
// node is removed.
|
// node is removed.
|
||||||
if (exist) {
|
if (exist) {
|
||||||
progress = &pRaft->tracker->progressMap.progress[i];
|
|
||||||
pRaft->isLearner = progress->isLearner;
|
pRaft->isLearner = progress->isLearner;
|
||||||
} else {
|
} else {
|
||||||
pRaft->isLearner = false;
|
pRaft->isLearner = false;
|
||||||
|
@ -196,7 +212,7 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi
|
||||||
|
|
||||||
// The remaining steps only make sense if this node is the leader and there
|
// The remaining steps only make sense if this node is the leader and there
|
||||||
// are other nodes.
|
// are other nodes.
|
||||||
if (pRaft->state != TAOS_SYNC_STATE_LEADER || cs->voters.replica == 0) {
|
if (pRaft->state != TAOS_SYNC_STATE_LEADER || syncRaftNodeMapSize(&cs->voters) == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -212,8 +228,11 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi
|
||||||
|
|
||||||
// If the leadTransferee was removed or demoted, abort the leadership transfer.
|
// If the leadTransferee was removed or demoted, abort the leadership transfer.
|
||||||
SyncNodeId leadTransferee = pRaft->leadTransferee;
|
SyncNodeId leadTransferee = pRaft->leadTransferee;
|
||||||
if (leadTransferee != SYNC_NON_NODE_ID && !syncRaftIsInNodeMap(&pRaft->tracker->config.voters, leadTransferee)) {
|
if (leadTransferee != SYNC_NON_NODE_ID) {
|
||||||
abortLeaderTransfer(pRaft);
|
if (!syncRaftIsInNodeMap(&pRaft->tracker->config.voters.incoming, leadTransferee) &&
|
||||||
|
!syncRaftIsInNodeMap(&pRaft->tracker->config.voters.outgoing, leadTransferee)) {
|
||||||
|
abortLeaderTransfer(pRaft);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -286,8 +305,8 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg)
|
||||||
* but it will not receive MsgApp or MsgHeartbeat, so it will not create
|
* but it will not receive MsgApp or MsgHeartbeat, so it will not create
|
||||||
* disruptive term increases
|
* disruptive term increases
|
||||||
**/
|
**/
|
||||||
int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from);
|
||||||
if (peerIndex < 0) {
|
if (pNode == NULL) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
SSyncMessage* msg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
SSyncMessage* msg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
@ -295,7 +314,7 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
pRaft->io.send(msg, &(pRaft->cluster.nodeInfo[peerIndex]));
|
pRaft->io.send(msg, pNode);
|
||||||
} else {
|
} else {
|
||||||
// ignore other cases
|
// ignore other cases
|
||||||
syncInfo("[%d:%d] [term:%" PRId64 "] ignored a %d message with lower term from %d [term:%" PRId64 "]",
|
syncInfo("[%d:%d] [term:%" PRId64 "] ignored a %d message with lower term from %d [term:%" PRId64 "]",
|
||||||
|
|
|
@ -16,15 +16,14 @@
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_log.h"
|
#include "raft_log.h"
|
||||||
#include "raft_configuration.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
|
||||||
int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
const RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries);
|
const RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries);
|
||||||
|
|
||||||
int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from);
|
||||||
|
if (pNode == NULL) {
|
||||||
if (peerIndex < 0) {
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,6 +43,6 @@ int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMs
|
||||||
pRaft->selfGroupId, pRaft->selfId, pMsg->from, appendEntries->index);
|
pRaft->selfGroupId, pRaft->selfId, pMsg->from, appendEntries->index);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[peerIndex]));
|
pRaft->io.send(pRespMsg, pNode);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
|
@ -19,24 +19,6 @@
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
|
||||||
int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
if (pRaft->state == TAOS_SYNC_STATE_LEADER) {
|
|
||||||
syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!syncRaftIsPromotable(pRaft)) {
|
|
||||||
syncDebug("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
// if there is pending uncommitted config,cannot start election
|
|
||||||
if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) {
|
|
||||||
syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply",
|
|
||||||
pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
|
||||||
|
|
||||||
if (pRaft->preVote) {
|
if (pRaft->preVote) {
|
||||||
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION);
|
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
|
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_configuration.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "raft_log.h"
|
#include "raft_log.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
|
||||||
|
@ -23,10 +23,11 @@ static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
|
||||||
int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
SSyncMessage* pRespMsg;
|
SSyncMessage* pRespMsg;
|
||||||
int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from);
|
||||||
if (voteIndex == -1) {
|
if (pNode == NULL) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool grant;
|
bool grant;
|
||||||
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||||
SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log);
|
SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log);
|
||||||
|
@ -42,17 +43,19 @@ int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
grant ? "grant" : "reject",
|
grant ? "grant" : "reject",
|
||||||
pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term);
|
pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term);
|
||||||
|
|
||||||
pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[voteIndex]));
|
pRaft->io.send(pRespMsg, pNode);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
if (!(pRaft->voteFor == SYNC_NON_NODE_ID || pMsg->term > pRaft->term || pRaft->voteFor == pMsg->from)) {
|
bool canVote =
|
||||||
return false;
|
// We can vote if this is a repeat of a vote we've already cast...
|
||||||
}
|
pRaft->voteFor == pMsg->from ||
|
||||||
if (!syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm)) {
|
// ...we haven't voted and we don't think there's a leader yet in this term...
|
||||||
return false;
|
(pRaft->voteFor == SYNC_NON_NODE_ID && pRaft->leaderId == SYNC_NON_NODE_ID) ||
|
||||||
}
|
// ...or this is a PreVote for a future term...
|
||||||
|
(pMsg->vote.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION && pMsg->term > pRaft->term);
|
||||||
|
|
||||||
return true;
|
// ...and we believe the candidate is up to date.
|
||||||
|
return canVote && syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm);
|
||||||
}
|
}
|
|
@ -15,7 +15,7 @@
|
||||||
|
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_configuration.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
|
||||||
int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
@ -25,8 +25,8 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
|
||||||
assert(pRaft->state == TAOS_SYNC_STATE_CANDIDATE);
|
assert(pRaft->state == TAOS_SYNC_STATE_CANDIDATE);
|
||||||
|
|
||||||
voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from);
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from);
|
||||||
if (voterIndex == -1) {
|
if (pNode == NULL) {
|
||||||
syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from);
|
syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -45,12 +45,14 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
|
|
||||||
if (result == SYNC_RAFT_VOTE_WON) {
|
if (result == SYNC_RAFT_VOTE_WON) {
|
||||||
if (pRaft->candidateState.inPreVote) {
|
if (pRaft->candidateState.inPreVote) {
|
||||||
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
syncRaftCampaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
||||||
} else {
|
} else {
|
||||||
syncRaftBecomeLeader(pRaft);
|
syncRaftBecomeLeader(pRaft);
|
||||||
|
syncRaftBroadcastAppend(pRaft);
|
||||||
}
|
}
|
||||||
} else if (result == SYNC_RAFT_VOTE_LOST) {
|
} else if (result == SYNC_RAFT_VOTE_LOST) {
|
||||||
|
// pb.MsgPreVoteResp contains future term of pre-candidate
|
||||||
|
// m.Term > r.Term; reuse r.Term
|
||||||
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
|
syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,14 +22,14 @@
|
||||||
static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress);
|
||||||
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
||||||
SyncIndex prevIndex, SyncTerm prevTerm,
|
SyncIndex prevIndex, SyncTerm prevTerm,
|
||||||
const SSyncRaftEntry *entries, int nEntry);
|
SSyncRaftEntry *entries, int nEntry);
|
||||||
|
|
||||||
// syncRaftReplicate sends an append RPC with new entries to the given peer,
|
// maybeSendAppend sends an append RPC with new entries to the given peer,
|
||||||
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
// if necessary. Returns true if a message was sent. The sendIfEmpty
|
||||||
// argument controls whether messages with no entries will be sent
|
// argument controls whether messages with no entries will be sent
|
||||||
// ("empty" messages are useful to convey updated Commit indexes, but
|
// ("empty" messages are useful to convey updated Commit indexes, but
|
||||||
// are undesirable when we're sending multiple messages in a batch).
|
// are undesirable when we're sending multiple messages in a batch).
|
||||||
bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) {
|
bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) {
|
||||||
assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
|
assert(pRaft->state == TAOS_SYNC_STATE_LEADER);
|
||||||
SyncNodeId nodeId = progress->id;
|
SyncNodeId nodeId = progress->id;
|
||||||
|
|
||||||
|
@ -68,10 +68,13 @@ static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
||||||
|
|
||||||
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
||||||
SyncIndex prevIndex, SyncTerm prevTerm,
|
SyncIndex prevIndex, SyncTerm prevTerm,
|
||||||
const SSyncRaftEntry *entries, int nEntry) {
|
SSyncRaftEntry *entries, int nEntry) {
|
||||||
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, progress->id);
|
||||||
|
if (pNode == NULL) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
SyncIndex lastIndex;
|
SyncIndex lastIndex;
|
||||||
SyncTerm logTerm = prevTerm;
|
SyncTerm logTerm = prevTerm;
|
||||||
SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[progress->selfIndex]);
|
|
||||||
|
|
||||||
SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term,
|
SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term,
|
||||||
prevIndex, prevTerm, pRaft->log->commitIndex,
|
prevIndex, prevTerm, pRaft->log->commitIndex,
|
||||||
|
@ -87,7 +90,7 @@ static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress,
|
||||||
case PROGRESS_STATE_REPLICATE:
|
case PROGRESS_STATE_REPLICATE:
|
||||||
lastIndex = entries[nEntry - 1].index;
|
lastIndex = entries[nEntry - 1].index;
|
||||||
syncRaftProgressOptimisticNextIndex(progress, lastIndex);
|
syncRaftProgressOptimisticNextIndex(progress, lastIndex);
|
||||||
syncRaftInflightAdd(&progress->inflights, lastIndex);
|
syncRaftInflightAdd(progress->inflights, lastIndex);
|
||||||
break;
|
break;
|
||||||
case PROGRESS_STATE_PROBE:
|
case PROGRESS_STATE_PROBE:
|
||||||
progress->probeSent = true;
|
progress->probeSent = true;
|
||||||
|
|
|
@ -99,7 +99,7 @@ void syncCleanUp() {
|
||||||
SSyncNode* syncStart(const SSyncInfo* pInfo) {
|
SSyncNode* syncStart(const SSyncInfo* pInfo) {
|
||||||
pthread_mutex_lock(&gSyncManager->mutex);
|
pthread_mutex_lock(&gSyncManager->mutex);
|
||||||
|
|
||||||
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId));
|
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId*));
|
||||||
if (ppNode != NULL) {
|
if (ppNode != NULL) {
|
||||||
syncInfo("vgroup %d already exist", pInfo->vgId);
|
syncInfo("vgroup %d already exist", pInfo->vgId);
|
||||||
pthread_mutex_unlock(&gSyncManager->mutex);
|
pthread_mutex_unlock(&gSyncManager->mutex);
|
||||||
|
@ -140,7 +140,7 @@ SSyncNode* syncStart(const SSyncInfo* pInfo) {
|
||||||
void syncStop(const SSyncNode* pNode) {
|
void syncStop(const SSyncNode* pNode) {
|
||||||
pthread_mutex_lock(&gSyncManager->mutex);
|
pthread_mutex_lock(&gSyncManager->mutex);
|
||||||
|
|
||||||
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId));
|
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId*));
|
||||||
if (ppNode == NULL) {
|
if (ppNode == NULL) {
|
||||||
syncInfo("vgroup %d not exist", pNode->vgId);
|
syncInfo("vgroup %d not exist", pNode->vgId);
|
||||||
pthread_mutex_unlock(&gSyncManager->mutex);
|
pthread_mutex_unlock(&gSyncManager->mutex);
|
||||||
|
@ -288,7 +288,7 @@ static void *syncWorkerMain(void *argv) {
|
||||||
|
|
||||||
static void syncNodeTick(void *param, void *tmrId) {
|
static void syncNodeTick(void *param, void *tmrId) {
|
||||||
SyncGroupId vgId = (SyncGroupId)param;
|
SyncGroupId vgId = (SyncGroupId)param;
|
||||||
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId));
|
SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId*));
|
||||||
if (ppNode == NULL) {
|
if (ppNode == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "raft.h"
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
#include "sync_raft_config_change.h"
|
#include "sync_raft_config_change.h"
|
||||||
#include "sync_raft_progress.h"
|
#include "sync_raft_progress.h"
|
||||||
|
@ -40,8 +41,58 @@ static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig*
|
||||||
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||||
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
SSyncRaftProgressMap* progressMap, SyncNodeId id);
|
||||||
// syncRaftChangerSimpleConfig carries out a series of configuration changes that (in aggregate)
|
|
||||||
|
// EnterJoint verifies that the outgoing (=right) majority config of the joint
|
||||||
|
// config is empty and initializes it with a copy of the incoming (=left)
|
||||||
|
// majority config. That is, it transitions from
|
||||||
|
//
|
||||||
|
// (1 2 3)&&()
|
||||||
|
// to
|
||||||
|
// (1 2 3)&&(1 2 3).
|
||||||
|
//
|
||||||
|
// The supplied changes are then applied to the incoming majority config,
|
||||||
|
// resulting in a joint configuration that in terms of the Raft thesis[1]
|
||||||
|
// (Section 4.3) corresponds to `C_{new,old}`.
|
||||||
|
//
|
||||||
|
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
||||||
|
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
|
||||||
|
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = checkAndCopy(changer, config, progressMap);
|
||||||
|
if (ret != 0) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasJointConfig(config)) {
|
||||||
|
syncError("config is already joint");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(syncRaftJointConfigIsIncomingEmpty(&config->voters) == 0) {
|
||||||
|
// We allow adding nodes to an empty config for convenience (testing and
|
||||||
|
// bootstrap), but you can't enter a joint state.
|
||||||
|
syncError("can't make a zero-voter config joint");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the outgoing config.
|
||||||
|
syncRaftJointConfigClearOutgoing(&config->voters);
|
||||||
|
|
||||||
|
// Copy incoming to outgoing.
|
||||||
|
syncRaftCopyNodeMap(&config->voters.incoming, &config->voters.outgoing);
|
||||||
|
|
||||||
|
ret = applyConfig(changer, config, progressMap, css);
|
||||||
|
if (ret != 0) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
config->autoLeave = autoLeave;
|
||||||
|
return checkAndReturn(config, progressMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simple carries out a series of configuration changes that (in aggregate)
|
||||||
// mutates the incoming majority config Voters[0] by at most one. This method
|
// mutates the incoming majority config Voters[0] by at most one. This method
|
||||||
// will return an error if that is not the case, if the resulting quorum is
|
// will return an error if that is not the case, if the resulting quorum is
|
||||||
// zero, or if the configuration is in a joint state (i.e. if there is an
|
// zero, or if the configuration is in a joint state (i.e. if there is an
|
||||||
|
@ -75,132 +126,9 @@ int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChange
|
||||||
return checkAndReturn(config, progressMap);
|
return checkAndReturn(config, progressMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
// EnterJoint verifies that the outgoing (=right) majority config of the joint
|
// apply a change to the configuration. By convention, changes to voters are
|
||||||
// config is empty and initializes it with a copy of the incoming (=left)
|
// always made to the incoming majority config Voters[0]. Voters[1] is either
|
||||||
// majority config. That is, it transitions from
|
// empty or preserves the outgoing majority configuration while in a joint state.
|
||||||
//
|
|
||||||
// (1 2 3)&&()
|
|
||||||
// to
|
|
||||||
// (1 2 3)&&(1 2 3).
|
|
||||||
//
|
|
||||||
// The supplied changes are then applied to the incoming majority config,
|
|
||||||
// resulting in a joint configuration that in terms of the Raft thesis[1]
|
|
||||||
// (Section 4.3) corresponds to `C_{new,old}`.
|
|
||||||
//
|
|
||||||
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
|
||||||
int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css,
|
|
||||||
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
ret = checkAndCopy(changer, config, progressMap);
|
|
||||||
if (ret != 0) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
if (hasJointConfig(config)) {
|
|
||||||
syncError("config is already joint");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(config->voters.incoming.replica == 0) {
|
|
||||||
// We allow adding nodes to an empty config for convenience (testing and
|
|
||||||
// bootstrap), but you can't enter a joint state.
|
|
||||||
syncError("can't make a zero-voter config joint");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear the outgoing config.
|
|
||||||
syncRaftJointConfigClearOutgoing(&config->voters);
|
|
||||||
|
|
||||||
// Copy incoming to outgoing.
|
|
||||||
memcpy(&config->voters.outgoing, &config->voters.incoming, sizeof(SSyncCluster));
|
|
||||||
|
|
||||||
ret = applyConfig(changer, config, progressMap, css);
|
|
||||||
if (ret != 0) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
config->autoLeave = autoLeave;
|
|
||||||
return checkAndReturn(config, progressMap);
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkAndCopy copies the tracker's config and progress map (deeply enough for
|
|
||||||
// the purposes of the Changer) and returns those copies. It returns an error
|
|
||||||
// if checkInvariants does.
|
|
||||||
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
|
||||||
syncRaftCloneTrackerConfig(&changer->tracker->config, config);
|
|
||||||
int i;
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
SSyncRaftProgress* progress = &(changer->tracker->progressMap.progress[i]);
|
|
||||||
if (progress->id == SYNC_NON_NODE_ID) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
syncRaftCopyProgress(progress, &(progressMap->progress[i]));
|
|
||||||
}
|
|
||||||
return checkAndReturn(config, progressMap);
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkAndReturn calls checkInvariants on the input and returns either the
|
|
||||||
// resulting error or the input.
|
|
||||||
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
|
||||||
if (checkInvariants(config, progressMap) != 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkInvariants makes sure that the config and progress are compatible with
|
|
||||||
// each other. This is used to check both what the Changer is initialized with,
|
|
||||||
// as well as what it returns.
|
|
||||||
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
|
||||||
int ret = syncRaftCheckProgress(config, progressMap);
|
|
||||||
if (ret != 0) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
int i;
|
|
||||||
// Any staged learner was staged because it could not be directly added due
|
|
||||||
// to a conflicting voter in the outgoing config.
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (!syncRaftJointConfigInOutgoing(&config->voters, config->learnersNext.nodeId[i])) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && progressMap->progress[i].isLearner) {
|
|
||||||
syncError("%d is in LearnersNext, but is already marked as learner", progressMap->progress[i].id);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Conversely Learners and Voters doesn't intersect at all.
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (syncRaftJointConfigInIncoming(&config->voters, config->learners.nodeId[i])) {
|
|
||||||
syncError("%d is in Learners and voter.incoming", progressMap->progress[i].id);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (progressMap->progress[i].id != SYNC_NON_NODE_ID && !progressMap->progress[i].isLearner) {
|
|
||||||
syncError("%d is in Learners, but is not marked as learner", progressMap->progress[i].id);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!hasJointConfig(config)) {
|
|
||||||
// We enforce that empty maps are nil instead of zero.
|
|
||||||
if (config->learnersNext.replica > 0) {
|
|
||||||
syncError("cfg.LearnersNext must be nil when not joint");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (config->autoLeave) {
|
|
||||||
syncError("AutoLeave must be false when not joint");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) {
|
|
||||||
return config->voters.outgoing.replica > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) {
|
SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) {
|
||||||
int i;
|
int i;
|
||||||
|
@ -227,7 +155,7 @@ static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (config->voters.incoming.replica == 0) {
|
if (syncRaftJointConfigIsIncomingEmpty(&config->voters)) {
|
||||||
syncError("removed all voters");
|
syncError("removed all voters");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -235,86 +163,16 @@ static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// symdiff returns the count of the symmetric difference between the sets of
|
|
||||||
// uint64s, i.e. len( (l - r) \union (r - l)).
|
|
||||||
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) {
|
|
||||||
int n;
|
|
||||||
int i;
|
|
||||||
int j0, j1;
|
|
||||||
const SSyncRaftNodeMap* pairs[2][2] = {
|
|
||||||
{l, r}, // count elems in l but not in r
|
|
||||||
{r, l}, // count elems in r but not in l
|
|
||||||
};
|
|
||||||
|
|
||||||
for (n = 0, i = 0; i < 2; ++i) {
|
|
||||||
const SSyncRaftNodeMap** pp = pairs[i];
|
|
||||||
|
|
||||||
const SSyncRaftNodeMap* p0 = pp[0];
|
|
||||||
const SSyncRaftNodeMap* p1 = pp[1];
|
|
||||||
for (j0 = 0; j0 < TSDB_MAX_REPLICA; ++j0) {
|
|
||||||
SyncNodeId id = p0->nodeId[j0];
|
|
||||||
if (id == SYNC_NON_NODE_ID) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (j1 = 0; j1 < p1->replica; ++j1) {
|
|
||||||
if (p1->nodeId[j1] != SYNC_NON_NODE_ID && p1->nodeId[j1] != id) {
|
|
||||||
n+=1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Placeholder: the body is empty, so a call leaves config and progressMap untouched.
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
|
|
||||||
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
|
||||||
int i;
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (nodeMap->nodeId[i] == id) {
|
|
||||||
nodeMap->replica -= 1;
|
|
||||||
nodeMap->nodeId[i] = SYNC_NON_NODE_ID;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(nodeMap->replica >= 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// nilAwareAdd populates a map entry, creating the map if necessary.
|
|
||||||
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
|
||||||
int i, j;
|
|
||||||
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (nodeMap->nodeId[i] == id) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (j == -1 && nodeMap->nodeId[i] == SYNC_NON_NODE_ID) {
|
|
||||||
j = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(j != -1);
|
|
||||||
nodeMap->nodeId[j] = id;
|
|
||||||
nodeMap->replica += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// makeVoter adds or promotes the given ID to be a voter in the incoming
|
// makeVoter adds or promotes the given ID to be a voter in the incoming
|
||||||
// majority config.
|
// majority config.
|
||||||
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id);
|
||||||
if (i == -1) {
|
if (progress == NULL) {
|
||||||
initProgress(changer, config, progressMap, id, false);
|
initProgress(changer, config, progressMap, id, false);
|
||||||
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(i != -1);
|
|
||||||
SSyncRaftProgress* progress = &(progressMap->progress[i]);
|
|
||||||
|
|
||||||
progress->isLearner = false;
|
progress->isLearner = false;
|
||||||
nilAwareDelete(&config->learners, id);
|
nilAwareDelete(&config->learners, id);
|
||||||
|
@ -337,14 +195,12 @@ static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig*
|
||||||
// LeaveJoint().
|
// LeaveJoint().
|
||||||
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id);
|
||||||
if (i == -1) {
|
if (progress == NULL) {
|
||||||
initProgress(changer, config, progressMap, id, false);
|
initProgress(changer, config, progressMap, id, true);
|
||||||
i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(i != -1);
|
|
||||||
SSyncRaftProgress* progress = &(progressMap->progress[i]);
|
|
||||||
if (progress->isLearner) {
|
if (progress->isLearner) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -352,15 +208,15 @@ static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfi
|
||||||
removeNodeId(changer, config, progressMap, id);
|
removeNodeId(changer, config, progressMap, id);
|
||||||
|
|
||||||
// ... but save the Progress.
|
// ... but save the Progress.
|
||||||
syncRaftAddToProgressMap(progressMap, id);
|
syncRaftAddToProgressMap(progressMap, progress);
|
||||||
|
|
||||||
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
|
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
|
||||||
// if the peer is still tracked as a voter in the outgoing config. It will
|
// if the peer is still tracked as a voter in the outgoing config. It will
|
||||||
// be turned into a learner in LeaveJoint().
|
// be turned into a learner in LeaveJoint().
|
||||||
//
|
//
|
||||||
// Otherwise, add a regular learner right away.
|
// Otherwise, add a regular learner right away.
|
||||||
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
|
bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id);
|
||||||
if (inOutgoing) {
|
if (inInOutgoing) {
|
||||||
nilAwareAdd(&config->learnersNext, id);
|
nilAwareAdd(&config->learnersNext, id);
|
||||||
} else {
|
} else {
|
||||||
nilAwareAdd(&config->learners, id);
|
nilAwareAdd(&config->learners, id);
|
||||||
|
@ -371,8 +227,8 @@ static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfi
|
||||||
// removeNodeId this peer as a voter or learner from the incoming config.
|
// removeNodeId this peer as a voter or learner from the incoming config.
|
||||||
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
int i = syncRaftFindProgressIndexByNodeId(progressMap, id);
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id);
|
||||||
if (i == -1) {
|
if (progress == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -381,8 +237,173 @@ static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf
|
||||||
nilAwareDelete(&config->learnersNext, id);
|
nilAwareDelete(&config->learnersNext, id);
|
||||||
|
|
||||||
// If the peer is still a voter in the outgoing config, keep the Progress.
|
// If the peer is still a voter in the outgoing config, keep the Progress.
|
||||||
bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id);
|
bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id);
|
||||||
if (!inOutgoing) {
|
if (!inInOutgoing) {
|
||||||
syncRaftRemoveFromProgressMap(progressMap, id);
|
syncRaftRemoveFromProgressMap(progressMap, id);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// initProgress initializes a new progress for the given node or learner.
|
||||||
|
static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config,
|
||||||
|
SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) {
|
||||||
|
if (!isLearner) {
|
||||||
|
syncRaftJointConfigAddToIncoming(&config->voters, id);
|
||||||
|
} else {
|
||||||
|
nilAwareAdd(&config->learners, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
SSyncRaftProgress* pProgress = (SSyncRaftProgress*)malloc(sizeof(SSyncRaftProgress));
|
||||||
|
assert (pProgress != NULL);
|
||||||
|
*pProgress = (SSyncRaftProgress) {
|
||||||
|
// Initializing the Progress with the last index means that the follower
|
||||||
|
// can be probed (with the last index).
|
||||||
|
//
|
||||||
|
// TODO(tbg): seems awfully optimistic. Using the first index would be
|
||||||
|
// better. The general expectation here is that the follower has no log
|
||||||
|
// at all (and will thus likely need a snapshot), though the app may
|
||||||
|
// have applied a snapshot out of band before adding the replica (thus
|
||||||
|
// making the first index the better choice).
|
||||||
|
.id = id,
|
||||||
|
.groupId = changer->tracker->pRaft->selfGroupId,
|
||||||
|
.nextIndex = changer->lastIndex,
|
||||||
|
.matchIndex = 0,
|
||||||
|
.state = PROGRESS_STATE_PROBE,
|
||||||
|
.pendingSnapshotIndex = 0,
|
||||||
|
.probeSent = false,
|
||||||
|
.inflights = syncRaftOpenInflights(changer->tracker->maxInflightMsgs),
|
||||||
|
.isLearner = isLearner,
|
||||||
|
// When a node is first added, we should mark it as recently active.
|
||||||
|
// Otherwise, CheckQuorum may cause us to step down if it is invoked
|
||||||
|
// before the added node has had a chance to communicate with us.
|
||||||
|
.recentActive = true,
|
||||||
|
.refCount = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
syncRaftAddToProgressMap(progressMap, pProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkInvariants makes sure that the config and progress are compatible with
|
||||||
|
// each other. This is used to check both what the Changer is initialized with,
|
||||||
|
// as well as what it returns.
|
||||||
|
static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
|
int ret = syncRaftCheckTrackerConfigInProgress(config, progressMap);
|
||||||
|
if (ret != 0) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Any staged learner was staged because it could not be directly added due
|
||||||
|
// to a conflicting voter in the outgoing config.
|
||||||
|
SyncNodeId* pNodeId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(&config->learnersNext, pNodeId)) {
|
||||||
|
SyncNodeId nodeId = *pNodeId;
|
||||||
|
if (!syncRaftJointConfigInOutgoing(&config->voters, nodeId)) {
|
||||||
|
syncError("[%d] is in LearnersNext, but not outgoing", nodeId);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId);
|
||||||
|
assert(progress);
|
||||||
|
assert(progress->id == nodeId);
|
||||||
|
if (progress->isLearner) {
|
||||||
|
syncError("[%d:%d] is in LearnersNext, but is already marked as learner", progress->groupId, nodeId);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Conversely Learners and Voters doesn't intersect at all.
|
||||||
|
pNodeId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(&config->learners, pNodeId)) {
|
||||||
|
SyncNodeId nodeId = *pNodeId;
|
||||||
|
if (syncRaftJointConfigInOutgoing(&config->voters, nodeId)) {
|
||||||
|
syncError("%d is in Learners and outgoing", nodeId);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId);
|
||||||
|
assert(progress);
|
||||||
|
assert(progress->id == nodeId);
|
||||||
|
|
||||||
|
if (!progress->isLearner) {
|
||||||
|
syncError("[%d:%d] is in Learners, but is not marked as learner", progress->groupId, nodeId);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!hasJointConfig(config)) {
|
||||||
|
// We enforce that empty maps are nil instead of zero.
|
||||||
|
if (syncRaftNodeMapSize(&config->learnersNext) > 0) {
|
||||||
|
syncError("cfg.LearnersNext must be nil when not joint");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (config->autoLeave) {
|
||||||
|
syncError("AutoLeave must be false when not joint");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkAndCopy copies the tracker's config and progress map (deeply enough for
|
||||||
|
// the purposes of the Changer) and returns those copies. It returns an error
|
||||||
|
// if checkInvariants does.
|
||||||
|
static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
|
syncRaftCopyTrackerConfig(&changer->tracker->config, config);
|
||||||
|
syncRaftClearProgressMap(progressMap);
|
||||||
|
|
||||||
|
SSyncRaftProgress* pProgress = NULL;
|
||||||
|
while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) {
|
||||||
|
syncRaftAddToProgressMap(progressMap, pProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
return checkAndReturn(config, progressMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkAndReturn calls checkInvariants on the input and returns either the
|
||||||
|
// resulting error or the input.
|
||||||
|
static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
|
if (checkInvariants(config, progressMap) != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) {
|
||||||
|
return !syncRaftJointConfigIsOutgoingEmpty(&config->voters);
|
||||||
|
}
|
||||||
|
|
||||||
|
// symdiff returns the count of the symmetric difference between the sets of
|
||||||
|
// uint64s, i.e. len( (l - r) \union (r - l)).
|
||||||
|
static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) {
|
||||||
|
int n;
|
||||||
|
int i;
|
||||||
|
int j0, j1;
|
||||||
|
const SSyncRaftNodeMap* pairs[2][2] = {
|
||||||
|
{l, r}, // count elems in l but not in r
|
||||||
|
{r, l}, // count elems in r but not in l
|
||||||
|
};
|
||||||
|
|
||||||
|
for (n = 0, i = 0; i < 2; ++i) {
|
||||||
|
const SSyncRaftNodeMap** pp = pairs[i];
|
||||||
|
|
||||||
|
const SSyncRaftNodeMap* p0 = pp[0];
|
||||||
|
const SSyncRaftNodeMap* p1 = pp[1];
|
||||||
|
SyncNodeId* pNodeId;
|
||||||
|
while (!syncRaftIterateNodeMap(p0, pNodeId)) {
|
||||||
|
if (!syncRaftIsInNodeMap(p1, *pNodeId)) {
|
||||||
|
n+=1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
|
||||||
|
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
||||||
|
syncRaftRemoveFromNodeMap(nodeMap, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// nilAwareAdd populates a map entry, creating the map if necessary.
|
||||||
|
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) {
|
||||||
|
syncRaftAddToNodeMap(nodeMap, id);
|
||||||
}
|
}
|
|
@ -17,15 +17,40 @@
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_log.h"
|
#include "raft_log.h"
|
||||||
#include "raft_message.h"
|
#include "raft_message.h"
|
||||||
|
#include "sync_raft_progress_tracker.h"
|
||||||
|
|
||||||
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
SyncTerm term;
|
if (pRaft->state == TAOS_SYNC_STATE_LEADER) {
|
||||||
|
syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!syncRaftIsPromotable(pRaft)) {
|
||||||
|
syncWarn("[%d:%d] is unpromotable and can not syncRaftCampaign", pRaft->selfGroupId, pRaft->selfId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if there is pending uncommitted config,cannot start election
|
||||||
|
if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) {
|
||||||
|
syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply",
|
||||||
|
pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
|
|
||||||
|
syncRaftCampaign(pRaft, cType);
|
||||||
|
}
|
||||||
|
|
||||||
|
// syncRaftCampaign transitions the raft instance to candidate state. This must only be
|
||||||
|
// called after verifying that this is a legitimate transition.
|
||||||
|
void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
bool preVote;
|
bool preVote;
|
||||||
ESyncRaftMessageType voteMsgType;
|
SyncTerm term;
|
||||||
|
|
||||||
if (syncRaftIsPromotable(pRaft)) {
|
if (syncRaftIsPromotable(pRaft)) {
|
||||||
syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId);
|
syncDebug("[%d:%d] is unpromotable; syncRaftCampaign() should have been called", pRaft->selfGroupId, pRaft->selfId);
|
||||||
return 0;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
||||||
|
@ -35,7 +60,6 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
term = pRaft->term + 1;
|
term = pRaft->term + 1;
|
||||||
} else {
|
} else {
|
||||||
syncRaftBecomeCandidate(pRaft);
|
syncRaftBecomeCandidate(pRaft);
|
||||||
voteMsgType = RAFT_MSG_VOTE;
|
|
||||||
term = pRaft->term;
|
term = pRaft->term;
|
||||||
preVote = false;
|
preVote = false;
|
||||||
}
|
}
|
||||||
|
@ -43,10 +67,8 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
int quorum = syncRaftQuorum(pRaft);
|
int quorum = syncRaftQuorum(pRaft);
|
||||||
ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
|
ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL);
|
||||||
if (result == SYNC_RAFT_VOTE_WON) {
|
if (result == SYNC_RAFT_VOTE_WON) {
|
||||||
/**
|
// We won the election after voting for ourselves (which must mean that
|
||||||
* We won the election after voting for ourselves (which must mean that
|
// this is a single-node cluster). Advance to the next state.
|
||||||
* this is a single-node cluster). Advance to the next state.
|
|
||||||
**/
|
|
||||||
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) {
|
||||||
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION);
|
||||||
} else {
|
} else {
|
||||||
|
@ -59,12 +81,23 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
int i;
|
int i;
|
||||||
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||||
SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log);
|
SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log);
|
||||||
for (i = 0; i < pRaft->cluster.replica; ++i) {
|
SSyncRaftNodeMap nodeMap;
|
||||||
if (i == pRaft->cluster.selfIndex) {
|
syncRaftJointConfigIDs(&pRaft->tracker->config.voters, &nodeMap);
|
||||||
|
SyncNodeId *pNodeId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(&nodeMap, pNodeId)) {
|
||||||
|
SyncNodeId nodeId = *pNodeId;
|
||||||
|
if (nodeId == SYNC_NON_NODE_ID) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId;
|
if (nodeId == pRaft->selfId) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, nodeId);
|
||||||
|
if (pNode == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId,
|
SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId,
|
||||||
term, cType, lastIndex, lastTerm);
|
term, cType, lastIndex, lastTerm);
|
||||||
|
@ -72,10 +105,10 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent %d request to %d at term %" PRId64 "",
|
syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent vote request to %d at term %" PRId64 "",
|
||||||
pRaft->selfGroupId, pRaft->selfId, lastTerm,
|
pRaft->selfGroupId, pRaft->selfId, lastTerm,
|
||||||
lastIndex, voteMsgType, nodeId, pRaft->term);
|
lastIndex, nodeId, pRaft->term);
|
||||||
|
|
||||||
pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i]));
|
pRaft->io.send(pMsg, pNode);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -14,7 +14,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "raft.h"
|
#include "raft.h"
|
||||||
#include "raft_configuration.h"
|
#include "sync_raft_impl.h"
|
||||||
#include "raft_log.h"
|
#include "raft_log.h"
|
||||||
#include "raft_replication.h"
|
#include "raft_replication.h"
|
||||||
#include "sync_raft_progress_tracker.h"
|
#include "sync_raft_progress_tracker.h"
|
||||||
|
@ -25,6 +25,8 @@ static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg);
|
||||||
|
|
||||||
|
static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n);
|
||||||
|
|
||||||
static int triggerAll(SSyncRaft* pRaft);
|
static int triggerAll(SSyncRaft* pRaft);
|
||||||
|
|
||||||
static void tickElection(SSyncRaft* pRaft);
|
static void tickElection(SSyncRaft* pRaft);
|
||||||
|
@ -82,13 +84,22 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) {
|
||||||
resetRaft(pRaft, pRaft->term);
|
resetRaft(pRaft, pRaft->term);
|
||||||
pRaft->leaderId = pRaft->leaderId;
|
pRaft->leaderId = pRaft->leaderId;
|
||||||
pRaft->state = TAOS_SYNC_STATE_LEADER;
|
pRaft->state = TAOS_SYNC_STATE_LEADER;
|
||||||
// TODO: check if there is pending config log
|
|
||||||
int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log);
|
|
||||||
if (nPendingConf > 1) {
|
|
||||||
syncFatal("unexpected multiple uncommitted config entry");
|
|
||||||
}
|
|
||||||
|
|
||||||
syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId);
|
||||||
|
assert(progress != NULL);
|
||||||
|
// Followers enter replicate mode when they've been successfully probed
|
||||||
|
// (perhaps after having received a snapshot as a result). The leader is
|
||||||
|
// trivially in this state. Note that r.reset() has initialized this
|
||||||
|
// progress with the last index already.
|
||||||
|
syncRaftProgressBecomeReplicate(progress);
|
||||||
|
|
||||||
|
// Conservatively set the pendingConfIndex to the last index in the
|
||||||
|
// log. There may or may not be a pending config change, but it's
|
||||||
|
// safe to delay any future proposals until we commit all our
|
||||||
|
// pending log entries, and scanning the entire tail of the log
|
||||||
|
// could be expensive.
|
||||||
|
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||||
|
pRaft->pendingConfigIndex = lastIndex;
|
||||||
|
|
||||||
// after become leader, send a no-op log
|
// after become leader, send a no-op log
|
||||||
SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry));
|
SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry));
|
||||||
|
@ -103,6 +114,7 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) {
|
||||||
};
|
};
|
||||||
appendEntries(pRaft, entry, 1);
|
appendEntries(pRaft, entry, 1);
|
||||||
//syncRaftTriggerHeartbeat(pRaft);
|
//syncRaftTriggerHeartbeat(pRaft);
|
||||||
|
syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term);
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) {
|
void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) {
|
||||||
|
@ -123,15 +135,16 @@ bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int syncRaftQuorum(SSyncRaft* pRaft) {
|
int syncRaftQuorum(SSyncRaft* pRaft) {
|
||||||
return pRaft->cluster.replica / 2 + 1;
|
return 0;
|
||||||
|
//return pRaft->cluster.replica / 2 + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||||
bool preVote, bool grant,
|
bool preVote, bool grant,
|
||||||
int* rejected, int *granted) {
|
int* rejected, int *granted) {
|
||||||
int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
SNodeInfo* pNode = syncRaftGetNodeById(pRaft, id);
|
||||||
if (voterIndex == -1) {
|
if (pNode == NULL) {
|
||||||
return SYNC_RAFT_VOTE_PENDING;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (grant) {
|
if (grant) {
|
||||||
|
@ -142,7 +155,7 @@ ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||||
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
|
||||||
}
|
}
|
||||||
|
|
||||||
syncRaftRecordVote(pRaft->tracker, voterIndex, grant);
|
syncRaftRecordVote(pRaft->tracker, pNode->nodeId, grant);
|
||||||
return syncRaftTallyVotes(pRaft->tracker, rejected, granted);
|
return syncRaftTallyVotes(pRaft->tracker, rejected, granted);
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
@ -154,7 +167,7 @@ ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
|
||||||
pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
|
pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
|
||||||
}
|
}
|
||||||
|
|
||||||
int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
|
int voteIndex = syncRaftGetNodeById(pRaft, id);
|
||||||
assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
|
assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
|
||||||
assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
|
assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
|
||||||
|
|
||||||
|
@ -185,19 +198,30 @@ void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState) {
|
||||||
pRaft->voteFor = serverState->voteFor;
|
pRaft->voteFor = serverState->voteFor;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Progress visitor used by syncRaftBroadcastAppend. `arg` carries the
// SSyncRaft instance. The progress entry whose id equals this node's own id
// is skipped; for every other entry syncRaftMaybeSendAppend is called with
// its last argument set to true.
static void visitProgressSendAppend(SSyncRaftProgress* progress, void* arg) {
  SSyncRaft* self = (SSyncRaft*)arg;

  if (progress->id != self->selfId) {
    syncRaftMaybeSendAppend(self, progress, true);
  }
}
|
||||||
|
|
||||||
|
// bcastAppend sends RPC, with entries to all peers that are not up-to-date
|
||||||
|
// according to the progress recorded in r.prs.
|
||||||
void syncRaftBroadcastAppend(SSyncRaft* pRaft) {
|
void syncRaftBroadcastAppend(SSyncRaft* pRaft) {
|
||||||
syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft);
|
syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// syncRaftGetNodeById looks `id` up in pRaft->nodeInfoMap and returns the
// SNodeInfo pointer stored for it, or NULL when the lookup finds nothing.
//
// NOTE(review): the key length passed is sizeof(SyncNodeId*) (pointer size)
// while the key itself is a SyncNodeId value; confirm it matches the length
// used where entries are inserted into nodeInfoMap.
SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id) {
  SNodeInfo** ppFound = taosHashGet(pRaft->nodeInfoMap, &id, sizeof(SyncNodeId*));
  return (ppFound == NULL) ? NULL : *ppFound;
}
|
||||||
|
|
||||||
// convertClear is still an empty placeholder: it does not touch pRaft.
//
// Returns 0, the success value the other int-returning handlers in this file
// use. The function previously fell off its end without returning a value,
// which is undefined behavior as soon as a caller reads the result.
static int convertClear(SSyncRaft* pRaft) {
  (void)pRaft;  // nothing to clear yet
  return 0;
}
|
||||||
|
@ -223,7 +247,7 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
syncRaftHandleVoteRespMessage(pRaft, pMsg);
|
syncRaftHandleVoteRespMessage(pRaft, pMsg);
|
||||||
return 0;
|
return 0;
|
||||||
} else if (msgType == RAFT_MSG_APPEND) {
|
} else if (msgType == RAFT_MSG_APPEND) {
|
||||||
syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from);
|
syncRaftBecomeFollower(pRaft, pMsg->term, pMsg->from);
|
||||||
syncRaftHandleAppendEntriesMessage(pRaft, pMsg);
|
syncRaftHandleAppendEntriesMessage(pRaft, pMsg);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -234,9 +258,7 @@ static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// tickElection is run by followers and candidates after r.electionTimeout.
|
||||||
* tickElection is run by followers and candidates per tick.
|
|
||||||
**/
|
|
||||||
static void tickElection(SSyncRaft* pRaft) {
|
static void tickElection(SSyncRaft* pRaft) {
|
||||||
pRaft->electionElapsed += 1;
|
pRaft->electionElapsed += 1;
|
||||||
|
|
||||||
|
@ -254,10 +276,16 @@ static void tickElection(SSyncRaft* pRaft) {
|
||||||
syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId));
|
syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tickHeartbeat is run by leaders to send a MsgBeat after r.heartbeatTimeout.
|
||||||
static void tickHeartbeat(SSyncRaft* pRaft) {
|
static void tickHeartbeat(SSyncRaft* pRaft) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO
|
||||||
|
static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
||||||
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log);
|
||||||
SyncTerm term = pRaft->term;
|
SyncTerm term = pRaft->term;
|
||||||
|
@ -268,9 +296,16 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
|
||||||
entries[i].index = lastIndex + 1 + i;
|
entries[i].index = lastIndex + 1 + i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track the size of this uncommitted proposal.
|
||||||
|
if (!increaseUncommittedSize(pRaft, entries, n)) {
|
||||||
|
// Drop the proposal.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
syncRaftLogAppend(pRaft->log, entries, n);
|
syncRaftLogAppend(pRaft->log, entries, n);
|
||||||
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->cluster.selfIndex]);
|
SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId);
|
||||||
|
assert(progress != NULL);
|
||||||
syncRaftProgressMaybeUpdate(progress, lastIndex);
|
syncRaftProgressMaybeUpdate(progress, lastIndex);
|
||||||
// Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend.
|
// Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend.
|
||||||
syncRaftMaybeCommit(pRaft);
|
syncRaftMaybeCommit(pRaft);
|
||||||
|
@ -297,7 +332,7 @@ static int triggerAll(SSyncRaft* pRaft) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
syncRaftReplicate(pRaft, pRaft->tracker->progressMap.progress[i], true);
|
syncRaftMaybeSendAppend(pRaft, pRaft->tracker->progressMap.progress[i], true);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -307,8 +342,8 @@ static void abortLeaderTransfer(SSyncRaft* pRaft) {
|
||||||
pRaft->leadTransferee = SYNC_NON_NODE_ID;
|
pRaft->leadTransferee = SYNC_NON_NODE_ID;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Progress visitor that resets a single progress entry through
// syncRaftResetProgress. `arg` is the SSyncRaft instance the entry belongs to.
static void resetProgress(SSyncRaftProgress* progress, void* arg) {
  SSyncRaft* pRaft = (SSyncRaft*)arg;
  syncRaftResetProgress(pRaft, progress);
}
|
||||||
|
|
||||||
static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
|
static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
|
||||||
|
@ -327,7 +362,7 @@ static void resetRaft(SSyncRaft* pRaft, SyncTerm term) {
|
||||||
abortLeaderTransfer(pRaft);
|
abortLeaderTransfer(pRaft);
|
||||||
|
|
||||||
syncRaftResetVotes(pRaft->tracker);
|
syncRaftResetVotes(pRaft->tracker);
|
||||||
syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft);
|
syncRaftProgressVisit(pRaft->tracker, resetProgress, pRaft);
|
||||||
|
|
||||||
pRaft->pendingConfigIndex = 0;
|
pRaft->pendingConfigIndex = 0;
|
||||||
pRaft->uncommittedSize = 0;
|
pRaft->uncommittedSize = 0;
|
||||||
|
|
|
@ -40,19 +40,16 @@ void syncRaftCloseInflights(SSyncRaftInflights* inflights) {
|
||||||
free(inflights);
|
free(inflights);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Add notifies the Inflights that a new message with the given index is being
|
||||||
* syncRaftInflightAdd notifies the Inflights that a new message with the given index is being
|
// dispatched. Full() must be called prior to Add() to verify that there is room
|
||||||
* dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd()
|
// for one more message, and consecutive calls to add Add() must provide a
|
||||||
* to verify that there is room for one more message,
|
// monotonic sequence of indexes.
|
||||||
* and consecutive calls to add syncRaftInflightAdd() must provide a
|
|
||||||
* monotonic sequence of indexes.
|
|
||||||
**/
|
|
||||||
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) {
|
void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) {
|
||||||
assert(!syncRaftInflightFull(inflights));
|
assert(!syncRaftInflightFull(inflights));
|
||||||
|
|
||||||
int next = inflights->start + inflights->count;
|
int next = inflights->start + inflights->count;
|
||||||
int size = inflights->size;
|
int size = inflights->size;
|
||||||
/* is next wrapped around buffer? */
|
|
||||||
if (next >= size) {
|
if (next >= size) {
|
||||||
next -= size;
|
next -= size;
|
||||||
}
|
}
|
||||||
|
@ -61,12 +58,10 @@ void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex)
|
||||||
inflights->count++;
|
inflights->count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// FreeLE frees the inflights smaller or equal to the given `to` flight.
|
||||||
* syncRaftInflightFreeLE frees the inflights smaller or equal to the given `to` flight.
|
|
||||||
**/
|
|
||||||
void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) {
|
void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) {
|
||||||
if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) {
|
if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) {
|
||||||
/* out of the left side of the window */
|
// out of the left side of the window
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -95,10 +90,8 @@ void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// FreeFirstOne releases the first inflight. This is a no-op if nothing is
|
||||||
* syncRaftInflightFreeFirstOne releases the first inflight.
|
// inflight.
|
||||||
* This is a no-op if nothing is inflight.
|
|
||||||
**/
|
|
||||||
// syncRaftInflightFreeFirstOne releases the first inflight entry: it reads the
// index stored at the start of the window and frees everything up to and
// including it via syncRaftInflightFreeLE (which returns immediately when
// nothing is inflight).
void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) {
  SyncIndex firstIndex = inflights->buffer[inflights->start];
  syncRaftInflightFreeLE(inflights, firstIndex);
}
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <cli@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "sync_raft_node_map.h"
|
||||||
|
#include "sync_type.h"
|
||||||
|
#include "sync_raft_progress.h"
|
||||||
|
|
||||||
|
void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap) {
|
||||||
|
nodeMap->nodeIdMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap) {
|
||||||
|
taosHashCleanup(nodeMap->nodeIdMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap) {
|
||||||
|
taosHashClear(nodeMap->nodeIdMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
|
||||||
|
SyncNodeId** ppId = (SyncNodeId**)taosHashGet(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*));
|
||||||
|
if (ppId == NULL) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to) {
|
||||||
|
SyncNodeId *pId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(from, pId)) {
|
||||||
|
taosHashPut(to->nodeIdMap, &pId, sizeof(SyncNodeId*), &pId, sizeof(SyncNodeId*));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId) {
|
||||||
|
SyncNodeId **ppId = taosHashIterate(nodeMap->nodeIdMap, pId);
|
||||||
|
if (ppId == NULL) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
*pId = *(*ppId);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap) {
|
||||||
|
SyncNodeId *pId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(nodeMap, pId)) {
|
||||||
|
if (!syncRaftIsInProgressMap(progressMap, *pId)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) {
|
||||||
|
syncRaftCopyNodeMap(nodeMap, to);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
|
||||||
|
taosHashPut(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*), &nodeId, sizeof(SyncNodeId*));
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
|
||||||
|
taosHashRemove(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*));
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap) {
|
||||||
|
return taosHashGetSize(nodeMap->nodeIdMap);
|
||||||
|
}
|
|
@ -20,18 +20,26 @@
|
||||||
#include "sync.h"
|
#include "sync.h"
|
||||||
#include "syncInt.h"
|
#include "syncInt.h"
|
||||||
|
|
||||||
|
static void copyProgress(SSyncRaftProgress* progress, void* arg);
|
||||||
|
|
||||||
|
static void refProgress(SSyncRaftProgress* progress);
|
||||||
|
static void unrefProgress(SSyncRaftProgress* progress, void*);
|
||||||
|
|
||||||
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state);
|
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state);
|
||||||
static void probeAcked(SSyncRaftProgress* progress);
|
static void probeAcked(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
static void resumeProgress(SSyncRaftProgress* progress);
|
static void resumeProgress(SSyncRaftProgress* progress);
|
||||||
|
|
||||||
void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
|
||||||
|
if (progress->inflights) {
|
||||||
|
syncRaftCloseInflights(progress->inflights);
|
||||||
|
}
|
||||||
SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs);
|
SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs);
|
||||||
if (inflights == NULL) {
|
if (inflights == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
*progress = (SSyncRaftProgress) {
|
*progress = (SSyncRaftProgress) {
|
||||||
.matchIndex = i == pRaft->selfIndex ? syncRaftLogLastIndex(pRaft->log) : 0,
|
.matchIndex = progress->id == pRaft->selfId ? syncRaftLogLastIndex(pRaft->log) : 0,
|
||||||
.nextIndex = syncRaftLogLastIndex(pRaft->log) + 1,
|
.nextIndex = syncRaftLogLastIndex(pRaft->log) + 1,
|
||||||
.inflights = inflights,
|
.inflights = inflights,
|
||||||
.isLearner = false,
|
.isLearner = false,
|
||||||
|
@ -39,11 +47,9 @@ void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
||||||
* syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
// index acked by it. The method returns false if the given n index comes from
|
||||||
* index acked by it. The method returns false if the given n index comes from
|
// an outdated message. Otherwise it updates the progress and returns true.
|
||||||
* an outdated message. Otherwise it updates the progress and returns true.
|
|
||||||
**/
|
|
||||||
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) {
|
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) {
|
||||||
bool updated = false;
|
bool updated = false;
|
||||||
|
|
||||||
|
@ -58,27 +64,36 @@ bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastInde
|
||||||
return updated;
|
return updated;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
|
||||||
|
// arguments are the index of the append message rejected by the follower, and
|
||||||
|
// the hint that we want to decrease to.
|
||||||
|
//
|
||||||
|
// Rejections can happen spuriously as messages are sent out of order or
|
||||||
|
// duplicated. In such cases, the rejection pertains to an index that the
|
||||||
|
// Progress already knows were previously acknowledged, and false is returned
|
||||||
|
// without changing the Progress.
|
||||||
|
//
|
||||||
|
// If the rejection is genuine, Next is lowered sensibly, and the Progress is
|
||||||
|
// cleared for sending log entries.
|
||||||
bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
||||||
SyncIndex rejected, SyncIndex matchHint) {
|
SyncIndex rejected, SyncIndex matchHint) {
|
||||||
if (progress->state == PROGRESS_STATE_REPLICATE) {
|
if (progress->state == PROGRESS_STATE_REPLICATE) {
|
||||||
/**
|
// The rejection must be stale if the progress has matched and "rejected"
|
||||||
* the rejection must be stale if the progress has matched and "rejected"
|
// is smaller than "match".
|
||||||
* is smaller than "match".
|
|
||||||
**/
|
|
||||||
if (rejected <= progress->matchIndex) {
|
if (rejected <= progress->matchIndex) {
|
||||||
syncDebug("match index is up to date,ignore");
|
syncDebug("match index is up to date,ignore");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* directly decrease next to match + 1 */
|
// Directly decrease next to match + 1.
|
||||||
|
//
|
||||||
|
// TODO(tbg): why not use matchHint if it's larger?
|
||||||
progress->nextIndex = progress->matchIndex + 1;
|
progress->nextIndex = progress->matchIndex + 1;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// The rejection must be stale if "rejected" does not match next - 1. This
|
||||||
* The rejection must be stale if "rejected" does not match next - 1. This
|
// is because non-replicating followers are probed one entry at a time.
|
||||||
* is because non-replicating followers are probed one entry at a time.
|
|
||||||
**/
|
|
||||||
if (rejected != progress->nextIndex - 1) {
|
if (rejected != progress->nextIndex - 1) {
|
||||||
syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore"
|
syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore"
|
||||||
, rejected, progress->nextIndex);
|
, rejected, progress->nextIndex);
|
||||||
|
@ -91,14 +106,12 @@ bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// IsPaused returns whether sending log entries to this node has been throttled.
|
||||||
* syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled.
|
// This is done when a node has rejected recent MsgApps, is currently waiting
|
||||||
* This is done when a node has rejected recent MsgApps, is currently waiting
|
// for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
||||||
* for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
// operation, this is false. A throttled node will be contacted less frequently
|
||||||
* operation, this is false. A throttled node will be contacted less frequently
|
// until it has reached a state in which it's able to accept a steady stream of
|
||||||
* until it has reached a state in which it's able to accept a steady stream of
|
// log entries again.
|
||||||
* log entries again.
|
|
||||||
**/
|
|
||||||
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) {
|
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) {
|
||||||
switch (progress->state) {
|
switch (progress->state) {
|
||||||
case PROGRESS_STATE_PROBE:
|
case PROGRESS_STATE_PROBE:
|
||||||
|
@ -112,58 +125,44 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
int i;
|
SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*));
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
if (ppProgress == NULL) {
|
||||||
if (progressMap->progress[i].id == id) {
|
return NULL;
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return -1;
|
|
||||||
|
return *ppProgress;
|
||||||
}
|
}
|
||||||
|
|
||||||
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress) {
|
||||||
int i, j;
|
refProgress(progress);
|
||||||
|
taosHashPut(progressMap->progressMap, &progress->id, sizeof(SyncNodeId*), &progress, sizeof(SSyncRaftProgress*));
|
||||||
for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (progressMap->progress[i].id == id) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
if (j == -1 && progressMap->progress[i].id == SYNC_NON_NODE_ID) {
|
|
||||||
j = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(j != -1);
|
|
||||||
|
|
||||||
progressMap->progress[i].id = id;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
int i;
|
SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*));
|
||||||
|
if (ppProgress == NULL) {
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
return;
|
||||||
if (progressMap->progress[i].id == id) {
|
|
||||||
progressMap->progress[i].id = SYNC_NON_NODE_ID;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
unrefProgress(*ppProgress, NULL);
|
||||||
|
|
||||||
|
taosHashRemove(progressMap->progressMap, &id, sizeof(SyncNodeId*));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) {
|
||||||
|
return taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)) != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns true when progress->nextIndex is exactly one past the last index of
// this node's log.
bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) {
  return progress->nextIndex == syncRaftLogLastIndex(pRaft->log) + 1;
}
|
||||||
|
|
||||||
/**
|
// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
||||||
* syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
// optionally and if larger, the index of the pending snapshot.
|
||||||
* optionally and if larger, the index of the pending snapshot.
|
|
||||||
**/
|
|
||||||
void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) {
|
void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) {
|
||||||
/**
|
// If the original state is StateSnapshot, progress knows that
|
||||||
* If the original state is ProgressStateSnapshot, progress knows that
|
// the pending snapshot has been sent to this peer successfully, then
|
||||||
* the pending snapshot has been sent to this peer successfully, then
|
// probes from pendingSnapshot + 1.
|
||||||
* probes from pendingSnapshot + 1.
|
|
||||||
**/
|
|
||||||
if (progress->state == PROGRESS_STATE_SNAPSHOT) {
|
if (progress->state == PROGRESS_STATE_SNAPSHOT) {
|
||||||
SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex;
|
SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex;
|
||||||
resetProgressState(progress, PROGRESS_STATE_PROBE);
|
resetProgressState(progress, PROGRESS_STATE_PROBE);
|
||||||
|
@ -174,27 +173,78 @@ void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
|
||||||
* syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
|
|
||||||
**/
|
|
||||||
// syncRaftProgressBecomeReplicate switches the progress to
// PROGRESS_STATE_REPLICATE (through resetProgressState) and then sets
// nextIndex to matchIndex + 1.
void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress) {
  resetProgressState(progress, PROGRESS_STATE_REPLICATE);

  // resume sending right after the highest index known to match
  progress->nextIndex = 1 + progress->matchIndex;
}
|
||||||
|
|
||||||
|
// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending
|
||||||
|
// snapshot index.
|
||||||
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) {
|
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) {
|
||||||
resetProgressState(progress, PROGRESS_STATE_SNAPSHOT);
|
resetProgressState(progress, PROGRESS_STATE_SNAPSHOT);
|
||||||
progress->pendingSnapshotIndex = snapshotIndex;
|
progress->pendingSnapshotIndex = snapshotIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
// syncRaftCopyProgress copies *progress into *out byte for byte.
//
// NOTE(review): this is a shallow copy -- pointer members such as `inflights`
// end up shared between both structs and `refCount` is copied as well;
// confirm that callers expect this.
void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* out) {
  memcpy(out, progress, sizeof(*out));
}
|
||||||
|
|
||||||
/**
|
// syncRaftInitProgressMap creates the hash table behind the progress map,
// with an initial capacity of TSDB_MAX_REPLICA and the default hash function
// for TSDB_DATA_TYPE_INT keys.
// NOTE(review): the result of taosHashInit is not checked.
void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap) {
  progressMap->progressMap = taosHashInit(TSDB_MAX_REPLICA,
                                          taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT),
                                          true,
                                          HASH_ENTRY_LOCK);
}
|
||||||
**/
|
|
||||||
|
// syncRaftFreeProgressMap runs unrefProgress over the map through
// syncRaftVisitProgressMap and then destroys the hash table.
void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap) {
  // drop the references first, while the entries are still reachable
  syncRaftVisitProgressMap(progressMap, unrefProgress, NULL);

  taosHashCleanup(progressMap->progressMap);
}
|
||||||
|
|
||||||
|
// syncRaftClearProgressMap empties the hash table behind the progress map.
//
// NOTE(review): unlike syncRaftFreeProgressMap and
// syncRaftRemoveFromProgressMap, no unrefProgress is applied to the stored
// progress objects here -- looks like the references taken by
// syncRaftAddToProgressMap are leaked; confirm against the callers.
void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap) {
  taosHashClear(progressMap->progressMap);
}
|
||||||
|
|
||||||
|
// syncRaftCopyProgressMap visits `from` with copyProgress, which adds each
// visited progress to `to` through syncRaftAddToProgressMap.
void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to) {
  syncRaftVisitProgressMap(from, copyProgress, (void*)to);
}
|
||||||
|
|
||||||
|
// syncRaftIterateProgressMap asks the hash table for the entry following
// `pProgress`. It returns true when there is none; otherwise it copies the
// found progress into *pProgress and returns false.
//
// NOTE(review): the cursor is passed by value, so a caller starting with a
// NULL cursor makes this write through NULL, and the struct copy overwrites
// whatever the cursor points at. This looks unusable as an iterator without
// a signature change -- confirm against the callers.
bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress) {
  SSyncRaftProgress** ppNext = taosHashIterate(progressMap->progressMap, pProgress);
  if (ppNext != NULL) {
    *pProgress = *(*ppNext);
    return false;
  }

  return true;
}
|
||||||
|
|
||||||
|
// syncRaftVisitProgressMap calls fp(progress, arg) once for every progress
// stored in the map, passing the stored object itself rather than a copy.
// Visitors such as unrefProgress and copyProgress free or re-register the
// object they receive, so they need the real one.
//
// Returns true once every entry has been visited. The function previously
// had no return statement, and it started the walk from an uninitialized
// cursor handed to syncRaftIterateProgressMap; the table is now iterated
// directly.
bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg) {
  SSyncRaftProgress** ppProgress = taosHashIterate(progressMap->progressMap, NULL);
  while (ppProgress != NULL) {
    fp(*ppProgress, arg);
    ppProgress = taosHashIterate(progressMap->progressMap, ppProgress);
  }

  return true;
}
|
||||||
|
|
||||||
|
// Visitor used by syncRaftCopyProgressMap: `arg` is the destination map and
// the visited progress is added to it. The progress must already be
// referenced (refCount > 0).
static void copyProgress(SSyncRaftProgress* progress, void* arg) {
  SSyncRaftProgressMap* dst = (SSyncRaftProgressMap*)arg;

  assert(progress->refCount > 0);
  syncRaftAddToProgressMap(dst, progress);
}
|
||||||
|
|
||||||
|
// Takes one more reference on the progress.
static void refProgress(SSyncRaftProgress* progress) {
  progress->refCount++;
}
|
||||||
|
|
||||||
|
// Drops one reference on the progress and frees it when the count reaches
// zero. `arg` is unused; it only exists so the function fits the visitor
// signature.
//
// NOTE(review): only the progress struct itself is freed here; whether
// progress->inflights is released elsewhere is not visible from this function.
static void unrefProgress(SSyncRaftProgress* progress, void* arg) {
  (void)arg;

  --progress->refCount;
  assert(progress->refCount >= 0);

  if (progress->refCount != 0) {
    return;
  }
  free(progress);
}
|
||||||
|
|
||||||
|
// ResetState moves the Progress into the specified State, resetting ProbeSent,
|
||||||
|
// PendingSnapshot, and Inflights.
|
||||||
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) {
|
static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) {
|
||||||
progress->probeSent = false;
|
progress->probeSent = false;
|
||||||
progress->pendingSnapshotIndex = 0;
|
progress->pendingSnapshotIndex = 0;
|
||||||
|
@ -202,83 +252,9 @@ static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressSta
|
||||||
syncRaftInflightReset(progress->inflights);
|
syncRaftInflightReset(progress->inflights);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// ProbeAcked is called when this peer has accepted an append. It resets
|
||||||
* probeAcked is called when this peer has accepted an append. It resets
|
// ProbeSent to signal that additional append messages should be sent without
|
||||||
* ProbeSent to signal that additional append messages should be sent without
|
// further delay.
|
||||||
* further delay.
|
|
||||||
**/
|
|
||||||
// probeAcked clears the probeSent flag of the progress, so that a pending
// probe no longer holds back further append messages.
static void probeAcked(SSyncRaftProgress* progress) {
  progress->probeSent = false;  // the outstanding probe has been answered
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
|
|
||||||
SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i) {
|
|
||||||
return pRaft->leaderState.progress[i].nextIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i) {
|
|
||||||
return pRaft->leaderState.progress[i].matchIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i) {
|
|
||||||
pRaft->leaderState.progress[i].lastSend = pRaft->io.time(pRaft);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i) {
|
|
||||||
pRaft->leaderState.progress[i].lastSendSnapshot = pRaft->io.time(pRaft);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i) {
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
|
||||||
bool prev = progress->recentRecv;
|
|
||||||
progress->recentRecv = false;
|
|
||||||
return prev;
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i) {
|
|
||||||
pRaft->leaderState.progress[i].recentRecv = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i) {
|
|
||||||
return pRaft->leaderState.progress[i].recentRecv;
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i) {
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
|
||||||
resetProgressState(progress, PROGRESS_STATE_SNAPSHOT);
|
|
||||||
progress->pendingSnapshotIndex = raftLogSnapshotIndex(pRaft->log);
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) {
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
|
||||||
|
|
||||||
if (progress->state == PROGRESS_STATE_SNAPSHOT) {
|
|
||||||
assert(progress->pendingSnapshotIndex > 0);
|
|
||||||
SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex;
|
|
||||||
resetProgressState(progress, PROGRESS_STATE_PROBE);
|
|
||||||
progress->nextIndex = max(progress->matchIndex + 1, pendingSnapshotIndex);
|
|
||||||
} else {
|
|
||||||
resetProgressState(progress, PROGRESS_STATE_PROBE);
|
|
||||||
progress->nextIndex = progress->matchIndex + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) {
|
|
||||||
resetProgressState(pRaft->leaderState.progress, PROGRESS_STATE_REPLICATE);
|
|
||||||
pRaft->leaderState.progress->nextIndex = pRaft->leaderState.progress->matchIndex + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) {
|
|
||||||
SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]);
|
|
||||||
progress->pendingSnapshotIndex = 0;
|
|
||||||
progress->state = PROGRESS_STATE_PROBE;
|
|
||||||
}
|
|
||||||
|
|
||||||
ESyncRaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) {
|
|
||||||
return pRaft->leaderState.progress[i].state;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -13,62 +13,99 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "raft.h"
|
||||||
|
#include "sync_const.h"
|
||||||
#include "sync_raft_progress_tracker.h"
|
#include "sync_raft_progress_tracker.h"
|
||||||
#include "sync_raft_proto.h"
|
#include "sync_raft_proto.h"
|
||||||
|
|
||||||
SSyncRaftProgressTracker* syncRaftOpenProgressTracker() {
|
SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft) {
|
||||||
SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker));
|
SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker));
|
||||||
if (tracker == NULL) {
|
if (tracker == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tracker->votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK);
|
||||||
|
|
||||||
|
syncRaftInitTrackConfig(&tracker->config);
|
||||||
|
tracker->pRaft = pRaft;
|
||||||
|
tracker->maxInflightMsgs = kSyncRaftMaxInflghtMsgs;
|
||||||
|
|
||||||
return tracker;
|
return tracker;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config) {
|
||||||
|
syncRaftInitNodeMap(&config->learners);
|
||||||
|
syncRaftInitNodeMap(&config->learnersNext);
|
||||||
|
syncRaftInitQuorumJointConfig(&config->voters);
|
||||||
|
config->autoLeave = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config) {
|
||||||
|
syncRaftFreeNodeMap(&config->learners);
|
||||||
|
syncRaftFreeNodeMap(&config->learnersNext);
|
||||||
|
syncRaftFreeNodeMap(&config->voters.incoming);
|
||||||
|
syncRaftFreeNodeMap(&config->voters.outgoing);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResetVotes prepares for a new round of vote counting via recordVote.
|
||||||
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
|
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) {
|
||||||
memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(ESyncRaftVoteType) * TSDB_MAX_REPLICA);
|
taosHashClear(tracker->votesMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
|
void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) {
|
||||||
int i;
|
syncRaftVisitProgressMap(&tracker->progressMap, visit, arg);
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
SSyncRaftProgress* progress = &(tracker->progressMap.progress[i]);
|
|
||||||
visit(i, progress, arg);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) {
|
// RecordVote records that the node with the given id voted for this Raft
|
||||||
if (tracker->votes[i] != SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
// instance if v == true (and declined it otherwise).
|
||||||
|
void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant) {
|
||||||
|
ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &id, sizeof(SyncNodeId*));
|
||||||
|
if (pType != NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT;
|
taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &grant, sizeof(bool*));
|
||||||
}
|
}
|
||||||
|
|
||||||
// syncRaftCopyTrackerConfig overwrites *to with a member-wise copy of *from.
//
// NOTE(review): this is a shallow copy. If the node maps inside the config
// own heap state, both configs will refer to the same state afterwards -
// confirm that this is intended.
void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) {
  *to = *from;
}
|
||||||
|
|
||||||
/**
|
int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
* syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
|
// NB: intentionally allow the empty config. In production we'll never see a
|
||||||
* election outcome is known.
|
// non-empty config (we prevent it from being created) but we will need to
|
||||||
**/
|
// be able to *create* an initial config, for example during bootstrap (or
|
||||||
|
// during tests). Instead of having to hand-code this, we allow
|
||||||
|
// transitioning from an empty config into any other legal and non-empty
|
||||||
|
// config.
|
||||||
|
if (!syncRaftIsAllNodeInProgressMap(&config->voters.incoming, progressMap)) return -1;
|
||||||
|
if (!syncRaftIsAllNodeInProgressMap(&config->voters.outgoing, progressMap)) return -1;
|
||||||
|
if (!syncRaftIsAllNodeInProgressMap(&config->learners, progressMap)) return -1;
|
||||||
|
if (!syncRaftIsAllNodeInProgressMap(&config->learnersNext, progressMap)) return -1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TallyVotes returns the number of granted and rejected Votes, and whether the
|
||||||
|
// election outcome is known.
|
||||||
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
|
ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) {
|
||||||
int i;
|
SSyncRaftProgress* progress = NULL;
|
||||||
SSyncRaftProgress* progress;
|
|
||||||
int r, g;
|
int r, g;
|
||||||
|
|
||||||
for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) {
|
// Make sure to populate granted/rejected correctly even if the Votes slice
|
||||||
progress = &(tracker->progressMap.progress[i]);
|
// contains members no longer part of the configuration. This doesn't really
|
||||||
|
// matter in the way the numbers are used (they're informational), but might
|
||||||
|
// as well get it right.
|
||||||
|
while (!syncRaftIterateProgressMap(&tracker->progressMap, progress)) {
|
||||||
if (progress->id == SYNC_NON_NODE_ID) {
|
if (progress->id == SYNC_NON_NODE_ID) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
bool* v = taosHashGet(tracker->votesMap, &progress->id, sizeof(SyncNodeId*));
|
||||||
|
if (v == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) {
|
if (*v) {
|
||||||
g++;
|
g++;
|
||||||
} else {
|
} else {
|
||||||
r++;
|
r++;
|
||||||
|
@ -77,12 +114,43 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r
|
||||||
|
|
||||||
if (rejected) *rejected = r;
|
if (rejected) *rejected = r;
|
||||||
if (granted) *granted = g;
|
if (granted) *granted = g;
|
||||||
return syncRaftVoteResult(&(tracker->config.voters), tracker->votes);
|
return syncRaftVoteResult(&(tracker->config.voters), tracker->votesMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) {
|
void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) {
|
||||||
memcpy(&cs->voters, &tracker->config.voters.incoming, sizeof(SSyncRaftNodeMap));
|
syncRaftCopyNodeMap(&tracker->config.voters.incoming, &cs->voters);
|
||||||
memcpy(&cs->votersOutgoing, &tracker->config.voters.outgoing, sizeof(SSyncRaftNodeMap));
|
syncRaftCopyNodeMap(&tracker->config.voters.outgoing, &cs->votersOutgoing);
|
||||||
memcpy(&cs->learners, &tracker->config.learners, sizeof(SSyncRaftNodeMap));
|
syncRaftCopyNodeMap(&tracker->config.learners, &cs->learners);
|
||||||
memcpy(&cs->learnersNext, &tracker->config.learnersNext, sizeof(SSyncRaftNodeMap));
|
syncRaftCopyNodeMap(&tracker->config.learnersNext, &cs->learnersNext);
|
||||||
|
cs->autoLeave = tracker->config.autoLeave;
|
||||||
|
}
|
||||||
|
|
||||||
|
// matchAckIndexer is the indexer callback handed to the quorum code: arg is
// the progress tracker. It stores in *index the matchIndex of the progress
// entry found for id, or 0 when no entry is found.
static void matchAckIndexer(SyncNodeId id, void* arg, SyncIndex* index) {
  SSyncRaftProgressTracker* pTracker = (SSyncRaftProgressTracker*)arg;
  SSyncRaftProgress* found = syncRaftFindProgressByNodeId(&pTracker->progressMap, id);

  *index = (found != NULL) ? found->matchIndex : 0;
}
|
||||||
|
|
||||||
|
// Committed returns the largest log index known to be committed based on what
|
||||||
|
// the voting members of the group have acknowledged.
|
||||||
|
SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker) {
|
||||||
|
return syncRaftJointConfigCommittedIndex(&tracker->config.voters, matchAckIndexer, tracker);
|
||||||
|
}
|
||||||
|
|
||||||
|
// visitProgressActive is a progress-map visitor: arg is a hash table, into
// which it stores progress->recentActive keyed by progress->id.
static void visitProgressActive(SSyncRaftProgress* progress, void* arg) {
  taosHashPut((SHashObj*)arg, &progress->id, sizeof(SyncNodeId), &progress->recentActive, sizeof(bool));
}
|
||||||
|
|
||||||
|
// QuorumActive returns true if the quorum is active from the view of the local
|
||||||
|
// raft state machine. Otherwise, it returns false.
|
||||||
|
bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker) {
|
||||||
|
SHashObj* votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK);
|
||||||
|
syncRaftVisitProgressMap(&tracker->progressMap, visitProgressActive, votesMap);
|
||||||
|
|
||||||
|
return syncRaftVoteResult(&tracker->config.voters, votesMap) == SYNC_RAFT_VOTE_WON;
|
||||||
}
|
}
|
|
@ -13,6 +13,7 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "sync_raft_node_map.h"
|
||||||
#include "sync_raft_quorum_majority.h"
|
#include "sync_raft_quorum_majority.h"
|
||||||
#include "sync_raft_quorum_joint.h"
|
#include "sync_raft_quorum_joint.h"
|
||||||
#include "sync_raft_quorum.h"
|
#include "sync_raft_quorum.h"
|
||||||
|
@ -22,9 +23,9 @@
|
||||||
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
* a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||||
* requires both majority quorums to vote in favor.
|
* requires both majority quorums to vote in favor.
|
||||||
**/
|
**/
|
||||||
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes) {
|
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap) {
|
||||||
ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votes);
|
ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votesMap);
|
||||||
ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votes);
|
ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votesMap);
|
||||||
|
|
||||||
if (r1 == r2) {
|
if (r1 == r2) {
|
||||||
// If they agree, return the agreed state.
|
// If they agree, return the agreed state.
|
||||||
|
@ -40,46 +41,35 @@ ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const E
|
||||||
return SYNC_RAFT_VOTE_PENDING;
|
return SYNC_RAFT_VOTE_PENDING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config) {
|
||||||
|
syncRaftInitNodeMap(&config->incoming);
|
||||||
|
syncRaftInitNodeMap(&config->outgoing);
|
||||||
|
}
|
||||||
|
|
||||||
|
void syncRaftFreeQuorumJointConfig(SSyncRaftQuorumJointConfig* config) {
|
||||||
|
syncRaftFreeNodeMap(&config->incoming);
|
||||||
|
syncRaftFreeNodeMap(&config->outgoing);
|
||||||
|
}
|
||||||
|
|
||||||
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||||
int i, min;
|
syncRaftAddToNodeMap(&config->incoming, id);
|
||||||
|
|
||||||
for (i = 0, min = -1; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (config->incoming.nodeId[i] == id) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (min == -1 && config->incoming.nodeId[i] == SYNC_NON_NODE_ID) {
|
|
||||||
min = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(min != -1);
|
|
||||||
config->incoming.nodeId[min] = id;
|
|
||||||
config->incoming.replica += 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) {
|
||||||
int i;
|
syncRaftRemoveFromNodeMap(&config->incoming, id);
|
||||||
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
|
||||||
if (config->incoming.nodeId[i] == id) {
|
|
||||||
config->incoming.replica -= 1;
|
|
||||||
config->incoming.nodeId[i] = SYNC_NON_NODE_ID;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(config->incoming.replica >= 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) {
|
||||||
|
syncRaftCopyNodeMap(&config->incoming, nodeMap);
|
||||||
|
|
||||||
bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
|
syncRaftUnionNodeMap(&config->outgoing, nodeMap);
|
||||||
int i;
|
}
|
||||||
|
|
||||||
for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
|
SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg) {
|
||||||
if (nodeId == nodeMap->nodeId[i]) {
|
SyncIndex index0, index1;
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
index0 = syncRaftMajorityConfigCommittedIndex(&config->incoming, indexer, arg);
|
||||||
|
index1 = syncRaftMajorityConfigCommittedIndex(&config->outgoing, indexer, arg);
|
||||||
|
|
||||||
|
return index0 < index1 ? index0 : index1;
|
||||||
}
|
}
|
|
@ -13,42 +13,109 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "sync_const.h"
|
||||||
#include "sync_raft_quorum.h"
|
#include "sync_raft_quorum.h"
|
||||||
#include "sync_raft_quorum_majority.h"
|
#include "sync_raft_quorum_majority.h"
|
||||||
|
#include "sync_raft_node_map.h"
|
||||||
|
|
||||||
/**
|
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||||
* syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
// a result indicating whether the vote is pending (i.e. neither a quorum of
|
||||||
* a result indicating whether the vote is pending (i.e. neither a quorum of
|
// yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||||
* yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
// quorum of no has been reached).
|
||||||
* quorum of no has been reached).
|
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap) {
|
||||||
**/
|
int n = syncRaftNodeMapSize(config);
|
||||||
ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes) {
|
if (n == 0) {
|
||||||
if (config->replica == 0) {
|
// By convention, the elections on an empty config win. This comes in
|
||||||
|
// handy with joint quorums because it'll make a half-populated joint
|
||||||
|
// quorum behave like a majority quorum.
|
||||||
return SYNC_RAFT_VOTE_WON;
|
return SYNC_RAFT_VOTE_WON;
|
||||||
}
|
}
|
||||||
|
|
||||||
int i, g, r, missing;
|
int i, g, r, missing;
|
||||||
for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) {
|
i = g = r = missing = 0;
|
||||||
if (config->nodeId[i] == SYNC_NON_NODE_ID) {
|
SyncNodeId* pId = NULL;
|
||||||
|
while (!syncRaftIterateNodeMap(config, pId)) {
|
||||||
|
const bool* v = (const bool*)taosHashGet(votesMap, pId, sizeof(SyncNodeId*));
|
||||||
|
if (v == NULL) {
|
||||||
|
missing += 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) {
|
if (*v) {
|
||||||
missing += 1;
|
|
||||||
} else if (votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) {
|
|
||||||
g +=1;
|
g +=1;
|
||||||
} else {
|
} else {
|
||||||
r += 1;
|
r += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int quorum = config->replica / 2 + 1;
|
int quorum = n / 2 + 1;
|
||||||
if (g >= quorum) {
|
if (g >= quorum) {
|
||||||
return SYNC_RAFT_VOTE_WON;
|
return SYNC_RAFT_VOTE_WON;
|
||||||
}
|
}
|
||||||
if (r + missing >= quorum) {
|
if (g + missing >= quorum) {
|
||||||
return SYNC_RAFT_VOTE_PENDING;
|
return SYNC_RAFT_VOTE_PENDING;
|
||||||
}
|
}
|
||||||
|
|
||||||
return SYNC_RAFT_VOTE_LOST;
|
return SYNC_RAFT_VOTE_LOST;
|
||||||
|
}
|
||||||
|
|
||||||
|
int compSyncIndex(const void * elem1, const void * elem2) {
|
||||||
|
SyncIndex index1 = *((SyncIndex*)elem1);
|
||||||
|
SyncIndex index2 = *((SyncIndex*)elem1);
|
||||||
|
if (index1 > index2) return 1;
|
||||||
|
if (index1 < index2) return -1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// syncRaftMajorityConfigCommittedIndex computes the committed index of a
// majority config: the largest index acknowledged by a quorum of its voters.
//
// config  - the voters of this majority config
// indexer - callback that reports the acknowledged index of one voter
// arg     - opaque argument forwarded to indexer
//
// Returns kMaxCommitIndex for an empty config, and 0 when the scratch buffer
// cannot be allocated.
SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg) {
  int n = syncRaftNodeMapSize(config);
  if (n == 0) {
    // This plays well with joint quorums which, when one half is the zero
    // MajorityConfig, should behave like the other half.
    return kMaxCommitIndex;
  }

  // Use an on-stack buffer to collect the indexes when n <= TSDB_MAX_REPLICA
  // (otherwise we alloc). Both buffers start out zeroed: a slot that is never
  // filled corresponds to a voter that may report in, but hasn't yet.
  SyncIndex  srk[TSDB_MAX_REPLICA] = {0};
  SyncIndex* srt = &srk[0];
  if (n > TSDB_MAX_REPLICA) {
    srt = (SyncIndex*)calloc((size_t)n, sizeof(SyncIndex));
    if (srt == NULL) {
      // Without the buffer nothing can be proven committed. Reporting 0 is
      // the conservative answer; kMaxCommitIndex would claim that every
      // entry is committed.
      return 0;
    }
  }

  // Fill the buffer with the indexes observed.
  // NOTE(review): pId is passed by value and is never reassigned in this
  // function, so as written it stays NULL across iterations - confirm the
  // contract of syncRaftIterateNodeMap.
  SyncNodeId* pId = NULL;
  int         i = 0;
  SyncIndex   index;
  while (!syncRaftIterateNodeMap(config, pId)) {
    indexer(*pId, arg, &index);
    // Never write past the n slots that were sized from the node map.
    if (i < n) {
      srt[i++] = index;
    }
  }

  // Sort ascending so that unfilled (zero) slots end up on the left.
  qsort(srt, n, sizeof(SyncIndex), compSyncIndex);

  // The smallest index into the array for which the value is acked by a
  // quorum. In other words, from the end of the slice, move n/2+1 to the
  // left (accounting for zero-indexing).
  index = srt[n - (n / 2 + 1)];
  if (srt != &srk[0]) {
    free(srt);
  }

  return index;
}
|
|
@ -17,6 +17,7 @@
|
||||||
#include "sync_raft_restore.h"
|
#include "sync_raft_restore.h"
|
||||||
#include "sync_raft_progress_tracker.h"
|
#include "sync_raft_progress_tracker.h"
|
||||||
|
|
||||||
|
static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t);
|
||||||
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in);
|
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in);
|
||||||
|
|
||||||
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
|
// syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and
|
||||||
|
@ -27,21 +28,26 @@ static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleA
|
||||||
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
||||||
// this can just take LastIndex and MaxInflight directly instead and cook up
|
// this can just take LastIndex and MaxInflight directly instead and cook up
|
||||||
// the results from that alone.
|
// the results from that alone.
|
||||||
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) {
|
int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs,
|
||||||
|
SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
|
||||||
SSyncConfChangeSingleArray outgoing;
|
SSyncConfChangeSingleArray outgoing;
|
||||||
SSyncConfChangeSingleArray incoming;
|
SSyncConfChangeSingleArray incoming;
|
||||||
SSyncConfChangeSingleArray css;
|
SSyncConfChangeSingleArray css;
|
||||||
SSyncRaftProgressTracker* tracker = changer->tracker;
|
SSyncRaftProgressTracker* tracker = changer->tracker;
|
||||||
SSyncRaftProgressTrackerConfig* config = &tracker->config;
|
|
||||||
SSyncRaftProgressMap* progressMap = &tracker->progressMap;
|
|
||||||
int i, ret;
|
int i, ret;
|
||||||
|
|
||||||
|
syncRaftInitConfArray(&outgoing);
|
||||||
|
syncRaftInitConfArray(&incoming);
|
||||||
|
|
||||||
|
syncRaftInitTrackConfig(config);
|
||||||
|
syncRaftInitProgressMap(progressMap);
|
||||||
|
|
||||||
ret = toConfChangeSingle(cs, &outgoing, &incoming);
|
ret = toConfChangeSingle(cs, &outgoing, &incoming);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outgoing.n == 0) {
|
if (syncRaftConfArrayIsEmpty(&outgoing)) {
|
||||||
// No outgoing config, so just apply the incoming changes one by one.
|
// No outgoing config, so just apply the incoming changes one by one.
|
||||||
for (i = 0; i < incoming.n; ++i) {
|
for (i = 0; i < incoming.n; ++i) {
|
||||||
css = (SSyncConfChangeSingleArray) {
|
css = (SSyncConfChangeSingleArray) {
|
||||||
|
@ -52,6 +58,9 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs)
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
syncRaftCopyTrackerConfig(config, &changer->tracker->config);
|
||||||
|
syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// The ConfState describes a joint configuration.
|
// The ConfState describes a joint configuration.
|
||||||
|
@ -68,6 +77,8 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs)
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
syncRaftCopyTrackerConfig(config, &changer->tracker->config);
|
||||||
|
syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap);
|
ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap);
|
||||||
|
@ -77,11 +88,24 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs)
|
||||||
}
|
}
|
||||||
|
|
||||||
out:
|
out:
|
||||||
if (incoming.n != 0) free(incoming.changes);
|
syncRaftFreeConfArray(&incoming);
|
||||||
if (outgoing.n != 0) free(outgoing.changes);
|
syncRaftFreeConfArray(&outgoing);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// addToConfChangeSingleArray appends one change of type t to out->changes for
// every node id yielded by iterating nodeMap.
//
// out     - destination array; out->changes must already have room for
//           out->n entries
// i       - in/out write cursor into out->changes, advanced once per node
// nodeMap - the node ids to emit
// t       - the change type stamped on every emitted entry
static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t) {
  // NOTE(review): pId is passed by value and is never reassigned in this
  // function, so as written it stays NULL across iterations - confirm the
  // contract of syncRaftIterateNodeMap.
  SyncNodeId* pId = NULL;

  while (!syncRaftIterateNodeMap(nodeMap, pId)) {
    // The array was sized from the node-map sizes; catch a mismatch before
    // it turns into a write past the end of the allocation.
    assert(*i < out->n);
    out->changes[*i] = (SSyncConfChangeSingle) {
      .type = t,
      .nodeId = *pId,
    };
    *i += 1;
  }
}
|
||||||
|
|
||||||
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
|
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
|
||||||
// first the config that will become the outgoing one, and then the incoming one, and
|
// first the config that will become the outgoing one, and then the incoming one, and
|
||||||
// b) another slice that, when applied to the config resulted from 1), represents the
|
// b) another slice that, when applied to the config resulted from 1), represents the
|
||||||
|
@ -89,15 +113,16 @@ out:
|
||||||
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) {
|
static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) {
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
out->n = in->n = 0;
|
out->n = syncRaftNodeMapSize(&cs->votersOutgoing);
|
||||||
|
|
||||||
out->n = cs->votersOutgoing.replica;
|
|
||||||
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n);
|
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n);
|
||||||
if (out->changes == NULL) {
|
if (out->changes == NULL) {
|
||||||
out->n = 0;
|
out->n = 0;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
in->n = cs->votersOutgoing.replica + cs->voters.replica + cs->learners.replica + cs->learnersNext.replica;
|
in->n = syncRaftNodeMapSize(&cs->votersOutgoing) +
|
||||||
|
syncRaftNodeMapSize(&cs->voters) +
|
||||||
|
syncRaftNodeMapSize(&cs->learners) +
|
||||||
|
syncRaftNodeMapSize(&cs->learnersNext);
|
||||||
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n);
|
out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n);
|
||||||
if (in->changes == NULL) {
|
if (in->changes == NULL) {
|
||||||
in->n = 0;
|
in->n = 0;
|
||||||
|
@ -132,50 +157,24 @@ static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleA
|
||||||
//
|
//
|
||||||
// as desired.
|
// as desired.
|
||||||
|
|
||||||
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
|
// If there are outgoing voters, first add them one by one so that the
|
||||||
// If there are outgoing voters, first add them one by one so that the
|
// (non-joint) config has them all.
|
||||||
// (non-joint) config has them all.
|
i = 0;
|
||||||
out->changes[i] = (SSyncConfChangeSingle) {
|
addToConfChangeSingleArray(out, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_AddNode);
|
||||||
.type = SYNC_RAFT_Conf_AddNode,
|
assert(i == out->n);
|
||||||
.nodeId = cs->votersOutgoing.nodeId[i],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// We're done constructing the outgoing slice, now on to the incoming one
|
// We're done constructing the outgoing slice, now on to the incoming one
|
||||||
// (which will apply on top of the config created by the outgoing slice).
|
// (which will apply on top of the config created by the outgoing slice).
|
||||||
|
i = 0;
|
||||||
|
|
||||||
// First, we'll remove all of the outgoing voters.
|
// First, we'll remove all of the outgoing voters.
|
||||||
int j = 0;
|
addToConfChangeSingleArray(in, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_RemoveNode);
|
||||||
for (i = 0; i < cs->votersOutgoing.replica; ++i) {
|
|
||||||
in->changes[j] = (SSyncConfChangeSingle) {
|
|
||||||
.type = SYNC_RAFT_Conf_RemoveNode,
|
|
||||||
.nodeId = cs->votersOutgoing.nodeId[i],
|
|
||||||
};
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
// Then we'll add the incoming voters and learners.
|
// Then we'll add the incoming voters and learners.
|
||||||
for (i = 0; i < cs->voters.replica; ++i) {
|
addToConfChangeSingleArray(in, &i, &cs->voters, SYNC_RAFT_Conf_AddNode);
|
||||||
in->changes[j] = (SSyncConfChangeSingle) {
|
addToConfChangeSingleArray(in, &i, &cs->learners, SYNC_RAFT_Conf_AddLearnerNode);
|
||||||
.type = SYNC_RAFT_Conf_AddNode,
|
addToConfChangeSingleArray(in, &i, &cs->learnersNext, SYNC_RAFT_Conf_AddLearnerNode);
|
||||||
.nodeId = cs->voters.nodeId[i],
|
assert(i == in->n);
|
||||||
};
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
for (i = 0; i < cs->learners.replica; ++i) {
|
|
||||||
in->changes[j] = (SSyncConfChangeSingle) {
|
|
||||||
.type = SYNC_RAFT_Conf_AddLearnerNode,
|
|
||||||
.nodeId = cs->learners.nodeId[i],
|
|
||||||
};
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
// Same for LearnersNext; these are nodes we want to be learners but which
|
|
||||||
// are currently voters in the outgoing config.
|
|
||||||
for (i = 0; i < cs->learnersNext.replica; ++i) {
|
|
||||||
in->changes[j] = (SSyncConfChangeSingle) {
|
|
||||||
.type = SYNC_RAFT_Conf_AddLearnerNode,
|
|
||||||
.nodeId = cs->learnersNext.nodeId[i],
|
|
||||||
};
|
|
||||||
j += 1;
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
|
@ -158,6 +158,8 @@ static void tkvInit() {
|
||||||
#ifdef USE_ROCKSDB
|
#ifdef USE_ROCKSDB
|
||||||
defaultReadOpts.ropts = rocksdb_readoptions_create();
|
defaultReadOpts.ropts = rocksdb_readoptions_create();
|
||||||
defaultWriteOpts.wopts = rocksdb_writeoptions_create();
|
defaultWriteOpts.wopts = rocksdb_writeoptions_create();
|
||||||
|
rocksdb_writeoptions_disable_WAL(defaultWriteOpts.wopts, true);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -166,4 +168,4 @@ static void tkvClear() {
|
||||||
rocksdb_readoptions_destroy(defaultReadOpts.ropts);
|
rocksdb_readoptions_destroy(defaultReadOpts.ropts);
|
||||||
rocksdb_writeoptions_destroy(defaultWriteOpts.wopts);
|
rocksdb_writeoptions_destroy(defaultWriteOpts.wopts);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,11 +19,19 @@ int32_t walInit() { return 0; }
|
||||||
|
|
||||||
void walCleanUp() {}
|
void walCleanUp() {}
|
||||||
|
|
||||||
// walOpen allocates a WAL handle. path and pCfg are currently not used.
//
// Returns a zero-initialised handle that the caller releases with walClose,
// or NULL when the allocation fails.
SWal *walOpen(char *path, SWalCfg *pCfg) {
  // calloc so callers never observe indeterminate field values.
  SWal *pWal = calloc(1, sizeof(SWal));
  if (pWal == NULL) {
    return NULL;
  }
  return pWal;
}
|
||||||
|
|
||||||
int32_t walAlter(SWal *pWal, SWalCfg *pCfg) { return 0; }
|
int32_t walAlter(SWal *pWal, SWalCfg *pCfg) { return 0; }
|
||||||
|
|
||||||
// walClose releases a handle obtained from walOpen. Passing NULL is allowed
// and does nothing, because free(NULL) is a no-op.
void walClose(SWal *pWal) {
  free(pWal);
}
|
||||||
|
|
||||||
void walFsync(SWal *pWal, bool force) {}
|
void walFsync(SWal *pWal, bool force) {}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue