From d4cd3836f1e40c9d45459ba766e8a0d46b96b295 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Thu, 21 Oct 2021 22:36:17 +0800 Subject: [PATCH 01/21] add lucene --- cmake/cmake.options | 6 +-- cmake/lucene_CMakeLists.txt.in | 5 +- deps/CMakeLists.txt | 5 ++ include/libs/index/index.h | 62 +++++++++++++++++++++++- source/libs/index/CMakeLists.txt | 25 +++++++++- source/libs/index/inc/indexInt.h | 21 +++++++- source/libs/index/src/index.c | 70 ++++++++++++++++++++++++--- source/libs/index/test/indexTests.cpp | 15 ++++++ source/libs/tkv/src/tkv.c | 4 +- 9 files changed, 196 insertions(+), 17 deletions(-) diff --git a/cmake/cmake.options b/cmake/cmake.options index 74b0d9fdbb..bbecf614d1 100644 --- a/cmake/cmake.options +++ b/cmake/cmake.options @@ -16,17 +16,17 @@ option( option( BUILD_WITH_ROCKSDB "If build with rocksdb" - OFF + OF ) option( BUILD_WITH_LUCENE "If build with lucene" - OFF + on ) option( BUILD_DEPENDENCY_TESTS "If build dependency tests" OFF -) \ No newline at end of file +) diff --git a/cmake/lucene_CMakeLists.txt.in b/cmake/lucene_CMakeLists.txt.in index 91e144dced..436ac64475 100644 --- a/cmake/lucene_CMakeLists.txt.in +++ b/cmake/lucene_CMakeLists.txt.in @@ -1,8 +1,7 @@ # lucene ExternalProject_Add(lucene - GIT_REPOSITORY https://github.com/taosdata-contrib/LucenePlusPlus.git - GIT_TAG rel_3.0.8 + GIT_REPOSITORY https://github.com/yihaoDeng/LucenePlusPlus.git SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/lucene" BINARY_DIR "" #BUILD_IN_SOURCE TRUE @@ -10,4 +9,4 @@ ExternalProject_Add(lucene BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" -) \ No newline at end of file +) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt index e35417b4c5..7392763d03 100644 --- a/deps/CMakeLists.txt +++ b/deps/CMakeLists.txt @@ -65,6 +65,11 @@ endif(${BUILD_WITH_ROCKSDB}) if(${BUILD_WITH_LUCENE}) option(ENABLE_TEST "Enable the tests" OFF) add_subdirectory(lucene) + target_include_directories( + lucene++ + PUBLIC $ + ) + endif(${BUILD_WITH_LUCENE}) # 
================================================================================================ diff --git a/include/libs/index/index.h b/include/libs/index/index.h index f821b437af..bdd0905234 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -16,12 +16,72 @@ #ifndef _TD_INDEX_H_ #define _TD_INDEX_H_ +#include "os.h" +#include "tarray.h" + #ifdef __cplusplus extern "C" { #endif +typedef struct SIndex SIndex; +typedef struct SIndexOpts SIndexOpts; + +typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; +typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; + + + +typedef struct SIndexTermQuery { + EIndexQueryType opera; + SArray *querys; +} SIndexTermQuery; + +// tag and tag val; +typedef struct SIndexPair { + char *key; + char *val; +} SIndexPair; + +// +typedef struct SIndexTerm { + SIndexPair* field_value; + EIndexQueryType type; +} SIndexTerm; + + + +/* + * @param: oper + * +*/ + +SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper); +void indexTermQueryDestroy(SIndexTermQuery *pQuery); +int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); + + + +/* + * @param: + * @param: + */ +SIndex* indexOpen(SIndexOpts *opt, const char *path); + +void indexClose(SIndex *index); +int indexPut(SIndex *index, SArray *pairs, int uid); +int indexDelete(SIndex *index, SIndexTermQuery *query); +int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result); +int indexRebuild(SIndex *index, SIndexOpts *opt); + +/* + * @param: + * @param: + */ +SIndexOpts *indexOptsCreate(); +void indexOptsDestroy(SIndexOpts *opts); + #ifdef __cplusplus } #endif -#endif /*_TD_INDEX_H_*/ \ No newline at end of file +#endif /*_TD_INDEX_H_*/ diff --git a/source/libs/index/CMakeLists.txt b/source/libs/index/CMakeLists.txt index 638d3f64cd..3da2c93b39 100644 --- a/source/libs/index/CMakeLists.txt +++ 
b/source/libs/index/CMakeLists.txt @@ -4,4 +4,27 @@ target_include_directories( index PUBLIC "${CMAKE_SOURCE_DIR}/include/libs/index" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/inc" -) \ No newline at end of file +) +target_link_libraries( + index + PUBLIC os + PUBLIC util +) + +if (${BUILD_WITH_LUCENE}) + target_include_directories( + index + PUBLIC "${CMAKE_SOURCE_DIR}/deps/lucene/include" + ) + LINK_DIRECTORIES("${CMAKE_SOURCE_DIR}/deps/lucene/debug/src/core") + target_link_libraries( + index + PUBLIC lucene++ + ) + +endif(${BUILD_WITH_LUCENE}) + +if (${BUILD_TEST}) + add_subdirectory(test) +endif(${BUILD_TEST}) + diff --git a/source/libs/index/inc/indexInt.h b/source/libs/index/inc/indexInt.h index 81eba4ec91..8d8c950075 100644 --- a/source/libs/index/inc/indexInt.h +++ b/source/libs/index/inc/indexInt.h @@ -16,12 +16,31 @@ #ifndef _TD_INDEX_INT_H_ #define _TD_INDEX_INT_H_ +#include "index.h" + +#ifdef USE_LUCENE +#include +#endif + + #ifdef __cplusplus extern "C" { #endif +struct SIndex { +#ifdef USE_LUCENE + index_t *index; +#endif +}; + +struct SIndexOpts { +#ifdef USE_LUCENE + void *opts; +#endif +}; + #ifdef __cplusplus } #endif -#endif /*_TD_INDEX_INT_H_*/ \ No newline at end of file +#endif /*_TD_INDEX_INT_H_*/ diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index f821b437af..46039249c5 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -13,15 +13,71 @@ * along with this program. If not, see . 
*/ -#ifndef _TD_INDEX_H_ -#define _TD_INDEX_H_ +#include "index.h" +#include "indexInt.h" -#ifdef __cplusplus -extern "C" { +#ifdef USE_LUCENE +#include "lucene++/Lucene_c.h" #endif -#ifdef __cplusplus +static pthread_once_t isInit = PTHREAD_ONCE_INIT; + +static void indexInit(); + +SIndex *indexOpen(SIndexOpts *opts, const char *path) { + pthread_once(&isInit, indexInit); +#ifdef USE_LUCENE + index_t *index = index_open(path); + SIndex *p = malloc(sizeof(SIndex)); + p->index = index; + return p; +#endif + return NULL; } -#endif -#endif /*_TD_INDEX_H_*/ \ No newline at end of file +void indexClose(SIndex *index) { +#ifdef USE_LUCENE + index_close(index->index); +#endif + free(index); + return; + +} +int indexPut(SIndex *index, SArray* field_vals, int uid) { + return 1; + +} +int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result) { + return 1; +} + +int indexDelete(SIndex *index, SIndexTermQuery *query) { + return 1; +} +int indexRebuild(SIndex *index, SIndexOpts *opts); + + +SIndexOpts *indexOptsCreate() { + return NULL; +} +void indexOptsDestroy(SIndexOpts *opts) { + +} +/* + * @param: oper + * +*/ + +SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper) { + return NULL; +} +void indexTermQueryDestroy(SIndexTermQuery *pQuery) { + +} +int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ + return 1; +} + +void indexInit() { + //do nothing +} diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index e69de29bb2..047491838f 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -0,0 +1,15 @@ +#include +#include +#include +#include "index.h" + + + +TEST(IndexTest, index_create_test) { + SIndexOpts *opts = indexOptsCreate(); + SIndex *index = indexOpen(opts, "./"); + if (index == NULL) { + std::cout << "index open failed" << std::endl; + } + indexOptsDestroy(opts); +} diff --git 
a/source/libs/tkv/src/tkv.c b/source/libs/tkv/src/tkv.c index 5319f6b9da..0c6f896d56 100644 --- a/source/libs/tkv/src/tkv.c +++ b/source/libs/tkv/src/tkv.c @@ -158,6 +158,8 @@ static void tkvInit() { #ifdef USE_ROCKSDB defaultReadOpts.ropts = rocksdb_readoptions_create(); defaultWriteOpts.wopts = rocksdb_writeoptions_create(); + rocksdb_writeoptions_disable_WAL(defaultWriteOpts.wopts, true); + #endif } @@ -166,4 +168,4 @@ static void tkvClear() { rocksdb_readoptions_destroy(defaultReadOpts.ropts); rocksdb_writeoptions_destroy(defaultWriteOpts.wopts); #endif -} \ No newline at end of file +} From 9e74ea9ed3cd21052576c9a2194518c54b105fc7 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Fri, 22 Oct 2021 22:47:26 +0800 Subject: [PATCH 02/21] add lucene test --- include/libs/index/index.h | 46 ++++------- source/libs/index/inc/indexInt.h | 21 +++++ source/libs/index/src/index.c | 106 +++++++++++++++++++++++--- source/libs/index/test/indexTests.cpp | 30 +++++++- 4 files changed, 159 insertions(+), 44 deletions(-) diff --git a/include/libs/index/index.h b/include/libs/index/index.h index bdd0905234..346214e0c8 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -25,54 +25,36 @@ extern "C" { typedef struct SIndex SIndex; typedef struct SIndexOpts SIndexOpts; +typedef struct SIndexMultiTermQuery SIndexMultiTermQuery; +typedef struct SArray SIndexMultiTerm; +//typedef struct SIndexMultiTerm SIndexMultiTerm; typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; - -typedef struct SIndexTermQuery { - EIndexQueryType opera; - SArray *querys; -} SIndexTermQuery; - -// tag and tag val; -typedef struct SIndexPair { - char *key; - char *val; -} SIndexPair; - -// -typedef struct SIndexTerm { - SIndexPair* field_value; - EIndexQueryType type; -} SIndexTerm; - - - /* * @param: oper * */ - -SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper); 
-void indexTermQueryDestroy(SIndexTermQuery *pQuery); -int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); - - +SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType oper); +void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery); +int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); /* * @param: * @param: */ SIndex* indexOpen(SIndexOpts *opt, const char *path); +void indexClose(SIndex *index); +int indexPut(SIndex *index, SIndexMultiTerm *terms, int uid); +int indexDelete(SIndex *index, SIndexMultiTermQuery *query); +int indexSearch(SIndex *index, SIndexMultiTermQuery *query, SArray *result); +int indexRebuild(SIndex *index, SIndexOpts *opt); -void indexClose(SIndex *index); -int indexPut(SIndex *index, SArray *pairs, int uid); -int indexDelete(SIndex *index, SIndexTermQuery *query); -int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result); -int indexRebuild(SIndex *index, SIndexOpts *opt); - +SIndexMultiTerm *indexMultiTermCreate(); +int indexMultiTermAdd(SIndexMultiTerm *terms, const char *field, int32_t nFields, const char *value, int32_t nValue); +void indexMultiTermDestroy(SIndexMultiTerm *terms); /* * @param: * @param: diff --git a/source/libs/index/inc/indexInt.h b/source/libs/index/inc/indexInt.h index 8d8c950075..742427bf94 100644 --- a/source/libs/index/inc/indexInt.h +++ b/source/libs/index/inc/indexInt.h @@ -39,6 +39,27 @@ struct SIndexOpts { #endif }; +struct SIndexMultiTermQuery { + EIndexOperatorType opera; + SArray *query; +}; + +// field and key; +typedef struct SIndexTerm { + char *key; + int32_t nKey; + char *val; + int32_t nVal; +} SIndexTerm; + +typedef struct SIndexTermQuery { + SIndexTerm* field_value; + EIndexQueryType type; +} SIndexTermQuery; + + +SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, 
int32_t nVal); +void indexTermDestroy(SIndexTerm *p); #ifdef __cplusplus } #endif diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index 46039249c5..e4b2a4acc4 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -38,46 +38,130 @@ SIndex *indexOpen(SIndexOpts *opts, const char *path) { void indexClose(SIndex *index) { #ifdef USE_LUCENE index_close(index->index); + index->index = NULL; #endif free(index); return; } + +#ifdef USE_LUCENE +#endif int indexPut(SIndex *index, SArray* field_vals, int uid) { +#ifdef USE_LUCENE + index_document_t *doc = index_document_create(); + + char buf[16] = {0}; + sprintf(buf, "%d", uid); + + for (int i = 0; i < taosArrayGetSize(field_vals); i++) { + SIndexTerm *p = taosArrayGetP(field_vals, i); + index_document_add(doc, (const char *)(p->key), p->nKey, (const char *)(p->val), p->nVal, 1); + } + index_document_add(doc, NULL, 0, buf, strlen(buf), 0); + + index_put(index->index, doc); + index_document_destroy(doc); +#endif return 1; } -int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result) { +int indexSearch(SIndex *index, SIndexMultiTermQuery *multiQuerys, SArray *result) { +#ifdef USE_LUCENE + for (int i = 0; i < taosArrayGetSize(multiQuerys->query); i++) { + SIndexTermQuery *p = taosArrayGet(multiQuerys->query, i); + SIndexTerm *term = p->field_value; + EIndexQueryType qType = p->type; + int *tResult = NULL; + int32_t tsz = 0; + index_search(index->index, term->key, term->nKey, term->val, term->nVal, qType, &tResult, &tsz); + for (int i = 0; i < tsz; i++) { + taosArrayPush(result, &(tResult[i])); + } + + } +#endif return 1; } -int indexDelete(SIndex *index, SIndexTermQuery *query) { +int indexDelete(SIndex *index, SIndexMultiTermQuery *query) { return 1; } int indexRebuild(SIndex *index, SIndexOpts *opts); SIndexOpts *indexOptsCreate() { - return NULL; +#ifdef USE_LUCENE +#endif +return NULL; } void indexOptsDestroy(SIndexOpts *opts) { - +#ifdef USE_LUCENE +#endif 
} /* * @param: oper * */ -SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper) { - return NULL; +SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType opera) { + SIndexMultiTermQuery *p = (SIndexMultiTermQuery *)malloc(sizeof(SIndexMultiTermQuery)); + if (p == NULL) { return NULL; } + p->opera = opera; + p->query = taosArrayInit(1, sizeof(SIndexTermQuery)); + return p; } -void indexTermQueryDestroy(SIndexTermQuery *pQuery) { - -} -int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ - return 1; +void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery) { + for (int i = 0; i < taosArrayGetSize(pQuery->query); i++) { + SIndexTermQuery *p = (SIndexTermQuery *)taosArrayGet(pQuery->query, i); + indexTermDestroy(p->field_value); + } + taosArrayDestroy(pQuery->query); + free(pQuery); +}; +int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ + SIndexTerm *t = indexTermCreate(field, nFields, value, nValue); + if (t == NULL) {return -1;} + SIndexTermQuery q = {.type = type, .field_value = t}; + taosArrayPush(pQuery->query, &q); + return 0; } + +SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, int32_t nVal) { + SIndexTerm *t = (SIndexTerm *)malloc(sizeof(SIndexTerm)); + t->key = (char *)calloc(nKey + 1, 1); + memcpy(t->key, key, nKey); + t->nKey = nKey; + + t->val = (char *)calloc(nVal + 1, 1); + memcpy(t->val, val, nVal); + t->nVal = nVal; + return t; +} +void indexTermDestroy(SIndexTerm *p) { + free(p->key); + free(p->val); + free(p); +} + +SArray *indexMultiTermCreate() { + return taosArrayInit(4, sizeof(SIndexTerm *)); +} + +int indexMultiTermAdd(SArray *array, const char *field, int32_t nField, const char *val, int32_t nVal) { + SIndexTerm *term = indexTermCreate(field, nField, val, nVal); + if (term == NULL) { return -1; } + 
taosArrayPush(array, &term); + return 0; +} +void indexMultiTermDestroy(SArray *array) { + for (int32_t i = 0; i < taosArrayGetSize(array); i++) { + SIndexTerm *p = taosArrayGetP(array, i); + indexTermDestroy(p); + } + taosArrayDestroy(array); +} void indexInit() { //do nothing } diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index 047491838f..efa7f37a60 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -2,14 +2,42 @@ #include #include #include "index.h" +#include "indexInt.h" + TEST(IndexTest, index_create_test) { SIndexOpts *opts = indexOptsCreate(); - SIndex *index = indexOpen(opts, "./"); + SIndex *index = indexOpen(opts, "./test"); if (index == NULL) { std::cout << "index open failed" << std::endl; } + + + SArray* terms = indexMultiTermCreate(); + indexMultiTermAdd(terms, "tag1", strlen("tag1"), "field", strlen("field")); + for (int i = 0; i < 10; i++) { + indexPut(index, terms, i); + } + indexMultiTermDestroy(terms); + + + // query + SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); + indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); + + SArray *result = (SArray *)taosArrayInit(10, sizeof(int)); + indexSearch(index, multiQuery, result); + + std::cout << "taos'size : " << taosArrayGetSize(result) << std::endl; + for (int i = 0; i < taosArrayGetSize(result); i++) { + int *v = (int *)taosArrayGet(result, i); + std::cout << "value --->" << *v << std::endl; + } + indexMultiTermQueryDestroy(multiQuery); + indexOptsDestroy(opts); + indexClose(index); + // } From 551cfd5c35b50899bae7f9c86022021b57ff4c21 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 23 Oct 2021 16:12:36 +0800 Subject: [PATCH 03/21] add index test example --- include/libs/index/index.h | 11 ++++---- source/libs/index/src/index.c | 39 ++++++++++++++++++++------- source/libs/index/test/indexTests.cpp | 25 +++++++++++++---- 3 files changed, 
55 insertions(+), 20 deletions(-) diff --git a/include/libs/index/index.h b/include/libs/index/index.h index 346214e0c8..1b74928568 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -27,12 +27,9 @@ typedef struct SIndex SIndex; typedef struct SIndexOpts SIndexOpts; typedef struct SIndexMultiTermQuery SIndexMultiTermQuery; typedef struct SArray SIndexMultiTerm; -//typedef struct SIndexMultiTerm SIndexMultiTerm; typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; -typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; - - +typedef enum { QUERY_TERM = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; /* * @param: oper * @@ -40,7 +37,6 @@ typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType oper); void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery); int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); - /* * @param: * @param: @@ -51,7 +47,10 @@ int indexPut(SIndex *index, SIndexMultiTerm *terms, int uid); int indexDelete(SIndex *index, SIndexMultiTermQuery *query); int indexSearch(SIndex *index, SIndexMultiTermQuery *query, SArray *result); int indexRebuild(SIndex *index, SIndexOpts *opt); - +/* + * @param + * @param + */ SIndexMultiTerm *indexMultiTermCreate(); int indexMultiTermAdd(SIndexMultiTerm *terms, const char *field, int32_t nFields, const char *value, int32_t nValue); void indexMultiTermDestroy(SIndexMultiTerm *terms); diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index e4b2a4acc4..91cfcb5cdf 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -68,18 +68,39 @@ int indexPut(SIndex *index, SArray* field_vals, int uid) { } int indexSearch(SIndex *index, SIndexMultiTermQuery *multiQuerys, SArray 
*result) { #ifdef USE_LUCENE - for (int i = 0; i < taosArrayGetSize(multiQuerys->query); i++) { + EIndexOperatorType opera = multiQuerys->opera; + + int nQuery = taosArrayGetSize(multiQuerys->query); + char **fields = malloc(sizeof(char *) * nQuery); + char **keys = malloc(sizeof(char *) * nQuery); + int *types = malloc(sizeof(int) * nQuery); + + for (int i = 0; i < nQuery; i++) { SIndexTermQuery *p = taosArrayGet(multiQuerys->query, i); SIndexTerm *term = p->field_value; - EIndexQueryType qType = p->type; - int *tResult = NULL; - int32_t tsz = 0; - index_search(index->index, term->key, term->nKey, term->val, term->nVal, qType, &tResult, &tsz); - for (int i = 0; i < tsz; i++) { - taosArrayPush(result, &(tResult[i])); - } - + + fields[i] = calloc(1, term->nKey + 1); + keys[i] = calloc(1, term->nVal + 1); + + memcpy(fields[i], term->key, term->nKey); + memcpy(keys[i], term->val, term->nVal); + types[i] = (int)(p->type); } + int *tResult = NULL; + int tsz= 0; + index_multi_search(index->index, (const char **)fields, (const char **)keys, types, nQuery, opera, &tResult, &tsz); + + for (int i = 0; i < tsz; i++) { + taosArrayPush(result, &tResult[i]); + } + + for (int i = 0; i < nQuery; i++) { + free(fields[i]); + free(keys[i]); + } + free(fields); + free(keys); + free(types); #endif return 1; } diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index efa7f37a60..763a6a54d3 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -15,17 +15,32 @@ TEST(IndexTest, index_create_test) { } - SArray* terms = indexMultiTermCreate(); - indexMultiTermAdd(terms, "tag1", strlen("tag1"), "field", strlen("field")); - for (int i = 0; i < 10; i++) { - indexPut(index, terms, i); + + // write + for (int i = 0; i < 100000; i++) { + SIndexMultiTerm* terms = indexMultiTermCreate(); + std::string val = "field"; + + indexMultiTermAdd(terms, "tag1", strlen("tag1"), val.c_str(), val.size()); + + 
val.append(std::to_string(i)); + indexMultiTermAdd(terms, "tag2", strlen("tag2"), val.c_str(), val.size()); + + val.insert(0, std::to_string(i)); + indexMultiTermAdd(terms, "tag3", strlen("tag3"), val.c_str(), val.size()); + + val.append("const"); + indexMultiTermAdd(terms, "tag4", strlen("tag4"), val.c_str(), val.size()); + + indexPut(index, terms, i); + indexMultiTermDestroy(terms); } - indexMultiTermDestroy(terms); // query SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); + indexMultiTermQueryAdd(multiQuery, "tag3", strlen("tag3"), "0field0", strlen("0field0"), QUERY_TERM); SArray *result = (SArray *)taosArrayInit(10, sizeof(int)); indexSearch(index, multiQuery, result); From 8721dc4408d673dd9538c793b5fb970bfa5ecfcf Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 25 Oct 2021 15:14:58 +0800 Subject: [PATCH 04/21] add index test example --- source/libs/index/test/CMakeLists.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 source/libs/index/test/CMakeLists.txt diff --git a/source/libs/index/test/CMakeLists.txt b/source/libs/index/test/CMakeLists.txt new file mode 100644 index 0000000000..f2a7442a5a --- /dev/null +++ b/source/libs/index/test/CMakeLists.txt @@ -0,0 +1,23 @@ +add_executable(indexTest "") +target_sources(indexTest + PRIVATE + "../src/index.c" + "indexTests.cpp" +) +target_include_directories ( indexTest + PUBLIC + "${CMAKE_SOURCE_DIR}/include/libs/index" + "${CMAKE_CURRENT_SOURCE_DIR}/../inc" +) +target_link_libraries (indexTest + os + util + common + gtest_main + index +) + +add_test( + NAME index_test + COMMAND indexTest +) From d2485c4c8a9caaf9a588e11a3b7bc1ca78977d3e Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 25 Oct 2021 15:17:07 +0800 Subject: [PATCH 05/21] add index test example --- CMakeLists.txt | 1 + source/libs/index/test/indexTests.cpp | 3 ++- 2 files changed, 3 insertions(+), 
1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abb39c310a..3a2c29ae8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ endif(${BUILD_WITH_ROCKSDB}) ## lucene if(${BUILD_WITH_LUCENE}) cat("${CMAKE_SUPPORT_DIR}/lucene_CMakeLists.txt.in" ${DEPS_TMP_FILE}) + add_definitions(-DUSE_LUCENE) endif(${BUILD_WITH_LUCENE}) ## download dependencies diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index 763a6a54d3..cc0df0d42a 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -15,7 +15,6 @@ TEST(IndexTest, index_create_test) { } - // write for (int i = 0; i < 100000; i++) { SIndexMultiTerm* terms = indexMultiTermCreate(); @@ -32,6 +31,7 @@ TEST(IndexTest, index_create_test) { val.append("const"); indexMultiTermAdd(terms, "tag4", strlen("tag4"), val.c_str(), val.size()); + indexPut(index, terms, i); indexMultiTermDestroy(terms); } @@ -39,6 +39,7 @@ TEST(IndexTest, index_create_test) { // query SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); + indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); indexMultiTermQueryAdd(multiQuery, "tag3", strlen("tag3"), "0field0", strlen("0field0"), QUERY_TERM); From 980ace09b54144c1ee3d2d0d1c024d2a5fabe3ab Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 15 Nov 2021 11:36:51 +0800 Subject: [PATCH 06/21] add interface --- source/libs/index/inc/index_fst.h | 166 +++++++++++++ source/libs/index/src/index_fst.c | 48 ++++ source/libs/index/src/index_fst_common.c | 304 +++++++++++++++++++++++ 3 files changed, 518 insertions(+) create mode 100644 source/libs/index/inc/index_fst.h create mode 100644 source/libs/index/src/index_fst.c create mode 100644 source/libs/index/src/index_fst_common.c diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h new file mode 100644 index 0000000000..de4c957e29 --- /dev/null +++ 
b/source/libs/index/inc/index_fst.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _INDEX_FST_H_ +#define _INDEX_FST_H_ +#include "index_fst.h" +#include "tarray.h" + +typedef FstType uint64_t; +typedef CompiledAddr uint64_t; +typedef Output uint64_t; +typedef PackSizes uint8_t; + + +//A sentinel value used to indicate an empty final state +const CompileAddr EMPTY_ADDRESS = 0; +/// A sentinel value used to indicate an invalid state. +const CompileAddr NONE_ADDRESS = 1; + +// This version number is written to every finite state transducer created by +// this crate. When a finite state transducer is read, its version number is +// checked against this value. +const uint64_t version = 3; +// The threshold (in number of transitions) at which an index is created for +// a node's transitions. This speeds up lookup time at the expense of FST size + +const uint64_t TRANS_INDEX_THRESHOLD = 32; + +typedef struct FstRange { + uint64_t start; + uint64_t end; +} FstRange; + +enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal}; +enum FstBound { Included, Excluded, Unbounded}; + +typedef struct CheckSummer { + uint32_t sum; +}; + + +typedef struct FstBuilder { + FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. + FstUnFinishedNodes unfinished // The stack of unfinished nodes + Registry registry // A map of finished nodes. 
+ SArray* last // The last word added + CompiledAddr lastAddr // The address of the last compiled node + uint64_t len // num of keys added +} FstBuilder; + +typedef struct FstCountingWriter { + void* wtr; // wrap any writer that counts and checksum bytes written + uint64_t count; + CheckSummer summer; +}; + + + + +typedef struct FstTransition { + uint8_t inp; //The byte input associated with this transition. + Output out; //The output associated with this transition + CompiledAddr addr; //The address of the node that this transition points to +} FstTransition; + +typedef struct FstTransitions { + FstNode *node; + FstRange range; +} FstTransitions; + +typedef struct FstUnFinishedNodes { + SArray *stack; // +} FstUnFinishedNodes; + +typedef struct FstBuilderNode { + bool isFinal; + Output finalOutput; + SArray *trans; // +} FstBuilderNode; + + + +typedef struct FstLastTransition { + uint8_t inp; + Output out; +} FstLastTransition; + +typedef struct FstBuilderNodeUnfinished { + FstBuilderNode node; + FstLastTransition last; +} FstBuilderNodeUnfinished; + +typedef struct FstNode { + uint8_t* data; + uint64_t version; + State state; + CompiledAddr start; + CompiledAddr end; + bool isFinal; + uint64_t nTrans; + PackSizes sizes; + Output finalOutput; +} FstNode; + +typedef struct FstMeta { + uint64_t version; + CompiledAddr rootAddr; + FstType ty; + uint64_t len; + uint32_t checkSum; +} FstMeta; + +typedef struct Fst { + FstMeta meta; + void *data; // +}; + +// ops + +typedef struct FstIndexedValue { + uint64_t index; + uint64_t value; +}; + +// relate to Regist +typedef struct FstRegistry { + SArray *table; // + uint64_t tableSize; // num of rows + uint64_t mruSize; // num of columns +} FstRegistry; + +typedef struct FstRegistryCache { + SArray *cells; // +} FstRegistryCache; + +typedef struct FstRegistryCell { + CompiledAddr addr; + FstBuilderNode *node; +} FstRegistryCell; + +enum FstRegistryEntry {Found, NotFound, Rejected}; + +FstNode *fstNodeCreate(int64_t version, 
CompiledAddr addr, uint8_t *data); +FstTransitions fstNodeTransitionIter(FstNode *node); +FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i); +CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i); +int64_t fstNodeFindInput(FstNode *node, int8_t b); +Output fstNodeGetFinalOutput(FstNode *node); +void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode); + + + + +#endif diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c new file mode 100644 index 0000000000..4c6e20a7d5 --- /dev/null +++ b/source/libs/index/src/index_fst.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */
+
+#include "index_fst.h"
+
+// fstNodeCreate builds the FstNode stored at addr in data; returns NULL on allocation failure or when the encoding is not decoded yet.
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data) {
+  FstNode *n = (FstNode *)malloc(sizeof(FstNode));
+  if (n == NULL) { return NULL; }
+
+  if (addr == EMPTY_ADDRESS) {
+    n->data = NULL;  // NOTE(review): field spelled `date` before; assumes FstNode declares `data` - confirm
+    n->version = version;
+    n->state = EmptyFinal;
+    n->start = EMPTY_ADDRESS;
+    n->end = EMPTY_ADDRESS;
+    n->isFinal = true;
+    n->nTrans = 0;
+    n->sizes = 0;
+    n->finalOutput = 0;  // NOTE(review): spelled `finalOutpu` before; assumes the field is `finalOutput` - confirm
+    return n;
+  }
+  uint8_t v = (data[addr] & 0xC0) >> 6;  // tag = top two bits of the byte at addr
+  if (v == 0x3) {
+    // TODO: decode the node encoding selected by tag 0b11
+  } else if (v == 0x2) {
+    // TODO: decode the node encoding selected by tag 0b10
+  } else {
+    // TODO: decode the remaining node encodings (tags 0b00 / 0b01)
+  }
+  free(n);  // nothing is decoded yet: never hand out a half-initialized node
+  return NULL;
+}
+
+
+
diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c
new file mode 100644
index 0000000000..39e5f89b35
--- /dev/null
+++ b/source/libs/index/src/index_fst_common.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+#include <stdint.h>
+
+// COMMON_INPUTS maps each byte value (the array index, shown in the trailing
+// comment of every entry) to a compact code in [0, 255].
+// NOTE(review): presumably the codes rank bytes by how common they are - confirm.
+const uint8_t COMMON_INPUTS[] = {
+  84, // '\x00'
+  85, // '\x01'
+  86, // '\x02'
+  87, // '\x03'
+  88, // '\x04'
+  89, // '\x05'
+  90, // '\x06'
+  91, // '\x07'
+  92, // '\x08'
+  93, // '\t'
+  94, // '\n'
+  95, // '\x0b'
+  96, // '\x0c'
+  97, // '\r'
+  98, // '\x0e'
+  99, // '\x0f'
+  100, // '\x10'
+  101, // '\x11'
+  102, // '\x12'
+  103, // '\x13'
+  104, // '\x14'
+  105, // '\x15'
+  106, // '\x16'
+  107, // '\x17'
+  108, // '\x18'
+  109, // '\x19'
+  110, // '\x1a'
+  111, // '\x1b'
+  112, // '\x1c'
+  113, // '\x1d'
+  114, // '\x1e'
+  115, // '\x1f'
+  116, // ' '
+  80, // '!'
+  117, // '"'
+  118, // '#'
+  79, // '$'
+  39, // '%'
+  30, // '&'
+  81, // "'"
+  75, // '('
+  74, // ')'
+  82, // '*'
+  57, // '+'
+  66, // ','
+  16, // '-'
+  12, // '.'
+  2, // '/'
+  19, // '0'
+  20, // '1'
+  21, // '2'
+  27, // '3'
+  32, // '4'
+  29, // '5'
+  35, // '6'
+  36, // '7'
+  37, // '8'
+  34, // '9'
+  24, // ':'
+  73, // ';'
+  119, // '<'
+  23, // '='
+  120, // '>'
+  40, // '?'
+  83, // '@'
+  44, // 'A'
+  48, // 'B'
+  42, // 'C'
+  43, // 'D'
+  49, // 'E'
+  46, // 'F'
+  62, // 'G'
+  61, // 'H'
+  47, // 'I'
+  69, // 'J'
+  68, // 'K'
+  58, // 'L'
+  56, // 'M'
+  55, // 'N'
+  59, // 'O'
+  51, // 'P'
+  72, // 'Q'
+  54, // 'R'
+  45, // 'S'
+  52, // 'T'
+  64, // 'U'
+  65, // 'V'
+  63, // 'W'
+  71, // 'X'
+  67, // 'Y'
+  70, // 'Z'
+  77, // '['
+  121, // '\\'
+  78, // ']'
+  122, // '^'
+  31, // '_'
+  123, // '`'
+  4, // 'a'
+  25, // 'b'
+  9, // 'c'
+  17, // 'd'
+  1, // 'e'
+  26, // 'f'
+  22, // 'g'
+  13, // 'h'
+  7, // 'i'
+  50, // 'j'
+  38, // 'k'
+  14, // 'l'
+  15, // 'm'
+  10, // 'n'
+  3, // 'o'
+  8, // 'p'
+  60, // 'q'
+  6, // 'r'
+  5, // 's'
+  0, // 't'
+  18, // 'u'
+  33, // 'v'
+  11, // 'w'
+  41, // 'x'
+  28, // 'y'
+  53, // 'z'
+  124, // '{'
+  125, // '|'
+  126, // '}'
+  76, // '~'
+  127, // '\x7f'
+  128, // '\x80'
+  129, // '\x81'
+  130, // '\x82'
+  131, // '\x83'
+  132, // '\x84'
+  133, // '\x85'
+  134, // '\x86'
+  135, // '\x87'
+  136, // '\x88'
+  137, // '\x89'
+  138, // '\x8a'
+  139, // '\x8b'
+  140, // '\x8c'
+  141, // '\x8d'
+  142, // '\x8e'
+  143, // '\x8f'
+  144, // '\x90'
+  145, // '\x91'
+  146, // '\x92'
+  147, // '\x93'
+  148, // '\x94'
+  149, // '\x95'
+  150, // '\x96'
+  151, // '\x97'
+  152, // '\x98'
+  153, // '\x99'
+  154, // '\x9a'
+  155, // '\x9b'
+  156, // '\x9c'
+  157, // '\x9d'
+  158, // '\x9e'
+  159, // '\x9f'
+  160, // '\xa0'
+  161, // '¡'
+  162, // '¢'
+  163, // '£'
+  164, // '¤'
+  165, // '¥'
+  166, // '¦'
+  167, // '§'
+  168, // '¨'
+  169, // '©'
+  170, // 'ª'
+  171, // '«'
+  172, // '¬'
+  173, // '\xad'
+  174, // '®'
+  175, // '¯'
+  176, // '°'
+  177, // '±'
+  178, // '²'
+  179, // '³'
+  180, // '´'
+  181, // 'µ'
+  182, // '¶'
+  183, // '·'
+  184, // '¸'
+  185, // '¹'
+  186, // 'º'
+  187, // '»'
+  188, // '¼'
+  189, // '½'
+  190, // '¾'
+  191, // '¿'
+  192, // 'À'
+  193, // 'Á'
+  194, // 'Â'
+  195, // 'Ã'
+  196, // 'Ä'
+  197, // 'Å'
+  198, // 'Æ'
+  199, // 'Ç'
+  200, // 'È'
+  201, // 'É'
+  202, // 'Ê'
+  203, // 'Ë'
+  204, // 'Ì'
+  205, // 'Í'
+  206, // 'Î'
+  207, // 'Ï'
+  208, // 'Ð'
+  209, // 'Ñ'
+  210, // 'Ò'
+  211, // 'Ó'
+  212, // 'Ô'
+  213, // 'Õ'
+  214, // 'Ö'
+  215, // '×'
+  216, // 'Ø'
+  217, // 'Ù'
+  218, // 'Ú'
+  219, // 'Û'
+  220, // 'Ü'
+  221, // 'Ý'
+  222, // 'Þ'
+  223, // 'ß'
+  224, // 'à'
+  225, // 'á'
+  226, // 'â'
+  227, // 'ã'
+  228, // 'ä'
+  229, // 'å'
+  230, // 'æ'
+  231, // 'ç'
+  232, // 'è'
+  233, // 'é'
+  234, // 'ê'
+  235, // 'ë'
+  236, // 'ì'
+  237, // 'í'
+  238, // 'î'
+  239, // 'ï'
+  240, // 'ð'
+  241, // 'ñ'
+  242, // 'ò'
+  243, // 'ó'
+  244, // 'ô'
+  245, // 'õ'
+  246, // 'ö'
+  247, // '÷'
+  248, // 'ø'
+  249, // 'ù'
+  250, // 'ú'
+  251, // 'û'
+  252, // 'ü'
+  253, // 'ý'
+  254, // 'þ'
+  255, // 'ÿ'
+};
+
+// COMMON_INPUTS_INV maps a code produced by COMMON_INPUTS back to its byte.
+char const COMMON_INPUTS_INV[] = {
+  't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
+  '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
+  ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
+  '7', '8', 'k', '%', '?', 'x', 'C', 'D', 'A', 'S', 'F', 'I',
+  'B', 'E', 'j', 'P', 'T', 'z', 'R', 'N', 'M', '+', 'L', 'O',
+  'q', 'H', 'G', 'W', 'U', 'V', ',', 'Y', 'K', 'J', 'Z', 'X',
+  'Q', ';', ')', '(', '~', '[', ']', '$', '!', '\'', '*', '@',
+  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
+  '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10',
+  '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18',
+  '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '"',
+  '#', '<', '>', '\\', '^', '`', '{', '|', '}','\x7f','\x80',
+  '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88',
+  '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90',
+  '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98',
+  '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0',
+  '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8',
+  '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0',
+  '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8',
+  '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0',
+  '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8',
+  '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0',
+  '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8',
+  '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0',
+  '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8',
+  '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
+  '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
+  '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
+};

From a936a7a584f01b7dcf98ec98e7591f5df167e74a Mon Sep 17 00:00:00 2001
From: lichuang
Date: Tue, 16 Nov 2021 15:41:08 +0800
Subject: [PATCH 07/21] [TD-10645][raft]add node map

---
 source/libs/sync/inc/sync_raft_node_map.h | 36 ++++++++++
 source/libs/sync/inc/sync_raft_proto.h | 1 +
 source/libs/sync/inc/sync_raft_quorum_joint.h | 7 +-
 source/libs/sync/inc/sync_type.h | 7 +-
 source/libs/sync/src/raft.c | 8 ++-
 .../sync/src/raft_handle_election_message.c | 18 -----
 source/libs/sync/src/raft_replication.c | 6 +-
 .../libs/sync/src/sync_raft_config_change.c | 4 +-
 .../{raft_election.c => sync_raft_election.c} | 55 ++++++++++++----
 source/libs/sync/src/sync_raft_impl.c | 5 +-
 source/libs/sync/src/sync_raft_node_map.c | 66 +++++++++++++++++++
 .../sync/src/sync_raft_progress_tracker.c | 4 ++
 source/libs/sync/src/sync_raft_quorum_joint.c | 14 ++--
 .../libs/sync/src/sync_raft_quorum_majority.c | 1 +
 14 files changed, 174 insertions(+), 58 deletions(-)
 create mode 100644
source/libs/sync/inc/sync_raft_node_map.h rename source/libs/sync/src/{raft_election.c => sync_raft_election.c} (56%) create mode 100644 source/libs/sync/src/sync_raft_node_map.c diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h new file mode 100644 index 0000000000..bfb5f68489 --- /dev/null +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _TD_LIBS_SYNC_RAFT_NODE_MAP_H +#define _TD_LIBS_SYNC_RAFT_NODE_MAP_H + +#include "sync.h" +#include "sync_type.h" + +// TODO: is TSDB_MAX_REPLICA enough? 
+struct SSyncRaftNodeMap { + int32_t replica; + SyncNodeId nodeId[TSDB_MAX_REPLICA]; +}; + +bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); + +void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); + +void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); + +void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); + +#endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_proto.h b/source/libs/sync/inc/sync_raft_proto.h index c131e91139..dd153e8dad 100644 --- a/source/libs/sync/inc/sync_raft_proto.h +++ b/source/libs/sync/inc/sync_raft_proto.h @@ -17,6 +17,7 @@ #define TD_SYNC_RAFT_PROTO_H #include "sync_type.h" +#include "sync_raft_node_map.h" typedef enum ESyncRaftConfChangeType { SYNC_RAFT_Conf_AddNode = 0, diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 0ef002fe1a..0637a9be9a 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -19,6 +19,7 @@ #include "taosdef.h" #include "sync.h" #include "sync_type.h" +#include "sync_raft_node_map.h" /** * SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping) @@ -36,8 +37,6 @@ typedef struct SSyncRaftQuorumJointConfig { **/ ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes); -bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); - static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { return syncRaftIsInNodeMap(&config->outgoing, id); } @@ -59,7 +58,9 @@ static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SS } static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) { - memset(&config->outgoing, 0, 
sizeof(SSyncCluster)); + memset(&config->outgoing, 0, sizeof(SSyncRaftNodeMap)); } +void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap); + #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index cb938c7319..6d29e019cc 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -32,6 +32,8 @@ typedef struct SSyncRaftProgress SSyncRaftProgress; typedef struct SSyncRaftProgressMap SSyncRaftProgressMap; typedef struct SSyncRaftProgressTrackerConfig SSyncRaftProgressTrackerConfig; +typedef struct SSyncRaftNodeMap SSyncRaftNodeMap; + typedef struct SSyncRaftProgressTracker SSyncRaftProgressTracker; typedef struct SSyncRaftChanger SSyncRaftChanger; @@ -68,11 +70,6 @@ typedef struct SSyncClusterConfig { const SSyncCluster* cluster; } SSyncClusterConfig; -typedef struct { - int32_t replica; - SyncNodeId nodeId[TSDB_MAX_REPLICA]; -} SSyncRaftNodeMap; - typedef enum { SYNC_RAFT_CAMPAIGN_PRE_ELECTION = 0, SYNC_RAFT_CAMPAIGN_ELECTION = 1, diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 23351277c4..39e0377545 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -140,6 +140,7 @@ int32_t syncRaftStep(SSyncRaft* pRaft, const SSyncMessage* pMsg) { int32_t syncRaftTick(SSyncRaft* pRaft) { pRaft->currentTick += 1; + pRaft->tickFp(pRaft); return 0; } @@ -212,8 +213,11 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi // If the the leadTransferee was removed or demoted, abort the leadership transfer. 
SyncNodeId leadTransferee = pRaft->leadTransferee; - if (leadTransferee != SYNC_NON_NODE_ID && !syncRaftIsInNodeMap(&pRaft->tracker->config.voters, leadTransferee)) { - abortLeaderTransfer(pRaft); + if (leadTransferee != SYNC_NON_NODE_ID) { + if (!syncRaftIsInNodeMap(&pRaft->tracker->config.voters.incoming, leadTransferee) && + !syncRaftIsInNodeMap(&pRaft->tracker->config.voters.outgoing, leadTransferee)) { + abortLeaderTransfer(pRaft); + } } } } diff --git a/source/libs/sync/src/raft_handle_election_message.c b/source/libs/sync/src/raft_handle_election_message.c index e536fc67c0..a58c8ba5cf 100644 --- a/source/libs/sync/src/raft_handle_election_message.c +++ b/source/libs/sync/src/raft_handle_election_message.c @@ -19,24 +19,6 @@ #include "raft_message.h" int syncRaftHandleElectionMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - if (pRaft->state == TAOS_SYNC_STATE_LEADER) { - syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId); - return 0; - } - - if (!syncRaftIsPromotable(pRaft)) { - syncDebug("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId); - return 0; - } - // if there is pending uncommitted config,cannot start election - if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { - syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply", - pRaft->selfGroupId, pRaft->selfId, pRaft->term); - return 0; - } - - syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - if (pRaft->preVote) { syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_PRE_ELECTION); } else { diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index c19fcd1e68..74f40179c6 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -22,7 +22,7 @@ 
static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress); static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, SyncIndex prevIndex, SyncTerm prevTerm, - const SSyncRaftEntry *entries, int nEntry); + SSyncRaftEntry *entries, int nEntry); // syncRaftReplicate sends an append RPC with new entries to the given peer, // if necessary. Returns true if a message was sent. The sendIfEmpty @@ -68,7 +68,7 @@ static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) { static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, SyncIndex prevIndex, SyncTerm prevTerm, - const SSyncRaftEntry *entries, int nEntry) { + SSyncRaftEntry *entries, int nEntry) { SyncIndex lastIndex; SyncTerm logTerm = prevTerm; SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[progress->selfIndex]); @@ -87,7 +87,7 @@ static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, case PROGRESS_STATE_REPLICATE: lastIndex = entries[nEntry - 1].index; syncRaftProgressOptimisticNextIndex(progress, lastIndex); - syncRaftInflightAdd(&progress->inflights, lastIndex); + syncRaftInflightAdd(progress->inflights, lastIndex); break; case PROGRESS_STATE_PROBE: progress->probeSent = true; diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c index 4e7f2190ea..288fdc465e 100644 --- a/source/libs/sync/src/sync_raft_config_change.c +++ b/source/libs/sync/src/sync_raft_config_change.c @@ -359,7 +359,7 @@ static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfi // be turned into a learner in LeaveJoint(). // // Otherwise, add a regular learner right away. 
- bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id); + bool inOutgoing = syncRaftIsInNodeMap(&config->voters.outgoing, id); if (inOutgoing) { nilAwareAdd(&config->learnersNext, id); } else { @@ -381,7 +381,7 @@ static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf nilAwareDelete(&config->learnersNext, id); // If the peer is still a voter in the outgoing config, keep the Progress. - bool inOutgoing = syncRaftJointConfigInCluster(&config->voters.outgoing, id); + bool inOutgoing = syncRaftIsInNodeMap(&config->voters.outgoing, id); if (!inOutgoing) { syncRaftRemoveFromProgressMap(progressMap, id); } diff --git a/source/libs/sync/src/raft_election.c b/source/libs/sync/src/sync_raft_election.c similarity index 56% rename from source/libs/sync/src/raft_election.c rename to source/libs/sync/src/sync_raft_election.c index eb310c31ec..74c3e09dae 100644 --- a/source/libs/sync/src/raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -17,15 +17,42 @@ #include "raft.h" #include "raft_log.h" #include "raft_message.h" +#include "sync_raft_progress_tracker.h" + +static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType); void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { - SyncTerm term; + if (pRaft->state == TAOS_SYNC_STATE_LEADER) { + syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId); + return; + } + + if (!syncRaftIsPromotable(pRaft)) { + syncWarn("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId); + return; + } + + // if there is pending uncommitted config,cannot start election + if (syncRaftLogNumOfPendingConf(pRaft->log) > 0 && syncRaftHasUnappliedLog(pRaft->log)) { + syncWarn("[%d:%d] cannot syncRaftStartElection at term %" PRId64 " since there are still pending configuration changes to apply", + pRaft->selfGroupId, pRaft->selfId, pRaft->term); + return; + } + + syncInfo("[%d:%d] 
is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + + campaign(pRaft, cType); +} + +// campaign transitions the raft instance to candidate state. This must only be +// called after verifying that this is a legitimate transition. +static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { bool preVote; - ESyncRaftMessageType voteMsgType; + SyncTerm term; if (syncRaftIsPromotable(pRaft)) { syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId); - return 0; + return; } if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { @@ -35,7 +62,6 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { term = pRaft->term + 1; } else { syncRaftBecomeCandidate(pRaft); - voteMsgType = RAFT_MSG_VOTE; term = pRaft->term; preVote = false; } @@ -43,10 +69,8 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { int quorum = syncRaftQuorum(pRaft); ESyncRaftVoteResult result = syncRaftPollVote(pRaft, pRaft->selfId, preVote, true, NULL, NULL); if (result == SYNC_RAFT_VOTE_WON) { - /** - * We won the election after voting for ourselves (which must mean that - * this is a single-node cluster). Advance to the next state. - **/ + // We won the election after voting for ourselves (which must mean that + // this is a single-node cluster). Advance to the next state. 
if (cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION) { syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } else { @@ -59,12 +83,17 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { int i; SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); - for (i = 0; i < pRaft->cluster.replica; ++i) { - if (i == pRaft->cluster.selfIndex) { + SSyncRaftNodeMap nodeMap; + syncRaftJointConfigIDS(&pRaft->tracker->config.voters, &nodeMap); + for (i = 0; i < TSDB_MAX_REPLICA; ++i) { + SyncNodeId nodeId = nodeMap.nodeId[i]; + if (nodeId == SYNC_NON_NODE_ID) { continue; } - SyncNodeId nodeId = pRaft->cluster.nodeInfo[i].nodeId; + if (nodeId == pRaft->selfId) { + continue; + } SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, term, cType, lastIndex, lastTerm); @@ -72,9 +101,9 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { continue; } - syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent %d request to %d at term %" PRId64 "", + syncInfo("[%d:%d] [logterm: %" PRId64 ", index: %" PRId64 "] sent vote request to %d at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, lastTerm, - lastIndex, voteMsgType, nodeId, pRaft->term); + lastIndex, nodeId, pRaft->term); pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i])); } diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c index d65e03c64f..ab2db10230 100644 --- a/source/libs/sync/src/sync_raft_impl.c +++ b/source/libs/sync/src/sync_raft_impl.c @@ -234,9 +234,7 @@ static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg) { return 0; } -/** - * tickElection is run by followers and candidates per tick. - **/ +// tickElection is run by followers and candidates after r.electionTimeout. 
static void tickElection(SSyncRaft* pRaft) { pRaft->electionElapsed += 1; @@ -254,6 +252,7 @@ static void tickElection(SSyncRaft* pRaft) { syncRaftStep(pRaft, syncInitElectionMsg(&msg, pRaft->selfId)); } +// tickHeartbeat is run by leaders to send a MsgBeat after r.heartbeatTimeout. static void tickHeartbeat(SSyncRaft* pRaft) { } diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c new file mode 100644 index 0000000000..e13c808075 --- /dev/null +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */
+
+#include "sync_raft_node_map.h"
+#include "sync_type.h"
+
+// syncRaftIsInNodeMap reports whether nodeId occupies any slot of nodeMap.
+bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
+  int i;
+  for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
+    if (nodeId == nodeMap->nodeId[i]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// syncRaftCopyNodeMap overwrites *to with a byte-wise copy of *nodeMap.
+void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) {
+  memcpy(to, nodeMap, sizeof(SSyncRaftNodeMap));
+}
+
+// syncRaftUnionNodeMap adds every id of nodeMap other than SYNC_NON_NODE_ID to *to.
+void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) {
+  int i;
+  for (i = 0; i < TSDB_MAX_REPLICA; ++i) {
+    SyncNodeId id = nodeMap->nodeId[i];
+    if (id == SYNC_NON_NODE_ID) {
+      continue;
+    }
+    syncRaftAddToNodeMap(to, id);
+  }
+}
+
+// syncRaftAddToNodeMap stores nodeId in the first free slot; an id that is already present is left untouched.
+void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) {
+  int i, j;
+  for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) {
+    SyncNodeId id = nodeMap->nodeId[i];
+    if (id == SYNC_NON_NODE_ID) {
+      if (j == -1) j = i;
+      continue;
+    }
+    if (id == nodeId) {
+      return;
+    }
+  }
+
+  // Only a real insertion needs room, so capacity is checked after the duplicate scan.
+  assert(nodeMap->replica < TSDB_MAX_REPLICA);
+  assert(j != -1);
+  nodeMap->nodeId[j] = nodeId;
+  nodeMap->replica += 1;
+}
diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c
index ea7f1ae4f5..f43414127d 100644
--- a/source/libs/sync/src/sync_raft_progress_tracker.c
+++ b/source/libs/sync/src/sync_raft_progress_tracker.c
@@ -49,6 +49,10 @@ void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyn
 
 }
 
+int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) {
+  return 0;
+}
+
 /**
  * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the
  * election outcome is known.
diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c index fa663b6fc3..8a99574d68 100644 --- a/source/libs/sync/src/sync_raft_quorum_joint.c +++ b/source/libs/sync/src/sync_raft_quorum_joint.c @@ -13,6 +13,7 @@ * along with this program. If not, see . */ +#include "sync_raft_node_map.h" #include "sync_raft_quorum_majority.h" #include "sync_raft_quorum_joint.h" #include "sync_raft_quorum.h" @@ -71,15 +72,10 @@ void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, S assert(config->incoming.replica >= 0); } +void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) { + int i, j, m; -bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - int i; + syncRaftCopyNodeMap(&config->incoming, nodeMap); - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (nodeId == nodeMap->nodeId[i]) { - return true; - } - } - - return false; + syncRaftUnionNodeMap(&config->outgoing, nodeMap); } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c index 73eb378e09..8ff5752b97 100644 --- a/source/libs/sync/src/sync_raft_quorum_majority.c +++ b/source/libs/sync/src/sync_raft_quorum_majority.c @@ -15,6 +15,7 @@ #include "sync_raft_quorum.h" #include "sync_raft_quorum_majority.h" +#include "sync_raft_node_map.h" /** * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns From 0c65b84886ede5d64c434f5d106ceb0851f69856 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 17 Nov 2021 08:29:24 +0800 Subject: [PATCH 08/21] [TD-10645][raft]add node map --- source/libs/sync/inc/raft.h | 4 ++- source/libs/sync/inc/raft_configuration.h | 27 ------------------- source/libs/sync/inc/sync_raft_impl.h | 2 ++ source/libs/sync/src/raft.c | 13 ++++++--- source/libs/sync/src/raft_configuration.c | 25 ----------------- 
.../src/raft_handle_append_entries_message.c | 9 +++---- .../libs/sync/src/raft_handle_vote_message.c | 9 ++++--- .../sync/src/raft_handle_vote_resp_message.c | 6 ++--- source/libs/sync/src/raft_replication.c | 7 +++-- source/libs/sync/src/sync_raft_election.c | 2 +- source/libs/sync/src/sync_raft_impl.c | 26 ++++++++++++------ 11 files changed, 50 insertions(+), 80 deletions(-) delete mode 100644 source/libs/sync/inc/raft_configuration.h delete mode 100644 source/libs/sync/src/raft_configuration.c diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 5b6efb95e5..6fa6c6e346 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -18,6 +18,7 @@ #include "sync.h" #include "sync_type.h" +#include "thash.h" #include "raft_message.h" #include "sync_raft_impl.h" #include "sync_raft_quorum.h" @@ -43,7 +44,8 @@ struct SSyncRaft { // owner sync node SSyncNode* pNode; - SSyncCluster cluster; + // hash map nodeId -> SNodeInfo* + SHashObj* nodeInfoMap; int selfIndex; SyncNodeId selfId; diff --git a/source/libs/sync/inc/raft_configuration.h b/source/libs/sync/inc/raft_configuration.h deleted file mode 100644 index ac9bbb5e55..0000000000 --- a/source/libs/sync/inc/raft_configuration.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#ifndef _TD_LIBS_SYNC_RAFT_CONFIGURATION_H -#define _TD_LIBS_SYNC_RAFT_CONFIGURATION_H - -#include "sync.h" -#include "sync_type.h" - -// return -1 if cannot find this id -int syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id); - -int syncRaftConfigurationVoterCount(SSyncRaft *pRaft); - -#endif /* _TD_LIBS_SYNC_RAFT_CONFIGURATION_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_impl.h b/source/libs/sync/inc/sync_raft_impl.h index bd77978c28..a8615f17eb 100644 --- a/source/libs/sync/inc/sync_raft_impl.h +++ b/source/libs/sync/inc/sync_raft_impl.h @@ -51,4 +51,6 @@ void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState); void syncRaftBroadcastAppend(SSyncRaft* pRaft); +SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id); + #endif /* _TD_LIBS_SYNC_RAFT_IMPL_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 39e0377545..d39e047492 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -14,7 +14,7 @@ */ #include "raft.h" -#include "raft_configuration.h" +#include "sync_raft_impl.h" #include "raft_log.h" #include "sync_raft_restore.h" #include "raft_replication.h" @@ -59,6 +59,11 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { logStore = &(pRaft->logStore); fsm = &(pRaft->fsm); + pRaft->nodeInfoMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); + if (pRaft->nodeInfoMap == NULL) { + return -1; + } + // init progress tracker pRaft->tracker = syncRaftOpenProgressTracker(); if (pRaft->tracker == NULL) { @@ -290,8 +295,8 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) * but it will not receive MsgApp or MsgHeartbeat, so it will not create * disruptive term increases **/ - int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); - if (peerIndex < 0) { + SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); + if 
(pNode == NULL) { return true; } SSyncMessage* msg = syncNewEmptyAppendRespMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term); @@ -299,7 +304,7 @@ static bool preHandleOldTermMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) return true; } - pRaft->io.send(msg, &(pRaft->cluster.nodeInfo[peerIndex])); + pRaft->io.send(msg, pNode); } else { // ignore other cases syncInfo("[%d:%d] [term:%" PRId64 "] ignored a %d message with lower term from %d [term:%" PRId64 "]", diff --git a/source/libs/sync/src/raft_configuration.c b/source/libs/sync/src/raft_configuration.c deleted file mode 100644 index e16cb34989..0000000000 --- a/source/libs/sync/src/raft_configuration.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2019 TAOS Data, Inc. - * - * This program is free software: you can use, redistribute, and/or modify - * it under the terms of the GNU Affero General Public License, version 3 - * or later ("AGPL"), as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- */ - -#include "raft_configuration.h" -#include "raft.h" - -int syncRaftConfigurationIndexOfNode(SSyncRaft *pRaft, SyncNodeId id) { - return (int)(id); -} - -int syncRaftConfigurationVoterCount(SSyncRaft *pRaft) { - return pRaft->cluster.replica; -} \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_append_entries_message.c b/source/libs/sync/src/raft_handle_append_entries_message.c index 4797b6ce03..92ebfe75f5 100644 --- a/source/libs/sync/src/raft_handle_append_entries_message.c +++ b/source/libs/sync/src/raft_handle_append_entries_message.c @@ -16,15 +16,14 @@ #include "syncInt.h" #include "raft.h" #include "raft_log.h" -#include "raft_configuration.h" +#include "sync_raft_impl.h" #include "raft_message.h" int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { const RaftMsg_Append_Entries *appendEntries = &(pMsg->appendEntries); - int peerIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); - - if (peerIndex < 0) { + SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); + if (pNode == NULL) { return 0; } @@ -44,6 +43,6 @@ int syncRaftHandleAppendEntriesMessage(SSyncRaft* pRaft, const SSyncMessage* pMs pRaft->selfGroupId, pRaft->selfId, pMsg->from, appendEntries->index); out: - pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[peerIndex])); + pRaft->io.send(pRespMsg, pNode); return 0; } \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c index 709e319c3e..9997c5226d 100644 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -15,7 +15,7 @@ #include "syncInt.h" #include "raft.h" -#include "raft_configuration.h" +#include "sync_raft_impl.h" #include "raft_log.h" #include "raft_message.h" @@ -23,10 +23,11 @@ static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg); int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) 
{ SSyncMessage* pRespMsg; - int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); - if (voteIndex == -1) { + SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); + if (pNode == NULL) { return 0; } + bool grant; SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); @@ -42,7 +43,7 @@ int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { grant ? "grant" : "reject", pMsg->from, pMsg->vote.lastTerm, pMsg->vote.lastIndex, pRaft->term); - pRaft->io.send(pRespMsg, &(pRaft->cluster.nodeInfo[voteIndex])); + pRaft->io.send(pRespMsg, pNode); return 0; } diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index 1781205ec0..744d654cc5 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -15,7 +15,7 @@ #include "syncInt.h" #include "raft.h" -#include "raft_configuration.h" +#include "sync_raft_impl.h" #include "raft_message.h" int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { @@ -25,8 +25,8 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { assert(pRaft->state == TAOS_SYNC_STATE_CANDIDATE); - voterIndex = syncRaftConfigurationIndexOfNode(pRaft, pMsg->from); - if (voterIndex == -1) { + SNodeInfo* pNode = syncRaftGetNodeById(pRaft, pMsg->from); + if (pNode == NULL) { syncError("[%d:%d] recv vote resp from unknown server %d", pRaft->selfGroupId, pRaft->selfId, pMsg->from); return 0; } diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index 74f40179c6..228d8195f6 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -69,9 +69,12 @@ static bool sendSnapshot(SSyncRaft* pRaft, SSyncRaftProgress* progress) { static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, SyncIndex 
prevIndex, SyncTerm prevTerm,
                               SSyncRaftEntry *entries, int nEntry) {
+  SNodeInfo* pNode = syncRaftGetNodeById(pRaft, progress->id);
+  if (pNode == NULL) {
+    return false;
+  }
   SyncIndex lastIndex;
-  SyncTerm logTerm = prevTerm;
-  SNodeInfo* pNode = &(pRaft->cluster.nodeInfo[progress->selfIndex]);
+  SyncTerm logTerm = prevTerm;
 
   SSyncMessage* msg = syncNewAppendMsg(pRaft->selfGroupId, pRaft->selfId, pRaft->term,
                                        prevIndex, prevTerm, pRaft->log->commitIndex,
diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c
index 74c3e09dae..b5649d5c5e 100644
--- a/source/libs/sync/src/sync_raft_election.c
+++ b/source/libs/sync/src/sync_raft_election.c
@@ -105,6 +105,6 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) {
       pRaft->selfGroupId, pRaft->selfId, lastTerm,
       lastIndex, nodeId, pRaft->term);
 
-    pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i]));
+    //pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i]));
   }
 }
\ No newline at end of file
diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c
index ab2db10230..6ec0c6c089 100644
--- a/source/libs/sync/src/sync_raft_impl.c
+++ b/source/libs/sync/src/sync_raft_impl.c
@@ -14,7 +14,7 @@
  */
 
 #include "raft.h"
-#include "raft_configuration.h"
+#include "sync_raft_impl.h"
 #include "raft_log.h"
 #include "raft_replication.h"
 #include "sync_raft_progress_tracker.h"
@@ -123,15 +123,16 @@ bool syncRaftIsPastElectionTimeout(SSyncRaft* pRaft) {
 }
 
 int syncRaftQuorum(SSyncRaft* pRaft) {
-  return pRaft->cluster.replica / 2 + 1;
+  return 0;
+  //return pRaft->cluster.replica / 2 + 1;
 }
 
 ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
                                      bool preVote, bool grant,
                                      int* rejected, int *granted) {
-  int voterIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
-  if (voterIndex == -1) {
+  SNodeInfo* pNode = syncRaftGetNodeById(pRaft, id);
+  if (pNode == NULL) {
     return SYNC_RAFT_VOTE_PENDING;
   }
 
   if (grant) {
@@ -142,7 +143,7 @@ ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
       pRaft->selfGroupId, pRaft->selfId, preVote, id, pRaft->term);
   }
 
-  syncRaftRecordVote(pRaft->tracker, voterIndex, grant);
+  syncRaftRecordVote(pRaft->tracker, pNode->nodeId, grant);
   return syncRaftTallyVotes(pRaft->tracker, rejected, granted);
 }
 /*
@@ -154,7 +155,7 @@ ESyncRaftVoteResult syncRaftPollVote(SSyncRaft* pRaft, SyncNodeId id,
       pRaft->selfGroupId, pRaft->selfId, id, pRaft->term);
   }
 
-  int voteIndex = syncRaftConfigurationIndexOfNode(pRaft, id);
+  int voteIndex = syncRaftGetNodeById(pRaft, id);
   assert(voteIndex < pRaft->cluster.replica && voteIndex >= 0);
   assert(pRaft->candidateState.votes[voteIndex] == SYNC_RAFT_VOTE_RESP_UNKNOWN);
 
@@ -198,6 +199,15 @@ void syncRaftBroadcastAppend(SSyncRaft* pRaft) {
   syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft);
 }
 
+// syncRaftGetNodeById returns the SNodeInfo registered for id in nodeInfoMap,
+// or NULL when the id is unknown.
+SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id) {
+  // The map is keyed by the node id, so the key length is sizeof(id).
+  SNodeInfo **ppNode = taosHashGet(pRaft->nodeInfoMap, &id, sizeof(id));
+
+  return ppNode != NULL ? *ppNode : NULL;
+}
+
 static int convertClear(SSyncRaft* pRaft) {
 
 }
@@ -269,7 +279,7 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) {
 
   syncRaftLogAppend(pRaft->log, entries, n);
 
-  SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->cluster.selfIndex]);
+  SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->selfIndex]);
   syncRaftProgressMaybeUpdate(progress, lastIndex);
   // Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend.
syncRaftMaybeCommit(pRaft); From 1d874657f7b982b6f29c5177b59f6ed6e9c4d8a9 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 17 Nov 2021 09:12:54 +0800 Subject: [PATCH 09/21] [TD-10645][raft]add node map --- .../libs/sync/inc/sync_raft_progress_tracker.h | 7 +++++-- source/libs/sync/inc/sync_raft_quorum_joint.h | 3 ++- .../libs/sync/inc/sync_raft_quorum_majority.h | 3 ++- source/libs/sync/inc/sync_type.h | 3 --- source/libs/sync/src/sync.c | 6 +++--- source/libs/sync/src/sync_raft_election.c | 7 ++++++- source/libs/sync/src/sync_raft_impl.c | 2 +- .../libs/sync/src/sync_raft_progress_tracker.c | 17 ++++++++++------- source/libs/sync/src/sync_raft_quorum_joint.c | 6 +++--- .../libs/sync/src/sync_raft_quorum_majority.c | 10 +++++++--- 10 files changed, 39 insertions(+), 25 deletions(-) diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index 8adb0b4736..3a448290c8 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -21,6 +21,7 @@ #include "sync_raft_quorum_joint.h" #include "sync_raft_progress.h" #include "sync_raft_proto.h" +#include "thash.h" struct SSyncRaftProgressTrackerConfig { SSyncRaftQuorumJointConfig voters; @@ -83,7 +84,9 @@ struct SSyncRaftProgressTracker { SSyncRaftProgressMap progressMap; - ESyncRaftVoteType votes[TSDB_MAX_REPLICA]; + // nodeid -> ESyncRaftVoteType map + SHashObj* votesMap; + int maxInflightMsgs; }; @@ -98,7 +101,7 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, voi * syncRaftRecordVote records that the node with the given id voted for this Raft * instance if v == true (and declined it otherwise). 
**/ -void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant); +void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant); void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressTrackerConfig* result); diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 0637a9be9a..4a5b749a0e 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -20,6 +20,7 @@ #include "sync.h" #include "sync_type.h" #include "sync_raft_node_map.h" +#include "thash.h" /** * SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping) @@ -35,7 +36,7 @@ typedef struct SSyncRaftQuorumJointConfig { * a result indicating whether the vote is pending, lost, or won. A joint quorum * requires both majority quorums to vote in favor. **/ -ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes); +ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap); static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { return syncRaftIsInNodeMap(&config->outgoing, id); diff --git a/source/libs/sync/inc/sync_raft_quorum_majority.h b/source/libs/sync/inc/sync_raft_quorum_majority.h index 0512a4dc87..38df40147a 100644 --- a/source/libs/sync/inc/sync_raft_quorum_majority.h +++ b/source/libs/sync/inc/sync_raft_quorum_majority.h @@ -19,6 +19,7 @@ #include "sync.h" #include "sync_type.h" #include "sync_raft_quorum.h" +#include "thash.h" /** * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns @@ -26,6 +27,6 @@ * yes/no has been reached), won (a quorum of yes has been reached), or lost (a * quorum of no has been reached). 
**/ -ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes); +ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap); #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index 6d29e019cc..fcb0940609 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -77,9 +77,6 @@ typedef enum { } ESyncRaftElectionType; typedef enum { - // the init vote resp status - SYNC_RAFT_VOTE_RESP_UNKNOWN = 0, - // grant the vote request SYNC_RAFT_VOTE_RESP_GRANT = 1, diff --git a/source/libs/sync/src/sync.c b/source/libs/sync/src/sync.c index 2f25828d5d..06af8ff6c2 100644 --- a/source/libs/sync/src/sync.c +++ b/source/libs/sync/src/sync.c @@ -99,7 +99,7 @@ void syncCleanUp() { SSyncNode* syncStart(const SSyncInfo* pInfo) { pthread_mutex_lock(&gSyncManager->mutex); - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId)); + SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pInfo->vgId, sizeof(SyncGroupId*)); if (ppNode != NULL) { syncInfo("vgroup %d already exist", pInfo->vgId); pthread_mutex_unlock(&gSyncManager->mutex); @@ -140,7 +140,7 @@ SSyncNode* syncStart(const SSyncInfo* pInfo) { void syncStop(const SSyncNode* pNode) { pthread_mutex_lock(&gSyncManager->mutex); - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId)); + SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &pNode->vgId, sizeof(SyncGroupId*)); if (ppNode == NULL) { syncInfo("vgroup %d not exist", pNode->vgId); pthread_mutex_unlock(&gSyncManager->mutex); @@ -288,7 +288,7 @@ static void *syncWorkerMain(void *argv) { static void syncNodeTick(void *param, void *tmrId) { SyncGroupId vgId = (SyncGroupId)param; - SSyncNode **ppNode = taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId)); + SSyncNode **ppNode = 
taosHashGet(gSyncManager->vgroupTable, &vgId, sizeof(SyncGroupId*)); if (ppNode == NULL) { return; } diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c index b5649d5c5e..4423e1ac3c 100644 --- a/source/libs/sync/src/sync_raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -95,6 +95,11 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { continue; } + SNodeInfo* pNode = syncRaftGetNodeById(pRaft, nodeId); + if (pNode == NULL) { + continue; + } + SSyncMessage* pMsg = syncNewVoteMsg(pRaft->selfGroupId, pRaft->selfId, term, cType, lastIndex, lastTerm); if (pMsg == NULL) { @@ -105,6 +110,6 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { pRaft->selfGroupId, pRaft->selfId, lastTerm, lastIndex, nodeId, pRaft->term); - //pRaft->io.send(pMsg, &(pRaft->cluster.nodeInfo[i])); + pRaft->io.send(pMsg, pNode); } } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c index 6ec0c6c089..73a02c4b80 100644 --- a/source/libs/sync/src/sync_raft_impl.c +++ b/source/libs/sync/src/sync_raft_impl.c @@ -200,7 +200,7 @@ void syncRaftBroadcastAppend(SSyncRaft* pRaft) { } SNodeInfo* syncRaftGetNodeById(SSyncRaft *pRaft, SyncNodeId id) { - SNodeInfo **ppNode = taosHashGet(pRaft->nodeInfoMap, &id, sizeof(SNodeInfo)); + SNodeInfo **ppNode = taosHashGet(pRaft->nodeInfoMap, &id, sizeof(SyncNodeId*)); if (ppNode != NULL) { return *ppNode; } diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index f43414127d..1407df059a 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -26,7 +26,7 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { } void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { - memset(tracker->votes, SYNC_RAFT_VOTE_RESP_UNKNOWN, sizeof(ESyncRaftVoteType) * 
TSDB_MAX_REPLICA); + taosHashClear(tracker->votesMap); } void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) { @@ -37,12 +37,14 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp vi } } -void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, int i, bool grant) { - if (tracker->votes[i] != SYNC_RAFT_VOTE_RESP_UNKNOWN) { +void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant) { + ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &id, sizeof(SyncNodeId*)); + if (pType != NULL) { return; } - tracker->votes[i] = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; + ESyncRaftVoteType type = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; + taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &type, sizeof(ESyncRaftVoteType*)); } void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { @@ -68,11 +70,12 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r continue; } - if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) { + ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &progress->id, sizeof(SyncNodeId*)); + if (pType == NULL) { continue; } - if (tracker->votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) { + if (*pType == SYNC_RAFT_VOTE_RESP_GRANT) { g++; } else { r++; @@ -81,7 +84,7 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r if (rejected) *rejected = r; if (granted) *granted = g; - return syncRaftVoteResult(&(tracker->config.voters), tracker->votes); + return syncRaftVoteResult(&(tracker->config.voters), tracker->votesMap); } void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c index 8a99574d68..2383d7ee63 100644 --- 
a/source/libs/sync/src/sync_raft_quorum_joint.c +++ b/source/libs/sync/src/sync_raft_quorum_joint.c @@ -23,9 +23,9 @@ * a result indicating whether the vote is pending, lost, or won. A joint quorum * requires both majority quorums to vote in favor. **/ -ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, const ESyncRaftVoteType* votes) { - ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votes); - ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votes); +ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap) { + ESyncRaftVoteResult r1 = syncRaftMajorityVoteResult(&(config->incoming), votesMap); + ESyncRaftVoteResult r2 = syncRaftMajorityVoteResult(&(config->outgoing), votesMap); if (r1 == r2) { // If they agree, return the agreed state. diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c index 8ff5752b97..014a8c7303 100644 --- a/source/libs/sync/src/sync_raft_quorum_majority.c +++ b/source/libs/sync/src/sync_raft_quorum_majority.c @@ -23,7 +23,7 @@ * yes/no has been reached), won (a quorum of yes has been reached), or lost (a * quorum of no has been reached). 
**/ -ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const ESyncRaftVoteType* votes) { +ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap) { if (config->replica == 0) { return SYNC_RAFT_VOTE_WON; } @@ -34,9 +34,13 @@ ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, const E continue; } - if (votes[i] == SYNC_RAFT_VOTE_RESP_UNKNOWN) { + const ESyncRaftVoteType* pType = taosHashGet(votesMap, &config->nodeId[i], sizeof(SyncNodeId*)); + if (pType == NULL) { missing += 1; - } else if (votes[i] == SYNC_RAFT_VOTE_RESP_GRANT) { + continue; + } + + if (*pType == SYNC_RAFT_VOTE_RESP_GRANT) { g +=1; } else { r += 1; From df2530f969949ea74190626f1ab6bc0cfa200b61 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 17 Nov 2021 11:04:38 +0800 Subject: [PATCH 10/21] [TD-10645][raft]refactor node and progress map --- source/libs/sync/inc/sync_raft_node_map.h | 16 ++- source/libs/sync/inc/sync_raft_progress.h | 15 +-- source/libs/sync/src/raft.c | 4 +- .../libs/sync/src/sync_raft_config_change.c | 115 +++++++----------- source/libs/sync/src/sync_raft_election.c | 10 +- source/libs/sync/src/sync_raft_node_map.c | 69 +++++------ source/libs/sync/src/sync_raft_progress.c | 49 +++----- .../sync/src/sync_raft_progress_tracker.c | 11 +- source/libs/sync/src/sync_raft_quorum_joint.c | 29 +---- 9 files changed, 141 insertions(+), 177 deletions(-) diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h index bfb5f68489..5d43e42061 100644 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -16,15 +16,18 @@ #ifndef _TD_LIBS_SYNC_RAFT_NODE_MAP_H #define _TD_LIBS_SYNC_RAFT_NODE_MAP_H +#include "thash.h" #include "sync.h" #include "sync_type.h" -// TODO: is TSDB_MAX_REPLICA enough? 
struct SSyncRaftNodeMap { - int32_t replica; - SyncNodeId nodeId[TSDB_MAX_REPLICA]; + SHashObj* nodeIdMap; }; +void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap); + +void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap); + bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); @@ -33,4 +36,11 @@ void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); +void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); + +int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap); + +// return true if reach the end +bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId); + #endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h index 173608a40a..3d2995ed77 100644 --- a/source/libs/sync/inc/sync_raft_progress.h +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -18,6 +18,7 @@ #include "sync_type.h" #include "sync_raft_inflights.h" +#include "thash.h" /** * State defines how the leader should interact with the follower. @@ -69,8 +70,7 @@ static const char* kProgressStateString[] = { * progresses of all followers, and sends entries to the follower based on its progress. 
**/ struct SSyncRaftProgress { - // index in raft cluster config - int selfIndex; + SyncGroupId groupId; SyncNodeId id; @@ -139,10 +139,10 @@ struct SSyncRaftProgress { }; struct SSyncRaftProgressMap { - SSyncRaftProgress progress[TSDB_MAX_REPLICA]; + // map nodeId -> SSyncRaftProgress* + SHashObj* progressMap; }; - static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgress* progress) { return kProgressStateString[progress->state]; } @@ -221,9 +221,9 @@ static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progres return progress->recentActive; } -int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id); +SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id); -int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); +int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress); void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); @@ -236,7 +236,8 @@ void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snaps void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to); -void syncRaftProgressMapCopy(const SSyncRaftProgressMap* from, SSyncRaftProgressMap* to); +// return true if reach the end +bool syncRaftIterateProgressMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftProgress *pProgress); #if 0 diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index d39e047492..b2170e0b68 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -175,7 +175,7 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi SSyncRaftProgress* progress = NULL; syncRaftConfigState(pRaft->tracker, cs); - i = syncRaftFindProgressIndexByNodeId(&pRaft->tracker->progressMap, selfId); + i = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, selfId); exist = (i != -1); // 
Update whether the node itself is a learner, resetting to false when the @@ -202,7 +202,7 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi // The remaining steps only make sense if this node is the leader and there // are other nodes. - if (pRaft->state != TAOS_SYNC_STATE_LEADER || cs->voters.replica == 0) { + if (pRaft->state != TAOS_SYNC_STATE_LEADER || syncRaftNodeMapSize(&cs->voters) == 0) { return; } diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c index 288fdc465e..ad261dd277 100644 --- a/source/libs/sync/src/sync_raft_config_change.c +++ b/source/libs/sync/src/sync_raft_config_change.c @@ -101,7 +101,7 @@ int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const S return -1; } - if(config->voters.incoming.replica == 0) { + if(syncRaftNodeMapSize(&config->voters.incoming) == 0) { // We allow adding nodes to an empty config for convenience (testing and // bootstrap), but you can't enter a joint state. syncError("can't make a zero-voter config joint"); @@ -112,7 +112,7 @@ int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const S syncRaftJointConfigClearOutgoing(&config->voters); // Copy incoming to outgoing. 
- memcpy(&config->voters.outgoing, &config->voters.incoming, sizeof(SSyncCluster)); + syncRaftCopyNodeMap(&config->voters.incoming, &config->voters.outgoing); ret = applyConfig(changer, config, progressMap, css); if (ret != 0) { @@ -129,13 +129,12 @@ int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const S static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { syncRaftCloneTrackerConfig(&changer->tracker->config, config); int i; - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - SSyncRaftProgress* progress = &(changer->tracker->progressMap.progress[i]); - if (progress->id == SYNC_NON_NODE_ID) { - continue; - } - syncRaftCopyProgress(progress, &(progressMap->progress[i])); + + SSyncRaftProgress* pProgress = NULL; + while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) { + syncRaftAddToProgressMap(progressMap, pProgress); } + return checkAndReturn(config, progressMap); } @@ -158,33 +157,44 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg return ret; } - int i; // Any staged learner was staged because it could not be directly added due // to a conflicting voter in the outgoing config. 
- for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (!syncRaftJointConfigInOutgoing(&config->voters, config->learnersNext.nodeId[i])) { + SyncNodeId* pNodeId = NULL; + while (!syncRaftIterateNodeMap(&config->learnersNext, pNodeId)) { + SyncNodeId nodeId = *pNodeId; + if (!syncRaftJointConfigInOutgoing(&config->voters, nodeId)) { return -1; } - if (progressMap->progress[i].id != SYNC_NON_NODE_ID && progressMap->progress[i].isLearner) { - syncError("%d is in LearnersNext, but is already marked as learner", progressMap->progress[i].id); + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId); + assert(progress); + assert(progress->id == nodeId); + if (progress->isLearner) { + syncError("[%d:%d] is in LearnersNext, but is already marked as learner", progress->groupId, nodeId); return -1; } } + // Conversely Learners and Voters doesn't intersect at all. - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (syncRaftJointConfigInIncoming(&config->voters, config->learners.nodeId[i])) { - syncError("%d is in Learners and voter.incoming", progressMap->progress[i].id); + SyncNodeId* pNodeId = NULL; + while (!syncRaftIterateNodeMap(&config->learners, pNodeId)) { + SyncNodeId nodeId = *pNodeId; + if (syncRaftJointConfigInIncoming(&config->voters, nodeId)) { + syncError("%d is in Learners and voter.incoming", nodeId); return -1; } - if (progressMap->progress[i].id != SYNC_NON_NODE_ID && !progressMap->progress[i].isLearner) { - syncError("%d is in Learners, but is not marked as learner", progressMap->progress[i].id); + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId); + assert(progress); + assert(progress->id == nodeId); + + if (!progress->isLearner) { + syncError("[%d:%d] is in Learners, but is not marked as learner", progress->groupId, nodeId); return -1; } } if (!hasJointConfig(config)) { // We enforce that empty maps are nil instead of zero. 
- if (config->learnersNext.replica > 0) { + if (syncRaftNodeMapSize(&config->learnersNext)) { syncError("cfg.LearnersNext must be nil when not joint"); return -1; } @@ -198,7 +208,7 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg } static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) { - return config->voters.outgoing.replica > 0; + return syncRaftNodeMapSize(&config->voters.outgoing) > 0; } static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, @@ -227,7 +237,7 @@ static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig } } - if (config->voters.incoming.replica == 0) { + if (syncRaftNodeMapSize(&config->voters.incoming) == 0) { syncError("removed all voters"); return -1; } @@ -251,15 +261,10 @@ static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) { const SSyncRaftNodeMap* p0 = pp[0]; const SSyncRaftNodeMap* p1 = pp[1]; - for (j0 = 0; j0 < TSDB_MAX_REPLICA; ++j0) { - SyncNodeId id = p0->nodeId[j0]; - if (id == SYNC_NON_NODE_ID) { - continue; - } - for (j1 = 0; j1 < p1->replica; ++j1) { - if (p1->nodeId[j1] != SYNC_NON_NODE_ID && p1->nodeId[j1] != id) { - n+=1; - } + SyncNodeId* pNodeId; + while (!syncRaftIterateNodeMap(p0, pNodeId)) { + if (!syncRaftIsInNodeMap(p1, *pNodeId)) { + n+=1; } } } @@ -274,47 +279,23 @@ static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { - int i; - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (nodeMap->nodeId[i] == id) { - nodeMap->replica -= 1; - nodeMap->nodeId[i] = SYNC_NON_NODE_ID; - break; - } - } - - assert(nodeMap->replica >= 0); + syncRaftRemoveFromNodeMap(nodeMap, id); } // nilAwareAdd populates a map entry, creating the map if necessary. 
static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { - int i, j; - for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) { - if (nodeMap->nodeId[i] == id) { - return; - } - if (j == -1 && nodeMap->nodeId[i] == SYNC_NON_NODE_ID) { - j = i; - } - } - - assert(j != -1); - nodeMap->nodeId[j] = id; - nodeMap->replica += 1; + syncRaftAddToNodeMap(nodeMap, id); } // makeVoter adds or promotes the given ID to be a voter in the incoming // majority config. static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i = syncRaftFindProgressIndexByNodeId(progressMap, id); - if (i == -1) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == -1) { initProgress(changer, config, progressMap, id, false); - i = syncRaftFindProgressIndexByNodeId(progressMap, id); + return; } - - assert(i != -1); - SSyncRaftProgress* progress = &(progressMap->progress[i]); progress->isLearner = false; nilAwareDelete(&config->learners, id); @@ -337,14 +318,12 @@ static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* // LeaveJoint(). static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i = syncRaftFindProgressIndexByNodeId(progressMap, id); - if (i == -1) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == NULL) { initProgress(changer, config, progressMap, id, false); - i = syncRaftFindProgressIndexByNodeId(progressMap, id); + return; } - - assert(i != -1); - SSyncRaftProgress* progress = &(progressMap->progress[i]); + if (progress->isLearner) { return; } @@ -352,7 +331,7 @@ static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfi removeNodeId(changer, config, progressMap, id); // ... but save the Progress. 
- syncRaftAddToProgressMap(progressMap, id); + syncRaftAddToProgressMap(progressMap, progress); // Use LearnersNext if we can't add the learner to Learners directly, i.e. // if the peer is still tracked as a voter in the outgoing config. It will @@ -371,8 +350,8 @@ static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfi // removeNodeId this peer as a voter or learner from the incoming config. static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i = syncRaftFindProgressIndexByNodeId(progressMap, id); - if (i == -1) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == NULL) { return; } diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c index 4423e1ac3c..d4013bbc08 100644 --- a/source/libs/sync/src/sync_raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -85,8 +85,14 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); SSyncRaftNodeMap nodeMap; syncRaftJointConfigIDS(&pRaft->tracker->config.voters, &nodeMap); - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - SyncNodeId nodeId = nodeMap.nodeId[i]; + SyncNodeId *pNodeId = NULL; + while (true) { + syncRaftIterateNodeMap(&nodeMap, &pNodeId); + if (pNodeId == NULL || *pNodeId == NULL) { + break; + } + + SyncNodeId nodeId = *pNodeId; if (nodeId == SYNC_NON_NODE_ID) { continue; } diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c index e13c808075..5dacc01d1a 100644 --- a/source/libs/sync/src/sync_raft_node_map.c +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -16,51 +16,52 @@ #include "sync_raft_node_map.h" #include "sync_type.h" +void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap) { + nodeMap->nodeIdMap = taosHashInit(TSDB_MAX_REPLICA, 
taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); +} + +void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap) { + taosHashClear(nodeMap->nodeIdMap); +} + bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - int i; - - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (nodeId == nodeMap->nodeId[i]) { - return true; - } + SyncNodeId** ppId = (SyncNodeId**)taosHashGet(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*)); + if (ppId == NULL) { + return false; } - - return false; + return true; } void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { - memcpy(to, nodeMap, sizeof(SSyncRaftNodeMap)); + SyncNodeId** ppId = (SyncNodeId**)taosHashIterate(nodeMap->nodeIdMap, NULL); + while (ppId) { + taosHashPut(to->nodeIdMap, ppId, sizeof(SyncNodeId*), ppId, sizeof(SyncNodeId*)); + ppId = taosHashIterate(nodeMap->nodeIdMap, ppId); + } +} + +bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId) { + SyncNodeId **ppId = taosHashIterate(nodeMap->nodeIdMap, pId); + if (ppId == NULL) { + return true; + } + + *pId = *(*ppId); + return false; } void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { - int i, j, m; - - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - SyncNodeId id = nodeMap->nodeId[i]; - if (id == SYNC_NON_NODE_ID) { - continue; - } - - syncRaftAddToNodeMap(to, id); - } + syncRaftCopyNodeMap(nodeMap, to); } void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { - assert(nodeMap->replica < TSDB_MAX_REPLICA); + taosHashPut(nodeMap->nodeIdMap, &nodeId, sizeof(SyncNodeId*), &nodeId, sizeof(SyncNodeId*)); +} - int i, j; - for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) { - SyncNodeId id = nodeMap->nodeId[i]; - if (id == SYNC_NON_NODE_ID) { - if (j == -1) j = i; - continue; - } - if (id == nodeId) { - return; - } - } +void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { + taosHashRemove(nodeMap->nodeIdMap, &nodeId, 
sizeof(SyncNodeId*)); +} - assert(j != -1); - nodeMap->nodeId[j] = nodeId; - nodeMap->replica += 1; +int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap) { + return taosHashGetSize(nodeMap); } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c index a53aae93d0..436250e594 100644 --- a/source/libs/sync/src/sync_raft_progress.c +++ b/source/libs/sync/src/sync_raft_progress.c @@ -112,42 +112,21 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { } } -int syncRaftFindProgressIndexByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i; - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (progressMap->progress[i].id == id) { - return i; - } +SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) { + SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap, &id, sizeof(SyncNodeId*)); + if (ppProgress == NULL) { + return NULL; } - return -1; + + return *ppProgress; } -int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i, j; - - for (i = 0, j = -1; i < TSDB_MAX_REPLICA; ++i) { - if (progressMap->progress[i].id == id) { - return i; - } - if (j == -1 && progressMap->progress[i].id == SYNC_NON_NODE_ID) { - j = i; - } - } - - assert(j != -1); - - progressMap->progress[i].id = id; +int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress) { + taosHashPut(progressMap->progressMap, &progress->id, sizeof(SyncNodeId*), &progress, sizeof(SSyncRaftProgress*)); } void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { - int i; - - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (progressMap->progress[i].id == id) { - progressMap->progress[i].id = SYNC_NON_NODE_ID; - break; - } - } + taosHashRemove(progressMap->progressMap, &id, sizeof(SyncNodeId*)); } bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, 
SSyncRaftProgress* progress) { @@ -188,7 +167,17 @@ void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snaps } void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* out) { + memcpy(out, progress, sizeof(SSyncRaftProgress)); +} +bool syncRaftIterateProgressMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftProgress *pProgress) { + SSyncRaftProgress **ppProgress = taosHashIterate(nodeMap->nodeIdMap, pProgress); + if (ppProgress == NULL) { + return true; + } + + *pProgress = *(*ppProgress); + return false; } /** diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index 1407df059a..3dd0a5ffe1 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -22,6 +22,9 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { return NULL; } + syncRaftInitNodeMap(&tracker->config.learners); + syncRaftInitNodeMap(&tracker->config.learnersNext); + return tracker; } @@ -88,8 +91,8 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r } void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { - memcpy(&cs->voters, &tracker->config.voters.incoming, sizeof(SSyncRaftNodeMap)); - memcpy(&cs->votersOutgoing, &tracker->config.voters.outgoing, sizeof(SSyncRaftNodeMap)); - memcpy(&cs->learners, &tracker->config.learners, sizeof(SSyncRaftNodeMap)); - memcpy(&cs->learnersNext, &tracker->config.learnersNext, sizeof(SSyncRaftNodeMap)); + syncRaftCopyNodeMap(&cs->voters, &tracker->config.voters.incoming); + syncRaftCopyNodeMap(&cs->votersOutgoing, &tracker->config.voters.outgoing); + syncRaftCopyNodeMap(&cs->learners, &tracker->config.learners); + syncRaftCopyNodeMap(&cs->learnersNext, &tracker->config.learnersNext); } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c index 
2383d7ee63..6eecfbd9e5 100644 --- a/source/libs/sync/src/sync_raft_quorum_joint.c +++ b/source/libs/sync/src/sync_raft_quorum_joint.c @@ -42,39 +42,14 @@ ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashOb } void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - int i, min; - - for (i = 0, min = -1; i < TSDB_MAX_REPLICA; ++i) { - if (config->incoming.nodeId[i] == id) { - return; - } - if (min == -1 && config->incoming.nodeId[i] == SYNC_NON_NODE_ID) { - min = i; - } - } - - assert(min != -1); - config->incoming.nodeId[min] = id; - config->incoming.replica += 1; + syncRaftAddToNodeMap(&config->incoming, id); } void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) { - int i; - - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - if (config->incoming.nodeId[i] == id) { - config->incoming.replica -= 1; - config->incoming.nodeId[i] = SYNC_NON_NODE_ID; - break; - } - } - - assert(config->incoming.replica >= 0); + syncRaftRemoveFromNodeMap(&config->incoming, id); } void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) { - int i, j, m; - syncRaftCopyNodeMap(&config->incoming, nodeMap); syncRaftUnionNodeMap(&config->outgoing, nodeMap); From da029f32a77703f9f8dc35067c7c562cbfa657b0 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 17 Nov 2021 12:06:06 +0800 Subject: [PATCH 11/21] [TD-10645][raft]refactor node and progress map --- source/libs/sync/inc/sync_raft_node_map.h | 2 +- source/libs/sync/inc/sync_raft_quorum_joint.h | 14 +- .../libs/sync/src/sync_raft_config_change.c | 323 +++++++++--------- source/libs/sync/src/sync_raft_node_map.c | 6 +- 4 files changed, 184 insertions(+), 161 deletions(-) diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h index 5d43e42061..285717ed78 100644 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -30,7 
+30,7 @@ void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap); bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); -void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); +void syncRaftCopyNodeMap(const SSyncRaftNodeMap* from, SSyncRaftNodeMap* to); void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 4a5b749a0e..59d1fadc4a 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -59,7 +59,19 @@ static FORCE_INLINE const SSyncRaftNodeMap* syncRaftJointConfigOutgoing(const SS } static FORCE_INLINE void syncRaftJointConfigClearOutgoing(SSyncRaftQuorumJointConfig* config) { - memset(&config->outgoing, 0, sizeof(SSyncRaftNodeMap)); + syncRaftClearNodeMap(&config->outgoing); +} + +static FORCE_INLINE bool syncRaftJointConfigIsIncomingEmpty(const SSyncRaftQuorumJointConfig* config) { + return syncRaftNodeMapSize(&config->incoming) == 0; +} + +static FORCE_INLINE bool syncRaftJointConfigIsOutgoingEmpty(const SSyncRaftQuorumJointConfig* config) { + return syncRaftNodeMapSize(&config->outgoing) == 0; +} + +static FORCE_INLINE bool syncRaftJointConfigIsInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { + return syncRaftIsInNodeMap(&config->outgoing, id); } void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap); diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c index ad261dd277..0fe31b9f08 100644 --- a/source/libs/sync/src/sync_raft_config_change.c +++ b/source/libs/sync/src/sync_raft_config_change.c @@ -40,7 +40,57 @@ static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* 
progressMap, SyncNodeId id); static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id); + SSyncRaftProgressMap* progressMap, SyncNodeId id); + +// EnterJoint verifies that the outgoing (=right) majority config of the joint +// config is empty and initializes it with a copy of the incoming (=left) +// majority config. That is, it transitions from +// +// (1 2 3)&&() +// to +// (1 2 3)&&(1 2 3). +// +// The supplied changes are then applied to the incoming majority config, +// resulting in a joint configuration that in terms of the Raft thesis[1] +// (Section 4.3) corresponds to `C_{new,old}`. +// +// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf +int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css, + SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { + int ret; + + ret = checkAndCopy(changer, config, progressMap); + if (ret != 0) { + return ret; + } + + if (hasJointConfig(config)) { + syncError("config is already joint"); + return -1; + } + + if(syncRaftJointConfigIsIncomingEmpty(&config->voters) == 0) { + // We allow adding nodes to an empty config for convenience (testing and + // bootstrap), but you can't enter a joint state. + syncError("can't make a zero-voter config joint"); + return -1; + } + + // Clear the outgoing config. + syncRaftJointConfigClearOutgoing(&config->voters); + + // Copy incoming to outgoing. + syncRaftCopyNodeMap(&config->voters.incoming, &config->voters.outgoing); + + ret = applyConfig(changer, config, progressMap, css); + if (ret != 0) { + return ret; + } + + config->autoLeave = autoLeave; + return checkAndReturn(config, progressMap); +} + // syncRaftChangerSimpleConfig carries out a series of configuration changes that (in aggregate) // mutates the incoming majority config Voters[0] by at most one. 
This method // will return an error if that is not the case, if the resulting quorum is @@ -75,52 +125,131 @@ int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChange return checkAndReturn(config, progressMap); } -// EnterJoint verifies that the outgoing (=right) majority config of the joint -// config is empty and initializes it with a copy of the incoming (=left) -// majority config. That is, it transitions from -// -// (1 2 3)&&() -// to -// (1 2 3)&&(1 2 3). -// -// The supplied changes are then applied to the incoming majority config, -// resulting in a joint configuration that in terms of the Raft thesis[1] -// (Section 4.3) corresponds to `C_{new,old}`. -// -// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf -int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const SSyncConfChangeSingleArray* css, - SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - int ret; +// apply a change to the configuration. By convention, changes to voters are +// always made to the incoming majority config Voters[0]. Voters[1] is either +// empty or preserves the outgoing majority configuration while in a joint state. 
+static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, + SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) { + int i; - ret = checkAndCopy(changer, config, progressMap); - if (ret != 0) { - return ret; + for (i = 0; i < css->n; ++i) { + const SSyncConfChangeSingle* cs = &(css->changes[i]); + if (cs->nodeId == SYNC_NON_NODE_ID) { + continue; + } + + ESyncRaftConfChangeType type = cs->type; + switch (type) { + case SYNC_RAFT_Conf_AddNode: + makeVoter(changer, config, progressMap, cs->nodeId); + break; + case SYNC_RAFT_Conf_AddLearnerNode: + makeLearner(changer, config, progressMap, cs->nodeId); + break; + case SYNC_RAFT_Conf_RemoveNode: + removeNodeId(changer, config, progressMap, cs->nodeId); + break; + case SYNC_RAFT_Conf_UpdateNode: + break; + } } - if (hasJointConfig(config)) { - syncError("config is already joint"); + + if (syncRaftJointConfigIsIncomingEmpty(&config->voters)) { + syncError("removed all voters"); return -1; } - if(syncRaftNodeMapSize(&config->voters.incoming) == 0) { - // We allow adding nodes to an empty config for convenience (testing and - // bootstrap), but you can't enter a joint state. - syncError("can't make a zero-voter config joint"); - return -1; + return 0; +} + + +// makeVoter adds or promotes the given ID to be a voter in the incoming +// majority config. +static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, + SSyncRaftProgressMap* progressMap, SyncNodeId id) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == -1) { + initProgress(changer, config, progressMap, id, false); + return; } - // Clear the outgoing config. - syncRaftJointConfigClearOutgoing(&config->voters); + progress->isLearner = false; + nilAwareDelete(&config->learners, id); + nilAwareDelete(&config->learnersNext, id); + syncRaftJointConfigAddToIncoming(&config->voters, id); +} - // Copy incoming to outgoing. 
- syncRaftCopyNodeMap(&config->voters.incoming, &config->voters.outgoing); - - ret = applyConfig(changer, config, progressMap, css); - if (ret != 0) { - return ret; +// makeLearner makes the given ID a learner or stages it to be a learner once +// an active joint configuration is exited. +// +// The former happens when the peer is not a part of the outgoing config, in +// which case we either add a new learner or demote a voter in the incoming +// config. +// +// The latter case occurs when the configuration is joint and the peer is a +// voter in the outgoing config. In that case, we do not want to add the peer +// as a learner because then we'd have to track a peer as a voter and learner +// simultaneously. Instead, we add the learner to LearnersNext, so that it will +// be added to Learners the moment the outgoing config is removed by +// LeaveJoint(). +static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, + SSyncRaftProgressMap* progressMap, SyncNodeId id) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == NULL) { + initProgress(changer, config, progressMap, id, true); + return; } - config->autoLeave = autoLeave; - return checkAndReturn(config, progressMap); + if (progress->isLearner) { + return; + } + // Remove any existing voter in the incoming config... + removeNodeId(changer, config, progressMap, id); + + // ... but save the Progress. + syncRaftAddToProgressMap(progressMap, progress); + + // Use LearnersNext if we can't add the learner to Learners directly, i.e. + // if the peer is still tracked as a voter in the outgoing config. It will + // be turned into a learner in LeaveJoint(). + // + // Otherwise, add a regular learner right away. 
+ bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id); + if (inInOutgoing) { + nilAwareAdd(&config->learnersNext, id); + } else { + nilAwareAdd(&config->learners, id); + progress->isLearner = true; + } +} + +// removeNodeId this peer as a voter or learner from the incoming config. +static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, + SSyncRaftProgressMap* progressMap, SyncNodeId id) { + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); + if (progress == NULL) { + return; + } + + syncRaftJointConfigRemoveFromIncoming(&config->voters, id); + nilAwareDelete(&config->learners, id); + nilAwareDelete(&config->learnersNext, id); + + // If the peer is still a voter in the outgoing config, keep the Progress. + bool inInOutgoing = syncRaftJointConfigIsInOutgoing(&config->voters, id); + if (!inInOutgoing) { + syncRaftRemoveFromProgressMap(progressMap, id); + } +} + +// initProgress initializes a new progress for the given node or learner. 
+static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, + SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) { + if (!isLearner) { + syncRaftJointConfigAddToIncoming(&config->voters, id); + } else { + nilAwareAdd(&config->learners, id); + } } // checkAndCopy copies the tracker's config and progress map (deeply enough for @@ -208,41 +337,7 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg } static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) { - return syncRaftNodeMapSize(&config->voters.outgoing) > 0; -} - -static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, const SSyncConfChangeSingleArray* css) { - int i; - - for (i = 0; i < css->n; ++i) { - const SSyncConfChangeSingle* cs = &(css->changes[i]); - if (cs->nodeId == SYNC_NON_NODE_ID) { - continue; - } - - ESyncRaftConfChangeType type = cs->type; - switch (type) { - case SYNC_RAFT_Conf_AddNode: - makeVoter(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_AddLearnerNode: - makeLearner(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_RemoveNode: - removeNodeId(changer, config, progressMap, cs->nodeId); - break; - case SYNC_RAFT_Conf_UpdateNode: - break; - } - } - - if (syncRaftNodeMapSize(&config->voters.incoming) == 0) { - syncError("removed all voters"); - return -1; - } - - return 0; + return !syncRaftJointConfigIsOutgoingEmpty(&config->voters); } // symdiff returns the count of the symmetric difference between the sets of @@ -272,11 +367,6 @@ static int symDiff(const SSyncRaftNodeMap* l, const SSyncRaftNodeMap* r) { return n; } -static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id, bool isLearner) { - -} - // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. 
static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { syncRaftRemoveFromNodeMap(nodeMap, id); @@ -285,83 +375,4 @@ static void nilAwareDelete(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { // nilAwareAdd populates a map entry, creating the map if necessary. static void nilAwareAdd(SSyncRaftNodeMap* nodeMap, SyncNodeId id) { syncRaftAddToNodeMap(nodeMap, id); -} - -// makeVoter adds or promotes the given ID to be a voter in the incoming -// majority config. -static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == -1) { - initProgress(changer, config, progressMap, id, false); - return; - } - - progress->isLearner = false; - nilAwareDelete(&config->learners, id); - nilAwareDelete(&config->learnersNext, id); - syncRaftJointConfigAddToIncoming(&config->voters, id); -} - -// makeLearner makes the given ID a learner or stages it to be a learner once -// an active joint configuration is exited. -// -// The former happens when the peer is not a part of the outgoing config, in -// which case we either add a new learner or demote a voter in the incoming -// config. -// -// The latter case occurs when the configuration is joint and the peer is a -// voter in the outgoing config. In that case, we do not want to add the peer -// as a learner because then we'd have to track a peer as a voter and learner -// simultaneously. Instead, we add the learner to LearnersNext, so that it will -// be added to Learners the moment the outgoing config is removed by -// LeaveJoint(). 
-static void makeLearner(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == NULL) { - initProgress(changer, config, progressMap, id, false); - return; - } - - if (progress->isLearner) { - return; - } - // Remove any existing voter in the incoming config... - removeNodeId(changer, config, progressMap, id); - - // ... but save the Progress. - syncRaftAddToProgressMap(progressMap, progress); - - // Use LearnersNext if we can't add the learner to Learners directly, i.e. - // if the peer is still tracked as a voter in the outgoing config. It will - // be turned into a learner in LeaveJoint(). - // - // Otherwise, add a regular learner right away. - bool inOutgoing = syncRaftIsInNodeMap(&config->voters.outgoing, id); - if (inOutgoing) { - nilAwareAdd(&config->learnersNext, id); - } else { - nilAwareAdd(&config->learners, id); - progress->isLearner = true; - } -} - -// removeNodeId this peer as a voter or learner from the incoming config. -static void removeNodeId(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, - SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == NULL) { - return; - } - - syncRaftJointConfigRemoveFromIncoming(&config->voters, id); - nilAwareDelete(&config->learners, id); - nilAwareDelete(&config->learnersNext, id); - - // If the peer is still a voter in the outgoing config, keep the Progress. 
- bool inOutgoing = syncRaftIsInNodeMap(&config->voters.outgoing, id); - if (!inOutgoing) { - syncRaftRemoveFromProgressMap(progressMap, id); - } } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c index 5dacc01d1a..9adb3844f5 100644 --- a/source/libs/sync/src/sync_raft_node_map.c +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -32,11 +32,11 @@ bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { return true; } -void syncRaftCopyNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { - SyncNodeId** ppId = (SyncNodeId**)taosHashIterate(nodeMap->nodeIdMap, NULL); +void syncRaftCopyNodeMap(const SSyncRaftNodeMap* from, SSyncRaftNodeMap* to) { + SyncNodeId** ppId = (SyncNodeId**)taosHashIterate(from->nodeIdMap, NULL); while (ppId) { taosHashPut(to->nodeIdMap, ppId, sizeof(SyncNodeId*), ppId, sizeof(SyncNodeId*)); - ppId = taosHashIterate(nodeMap->nodeIdMap, ppId); + ppId = taosHashIterate(from->nodeIdMap, ppId); } } From 98a6b1918c1fa5270d5d3742c877594bbef192d4 Mon Sep 17 00:00:00 2001 From: lichuang Date: Wed, 17 Nov 2021 15:50:57 +0800 Subject: [PATCH 12/21] [TD-10645][raft]refactor node and progress map --- source/libs/sync/inc/raft.h | 1 - source/libs/sync/inc/sync_raft_node_map.h | 2 + source/libs/sync/inc/sync_raft_progress.h | 8 +- .../sync/inc/sync_raft_progress_tracker.h | 5 +- source/libs/sync/inc/sync_type.h | 2 + source/libs/sync/src/raft.c | 9 +-- .../libs/sync/src/sync_raft_config_change.c | 79 +++++++++++++------ source/libs/sync/src/sync_raft_election.c | 7 +- source/libs/sync/src/sync_raft_impl.c | 10 +-- source/libs/sync/src/sync_raft_node_map.c | 14 +++- source/libs/sync/src/sync_raft_progress.c | 29 +++++-- .../sync/src/sync_raft_progress_tracker.c | 32 +++++--- .../libs/sync/src/sync_raft_quorum_majority.c | 30 +++---- source/libs/sync/src/sync_raft_restore.c | 74 ++++++++--------- 14 files changed, 180 insertions(+), 122 deletions(-) 
diff --git a/source/libs/sync/inc/raft.h b/source/libs/sync/inc/raft.h index 6fa6c6e346..129f0f4dbc 100644 --- a/source/libs/sync/inc/raft.h +++ b/source/libs/sync/inc/raft.h @@ -47,7 +47,6 @@ struct SSyncRaft { // hash map nodeId -> SNodeInfo* SHashObj* nodeInfoMap; - int selfIndex; SyncNodeId selfId; SyncGroupId selfGroupId; diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h index 285717ed78..15e559a733 100644 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -43,4 +43,6 @@ int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap); // return true if reach the end bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId); +bool syncRaftIsAllInProgressMap(const SSyncRaftNodeMap* nodeMap, const SSyncRaftProgressMap* progressMap); + #endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h index 3d2995ed77..5664cfd15e 100644 --- a/source/libs/sync/inc/sync_raft_progress.h +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -147,7 +147,7 @@ static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgr return kProgressStateString[progress->state]; } -void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress); +void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress); /** * syncRaftProgressBecomeProbe transitions into StateProbe. 
Next is reset to Match+1 or, @@ -227,6 +227,8 @@ int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgres void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); +bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); + /** * return true if progress's log is up-todate **/ @@ -237,7 +239,9 @@ void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snaps void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to); // return true if reach the end -bool syncRaftIterateProgressMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftProgress *pProgress); +bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress); + +bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg); #if 0 diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index 3a448290c8..35daf8139c 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -88,13 +88,14 @@ struct SSyncRaftProgressTracker { SHashObj* votesMap; int maxInflightMsgs; + + SSyncRaft* pRaft; }; -SSyncRaftProgressTracker* syncRaftOpenProgressTracker(); +SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft); void syncRaftResetVotes(SSyncRaftProgressTracker*); -typedef void (*visitProgressFp)(int i, SSyncRaftProgress* progress, void* arg); void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); /** diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index fcb0940609..e00700d724 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -84,4 +84,6 @@ typedef enum { SYNC_RAFT_VOTE_RESP_REJECT = 2, } ESyncRaftVoteType; +typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg); + #endif /* 
_TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index b2170e0b68..85c330ece3 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -65,7 +65,7 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { } // init progress tracker - pRaft->tracker = syncRaftOpenProgressTracker(); + pRaft->tracker = syncRaftOpenProgressTracker(pRaft); if (pRaft->tracker == NULL) { return -1; } @@ -157,7 +157,7 @@ static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const ch return 0; } -static void visitProgressMaybeSendAppend(int i, SSyncRaftProgress* progress, void* arg) { +static void visitProgressMaybeSendAppend(SSyncRaftProgress* progress, void* arg) { syncRaftReplicate(arg, progress, false); } @@ -175,13 +175,12 @@ static void switchToConfig(SSyncRaft* pRaft, const SSyncRaftProgressTrackerConfi SSyncRaftProgress* progress = NULL; syncRaftConfigState(pRaft->tracker, cs); - i = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, selfId); - exist = (i != -1); + progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, selfId); + exist = (progress != NULL); // Update whether the node itself is a learner, resetting to false when the // node is removed. if (exist) { - progress = &pRaft->tracker->progressMap.progress[i]; pRaft->isLearner = progress->isLearner; } else { pRaft->isLearner = false; diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c index 0fe31b9f08..1f7aab064f 100644 --- a/source/libs/sync/src/sync_raft_config_change.c +++ b/source/libs/sync/src/sync_raft_config_change.c @@ -13,6 +13,7 @@ * along with this program. If not, see . 
*/ +#include "raft.h" #include "syncInt.h" #include "sync_raft_config_change.h" #include "sync_raft_progress.h" @@ -168,7 +169,7 @@ static int applyConfig(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig static void makeVoter(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap, SyncNodeId id) { SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, id); - if (progress == -1) { + if (progress == NULL) { initProgress(changer, config, progressMap, id, false); return; } @@ -250,31 +251,34 @@ static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf } else { nilAwareAdd(&config->learners, id); } -} -// checkAndCopy copies the tracker's config and progress map (deeply enough for -// the purposes of the Changer) and returns those copies. It returns an error -// if checkInvariants does. -static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - syncRaftCloneTrackerConfig(&changer->tracker->config, config); - int i; + SSyncRaftProgress* pProgress = (SSyncRaftProgress*)malloc(sizeof(SSyncRaftProgress)); + assert (pProgress != NULL); + *pProgress = (SSyncRaftProgress) { + // Initializing the Progress with the last index means that the follower + // can be probed (with the last index). + // + // TODO(tbg): seems awfully optimistic. Using the first index would be + // better. The general expectation here is that the follower has no log + // at all (and will thus likely need a snapshot), though the app may + // have applied a snapshot out of band before adding the replica (thus + // making the first index the better choice). 
+ .id = id, + .groupId = changer->tracker->pRaft->selfGroupId, + .nextIndex = changer->lastIndex, + .matchIndex = 0, + .state = PROGRESS_STATE_PROBE, + .pendingSnapshotIndex = 0, + .probeSent = false, + .inflights = syncRaftOpenInflights(changer->tracker->maxInflightMsgs), + .isLearner = isLearner, + // When a node is first added, we should mark it as recently active. + // Otherwise, CheckQuorum may cause us to step down if it is invoked + // before the added node has had a chance to communicate with us. + .recentActive = true, + }; - SSyncRaftProgress* pProgress = NULL; - while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) { - syncRaftAddToProgressMap(progressMap, pProgress); - } - - return checkAndReturn(config, progressMap); -} - -// checkAndReturn calls checkInvariants on the input and returns either the -// resulting error or the input. -static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - if (checkInvariants(config, progressMap) != 0) { - return -1; - } - - return 0; + syncRaftAddToProgressMap(progressMap, pProgress); } // checkInvariants makes sure that the config and progress are compatible with @@ -304,7 +308,7 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg } // Conversely Learners and Voters doesn't intersect at all. - SyncNodeId* pNodeId = NULL; + pNodeId = NULL; while (!syncRaftIterateNodeMap(&config->learners, pNodeId)) { SyncNodeId nodeId = *pNodeId; if (syncRaftJointConfigInIncoming(&config->voters, nodeId)) { @@ -336,6 +340,31 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg return 0; } +// checkAndCopy copies the tracker's config and progress map (deeply enough for +// the purposes of the Changer) and returns those copies. It returns an error +// if checkInvariants does. 
+static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { + syncRaftCloneTrackerConfig(&changer->tracker->config, config); + int i; + + SSyncRaftProgress* pProgress = NULL; + while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) { + syncRaftAddToProgressMap(progressMap, pProgress); + } + + return checkAndReturn(config, progressMap); +} + +// checkAndReturn calls checkInvariants on the input and returns either the +// resulting error or the input. +static int checkAndReturn(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { + if (checkInvariants(config, progressMap) != 0) { + return -1; + } + + return 0; +} + static bool hasJointConfig(const SSyncRaftProgressTrackerConfig* config) { return !syncRaftJointConfigIsOutgoingEmpty(&config->voters); } diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c index d4013bbc08..6d36d38267 100644 --- a/source/libs/sync/src/sync_raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -86,12 +86,7 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { SSyncRaftNodeMap nodeMap; syncRaftJointConfigIDS(&pRaft->tracker->config.voters, &nodeMap); SyncNodeId *pNodeId = NULL; - while (true) { - syncRaftIterateNodeMap(&nodeMap, &pNodeId); - if (pNodeId == NULL || *pNodeId == NULL) { - break; - } - + while (!syncRaftIterateNodeMap(&nodeMap, pNodeId)) { SyncNodeId nodeId = *pNodeId; if (nodeId == SYNC_NON_NODE_ID) { continue; diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c index 73a02c4b80..2093bcb046 100644 --- a/source/libs/sync/src/sync_raft_impl.c +++ b/source/libs/sync/src/sync_raft_impl.c @@ -186,7 +186,7 @@ void syncRaftLoadState(SSyncRaft* pRaft, const SSyncServerState* serverState) { pRaft->voteFor = serverState->voteFor; } -static void visitProgressSendAppend(int i, SSyncRaftProgress* progress, 
void* arg) { +static void visitProgressSendAppend(SSyncRaftProgress* progress, void* arg) { SSyncRaft* pRaft = (SSyncRaft*)arg; if (pRaft->selfId == progress->id) { return; @@ -279,7 +279,7 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { syncRaftLogAppend(pRaft->log, entries, n); - SSyncRaftProgress* progress = &(pRaft->tracker->progressMap.progress[pRaft->selfIndex]); + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId); syncRaftProgressMaybeUpdate(progress, lastIndex); // Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend. syncRaftMaybeCommit(pRaft); @@ -316,8 +316,8 @@ static void abortLeaderTransfer(SSyncRaft* pRaft) { pRaft->leadTransferee = SYNC_NON_NODE_ID; } -static void initProgress(int i, SSyncRaftProgress* progress, void* arg) { - syncRaftInitProgress(i, (SSyncRaft*)arg, progress); +static void resetProgress(SSyncRaftProgress* progress, void* arg) { + syncRaftResetProgress((SSyncRaft*)arg, progress); } static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { @@ -336,7 +336,7 @@ static void resetRaft(SSyncRaft* pRaft, SyncTerm term) { abortLeaderTransfer(pRaft); syncRaftResetVotes(pRaft->tracker); - syncRaftProgressVisit(pRaft->tracker, initProgress, pRaft); + syncRaftProgressVisit(pRaft->tracker, resetProgress, pRaft); pRaft->pendingConfigIndex = 0; pRaft->uncommittedSize = 0; diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c index 9adb3844f5..022f3b2dcb 100644 --- a/source/libs/sync/src/sync_raft_node_map.c +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -15,6 +15,7 @@ #include "sync_raft_node_map.h" #include "sync_type.h" +#include "sync_raft_progress.h" void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap) { nodeMap->nodeIdMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); @@ -50,6 +51,17 @@ bool syncRaftIterateNodeMap(const 
SSyncRaftNodeMap* nodeMap, SyncNodeId *pId) { return false; } +bool syncRaftIsAllInProgressMap(const SSyncRaftNodeMap* nodeMap, const SSyncRaftProgressMap* progressMap) { + SyncNodeId *pId = NULL; + while (!syncRaftIterateNodeMap(nodeMap, pId)) { + if (!syncRaftIsInProgressMap(progressMap, *pId)) { + return false; + } + } + + return true; +} + void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { syncRaftCopyNodeMap(nodeMap, to); } @@ -63,5 +75,5 @@ void syncRaftRemoveFromNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { } int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap) { - return taosHashGetSize(nodeMap); + return taosHashGetSize(nodeMap->nodeIdMap); } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c index 436250e594..e63d37cee9 100644 --- a/source/libs/sync/src/sync_raft_progress.c +++ b/source/libs/sync/src/sync_raft_progress.c @@ -25,13 +25,16 @@ static void probeAcked(SSyncRaftProgress* progress); static void resumeProgress(SSyncRaftProgress* progress); -void syncRaftInitProgress(int i, SSyncRaft* pRaft, SSyncRaftProgress* progress) { +void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) { + if (progress->inflights) { + syncRaftCloseInflights(progress->inflights); + } SSyncRaftInflights* inflights = syncRaftOpenInflights(pRaft->tracker->maxInflightMsgs); if (inflights == NULL) { return; } *progress = (SSyncRaftProgress) { - .matchIndex = i == pRaft->selfIndex ? syncRaftLogLastIndex(pRaft->log) : 0, + .matchIndex = progress->id == pRaft->selfId ? 
syncRaftLogLastIndex(pRaft->log) : 0, .nextIndex = syncRaftLogLastIndex(pRaft->log) + 1, .inflights = inflights, .isLearner = false, @@ -113,7 +116,7 @@ bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { } SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id) { - SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap, &id, sizeof(SyncNodeId*)); + SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)); if (ppProgress == NULL) { return NULL; } @@ -126,9 +129,18 @@ int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgres } void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { + SSyncRaftProgress** ppProgress = (SSyncRaftProgress**)taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)); + if (ppProgress == NULL) { + return; + } + free(*ppProgress); taosHashRemove(progressMap->progressMap, &id, sizeof(SyncNodeId*)); } +bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id) { + return taosHashGet(progressMap->progressMap, &id, sizeof(SyncNodeId*)) != NULL; +} + bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; } @@ -170,8 +182,8 @@ void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* memcpy(out, progress, sizeof(SSyncRaftProgress)); } -bool syncRaftIterateProgressMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftProgress *pProgress) { - SSyncRaftProgress **ppProgress = taosHashIterate(nodeMap->nodeIdMap, pProgress); +bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress) { + SSyncRaftProgress **ppProgress = taosHashIterate(progressMap->progressMap, pProgress); if (ppProgress == NULL) { return true; } @@ -180,6 +192,13 @@ bool syncRaftIterateProgressMap(const 
SSyncRaftNodeMap* nodeMap, SSyncRaftProgre return false; } +bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp fp, void* arg) { + SSyncRaftProgress *pProgress; + while (!syncRaftIterateProgressMap(progressMap, pProgress)) { + fp(pProgress, arg); + } +} + /** * ResetState moves the Progress into the specified State, resetting ProbeSent, * PendingSnapshot, and Inflights. diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index 3dd0a5ffe1..e6a016b7cf 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -16,7 +16,7 @@ #include "sync_raft_progress_tracker.h" #include "sync_raft_proto.h" -SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { +SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft) { SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)malloc(sizeof(SSyncRaftProgressTracker)); if (tracker == NULL) { return NULL; @@ -24,6 +24,7 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker() { syncRaftInitNodeMap(&tracker->config.learners); syncRaftInitNodeMap(&tracker->config.learnersNext); + tracker->pRaft = pRaft; return tracker; } @@ -33,11 +34,7 @@ void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { } void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp visit, void* arg) { - int i; - for (i = 0; i < TSDB_MAX_REPLICA; ++i) { - SSyncRaftProgress* progress = &(tracker->progressMap.progress[i]); - visit(i, progress, arg); - } + syncRaftVisitProgressMap(&tracker->progressMap, visit, arg); } void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant) { @@ -51,10 +48,20 @@ void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool g } void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { - + memcpy(to, from, 
sizeof(SSyncRaftProgressTrackerConfig)); } int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { + // NB: intentionally allow the empty config. In production we'll never see a + // non-empty config (we prevent it from being created) but we will need to + // be able to *create* an initial config, for example during bootstrap (or + // during tests). Instead of having to hand-code this, we allow + // transitioning from an empty config into any other legal and non-empty + // config. + if (!syncRaftIsAllInProgressMap(&config->voters.incoming, progressMap)) return -1; + if (!syncRaftIsAllInProgressMap(&config->voters.outgoing, progressMap)) return -1; + if (!syncRaftIsAllInProgressMap(&config->learners, progressMap)) return -1; + if (!syncRaftIsAllInProgressMap(&config->learnersNext, progressMap)) return -1; return 0; } @@ -67,8 +74,7 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r SSyncRaftProgress* progress; int r, g; - for (i = 0, r = 0, g = 0; i < TSDB_MAX_REPLICA; ++i) { - progress = &(tracker->progressMap.progress[i]); + while (!syncRaftIterateProgressMap(&tracker->progressMap, progress)) { if (progress->id == SYNC_NON_NODE_ID) { continue; } @@ -91,8 +97,8 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r } void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { - syncRaftCopyNodeMap(&cs->voters, &tracker->config.voters.incoming); - syncRaftCopyNodeMap(&cs->votersOutgoing, &tracker->config.voters.outgoing); - syncRaftCopyNodeMap(&cs->learners, &tracker->config.learners); - syncRaftCopyNodeMap(&cs->learnersNext, &tracker->config.learnersNext); + syncRaftCopyNodeMap(&tracker->config.voters.incoming, &cs->voters); + syncRaftCopyNodeMap(&tracker->config.voters.outgoing, &cs->votersOutgoing); + syncRaftCopyNodeMap(&tracker->config.learners, &cs->learners); + syncRaftCopyNodeMap(&tracker->config.learnersNext, 
&cs->learnersNext); } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c index 014a8c7303..ff5ba64876 100644 --- a/source/libs/sync/src/sync_raft_quorum_majority.c +++ b/source/libs/sync/src/sync_raft_quorum_majority.c @@ -17,24 +17,24 @@ #include "sync_raft_quorum_majority.h" #include "sync_raft_node_map.h" -/** - * syncRaftMajorityVoteResult takes a mapping of voters to yes/no (true/false) votes and returns - * a result indicating whether the vote is pending (i.e. neither a quorum of - * yes/no has been reached), won (a quorum of yes has been reached), or lost (a - * quorum of no has been reached). - **/ +// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns +// a result indicating whether the vote is pending (i.e. neither a quorum of +// yes/no has been reached), won (a quorum of yes has been reached), or lost (a +// quorum of no has been reached). ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap) { - if (config->replica == 0) { + int n = syncRaftNodeMapSize(config); + if (n == 0) { + // By convention, the elections on an empty config win. This comes in + // handy with joint quorums because it'll make a half-populated joint + // quorum behave like a majority quorum. 
return SYNC_RAFT_VOTE_WON; } int i, g, r, missing; - for (i = g = r = missing = 0; i < TSDB_MAX_REPLICA; ++i) { - if (config->nodeId[i] == SYNC_NON_NODE_ID) { - continue; - } - - const ESyncRaftVoteType* pType = taosHashGet(votesMap, &config->nodeId[i], sizeof(SyncNodeId*)); + i = g = r = missing = 0; + SyncNodeId* pId = NULL; + while (!syncRaftIterateNodeMap(config, pId)) { + const ESyncRaftVoteType* pType = taosHashGet(votesMap, pId, sizeof(SyncNodeId*)); if (pType == NULL) { missing += 1; continue; @@ -47,11 +47,11 @@ ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashOb } } - int quorum = config->replica / 2 + 1; + int quorum = n / 2 + 1; if (g >= quorum) { return SYNC_RAFT_VOTE_WON; } - if (r + missing >= quorum) { + if (g + missing >= quorum) { return SYNC_RAFT_VOTE_PENDING; } diff --git a/source/libs/sync/src/sync_raft_restore.c b/source/libs/sync/src/sync_raft_restore.c index 01bc7da7eb..17269254bd 100644 --- a/source/libs/sync/src/sync_raft_restore.c +++ b/source/libs/sync/src/sync_raft_restore.c @@ -17,6 +17,7 @@ #include "sync_raft_restore.h" #include "sync_raft_progress_tracker.h" +static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t); static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in); // syncRaftRestoreConfig takes a Changer (which must represent an empty configuration), and @@ -82,6 +83,18 @@ out: return ret; } +static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, const SSyncRaftNodeMap* nodeMap, ESyncRaftConfChangeType t) { + SyncNodeId* pId = NULL; + + while (!syncRaftIterateNodeMap(nodeMap, pId)) { + out->changes[*i] = (SSyncConfChangeSingle) { + .type = t, + .nodeId = *pId, + }; + *i += 1; + } +} + // toConfChangeSingle translates a conf state into 1) a slice of operations creating // first the config that will become the outgoing one, 
and then the incoming one, and // b) another slice that, when applied to the config resulted from 1), represents the @@ -91,13 +104,16 @@ static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleA out->n = in->n = 0; - out->n = cs->votersOutgoing.replica; + out->n = syncRaftNodeMapSize(&cs->votersOutgoing); out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n); if (out->changes == NULL) { out->n = 0; return -1; } - in->n = cs->votersOutgoing.replica + cs->voters.replica + cs->learners.replica + cs->learnersNext.replica; + in->n = syncRaftNodeMapSize(&cs->votersOutgoing) + + syncRaftNodeMapSize(&cs->voters) + + syncRaftNodeMapSize(&cs->learners) + + syncRaftNodeMapSize(&cs->learnersNext); out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * in->n); if (in->changes == NULL) { in->n = 0; @@ -132,50 +148,24 @@ static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleA // // as desired. - for (i = 0; i < cs->votersOutgoing.replica; ++i) { - // If there are outgoing voters, first add them one by one so that the - // (non-joint) config has them all. - out->changes[i] = (SSyncConfChangeSingle) { - .type = SYNC_RAFT_Conf_AddNode, - .nodeId = cs->votersOutgoing.nodeId[i], - }; - } + // If there are outgoing voters, first add them one by one so that the + // (non-joint) config has them all. + i = 0; + addToConfChangeSingleArray(out, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_AddNode); + assert(i == out->n); // We're done constructing the outgoing slice, now on to the incoming one // (which will apply on top of the config created by the outgoing slice). - + i = 0; + // First, we'll remove all of the outgoing voters. 
- int j = 0; - for (i = 0; i < cs->votersOutgoing.replica; ++i) { - in->changes[j] = (SSyncConfChangeSingle) { - .type = SYNC_RAFT_Conf_RemoveNode, - .nodeId = cs->votersOutgoing.nodeId[i], - }; - j += 1; - } + addToConfChangeSingleArray(in, &i, &cs->votersOutgoing, SYNC_RAFT_Conf_RemoveNode); + // Then we'll add the incoming voters and learners. - for (i = 0; i < cs->voters.replica; ++i) { - in->changes[j] = (SSyncConfChangeSingle) { - .type = SYNC_RAFT_Conf_AddNode, - .nodeId = cs->voters.nodeId[i], - }; - j += 1; - } - for (i = 0; i < cs->learners.replica; ++i) { - in->changes[j] = (SSyncConfChangeSingle) { - .type = SYNC_RAFT_Conf_AddLearnerNode, - .nodeId = cs->learners.nodeId[i], - }; - j += 1; - } - // Same for LearnersNext; these are nodes we want to be learners but which - // are currently voters in the outgoing config. - for (i = 0; i < cs->learnersNext.replica; ++i) { - in->changes[j] = (SSyncConfChangeSingle) { - .type = SYNC_RAFT_Conf_AddLearnerNode, - .nodeId = cs->learnersNext.nodeId[i], - }; - j += 1; - } + addToConfChangeSingleArray(in, &i, &cs->voters, SYNC_RAFT_Conf_AddNode); + addToConfChangeSingleArray(in, &i, &cs->learners, SYNC_RAFT_Conf_AddLearnerNode); + addToConfChangeSingleArray(in, &i, &cs->learnersNext, SYNC_RAFT_Conf_AddLearnerNode); + assert(i == in->n); + return 0; } \ No newline at end of file From 7e2590f1087014d631e93bfaec349a0d51fbded1 Mon Sep 17 00:00:00 2001 From: lichuang Date: Thu, 18 Nov 2021 17:19:33 +0800 Subject: [PATCH 13/21] [TD-10645][raft]refactor node and progress map --- .../libs/sync/inc/sync_raft_config_change.h | 5 +++ source/libs/sync/inc/sync_raft_node_map.h | 3 +- source/libs/sync/inc/sync_raft_progress.h | 8 ++++ .../sync/inc/sync_raft_progress_tracker.h | 9 +++- source/libs/sync/inc/sync_raft_proto.h | 15 +++++++ source/libs/sync/inc/sync_raft_quorum_joint.h | 2 + source/libs/sync/inc/sync_raft_restore.h | 3 +- source/libs/sync/src/raft.c | 13 +++++- .../libs/sync/src/sync_raft_config_change.c | 18 ++++---- 
source/libs/sync/src/sync_raft_node_map.c | 6 ++- source/libs/sync/src/sync_raft_progress.c | 45 ++++++++++++++++++- .../sync/src/sync_raft_progress_tracker.c | 27 ++++++++--- source/libs/sync/src/sync_raft_quorum_joint.c | 10 +++++ source/libs/sync/src/sync_raft_restore.c | 25 +++++++---- 14 files changed, 159 insertions(+), 30 deletions(-) diff --git a/source/libs/sync/inc/sync_raft_config_change.h b/source/libs/sync/inc/sync_raft_config_change.h index a54a7544fe..75a29f35e8 100644 --- a/source/libs/sync/inc/sync_raft_config_change.h +++ b/source/libs/sync/inc/sync_raft_config_change.h @@ -33,6 +33,11 @@ struct SSyncRaftChanger { typedef int (*configChangeFp)(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); +// Simple carries out a series of configuration changes that (in aggregate) +// mutates the incoming majority config Voters[0] by at most one. This method +// will return an error if that is not the case, if the resulting quorum is +// zero, or if the configuration is in a joint state (i.e. if there is an +// outgoing configuration). 
int syncRaftChangerSimpleConfig(SSyncRaftChanger* changer, const SSyncConfChangeSingleArray* css, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h index 15e559a733..2de4887bf4 100644 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -25,6 +25,7 @@ struct SSyncRaftNodeMap { }; void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap); +void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap); void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap); @@ -43,6 +44,6 @@ int32_t syncRaftNodeMapSize(const SSyncRaftNodeMap* nodeMap); // return true if reach the end bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId); -bool syncRaftIsAllInProgressMap(const SSyncRaftNodeMap* nodeMap, const SSyncRaftProgressMap* progressMap); +bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap); #endif /* _TD_LIBS_SYNC_RAFT_NODE_MAP_H */ \ No newline at end of file diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h index 5664cfd15e..7d80ce5438 100644 --- a/source/libs/sync/inc/sync_raft_progress.h +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -74,6 +74,8 @@ struct SSyncRaftProgress { SyncNodeId id; + int16_t refCount; + SyncIndex nextIndex; SyncIndex matchIndex; @@ -221,6 +223,12 @@ static FORCE_INLINE bool syncRaftProgressRecentActive(SSyncRaftProgress* progres return progress->recentActive; } +void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap); +void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap); + +void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap); +void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to); + SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* progressMap, SyncNodeId id); int 
syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress); diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index 35daf8139c..ff69b7b1d1 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -94,6 +94,11 @@ struct SSyncRaftProgressTracker { SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft); +void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config); +void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); + +void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); + void syncRaftResetVotes(SSyncRaftProgressTracker*); void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); @@ -104,9 +109,9 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, voi **/ void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant); -void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressTrackerConfig* result); +void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to); -int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); +int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); /** * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the diff --git a/source/libs/sync/inc/sync_raft_proto.h b/source/libs/sync/inc/sync_raft_proto.h index dd153e8dad..29371e328d 100644 --- a/source/libs/sync/inc/sync_raft_proto.h +++ b/source/libs/sync/inc/sync_raft_proto.h @@ -59,4 +59,19 @@ typedef struct SSyncConfigState { bool autoLeave; } SSyncConfigState; +static FORCE_INLINE bool syncRaftConfArrayIsEmpty(const SSyncConfChangeSingleArray* ary) { + return ary->n == 
0; +} + +static FORCE_INLINE void syncRaftInitConfArray(SSyncConfChangeSingleArray* ary) { + *ary = (SSyncConfChangeSingleArray) { + .changes = NULL, + .n = 0, + }; +} + +static FORCE_INLINE void syncRaftFreeConfArray(SSyncConfChangeSingleArray* ary) { + if (ary->changes != NULL) free(ary->changes); +} + #endif /* TD_SYNC_RAFT_PROTO_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 59d1fadc4a..92cddaaec1 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -38,6 +38,8 @@ typedef struct SSyncRaftQuorumJointConfig { **/ ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap); +void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config); + static FORCE_INLINE bool syncRaftJointConfigInOutgoing(const SSyncRaftQuorumJointConfig* config, SyncNodeId id) { return syncRaftIsInNodeMap(&config->outgoing, id); } diff --git a/source/libs/sync/inc/sync_raft_restore.h b/source/libs/sync/inc/sync_raft_restore.h index 38eadb00c7..df4448cab8 100644 --- a/source/libs/sync/inc/sync_raft_restore.h +++ b/source/libs/sync/inc/sync_raft_restore.h @@ -27,6 +27,7 @@ // the Changer only needs a ProgressMap (not a whole Tracker) at which point // this can just take LastIndex and MaxInflight directly instead and cook up // the results from that alone. 
-int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs); +int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs, + SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); #endif /* TD_SYNC_RAFT_RESTORE_H */ diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 85c330ece3..3b4c9e5f36 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -101,11 +101,22 @@ int32_t syncRaftStart(SSyncRaft* pRaft, const SSyncInfo* pInfo) { .tracker = pRaft->tracker, .lastIndex = syncRaftLogLastIndex(pRaft->log), }; - if (syncRaftRestoreConfig(&changer, &confState) < 0) { + SSyncRaftProgressTrackerConfig config; + SSyncRaftProgressMap progressMap; + + if (syncRaftRestoreConfig(&changer, &confState, &config, &progressMap) < 0) { syncError("syncRaftRestoreConfig for vgid %d fail", pInfo->vgId); return -1; } + // save restored config and progress map to tracker + syncRaftCopyProgressMap(&progressMap, &pRaft->tracker->progressMap); + syncRaftCopyTrackerConfig(&config, &pRaft->tracker->config); + + // free progress map and config + syncRaftFreeProgressMap(&progressMap); + syncRaftFreeTrackConfig(&config); + if (!syncRaftIsEmptyServerState(&serverState)) { syncRaftLoadState(pRaft, &serverState); } diff --git a/source/libs/sync/src/sync_raft_config_change.c b/source/libs/sync/src/sync_raft_config_change.c index 1f7aab064f..de790b5876 100644 --- a/source/libs/sync/src/sync_raft_config_change.c +++ b/source/libs/sync/src/sync_raft_config_change.c @@ -92,7 +92,7 @@ int syncRaftChangerEnterJoint(SSyncRaftChanger* changer, bool autoLeave, const S return checkAndReturn(config, progressMap); } -// syncRaftChangerSimpleConfig carries out a series of configuration changes that (in aggregate) +// Simple carries out a series of configuration changes that (in aggregate) // mutates the incoming majority config Voters[0] by at most one. 
This method // will return an error if that is not the case, if the resulting quorum is // zero, or if the configuration is in a joint state (i.e. if there is an @@ -275,7 +275,8 @@ static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf // When a node is first added, we should mark it as recently active. // Otherwise, CheckQuorum may cause us to step down if it is invoked // before the added node has had a chance to communicate with us. - .recentActive = true, + .recentActive = true, + .refCount = 0, }; syncRaftAddToProgressMap(progressMap, pProgress); @@ -285,7 +286,7 @@ static void initProgress(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConf // each other. This is used to check both what the Changer is initialized with, // as well as what it returns. static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - int ret = syncRaftCheckProgress(config, progressMap); + int ret = syncRaftCheckTrackerConfigInProgress(config, progressMap); if (ret != 0) { return ret; } @@ -296,6 +297,7 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg while (!syncRaftIterateNodeMap(&config->learnersNext, pNodeId)) { SyncNodeId nodeId = *pNodeId; if (!syncRaftJointConfigInOutgoing(&config->voters, nodeId)) { + syncError("[%d] is in LearnersNext, but not outgoing", nodeId); return -1; } SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(progressMap, nodeId); @@ -311,8 +313,8 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg pNodeId = NULL; while (!syncRaftIterateNodeMap(&config->learners, pNodeId)) { SyncNodeId nodeId = *pNodeId; - if (syncRaftJointConfigInIncoming(&config->voters, nodeId)) { - syncError("%d is in Learners and voter.incoming", nodeId); + if (syncRaftJointConfigInOutgoing(&config->voters, nodeId)) { + syncError("%d is in Learners and outgoing", nodeId); return -1; } SSyncRaftProgress* progress = 
syncRaftFindProgressByNodeId(progressMap, nodeId); @@ -327,7 +329,7 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg if (!hasJointConfig(config)) { // We enforce that empty maps are nil instead of zero. - if (syncRaftNodeMapSize(&config->learnersNext)) { + if (syncRaftNodeMapSize(&config->learnersNext) > 0) { syncError("cfg.LearnersNext must be nil when not joint"); return -1; } @@ -344,8 +346,8 @@ static int checkInvariants(SSyncRaftProgressTrackerConfig* config, SSyncRaftProg // the purposes of the Changer) and returns those copies. It returns an error // if checkInvariants does. static int checkAndCopy(SSyncRaftChanger* changer, SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { - syncRaftCloneTrackerConfig(&changer->tracker->config, config); - int i; + syncRaftCopyTrackerConfig(&changer->tracker->config, config); + syncRaftClearProgressMap(progressMap); SSyncRaftProgress* pProgress = NULL; while (!syncRaftIterateProgressMap(&changer->tracker->progressMap, pProgress)) { diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c index 022f3b2dcb..1c54d32b59 100644 --- a/source/libs/sync/src/sync_raft_node_map.c +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -21,6 +21,10 @@ void syncRaftInitNodeMap(SSyncRaftNodeMap* nodeMap) { nodeMap->nodeIdMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); } +void syncRaftFreeNodeMap(SSyncRaftNodeMap* nodeMap) { + taosHashCleanup(nodeMap->nodeIdMap); +} + void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap) { taosHashClear(nodeMap->nodeIdMap); } @@ -51,7 +55,7 @@ bool syncRaftIterateNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId *pId) { return false; } -bool syncRaftIsAllInProgressMap(const SSyncRaftNodeMap* nodeMap, const SSyncRaftProgressMap* progressMap) { +bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgressMap* progressMap) { 
SyncNodeId *pId = NULL; while (!syncRaftIterateNodeMap(nodeMap, pId)) { if (!syncRaftIsInProgressMap(progressMap, *pId)) { diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c index e63d37cee9..65676655ec 100644 --- a/source/libs/sync/src/sync_raft_progress.c +++ b/source/libs/sync/src/sync_raft_progress.c @@ -20,6 +20,11 @@ #include "sync.h" #include "syncInt.h" +static void copyProgress(SSyncRaftProgress* progress, void* arg); + +static void refProgress(SSyncRaftProgress* progress); +static void unrefProgress(SSyncRaftProgress* progress, void*); + static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state); static void probeAcked(SSyncRaftProgress* progress); @@ -125,6 +130,7 @@ SSyncRaftProgress* syncRaftFindProgressByNodeId(const SSyncRaftProgressMap* prog } int syncRaftAddToProgressMap(SSyncRaftProgressMap* progressMap, SSyncRaftProgress* progress) { + refProgress(progress); taosHashPut(progressMap->progressMap, &progress->id, sizeof(SyncNodeId*), &progress, sizeof(SSyncRaftProgress*)); } @@ -133,7 +139,8 @@ void syncRaftRemoveFromProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId if (ppProgress == NULL) { return; } - free(*ppProgress); + unrefProgress(*ppProgress, NULL); + taosHashRemove(progressMap->progressMap, &id, sizeof(SyncNodeId*)); } @@ -182,6 +189,23 @@ void syncRaftCopyProgress(const SSyncRaftProgress* progress, SSyncRaftProgress* memcpy(out, progress, sizeof(SSyncRaftProgress)); } +void syncRaftInitProgressMap(SSyncRaftProgressMap* progressMap) { + progressMap->progressMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); +} + +void syncRaftFreeProgressMap(SSyncRaftProgressMap* progressMap) { + syncRaftVisitProgressMap(progressMap, unrefProgress, NULL); + taosHashCleanup(progressMap->progressMap); +} + +void syncRaftClearProgressMap(SSyncRaftProgressMap* progressMap) { + taosHashClear(progressMap->progressMap); 
+} + +void syncRaftCopyProgressMap(SSyncRaftProgressMap* from, SSyncRaftProgressMap* to) { + syncRaftVisitProgressMap(from, copyProgress, to); +} + bool syncRaftIterateProgressMap(const SSyncRaftProgressMap* progressMap, SSyncRaftProgress *pProgress) { SSyncRaftProgress **ppProgress = taosHashIterate(progressMap->progressMap, pProgress); if (ppProgress == NULL) { @@ -199,6 +223,25 @@ bool syncRaftVisitProgressMap(SSyncRaftProgressMap* progressMap, visitProgressFp } } +static void copyProgress(SSyncRaftProgress* progress, void* arg) { + assert(progress->refCount > 0); + SSyncRaftProgressMap* to = (SSyncRaftProgressMap*)arg; + syncRaftAddToProgressMap(to, progress); +} + +static void refProgress(SSyncRaftProgress* progress) { + progress->refCount += 1; +} + +static void unrefProgress(SSyncRaftProgress* progress, void* arg) { + (void)arg; + progress->refCount -= 1; + assert(progress->refCount >= 0); + if (progress->refCount == 0) { + free(progress); + } +} + /** * ResetState moves the Progress into the specified State, resetting ProbeSent, * PendingSnapshot, and Inflights. 
diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index e6a016b7cf..60e3ccea6a 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -22,13 +22,26 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft) { return NULL; } - syncRaftInitNodeMap(&tracker->config.learners); + syncRaftInitTrackConfig(&tracker->config); syncRaftInitNodeMap(&tracker->config.learnersNext); tracker->pRaft = pRaft; return tracker; } +void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config) { + syncRaftInitNodeMap(&config->learners); + syncRaftInitNodeMap(&config->learnersNext); + syncRaftInitQuorumJointConfig(&config->voters); + config->autoLeave = false; +} + +void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config) { + syncRaftFreeNodeMap(&config->learners); + syncRaftFreeNodeMap(&config->learnersNext); + syncRaftFreeQuorumJointConfig(&config->voters); +} + void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { taosHashClear(tracker->votesMap); } @@ -47,21 +60,21 @@ void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool g taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &type, sizeof(ESyncRaftVoteType*)); } -void syncRaftCloneTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { +void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { memcpy(to, from, sizeof(SSyncRaftProgressTrackerConfig)); } -int syncRaftCheckProgress(const SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { +int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { // NB: intentionally allow the empty config. 
In production we'll never see a // non-empty config (we prevent it from being created) but we will need to // be able to *create* an initial config, for example during bootstrap (or // during tests). Instead of having to hand-code this, we allow // transitioning from an empty config into any other legal and non-empty // config. - if (!syncRaftIsAllInProgressMap(&config->voters.incoming, progressMap)) return -1; - if (!syncRaftIsAllInProgressMap(&config->voters.outgoing, progressMap)) return -1; - if (!syncRaftIsAllInProgressMap(&config->learners, progressMap)) return -1; - if (!syncRaftIsAllInProgressMap(&config->learnersNext, progressMap)) return -1; + if (!syncRaftIsAllNodeInProgressMap(&config->voters.incoming, progressMap)) return -1; + if (!syncRaftIsAllNodeInProgressMap(&config->voters.outgoing, progressMap)) return -1; + if (!syncRaftIsAllNodeInProgressMap(&config->learners, progressMap)) return -1; + if (!syncRaftIsAllNodeInProgressMap(&config->learnersNext, progressMap)) return -1; return 0; } diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c index 6eecfbd9e5..500bd908c0 100644 --- a/source/libs/sync/src/sync_raft_quorum_joint.c +++ b/source/libs/sync/src/sync_raft_quorum_joint.c @@ -41,6 +41,16 @@ ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashOb return SYNC_RAFT_VOTE_PENDING; } +void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config) { + syncRaftInitNodeMap(&config->incoming); + syncRaftInitNodeMap(&config->outgoing); +} + +void syncRaftFreeQuorumJointConfig(SSyncRaftQuorumJointConfig* config) { + syncRaftFreeNodeMap(&config->incoming); + syncRaftFreeNodeMap(&config->outgoing); +} + void syncRaftJointConfigAddToIncoming(SSyncRaftQuorumJointConfig* config, SyncNodeId id) { syncRaftAddToNodeMap(&config->incoming, id); } diff --git a/source/libs/sync/src/sync_raft_restore.c b/source/libs/sync/src/sync_raft_restore.c index 17269254bd..d1acd3e8e9 100644 --- 
a/source/libs/sync/src/sync_raft_restore.c +++ b/source/libs/sync/src/sync_raft_restore.c @@ -28,21 +28,26 @@ static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleA // the Changer only needs a ProgressMap (not a whole Tracker) at which point // this can just take LastIndex and MaxInflight directly instead and cook up // the results from that alone. -int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) { +int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs, + SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap) { SSyncConfChangeSingleArray outgoing; SSyncConfChangeSingleArray incoming; SSyncConfChangeSingleArray css; SSyncRaftProgressTracker* tracker = changer->tracker; - SSyncRaftProgressTrackerConfig* config = &tracker->config; - SSyncRaftProgressMap* progressMap = &tracker->progressMap; int i, ret; + syncRaftInitConfArray(&outgoing); + syncRaftInitConfArray(&incoming); + + syncRaftInitTrackConfig(config); + syncRaftInitProgressMap(progressMap); + ret = toConfChangeSingle(cs, &outgoing, &incoming); if (ret != 0) { goto out; } - if (outgoing.n == 0) { + if (syncRaftConfArrayIsEmpty(&outgoing)) { // No outgoing config, so just apply the incoming changes one by one. for (i = 0; i < incoming.n; ++i) { css = (SSyncConfChangeSingleArray) { @@ -53,6 +58,9 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) if (ret != 0) { goto out; } + + syncRaftCopyTrackerConfig(config, &changer->tracker->config); + syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap); } } else { // The ConfState describes a joint configuration. 
@@ -69,6 +77,8 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) if (ret != 0) { goto out; } + syncRaftCopyTrackerConfig(config, &changer->tracker->config); + syncRaftCopyProgressMap(progressMap, &changer->tracker->progressMap); } ret = syncRaftChangerEnterJoint(changer, cs->autoLeave, &incoming, config, progressMap); @@ -78,8 +88,9 @@ int syncRaftRestoreConfig(SSyncRaftChanger* changer, const SSyncConfigState* cs) } out: - if (incoming.n != 0) free(incoming.changes); - if (outgoing.n != 0) free(outgoing.changes); + syncRaftFreeConfArray(&incoming); + syncRaftFreeConfArray(&outgoing); + return ret; } @@ -102,8 +113,6 @@ static void addToConfChangeSingleArray(SSyncConfChangeSingleArray* out, int* i, static int toConfChangeSingle(const SSyncConfigState* cs, SSyncConfChangeSingleArray* out, SSyncConfChangeSingleArray* in) { int i; - out->n = in->n = 0; - out->n = syncRaftNodeMapSize(&cs->votersOutgoing); out->changes = (SSyncConfChangeSingle*)malloc(sizeof(SSyncConfChangeSingle) * out->n); if (out->changes == NULL) { From ce654f835a5b31cff3fbcfe5c52b488689a0335f Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 19 Nov 2021 10:18:56 +0800 Subject: [PATCH 14/21] [TD-10645][raft]refactor node and progress map --- source/libs/sync/inc/sync_const.h | 25 +++ source/libs/sync/inc/sync_raft_inflights.h | 41 ++--- source/libs/sync/inc/sync_raft_node_map.h | 4 +- source/libs/sync/inc/sync_raft_progress.h | 159 ++++++++---------- .../sync/inc/sync_raft_progress_tracker.h | 24 ++- source/libs/sync/inc/sync_raft_quorum_joint.h | 25 +-- .../libs/sync/inc/sync_raft_quorum_majority.h | 4 + source/libs/sync/inc/sync_type.h | 2 + source/libs/sync/src/sync_raft_election.c | 2 +- source/libs/sync/src/sync_raft_inflights.c | 25 +-- source/libs/sync/src/sync_raft_node_map.c | 11 +- source/libs/sync/src/sync_raft_progress.c | 155 +++++------------ .../sync/src/sync_raft_progress_tracker.c | 67 ++++++-- source/libs/sync/src/sync_raft_quorum_joint.c | 11 +- 
.../libs/sync/src/sync_raft_quorum_majority.c | 68 +++++++- 15 files changed, 333 insertions(+), 290 deletions(-) create mode 100644 source/libs/sync/inc/sync_const.h diff --git a/source/libs/sync/inc/sync_const.h b/source/libs/sync/inc/sync_const.h new file mode 100644 index 0000000000..b49c17f82e --- /dev/null +++ b/source/libs/sync/inc/sync_const.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _TD_LIBS_SYNC_CONST_H +#define _TD_LIBS_SYNC_CONST_H + +#include "sync.h" + +static int kSyncRaftMaxInflghtMsgs = 20; + +static SyncIndex kMaxCommitIndex = UINT64_MAX; + +#endif /* _TD_LIBS_SYNC_CONST_H */ diff --git a/source/libs/sync/inc/sync_raft_inflights.h b/source/libs/sync/inc/sync_raft_inflights.h index 6d249c9274..627bf9a26f 100644 --- a/source/libs/sync/inc/sync_raft_inflights.h +++ b/source/libs/sync/inc/sync_raft_inflights.h @@ -18,54 +18,47 @@ #include "sync.h" -/** - * SSyncRaftInflights limits the number of MsgApp (represented by the largest index - * contained within) sent to followers but not yet acknowledged by them. Callers - * use syncRaftInflightFull() to check whether more messages can be sent, - * call syncRaftInflightAdd() whenever they are sending a new append, - * and release "quota" via FreeLE() whenever an ack is received. -**/ +// Inflights limits the number of MsgApp (represented by the largest index +// contained within) sent to followers but not yet acknowledged by them.
Callers +// use Full() to check whether more messages can be sent, call Add() whenever +// they are sending a new append, and release "quota" via FreeLE() whenever an +// ack is received. typedef struct SSyncRaftInflights { - /* the starting index in the buffer */ + // the starting index in the buffer int start; - /* number of inflights in the buffer */ + // number of inflights in the buffer int count; - /* the size of the buffer */ + // the size of the buffer int size; - /** - * buffer contains the index of the last entry - * inside one message. - **/ + // buffer contains the index of the last entry + // inside one message. SyncIndex* buffer; } SSyncRaftInflights; SSyncRaftInflights* syncRaftOpenInflights(int size); void syncRaftCloseInflights(SSyncRaftInflights*); +// reset frees all inflights. static FORCE_INLINE void syncRaftInflightReset(SSyncRaftInflights* inflights) { inflights->count = 0; inflights->start = 0; } +// Full returns true if no more messages can be sent at the moment. static FORCE_INLINE bool syncRaftInflightFull(SSyncRaftInflights* inflights) { return inflights->count == inflights->size; } -/** - * syncRaftInflightAdd notifies the Inflights that a new message with the given index is being - * dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd() - * to verify that there is room for one more message, - * and consecutive calls to add syncRaftInflightAdd() must provide a - * monotonic sequence of indexes. - **/ +// Add notifies the Inflights that a new message with the given index is being +// dispatched. Full() must be called prior to Add() to verify that there is room +// for one more message, and consecutive calls to Add() must provide a +// monotonic sequence of indexes. void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex); -/** - * syncRaftInflightFreeLE frees the inflights smaller or equal to the given `to` flight.
- **/ +// FreeLE frees the inflights smaller or equal to the given `to` flight. void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex); /** diff --git a/source/libs/sync/inc/sync_raft_node_map.h b/source/libs/sync/inc/sync_raft_node_map.h index 2de4887bf4..b4cf04056d 100644 --- a/source/libs/sync/inc/sync_raft_node_map.h +++ b/source/libs/sync/inc/sync_raft_node_map.h @@ -31,9 +31,9 @@ void syncRaftClearNodeMap(SSyncRaftNodeMap* nodeMap); bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); -void syncRaftCopyNodeMap(const SSyncRaftNodeMap* from, SSyncRaftNodeMap* to); +void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to); -void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); +void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to); void syncRaftAddToNodeMap(SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); diff --git a/source/libs/sync/inc/sync_raft_progress.h b/source/libs/sync/inc/sync_raft_progress.h index 7d80ce5438..32c21281cd 100644 --- a/source/libs/sync/inc/sync_raft_progress.h +++ b/source/libs/sync/inc/sync_raft_progress.h @@ -65,10 +65,13 @@ static const char* kProgressStateString[] = { "Snapshot", }; -/** - * Progress represents a follower’s progress in the view of the leader. Leader maintains - * progresses of all followers, and sends entries to the follower based on its progress. - **/ +// Progress represents a follower’s progress in the view of the leader. Leader +// maintains progresses of all followers, and sends entries to the follower +// based on its progress. +// +// NB(tbg): Progress is basically a state machine whose transitions are mostly +// strewn around `*raft.raft`. Additionally, some fields are only used when in a +// certain State. All of this isn't ideal. 
struct SSyncRaftProgress { SyncGroupId groupId; @@ -80,63 +83,53 @@ struct SSyncRaftProgress { SyncIndex matchIndex; - /** - * State defines how the leader should interact with the follower. - * - * When in StateProbe, leader sends at most one replication message - * per heartbeat interval. It also probes actual progress of the follower. - * - * When in StateReplicate, leader optimistically increases next - * to the latest entry sent after sending replication message. This is - * an optimized state for fast replicating log entries to the follower. - * - * When in StateSnapshot, leader should have sent out snapshot - * before and stops sending any replication message. - **/ + // State defines how the leader should interact with the follower. + // + // When in StateProbe, leader sends at most one replication message + // per heartbeat interval. It also probes actual progress of the follower. + // + // When in StateReplicate, leader optimistically increases next + // to the latest entry sent after sending replication message. This is + // an optimized state for fast replicating log entries to the follower. + // + // When in StateSnapshot, leader should have sent out snapshot + // before and stops sending any replication message. ESyncRaftProgressState state; - /** - * pendingSnapshotIndex is used in PROGRESS_STATE_SNAPSHOT. - * If there is a pending snapshot, the pendingSnapshotIndex will be set to the - * index of the snapshot. If pendingSnapshotIndex is set, the replication process of - * this Progress will be paused. raft will not resend snapshot until the pending one - * is reported to be failed. - **/ + // PendingSnapshot is used in StateSnapshot. + // If there is a pending snapshot, the pendingSnapshot will be set to the + // index of the snapshot. If pendingSnapshot is set, the replication process of + // this Progress will be paused. raft will not resend snapshot until the pending one + // is reported to be failed. 
SyncIndex pendingSnapshotIndex; - /** - * recentActive is true if the progress is recently active. Receiving any messages - * from the corresponding follower indicates the progress is active. - * RecentActive can be reset to false after an election timeout. - **/ + // RecentActive is true if the progress is recently active. Receiving any messages + // from the corresponding follower indicates the progress is active. + // RecentActive can be reset to false after an election timeout. + // + // TODO(tbg): the leader should always have this set to true. bool recentActive; - /** - * probeSent is used while this follower is in StateProbe. When probeSent is - * true, raft should pause sending replication message to this peer until - * probeSent is reset. See ProbeAcked() and IsPaused(). - **/ + // ProbeSent is used while this follower is in StateProbe. When ProbeSent is + // true, raft should pause sending replication message to this peer until + // ProbeSent is reset. See ProbeAcked() and IsPaused(). bool probeSent; - /** - * inflights is a sliding window for the inflight messages. - * Each inflight message contains one or more log entries. - * The max number of entries per message is defined in raft config as MaxSizePerMsg. - * Thus inflight effectively limits both the number of inflight messages - * and the bandwidth each Progress can use. - * When inflights is Full, no more message should be sent. - * When a leader sends out a message, the index of the last - * entry should be added to inflights. The index MUST be added - * into inflights in order. - * When a leader receives a reply, the previous inflights should - * be freed by calling inflights.FreeLE with the index of the last - * received entry. - **/ + // Inflights is a sliding window for the inflight messages. + // Each inflight message contains one or more log entries. + // The max number of entries per message is defined in raft config as MaxSizePerMsg. 
+ // Thus inflight effectively limits both the number of inflight messages + // and the bandwidth each Progress can use. + // When inflights is Full, no more message should be sent. + // When a leader sends out a message, the index of the last + // entry should be added to inflights. The index MUST be added + // into inflights in order. + // When a leader receives a reply, the previous inflights should + // be freed by calling inflights.FreeLE with the index of the last + // received entry. SSyncRaftInflights* inflights; - /** - * IsLearner is true if this progress is tracked for a learner. - **/ + // IsLearner is true if this progress is tracked for a learner. bool isLearner; }; @@ -151,56 +144,44 @@ static FORCE_INLINE const char* syncRaftProgressStateString(const SSyncRaftProgr void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress); -/** - * syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or, - * optionally and if larger, the index of the pending snapshot. - **/ +// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or, +// optionally and if larger, the index of the pending snapshot. void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress); -/** - * syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1. - **/ +// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1. void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress); -/** - * syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the - * index acked by it. The method returns false if the given n index comes from - * an outdated message. Otherwise it updates the progress and returns true. - **/ +// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the +// index acked by it. The method returns false if the given n index comes from +// an outdated message. Otherwise it updates the progress and returns true. 
bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex); -/** - * syncRaftProgressOptimisticNextIndex signals that appends all the way up to and including index n - * are in-flight. As a result, Next is increased to n+1. - **/ +// OptimisticUpdate signals that appends all the way up to and including index n +// are in-flight. As a result, Next is increased to n+1. static FORCE_INLINE void syncRaftProgressOptimisticNextIndex(SSyncRaftProgress* progress, SyncIndex nextIndex) { progress->nextIndex = nextIndex + 1; } -/** - * syncRaftProgressMaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The - * arguments are the index of the append message rejected by the follower, and - * the hint that we want to decrease to. - * - * Rejections can happen spuriously as messages are sent out of order or - * duplicated. In such cases, the rejection pertains to an index that the - * Progress already knows were previously acknowledged, and false is returned - * without changing the Progress. - * - * If the rejection is genuine, Next is lowered sensibly, and the Progress is - * cleared for sending log entries. -**/ +// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The +// arguments are the index of the append message rejected by the follower, and +// the hint that we want to decrease to. +// +// Rejections can happen spuriously as messages are sent out of order or +// duplicated. In such cases, the rejection pertains to an index that the +// Progress already knows were previously acknowledged, and false is returned +// without changing the Progress. +// +// If the rejection is genuine, Next is lowered sensibly, and the Progress is +// cleared for sending log entries. bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, SyncIndex rejected, SyncIndex matchHint); -/** - * syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled. 
- * This is done when a node has rejected recent MsgApps, is currently waiting - * for a snapshot, or has reached the MaxInflightMsgs limit. In normal - * operation, this is false. A throttled node will be contacted less frequently - * until it has reached a state in which it's able to accept a steady stream of - * log entries again. - **/ +// IsPaused returns whether sending log entries to this node has been throttled. +// This is done when a node has rejected recent MsgApps, is currently waiting +// for a snapshot, or has reached the MaxInflightMsgs limit. In normal +// operation, this is false. A throttled node will be contacted less frequently +// until it has reached a state in which it's able to accept a steady stream of +// log entries again. bool syncRaftProgressIsPaused(SSyncRaftProgress* progress); static FORCE_INLINE SyncIndex syncRaftProgressNextIndex(SSyncRaftProgress* progress) { @@ -242,6 +223,8 @@ bool syncRaftIsInProgressMap(SSyncRaftProgressMap* progressMap, SyncNodeId id); **/ bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress); +// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending +// snapshot index. void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex); void syncRaftCopyProgress(const SSyncRaftProgress* from, SSyncRaftProgress* to); diff --git a/source/libs/sync/inc/sync_raft_progress_tracker.h b/source/libs/sync/inc/sync_raft_progress_tracker.h index ff69b7b1d1..0a3c7dd6fc 100644 --- a/source/libs/sync/inc/sync_raft_progress_tracker.h +++ b/source/libs/sync/inc/sync_raft_progress_tracker.h @@ -23,6 +23,7 @@ #include "sync_raft_proto.h" #include "thash.h" +// Config reflects the configuration tracked in a ProgressTracker. 
struct SSyncRaftProgressTrackerConfig { SSyncRaftQuorumJointConfig voters; @@ -99,27 +100,32 @@ void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config); +// ResetVotes prepares for a new round of vote counting via recordVote. void syncRaftResetVotes(SSyncRaftProgressTracker*); void syncRaftProgressVisit(SSyncRaftProgressTracker*, visitProgressFp visit, void* arg); -/** - * syncRaftRecordVote records that the node with the given id voted for this Raft - * instance if v == true (and declined it otherwise). - **/ +// RecordVote records that the node with the given id voted for this Raft +// instance if v == true (and declined it otherwise). void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant); void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to); int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, SSyncRaftProgressMap* progressMap); -/** - * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the - * election outcome is known. - **/ +// TallyVotes returns the number of granted and rejected Votes, and whether the +// election outcome is known. ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted); -void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs); +void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs); + +// Committed returns the largest log index known to be committed based on what +// the voting members of the group have acknowledged. +SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker); + +// QuorumActive returns true if the quorum is active from the view of the local +// raft state machine. Otherwise, it returns false. 
+bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker); bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId); diff --git a/source/libs/sync/inc/sync_raft_quorum_joint.h b/source/libs/sync/inc/sync_raft_quorum_joint.h index 92cddaaec1..9d5f10ab51 100644 --- a/source/libs/sync/inc/sync_raft_quorum_joint.h +++ b/source/libs/sync/inc/sync_raft_quorum_joint.h @@ -22,20 +22,25 @@ #include "sync_raft_node_map.h" #include "thash.h" -/** - * SSyncRaftQuorumJointConfig is a configuration of two groups of (possibly overlapping) - * majority configurations. Decisions require the support of both majorities. - **/ +// JointConfig is a configuration of two groups of (possibly overlapping) +// majority configurations. Decisions require the support of both majorities. typedef struct SSyncRaftQuorumJointConfig { SSyncRaftNodeMap outgoing; SSyncRaftNodeMap incoming; } SSyncRaftQuorumJointConfig; -/** - * syncRaftVoteResult takes a mapping of voters to yes/no (true/false) votes and returns - * a result indicating whether the vote is pending, lost, or won. A joint quorum - * requires both majority quorums to vote in favor. - **/ +// IDs returns a newly initialized map representing the set of voters present +// in the joint configuration. +void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap); + +// CommittedIndex returns the largest committed index for the given joint +// quorum. An index is jointly committed if it is committed in both constituent +// majorities. +SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg); + +// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns +// a result indicating whether the vote is pending, lost, or won. A joint quorum +// requires both majority quorums to vote in favor. 
ESyncRaftVoteType syncRaftVoteResult(SSyncRaftQuorumJointConfig* config, SHashObj* votesMap); void syncRaftInitQuorumJointConfig(SSyncRaftQuorumJointConfig* config); @@ -76,6 +81,4 @@ static FORCE_INLINE bool syncRaftJointConfigIsInOutgoing(const SSyncRaftQuorumJo return syncRaftIsInNodeMap(&config->outgoing, id); } -void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap); - #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_JOINT_H */ diff --git a/source/libs/sync/inc/sync_raft_quorum_majority.h b/source/libs/sync/inc/sync_raft_quorum_majority.h index 38df40147a..399bd71db8 100644 --- a/source/libs/sync/inc/sync_raft_quorum_majority.h +++ b/source/libs/sync/inc/sync_raft_quorum_majority.h @@ -29,4 +29,8 @@ **/ ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashObj* votesMap); +// CommittedIndex computes the committed index from those supplied via the +// provided AckedIndexer (for the active config). +SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg); + #endif /* _TD_LIBS_SYNC_RAFT_QUORUM_MAJORITY_H */ diff --git a/source/libs/sync/inc/sync_type.h b/source/libs/sync/inc/sync_type.h index e00700d724..9c4bc9e63c 100644 --- a/source/libs/sync/inc/sync_type.h +++ b/source/libs/sync/inc/sync_type.h @@ -86,4 +86,6 @@ typedef enum { typedef void (*visitProgressFp)(SSyncRaftProgress* progress, void* arg); +typedef void (*matchAckIndexerFp)(SyncNodeId id, void* arg, SyncIndex* index); + #endif /* _TD_LIBS_SYNC_TYPE_H */ diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c index 6d36d38267..d961978be2 100644 --- a/source/libs/sync/src/sync_raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -84,7 +84,7 @@ static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); SyncTerm lastTerm = syncRaftLogLastTerm(pRaft->log); 
SSyncRaftNodeMap nodeMap; - syncRaftJointConfigIDS(&pRaft->tracker->config.voters, &nodeMap); + syncRaftJointConfigIDs(&pRaft->tracker->config.voters, &nodeMap); SyncNodeId *pNodeId = NULL; while (!syncRaftIterateNodeMap(&nodeMap, pNodeId)) { SyncNodeId nodeId = *pNodeId; diff --git a/source/libs/sync/src/sync_raft_inflights.c b/source/libs/sync/src/sync_raft_inflights.c index 3d740b5a9e..7b97aca014 100644 --- a/source/libs/sync/src/sync_raft_inflights.c +++ b/source/libs/sync/src/sync_raft_inflights.c @@ -40,19 +40,16 @@ void syncRaftCloseInflights(SSyncRaftInflights* inflights) { free(inflights); } -/** - * syncRaftInflightAdd notifies the Inflights that a new message with the given index is being - * dispatched. syncRaftInflightFull() must be called prior to syncRaftInflightAdd() - * to verify that there is room for one more message, - * and consecutive calls to add syncRaftInflightAdd() must provide a - * monotonic sequence of indexes. - **/ +// Add notifies the Inflights that a new message with the given index is being +// dispatched. Full() must be called prior to Add() to verify that there is room +// for one more message, and consecutive calls to Add() must provide a +// monotonic sequence of indexes. void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) { assert(!syncRaftInflightFull(inflights)); int next = inflights->start + inflights->count; int size = inflights->size; - /* is next wrapped around buffer? */ + if (next >= size) { next -= size; } @@ -61,12 +58,10 @@ void syncRaftInflightAdd(SSyncRaftInflights* inflights, SyncIndex inflightIndex) inflights->count++; } -/** - * syncRaftInflightFreeLE frees the inflights smaller or equal to the given `to` flight. - **/ +// FreeLE frees the inflights smaller or equal to the given `to` flight.
void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) { if (inflights->count == 0 || toIndex < inflights->buffer[inflights->start]) { - /* out of the left side of the window */ + // out of the left side of the window return; } @@ -95,10 +90,8 @@ void syncRaftInflightFreeLE(SSyncRaftInflights* inflights, SyncIndex toIndex) { } } -/** - * syncRaftInflightFreeFirstOne releases the first inflight. - * This is a no-op if nothing is inflight. - **/ +// FreeFirstOne releases the first inflight. This is a no-op if nothing is +// inflight. void syncRaftInflightFreeFirstOne(SSyncRaftInflights* inflights) { syncRaftInflightFreeLE(inflights, inflights->buffer[inflights->start]); } diff --git a/source/libs/sync/src/sync_raft_node_map.c b/source/libs/sync/src/sync_raft_node_map.c index 1c54d32b59..642eebe65b 100644 --- a/source/libs/sync/src/sync_raft_node_map.c +++ b/source/libs/sync/src/sync_raft_node_map.c @@ -37,11 +37,10 @@ bool syncRaftIsInNodeMap(const SSyncRaftNodeMap* nodeMap, SyncNodeId nodeId) { return true; } -void syncRaftCopyNodeMap(const SSyncRaftNodeMap* from, SSyncRaftNodeMap* to) { - SyncNodeId** ppId = (SyncNodeId**)taosHashIterate(from->nodeIdMap, NULL); - while (ppId) { - taosHashPut(to->nodeIdMap, ppId, sizeof(SyncNodeId*), ppId, sizeof(SyncNodeId*)); - ppId = taosHashIterate(from->nodeIdMap, ppId); +void syncRaftCopyNodeMap(SSyncRaftNodeMap* from, SSyncRaftNodeMap* to) { + SyncNodeId *pId = NULL; + while (!syncRaftIterateNodeMap(from, pId)) { + taosHashPut(to->nodeIdMap, &pId, sizeof(SyncNodeId*), &pId, sizeof(SyncNodeId*)); } } @@ -66,7 +65,7 @@ bool syncRaftIsAllNodeInProgressMap(SSyncRaftNodeMap* nodeMap, SSyncRaftProgress return true; } -void syncRaftUnionNodeMap(const SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { +void syncRaftUnionNodeMap(SSyncRaftNodeMap* nodeMap, SSyncRaftNodeMap* to) { syncRaftCopyNodeMap(nodeMap, to); } diff --git a/source/libs/sync/src/sync_raft_progress.c b/source/libs/sync/src/sync_raft_progress.c 
index 65676655ec..a3ab93c0fc 100644 --- a/source/libs/sync/src/sync_raft_progress.c +++ b/source/libs/sync/src/sync_raft_progress.c @@ -47,11 +47,9 @@ void syncRaftResetProgress(SSyncRaft* pRaft, SSyncRaftProgress* progress) { }; } -/** - * syncRaftProgressMaybeUpdate is called when an MsgAppResp arrives from the follower, with the - * index acked by it. The method returns false if the given n index comes from - * an outdated message. Otherwise it updates the progress and returns true. - **/ +// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the +// index acked by it. The method returns false if the given n index comes from +// an outdated message. Otherwise it updates the progress and returns true. bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastIndex) { bool updated = false; @@ -66,27 +64,36 @@ bool syncRaftProgressMaybeUpdate(SSyncRaftProgress* progress, SyncIndex lastInde return updated; } +// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The +// arguments are the index of the append message rejected by the follower, and +// the hint that we want to decrease to. +// +// Rejections can happen spuriously as messages are sent out of order or +// duplicated. In such cases, the rejection pertains to an index that the +// Progress already knows were previously acknowledged, and false is returned +// without changing the Progress. +// +// If the rejection is genuine, Next is lowered sensibly, and the Progress is +// cleared for sending log entries. bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, SyncIndex rejected, SyncIndex matchHint) { if (progress->state == PROGRESS_STATE_REPLICATE) { - /** - * the rejection must be stale if the progress has matched and "rejected" - * is smaller than "match". - **/ + // The rejection must be stale if the progress has matched and "rejected" + // is smaller than "match". 
if (rejected <= progress->matchIndex) { syncDebug("match index is up to date,ignore"); return false; } - /* directly decrease next to match + 1 */ + // Directly decrease next to match + 1. + // + // TODO(tbg): why not use matchHint if it's larger? progress->nextIndex = progress->matchIndex + 1; return true; } - /** - * The rejection must be stale if "rejected" does not match next - 1. This - * is because non-replicating followers are probed one entry at a time. - **/ + // The rejection must be stale if "rejected" does not match next - 1. This + // is because non-replicating followers are probed one entry at a time. if (rejected != progress->nextIndex - 1) { syncDebug("rejected index %" PRId64 " different from next index %" PRId64 " -> ignore" , rejected, progress->nextIndex); @@ -99,14 +106,12 @@ bool syncRaftProgressMaybeDecrTo(SSyncRaftProgress* progress, return true; } -/** - * syncRaftProgressIsPaused returns whether sending log entries to this node has been throttled. - * This is done when a node has rejected recent MsgApps, is currently waiting - * for a snapshot, or has reached the MaxInflightMsgs limit. In normal - * operation, this is false. A throttled node will be contacted less frequently - * until it has reached a state in which it's able to accept a steady stream of - * log entries again. - **/ +// IsPaused returns whether sending log entries to this node has been throttled. +// This is done when a node has rejected recent MsgApps, is currently waiting +// for a snapshot, or has reached the MaxInflightMsgs limit. In normal +// operation, this is false. A throttled node will be contacted less frequently +// until it has reached a state in which it's able to accept a steady stream of +// log entries again. 
bool syncRaftProgressIsPaused(SSyncRaftProgress* progress) { switch (progress->state) { case PROGRESS_STATE_PROBE: @@ -152,16 +157,12 @@ bool syncRaftProgressIsUptodate(SSyncRaft* pRaft, SSyncRaftProgress* progress) { return syncRaftLogLastIndex(pRaft->log) + 1 == progress->nextIndex; } -/** - * syncRaftProgressBecomeProbe transitions into StateProbe. Next is reset to Match+1 or, - * optionally and if larger, the index of the pending snapshot. - **/ +// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or, +// optionally and if larger, the index of the pending snapshot. void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) { - /** - * If the original state is ProgressStateSnapshot, progress knows that - * the pending snapshot has been sent to this peer successfully, then - * probes from pendingSnapshot + 1. - **/ + // If the original state is StateSnapshot, progress knows that + // the pending snapshot has been sent to this peer successfully, then + // probes from pendingSnapshot + 1. if (progress->state == PROGRESS_STATE_SNAPSHOT) { SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; resetProgressState(progress, PROGRESS_STATE_PROBE); @@ -172,14 +173,14 @@ void syncRaftProgressBecomeProbe(SSyncRaftProgress* progress) { } } -/** - * syncRaftProgressBecomeReplicate transitions into StateReplicate, resetting Next to Match+1. - **/ +// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1. void syncRaftProgressBecomeReplicate(SSyncRaftProgress* progress) { resetProgressState(progress, PROGRESS_STATE_REPLICATE); progress->nextIndex = progress->matchIndex + 1; } +// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending +// snapshot index. 
void syncRaftProgressBecomeSnapshot(SSyncRaftProgress* progress, SyncIndex snapshotIndex) { resetProgressState(progress, PROGRESS_STATE_SNAPSHOT); progress->pendingSnapshotIndex = snapshotIndex; @@ -242,10 +243,8 @@ static void unrefProgress(SSyncRaftProgress* progress, void* arg) { } } -/** - * ResetState moves the Progress into the specified State, resetting ProbeSent, - * PendingSnapshot, and Inflights. - **/ +// ResetState moves the Progress into the specified State, resetting ProbeSent, +// PendingSnapshot, and Inflights. static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressState state) { progress->probeSent = false; progress->pendingSnapshotIndex = 0; @@ -253,83 +252,9 @@ static void resetProgressState(SSyncRaftProgress* progress, ESyncRaftProgressSta syncRaftInflightReset(progress->inflights); } -/** - * probeAcked is called when this peer has accepted an append. It resets - * ProbeSent to signal that additional append messages should be sent without - * further delay. - **/ +// ProbeAcked is called when this peer has accepted an append. It resets +// ProbeSent to signal that additional append messages should be sent without +// further delay. 
static void probeAcked(SSyncRaftProgress* progress) { progress->probeSent = false; } - -#if 0 - -SyncIndex syncRaftProgressNextIndex(SSyncRaft* pRaft, int i) { - return pRaft->leaderState.progress[i].nextIndex; -} - -SyncIndex syncRaftProgressMatchIndex(SSyncRaft* pRaft, int i) { - return pRaft->leaderState.progress[i].matchIndex; -} - -void syncRaftProgressUpdateLastSend(SSyncRaft* pRaft, int i) { - pRaft->leaderState.progress[i].lastSend = pRaft->io.time(pRaft); -} - -void syncRaftProgressUpdateSnapshotLastSend(SSyncRaft* pRaft, int i) { - pRaft->leaderState.progress[i].lastSendSnapshot = pRaft->io.time(pRaft); -} - -bool syncRaftProgressResetRecentRecv(SSyncRaft* pRaft, int i) { - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - bool prev = progress->recentRecv; - progress->recentRecv = false; - return prev; -} - -void syncRaftProgressMarkRecentRecv(SSyncRaft* pRaft, int i) { - pRaft->leaderState.progress[i].recentRecv = true; -} - -bool syncRaftProgressGetRecentRecv(SSyncRaft* pRaft, int i) { - return pRaft->leaderState.progress[i].recentRecv; -} - -void syncRaftProgressBecomeSnapshot(SSyncRaft* pRaft, int i) { - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - resetProgressState(progress, PROGRESS_STATE_SNAPSHOT); - progress->pendingSnapshotIndex = raftLogSnapshotIndex(pRaft->log); -} - -void syncRaftProgressBecomeProbe(SSyncRaft* pRaft, int i) { - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - - if (progress->state == PROGRESS_STATE_SNAPSHOT) { - assert(progress->pendingSnapshotIndex > 0); - SyncIndex pendingSnapshotIndex = progress->pendingSnapshotIndex; - resetProgressState(progress, PROGRESS_STATE_PROBE); - progress->nextIndex = max(progress->matchIndex + 1, pendingSnapshotIndex); - } else { - resetProgressState(progress, PROGRESS_STATE_PROBE); - progress->nextIndex = progress->matchIndex + 1; - } -} - -void syncRaftProgressBecomeReplicate(SSyncRaft* pRaft, int i) { - 
resetProgressState(pRaft->leaderState.progress, PROGRESS_STATE_REPLICATE); - pRaft->leaderState.progress->nextIndex = pRaft->leaderState.progress->matchIndex + 1; -} - -void syncRaftProgressAbortSnapshot(SSyncRaft* pRaft, int i) { - SSyncRaftProgress* progress = &(pRaft->leaderState.progress[i]); - progress->pendingSnapshotIndex = 0; - progress->state = PROGRESS_STATE_PROBE; -} - -ESyncRaftProgressState syncRaftProgressState(SSyncRaft* pRaft, int i) { - return pRaft->leaderState.progress[i].state; -} - - - -#endif \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_progress_tracker.c b/source/libs/sync/src/sync_raft_progress_tracker.c index 60e3ccea6a..e0b4afae21 100644 --- a/source/libs/sync/src/sync_raft_progress_tracker.c +++ b/source/libs/sync/src/sync_raft_progress_tracker.c @@ -13,6 +13,8 @@ * along with this program. If not, see . */ +#include "raft.h" +#include "sync_const.h" #include "sync_raft_progress_tracker.h" #include "sync_raft_proto.h" @@ -22,9 +24,11 @@ SSyncRaftProgressTracker* syncRaftOpenProgressTracker(SSyncRaft* pRaft) { return NULL; } + tracker->votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); + syncRaftInitTrackConfig(&tracker->config); - syncRaftInitNodeMap(&tracker->config.learnersNext); tracker->pRaft = pRaft; + tracker->maxInflightMsgs = kSyncRaftMaxInflghtMsgs; return tracker; } @@ -39,9 +43,11 @@ void syncRaftInitTrackConfig(SSyncRaftProgressTrackerConfig* config) { void syncRaftFreeTrackConfig(SSyncRaftProgressTrackerConfig* config) { syncRaftFreeNodeMap(&config->learners); syncRaftFreeNodeMap(&config->learnersNext); - syncRaftFreeQuorumJointConfig(&config->voters); + syncRaftFreeNodeMap(&config->voters.incoming); + syncRaftFreeNodeMap(&config->voters.outgoing); } +// ResetVotes prepares for a new round of vote counting via recordVote. 
void syncRaftResetVotes(SSyncRaftProgressTracker* tracker) { taosHashClear(tracker->votesMap); } @@ -50,14 +56,15 @@ void syncRaftProgressVisit(SSyncRaftProgressTracker* tracker, visitProgressFp vi syncRaftVisitProgressMap(&tracker->progressMap, visit, arg); } +// RecordVote records that the node with the given id voted for this Raft +// instance if v == true (and declined it otherwise). void syncRaftRecordVote(SSyncRaftProgressTracker* tracker, SyncNodeId id, bool grant) { ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &id, sizeof(SyncNodeId*)); if (pType != NULL) { return; } - ESyncRaftVoteType type = grant ? SYNC_RAFT_VOTE_RESP_GRANT : SYNC_RAFT_VOTE_RESP_REJECT; - taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &type, sizeof(ESyncRaftVoteType*)); + taosHashPut(tracker->votesMap, &id, sizeof(SyncNodeId), &grant, sizeof(bool*)); } void syncRaftCopyTrackerConfig(const SSyncRaftProgressTrackerConfig* from, SSyncRaftProgressTrackerConfig* to) { @@ -78,26 +85,27 @@ int syncRaftCheckTrackerConfigInProgress(SSyncRaftProgressTrackerConfig* config, return 0; } -/** - * syncRaftTallyVotes returns the number of granted and rejected Votes, and whether the - * election outcome is known. - **/ +// TallyVotes returns the number of granted and rejected Votes, and whether the +// election outcome is known. ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* rejected, int *granted) { - int i; - SSyncRaftProgress* progress; + SSyncRaftProgress* progress = NULL; int r, g; + // Make sure to populate granted/rejected correctly even if the Votes slice + // contains members no longer part of the configuration. This doesn't really + // matter in the way the numbers are used (they're informational), but might + // as well get it right. 
while (!syncRaftIterateProgressMap(&tracker->progressMap, progress)) { if (progress->id == SYNC_NON_NODE_ID) { continue; } - ESyncRaftVoteType* pType = taosHashGet(tracker->votesMap, &progress->id, sizeof(SyncNodeId*)); - if (pType == NULL) { + bool* v = taosHashGet(tracker->votesMap, &progress->id, sizeof(SyncNodeId*)); + if (v == NULL) { continue; } - if (*pType == SYNC_RAFT_VOTE_RESP_GRANT) { + if (*v) { g++; } else { r++; @@ -109,9 +117,40 @@ ESyncRaftVoteResult syncRaftTallyVotes(SSyncRaftProgressTracker* tracker, int* r return syncRaftVoteResult(&(tracker->config.voters), tracker->votesMap); } -void syncRaftConfigState(const SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { +void syncRaftConfigState(SSyncRaftProgressTracker* tracker, SSyncConfigState* cs) { syncRaftCopyNodeMap(&tracker->config.voters.incoming, &cs->voters); syncRaftCopyNodeMap(&tracker->config.voters.outgoing, &cs->votersOutgoing); syncRaftCopyNodeMap(&tracker->config.learners, &cs->learners); syncRaftCopyNodeMap(&tracker->config.learnersNext, &cs->learnersNext); + cs->autoLeave = tracker->config.autoLeave; +} + +static void matchAckIndexer(SyncNodeId id, void* arg, SyncIndex* index) { + SSyncRaftProgressTracker* tracker = (SSyncRaftProgressTracker*)arg; + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&tracker->progressMap, id); + if (progress == NULL) { + *index = 0; + return; + } + *index = progress->matchIndex; +} + +// Committed returns the largest log index known to be committed based on what +// the voting members of the group have acknowledged. 
+SyncIndex syncRaftCommittedIndex(SSyncRaftProgressTracker* tracker) { + return syncRaftJointConfigCommittedIndex(&tracker->config.voters, matchAckIndexer, tracker); +} + +static void visitProgressActive(SSyncRaftProgress* progress, void* arg) { + SHashObj* votesMap = (SHashObj*)arg; + taosHashPut(votesMap, &progress->id, sizeof(SyncNodeId), &progress->recentActive, sizeof(bool)); +} + +// QuorumActive returns true if the quorum is active from the view of the local +// raft state machine. Otherwise, it returns false. +bool syncRaftQuorumActive(SSyncRaftProgressTracker* tracker) { + SHashObj* votesMap = taosHashInit(TSDB_MAX_REPLICA, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_ENTRY_LOCK); + syncRaftVisitProgressMap(&tracker->progressMap, visitProgressActive, votesMap); + + return syncRaftVoteResult(&tracker->config.voters, votesMap) == SYNC_RAFT_VOTE_WON; } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_joint.c b/source/libs/sync/src/sync_raft_quorum_joint.c index 500bd908c0..70c078b6f5 100644 --- a/source/libs/sync/src/sync_raft_quorum_joint.c +++ b/source/libs/sync/src/sync_raft_quorum_joint.c @@ -59,8 +59,17 @@ void syncRaftJointConfigRemoveFromIncoming(SSyncRaftQuorumJointConfig* config, S syncRaftRemoveFromNodeMap(&config->incoming, id); } -void syncRaftJointConfigIDS(const SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) { +void syncRaftJointConfigIDs(SSyncRaftQuorumJointConfig* config, SSyncRaftNodeMap* nodeMap) { syncRaftCopyNodeMap(&config->incoming, nodeMap); syncRaftUnionNodeMap(&config->outgoing, nodeMap); +} + +SyncIndex syncRaftJointConfigCommittedIndex(const SSyncRaftQuorumJointConfig* config, matchAckIndexerFp indexer, void* arg) { + SyncIndex index0, index1; + + index0 = syncRaftMajorityConfigCommittedIndex(&config->incoming, indexer, arg); + index1 = syncRaftMajorityConfigCommittedIndex(&config->outgoing, indexer, arg); + + return index0 < index1 ? 
index0 : index1; } \ No newline at end of file diff --git a/source/libs/sync/src/sync_raft_quorum_majority.c b/source/libs/sync/src/sync_raft_quorum_majority.c index ff5ba64876..313f213cda 100644 --- a/source/libs/sync/src/sync_raft_quorum_majority.c +++ b/source/libs/sync/src/sync_raft_quorum_majority.c @@ -13,6 +13,7 @@ * along with this program. If not, see . */ +#include "sync_const.h" #include "sync_raft_quorum.h" #include "sync_raft_quorum_majority.h" #include "sync_raft_node_map.h" @@ -34,13 +35,13 @@ ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashOb i = g = r = missing = 0; SyncNodeId* pId = NULL; while (!syncRaftIterateNodeMap(config, pId)) { - const ESyncRaftVoteType* pType = taosHashGet(votesMap, pId, sizeof(SyncNodeId*)); - if (pType == NULL) { + const bool* v = (const bool*)taosHashGet(votesMap, pId, sizeof(SyncNodeId*)); + if (v == NULL) { missing += 1; continue; } - if (*pType == SYNC_RAFT_VOTE_RESP_GRANT) { + if (*v) { g +=1; } else { r += 1; @@ -56,4 +57,65 @@ ESyncRaftVoteResult syncRaftMajorityVoteResult(SSyncRaftNodeMap* config, SHashOb } return SYNC_RAFT_VOTE_LOST; +} + +int compSyncIndex(const void * elem1, const void * elem2) { + SyncIndex index1 = *((SyncIndex*)elem1); + SyncIndex index2 = *((SyncIndex*)elem1); + if (index1 > index2) return 1; + if (index1 < index2) return -1; + return 0; +} + +SyncIndex syncRaftMajorityConfigCommittedIndex(const SSyncRaftNodeMap* config, matchAckIndexerFp indexer, void* arg) { + int n = syncRaftNodeMapSize(config); + if (n == 0) { + // This plays well with joint quorums which, when one half is the zero + // MajorityConfig, should behave like the other half. + return kMaxCommitIndex; + } + + // Use an on-stack slice to collect the committed indexes when n <= 7 + // (otherwise we alloc). The alternative is to stash a slice on + // MajorityConfig, but this impairs usability (as is, MajorityConfig is just + // a map, and that's nice). 
The assumption is that running with a + // replication factor of >7 is rare, and in cases in which it happens + // performance is a lesser concern (additionally the performance + // implications of an allocation here are far from drastic). + SyncIndex* srt = NULL; + SyncIndex srk[TSDB_MAX_REPLICA]; + if (n > TSDB_MAX_REPLICA) { + srt = (SyncIndex*)malloc(sizeof(SyncIndex) * n); + if (srt == NULL) { + return kMaxCommitIndex; + } + } else { + srt = &srk[0]; + } + + // Fill the slice with the indexes observed. Any unused slots will be + // left as zero; these correspond to voters that may report in, but + // haven't yet. We fill from the right (since the zeroes will end up on + // the left after sorting below anyway). + SyncNodeId *pId = NULL; + int i = 0; + SyncIndex index; + while (!syncRaftIterateNodeMap(config, pId)) { + indexer(*pId, arg, &index); + srt[i++] = index; + } + + // Sort by index. Use a bespoke algorithm (copied from the stdlib's sort + // package) to keep srt on the stack. + qsort(srt, n, sizeof(SyncIndex), compSyncIndex); + + // The smallest index into the array for which the value is acked by a + // quorum. In other words, from the end of the slice, move n/2+1 to the + // left (accounting for zero-indexing). 
+ index = srt[n - (n/2 + 1)]; + if (srt != &srk[0]) { + free(srt); + } + + return index; } \ No newline at end of file From 68e6b82a6692533838a95f20d3bfe1ec82746f70 Mon Sep 17 00:00:00 2001 From: lichuang Date: Fri, 19 Nov 2021 11:31:54 +0800 Subject: [PATCH 15/21] [TD-10645][raft]add vote resp process --- source/libs/sync/inc/raft_replication.h | 4 +- source/libs/sync/inc/sync_raft_impl.h | 2 + source/libs/sync/src/raft.c | 2 +- .../libs/sync/src/raft_handle_vote_message.c | 16 +++---- .../sync/src/raft_handle_vote_resp_message.c | 6 ++- source/libs/sync/src/raft_replication.c | 4 +- source/libs/sync/src/sync_raft_election.c | 12 +++--- source/libs/sync/src/sync_raft_impl.c | 42 +++++++++++++++---- 8 files changed, 59 insertions(+), 29 deletions(-) diff --git a/source/libs/sync/inc/raft_replication.h b/source/libs/sync/inc/raft_replication.h index d0e55ef10e..180a2db61f 100644 --- a/source/libs/sync/inc/raft_replication.h +++ b/source/libs/sync/inc/raft_replication.h @@ -20,11 +20,11 @@ #include "syncInt.h" #include "sync_type.h" -// syncRaftReplicate sends an append RPC with new entries to the given peer, +// syncRaftMaybeSendAppend sends an append RPC with new entries to the given peer, // if necessary. Returns true if a message was sent. The sendIfEmpty // argument controls whether messages with no entries will be sent // ("empty" messages are useful to convey updated Commit indexes, but // are undesirable when we're sending multiple messages in a batch). 
-bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty); +bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty); #endif /* TD_SYNC_RAFT_REPLICATION_H */ diff --git a/source/libs/sync/inc/sync_raft_impl.h b/source/libs/sync/inc/sync_raft_impl.h index a8615f17eb..1a6c13f65f 100644 --- a/source/libs/sync/inc/sync_raft_impl.h +++ b/source/libs/sync/inc/sync_raft_impl.h @@ -28,6 +28,8 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft); void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType); +void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType); + void syncRaftTriggerHeartbeat(SSyncRaft* pRaft); void syncRaftRandomizedElectionTimeout(SSyncRaft* pRaft); diff --git a/source/libs/sync/src/raft.c b/source/libs/sync/src/raft.c index 3b4c9e5f36..72b0d268a8 100644 --- a/source/libs/sync/src/raft.c +++ b/source/libs/sync/src/raft.c @@ -169,7 +169,7 @@ static int deserializeClusterStateFromBuffer(SSyncConfigState* cluster, const ch } static void visitProgressMaybeSendAppend(SSyncRaftProgress* progress, void* arg) { - syncRaftReplicate(arg, progress, false); + syncRaftMaybeSendAppend(arg, progress, false); } // switchToConfig reconfigures this node to use the provided configuration. 
It diff --git a/source/libs/sync/src/raft_handle_vote_message.c b/source/libs/sync/src/raft_handle_vote_message.c index 9997c5226d..0219e39df9 100644 --- a/source/libs/sync/src/raft_handle_vote_message.c +++ b/source/libs/sync/src/raft_handle_vote_message.c @@ -48,12 +48,14 @@ int syncRaftHandleVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { } static bool canGrantVoteMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { - if (!(pRaft->voteFor == SYNC_NON_NODE_ID || pMsg->term > pRaft->term || pRaft->voteFor == pMsg->from)) { - return false; - } - if (!syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm)) { - return false; - } + bool canVote = + // We can vote if this is a repeat of a vote we've already cast... + pRaft->voteFor == pMsg->from || + // ...we haven't voted and we don't think there's a leader yet in this term... + (pRaft->voteFor == SYNC_NON_NODE_ID && pRaft->leaderId == SYNC_NON_NODE_ID) || + // ...or this is a PreVote for a future term... + (pMsg->vote.cType == SYNC_RAFT_CAMPAIGN_PRE_ELECTION && pMsg->term > pRaft->term); - return true; + // ...and we believe the candidate is up to date. 
+ return canVote && syncRaftLogIsUptodate(pRaft->log, pMsg->vote.lastIndex, pMsg->vote.lastTerm); } \ No newline at end of file diff --git a/source/libs/sync/src/raft_handle_vote_resp_message.c b/source/libs/sync/src/raft_handle_vote_resp_message.c index 744d654cc5..87a5cfcd15 100644 --- a/source/libs/sync/src/raft_handle_vote_resp_message.c +++ b/source/libs/sync/src/raft_handle_vote_resp_message.c @@ -45,12 +45,14 @@ int syncRaftHandleVoteRespMessage(SSyncRaft* pRaft, const SSyncMessage* pMsg) { if (result == SYNC_RAFT_VOTE_WON) { if (pRaft->candidateState.inPreVote) { - syncRaftStartElection(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); + syncRaftCampaign(pRaft, SYNC_RAFT_CAMPAIGN_ELECTION); } else { syncRaftBecomeLeader(pRaft); - + syncRaftBroadcastAppend(pRaft); } } else if (result == SYNC_RAFT_VOTE_LOST) { + // pb.MsgPreVoteResp contains future term of pre-candidate + // m.Term > r.Term; reuse r.Term syncRaftBecomeFollower(pRaft, pRaft->term, SYNC_NON_NODE_ID); } diff --git a/source/libs/sync/src/raft_replication.c b/source/libs/sync/src/raft_replication.c index 228d8195f6..c8c2d2c379 100644 --- a/source/libs/sync/src/raft_replication.c +++ b/source/libs/sync/src/raft_replication.c @@ -24,12 +24,12 @@ static bool sendAppendEntries(SSyncRaft* pRaft, SSyncRaftProgress* progress, SyncIndex prevIndex, SyncTerm prevTerm, SSyncRaftEntry *entries, int nEntry); -// syncRaftReplicate sends an append RPC with new entries to the given peer, +// maybeSendAppend sends an append RPC with new entries to the given peer, // if necessary. Returns true if a message was sent. The sendIfEmpty // argument controls whether messages with no entries will be sent // ("empty" messages are useful to convey updated Commit indexes, but // are undesirable when we're sending multiple messages in a batch). 
-bool syncRaftReplicate(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) { +bool syncRaftMaybeSendAppend(SSyncRaft* pRaft, SSyncRaftProgress* progress, bool sendIfEmpty) { assert(pRaft->state == TAOS_SYNC_STATE_LEADER); SyncNodeId nodeId = progress->id; diff --git a/source/libs/sync/src/sync_raft_election.c b/source/libs/sync/src/sync_raft_election.c index d961978be2..fe2e0fd9d3 100644 --- a/source/libs/sync/src/sync_raft_election.c +++ b/source/libs/sync/src/sync_raft_election.c @@ -19,8 +19,6 @@ #include "raft_message.h" #include "sync_raft_progress_tracker.h" -static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType); - void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { if (pRaft->state == TAOS_SYNC_STATE_LEADER) { syncDebug("[%d:%d] ignoring RAFT_MSG_INTERNAL_ELECTION because already leader", pRaft->selfGroupId, pRaft->selfId); @@ -28,7 +26,7 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { } if (!syncRaftIsPromotable(pRaft)) { - syncWarn("[%d:%d] is unpromotable and can not campaign", pRaft->selfGroupId, pRaft->selfId); + syncWarn("[%d:%d] is unpromotable and can not syncRaftCampaign", pRaft->selfGroupId, pRaft->selfId); return; } @@ -41,17 +39,17 @@ void syncRaftStartElection(SSyncRaft* pRaft, ESyncRaftElectionType cType) { syncInfo("[%d:%d] is starting a new election at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); - campaign(pRaft, cType); + syncRaftCampaign(pRaft, cType); } -// campaign transitions the raft instance to candidate state. This must only be +// syncRaftCampaign transitions the raft instance to candidate state. This must only be // called after verifying that this is a legitimate transition. 
-static void campaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { +void syncRaftCampaign(SSyncRaft* pRaft, ESyncRaftElectionType cType) { bool preVote; SyncTerm term; if (syncRaftIsPromotable(pRaft)) { - syncDebug("[%d:%d] is unpromotable; campaign() should have been called", pRaft->selfGroupId, pRaft->selfId); + syncDebug("[%d:%d] is unpromotable; syncRaftCampaign() should have been called", pRaft->selfGroupId, pRaft->selfId); return; } diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c index 2093bcb046..4d8222e826 100644 --- a/source/libs/sync/src/sync_raft_impl.c +++ b/source/libs/sync/src/sync_raft_impl.c @@ -25,6 +25,8 @@ static int stepFollower(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg); static int stepLeader(SSyncRaft* pRaft, const SSyncMessage* pMsg); +static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n); + static int triggerAll(SSyncRaft* pRaft); static void tickElection(SSyncRaft* pRaft); @@ -82,13 +84,22 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) { resetRaft(pRaft, pRaft->term); pRaft->leaderId = pRaft->leaderId; pRaft->state = TAOS_SYNC_STATE_LEADER; - // TODO: check if there is pending config log - int nPendingConf = syncRaftLogNumOfPendingConf(pRaft->log); - if (nPendingConf > 1) { - syncFatal("unexpected multiple uncommitted config entry"); - } - syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); + SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId); + assert(progress != NULL); + // Followers enter replicate mode when they've been successfully probed + // (perhaps after having received a snapshot as a result). The leader is + // trivially in this state. Note that r.reset() has initialized this + // progress with the last index already. 
+ syncRaftProgressBecomeReplicate(progress); + + // Conservatively set the pendingConfIndex to the last index in the + // log. There may or may not be a pending config change, but it's + // safe to delay any future proposals until we commit all our + // pending log entries, and scanning the entire tail of the log + // could be expensive. + SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); + pRaft->pendingConfigIndex = lastIndex; // after become leader, send a no-op log SSyncRaftEntry* entry = (SSyncRaftEntry*)malloc(sizeof(SSyncRaftEntry)); @@ -103,6 +114,7 @@ void syncRaftBecomeLeader(SSyncRaft* pRaft) { }; appendEntries(pRaft, entry, 1); //syncRaftTriggerHeartbeat(pRaft); + syncInfo("[%d:%d] became leader at term %" PRId64 "", pRaft->selfGroupId, pRaft->selfId, pRaft->term); } void syncRaftTriggerHeartbeat(SSyncRaft* pRaft) { @@ -192,9 +204,11 @@ static void visitProgressSendAppend(SSyncRaftProgress* progress, void* arg) { return; } - syncRaftReplicate(arg, progress, true); + syncRaftMaybeSendAppend(arg, progress, true); } +// bcastAppend sends RPC, with entries to all peers that are not up-to-date +// according to the progress recorded in r.prs. void syncRaftBroadcastAppend(SSyncRaft* pRaft) { syncRaftProgressVisit(pRaft->tracker, visitProgressSendAppend, pRaft); } @@ -267,6 +281,11 @@ static void tickHeartbeat(SSyncRaft* pRaft) { } +// TODO +static bool increaseUncommittedSize(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { + return false; +} + static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { SyncIndex lastIndex = syncRaftLogLastIndex(pRaft->log); SyncTerm term = pRaft->term; @@ -277,9 +296,16 @@ static void appendEntries(SSyncRaft* pRaft, SSyncRaftEntry* entries, int n) { entries[i].index = lastIndex + 1 + i; } + // Track the size of this uncommitted proposal. + if (!increaseUncommittedSize(pRaft, entries, n)) { + // Drop the proposal. 
+ return; + } + syncRaftLogAppend(pRaft->log, entries, n); SSyncRaftProgress* progress = syncRaftFindProgressByNodeId(&pRaft->tracker->progressMap, pRaft->selfId); + assert(progress != NULL); syncRaftProgressMaybeUpdate(progress, lastIndex); // Regardless of syncRaftMaybeCommit's return, our caller will call bcastAppend. syncRaftMaybeCommit(pRaft); @@ -306,7 +332,7 @@ static int triggerAll(SSyncRaft* pRaft) { continue; } - syncRaftReplicate(pRaft, pRaft->tracker->progressMap.progress[i], true); + syncRaftMaybeSendAppend(pRaft, pRaft->tracker->progressMap.progress[i], true); } #endif return 0; From 60e339b31df337d3b09f4d23c99ab09b97820f62 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 20 Nov 2021 13:13:21 +0800 Subject: [PATCH 16/21] fst core struct --- source/libs/index/inc/index_fst.h | 158 ++++++----- source/libs/index/inc/index_fst_automation.h | 42 +++ source/libs/index/inc/index_fst_node.h | 22 ++ source/libs/index/inc/index_fst_registry.h | 24 ++ source/libs/index/inc/index_fst_util.h | 82 ++++++ source/libs/index/src/index_fst.c | 274 ++++++++++++++++++- source/libs/index/src/index_fst_automation.c | 14 + source/libs/index/src/index_fst_common.c | 6 +- source/libs/index/src/index_fst_node.c | 15 + source/libs/index/src/index_fst_registry.c | 17 ++ source/libs/index/src/index_fst_util.c | 115 ++++++++ 11 files changed, 683 insertions(+), 86 deletions(-) create mode 100644 source/libs/index/inc/index_fst_automation.h create mode 100644 source/libs/index/inc/index_fst_node.h create mode 100644 source/libs/index/inc/index_fst_registry.h create mode 100644 source/libs/index/inc/index_fst_util.h create mode 100644 source/libs/index/src/index_fst_automation.c create mode 100644 source/libs/index/src/index_fst_node.c create mode 100644 source/libs/index/src/index_fst_registry.c create mode 100644 source/libs/index/src/index_fst_util.c diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h index de4c957e29..61c857ed74 100644 --- 
a/source/libs/index/inc/index_fst.h +++ b/source/libs/index/inc/index_fst.h @@ -13,58 +13,73 @@ * along with this program. If not, see . */ -#ifndef _INDEX_FST_H_ -#define _INDEX_FST_H_ -#include "index_fst.h" +#ifndef __INDEX_FST_H__ +#define __INDEX_FST_H__ + + #include "tarray.h" - -typedef FstType uint64_t; -typedef CompiledAddr uint64_t; -typedef Output uint64_t; -typedef PackSizes uint8_t; +#include "index_fst_util.h" +#include "index_fst_registry.h" -//A sentinel value used to indicate an empty final state -const CompileAddr EMPTY_ADDRESS = 0; -/// A sentinel value used to indicate an invalid state. -const CompileAddr NONE_ADDRESS = 1; +typedef struct FstNode FstNode; +#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a) -// This version number is written to every finite state transducer created by -// this crate. When a finite state transducer is read, its version number is -// checked against this value. -const uint64_t version = 3; -// The threshold (in number of transitions) at which an index is created for -// a node's transitions. This speeds up lookup time at the expense of FST size - -const uint64_t TRANS_INDEX_THRESHOLD = 32; typedef struct FstRange { uint64_t start; uint64_t end; } FstRange; -enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal}; -enum FstBound { Included, Excluded, Unbounded}; -typedef struct CheckSummer { - uint32_t sum; -}; +typedef struct FstBuilderNode { + bool isFinal; + Output finalOutput; + SArray *trans; // +} FstBuilderNode; + +typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State; +typedef enum { Included, Excluded, Unbounded} FstBound; + +typedef uint32_t CheckSummer; -typedef struct FstBuilder { - FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. - FstUnFinishedNodes unfinished // The stack of unfinished nodes - Registry registry // A map of finished nodes. 
- SArray* last // The last word added - CompiledAddr lastAddr // The address of the last compiled node - uint64_t len // num of keys added -} FstBuilder; +/* + * + * UnFinished node and helper function + * TODO: simple function name + */ +typedef struct FstUnFinishedNodes { + SArray *stack; // } FstUnFinishedNodes; +} FstUnFinishedNodes; + +#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize(nodes->stack) + +FstUnFinishedNodes *FstUnFinishedNodesCreate(); +void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal); +FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes); +FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr); +FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes); +void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out); +void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr); +void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out); +uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs); +uint64_t FstUnFinishedNodesFindCommPreifxAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out); typedef struct FstCountingWriter { void* wtr; // wrap any writer that counts and checksum bytes written uint64_t count; CheckSummer summer; -}; +} FstCountingWriter; + +typedef struct FstBuilder { + FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. + FstUnFinishedNodes *unfinished; // The stack of unfinished nodes + FstRegistry registry; // A map of finished nodes. 
+ SArray* last; // The last word added + CompiledAddr lastAddr; // The address of the last compiled node + uint64_t len; // num of keys added +} FstBuilder; @@ -80,16 +95,6 @@ typedef struct FstTransitions { FstRange range; } FstTransitions; -typedef struct FstUnFinishedNodes { - SArray *stack; // -} FstUnFinishedNodes; - -typedef struct FstBuilderNode { - bool isFinal; - Output finalOutput; - SArray *trans; // -} FstBuilderNode; - typedef struct FstLastTransition { @@ -97,13 +102,23 @@ typedef struct FstLastTransition { Output out; } FstLastTransition; +/* + * FstBuilderNodeUnfinished and helper function + * TODO: simple function name + */ typedef struct FstBuilderNodeUnfinished { - FstBuilderNode node; - FstLastTransition last; + FstBuilderNode *node; + FstLastTransition* last; } FstBuilderNodeUnfinished; +void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr); +void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, CompiledAddr addr); + +/* + * FstNode and helper function + */ typedef struct FstNode { - uint8_t* data; + FstSlice data; uint64_t version; State state; CompiledAddr start; @@ -114,6 +129,28 @@ typedef struct FstNode { Output finalOutput; } FstNode; +// If this node is final and has a terminal output value, then it is, returned. Otherwise, a zero output is returned +#define FST_NODE_FINAL_OUTPUT(node) node->finalOutput +// Returns true if and only if this node corresponds to a final or "match", state in the finite state transducer. +#define FST_NODE_IS_FINAL(node) node->isFinal +// Returns the number of transitions in this node, The maximum number of transitions is 256. +#define FST_NODE_LEN(node) node->nTrans +// Returns true if and only if this node has zero transitions. +#define FST_NODE_IS_EMPTYE(node) (node->nTrans == 0) +// Return the address of this node. 
+#define FST_NODE_ADDR(node) node->start + +FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data); +FstTransitions fstNodeTransitionIter(FstNode *node); +FstTransitions* fstNodeTransitions(FstNode *node); +bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res); +bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res); +bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res); +bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode); +FstSlice fstNodeAsSlice(FstNode *node); + + + typedef struct FstMeta { uint64_t version; CompiledAddr rootAddr; @@ -125,42 +162,21 @@ typedef struct FstMeta { typedef struct Fst { FstMeta meta; void *data; // -}; +} Fst; -// ops +// ops typedef struct FstIndexedValue { uint64_t index; uint64_t value; -}; +} FstIndexedValue; -// relate to Regist -typedef struct FstRegistry { - SArray *table; // - uint64_t tableSize; // num of rows - uint64_t mruSize; // num of columns -} FstRegistry; - -typedef struct FstRegistryCache { - SArray *cells; // -} FstRegistryCache; typedef struct FstRegistryCell { CompiledAddr addr; FstBuilderNode *node; } FstRegistryCell; -enum FstRegistryEntry {Found, NotFound, Rejected}; - -FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data); -FstTransitions fstNodeTransitionIter(FstNode *node); -FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i); -CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i); -int64_t fstNodeFindInput(FstNode *node, int8_t b); -Output fstNodeGetFinalOutput(FstNode *node); -void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode); - - #endif diff --git a/source/libs/index/inc/index_fst_automation.h b/source/libs/index/inc/index_fst_automation.h new file mode 100644 index 0000000000..7ad9a500cc --- /dev/null +++ b/source/libs/index/inc/index_fst_automation.h @@ -0,0 
+1,42 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#ifndef __INDEX_FST_AUTAOMATION_H__ +#define __INDEX_FST_AUTAOMATION_H__ + +struct AutomationCtx; + +typedef struct StartWith { + AutomationCtx *autoSelf; +} StartWith; + +typedef struct Complement { + AutomationCtx *autoSelf; +} Complement; + +// automation +typedef struct AutomationCtx { + void *data; +} AutomationCtx; + +// automation interface +void (*start)(AutomationCtx *ctx); +bool (*isMatch)(AutomationCtx *ctx); +bool (*canMatch)(AutomationCtx *ctx, void *data); +bool (*willAlwaysMatch)(AutomationCtx *ctx, void *state); +void* (*accpet)(AutomationCtx *ctx, void *state, uint8_t byte); +void* (*accpetEof)(AutomationCtx *ctx, *state); + + +#endif diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h new file mode 100644 index 0000000000..ba2d2ccd02 --- /dev/null +++ b/source/libs/index/inc/index_fst_node.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef __INDEX_FST_NODE_H__ +#define __INDEX_FST_NODE_H__ + + + + +#endif diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h new file mode 100644 index 0000000000..6dcb236f29 --- /dev/null +++ b/source/libs/index/inc/index_fst_registry.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#ifndef __FST_REGISTRY_H__ +#define __FST_REGISTRY_H__ + +#include "index_fst_util.h" + + +typedef struct FstRegistry { + +} FstRegistry; +#endif diff --git a/source/libs/index/inc/index_fst_util.h b/source/libs/index/inc/index_fst_util.h new file mode 100644 index 0000000000..fc7dd44637 --- /dev/null +++ b/source/libs/index/inc/index_fst_util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + + +#ifndef __INDEX_FST_UTIL_H__ +#define __INDEX_FST_UTIL_H__ + +#include "tarray.h" + + +typedef uint64_t FstType; +typedef uint64_t CompiledAddr; +typedef uint64_t Output; +typedef uint8_t PackSizes; + + +//A sentinel value used to indicate an empty final state +extern const CompiledAddr EMPTY_ADDRESS; +/// A sentinel value used to indicate an invalid state. +extern const CompiledAddr NONE_ADDRESS; + +// This version number is written to every finite state transducer created by +// this crate. When a finite state transducer is read, its version number is +// checked against this value. +extern const uint64_t version; +// The threshold (in number of transitions) at which an index is created for +// a node's transitions. This speeds up lookup time at the expense of FST size + +extern const uint64_t TRANS_INDEX_THRESHOLD; +// high 4 bits is transition address packed size. +// low 4 bits is output value packed size. +// +// `0` is a legal value which means there are no transitions/outputs + +#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do {v = (v & 0b00001111) | (sz << 4} while(0) +#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0b11110000) >> 4) +#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { v = (v & 0b11110000) | sz } while(0) +#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0b00001111) + +#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1] + +#define COMMON_INDEX(v, max, val) do { \ + val = ((uint16_t)COMMON_INPUTS[v] + 1)%256; \ + val = val > max ? 
0: val; \ +} while(0) + + +//uint8_t commonInput(uint8_t idx); +//uint8_t commonIdx(uint8_t v, uint8_t max); + +uint8_t packSize(uint64_t n); +uint64_t unpackUint64(uint8_t *ch, uint8_t sz); +uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr); +CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr); + + + +typedef struct FstSlice { + uint8_t *data; + uint64_t dLen; + uint32_t start; + uint32_t end; +} FstSlice; + +FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end); +FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen); +bool fstSliceEmpty(FstSlice *slice); + + +#endif diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c index 4c6e20a7d5..2974e7f9b5 100644 --- a/source/libs/index/src/index_fst.c +++ b/source/libs/index/src/index_fst.c @@ -15,13 +15,143 @@ #include "index_fst.h" + +FstUnFinishedNodes *fstUnFinishedNodesCreate() { + FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes)); + if (nodes == NULL) { return NULL; } + + nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished)); + fstUnFinishedNodesPushEmpty(nodes, false); + return nodes; +} +void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) { + FstBuilderNode *node = malloc(sizeof(FstBuilderNode)); + node->isFinal = isFinal; + node->finalOutput = 0; + node->trans = NULL; + + FstBuilderNodeUnfinished un = {.node = node, .last = NULL}; + taosArrayPush(nodes->stack, &un); + +} +FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) { + assert(taosArrayGetSize(nodes->stack) == 1); + + FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack); + assert(un->last == NULL); + return un->node; +} + +FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) { + FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack); + fstBuilderNodeUnfinishedLastCompiled(un, addr); + free(un->last); // TODO add func FstLastTransitionFree() + return 
un->node; +} + +FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) { + FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack); + assert(un->last == NULL); + return un->node; + +} +void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) { + FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, 0); + un->node->isFinal = true; + un->node->finalOutput = out; + //un->node->trans = NULL; +} +void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) { + size_t sz = taosArrayGetSize(nodes->stack) - 1; + FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz); + fstBuilderNodeUnfinishedLastCompiled(un, addr); +} +void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) { + FstSlice *s = &bs; + if (s->data == NULL || s->dLen == 0 || s->start > s->end) { + return; + } + size_t sz = taosArrayGetSize(nodes->stack) - 1; + FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz); + assert(un->last == NULL); + + + FstLastTransition *trn = malloc(sizeof(FstLastTransition)); + trn->inp = s->data[s->start]; + trn->out = out; + + un->last = trn; + + for (uint64_t i = s->start; i <= s->end; i++) { + FstBuilderNode *n = malloc(sizeof(FstBuilderNode)); + n->isFinal = false; + n->finalOutput = 0; + n->trans = NULL; + + FstLastTransition *trn = malloc(sizeof(FstLastTransition)); + trn->inp = s->data[i]; + trn->out = out; + + FstBuilderNodeUnfinished un = {.node = n, .last = trn}; + taosArrayPush(nodes->stack, &un); + } + fstUnFinishedNodesPushEmpty(nodes, true); +} + + +uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) { + FstSlice *s = &bs; + + size_t lsz = (size_t)(s->end - s->start + 1); // data len + size_t ssz = taosArrayGetSize(node->stack); // stack size + + uint64_t count = 0; + for (size_t i = 0; i < ssz && i < lsz; i++) { + FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i); + if (un->last->inp == s->data[s->start + i]) { + count++; 
+ } else { + break; + } + } + return count; +} +uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) { + FstSlice *s = &bs; + + size_t lsz = (size_t)(s->end - s->start + 1); // data len + size_t ssz = taosArrayGetSize(node->stack); // stack size + + uint64_t res = 0; + for (size_t i = 0; i < lsz && i < ssz; i++) { + FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i); + + FstLastTransition *last = un->last; + if (last->inp == s->data[s->start + i]) { + uint64_t commPrefix = last->out; + uint64_t addPrefix = last->out - commPrefix; + out = out - commPrefix; + last->out = commPrefix; + if (addPrefix != 0) { + fstBuilderNodeUnfinishedAddOutputPrefix(un, addPrefix); + } + } else { + break; + } + } + return res; +} + // fst node function -FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) { + + + +FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) { FstNode *n = (FstNode *)malloc(sizeof(FstNode)); if (n == NULL) { return NULL; } if (addr == EMPTY_ADDRESS) { - n->date = NULL; + n->data = fstSliceCreate(NULL, 0); n->version = version; n->state = EmptyFinal; n->start = EMPTY_ADDRESS; @@ -29,20 +159,138 @@ FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) { n->isFinal = true; n->nTrans = 0; n->sizes = 0; - n->finalOutpu = 0; - return n; + n->finalOutput = 0; + } + uint8_t v = slice->data[addr]; + uint8_t s = (v & 0b11000000) >> 6; + if (s == 0b11) { // oneTransNext + n->data = fstSliceCopy(slice, 0, addr); + n->version = version; + n->state = OneTransNext; + n->start = addr; + n->end = addr; //? 
s.end_addr(data); + n->isFinal = false; + n->sizes = 0; + n->nTrans = 0; + n->finalOutput = 0; + } else if (v == 0b10) { // oneTrans + uint64_t sz; // fetch sz from addr + n->data = fstSliceCopy(slice, 0, addr); + n->version = version; + n->state = OneTrans; + n->start = addr; + n->end = addr; // s.end_addr(data, sz); + n->isFinal = false; + n->nTrans = 1; + n->sizes = sz; + n->finalOutput = 0; + } else { // anyTrans + uint64_t sz; // s.sizes(data) + uint32_t nTrans; // s.ntrans(data) + n->data = *slice; + n->version = version; + n->state = AnyTrans; + n->start = addr; + n->end = addr; // s.end_addr(version, data, sz, ntrans); + n->isFinal = false; // s.is_final_state(); + n->nTrans = nTrans; + n->sizes = sz; + n->finalOutput = 0; // s.final_output(version, data, sz, ntrans); } - uint8_t v = (data[addr] & 0b1100000) >> 6; - if (v == 0b11) { - - } else if (v == 0b10) { - - } else { + return n; +} +FstTransitions* fstNodeTransitions(FstNode *node) { + FstTransitions *t = malloc(sizeof(FstTransitions)); + if (NULL == t) { + return NULL; + } + FstRange range = {.start = 0, .end = FST_NODE_LEN(node)}; + t->node = node; + t->range = range; + return t; +} +bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) { + bool s = true; + if (node->state == OneTransNext) { - } - + } else if (node->state == OneTrans) { + + } else if (node->state == AnyTrans) { + + } else { + s = false; + } + return s; +} + +bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) { + bool s = true; + if (node->state == OneTransNext) { + + } else if (node->state == OneTrans) { + + } else if (node->state == AnyTrans) { + + } else if (node->state == EmptyFinal){ + s = false; + } + return s; +} + +bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) { + bool s = true; + uint8_t input; // s.input + if (node->state == OneTransNext) { + if (b == input) { *res = 0; } + else { return s ; } + } else if (node->state == OneTrans) { + if (b == input) { *res = 
0; } + else {return s;} + } else if (node->state == AnyTrans) { + + } else if (node->state == EmptyFinal) { + s = false; + } + return s; +} + +bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) { + size_t sz = taosArrayGetSize(builderNode->trans); + assert(sz < 256); + if (sz == 0 && builderNode->isFinal && builderNode->finalOutput == 0) { + return true; + } else if (sz != 1 || builderNode->isFinal) { + // AnyTrans->Compile(w, addr, node); + } else { + FstTransition *tran = taosArrayGet(builderNode->trans, 0); + if (tran->addr == lastAddr && tran->out == 0) { + //OneTransNext::compile(w, lastAddr, tran->inp); + return true; + } else { + //OneTrans::Compile(w, lastAddr, *tran); + return true; + } + } + return true; +} + + + + +FstBuilder *fstBuilderCreate(void *w, FstType ty) { + FstBuilder *b = malloc(sizeof(FstBuilder)); + if (NULL == b) { return b; } + + FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0}; + b->wtr = wtr; + b->unfinished = malloc(sizeof(FstUnFinishedNodes)); + return b; } +FstSlice fstNodeAsSlice(FstNode *node) { + FstSlice *slice = &node->data; + FstSlice s = fstSliceCopy(slice, slice->end, slice->dLen - 1); + return s; +} - diff --git a/source/libs/index/src/index_fst_automation.c b/source/libs/index/src/index_fst_automation.c new file mode 100644 index 0000000000..f2f48bbc8a --- /dev/null +++ b/source/libs/index/src/index_fst_automation.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c index 39e5f89b35..4ab78cddc5 100644 --- a/source/libs/index/src/index_fst_common.c +++ b/source/libs/index/src/index_fst_common.c @@ -12,6 +12,8 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ + +#include "tutil.h" const uint8_t COMMON_INPUTS[] = { 84, // '\x00' 85, // '\x01' @@ -271,7 +273,7 @@ const uint8_t COMMON_INPUTS[] = { 255, // 'ÿ' }; -char const COMMON_INPUTS_INV[] = [ +char const COMMON_INPUTS_INV[] = { 't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w', '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=', ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6', @@ -300,5 +302,5 @@ char const COMMON_INPUTS_INV[] = [ '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', -]; +}; diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c new file mode 100644 index 0000000000..3d5efd30f3 --- /dev/null +++ b/source/libs/index/src/index_fst_node.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + diff --git a/source/libs/index/src/index_fst_registry.c b/source/libs/index/src/index_fst_registry.c new file mode 100644 index 0000000000..940c5863f4 --- /dev/null +++ b/source/libs/index/src/index_fst_registry.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "index_fst_registry.h" + diff --git a/source/libs/index/src/index_fst_util.c b/source/libs/index/src/index_fst_util.c new file mode 100644 index 0000000000..20751baf5f --- /dev/null +++ b/source/libs/index/src/index_fst_util.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#include "index_fst_util.h" + + + +//A sentinel value used to indicate an empty final state +const CompiledAddr EMPTY_ADDRESS = 0; +/// A sentinel value used to indicate an invalid state. 
+const CompiledAddr NONE_ADDRESS = 1; + +// This version number is written to every finite state transducer created by +// this crate. When a finite state transducer is read, its version number is +// checked against this value. +const uint64_t version = 3; +// The threshold (in number of transitions) at which an index is created for +// a node's transitions. This speeds up lookup time at the expense of FST size + +const uint64_t TRANS_INDEX_THRESHOLD = 32; + + +//uint8_t commonInput(uint8_t idx) { +// if (idx == 0) { return -1; } +// else { +// return COMMON_INPUTS_INV[idx - 1]; +// } +//} +// +//uint8_t commonIdx(uint8_t v, uint8_t max) { +// uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256; +// return v > max ? 0: v; +//} + + + +uint8_t packSize(uint64_t n) { + if (n < (1u << 8)) { + return 1; + } else if (n < (1u << 16)) { + return 2; + } else if (n < (1u << 24)) { + return 3; + } else if (n < ((uint64_t)(1) << 32)) { + return 4; + } else if (n < ((uint64_t)(1) << 40)) { + return 5; + } else if (n < ((uint64_t)(1) << 48)) { + return 6; + } else if (n < ((uint64_t)(1) << 56)) { + return 7; + } else { + return 8; + } +} + +uint64_t unpackUint64(uint8_t *ch, uint8_t sz) { + uint64_t n; + for (uint8_t i = 0; i < sz; i++) { + n = n | (ch[i] << (8 * i)); + } + return n; +} +uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) { + if (transAddr == EMPTY_ADDRESS) { + return packSize(EMPTY_ADDRESS); + } else { + return packSize(nodeAddr - transAddr); + } +} +CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) { + uint64_t delta = unpackUint64(data, len); + // delta_add = u64_to_usize + if (delta == EMPTY_ADDRESS) { + return EMPTY_ADDRESS; + } else { + return nodeAddr - delta; + } +} + +// fst slice func +FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) { + FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = dLen - 1}; + return slice; +} +FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) { + FstSlice 
t; + if (start >= slice->dLen || end >= slice->dLen || start > end) { + t.data = NULL; + return t; + }; + + t.data = slice->data; + t.dLen = slice->dLen; + t.start = start; + t.end = end; + return t; +} +bool fstSliceEmpty(FstSlice *slice) { + return slice->data == NULL || slice->dLen <= 0; +} + + + From d56dbb15ff326f62ffde6ae54210397daced9dbe Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 21 Nov 2021 18:39:35 +0800 Subject: [PATCH 17/21] add fst_registry --- source/libs/index/inc/index_fst.h | 18 --- source/libs/index/inc/index_fst_node.h | 14 ++ source/libs/index/inc/index_fst_registry.h | 36 ++++- source/libs/index/src/index_fst_node.c | 9 ++ source/libs/index/src/index_fst_registry.c | 147 +++++++++++++++++++++ 5 files changed, 205 insertions(+), 19 deletions(-) diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h index 61c857ed74..1230fe17ff 100644 --- a/source/libs/index/inc/index_fst.h +++ b/source/libs/index/inc/index_fst.h @@ -32,12 +32,6 @@ typedef struct FstRange { } FstRange; -typedef struct FstBuilderNode { - bool isFinal; - Output finalOutput; - SArray *trans; // -} FstBuilderNode; - typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State; typedef enum { Included, Excluded, Unbounded} FstBound; @@ -82,14 +76,6 @@ typedef struct FstBuilder { } FstBuilder; - - -typedef struct FstTransition { - uint8_t inp; //The byte input associated with this transition. 
- Output out; //The output associated with this transition - CompiledAddr addr; //The address of the node that this transition points to -} FstTransition; - typedef struct FstTransitions { FstNode *node; FstRange range; @@ -172,10 +158,6 @@ typedef struct FstIndexedValue { } FstIndexedValue; -typedef struct FstRegistryCell { - CompiledAddr addr; - FstBuilderNode *node; -} FstRegistryCell; diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h index ba2d2ccd02..3eec97e3d8 100644 --- a/source/libs/index/inc/index_fst_node.h +++ b/source/libs/index/inc/index_fst_node.h @@ -16,7 +16,21 @@ #ifndef __INDEX_FST_NODE_H__ #define __INDEX_FST_NODE_H__ +#include "index_fst_util.h" +typedef struct FstTransition { + uint8_t inp; //The byte input associated with this transition. + Output out; //The output associated with this transition + CompiledAddr addr; //The address of the node that this transition points to +} FstTransition; + +typedef struct FstBuilderNode { + bool isFinal; + Output finalOutput; + SArray *trans; // +} FstBuilderNode; + +FstBuilderNode *fstBuilderNodeDefault(); #endif diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h index 6dcb236f29..80c0194f00 100644 --- a/source/libs/index/inc/index_fst_registry.h +++ b/source/libs/index/inc/index_fst_registry.h @@ -16,9 +16,43 @@ #define __FST_REGISTRY_H__ #include "index_fst_util.h" +#include "tarray.h" +#include "index_fst_node.h" + +typedef struct FstRegistryCell { + CompiledAddr addr; + FstBuilderNode *node; +} FstRegistryCell; + +typedef struct FstRegistryCache { + SArray *cells; + uint32_t start; + uint32_t end; +} FstRegistryCache; + +typedef enum {FOUND, NOTFOUND, REJECTED} FstRegistryEntryState; + +typedef struct FstRegistryEntry { + FstRegistryEntryState state; + CompiledAddr addr; + FstRegistryCell *cell; +} FstRegistryEntry; + + + +// Registry relation function typedef struct FstRegistry { - + SArray *table; + uint64_t 
tableSize; // num of rows + uint64_t mruSize; // num of columns } FstRegistry; + +// +FstRegistry* fstRegistryCreate(uint64_t tableSize, uint64_t mruSize); + +FstRegistryEntry* fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNode); + +uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *node); #endif diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c index 3d5efd30f3..3e8e7c12a2 100644 --- a/source/libs/index/src/index_fst_node.c +++ b/source/libs/index/src/index_fst_node.c @@ -12,4 +12,13 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ +#include "index_fst_node.h" + +FstBuilderNode *fstBuilderNodeDefault() { + FstBuilderNode *bn = malloc(sizeof(FstBuilderNode)); + bn->isFinal = false; + bn->finalOutput = 0; + bn->trans = NULL; + return bn; +} diff --git a/source/libs/index/src/index_fst_registry.c b/source/libs/index/src/index_fst_registry.c index 940c5863f4..7b4ef9da19 100644 --- a/source/libs/index/src/index_fst_registry.c +++ b/source/libs/index/src/index_fst_registry.c @@ -15,3 +15,150 @@ #include "index_fst_registry.h" + +static void fstRegistryCellSwap(SArray *arr, uint32_t a, uint32_t b) { + size_t sz = taosArrayGetSize(arr); + if (a >= sz || b >= sz) { return; } + + FstRegistryCell *cell1 = (FstRegistryCell *)taosArrayGet(arr, a); + FstRegistryCell *cell2 = (FstRegistryCell *)taosArrayGet(arr, b); + + FstRegistryCell t = {.addr = cell1->addr, .node = cell1->node}; + + cell1->addr = cell2->addr; + cell1->node = cell2->node; + + cell2->addr = t.addr; + cell2->node = t.node; + return; +} + +static void fstRegistryCellPromote(SArray *arr, uint32_t start, uint32_t end) { + size_t sz = taosArrayGetSize(arr); + if (start >= sz && end >= sz) {return; } + + assert(start >= end); + + int32_t s = (int32_t)start; + int32_t e = (int32_t)end; + while(s > e) { + fstRegistryCellSwap(arr, s - 1, s); + s -= 1; + } +} +FstRegistry* 
fstRegistryCreate(uint64_t tableSize, uint64_t mruSize) { + FstRegistry *registry = malloc(sizeof(FstRegistry)); + if (registry == NULL) { return NULL ;} + + uint64_t nCells = tableSize * mruSize; + SArray* tb = (SArray *)taosArrayInit(nCells, sizeof(FstRegistryCell)); + for (uint64_t i = 0; i < nCells; i++) { + FstRegistryCell *cell = taosArrayGet(tb, i); + cell->addr = NONE_ADDRESS; + cell->node = fstBuilderNodeDefault(); + } + + registry->table = tb; + registry->tableSize = tableSize; + registry->mruSize = mruSize; + return registry; +} + +FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNode) { + if (taosArrayGetSize(registry->table) <= 0) { + return NULL; + } + uint64_t bucket = fstRegistryHash(registry, bNode); + uint64_t start = registry->mruSize * bucket; + uint64_t end = start + registry->mruSize; + + FstRegistryEntry *entry = malloc(sizeof(FstRegistryEntry)); + if (end - start == 1) { + FstRegistryCell *cell = taosArrayGet(registry->table, start); + //cell->isNode && + if (cell->addr != NONE_ADDRESS && cell->node == bNode) { + entry->state = FOUND; + entry->addr = cell->addr ; + return entry; + } else { + // clone from bNode, refactor later + cell->node->isFinal = bNode->isFinal; + cell->node->finalOutput = bNode->finalOutput; + cell->node->trans = bNode->trans; + bNode->trans = NULL; + + entry->state = NOTFOUND; + entry->cell = cell; // copy or not + } + } else if (end - start == 2) { + FstRegistryCell *cell1 = taosArrayGet(registry->table, start); + if (cell1->addr != NONE_ADDRESS && cell1->node == bNode) { + entry->state = FOUND; + entry->addr = cell1->addr; + return entry; + } + FstRegistryCell *cell2 = taosArrayGet(registry->table, start + 1); + if (cell2->addr != NONE_ADDRESS && cell2->node == bNode) { + entry->state = FOUND; + entry->addr = cell2->addr; + // must swap here + fstRegistryCellSwap(registry->table, start, start + 1); + return entry; + } + //clone from bNode, refactor later + cell1->node->isFinal = 
bNode->isFinal; + cell1->node->finalOutput = bNode->finalOutput; + cell1->node->trans = bNode->trans; + bNode->trans = NULL; + + fstRegistryCellSwap(registry->table, start, start + 1); + FstRegistryCell *cCell = taosArrayGet(registry->table, start); + entry->state = NOTFOUND; + entry->cell = cCell; + } else { + uint32_t i = start; + for (; i < end; i++) { + FstRegistryCell *cell = (FstRegistryCell *)taosArrayGet(registry->table, i); + if (cell->addr != NONE_ADDRESS && cell->node == bNode) { + entry->state = FOUND; + entry->addr = cell->addr; + fstRegistryCellPromote(registry->table, i, start); + break; + } + } + if (i >= end) { + uint64_t last = end - 1; + FstRegistryCell *cell = (FstRegistryCell *)taosArrayGet(registry->table, last); + //clone from bNode, refactor later + cell->node->isFinal = bNode->isFinal; + cell->node->finalOutput = bNode->finalOutput; + cell->node->trans = bNode->trans; + bNode->trans = NULL; + + fstRegistryCellPromote(registry->table, last, start); + FstRegistryCell *cCell = taosArrayGet(registry->table, start); + entry->state = NOTFOUND; + entry->cell = cCell; + } + } + return entry; +} + +uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *bNode) { + //TODO(yihaoDeng): refactor later + const uint64_t FNV_PRIME = 1099511628211; + uint64_t h = 14695981039346656037u; + + h = (h ^ (uint64_t)bNode->isFinal) * FNV_PRIME; + h = (h ^ (bNode)->finalOutput) * FNV_PRIME; + + uint32_t sz = (uint32_t)taosArrayGetSize(bNode->trans); + for (uint32_t i = 0; i < sz; i++) { + FstTransition *trn = taosArrayGet(bNode->trans, i); + h = (h ^ (uint64_t)(trn->inp)) * FNV_PRIME; + h = (h ^ (uint64_t)(trn->out)) * FNV_PRIME; + h = (h ^ (uint64_t)(trn->addr))* FNV_PRIME; + } + return h %(registry->tableSize); +} + From 077b89db174fc4a9f77b61269f6d663a2ac642b3 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 21 Nov 2021 19:01:08 +0800 Subject: [PATCH 18/21] add fst_registry --- source/libs/index/inc/index_fst_node.h | 2 + 
source/libs/index/inc/index_fst_registry.h | 10 ++-- source/libs/index/src/index_fst_node.c | 11 +++++ source/libs/index/src/index_fst_registry.c | 54 ++++++++++------------ 4 files changed, 42 insertions(+), 35 deletions(-) diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h index 3eec97e3d8..09dcd223b1 100644 --- a/source/libs/index/inc/index_fst_node.h +++ b/source/libs/index/inc/index_fst_node.h @@ -33,4 +33,6 @@ typedef struct FstBuilderNode { FstBuilderNode *fstBuilderNodeDefault(); +void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src); + #endif diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h index 80c0194f00..d54d73683f 100644 --- a/source/libs/index/inc/index_fst_registry.h +++ b/source/libs/index/inc/index_fst_registry.h @@ -26,11 +26,11 @@ typedef struct FstRegistryCell { -typedef struct FstRegistryCache { - SArray *cells; - uint32_t start; - uint32_t end; -} FstRegistryCache; +//typedef struct FstRegistryCache { +// SArray *cells; +// uint32_t start; +// uint32_t end; +//} FstRegistryCache; typedef enum {FOUND, NOTFOUND, REJECTED} FstRegistryEntryState; diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c index 3e8e7c12a2..1511781719 100644 --- a/source/libs/index/src/index_fst_node.c +++ b/source/libs/index/src/index_fst_node.c @@ -22,3 +22,14 @@ FstBuilderNode *fstBuilderNodeDefault() { return bn; } +// not destroy src, User's bussiness +void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src) { + if (dst == NULL || src == NULL) { return; } + + dst->isFinal = src->isFinal; + dst->finalOutput = src->finalOutput ; + dst->trans = src->trans; + + src->trans = NULL; +} + diff --git a/source/libs/index/src/index_fst_registry.c b/source/libs/index/src/index_fst_registry.c index 7b4ef9da19..5d6c7b1712 100644 --- a/source/libs/index/src/index_fst_registry.c +++ 
b/source/libs/index/src/index_fst_registry.c @@ -16,6 +16,23 @@ #include "index_fst_registry.h" +uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *bNode) { + //TODO(yihaoDeng): refactor later + const uint64_t FNV_PRIME = 1099511628211; + uint64_t h = 14695981039346656037u; + + h = (h ^ (uint64_t)bNode->isFinal) * FNV_PRIME; + h = (h ^ (bNode)->finalOutput) * FNV_PRIME; + + uint32_t sz = (uint32_t)taosArrayGetSize(bNode->trans); + for (uint32_t i = 0; i < sz; i++) { + FstTransition *trn = taosArrayGet(bNode->trans, i); + h = (h ^ (uint64_t)(trn->inp)) * FNV_PRIME; + h = (h ^ (uint64_t)(trn->out)) * FNV_PRIME; + h = (h ^ (uint64_t)(trn->addr))* FNV_PRIME; + } + return h %(registry->tableSize); +} static void fstRegistryCellSwap(SArray *arr, uint32_t a, uint32_t b) { size_t sz = taosArrayGetSize(arr); if (a >= sz || b >= sz) { return; } @@ -46,6 +63,9 @@ static void fstRegistryCellPromote(SArray *arr, uint32_t start, uint32_t end) { s -= 1; } } +#define FST_REGISTRY_CELL_IS_EMPTY(cell) (cell->addr == NONE_ADDRESS) +#define FST_REGISTRY_CELL_INSERT(cell, addr) do {cell->addr = addr;} while(0) + FstRegistry* fstRegistryCreate(uint64_t tableSize, uint64_t mruSize) { FstRegistry *registry = malloc(sizeof(FstRegistry)); if (registry == NULL) { return NULL ;} @@ -82,11 +102,8 @@ FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNo return entry; } else { // clone from bNode, refactor later - cell->node->isFinal = bNode->isFinal; - cell->node->finalOutput = bNode->finalOutput; - cell->node->trans = bNode->trans; - bNode->trans = NULL; - + // + fstBuilderNodeCloneFrom(cell->node, bNode); entry->state = NOTFOUND; entry->cell = cell; // copy or not } @@ -106,10 +123,7 @@ FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNo return entry; } //clone from bNode, refactor later - cell1->node->isFinal = bNode->isFinal; - cell1->node->finalOutput = bNode->finalOutput; - cell1->node->trans = bNode->trans; - 
bNode->trans = NULL; + fstBuilderNodeCloneFrom(cell2->node, bNode); fstRegistryCellSwap(registry->table, start, start + 1); FstRegistryCell *cCell = taosArrayGet(registry->table, start); @@ -130,10 +144,7 @@ FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNo uint64_t last = end - 1; FstRegistryCell *cell = (FstRegistryCell *)taosArrayGet(registry->table, last); //clone from bNode, refactor later - cell->node->isFinal = bNode->isFinal; - cell->node->finalOutput = bNode->finalOutput; - cell->node->trans = bNode->trans; - bNode->trans = NULL; + fstBuilderNodeCloneFrom(cell->node, bNode); fstRegistryCellPromote(registry->table, last, start); FstRegistryCell *cCell = taosArrayGet(registry->table, start); @@ -144,21 +155,4 @@ FstRegistryEntry *fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNo return entry; } -uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *bNode) { - //TODO(yihaoDeng): refactor later - const uint64_t FNV_PRIME = 1099511628211; - uint64_t h = 14695981039346656037u; - - h = (h ^ (uint64_t)bNode->isFinal) * FNV_PRIME; - h = (h ^ (bNode)->finalOutput) * FNV_PRIME; - - uint32_t sz = (uint32_t)taosArrayGetSize(bNode->trans); - for (uint32_t i = 0; i < sz; i++) { - FstTransition *trn = taosArrayGet(bNode->trans, i); - h = (h ^ (uint64_t)(trn->inp)) * FNV_PRIME; - h = (h ^ (uint64_t)(trn->out)) * FNV_PRIME; - h = (h ^ (uint64_t)(trn->addr))* FNV_PRIME; - } - return h %(registry->tableSize); -} From df646e9a512ea02865fd5bec2ce1df0aad2f90b6 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sun, 21 Nov 2021 20:01:34 +0800 Subject: [PATCH 19/21] add core struct --- source/libs/index/inc/index_fst_node.h | 2 ++ source/libs/index/inc/index_fst_registry.h | 1 - source/libs/index/src/index_fst_node.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h index 09dcd223b1..631c7026c5 100644 --- 
a/source/libs/index/inc/index_fst_node.h +++ b/source/libs/index/inc/index_fst_node.h @@ -33,6 +33,8 @@ typedef struct FstBuilderNode { FstBuilderNode *fstBuilderNodeDefault(); +FstBuilderNode *fstBuilderNodeClone(FstBuilderNode *src); + void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src); #endif diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h index d54d73683f..f19bb750c2 100644 --- a/source/libs/index/inc/index_fst_registry.h +++ b/source/libs/index/inc/index_fst_registry.h @@ -54,5 +54,4 @@ FstRegistry* fstRegistryCreate(uint64_t tableSize, uint64_t mruSize); FstRegistryEntry* fstRegistryGetEntry(FstRegistry *registry, FstBuilderNode *bNode); -uint64_t fstRegistryHash(FstRegistry *registry, FstBuilderNode *node); #endif diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c index 1511781719..23af4a4a4b 100644 --- a/source/libs/index/src/index_fst_node.c +++ b/source/libs/index/src/index_fst_node.c @@ -22,6 +22,26 @@ FstBuilderNode *fstBuilderNodeDefault() { return bn; } +FstBuilderNode *fstBuilderNodeClone(FstBuilderNode *src) { + FstBuilderNode *node = malloc(sizeof(FstBuilderNode)); + if (node == NULL) { return NULL; } + + + size_t sz = taosArrayGetSize(src->trans); + SArray *trans = taosArrayInit(sz, sizeof(FstTransition)); + + for (size_t i = 0; i < sz; i++) { + FstTransition *tran = taosArrayGet(src->trans, i); + FstTransition t = *tran; + taosArrayPush(trans, &t); + } + + node->trans = trans; + node->isFinal = src->isFinal; + node->finalOutput = src->finalOutput; + return node; + +} // not destroy src, User's bussiness void fstBuilderNodeCloneFrom(FstBuilderNode *dst, FstBuilderNode *src) { if (dst == NULL || src == NULL) { return; } From b0c147f05770bf1560a3115aa81724e47d9d74e0 Mon Sep 17 00:00:00 2001 From: lichuang Date: Mon, 22 Nov 2021 10:04:47 +0800 Subject: [PATCH 20/21] [TD-10645][raft]add vote resp process --- 
source/libs/sync/inc/raft_log.h | 2 -- source/libs/sync/src/sync_raft_impl.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/source/libs/sync/inc/raft_log.h b/source/libs/sync/inc/raft_log.h index dc10c59b28..117ed42c2c 100644 --- a/source/libs/sync/inc/raft_log.h +++ b/source/libs/sync/inc/raft_log.h @@ -39,8 +39,6 @@ struct SSyncRaftLog { SyncIndex commitIndex; SyncIndex appliedIndex; - - }; SSyncRaftLog* syncRaftLogOpen(); diff --git a/source/libs/sync/src/sync_raft_impl.c b/source/libs/sync/src/sync_raft_impl.c index 4d8222e826..3050bb2c8a 100644 --- a/source/libs/sync/src/sync_raft_impl.c +++ b/source/libs/sync/src/sync_raft_impl.c @@ -247,7 +247,7 @@ static int stepCandidate(SSyncRaft* pRaft, const SSyncMessage* pMsg) { syncRaftHandleVoteRespMessage(pRaft, pMsg); return 0; } else if (msgType == RAFT_MSG_APPEND) { - syncRaftBecomeFollower(pRaft, pRaft->term, pMsg->from); + syncRaftBecomeFollower(pRaft, pMsg->term, pMsg->from); syncRaftHandleAppendEntriesMessage(pRaft, pMsg); } return 0; From 759b077ecf667184e2f185ba195d08da2caa4dcc Mon Sep 17 00:00:00 2001 From: Liu Jicong Date: Mon, 22 Nov 2021 14:56:11 +0800 Subject: [PATCH 21/21] add walhandle for integration --- include/dnode/vnode/tq/tq.h | 118 +++++++++++++++--- include/libs/wal/wal.h | 6 +- source/dnode/vnode/tq/inc/tqMetaStore.h | 87 +------------ source/dnode/vnode/tq/src/tq.c | 95 +++++++++++--- source/dnode/vnode/tq/src/tqMetaStore.c | 48 +++---- .../dnode/vnode/tq/test/tqSerializerTest.cpp | 13 ++ source/libs/wal/src/wal.c | 12 +- 7 files changed, 233 insertions(+), 146 deletions(-) create mode 100644 source/dnode/vnode/tq/test/tqSerializerTest.cpp diff --git a/include/dnode/vnode/tq/tq.h b/include/dnode/vnode/tq/tq.h index 7a21b08aaf..2785b6de96 100644 --- a/include/dnode/vnode/tq/tq.h +++ b/include/dnode/vnode/tq/tq.h @@ -109,11 +109,10 @@ typedef struct TqTopicVhandle { #define TQ_BUFFER_SIZE 8 -// TODO: define a serializer and deserializer typedef struct TqBufferItem { 
int64_t offset; // executors are identical but not concurrent - // so it must be a copy in each item + // so there must be a copy in each item void* executor; int64_t size; void* content; @@ -156,23 +155,111 @@ typedef struct TqQueryMsg { typedef struct TqLogReader { void* logHandle; - int32_t (*walRead)(void* logHandle, void** data, int64_t ver); - int64_t (*walGetFirstVer)(void* logHandle); - int64_t (*walGetSnapshotVer)(void* logHandle); - int64_t (*walGetLastVer)(void* logHandle); + int32_t (*logRead)(void* logHandle, void** data, int64_t ver); + int64_t (*logGetFirstVer)(void* logHandle); + int64_t (*logGetSnapshotVer)(void* logHandle); + int64_t (*logGetLastVer)(void* logHandle); } TqLogReader; typedef struct TqConfig { // TODO } TqConfig; +typedef struct TqMemRef { + SMemAllocatorFactory *pAlloctorFactory; + SMemAllocator *pAllocator; +} TqMemRef; + +typedef struct TqSerializedHead { + int16_t ver; + int16_t action; + int32_t checksum; + int64_t ssize; + char content[]; +} TqSerializedHead; + +typedef int (*TqSerializeFun)(const void* pObj, TqSerializedHead** ppHead); +typedef const void* (*TqDeserializeFun)(const TqSerializedHead* pHead, void** ppObj); +typedef void (*TqDeleteFun)(void*); + +#define TQ_BUCKET_MASK 0xFF +#define TQ_BUCKET_SIZE 256 + +#define TQ_PAGE_SIZE 4096 +//key + offset + size +#define TQ_IDX_SIZE 24 +//4096 / 24 +#define TQ_MAX_IDX_ONE_PAGE 170 +//24 * 170 +#define TQ_IDX_PAGE_BODY_SIZE 4080 +//4096 - 4080 +#define TQ_IDX_PAGE_HEAD_SIZE 16 + +#define TQ_ACTION_CONST 0 +#define TQ_ACTION_INUSE 1 +#define TQ_ACTION_INUSE_CONT 2 +#define TQ_ACTION_INTXN 3 + +#define TQ_SVER 0 + +//TODO: inplace mode is not implemented +#define TQ_UPDATE_INPLACE 0 +#define TQ_UPDATE_APPEND 1 + +#define TQ_DUP_INTXN_REWRITE 0 +#define TQ_DUP_INTXN_REJECT 2 + +static inline bool TqUpdateAppend(int32_t tqConfigFlag) { + return tqConfigFlag & TQ_UPDATE_APPEND; +} + +static inline bool TqDupIntxnReject(int32_t tqConfigFlag) { + return tqConfigFlag & 
TQ_DUP_INTXN_REJECT; +} + +static const int8_t TQ_CONST_DELETE = TQ_ACTION_CONST; +#define TQ_DELETE_TOKEN (void*)&TQ_CONST_DELETE + +typedef struct TqMetaHandle { + int64_t key; + int64_t offset; + int64_t serializedSize; + void* valueInUse; + void* valueInTxn; +} TqMetaHandle; + +typedef struct TqMetaList { + TqMetaHandle handle; + struct TqMetaList* next; + //struct TqMetaList* inTxnPrev; + //struct TqMetaList* inTxnNext; + struct TqMetaList* unpersistPrev; + struct TqMetaList* unpersistNext; +} TqMetaList; + +typedef struct TqMetaStore { + TqMetaList* bucket[TQ_BUCKET_SIZE]; + //a table head + TqMetaList* unpersistHead; + //TODO:temporaral use, to be replaced by unified tfile + int fileFd; + //TODO:temporaral use, to be replaced by unified tfile + int idxFd; + char* dirPath; + int32_t tqConfigFlag; + TqSerializeFun pSerializer; + TqDeserializeFun pDeserializer; + TqDeleteFun pDeleter; +} TqMetaStore; + typedef struct STQ { // the collection of group handle // the handle of kvstore - const char* path; + char* path; TqConfig* tqConfig; TqLogReader* tqLogReader; - SMemAllocatorFactory* allocFac; + TqMemRef tqMemRef; + TqMetaStore* tqMeta; } STQ; // open in each vnode @@ -187,7 +274,7 @@ int tqConsume(STQ*, TmqConsumeReq*); TqGroupHandle* tqGetGroupHandle(STQ*, int64_t cId); -int tqOpenTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId); +TqGroupHandle* tqOpenTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId); int tqCloseTCGroup(STQ*, int64_t topicId, int64_t cgId, int64_t cId); int tqMoveOffsetToNext(TqGroupHandle*); int tqResetOffset(STQ*, int64_t topicId, int64_t cgId, int64_t offset); @@ -195,18 +282,9 @@ int tqRegisterContext(TqGroupHandle*, void* ahandle); int tqLaunchQuery(TqGroupHandle*); int tqSendLaunchQuery(TqGroupHandle*); -int tqSerializeGroupHandle(TqGroupHandle* gHandle, void** ppBytes); -void* tqSerializeListHandle(TqListHandle* listHandle, void* ptr); -void* tqSerializeBufHandle(TqBufferHandle* bufHandle, void* ptr); -void* 
tqSerializeBufItem(TqBufferItem* bufItem, void* ptr); +int tqSerializeGroupHandle(const TqGroupHandle* gHandle, TqSerializedHead** ppHead); -const void* tqDeserializeGroupHandle(const void* pBytes, TqGroupHandle* ghandle); -const void* tqDeserializeBufHandle(const void* pBytes, TqBufferHandle* bufHandle); -const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem* bufItem); - -int tqGetGHandleSSize(const TqGroupHandle* gHandle); -int tqBufHandleSSize(); -int tqBufItemSSize(); +const void* tqDeserializeGroupHandle(const TqSerializedHead* pHead, TqGroupHandle** gHandle); #ifdef __cplusplus } diff --git a/include/libs/wal/wal.h b/include/libs/wal/wal.h index 0829782310..ba37e6880b 100644 --- a/include/libs/wal/wal.h +++ b/include/libs/wal/wal.h @@ -44,8 +44,10 @@ typedef struct { EWalType walLevel; // wal level } SWalCfg; -struct SWal; -typedef struct SWal SWal; // WAL HANDLE +typedef struct SWal { + int8_t unused; +} SWal; // WAL HANDLE + typedef int32_t (*FWalWrite)(void *ahandle, void *pHead, int32_t qtype, void *pMsg); // module initialization diff --git a/source/dnode/vnode/tq/inc/tqMetaStore.h b/source/dnode/vnode/tq/inc/tqMetaStore.h index 63e48625d9..b9e702a89a 100644 --- a/source/dnode/vnode/tq/inc/tqMetaStore.h +++ b/source/dnode/vnode/tq/inc/tqMetaStore.h @@ -17,97 +17,22 @@ #define _TQ_META_STORE_H_ #include "os.h" - +#include "tq.h" #ifdef __cplusplus extern "C" { #endif -#define TQ_BUCKET_MASK 0xFF -#define TQ_BUCKET_SIZE 256 - -#define TQ_PAGE_SIZE 4096 -//key + offset + size -#define TQ_IDX_SIZE 24 -//4096 / 24 -#define TQ_MAX_IDX_ONE_PAGE 170 -//24 * 170 -#define TQ_IDX_PAGE_BODY_SIZE 4080 -//4096 - 4080 -#define TQ_IDX_PAGE_HEAD_SIZE 16 - -#define TQ_ACTION_CONST 0 -#define TQ_ACTION_INUSE 1 -#define TQ_ACTION_INUSE_CONT 2 -#define TQ_ACTION_INTXN 3 - -#define TQ_SVER 0 - -//TODO: inplace mode is not implemented -#define TQ_UPDATE_INPLACE 0 -#define TQ_UPDATE_APPEND 1 - -#define TQ_DUP_INTXN_REWRITE 0 -#define TQ_DUP_INTXN_REJECT 2 - -static 
inline bool TqUpdateAppend(int32_t tqConfigFlag) { - return tqConfigFlag & TQ_UPDATE_APPEND; -} - -static inline bool TqDupIntxnReject(int32_t tqConfigFlag) { - return tqConfigFlag & TQ_DUP_INTXN_REJECT; -} - -static const int8_t TQ_CONST_DELETE = TQ_ACTION_CONST; -#define TQ_DELETE_TOKEN (void*)&TQ_CONST_DELETE - -typedef struct TqSerializedHead { - int16_t ver; - int16_t action; - int32_t checksum; - int64_t ssize; - char content[]; -} TqSerializedHead; - -typedef struct TqMetaHandle { - int64_t key; - int64_t offset; - int64_t serializedSize; - void* valueInUse; - void* valueInTxn; -} TqMetaHandle; - -typedef struct TqMetaList { - TqMetaHandle handle; - struct TqMetaList* next; - //struct TqMetaList* inTxnPrev; - //struct TqMetaList* inTxnNext; - struct TqMetaList* unpersistPrev; - struct TqMetaList* unpersistNext; -} TqMetaList; - -typedef struct TqMetaStore { - TqMetaList* bucket[TQ_BUCKET_SIZE]; - //a table head - TqMetaList* unpersistHead; - int fileFd; //TODO:temporaral use, to be replaced by unified tfile - int idxFd; //TODO:temporaral use, to be replaced by unified tfile - char* dirPath; - int32_t tqConfigFlag; - int (*serializer)(const void* pObj, TqSerializedHead** ppHead); - const void* (*deserializer)(const TqSerializedHead* pHead, void** ppObj); - void (*deleter)(void*); -} TqMetaStore; TqMetaStore* tqStoreOpen(const char* path, - int serializer(const void* pObj, TqSerializedHead** ppHead), - const void* deserializer(const TqSerializedHead* pHead, void** ppObj), - void deleter(void* pObj), - int32_t tqConfigFlag + TqSerializeFun pSerializer, + TqDeserializeFun pDeserializer, + TqDeleteFun pDeleter, + int32_t tqConfigFlag ); int32_t tqStoreClose(TqMetaStore*); //int32_t tqStoreDelete(TqMetaStore*); -//int32_t TqStoreCommitAll(TqMetaStore*); +//int32_t tqStoreCommitAll(TqMetaStore*); int32_t tqStorePersist(TqMetaStore*); //clean deleted idx and data from persistent file int32_t tqStoreCompact(TqMetaStore*); diff --git a/source/dnode/vnode/tq/src/tq.c 
b/source/dnode/vnode/tq/src/tq.c index cf98e3e1a4..c010042b8c 100644 --- a/source/dnode/vnode/tq/src/tq.c +++ b/source/dnode/vnode/tq/src/tq.c @@ -14,6 +14,7 @@ */ #include "tqInt.h" +#include "tqMetaStore.h" //static //read next version data @@ -24,6 +25,46 @@ // int tqGetgHandleSSize(const TqGroupHandle *gHandle); +int tqBufHandleSSize(); +int tqBufItemSSize(); + +TqGroupHandle* tqFindHandle(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) { + TqGroupHandle* gHandle; + return NULL; +} + +void* tqSerializeListHandle(TqListHandle* listHandle, void* ptr); +void* tqSerializeBufHandle(TqBufferHandle* bufHandle, void* ptr); +void* tqSerializeBufItem(TqBufferItem* bufItem, void* ptr); + +const void* tqDeserializeBufHandle(const void* pBytes, TqBufferHandle* bufHandle); +const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem* bufItem); + +STQ* tqOpen(const char* path, TqConfig* tqConfig, TqLogReader* tqLogReader, SMemAllocatorFactory *allocFac) { + STQ* pTq = malloc(sizeof(STQ)); + if(pTq == NULL) { + //TODO: memory error + return NULL; + } + strcpy(pTq->path, path); + pTq->tqConfig = tqConfig; + pTq->tqLogReader = tqLogReader; + pTq->tqMemRef.pAlloctorFactory = allocFac; + pTq->tqMemRef.pAllocator = allocFac->create(); + if(pTq->tqMemRef.pAllocator == NULL) { + //TODO + } + pTq->tqMeta = tqStoreOpen(path, + (TqSerializeFun)tqSerializeGroupHandle, + (TqDeserializeFun)tqDeserializeGroupHandle, + free, + 0); + if(pTq->tqMeta == NULL) { + //TODO: free STQ + return NULL; + } + return pTq; +} static int tqProtoCheck(TmqMsgHead *pMsg) { return pMsg->protoVer == 0; @@ -83,14 +124,29 @@ static int tqCommitTCGroup(TqGroupHandle* handle) { int tqCreateTCGroup(STQ *pTq, int64_t topicId, int64_t cgId, int64_t cId, TqGroupHandle** handle) { //create in disk + TqGroupHandle* gHandle = (TqGroupHandle*)malloc(sizeof(TqGroupHandle)); + if(gHandle == NULL) { + //TODO + return -1; + } + memset(gHandle, 0, sizeof(TqGroupHandle)); + return 0; } -int tqOpenTCGroup(STQ* pTq, 
int64_t topicId, int64_t cgId, int64_t cId) { - //look up in disk +TqGroupHandle* tqOpenTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) { + TqGroupHandle* gHandle = tqHandleGet(pTq->tqMeta, cId); + if(gHandle == NULL) { + int code = tqCreateTCGroup(pTq, topicId, cgId, cId, &gHandle); + if(code != 0) { + //TODO + return NULL; + } + } + //create //open - return 0; + return gHandle; } int tqCloseTCGroup(STQ* pTq, int64_t topicId, int64_t cgId, int64_t cId) { @@ -207,16 +263,20 @@ int tqConsume(STQ* pTq, TmqConsumeReq* pMsg) { return 0; } -int tqSerializeGroupHandle(TqGroupHandle *gHandle, void** ppBytes) { +int tqSerializeGroupHandle(const TqGroupHandle *gHandle, TqSerializedHead** ppHead) { //calculate size - int sz = tqGetgHandleSSize(gHandle); - void* ptr = realloc(*ppBytes, sz); - if(ptr == NULL) { - free(ppBytes); - //TODO: memory err - return -1; + int sz = tqGetgHandleSSize(gHandle) + sizeof(TqSerializedHead); + if(sz > (*ppHead)->ssize) { + void* tmpPtr = realloc(*ppHead, sz); + if(tmpPtr == NULL) { + free(*ppHead); + //TODO: memory err + return -1; + } + *ppHead = tmpPtr; + (*ppHead)->ssize = sz; } - *ppBytes = ptr; + void* ptr = (*ppHead)->content; //do serialization *(int64_t*)ptr = gHandle->cId; ptr = POINTER_SHIFT(ptr, sizeof(int64_t)); @@ -261,8 +321,9 @@ void* tqSerializeBufItem(TqBufferItem *bufItem, void* ptr) { return ptr; } -const void* tqDeserializeGroupHandle(const void* pBytes, TqGroupHandle *gHandle) { - const void* ptr = pBytes; +const void* tqDeserializeGroupHandle(const TqSerializedHead* pHead, TqGroupHandle **ppGHandle) { + TqGroupHandle *gHandle = *ppGHandle; + const void* ptr = pHead->content; gHandle->cId = *(int64_t*)ptr; ptr = POINTER_SHIFT(ptr, sizeof(int64_t)); gHandle->cgId = *(int64_t*)ptr; @@ -317,15 +378,15 @@ const void* tqDeserializeBufItem(const void* pBytes, TqBufferItem *bufItem) { //TODO: make this a macro int tqGetgHandleSSize(const TqGroupHandle *gHandle) { - return sizeof(int64_t) * 2 - + sizeof(int32_t) + 
return sizeof(int64_t) * 2 //cId + cgId + + sizeof(int32_t) //topicNum + gHandle->topicNum * tqBufHandleSSize(); } //TODO: make this a macro int tqBufHandleSSize() { - return sizeof(int64_t) * 2 - + sizeof(int32_t) * 2 + return sizeof(int64_t) * 2 // nextConsumeOffset + topicId + + sizeof(int32_t) * 2 // head + tail + TQ_BUFFER_SIZE * tqBufItemSSize(); } diff --git a/source/dnode/vnode/tq/src/tqMetaStore.c b/source/dnode/vnode/tq/src/tqMetaStore.c index f8202941bb..71d1e8d890 100644 --- a/source/dnode/vnode/tq/src/tqMetaStore.c +++ b/source/dnode/vnode/tq/src/tqMetaStore.c @@ -69,10 +69,10 @@ static inline int tqReadLastPage(int fd, TqIdxPageBuf* pBuf) { } TqMetaStore* tqStoreOpen(const char* path, - int serializer(const void* pObj, TqSerializedHead** ppHead), - const void* deserializer(const TqSerializedHead* pHead, void** ppObj), - void deleter(void* pObj), - int32_t tqConfigFlag + TqSerializeFun serializer, + TqDeserializeFun deserializer, + TqDeleteFun deleter, + int32_t tqConfigFlag ) { TqMetaStore* pMeta = malloc(sizeof(TqMetaStore)); if(pMeta == NULL) { @@ -127,9 +127,9 @@ TqMetaStore* tqStoreOpen(const char* path, pMeta->fileFd = fileFd; - pMeta->serializer = serializer; - pMeta->deserializer = deserializer; - pMeta->deleter = deleter; + pMeta->pSerializer = serializer; + pMeta->pDeserializer = deserializer; + pMeta->pDeleter = deleter; pMeta->tqConfigFlag = tqConfigFlag; //read idx file and load into memory @@ -171,25 +171,25 @@ TqMetaStore* tqStoreOpen(const char* path, } if(serializedObj->action == TQ_ACTION_INUSE) { if(serializedObj->ssize != sizeof(TqSerializedHead)) { - pMeta->deserializer(serializedObj, &pNode->handle.valueInUse); + pMeta->pDeserializer(serializedObj, &pNode->handle.valueInUse); } else { pNode->handle.valueInUse = TQ_DELETE_TOKEN; } } else if(serializedObj->action == TQ_ACTION_INTXN) { if(serializedObj->ssize != sizeof(TqSerializedHead)) { - pMeta->deserializer(serializedObj, &pNode->handle.valueInTxn); + 
pMeta->pDeserializer(serializedObj, &pNode->handle.valueInTxn); } else { pNode->handle.valueInTxn = TQ_DELETE_TOKEN; } } else if(serializedObj->action == TQ_ACTION_INUSE_CONT) { if(serializedObj->ssize != sizeof(TqSerializedHead)) { - pMeta->deserializer(serializedObj, &pNode->handle.valueInUse); + pMeta->pDeserializer(serializedObj, &pNode->handle.valueInUse); } else { pNode->handle.valueInUse = TQ_DELETE_TOKEN; } TqSerializedHead* ptr = POINTER_SHIFT(serializedObj, serializedObj->ssize); if(ptr->ssize != sizeof(TqSerializedHead)) { - pMeta->deserializer(ptr, &pNode->handle.valueInTxn); + pMeta->pDeserializer(ptr, &pNode->handle.valueInTxn); } else { pNode->handle.valueInTxn = TQ_DELETE_TOKEN; } @@ -225,11 +225,11 @@ TqMetaStore* tqStoreOpen(const char* path, if(pBucketNode) { if(pBucketNode->handle.valueInUse && pBucketNode->handle.valueInUse != TQ_DELETE_TOKEN) { - pMeta->deleter(pBucketNode->handle.valueInUse); + pMeta->pDeleter(pBucketNode->handle.valueInUse); } if(pBucketNode->handle.valueInTxn && pBucketNode->handle.valueInTxn != TQ_DELETE_TOKEN) { - pMeta->deleter(pBucketNode->handle.valueInTxn); + pMeta->pDeleter(pBucketNode->handle.valueInTxn); } free(pBucketNode); } @@ -253,11 +253,11 @@ int32_t tqStoreClose(TqMetaStore* pMeta) { ASSERT(pNode->unpersistPrev == NULL); if(pNode->handle.valueInTxn && pNode->handle.valueInTxn != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInTxn); + pMeta->pDeleter(pNode->handle.valueInTxn); } if(pNode->handle.valueInUse && pNode->handle.valueInUse != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInUse); + pMeta->pDeleter(pNode->handle.valueInUse); } TqMetaList* next = pNode->next; free(pNode); @@ -280,11 +280,11 @@ int32_t tqStoreDelete(TqMetaStore* pMeta) { while(pNode) { if(pNode->handle.valueInTxn && pNode->handle.valueInTxn != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInTxn); + pMeta->pDeleter(pNode->handle.valueInTxn); } if(pNode->handle.valueInUse && pNode->handle.valueInUse != 
TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInUse); + pMeta->pDeleter(pNode->handle.valueInUse); } TqMetaList* next = pNode->next; free(pNode); @@ -338,7 +338,7 @@ int32_t tqStorePersist(TqMetaStore* pMeta) { if(pNode->handle.valueInUse == TQ_DELETE_TOKEN) { pSHead->ssize = sizeof(TqSerializedHead); } else { - pMeta->serializer(pNode->handle.valueInUse, &pSHead); + pMeta->pSerializer(pNode->handle.valueInUse, &pSHead); } nBytes = write(pMeta->fileFd, pSHead, pSHead->ssize); ASSERT(nBytes == pSHead->ssize); @@ -349,7 +349,7 @@ int32_t tqStorePersist(TqMetaStore* pMeta) { if(pNode->handle.valueInTxn == TQ_DELETE_TOKEN) { pSHead->ssize = sizeof(TqSerializedHead); } else { - pMeta->serializer(pNode->handle.valueInTxn, &pSHead); + pMeta->pSerializer(pNode->handle.valueInTxn, &pSHead); } int nBytesTxn = write(pMeta->fileFd, pSHead, pSHead->ssize); ASSERT(nBytesTxn == pSHead->ssize); @@ -423,7 +423,7 @@ static int32_t tqHandlePutCommitted(TqMetaStore* pMeta, int64_t key, void* value //TODO: think about thread safety if(pNode->handle.valueInUse && pNode->handle.valueInUse != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInUse); + pMeta->pDeleter(pNode->handle.valueInUse); } //change pointer ownership pNode->handle.valueInUse = value; @@ -496,7 +496,7 @@ static inline int32_t tqHandlePutImpl(TqMetaStore* pMeta, int64_t key, void* val return -2; } if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInTxn); + pMeta->pDeleter(pNode->handle.valueInTxn); } } pNode->handle.valueInTxn = value; @@ -562,7 +562,7 @@ int32_t tqHandleCommit(TqMetaStore* pMeta, int64_t key) { } if(pNode->handle.valueInUse && pNode->handle.valueInUse != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInUse); + pMeta->pDeleter(pNode->handle.valueInUse); } pNode->handle.valueInUse = pNode->handle.valueInTxn; pNode->handle.valueInTxn = NULL; @@ -582,7 +582,7 @@ int32_t tqHandleAbort(TqMetaStore* pMeta, int64_t key) { if(pNode->handle.key == key) { 
if(pNode->handle.valueInTxn) { if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) { - pMeta->deleter(pNode->handle.valueInTxn); + pMeta->pDeleter(pNode->handle.valueInTxn); } pNode->handle.valueInTxn = NULL; tqLinkUnpersist(pMeta, pNode); @@ -602,7 +602,7 @@ int32_t tqHandleDel(TqMetaStore* pMeta, int64_t key) { while(pNode) { if(pNode->handle.valueInTxn != TQ_DELETE_TOKEN) { if(pNode->handle.valueInTxn) { - pMeta->deleter(pNode->handle.valueInTxn); + pMeta->pDeleter(pNode->handle.valueInTxn); } pNode->handle.valueInTxn = TQ_DELETE_TOKEN; tqLinkUnpersist(pMeta, pNode); diff --git a/source/dnode/vnode/tq/test/tqSerializerTest.cpp b/source/dnode/vnode/tq/test/tqSerializerTest.cpp new file mode 100644 index 0000000000..0d76322c17 --- /dev/null +++ b/source/dnode/vnode/tq/test/tqSerializerTest.cpp @@ -0,0 +1,13 @@ +#include +#include +#include +#include + +#include "tq.h" + +using namespace std; + +TEST(TqSerializerTest, basicTest) { + TqGroupHandle* gHandle = (TqGroupHandle*)malloc(sizeof(TqGroupHandle)); + +} diff --git a/source/libs/wal/src/wal.c b/source/libs/wal/src/wal.c index f25c127f3f..c107a94f3f 100644 --- a/source/libs/wal/src/wal.c +++ b/source/libs/wal/src/wal.c @@ -19,11 +19,19 @@ int32_t walInit() { return 0; } void walCleanUp() {} -SWal *walOpen(char *path, SWalCfg *pCfg) { return NULL; } +SWal *walOpen(char *path, SWalCfg *pCfg) { + SWal* pWal = malloc(sizeof(SWal)); + if(pWal == NULL) { + return NULL; + } + return pWal; +} int32_t walAlter(SWal *pWal, SWalCfg *pCfg) { return 0; } -void walClose(SWal *pWal) {} +void walClose(SWal *pWal) { + if(pWal) free(pWal); +} void walFsync(SWal *pWal, bool force) {}