From d4cd3836f1e40c9d45459ba766e8a0d46b96b295 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Thu, 21 Oct 2021 22:36:17 +0800 Subject: [PATCH 1/7] add lucene --- cmake/cmake.options | 6 +-- cmake/lucene_CMakeLists.txt.in | 5 +- deps/CMakeLists.txt | 5 ++ include/libs/index/index.h | 62 +++++++++++++++++++++++- source/libs/index/CMakeLists.txt | 25 +++++++++- source/libs/index/inc/indexInt.h | 21 +++++++- source/libs/index/src/index.c | 70 ++++++++++++++++++++++++--- source/libs/index/test/indexTests.cpp | 15 ++++++ source/libs/tkv/src/tkv.c | 4 +- 9 files changed, 196 insertions(+), 17 deletions(-) diff --git a/cmake/cmake.options b/cmake/cmake.options index 74b0d9fdbb..bbecf614d1 100644 --- a/cmake/cmake.options +++ b/cmake/cmake.options @@ -16,17 +16,17 @@ option( option( BUILD_WITH_ROCKSDB "If build with rocksdb" - OFF + OF ) option( BUILD_WITH_LUCENE "If build with lucene" - OFF + on ) option( BUILD_DEPENDENCY_TESTS "If build dependency tests" OFF -) \ No newline at end of file +) diff --git a/cmake/lucene_CMakeLists.txt.in b/cmake/lucene_CMakeLists.txt.in index 91e144dced..436ac64475 100644 --- a/cmake/lucene_CMakeLists.txt.in +++ b/cmake/lucene_CMakeLists.txt.in @@ -1,8 +1,7 @@ # lucene ExternalProject_Add(lucene - GIT_REPOSITORY https://github.com/taosdata-contrib/LucenePlusPlus.git - GIT_TAG rel_3.0.8 + GIT_REPOSITORY https://github.com/yihaoDeng/LucenePlusPlus.git SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/lucene" BINARY_DIR "" #BUILD_IN_SOURCE TRUE @@ -10,4 +9,4 @@ ExternalProject_Add(lucene BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" -) \ No newline at end of file +) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt index e35417b4c5..7392763d03 100644 --- a/deps/CMakeLists.txt +++ b/deps/CMakeLists.txt @@ -65,6 +65,11 @@ endif(${BUILD_WITH_ROCKSDB}) if(${BUILD_WITH_LUCENE}) option(ENABLE_TEST "Enable the tests" OFF) add_subdirectory(lucene) + target_include_directories( + lucene++ + PUBLIC $ + ) + endif(${BUILD_WITH_LUCENE}) # 
================================================================================================ diff --git a/include/libs/index/index.h b/include/libs/index/index.h index f821b437af..bdd0905234 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -16,12 +16,72 @@ #ifndef _TD_INDEX_H_ #define _TD_INDEX_H_ +#include "os.h" +#include "tarray.h" + #ifdef __cplusplus extern "C" { #endif +typedef struct SIndex SIndex; +typedef struct SIndexOpts SIndexOpts; + +typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; +typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; + + + +typedef struct SIndexTermQuery { + EIndexQueryType opera; + SArray *querys; +} SIndexTermQuery; + +// tag and tag val; +typedef struct SIndexPair { + char *key; + char *val; +} SIndexPair; + +// +typedef struct SIndexTerm { + SIndexPair* field_value; + EIndexQueryType type; +} SIndexTerm; + + + +/* + * @param: oper + * +*/ + +SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper); +void indexTermQueryDestroy(SIndexTermQuery *pQuery); +int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); + + + +/* + * @param: + * @param: + */ +SIndex* indexOpen(SIndexOpts *opt, const char *path); + +void indexClose(SIndex *index); +int indexPut(SIndex *index, SArray *pairs, int uid); +int indexDelete(SIndex *index, SIndexTermQuery *query); +int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result); +int indexRebuild(SIndex *index, SIndexOpts *opt); + +/* + * @param: + * @param: + */ +SIndexOpts *indexOptsCreate(); +void indexOptsDestroy(SIndexOpts *opts); + #ifdef __cplusplus } #endif -#endif /*_TD_INDEX_H_*/ \ No newline at end of file +#endif /*_TD_INDEX_H_*/ diff --git a/source/libs/index/CMakeLists.txt b/source/libs/index/CMakeLists.txt index 638d3f64cd..3da2c93b39 100644 --- a/source/libs/index/CMakeLists.txt +++ 
b/source/libs/index/CMakeLists.txt @@ -4,4 +4,27 @@ target_include_directories( index PUBLIC "${CMAKE_SOURCE_DIR}/include/libs/index" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/inc" -) \ No newline at end of file +) +target_link_libraries( + index + PUBLIC os + PUBLIC util +) + +if (${BUILD_WITH_LUCENE}) + target_include_directories( + index + PUBLIC "${CMAKE_SOURCE_DIR}/deps/lucene/include" + ) + LINK_DIRECTORIES("${CMAKE_SOURCE_DIR}/deps/lucene/debug/src/core") + target_link_libraries( + index + PUBLIC lucene++ + ) + +endif(${BUILD_WITH_LUCENE}) + +if (${BUILD_TEST}) + add_subdirectory(test) +endif(${BUILD_TEST}) + diff --git a/source/libs/index/inc/indexInt.h b/source/libs/index/inc/indexInt.h index 81eba4ec91..8d8c950075 100644 --- a/source/libs/index/inc/indexInt.h +++ b/source/libs/index/inc/indexInt.h @@ -16,12 +16,31 @@ #ifndef _TD_INDEX_INT_H_ #define _TD_INDEX_INT_H_ +#include "index.h" + +#ifdef USE_LUCENE +#include +#endif + + #ifdef __cplusplus extern "C" { #endif +struct SIndex { +#ifdef USE_LUCENE + index_t *index; +#endif +}; + +struct SIndexOpts { +#ifdef USE_LUCENE + void *opts; +#endif +}; + #ifdef __cplusplus } #endif -#endif /*_TD_INDEX_INT_H_*/ \ No newline at end of file +#endif /*_TD_INDEX_INT_H_*/ diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index f821b437af..46039249c5 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -13,15 +13,71 @@ * along with this program. If not, see . 
*/ -#ifndef _TD_INDEX_H_ -#define _TD_INDEX_H_ +#include "index.h" +#include "indexInt.h" -#ifdef __cplusplus -extern "C" { +#ifdef USE_LUCENE +#include "lucene++/Lucene_c.h" #endif -#ifdef __cplusplus +static pthread_once_t isInit = PTHREAD_ONCE_INIT; + +static void indexInit(); + +SIndex *indexOpen(SIndexOpts *opts, const char *path) { + pthread_once(&isInit, indexInit); +#ifdef USE_LUCENE + index_t *index = index_open(path); + SIndex *p = malloc(sizeof(SIndex)); + p->index = index; + return p; +#endif + return NULL; } -#endif -#endif /*_TD_INDEX_H_*/ \ No newline at end of file +void indexClose(SIndex *index) { +#ifdef USE_LUCENE + index_close(index->index); +#endif + free(index); + return; + +} +int indexPut(SIndex *index, SArray* field_vals, int uid) { + return 1; + +} +int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result) { + return 1; +} + +int indexDelete(SIndex *index, SIndexTermQuery *query) { + return 1; +} +int indexRebuild(SIndex *index, SIndexOpts *opts); + + +SIndexOpts *indexOptsCreate() { + return NULL; +} +void indexOptsDestroy(SIndexOpts *opts) { + +} +/* + * @param: oper + * +*/ + +SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper) { + return NULL; +} +void indexTermQueryDestroy(SIndexTermQuery *pQuery) { + +} +int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ + return 1; +} + +void indexInit() { + //do nothing +} diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index e69de29bb2..047491838f 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -0,0 +1,15 @@ +#include +#include +#include +#include "index.h" + + + +TEST(IndexTest, index_create_test) { + SIndexOpts *opts = indexOptsCreate(); + SIndex *index = indexOpen(opts, "./"); + if (index == NULL) { + std::cout << "index open failed" << std::endl; + } + indexOptsDestroy(opts); +} diff --git 
a/source/libs/tkv/src/tkv.c b/source/libs/tkv/src/tkv.c index 5319f6b9da..0c6f896d56 100644 --- a/source/libs/tkv/src/tkv.c +++ b/source/libs/tkv/src/tkv.c @@ -158,6 +158,8 @@ static void tkvInit() { #ifdef USE_ROCKSDB defaultReadOpts.ropts = rocksdb_readoptions_create(); defaultWriteOpts.wopts = rocksdb_writeoptions_create(); + rocksdb_writeoptions_disable_WAL(defaultWriteOpts.wopts, true); + #endif } @@ -166,4 +168,4 @@ static void tkvClear() { rocksdb_readoptions_destroy(defaultReadOpts.ropts); rocksdb_writeoptions_destroy(defaultWriteOpts.wopts); #endif -} \ No newline at end of file +} From 9e74ea9ed3cd21052576c9a2194518c54b105fc7 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Fri, 22 Oct 2021 22:47:26 +0800 Subject: [PATCH 2/7] add lucene test --- include/libs/index/index.h | 46 ++++------- source/libs/index/inc/indexInt.h | 21 +++++ source/libs/index/src/index.c | 106 +++++++++++++++++++++++--- source/libs/index/test/indexTests.cpp | 30 +++++++- 4 files changed, 159 insertions(+), 44 deletions(-) diff --git a/include/libs/index/index.h b/include/libs/index/index.h index bdd0905234..346214e0c8 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -25,54 +25,36 @@ extern "C" { typedef struct SIndex SIndex; typedef struct SIndexOpts SIndexOpts; +typedef struct SIndexMultiTermQuery SIndexMultiTermQuery; +typedef struct SArray SIndexMultiTerm; +//typedef struct SIndexMultiTerm SIndexMultiTerm; typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; - -typedef struct SIndexTermQuery { - EIndexQueryType opera; - SArray *querys; -} SIndexTermQuery; - -// tag and tag val; -typedef struct SIndexPair { - char *key; - char *val; -} SIndexPair; - -// -typedef struct SIndexTerm { - SIndexPair* field_value; - EIndexQueryType type; -} SIndexTerm; - - - /* * @param: oper * */ - -SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper); 
-void indexTermQueryDestroy(SIndexTermQuery *pQuery); -int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); - - +SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType oper); +void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery); +int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); /* * @param: * @param: */ SIndex* indexOpen(SIndexOpts *opt, const char *path); +void indexClose(SIndex *index); +int indexPut(SIndex *index, SIndexMultiTerm *terms, int uid); +int indexDelete(SIndex *index, SIndexMultiTermQuery *query); +int indexSearch(SIndex *index, SIndexMultiTermQuery *query, SArray *result); +int indexRebuild(SIndex *index, SIndexOpts *opt); -void indexClose(SIndex *index); -int indexPut(SIndex *index, SArray *pairs, int uid); -int indexDelete(SIndex *index, SIndexTermQuery *query); -int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result); -int indexRebuild(SIndex *index, SIndexOpts *opt); - +SIndexMultiTerm *indexMultiTermCreate(); +int indexMultiTermAdd(SIndexMultiTerm *terms, const char *field, int32_t nFields, const char *value, int32_t nValue); +void indexMultiTermDestroy(SIndexMultiTerm *terms); /* * @param: * @param: diff --git a/source/libs/index/inc/indexInt.h b/source/libs/index/inc/indexInt.h index 8d8c950075..742427bf94 100644 --- a/source/libs/index/inc/indexInt.h +++ b/source/libs/index/inc/indexInt.h @@ -39,6 +39,27 @@ struct SIndexOpts { #endif }; +struct SIndexMultiTermQuery { + EIndexOperatorType opera; + SArray *query; +}; + +// field and key; +typedef struct SIndexTerm { + char *key; + int32_t nKey; + char *val; + int32_t nVal; +} SIndexTerm; + +typedef struct SIndexTermQuery { + SIndexTerm* field_value; + EIndexQueryType type; +} SIndexTermQuery; + + +SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, 
int32_t nVal); +void indexTermDestroy(SIndexTerm *p); #ifdef __cplusplus } #endif diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index 46039249c5..e4b2a4acc4 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -38,46 +38,130 @@ SIndex *indexOpen(SIndexOpts *opts, const char *path) { void indexClose(SIndex *index) { #ifdef USE_LUCENE index_close(index->index); + index->index = NULL; #endif free(index); return; } + +#ifdef USE_LUCENE +#endif int indexPut(SIndex *index, SArray* field_vals, int uid) { +#ifdef USE_LUCENE + index_document_t *doc = index_document_create(); + + char buf[16] = {0}; + sprintf(buf, "%d", uid); + + for (int i = 0; i < taosArrayGetSize(field_vals); i++) { + SIndexTerm *p = taosArrayGetP(field_vals, i); + index_document_add(doc, (const char *)(p->key), p->nKey, (const char *)(p->val), p->nVal, 1); + } + index_document_add(doc, NULL, 0, buf, strlen(buf), 0); + + index_put(index->index, doc); + index_document_destroy(doc); +#endif return 1; } -int indexSearch(SIndex *index, SIndexTermQuery *query, SArray *result) { +int indexSearch(SIndex *index, SIndexMultiTermQuery *multiQuerys, SArray *result) { +#ifdef USE_LUCENE + for (int i = 0; i < taosArrayGetSize(multiQuerys->query); i++) { + SIndexTermQuery *p = taosArrayGet(multiQuerys->query, i); + SIndexTerm *term = p->field_value; + EIndexQueryType qType = p->type; + int *tResult = NULL; + int32_t tsz = 0; + index_search(index->index, term->key, term->nKey, term->val, term->nVal, qType, &tResult, &tsz); + for (int i = 0; i < tsz; i++) { + taosArrayPush(result, &(tResult[i])); + } + + } +#endif return 1; } -int indexDelete(SIndex *index, SIndexTermQuery *query) { +int indexDelete(SIndex *index, SIndexMultiTermQuery *query) { return 1; } int indexRebuild(SIndex *index, SIndexOpts *opts); SIndexOpts *indexOptsCreate() { - return NULL; +#ifdef USE_LUCENE +#endif +return NULL; } void indexOptsDestroy(SIndexOpts *opts) { - +#ifdef USE_LUCENE +#endif 
} /* * @param: oper * */ -SIndexTermQuery *indexTermQueryCreate(EIndexOperatorType oper) { - return NULL; +SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType opera) { + SIndexMultiTermQuery *p = (SIndexMultiTermQuery *)malloc(sizeof(SIndexMultiTermQuery)); + if (p == NULL) { return NULL; } + p->opera = opera; + p->query = taosArrayInit(1, sizeof(SIndexTermQuery)); + return p; } -void indexTermQueryDestroy(SIndexTermQuery *pQuery) { - -} -int indexTermQueryAdd(SIndexTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ - return 1; +void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery) { + for (int i = 0; i < taosArrayGetSize(pQuery->query); i++) { + SIndexTermQuery *p = (SIndexTermQuery *)taosArrayGet(pQuery->query, i); + indexTermDestroy(p->field_value); + } + taosArrayDestroy(pQuery->query); + free(pQuery); +}; +int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type){ + SIndexTerm *t = indexTermCreate(field, nFields, value, nValue); + if (t == NULL) {return -1;} + SIndexTermQuery q = {.type = type, .field_value = t}; + taosArrayPush(pQuery->query, &q); + return 0; } + +SIndexTerm *indexTermCreate(const char *key, int32_t nKey, const char *val, int32_t nVal) { + SIndexTerm *t = (SIndexTerm *)malloc(sizeof(SIndexTerm)); + t->key = (char *)calloc(nKey + 1, 1); + memcpy(t->key, key, nKey); + t->nKey = nKey; + + t->val = (char *)calloc(nVal + 1, 1); + memcpy(t->val, val, nVal); + t->nVal = nVal; + return t; +} +void indexTermDestroy(SIndexTerm *p) { + free(p->key); + free(p->val); + free(p); +} + +SArray *indexMultiTermCreate() { + return taosArrayInit(4, sizeof(SIndexTerm *)); +} + +int indexMultiTermAdd(SArray *array, const char *field, int32_t nField, const char *val, int32_t nVal) { + SIndexTerm *term = indexTermCreate(field, nField, val, nVal); + if (term == NULL) { return -1; } + 
taosArrayPush(array, &term); + return 0; +} +void indexMultiTermDestroy(SArray *array) { + for (int32_t i = 0; i < taosArrayGetSize(array); i++) { + SIndexTerm *p = taosArrayGetP(array, i); + indexTermDestroy(p); + } + taosArrayDestroy(array); +} void indexInit() { //do nothing } diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index 047491838f..efa7f37a60 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -2,14 +2,42 @@ #include #include #include "index.h" +#include "indexInt.h" + TEST(IndexTest, index_create_test) { SIndexOpts *opts = indexOptsCreate(); - SIndex *index = indexOpen(opts, "./"); + SIndex *index = indexOpen(opts, "./test"); if (index == NULL) { std::cout << "index open failed" << std::endl; } + + + SArray* terms = indexMultiTermCreate(); + indexMultiTermAdd(terms, "tag1", strlen("tag1"), "field", strlen("field")); + for (int i = 0; i < 10; i++) { + indexPut(index, terms, i); + } + indexMultiTermDestroy(terms); + + + // query + SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); + indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); + + SArray *result = (SArray *)taosArrayInit(10, sizeof(int)); + indexSearch(index, multiQuery, result); + + std::cout << "taos'size : " << taosArrayGetSize(result) << std::endl; + for (int i = 0; i < taosArrayGetSize(result); i++) { + int *v = (int *)taosArrayGet(result, i); + std::cout << "value --->" << *v << std::endl; + } + indexMultiTermQueryDestroy(multiQuery); + indexOptsDestroy(opts); + indexClose(index); + // } From 551cfd5c35b50899bae7f9c86022021b57ff4c21 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 23 Oct 2021 16:12:36 +0800 Subject: [PATCH 3/7] add index test example --- include/libs/index/index.h | 11 ++++---- source/libs/index/src/index.c | 39 ++++++++++++++++++++------- source/libs/index/test/indexTests.cpp | 25 +++++++++++++---- 3 files changed, 55 
insertions(+), 20 deletions(-) diff --git a/include/libs/index/index.h b/include/libs/index/index.h index 346214e0c8..1b74928568 100644 --- a/include/libs/index/index.h +++ b/include/libs/index/index.h @@ -27,12 +27,9 @@ typedef struct SIndex SIndex; typedef struct SIndexOpts SIndexOpts; typedef struct SIndexMultiTermQuery SIndexMultiTermQuery; typedef struct SArray SIndexMultiTerm; -//typedef struct SIndexMultiTerm SIndexMultiTerm; typedef enum { MUST = 0, SHOULD = 1, NOT = 2 } EIndexOperatorType; -typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; - - +typedef enum { QUERY_TERM = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX = 3} EIndexQueryType; /* * @param: oper * @@ -40,7 +37,6 @@ typedef enum { QUERY_POINT = 0, QUERY_PREFIX = 1, QUERY_SUFFIX = 2,QUERY_REGEX SIndexMultiTermQuery *indexMultiTermQueryCreate(EIndexOperatorType oper); void indexMultiTermQueryDestroy(SIndexMultiTermQuery *pQuery); int indexMultiTermQueryAdd(SIndexMultiTermQuery *pQuery, const char *field, int32_t nFields, const char *value, int32_t nValue, EIndexQueryType type); - /* * @param: * @param: @@ -51,7 +47,10 @@ int indexPut(SIndex *index, SIndexMultiTerm *terms, int uid); int indexDelete(SIndex *index, SIndexMultiTermQuery *query); int indexSearch(SIndex *index, SIndexMultiTermQuery *query, SArray *result); int indexRebuild(SIndex *index, SIndexOpts *opt); - +/* + * @param + * @param + */ SIndexMultiTerm *indexMultiTermCreate(); int indexMultiTermAdd(SIndexMultiTerm *terms, const char *field, int32_t nFields, const char *value, int32_t nValue); void indexMultiTermDestroy(SIndexMultiTerm *terms); diff --git a/source/libs/index/src/index.c b/source/libs/index/src/index.c index e4b2a4acc4..91cfcb5cdf 100644 --- a/source/libs/index/src/index.c +++ b/source/libs/index/src/index.c @@ -68,18 +68,39 @@ int indexPut(SIndex *index, SArray* field_vals, int uid) { } int indexSearch(SIndex *index, SIndexMultiTermQuery *multiQuerys, SArray *result) 
{ #ifdef USE_LUCENE - for (int i = 0; i < taosArrayGetSize(multiQuerys->query); i++) { + EIndexOperatorType opera = multiQuerys->opera; + + int nQuery = taosArrayGetSize(multiQuerys->query); + char **fields = malloc(sizeof(char *) * nQuery); + char **keys = malloc(sizeof(char *) * nQuery); + int *types = malloc(sizeof(int) * nQuery); + + for (int i = 0; i < nQuery; i++) { SIndexTermQuery *p = taosArrayGet(multiQuerys->query, i); SIndexTerm *term = p->field_value; - EIndexQueryType qType = p->type; - int *tResult = NULL; - int32_t tsz = 0; - index_search(index->index, term->key, term->nKey, term->val, term->nVal, qType, &tResult, &tsz); - for (int i = 0; i < tsz; i++) { - taosArrayPush(result, &(tResult[i])); - } - + + fields[i] = calloc(1, term->nKey + 1); + keys[i] = calloc(1, term->nVal + 1); + + memcpy(fields[i], term->key, term->nKey); + memcpy(keys[i], term->val, term->nVal); + types[i] = (int)(p->type); } + int *tResult = NULL; + int tsz= 0; + index_multi_search(index->index, (const char **)fields, (const char **)keys, types, nQuery, opera, &tResult, &tsz); + + for (int i = 0; i < tsz; i++) { + taosArrayPush(result, &tResult[i]); + } + + for (int i = 0; i < nQuery; i++) { + free(fields[i]); + free(keys[i]); + } + free(fields); + free(keys); + free(types); #endif return 1; } diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index efa7f37a60..763a6a54d3 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -15,17 +15,32 @@ TEST(IndexTest, index_create_test) { } - SArray* terms = indexMultiTermCreate(); - indexMultiTermAdd(terms, "tag1", strlen("tag1"), "field", strlen("field")); - for (int i = 0; i < 10; i++) { - indexPut(index, terms, i); + + // write + for (int i = 0; i < 100000; i++) { + SIndexMultiTerm* terms = indexMultiTermCreate(); + std::string val = "field"; + + indexMultiTermAdd(terms, "tag1", strlen("tag1"), val.c_str(), val.size()); + + 
val.append(std::to_string(i)); + indexMultiTermAdd(terms, "tag2", strlen("tag2"), val.c_str(), val.size()); + + val.insert(0, std::to_string(i)); + indexMultiTermAdd(terms, "tag3", strlen("tag3"), val.c_str(), val.size()); + + val.append("const"); + indexMultiTermAdd(terms, "tag4", strlen("tag4"), val.c_str(), val.size()); + + indexPut(index, terms, i); + indexMultiTermDestroy(terms); } - indexMultiTermDestroy(terms); // query SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); + indexMultiTermQueryAdd(multiQuery, "tag3", strlen("tag3"), "0field0", strlen("0field0"), QUERY_TERM); SArray *result = (SArray *)taosArrayInit(10, sizeof(int)); indexSearch(index, multiQuery, result); From 8721dc4408d673dd9538c793b5fb970bfa5ecfcf Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 25 Oct 2021 15:14:58 +0800 Subject: [PATCH 4/7] add index test example --- source/libs/index/test/CMakeLists.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 source/libs/index/test/CMakeLists.txt diff --git a/source/libs/index/test/CMakeLists.txt b/source/libs/index/test/CMakeLists.txt new file mode 100644 index 0000000000..f2a7442a5a --- /dev/null +++ b/source/libs/index/test/CMakeLists.txt @@ -0,0 +1,23 @@ +add_executable(indexTest "") +target_sources(indexTest + PRIVATE + "../src/index.c" + "indexTests.cpp" +) +target_include_directories ( indexTest + PUBLIC + "${CMAKE_SOURCE_DIR}/include/libs/index" + "${CMAKE_CURRENT_SOURCE_DIR}/../inc" +) +target_link_libraries (indexTest + os + util + common + gtest_main + index +) + +add_test( + NAME index_test + COMMAND indexTest +) From d2485c4c8a9caaf9a588e11a3b7bc1ca78977d3e Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 25 Oct 2021 15:17:07 +0800 Subject: [PATCH 5/7] add index test example --- CMakeLists.txt | 1 + source/libs/index/test/indexTests.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 
deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abb39c310a..3a2c29ae8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ endif(${BUILD_WITH_ROCKSDB}) ## lucene if(${BUILD_WITH_LUCENE}) cat("${CMAKE_SUPPORT_DIR}/lucene_CMakeLists.txt.in" ${DEPS_TMP_FILE}) + add_definitions(-DUSE_LUCENE) endif(${BUILD_WITH_LUCENE}) ## download dependencies diff --git a/source/libs/index/test/indexTests.cpp b/source/libs/index/test/indexTests.cpp index 763a6a54d3..cc0df0d42a 100644 --- a/source/libs/index/test/indexTests.cpp +++ b/source/libs/index/test/indexTests.cpp @@ -15,7 +15,6 @@ TEST(IndexTest, index_create_test) { } - // write for (int i = 0; i < 100000; i++) { SIndexMultiTerm* terms = indexMultiTermCreate(); @@ -32,6 +31,7 @@ TEST(IndexTest, index_create_test) { val.append("const"); indexMultiTermAdd(terms, "tag4", strlen("tag4"), val.c_str(), val.size()); + indexPut(index, terms, i); indexMultiTermDestroy(terms); } @@ -39,6 +39,7 @@ TEST(IndexTest, index_create_test) { // query SIndexMultiTermQuery *multiQuery = indexMultiTermQueryCreate(MUST); + indexMultiTermQueryAdd(multiQuery, "tag1", strlen("tag1"), "field", strlen("field"), QUERY_PREFIX); indexMultiTermQueryAdd(multiQuery, "tag3", strlen("tag3"), "0field0", strlen("0field0"), QUERY_TERM); From 980ace09b54144c1ee3d2d0d1c024d2a5fabe3ab Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Mon, 15 Nov 2021 11:36:51 +0800 Subject: [PATCH 6/7] add interface --- source/libs/index/inc/index_fst.h | 166 +++++++++++++ source/libs/index/src/index_fst.c | 48 ++++ source/libs/index/src/index_fst_common.c | 304 +++++++++++++++++++++++ 3 files changed, 518 insertions(+) create mode 100644 source/libs/index/inc/index_fst.h create mode 100644 source/libs/index/src/index_fst.c create mode 100644 source/libs/index/src/index_fst_common.c diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h new file mode 100644 index 0000000000..de4c957e29 --- /dev/null +++ 
b/source/libs/index/inc/index_fst.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef _INDEX_FST_H_ +#define _INDEX_FST_H_ +#include "index_fst.h" +#include "tarray.h" + +typedef FstType uint64_t; +typedef CompiledAddr uint64_t; +typedef Output uint64_t; +typedef PackSizes uint8_t; + + +//A sentinel value used to indicate an empty final state +const CompileAddr EMPTY_ADDRESS = 0; +/// A sentinel value used to indicate an invalid state. +const CompileAddr NONE_ADDRESS = 1; + +// This version number is written to every finite state transducer created by +// this crate. When a finite state transducer is read, its version number is +// checked against this value. +const uint64_t version = 3; +// The threshold (in number of transitions) at which an index is created for +// a node's transitions. This speeds up lookup time at the expense of FST size + +const uint64_t TRANS_INDEX_THRESHOLD = 32; + +typedef struct FstRange { + uint64_t start; + uint64_t end; +} FstRange; + +enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal}; +enum FstBound { Included, Excluded, Unbounded}; + +typedef struct CheckSummer { + uint32_t sum; +}; + + +typedef struct FstBuilder { + FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. + FstUnFinishedNodes unfinished // The stack of unfinished nodes + Registry registry // A map of finished nodes. 
+ SArray* last // The last word added + CompiledAddr lastAddr // The address of the last compiled node + uint64_t len // num of keys added +} FstBuilder; + +typedef struct FstCountingWriter { + void* wtr; // wrap any writer that counts and checksum bytes written + uint64_t count; + CheckSummer summer; +}; + + + + +typedef struct FstTransition { + uint8_t inp; //The byte input associated with this transition. + Output out; //The output associated with this transition + CompiledAddr addr; //The address of the node that this transition points to +} FstTransition; + +typedef struct FstTransitions { + FstNode *node; + FstRange range; +} FstTransitions; + +typedef struct FstUnFinishedNodes { + SArray *stack; // +} FstUnFinishedNodes; + +typedef struct FstBuilderNode { + bool isFinal; + Output finalOutput; + SArray *trans; // +} FstBuilderNode; + + + +typedef struct FstLastTransition { + uint8_t inp; + Output out; +} FstLastTransition; + +typedef struct FstBuilderNodeUnfinished { + FstBuilderNode node; + FstLastTransition last; +} FstBuilderNodeUnfinished; + +typedef struct FstNode { + uint8_t* data; + uint64_t version; + State state; + CompiledAddr start; + CompiledAddr end; + bool isFinal; + uint64_t nTrans; + PackSizes sizes; + Output finalOutput; +} FstNode; + +typedef struct FstMeta { + uint64_t version; + CompiledAddr rootAddr; + FstType ty; + uint64_t len; + uint32_t checkSum; +} FstMeta; + +typedef struct Fst { + FstMeta meta; + void *data; // +}; + +// ops + +typedef struct FstIndexedValue { + uint64_t index; + uint64_t value; +}; + +// relate to Regist +typedef struct FstRegistry { + SArray *table; // + uint64_t tableSize; // num of rows + uint64_t mruSize; // num of columns +} FstRegistry; + +typedef struct FstRegistryCache { + SArray *cells; // +} FstRegistryCache; + +typedef struct FstRegistryCell { + CompiledAddr addr; + FstBuilderNode *node; +} FstRegistryCell; + +enum FstRegistryEntry {Found, NotFound, Rejected}; + +FstNode *fstNodeCreate(int64_t version, 
CompiledAddr addr, uint8_t *data); +FstTransitions fstNodeTransitionIter(FstNode *node); +FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i); +CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i); +int64_t fstNodeFindInput(FstNode *node, int8_t b); +Output fstNodeGetFinalOutput(FstNode *node); +void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode); + + + + +#endif diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c new file mode 100644 index 0000000000..4c6e20a7d5 --- /dev/null +++ b/source/libs/index/src/index_fst.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#include "index_fst.h" + +// fst node function +FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) { + FstNode *n = (FstNode *)malloc(sizeof(FstNode)); + if (n == NULL) { return NULL; } + + if (addr == EMPTY_ADDRESS) { + n->date = NULL; + n->version = version; + n->state = EmptyFinal; + n->start = EMPTY_ADDRESS; + n->end = EMPTY_ADDRESS; + n->isFinal = true; + n->nTrans = 0; + n->sizes = 0; + n->finalOutpu = 0; + return n; + } + uint8_t v = (data[addr] & 0b1100000) >> 6; + if (v == 0b11) { + + } else if (v == 0b10) { + + } else { + + } + + +} + + + diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c new file mode 100644 index 0000000000..39e5f89b35 --- /dev/null +++ b/source/libs/index/src/index_fst_common.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +const uint8_t COMMON_INPUTS[] = { + 84, // '\x00' + 85, // '\x01' + 86, // '\x02' + 87, // '\x03' + 88, // '\x04' + 89, // '\x05' + 90, // '\x06' + 91, // '\x07' + 92, // '\x08' + 93, // '\t' + 94, // '\n' + 95, // '\x0b' + 96, // '\x0c' + 97, // '\r' + 98, // '\x0e' + 99, // '\x0f' + 100, // '\x10' + 101, // '\x11' + 102, // '\x12' + 103, // '\x13' + 104, // '\x14' + 105, // '\x15' + 106, // '\x16' + 107, // '\x17' + 108, // '\x18' + 109, // '\x19' + 110, // '\x1a' + 111, // '\x1b' + 112, // '\x1c' + 113, // '\x1d' + 114, // '\x1e' + 115, // '\x1f' + 116, // ' ' + 80, // '!' 
+ 117, // '"' + 118, // '#' + 79, // '$' + 39, // '%' + 30, // '&' + 81, // "'" + 75, // '(' + 74, // ')' + 82, // '*' + 57, // '+' + 66, // ',' + 16, // '-' + 12, // '.' + 2, // '/' + 19, // '0' + 20, // '1' + 21, // '2' + 27, // '3' + 32, // '4' + 29, // '5' + 35, // '6' + 36, // '7' + 37, // '8' + 34, // '9' + 24, // ':' + 73, // ';' + 119, // '<' + 23, // '=' + 120, // '>' + 40, // '?' + 83, // '@' + 44, // 'A' + 48, // 'B' + 42, // 'C' + 43, // 'D' + 49, // 'E' + 46, // 'F' + 62, // 'G' + 61, // 'H' + 47, // 'I' + 69, // 'J' + 68, // 'K' + 58, // 'L' + 56, // 'M' + 55, // 'N' + 59, // 'O' + 51, // 'P' + 72, // 'Q' + 54, // 'R' + 45, // 'S' + 52, // 'T' + 64, // 'U' + 65, // 'V' + 63, // 'W' + 71, // 'X' + 67, // 'Y' + 70, // 'Z' + 77, // '[' + 121, // '\\' + 78, // ']' + 122, // '^' + 31, // '_' + 123, // '`' + 4, // 'a' + 25, // 'b' + 9, // 'c' + 17, // 'd' + 1, // 'e' + 26, // 'f' + 22, // 'g' + 13, // 'h' + 7, // 'i' + 50, // 'j' + 38, // 'k' + 14, // 'l' + 15, // 'm' + 10, // 'n' + 3, // 'o' + 8, // 'p' + 60, // 'q' + 6, // 'r' + 5, // 's' + 0, // 't' + 18, // 'u' + 33, // 'v' + 11, // 'w' + 41, // 'x' + 28, // 'y' + 53, // 'z' + 124, // '{' + 125, // '|' + 126, // '}' + 76, // '~' + 127, // '\x7f' + 128, // '\x80' + 129, // '\x81' + 130, // '\x82' + 131, // '\x83' + 132, // '\x84' + 133, // '\x85' + 134, // '\x86' + 135, // '\x87' + 136, // '\x88' + 137, // '\x89' + 138, // '\x8a' + 139, // '\x8b' + 140, // '\x8c' + 141, // '\x8d' + 142, // '\x8e' + 143, // '\x8f' + 144, // '\x90' + 145, // '\x91' + 146, // '\x92' + 147, // '\x93' + 148, // '\x94' + 149, // '\x95' + 150, // '\x96' + 151, // '\x97' + 152, // '\x98' + 153, // '\x99' + 154, // '\x9a' + 155, // '\x9b' + 156, // '\x9c' + 157, // '\x9d' + 158, // '\x9e' + 159, // '\x9f' + 160, // '\xa0' + 161, // '¡' + 162, // '¢' + 163, // '£' + 164, // '¤' + 165, // '¥' + 166, // '¦' + 167, // '§' + 168, // '¨' + 169, // '©' + 170, // 'ª' + 171, // '«' + 172, // '¬' + 173, // '\xad' + 174, // '®' + 175, // 
'¯' + 176, // '°' + 177, // '±' + 178, // '²' + 179, // '³' + 180, // '´' + 181, // 'µ' + 182, // '¶' + 183, // '·' + 184, // '¸' + 185, // '¹' + 186, // 'º' + 187, // '»' + 188, // '¼' + 189, // '½' + 190, // '¾' + 191, // '¿' + 192, // 'À' + 193, // 'Á' + 194, // 'Â' + 195, // 'Ã' + 196, // 'Ä' + 197, // 'Å' + 198, // 'Æ' + 199, // 'Ç' + 200, // 'È' + 201, // 'É' + 202, // 'Ê' + 203, // 'Ë' + 204, // 'Ì' + 205, // 'Í' + 206, // 'Î' + 207, // 'Ï' + 208, // 'Ð' + 209, // 'Ñ' + 210, // 'Ò' + 211, // 'Ó' + 212, // 'Ô' + 213, // 'Õ' + 214, // 'Ö' + 215, // '×' + 216, // 'Ø' + 217, // 'Ù' + 218, // 'Ú' + 219, // 'Û' + 220, // 'Ü' + 221, // 'Ý' + 222, // 'Þ' + 223, // 'ß' + 224, // 'à' + 225, // 'á' + 226, // 'â' + 227, // 'ã' + 228, // 'ä' + 229, // 'å' + 230, // 'æ' + 231, // 'ç' + 232, // 'è' + 233, // 'é' + 234, // 'ê' + 235, // 'ë' + 236, // 'ì' + 237, // 'í' + 238, // 'î' + 239, // 'ï' + 240, // 'ð' + 241, // 'ñ' + 242, // 'ò' + 243, // 'ó' + 244, // 'ô' + 245, // 'õ' + 246, // 'ö' + 247, // '÷' + 248, // 'ø' + 249, // 'ù' + 250, // 'ú' + 251, // 'û' + 252, // 'ü' + 253, // 'ý' + 254, // 'þ' + 255, // 'ÿ' +}; + +char const COMMON_INPUTS_INV[] = [ + 't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w', + '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=', + ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6', + '7', '8', 'k', '%', '?', 'x', 'C', 'D', 'A', 'S', 'F', 'I', + 'B', 'E', 'j', 'P', 'T', 'z', 'R', 'N', 'M', '+', 'L', 'O', + 'q', 'H', 'G', 'W', 'U', 'V', ',', 'Y', 'K', 'J', 'Z', 'X', + 'Q', ';', ')', '(', '~', '[', ']', '$', '!', '\'', '*', '@', + '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', + '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', + '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', + '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '"', + '#', '<', '>', '\\', '^', '`', '{', '|', '}','\x7f','\x80', + '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', + 
'\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', + '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', + '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', + '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8', + '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0', + '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8', + '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0', + '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8', + '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0', + '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8', + '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0', + '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8', + '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', + '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', + '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', +]; + From 60e339b31df337d3b09f4d23c99ab09b97820f62 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 20 Nov 2021 13:13:21 +0800 Subject: [PATCH 7/7] fst core struct --- source/libs/index/inc/index_fst.h | 158 ++++++----- source/libs/index/inc/index_fst_automation.h | 42 +++ source/libs/index/inc/index_fst_node.h | 22 ++ source/libs/index/inc/index_fst_registry.h | 24 ++ source/libs/index/inc/index_fst_util.h | 82 ++++++ source/libs/index/src/index_fst.c | 274 ++++++++++++++++++- source/libs/index/src/index_fst_automation.c | 14 + source/libs/index/src/index_fst_common.c | 6 +- source/libs/index/src/index_fst_node.c | 15 + source/libs/index/src/index_fst_registry.c | 17 ++ source/libs/index/src/index_fst_util.c | 115 ++++++++ 11 files changed, 683 insertions(+), 86 deletions(-) create mode 100644 source/libs/index/inc/index_fst_automation.h create mode 100644 source/libs/index/inc/index_fst_node.h create mode 100644 source/libs/index/inc/index_fst_registry.h create mode 100644 
source/libs/index/inc/index_fst_util.h create mode 100644 source/libs/index/src/index_fst_automation.c create mode 100644 source/libs/index/src/index_fst_node.c create mode 100644 source/libs/index/src/index_fst_registry.c create mode 100644 source/libs/index/src/index_fst_util.c diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h index de4c957e29..61c857ed74 100644 --- a/source/libs/index/inc/index_fst.h +++ b/source/libs/index/inc/index_fst.h @@ -13,58 +13,73 @@ * along with this program. If not, see . */ -#ifndef _INDEX_FST_H_ -#define _INDEX_FST_H_ -#include "index_fst.h" +#ifndef __INDEX_FST_H__ +#define __INDEX_FST_H__ + + #include "tarray.h" - -typedef FstType uint64_t; -typedef CompiledAddr uint64_t; -typedef Output uint64_t; -typedef PackSizes uint8_t; +#include "index_fst_util.h" +#include "index_fst_registry.h" -//A sentinel value used to indicate an empty final state -const CompileAddr EMPTY_ADDRESS = 0; -/// A sentinel value used to indicate an invalid state. -const CompileAddr NONE_ADDRESS = 1; +typedef struct FstNode FstNode; +#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a) -// This version number is written to every finite state transducer created by -// this crate. When a finite state transducer is read, its version number is -// checked against this value. -const uint64_t version = 3; -// The threshold (in number of transitions) at which an index is created for -// a node's transitions. 
This speeds up lookup time at the expense of FST size - -const uint64_t TRANS_INDEX_THRESHOLD = 32; typedef struct FstRange { uint64_t start; uint64_t end; } FstRange; -enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal}; -enum FstBound { Included, Excluded, Unbounded}; -typedef struct CheckSummer { - uint32_t sum; -}; +typedef struct FstBuilderNode { + bool isFinal; + Output finalOutput; + SArray *trans; // +} FstBuilderNode; + +typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State; +typedef enum { Included, Excluded, Unbounded} FstBound; + +typedef uint32_t CheckSummer; -typedef struct FstBuilder { - FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. - FstUnFinishedNodes unfinished // The stack of unfinished nodes - Registry registry // A map of finished nodes. - SArray* last // The last word added - CompiledAddr lastAddr // The address of the last compiled node - uint64_t len // num of keys added -} FstBuilder; +/* + * + * UnFinished node and helper function + * TODO: simple function name + */ +typedef struct FstUnFinishedNodes { + SArray *stack; // } FstUnFinishedNodes; +} FstUnFinishedNodes; + +#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize(nodes->stack) + +FstUnFinishedNodes *FstUnFinishedNodesCreate(); +void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal); +FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes); +FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr); +FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes); +void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out); +void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr); +void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out); +uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs); +uint64_t FstUnFinishedNodesFindCommPreifxAndSetOutput(FstUnFinishedNodes 
*node, FstSlice bs, Output in, Output *out); typedef struct FstCountingWriter { void* wtr; // wrap any writer that counts and checksum bytes written uint64_t count; CheckSummer summer; -}; +} FstCountingWriter; + +typedef struct FstBuilder { + FstCountingWriter wtr; // The FST raw data is written directly to `wtr`. + FstUnFinishedNodes *unfinished; // The stack of unfinished nodes + FstRegistry registry; // A map of finished nodes. + SArray* last; // The last word added + CompiledAddr lastAddr; // The address of the last compiled node + uint64_t len; // num of keys added +} FstBuilder; @@ -80,16 +95,6 @@ typedef struct FstTransitions { FstRange range; } FstTransitions; -typedef struct FstUnFinishedNodes { - SArray *stack; // -} FstUnFinishedNodes; - -typedef struct FstBuilderNode { - bool isFinal; - Output finalOutput; - SArray *trans; // -} FstBuilderNode; - typedef struct FstLastTransition { @@ -97,13 +102,23 @@ typedef struct FstLastTransition { Output out; } FstLastTransition; +/* + * FstBuilderNodeUnfinished and helper function + * TODO: simple function name + */ typedef struct FstBuilderNodeUnfinished { - FstBuilderNode node; - FstLastTransition last; + FstBuilderNode *node; + FstLastTransition* last; } FstBuilderNodeUnfinished; +void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr); +void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, CompiledAddr addr); + +/* + * FstNode and helper function + */ typedef struct FstNode { - uint8_t* data; + FstSlice data; uint64_t version; State state; CompiledAddr start; @@ -114,6 +129,28 @@ typedef struct FstNode { Output finalOutput; } FstNode; +// If this node is final and has a terminal output value, then it is, returned. Otherwise, a zero output is returned +#define FST_NODE_FINAL_OUTPUT(node) node->finalOutput +// Returns true if and only if this node corresponds to a final or "match", state in the finite state transducer. 
+#define FST_NODE_IS_FINAL(node) node->isFinal +// Returns the number of transitions in this node, The maximum number of transitions is 256. +#define FST_NODE_LEN(node) node->nTrans +// Returns true if and only if this node has zero transitions. +#define FST_NODE_IS_EMPTYE(node) (node->nTrans == 0) +// Return the address of this node. +#define FST_NODE_ADDR(node) node->start + +FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data); +FstTransitions fstNodeTransitionIter(FstNode *node); +FstTransitions* fstNodeTransitions(FstNode *node); +bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res); +bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res); +bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res); +bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode); +FstSlice fstNodeAsSlice(FstNode *node); + + + typedef struct FstMeta { uint64_t version; CompiledAddr rootAddr; @@ -125,42 +162,21 @@ typedef struct FstMeta { typedef struct Fst { FstMeta meta; void *data; // -}; +} Fst; -// ops +// ops typedef struct FstIndexedValue { uint64_t index; uint64_t value; -}; +} FstIndexedValue; -// relate to Regist -typedef struct FstRegistry { - SArray *table; // - uint64_t tableSize; // num of rows - uint64_t mruSize; // num of columns -} FstRegistry; - -typedef struct FstRegistryCache { - SArray *cells; // -} FstRegistryCache; typedef struct FstRegistryCell { CompiledAddr addr; FstBuilderNode *node; } FstRegistryCell; -enum FstRegistryEntry {Found, NotFound, Rejected}; - -FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data); -FstTransitions fstNodeTransitionIter(FstNode *node); -FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i); -CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i); -int64_t fstNodeFindInput(FstNode *node, int8_t b); -Output fstNodeGetFinalOutput(FstNode *node); -void* 
fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode);
-
-


 #endif
diff --git a/source/libs/index/inc/index_fst_automation.h b/source/libs/index/inc/index_fst_automation.h
new file mode 100644
index 0000000000..7ad9a500cc
--- /dev/null
+++ b/source/libs/index/inc/index_fst_automation.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+#ifndef __INDEX_FST_AUTAOMATION_H__
+#define __INDEX_FST_AUTAOMATION_H__
+
+typedef struct AutomationCtx AutomationCtx;  // forward typedef: the wrappers below name the type before it is defined
+
+typedef struct StartWith {
+  AutomationCtx *autoSelf;
+} StartWith;
+
+typedef struct Complement {
+  AutomationCtx *autoSelf;
+} Complement;
+
+// automation: opaque context handed to every callback below
+struct AutomationCtx {
+  void *data;
+};
+
+// automation interface
+void (*start)(AutomationCtx *ctx);
+bool (*isMatch)(AutomationCtx *ctx);
+bool (*canMatch)(AutomationCtx *ctx, void *data);
+bool (*willAlwaysMatch)(AutomationCtx *ctx, void *state);
+void* (*accpet)(AutomationCtx *ctx, void *state, uint8_t byte);
+void* (*accpetEof)(AutomationCtx *ctx, void *state);
+// NOTE(review): the six declarations above define global function-pointer variables in a header, not prototypes - confirm intent
+
+#endif
diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h
new file mode 100644
index 0000000000..ba2d2ccd02
--- /dev/null
+++ b/source/libs/index/inc/index_fst_node.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#ifndef __INDEX_FST_NODE_H__ +#define __INDEX_FST_NODE_H__ + + + + +#endif diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h new file mode 100644 index 0000000000..6dcb236f29 --- /dev/null +++ b/source/libs/index/inc/index_fst_registry.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#ifndef __FST_REGISTRY_H__ +#define __FST_REGISTRY_H__ + +#include "index_fst_util.h" + + +typedef struct FstRegistry { + +} FstRegistry; +#endif diff --git a/source/libs/index/inc/index_fst_util.h b/source/libs/index/inc/index_fst_util.h new file mode 100644 index 0000000000..fc7dd44637 --- /dev/null +++ b/source/libs/index/inc/index_fst_util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. 
+ * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + + +#ifndef __INDEX_FST_UTIL_H__ +#define __INDEX_FST_UTIL_H__ + +#include "tarray.h" + + +typedef uint64_t FstType; +typedef uint64_t CompiledAddr; +typedef uint64_t Output; +typedef uint8_t PackSizes; + + +//A sentinel value used to indicate an empty final state +extern const CompiledAddr EMPTY_ADDRESS; +/// A sentinel value used to indicate an invalid state. +extern const CompiledAddr NONE_ADDRESS; + +// This version number is written to every finite state transducer created by +// this crate. When a finite state transducer is read, its version number is +// checked against this value. +extern const uint64_t version; +// The threshold (in number of transitions) at which an index is created for +// a node's transitions. This speeds up lookup time at the expense of FST size + +extern const uint64_t TRANS_INDEX_THRESHOLD; +// high 4 bits is transition address packed size. +// low 4 bits is output value packed size. 
+//
+// `0` is a legal value which means there are no transitions/outputs
+
+#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do { (v) = ((v) & 0b00001111) | ((sz) << 4); } while(0)
+#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0b11110000) >> 4)
+#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { (v) = ((v) & 0b11110000) | (sz); } while(0)
+#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0b00001111)
+
+#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1]
+
+#define COMMON_INDEX(v, max, val) do { \
+  val = ((uint16_t)COMMON_INPUTS[v] + 1)%256; \
+  val = val > max ? 0: val; \
+} while(0)
+
+
+//uint8_t commonInput(uint8_t idx);
+//uint8_t commonIdx(uint8_t v, uint8_t max);
+
+uint8_t packSize(uint64_t n);
+uint64_t unpackUint64(uint8_t *ch, uint8_t sz);
+uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr);
+CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr);
+
+
+
+typedef struct FstSlice {
+  uint8_t *data;
+  uint64_t dLen;
+  uint32_t start;
+  uint32_t end;
+} FstSlice;
+
+FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end);
+FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen);
+bool fstSliceEmpty(FstSlice *slice);
+
+
+#endif
diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c
index 4c6e20a7d5..2974e7f9b5 100644
--- a/source/libs/index/src/index_fst.c
+++ b/source/libs/index/src/index_fst.c
@@ -15,13 +15,143 @@

 #include "index_fst.h"

+// Build the stack of unfinished nodes, seeded with one empty non-final node; returns NULL on allocation failure.
+FstUnFinishedNodes *fstUnFinishedNodesCreate() {
+  FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes));
+  if (nodes == NULL) { return NULL; }
+  nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished));
+  if (nodes->stack == NULL) { free(nodes); return NULL; }
+  fstUnFinishedNodesPushEmpty(nodes, false);
+  return nodes;
+}
+void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) {  // push a node with no transitions and no pending one
+  FstBuilderNode *node = malloc(sizeof(FstBuilderNode));
+  if (node == NULL) { return; }  // nothing is pushed when the allocation fails
+  node->isFinal = isFinal;
+  node->finalOutput = 0;
+  node->trans = NULL;
+  FstBuilderNodeUnfinished un = {.node = node, .last = NULL};
+  taosArrayPush(nodes->stack, &un);
+
+}
+FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) {  // only valid when the root is the sole entry
+  assert(taosArrayGetSize(nodes->stack) == 1);
+
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  assert(un->last == NULL);
+  return un->node;
+}
+// Pop the top entry, hand addr to fstBuilderNodeUnfinishedLastCompiled for it, release its pending transition, return its node.
+FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  fstBuilderNodeUnfinishedLastCompiled(un, addr);
+  free(un->last); // TODO add func FstLastTransitionFree()
+  return un->node;
+}
+// Pop the top entry, which must not have a pending transition, and return its node.
+FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) {
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  assert(un->last == NULL);
+  return un->node;
+
+}
+void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) {  // mark entry 0 final with output `out`
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, 0);
+  un->node->isFinal = true;
+  un->node->finalOutput = out;
+  //un->node->trans = NULL;
+}
+void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {  // like PopFreeze, but the entry stays on the stack
+  size_t sz = taosArrayGetSize(nodes->stack) - 1;
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz);
+  fstBuilderNodeUnfinishedLastCompiled(un, addr);
+}
+void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) {  // one pending transition per byte of bs
+  FstSlice *s = &bs;
+  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
+    return;
+  }
+  size_t sz = taosArrayGetSize(nodes->stack) - 1;
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz);
+  assert(un->last == NULL);
+
+  FstLastTransition *trn = malloc(sizeof(FstLastTransition));
+  if (trn == NULL) { return; }
+  trn->inp = s->data[s->start];
+  trn->out = out;
+
+  un->last = trn;  // the first byte hangs off the node that is currently on top
+
+  for (uint64_t i = (uint64_t)s->start + 1; i <= s->end; i++) {  // remaining bytes each get a new node
+    FstBuilderNode *n = malloc(sizeof(FstBuilderNode));
+    FstLastTransition *next = malloc(sizeof(FstLastTransition));
+    if (n == NULL || next == NULL) { free(n); free(next); return; }
+    n->isFinal = false;
+    n->finalOutput = 0;
+    n->trans = NULL;
+    next->inp = s->data[i];
+    next->out = 0;  // the whole output already rides on the first transition of the suffix
+
+    FstBuilderNodeUnfinished item = {.node = n, .last = next};
+    taosArrayPush(nodes->stack, &item);
+  }
+  fstUnFinishedNodesPushEmpty(nodes, true);
+}
+
+// Count how many leading bytes of bs equal the pending transition inputs, walking the stack from the bottom.
+uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) {
+  FstSlice *s = &bs;
+
+  size_t lsz = (size_t)(s->end - s->start + 1); // data len
+  size_t ssz = taosArrayGetSize(node->stack); // stack size
+
+  uint64_t count = 0;
+  for (size_t i = 0; i < ssz && i < lsz; i++) {
+    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
+    if (un->last != NULL && un->last->inp == s->data[s->start + i]) {  // an entry without a pending transition ends the prefix
+      count++;
+    } else {
+      break;
+    }
+  }
+  return count;
+}
+uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) {
+  FstSlice *s = &bs;
+
+  size_t lsz = (size_t)(s->end - s->start + 1); // data len
+  size_t ssz = taosArrayGetSize(node->stack); // stack size
+
+  uint64_t res = 0;
+  // Returns the matched prefix length; the part of `in` not absorbed by the shared transitions is stored in *out.
+  for (size_t i = 0; i < lsz && i < ssz; i++) {
+    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
+    FstLastTransition *last = un->last;
+    if (last == NULL || last->inp != s->data[s->start + i]) { break; }
+    res++;
+    uint64_t commPrefix = (last->out < in) ? last->out : in;
+    uint64_t addPrefix = last->out - commPrefix;
+    in = in - commPrefix;
+    last->out = commPrefix;
+    if (addPrefix != 0 && i + 1 < ssz) {
+      // what this transition no longer carries is pushed down to the node it leads to
+      fstBuilderNodeUnfinishedAddOutputPrefix(taosArrayGet(node->stack, i + 1), addPrefix);
+    }
+  }
+  if (out != NULL) { *out = in; }
+  return res;
+}
+
 // fst node function
-FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
+// Allocate an FstNode describing the node stored at `addr` inside `slice`.
+// EMPTY_ADDRESS yields the empty final node without touching `slice`; otherwise the top two bits
+// of slice->data[addr] select OneTransNext (0b11), OneTrans (0b10) or AnyTrans. Returns NULL on allocation failure.
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) {
   FstNode *n = (FstNode *)malloc(sizeof(FstNode));
   if (n == NULL) { return NULL; }

   if (addr == EMPTY_ADDRESS) {
-    n->date = NULL;
+    n->data = fstSliceCreate(NULL, 0);
     n->version = version;
     n->state = EmptyFinal;
     n->start = EMPTY_ADDRESS;
@@ -29,20 +159,138 @@ FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
     n->isFinal = true;
     n->nTrans = 0;
     n->sizes = 0;
-    n->finalOutpu = 0;
-    return n;
+    n->finalOutput = 0;
+    return n;  // the empty final node has no encoded bytes to decode
+  }
+  uint8_t s = (slice->data[addr] & 0b11000000) >> 6;
+  if (s == 0b11) { // oneTransNext
+    n->data = fstSliceCopy(slice, 0, addr);
+    n->version = version;
+    n->state = OneTransNext;
+    n->start = addr;
+    n->end = addr; //? s.end_addr(data);
+    n->isFinal = false;
+    n->sizes = 0;
+    n->nTrans = 1;  // this state always has exactly one transition
+    n->finalOutput = 0;
+  } else if (s == 0b10) { // oneTrans
+    uint64_t sz = 0; // TODO fetch sz from addr
+    n->data = fstSliceCopy(slice, 0, addr);
+    n->version = version;
+    n->state = OneTrans;
+    n->start = addr;
+    n->end = addr; // s.end_addr(data, sz);
+    n->isFinal = false;
+    n->nTrans = 1;
+    n->sizes = sz;
+    n->finalOutput = 0;
+  } else { // anyTrans
+    uint64_t sz = 0; // TODO s.sizes(data)
+    uint32_t nTrans = 0; // TODO s.ntrans(data)
+    n->data = *slice;
+    n->version = version;
+    n->state = AnyTrans;
+    n->start = addr;
+    n->end = addr; // s.end_addr(version, data, sz, ntrans);
+    n->isFinal = false; // s.is_final_state();
+    n->nTrans = nTrans;
+    n->sizes = sz;
+    n->finalOutput = 0; // s.final_output(version, data, sz, ntrans);
   }
-  uint8_t v = (data[addr] & 0b1100000) >> 6;
-  if (v == 0b11) {
-
-  } else if (v == 0b10) {
-
-  } else {
+  return n;
+}
+FstTransitions* fstNodeTransitions(FstNode *node) {  // heap-allocated iterator state over [0, nTrans); NULL on allocation failure
+  FstTransitions *t = malloc(sizeof(FstTransitions));
+  if (NULL == t) {
+    return NULL;
+  }
+  FstRange range = {.start = 0, .end = FST_NODE_LEN(node)};
+  t->node = node;
+  t->range = range;
+  return t;
+}
+bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) {  // stub: the per-state branches are still empty
+  bool s = true;
+  if (node->state == OneTransNext) {

-  }
-
+  } else if (node->state == OneTrans) {
+
+  } else if (node->state == AnyTrans) {
+
+  } else {
+    s = false;
+  }
+  return s;
+}
+// Stub: reports false only for the EmptyFinal state; *res is not written yet.
+bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) {
+  bool s = true;
+  if (node->state == OneTransNext) {
+
+  } else if (node->state == OneTrans) {
+
+  } else if (node->state == AnyTrans) {
+
+  } else if (node->state == EmptyFinal){
+    s = false;
+  }
+  return s;
+}
+// Look up the transition index for input byte b; on a hit *res is set and true is returned.
+bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) {
+  bool s = true;
+  uint8_t input = 0; // TODO s.input - placeholder until the node decoding is implemented
+  if (node->state == OneTransNext) {
+    if (b == input) { *res = 0; }
+    else { return false; }
+  } else if (node->state == OneTrans) {
+    if (b == input) { *res = 0; }
+    else { return false; }
+  } else if (node->state == AnyTrans) {
+    // TODO: AnyTrans lookup is not implemented; *res is left untouched
+  } else if (node->state == EmptyFinal) {
+    s = false;
+  }
+  return s;
+}
+// Stub: picks the encoding for builderNode from its transition count; the writers are still commented out.
+bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) {
+  size_t sz = taosArrayGetSize(builderNode->trans);
+  assert(sz < 256);
+  if (sz == 0 && builderNode->isFinal && builderNode->finalOutput == 0) {
+    return true;
+  } else if (sz != 1 || builderNode->isFinal) {
+    // AnyTrans->Compile(w, addr, node);
+  } else {
+    FstTransition *tran = taosArrayGet(builderNode->trans, 0);
+    if (tran->addr == lastAddr && tran->out == 0) {
+      //OneTransNext::compile(w, lastAddr, tran->inp);
+      return true;
+    } else {
+      //OneTrans::Compile(w, lastAddr, *tran);
+      return true;
+    }
+  }
+  return true;
+}
+
+
+// Allocate a zeroed builder that writes through `w` and owns a freshly created unfinished-node stack.
+// NOTE(review): `ty` is not used yet, and a failed fstUnFinishedNodesCreate() leaves b->unfinished NULL - confirm callers check.
+FstBuilder *fstBuilderCreate(void *w, FstType ty) {
+  FstBuilder *b = calloc(1, sizeof(FstBuilder));
+  if (NULL == b) { return b; }
+  b->lastAddr = NONE_ADDRESS;  // no node has been compiled yet
+  FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0};
+  b->wtr = wtr;
+  b->unfinished = fstUnFinishedNodesCreate();
+  return b;

 }
+FstSlice fstNodeAsSlice(FstNode *node) {  // view from the slice's `end` offset to its last byte
+  FstSlice *slice = &node->data;
+  FstSlice s = fstSliceCopy(slice, slice->end, slice->dLen - 1);
+  return s;
+}


-
diff --git a/source/libs/index/src/index_fst_automation.c b/source/libs/index/src/index_fst_automation.c
new file mode 100644
index 0000000000..f2f48bbc8a
--- /dev/null
+++ b/source/libs/index/src/index_fst_automation.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c index 39e5f89b35..4ab78cddc5 100644 --- a/source/libs/index/src/index_fst_common.c +++ b/source/libs/index/src/index_fst_common.c @@ -12,6 +12,8 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ + +#include "tutil.h" const uint8_t COMMON_INPUTS[] = { 84, // '\x00' 85, // '\x01' @@ -271,7 +273,7 @@ const uint8_t COMMON_INPUTS[] = { 255, // 'ÿ' }; -char const COMMON_INPUTS_INV[] = [ +char const COMMON_INPUTS_INV[] = { 't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w', '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=', ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6', @@ -300,5 +302,5 @@ char const COMMON_INPUTS_INV[] = [ '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', -]; +}; diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c new file mode 100644 index 0000000000..3d5efd30f3 --- /dev/null +++ b/source/libs/index/src/index_fst_node.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + diff --git a/source/libs/index/src/index_fst_registry.c b/source/libs/index/src/index_fst_registry.c new file mode 100644 index 0000000000..940c5863f4 --- /dev/null +++ b/source/libs/index/src/index_fst_registry.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "index_fst_registry.h" + diff --git a/source/libs/index/src/index_fst_util.c b/source/libs/index/src/index_fst_util.c new file mode 100644 index 0000000000..20751baf5f --- /dev/null +++ b/source/libs/index/src/index_fst_util.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */
+#include "index_fst_util.h"
+
+// A sentinel value used to indicate an empty final state.
+const CompiledAddr EMPTY_ADDRESS = 0;
+// A sentinel value used to indicate an invalid state.
+const CompiledAddr NONE_ADDRESS = 1;
+
+// This version number is written to every finite state transducer created by
+// this library. When a finite state transducer is read, its version number is
+// checked against this value.
+const uint64_t version = 3;
+// The threshold (in number of transitions) at which an index is created for
+// a node's transitions. This speeds up lookup time at the expense of FST size.
+const uint64_t TRANS_INDEX_THRESHOLD = 32;
+
+//uint8_t commonInput(uint8_t idx) {
+//  if (idx == 0) { return -1; }
+//  else {
+//    return COMMON_INPUTS_INV[idx - 1];
+//  }
+//}
+//
+//uint8_t commonIdx(uint8_t v, uint8_t max) {
+//  uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256;
+//  return v > max ? 0: v;
+//}
+
+// Returns the smallest number of bytes (1 to 8) that can hold n.
+uint8_t packSize(uint64_t n) {
+  uint8_t size = 1;
+  // Add a byte for as long as n still has bits above the current width.
+  while (size < 8 && (n >> (8 * size)) != 0) {
+    size++;
+  }
+  return size;
+}
+
+// Decodes sz bytes at ch, least significant byte first, into one integer.
+// A uint64_t holds at most 8 bytes, so any larger sz is capped at 8.
+uint64_t unpackUint64(uint8_t *ch, uint8_t sz) {
+  uint64_t n = 0;  // start from zero: the loop below only ORs bits in
+  if (sz > sizeof(n)) {
+    sz = (uint8_t)sizeof(n);
+  }
+  for (uint8_t i = 0; i < sz; i++) {
+    // Widen before shifting: ch[i] promotes to int, too narrow for 8 bytes.
+    n |= (uint64_t)ch[i] << (8 * i);
+  }
+  return n;
+}
+
+// Returns how many bytes are needed to store the delta nodeAddr - transAddr.
+// A transAddr equal to EMPTY_ADDRESS is sized as EMPTY_ADDRESS itself.
+uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) {
+  if (transAddr == EMPTY_ADDRESS) {
+    return packSize(EMPTY_ADDRESS);
+  }
+  return packSize(nodeAddr - transAddr);
+}
+
+// Reads a len-byte delta from data and returns nodeAddr - delta, or
+// EMPTY_ADDRESS when the stored delta equals EMPTY_ADDRESS.
+CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) {
+  // unpackUint64() takes unsigned bytes and a uint8_t count, so convert both
+  // explicitly; clamping first keeps a large len from truncating to a small one.
+  uint8_t  sz = (uint8_t)(len > sizeof(uint64_t) ? sizeof(uint64_t) : len);
+  uint64_t delta = unpackUint64((uint8_t *)data, sz);
+  if (delta == EMPTY_ADDRESS) {
+    return EMPTY_ADDRESS;
+  }
+  return nodeAddr - delta;
+}
+
+// fst slice func
+// Builds a slice over the dLen bytes at data, with start = 0 and end = dLen - 1.
+// NOTE(review): dLen - 1 wraps around when dLen == 0; confirm against the type
+// of FstSlice.end that an empty buffer gets the intended end value.
+FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) {
+  FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = dLen - 1};
+  return slice;
+}
+
+// Returns a slice sharing the same buffer with start and end set to the given
+// bounds. If either bound is >= dLen or start > end, the result instead has a
+// NULL data pointer and all other fields zero, which fstSliceEmpty() detects.
+FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) {
+  FstSlice t = {.data = NULL};  // unnamed fields are zero-initialized
+  if (start >= slice->dLen || end >= slice->dLen || start > end) {
+    return t;
+  }
+  t.data = slice->data;
+  t.dLen = slice->dLen;
+  t.start = start;
+  t.end = end;
+  return t;
+}
+
+// Returns true when the slice has no buffer or its dLen is not positive.
+bool fstSliceEmpty(FstSlice *slice) {
+  return slice->data == NULL || slice->dLen <= 0;
+}