From 0c8271e0c26eede3c63ea6fd2a9e53a49c277368 Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Wed, 30 Mar 2022 17:35:35 +0800
Subject: [PATCH 1/5] add fuzzy search

---
 source/libs/index/inc/indexFstDfa.h           | 72 ++++++++++++++
 source/libs/index/inc/indexFstRegex.h         | 74 ++++++++++++++
 .../inc/{indexSparse.h => indexFstSparse.h}   |  4 +-
 source/libs/index/src/indexFstDfa.c           | 96 +++++++++++++++++++
 source/libs/index/src/indexFstRegex.c         | 34 +++++++
 source/libs/index/src/indexSparse.c           |  2 +-
 6 files changed, 279 insertions(+), 3 deletions(-)
 create mode 100644 source/libs/index/inc/indexFstDfa.h
 create mode 100644 source/libs/index/inc/indexFstRegex.h
 rename source/libs/index/inc/{indexSparse.h => indexFstSparse.h} (94%)
 create mode 100644 source/libs/index/src/indexFstDfa.c
 create mode 100644 source/libs/index/src/indexFstRegex.c

diff --git a/source/libs/index/inc/indexFstDfa.h b/source/libs/index/inc/indexFstDfa.h
new file mode 100644
index 0000000000..72a265c123
--- /dev/null
+++ b/source/libs/index/inc/indexFstDfa.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __INDEX_FST_DFA_H__
+#define __INDEX_FST_DFA_H__
+
+#include "indexFstRegex.h"
+#include "indexFstSparse.h"
+#include "tarray.h"
+#include "thash.h"
+
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+typedef struct FstDfa FstDfa;
+
+typedef struct {
+  SArray * insts;      // instruction indices (uint32_t) that make up this DFA state
+  uint32_t next[256];  // next[b]: index of the state reached from this one on input byte b
+  bool     isMatch;    // true when the state contains a MATCH instruction
+} State;
+
+/*
+ * dfa builder related func
+ **/
+typedef struct FstDfaBuilder {
+  FstDfa *  dfa;    // DFA under construction; handed back by dfaBuilderBuild
+  SHashObj *cache;  // keyed by a state's instruction array, value is its index in dfa->states
+} FstDfaBuilder;
+
+FstDfaBuilder *dfaBuilderCreate(SArray *insts);
+
+FstDfa *dfaBuilderBuild(FstDfaBuilder *builder);
+
+bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t bytes,
+                        uint32_t *result);
+
+bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result);
+
+/*
+ * dfa related func
+ **/
+typedef struct FstDfa {
+  SArray *insts;   // compiled program: array of Inst
+  SArray *states;  // array of State
+} FstDfa;          // NOTE(review): FstDfa is also typedef'd above; a repeated typedef needs C11 -- confirm the build's C standard
+
+FstDfa *dfaCreate(SArray *insts, SArray *states);      // wraps both arrays; neither is copied
+bool    dfaIsMatch(FstDfa *dfa, uint32_t si);          // isMatch flag of state si
+bool    dfaAccept(FstDfa *dfa, uint32_t si, uint8_t byte, uint32_t *result);      // *result = next[byte] of state si
+void    dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip);                      // add ip to set, following JUMP/SPLIT
+bool    dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte);  // step from -> to on byte; true if from held a MATCH
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/source/libs/index/inc/indexFstRegex.h b/source/libs/index/inc/indexFstRegex.h
new file mode 100644
index 0000000000..50b9cae7ff
--- /dev/null
+++ b/source/libs/index/inc/indexFstRegex.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _TD_INDEX_FST_REGEX_H_
+#define _TD_INDEX_FST_REGEX_H_
+
+//#include "indexFstDfa.h"
+#include "taos.h"
+#include "tarray.h"
+#include "tchecksum.h"
+#include "thash.h"
+#include "tlog.h"
+#include "tutil.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum { MATCH, JUMP, SPLIT, RANGE } InstType;  // opcode stored in Inst.ty
+
+typedef struct MatchValue {
+} MatchValue;  // MATCH carries no payload; NOTE(review): an empty struct is a compiler extension in C -- confirm all targets accept it
+typedef struct JumpValue {
+  uint32_t step;  // instruction index that dfaAdd continues at for a JUMP
+} JumpValue;
+
+typedef struct SplitValue {
+  uint32_t len1;  // first instruction index followed for a SPLIT
+  uint32_t len2;  // second instruction index followed for a SPLIT
+} SplitValue;
+
+typedef struct RangeValue {
+  uint8_t start;  // lowest byte accepted (inclusive)
+  uint8_t end;    // highest byte accepted (inclusive)
+} RangeValue;
+
+typedef struct {
+  InstType ty;  // tells which union member below is read
+  union {
+    MatchValue mv;
+    JumpValue  jv;
+    SplitValue sv;
+    RangeValue rv;
+  };
+} Inst;
+
+typedef struct {
+  char *orig;  // copy of the pattern text made by regexCreate
+  void *dfa;   // NOTE(review): presumably an FstDfa*, kept untyped because indexFstDfa.h includes this header -- confirm
+} FstRegex;
+
+FstRegex *regexCreate(const char *str);
+
+void regexSetup(FstRegex *regex, uint32_t size, const char *str);
+
+// uint32_t regexStart()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/source/libs/index/inc/indexSparse.h b/source/libs/index/inc/indexFstSparse.h
similarity index 94%
rename from source/libs/index/inc/indexSparse.h
rename to source/libs/index/inc/indexFstSparse.h
index 8035f6e08d..69b33c82d9 100644
--- a/source/libs/index/inc/indexSparse.h
+++ b/source/libs/index/inc/indexFstSparse.h
@@ -13,8 +13,8 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef _TD_INDEX_SPARSE_H_
-#define _TD_INDEX_SPARSE_H_
+#ifndef _TD_INDEX_FST_SPARSE_H_
+#define _TD_INDEX_FST_SPARSE_H_
 
 #include "tarray.h"
 
diff --git a/source/libs/index/src/indexFstDfa.c b/source/libs/index/src/indexFstDfa.c
new file mode 100644
index 0000000000..765c5f08eb
--- /dev/null
+++ b/source/libs/index/src/indexFstDfa.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "indexFstDfa.h"
+#include "thash.h"
+
+static int dfaInstsEqual(const void *a, const void *b, size_t size) {
+  SArray *ar = (SArray *)a;
+  SArray *br = (SArray *)b;
+  size_t  al = ar != NULL ? taosArrayGetSize(ar) : 0;  // a NULL array counts as empty
+  size_t  bl = br != NULL ? taosArrayGetSize(br) : 0;
+  if (al != bl) {
+    return -1;  // different lengths
+  }
+  for (int i = 0; i < al; i++) {
+    uint32_t v1 = *(uint32_t *)taosArrayGet(ar, i);  // elements are compared as uint32_t
+    uint32_t v2 = *(uint32_t *)taosArrayGet(br, i);
+    if (v1 != v2) {
+      return -1;
+    }
+  }
+  return 0;  // same length and same elements
+}
+FstDfaBuilder *dfaBuilderCreate(SArray *insts) {
+  FstDfaBuilder *builder = taosMemoryCalloc(1, sizeof(FstDfaBuilder));
+  if (builder == NULL) {
+    return NULL;
+  }
+
+  SArray *states = taosArrayInit(4, sizeof(State));  // NOTE(review): result is not checked before use
+
+  builder->dfa = dfaCreate(insts, states);  // NOTE(review): dfaCreate can return NULL; not checked here
+  builder->cache = taosHashInit(
+      4, taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT),
+      false, HASH_NO_LOCK);  // hash type follows the pointer width; presumably keys are pointer-sized -- confirm
+  taosHashSetEqualFp(builder->cache, dfaInstsEqual);
+  return builder;
+}
+
+FstDfa *dfaBuilderBuild(FstDfaBuilder *builder) {
+  uint32_t      sz = taosArrayGetSize(builder->dfa->insts);
+  FstSparseSet *cur = sparSetCreate(sz);
+  FstSparseSet *nxt = sparSetCreate(sz);
+
+  dfaAdd(builder->dfa, cur, 0);
+}
+
+bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t bytes,
+                        uint32_t *result) {
+  // impl run state
+  return true;
+}
+
+bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result) {
+  // impl cache state
+  return true;
+}
+
+FstDfa *dfaCreate(SArray *insts, SArray *states) {
+  FstDfa *dfa = taosMemoryCalloc(1, sizeof(FstDfa));
+  if (dfa == NULL) {
+    return NULL;
+  }
+
+  dfa->insts = insts;  // both arrays are stored by pointer, not copied
+  dfa->states = states;
+  return dfa;
+}
+bool dfaIsMatch(FstDfa *dfa, uint32_t si) {
+  // impl match
+  return true;
+}
+bool dfaAccept(FstDfa *dfa, uint32_t si, uint8_t byte, uint32_t *result) {
+  // impl accept
+  return true;
+}
+void dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip) {
+  // impl add
+  return;
+}
+bool dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte) {
+  // impl run
+  return true;
+}
diff --git a/source/libs/index/src/indexFstRegex.c b/source/libs/index/src/indexFstRegex.c
new file mode 100644
index 0000000000..ec41a7f58e
--- /dev/null
+++ b/source/libs/index/src/indexFstRegex.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "indexFstRegex.h"
+#include "indexFstSparse.h"
+
+FstRegex *regexCreate(const char *str) {
+  FstRegex *regex = taosMemoryCalloc(1, sizeof(FstRegex));
+  if (regex == NULL) {
+    return NULL;
+  }
+  regex->orig = taosMemoryCalloc(1, strlen(str) + 1);  // zero-filled with one extra byte, so the copy is NUL-terminated
+  if (regex->orig != NULL) {
+    memcpy(regex->orig, str, strlen(str));
+  }
+  return regex;  // orig stays NULL if the copy could not be allocated
+}
+
+void regexSetup(FstRegex *regex, uint32_t size, const char *str) {
+  // return
+  // return;
+}
diff --git a/source/libs/index/src/indexSparse.c b/source/libs/index/src/indexSparse.c
index 8bcf04602f..9d228e71ff 100644
--- a/source/libs/index/src/indexSparse.c
+++ b/source/libs/index/src/indexSparse.c
@@ -13,7 +13,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "indexSparse.h"
+#include "indexFstSparse.h"
 
 FstSparseSet *sparSetCreate(int32_t sz) {
   FstSparseSet *ss = taosMemoryCalloc(1, sizeof(FstSparseSet));

From a915647df897699fe3d266278eb140844fecf649 Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Wed, 30 Mar 2022 19:27:41 +0800
Subject: [PATCH 2/5] add fuzzy search

---
 source/libs/index/inc/indexFstDfa.h |  2 +
 source/libs/index/src/indexFstDfa.c | 61 +++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/source/libs/index/inc/indexFstDfa.h b/source/libs/index/inc/indexFstDfa.h
index 72a265c123..f6c220bcb7 100644
--- a/source/libs/index/inc/indexFstDfa.h
+++ b/source/libs/index/inc/indexFstDfa.h
@@ -44,6 +44,8 @@ typedef struct FstDfaBuilder {
 
 FstDfaBuilder *dfaBuilderCreate(SArray *insts);
 
+void dfaBuilderDestroy(FstDfaBuilder *builder);
+
 FstDfa *dfaBuilderBuild(FstDfaBuilder *builder);
 
 bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t bytes,
diff --git a/source/libs/index/src/indexFstDfa.c b/source/libs/index/src/indexFstDfa.c
index 765c5f08eb..b390c31343 100644
--- a/source/libs/index/src/indexFstDfa.c
+++ b/source/libs/index/src/indexFstDfa.c
@@ -16,6 +16,8 @@
 #include "indexFstDfa.h"
 #include "thash.h"
 
+static const uint32_t STATE_LIMIT = 1000;
+
 static int dfaInstsEqual(const void *a, const void *b, size_t size) {
   SArray *ar = (SArray *)a;
   SArray *br = (SArray *)b;
@@ -48,6 +50,18 @@ FstDfaBuilder *dfaBuilderCreate(SArray *insts) {
   taosHashSetEqualFp(builder->cache, dfaInstsEqual);
   return builder;
 }
+void dfaBuilderDestroy(FstDfaBuilder *builder) {
+  if (builder == NULL) {
+    return;
+  }
+  void *pIter = builder->cache != NULL ? taosHashIterate(builder->cache, NULL) : NULL;
+  while (pIter) {
+    SArray **key = pIter;  // NOTE(review): reads the iterator as a pointer to the key array -- confirm taosHashIterate yields keys, not values
+    taosArrayDestroy(*key);
+    pIter = taosHashIterate(builder->cache, pIter);
+  }
+  taosHashCleanup(builder->cache);  // NOTE(review): builder itself is not freed here -- confirm the caller releases it
+}
 
 FstDfa *dfaBuilderBuild(FstDfaBuilder *builder) {
   uint32_t      sz = taosArrayGetSize(builder->dfa->insts);
@@ -55,12 +69,53 @@ FstDfa *dfaBuilderBuild(FstDfaBuilder *builder) {
   FstSparseSet *nxt = sparSetCreate(sz);
 
   dfaAdd(builder->dfa, cur, 0);
+
+  SArray * states = taosArrayInit(0, sizeof(uint32_t));  // work list of state indices still to expand
+  uint32_t result;
+  if (dfaBuilderCachedState(builder, cur, &result)) {
+    taosArrayPush(states, &result);
+  }
+  SHashObj *seen = taosHashInit(12, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_NO_LOCK);
+  while (taosArrayGetSize(states) != 0) {
+    result = *(uint32_t *)taosArrayPop(states);
+    for (int i = 0; i < 256; i++) {  // try every possible input byte from this state
+      uint32_t ns, dummy = 0;
+      if (dfaBuilderRunState(builder, cur, nxt, result, i, &ns)) {
+        if (taosHashGet(seen, &ns, sizeof(ns)) == NULL) {
+          taosHashPut(seen, &ns, sizeof(ns), &dummy, sizeof(dummy));
+          taosArrayPush(states, &ns);
+        }
+      }
+      if (taosArrayGetSize(builder->dfa->states) > STATE_LIMIT) {
+        // TODO: too many states -- the limit is detected here but not enforced yet
+        //
+      }
+    }
+  }
+  taosArrayDestroy(states);
+  taosHashCleanup(seen);  // NOTE(review): cur and nxt are never released in this function
+  return builder->dfa;
 }
 
-bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t bytes,
+bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t byte,
                         uint32_t *result) {
-  // impl run state
-  return true;
+  sparSetClear(cur);
+  State *t = taosArrayGet(builder->dfa->states, state);
+  for (int i = 0; i < taosArrayGetSize(t->insts); i++) {
+    uint32_t ip = *(uint32_t *)taosArrayGet(t->insts, i);
+    sparSetAdd(cur, ip);
+  }
+  dfaRun(builder->dfa, cur, next, byte);
+
+  uint32_t nxtState;
+  if (dfaBuilderCachedState(builder, next, &nxtState)) {
+    // dfaBuilderCachedState may push a new State, so look the source state up again before writing to it
+    t = taosArrayGet(builder->dfa->states, state);
+    t->next[byte] = nxtState;
+    *result = nxtState;
+    return true;
+  }
+  return false;
 }
 
 bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result) {

From 27280fe2921e7bcfd982a215fd357f804d83c267 Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Wed, 30 Mar 2022 21:36:39 +0800
Subject: [PATCH 3/5] add fuzzy search

---
 source/libs/index/src/indexFstDfa.c | 33 ++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/source/libs/index/src/indexFstDfa.c b/source/libs/index/src/indexFstDfa.c
index b390c31343..e704144974 100644
--- a/source/libs/index/src/indexFstDfa.c
+++ b/source/libs/index/src/indexFstDfa.c
@@ -119,7 +119,38 @@ bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet
 }
 
 bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result) {
-  // impl cache state
+  SArray *tinsts = taosArrayInit(4, sizeof(uint32_t));
+  bool    isMatch = false;
+
+  for (int i = 0; i < sparSetLen(set); i++) {
+    uint32_t ip = sparSetGet(set, i);
+    Inst *inst = taosArrayGet(builder->dfa->insts, ip);
+    if (inst->ty == JUMP || inst->ty == SPLIT) {
+      continue;
+    } else if (inst->ty == RANGE) {
+      taosArrayPush(tinsts, &ip);
+    } else if (inst->ty == MATCH) {
+      isMatch = true;
+      taosArrayPush(tinsts, &ip);
+    }
+  }
+  if (taosArrayGetSize(tinsts) == 0) {
+    taosArrayDestroy(tinsts);  // nothing was stored, so the scratch array must be released here
+    return false;
+  }
+  uint32_t *v = taosHashGet(builder->cache, &tinsts, sizeof(tinsts));  // NOTE(review): key is the pointer value of a fresh array -- confirm this can ever hit
+  if (v != NULL) {
+    *result = *v;
+    taosArrayDestroy(tinsts);
+  } else {
+    State st = {0};  // zero next[] so transitions that are never set are not stack garbage
+    st.insts = tinsts;
+    st.isMatch = isMatch;
+    taosArrayPush(builder->dfa->states, &st);
+    int32_t sz = taosArrayGetSize(builder->dfa->states) - 1;  // index of the state just pushed
+    taosHashPut(builder->cache, &tinsts, sizeof(tinsts), &sz, sizeof(sz));
+    *result = sz;
+  }
   return true;
 }
 

From baf28eced5c000057c3048a2d97b4cb386ff907c Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Wed, 30 Mar 2022 22:04:10 +0800
Subject: [PATCH 4/5] add fuzzy search

---
 source/libs/index/src/indexFstDfa.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/source/libs/index/src/indexFstDfa.c b/source/libs/index/src/indexFstDfa.c
index e704144974..b4ac9f3a99 100644
--- a/source/libs/index/src/indexFstDfa.c
+++ b/source/libs/index/src/indexFstDfa.c
@@ -165,15 +165,34 @@ FstDfa *dfaCreate(SArray *insts, SArray *states) {
   return dfa;
 }
 bool dfaIsMatch(FstDfa *dfa, uint32_t si) {
-  // impl match
-  return true;
+  if (dfa->states == NULL || si >= taosArrayGetSize(dfa->states)) {  // reject indices past the last state
+    return false;
+  }
+  State *st = taosArrayGet(dfa->states, si);
+  return st != NULL ? st->isMatch : false;
 }
 bool dfaAccept(FstDfa *dfa, uint32_t si, uint8_t byte, uint32_t *result) {
-  // impl accept
+  if (dfa->states == NULL || si >= taosArrayGetSize(dfa->states)) {  // reject indices past the last state
+    return false;
+  }
+  State *st = taosArrayGet(dfa->states, si);
+  *result = st->next[byte];
   return true;
 }
 void dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip) {
-  // impl add
+  if (sparSetContains(set, ip)) {  // already added: stops the recursion below from revisiting ip
+    return;
+  }
+  sparSetAdd(set, ip);
+  Inst *inst = taosArrayGet(dfa->insts, ip);
+  if (inst->ty == MATCH || inst->ty == RANGE) {
+    // do nothing
+  } else if (inst->ty == JUMP) {
+    dfaAdd(dfa, set, inst->jv.step);
+  } else if (inst->ty == SPLIT) {
+    dfaAdd(dfa, set, inst->sv.len1);
+    dfaAdd(dfa, set, inst->sv.len2);
+  }
   return;
 }
 bool dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte) {

From f2c9f40dffca587d89bc321d8ff53ce48fff71fe Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Thu, 31 Mar 2022 10:16:04 +0800
Subject: [PATCH 5/5] add fuzzy search

---
 source/libs/index/src/indexFstDfa.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/source/libs/index/src/indexFstDfa.c b/source/libs/index/src/indexFstDfa.c
index b4ac9f3a99..ff6b154c54 100644
--- a/source/libs/index/src/indexFstDfa.c
+++ b/source/libs/index/src/indexFstDfa.c
@@ -193,9 +193,26 @@ void dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip) {
     dfaAdd(dfa, set, inst->sv.len1);
     dfaAdd(dfa, set, inst->sv.len2);
   }
+
   return;
 }
 bool dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte) {
-  // impl run
-  return true;
+  bool isMatch = false;
+  sparSetClear(to);
+  for (int i = 0; i < sparSetLen(from); i++) {
+    uint32_t ip = sparSetGet(from, i);
+
+    Inst *inst = taosArrayGet(dfa->insts, ip);
+    if (inst->ty == JUMP || inst->ty == SPLIT) {
+      continue;
+    } else if (inst->ty == MATCH) {
+      isMatch = true;
+    } else if (inst->ty == RANGE) {
+      if (inst->rv.start <= byte && byte <= inst->rv.end) {  // byte falls inside the inclusive range
+        dfaAdd(dfa, to, ip + 1);                             // continue with the instruction after the RANGE
+      }
+    }
+  }
+
+  return isMatch;  // true when `from` contained a MATCH instruction
 }