diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h
index de4c957e29..61c857ed74 100644
--- a/source/libs/index/inc/index_fst.h
+++ b/source/libs/index/inc/index_fst.h
@@ -13,58 +13,73 @@
  * along with this program. If not, see .
  */
 
-#ifndef _INDEX_FST_H_
-#define _INDEX_FST_H_
-#include "index_fst.h"
+#ifndef __INDEX_FST_H__
+#define __INDEX_FST_H__
+
+
 #include "tarray.h"
-
-typedef FstType uint64_t;
-typedef CompiledAddr uint64_t;
-typedef Output uint64_t;
-typedef PackSizes uint8_t;
+#include "index_fst_util.h"
+#include "index_fst_registry.h"
 
-//A sentinel value used to indicate an empty final state
-const CompileAddr EMPTY_ADDRESS = 0;
-/// A sentinel value used to indicate an invalid state.
-const CompileAddr NONE_ADDRESS = 1;
+typedef struct FstNode FstNode;
+#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a)) // smaller of the two outputs
 
-// This version number is written to every finite state transducer created by
-// this crate. When a finite state transducer is read, its version number is
-// checked against this value.
-const uint64_t version = 3;
-// The threshold (in number of transitions) at which an index is created for
-// a node's transitions. This speeds up lookup time at the expense of FST size
-
-const uint64_t TRANS_INDEX_THRESHOLD = 32;
 
 typedef struct FstRange {
   uint64_t start;
   uint64_t end;
 } FstRange;
 
-enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal};
-enum FstBound { Included, Excluded, Unbounded};
 
-typedef struct CheckSummer {
-  uint32_t sum;
-};
+typedef struct FstBuilderNode {
+  bool isFinal;
+  Output finalOutput;
+  SArray *trans; // elements are FstTransition
+} FstBuilderNode;
+
+typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State;
+typedef enum { Included, Excluded, Unbounded} FstBound;
+
+typedef uint32_t CheckSummer;
 
 
-typedef struct FstBuilder {
-  FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
-  FstUnFinishedNodes unfinished // The stack of unfinished nodes
-  Registry registry // A map of finished nodes.
-  SArray* last // The last word added
-  CompiledAddr lastAddr // The address of the last compiled node
-  uint64_t len // num of keys added
-} FstBuilder;
+/*
+ *
+ * UnFinished node and helper function
+ * TODO: simple function name
+ */
+typedef struct FstUnFinishedNodes {
+  SArray *stack; // elements are FstBuilderNodeUnfinished
+} FstUnFinishedNodes;
+
+#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize((nodes)->stack)
+
+FstUnFinishedNodes *fstUnFinishedNodesCreate();
+void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal);
+FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes);
+FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr);
+FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes);
+void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out);
+void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr);
+void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out);
+uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs);
+uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out);
 
 typedef struct FstCountingWriter {
   void* wtr; // wrap any writer that counts and checksum bytes written
   uint64_t count;
   CheckSummer summer;
-};
+} FstCountingWriter;
+
+typedef struct FstBuilder {
+  FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
+  FstUnFinishedNodes *unfinished; // The stack of unfinished nodes
+  FstRegistry registry; // A map of finished nodes.
+  SArray* last; // The last word added
+  CompiledAddr lastAddr; // The address of the last compiled node
+  uint64_t len; // num of keys added
+} FstBuilder;
 
 
 
@@ -80,16 +95,6 @@ typedef struct FstTransitions {
   FstRange range;
 } FstTransitions;
 
-typedef struct FstUnFinishedNodes {
-  SArray *stack; //
-} FstUnFinishedNodes;
-
-typedef struct FstBuilderNode {
-  bool isFinal;
-  Output finalOutput;
-  SArray *trans; //
-} FstBuilderNode;
-
 
 
 typedef struct FstLastTransition {
@@ -97,13 +102,23 @@ typedef struct FstLastTransition {
   Output out;
 } FstLastTransition;
 
+/*
+ * FstBuilderNodeUnfinished and helper function
+ * TODO: simple function name
+ */
 typedef struct FstBuilderNodeUnfinished {
-  FstBuilderNode node;
-  FstLastTransition last;
+  FstBuilderNode *node;
+  FstLastTransition* last;
 } FstBuilderNodeUnfinished;
 
+void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr);
+void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, Output out);
+
+/*
+ * FstNode and helper function
+ */
 typedef struct FstNode {
-  uint8_t* data;
+  FstSlice data;
   uint64_t version;
   State state;
   CompiledAddr start;
@@ -114,6 +129,28 @@ typedef struct FstNode {
   Output finalOutput;
 } FstNode;
 
+// If this node is final and has a terminal output value, then it is returned. Otherwise, a zero output is returned.
+#define FST_NODE_FINAL_OUTPUT(node) ((node)->finalOutput)
+// Returns true if and only if this node corresponds to a final or "match" state in the finite state transducer.
+#define FST_NODE_IS_FINAL(node) ((node)->isFinal)
+// Returns the number of transitions in this node. The maximum number of transitions is 256.
+#define FST_NODE_LEN(node) ((node)->nTrans)
+// Returns true if and only if this node has zero transitions.
+#define FST_NODE_IS_EMPTYE(node) ((node)->nTrans == 0)
+// Returns the address of this node.
+#define FST_NODE_ADDR(node) ((node)->start)
+
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data);
+FstTransitions fstNodeTransitionIter(FstNode *node);
+FstTransitions* fstNodeTransitions(FstNode *node);
+bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res);
+bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res);
+bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res);
+bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
+FstSlice fstNodeAsSlice(FstNode *node);
+
+FstBuilder *fstBuilderCreate(void *w, FstType ty);
+
 typedef struct FstMeta {
   uint64_t version;
   CompiledAddr rootAddr;
@@ -125,42 +162,21 @@ typedef struct FstMeta {
 typedef struct Fst {
   FstMeta meta;
   void *data; //
-};
+} Fst;
 
-// ops
+// ops
 typedef struct FstIndexedValue {
   uint64_t index;
   uint64_t value;
-};
+} FstIndexedValue;
 
-// relate to Regist
-typedef struct FstRegistry {
-  SArray *table; //
-  uint64_t tableSize; // num of rows
-  uint64_t mruSize; // num of columns
-} FstRegistry;
-
-typedef struct FstRegistryCache {
-  SArray *cells; //
-} FstRegistryCache;
 
 typedef struct FstRegistryCell {
   CompiledAddr addr;
   FstBuilderNode *node;
 } FstRegistryCell;
 
-enum FstRegistryEntry {Found, NotFound, Rejected};
-
-FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data);
-FstTransitions fstNodeTransitionIter(FstNode *node);
-FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i);
-CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i);
-int64_t fstNodeFindInput(FstNode *node, int8_t b);
-Output fstNodeGetFinalOutput(FstNode *node);
-void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode);
-
-
 
 #endif
diff --git a/source/libs/index/inc/index_fst_automation.h b/source/libs/index/inc/index_fst_automation.h
new file mode 100644
index 0000000000..7ad9a500cc
--- /dev/null
+++ b/source/libs/index/inc/index_fst_automation.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+#ifndef __INDEX_FST_AUTOMATION_H__
+#define __INDEX_FST_AUTOMATION_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct AutomationCtx AutomationCtx;
+
+typedef struct StartWith {
+  AutomationCtx *autoSelf;
+} StartWith;
+
+typedef struct Complement {
+  AutomationCtx *autoSelf;
+} Complement;
+
+// automation
+struct AutomationCtx {
+  void *data;
+};
+
+// automation interface
+// NOTE(review): these are global function-pointer objects defined in a header; confirm whether `extern` plus one definition is intended.
+void (*start)(AutomationCtx *ctx);
+bool (*isMatch)(AutomationCtx *ctx);
+bool (*canMatch)(AutomationCtx *ctx, void *data);
+bool (*willAlwaysMatch)(AutomationCtx *ctx, void *state);
+void* (*accpet)(AutomationCtx *ctx, void *state, uint8_t byte);
+void* (*accpetEof)(AutomationCtx *ctx, void *state);
+
+
+#endif
diff --git a/source/libs/index/inc/index_fst_node.h b/source/libs/index/inc/index_fst_node.h
new file mode 100644
index 0000000000..ba2d2ccd02
--- /dev/null
+++ b/source/libs/index/inc/index_fst_node.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+#ifndef __INDEX_FST_NODE_H__
+#define __INDEX_FST_NODE_H__
+
+
+
+
+#endif
diff --git a/source/libs/index/inc/index_fst_registry.h b/source/libs/index/inc/index_fst_registry.h
new file mode 100644
index 0000000000..6dcb236f29
--- /dev/null
+++ b/source/libs/index/inc/index_fst_registry.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+#ifndef __FST_REGISTRY_H__
+#define __FST_REGISTRY_H__
+
+#include "index_fst_util.h"
+
+// Registry of finished nodes. A struct with no members is not valid ISO C, so the fields are kept here.
+typedef struct FstRegistry {
+  SArray *table;      // presumably FstRegistryCell entries (TODO confirm)
+  uint64_t tableSize; // num of rows
+  uint64_t mruSize;   // num of columns
+} FstRegistry;
+#endif
diff --git a/source/libs/index/inc/index_fst_util.h b/source/libs/index/inc/index_fst_util.h
new file mode 100644
index 0000000000..fc7dd44637
--- /dev/null
+++ b/source/libs/index/inc/index_fst_util.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+
+#ifndef __INDEX_FST_UTIL_H__
+#define __INDEX_FST_UTIL_H__
+
+#include "tarray.h"
+
+
+typedef uint64_t FstType;
+typedef uint64_t CompiledAddr;
+typedef uint64_t Output;
+typedef uint8_t PackSizes;
+
+
+//A sentinel value used to indicate an empty final state
+extern const CompiledAddr EMPTY_ADDRESS;
+/// A sentinel value used to indicate an invalid state.
+extern const CompiledAddr NONE_ADDRESS;
+
+// This version number is written to every finite state transducer created by
+// this library. When a finite state transducer is read, its version number is
+// checked against this value.
+extern const uint64_t version;
+// The threshold (in number of transitions) at which an index is created for
+// a node's transitions. This speeds up lookup time at the expense of FST size
+
+extern const uint64_t TRANS_INDEX_THRESHOLD;
+// high 4 bits is transition address packed size.
+// low 4 bits is output value packed size.
+//
+// `0` is a legal value which means there are no transitions/outputs
+
+#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do { (v) = ((v) & 0b00001111) | ((sz) << 4); } while(0)
+#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0b11110000) >> 4)
+#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { (v) = ((v) & 0b11110000) | (sz); } while(0)
+#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0b00001111)
+
+#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1]
+
+#define COMMON_INDEX(v, max, val) do { \
+  (val) = ((uint16_t)COMMON_INPUTS[(v)] + 1)%256; \
+  (val) = (val) > (max) ? 0: (val); \
+} while(0)
+
+
+//uint8_t commonInput(uint8_t idx);
+//uint8_t commonIdx(uint8_t v, uint8_t max);
+
+uint8_t packSize(uint64_t n);
+uint64_t unpackUint64(uint8_t *ch, uint8_t sz);
+uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr);
+CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr);
+
+
+
+typedef struct FstSlice {
+  uint8_t *data;
+  uint64_t dLen;
+  uint32_t start;
+  uint32_t end;
+} FstSlice;
+
+FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end);
+FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen);
+bool fstSliceEmpty(FstSlice *slice);
+
+
+#endif
diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c
index 4c6e20a7d5..2974e7f9b5 100644
--- a/source/libs/index/src/index_fst.c
+++ b/source/libs/index/src/index_fst.c
@@ -15,13 +15,160 @@
 
 #include "index_fst.h"
 
+
+// Create the stack of unfinished nodes, seeded with one empty, non-final root node.
+FstUnFinishedNodes *fstUnFinishedNodesCreate() {
+  FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes));
+  if (nodes == NULL) { return NULL; }
+
+  nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished));
+  if (nodes->stack == NULL) { free(nodes); return NULL; }
+  fstUnFinishedNodesPushEmpty(nodes, false);
+  return nodes;
+}
+// Push a node that has no transitions and no pending (last) transition.
+void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) {
+  FstBuilderNode *node = malloc(sizeof(FstBuilderNode));
+  node->isFinal = isFinal;
+  node->finalOutput = 0;
+  node->trans = NULL;
+
+  FstBuilderNodeUnfinished un = {.node = node, .last = NULL};
+  taosArrayPush(nodes->stack, &un);
+
+}
+// Pop the only remaining node (the root); it must not have a pending transition.
+FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) {
+  assert(taosArrayGetSize(nodes->stack) == 1);
+
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  assert(un->last == NULL);
+  return un->node;
+}
+
+// Pop the top node after resolving its pending transition to `addr`; the pending transition is freed.
+FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  fstBuilderNodeUnfinishedLastCompiled(un, addr);
+  free(un->last); // TODO add func FstLastTransitionFree()
+  return un->node;
+}
+
+// Pop the top node, which must not have a pending transition.
+FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) {
+  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
+  assert(un->last == NULL);
+  return un->node;
+
+}
+// Mark the root (stack slot 0) as final and give it `out` as its final output.
+void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) {
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, 0);
+  un->node->isFinal = true;
+  un->node->finalOutput = out;
+  //un->node->trans = NULL;
+}
+// Resolve the pending transition of the top node to `addr` without popping it.
+void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
+  size_t sz = taosArrayGetSize(nodes->stack) - 1;
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz);
+  fstBuilderNodeUnfinishedLastCompiled(un, addr);
+}
+// Append one pending transition per byte of `bs`: the first byte hangs off the current top node
+// and carries `out`; every further byte gets a fresh node with zero output, then a final empty node.
+void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) {
+  FstSlice *s = &bs;
+  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
+    return;
+  }
+  size_t sz = taosArrayGetSize(nodes->stack) - 1;
+  FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz);
+  assert(un->last == NULL);
+
+
+  FstLastTransition *trn = malloc(sizeof(FstLastTransition));
+  trn->inp = s->data[s->start];
+  trn->out = out;
+
+  un->last = trn;
+
+  for (uint64_t i = (uint64_t)s->start + 1; i <= s->end; i++) {
+    FstBuilderNode *n = malloc(sizeof(FstBuilderNode));
+    n->isFinal = false;
+    n->finalOutput = 0;
+    n->trans = NULL;
+
+    FstLastTransition *next = malloc(sizeof(FstLastTransition));
+    next->inp = s->data[i];
+    next->out = 0;
+
+    FstBuilderNodeUnfinished item = {.node = n, .last = next};
+    taosArrayPush(nodes->stack, &item);
+  }
+  fstUnFinishedNodesPushEmpty(nodes, true);
+}
+
+
+// Count how many leading bytes of `bs` match the pending transitions, walking the stack from the root.
+uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) {
+  FstSlice *s = &bs;
+
+  size_t lsz = (size_t)(s->end - s->start + 1); // data len
+  size_t ssz = taosArrayGetSize(node->stack); // stack size
+
+  uint64_t count = 0;
+  for (size_t i = 0; i < ssz && i < lsz; i++) {
+    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
+    if (un->last != NULL && un->last->inp == s->data[s->start + i]) {
+      count++;
+    } else {
+      break;
+    }
+  }
+  return count;
+}
+// Like fstUnFinishedNodesFindCommPrefix, but also splits each matched transition's output against `in`;
+// what is left of `in` after the shared parts are removed is stored in `*out`.
+uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) {
+  FstSlice *s = &bs;
+
+  size_t lsz = (size_t)(s->end - s->start + 1); // data len
+  size_t ssz = taosArrayGetSize(node->stack); // stack size
+
+  uint64_t res = 0;
+  for (size_t i = 0; i < lsz && i < ssz; i++) {
+    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
+
+    FstLastTransition *last = un->last;
+    if (last == NULL || last->inp != s->data[s->start + i]) {
+      break;
+    }
+    res++;
+    // the common prefix of two numeric outputs is the smaller of the two
+    uint64_t commPrefix = (last->out < in) ? last->out : in;
+    uint64_t addPrefix = last->out - commPrefix;
+    in = in - commPrefix;
+    last->out = commPrefix;
+    if (addPrefix != 0 && i + 1 < ssz) {
+      // the part this transition no longer carries moves to the node it leads to
+      FstBuilderNodeUnfinished *next = taosArrayGet(node->stack, i + 1);
+      fstBuilderNodeUnfinishedAddOutputPrefix(next, addPrefix);
+    }
+  }
+  if (out != NULL) { *out = in; }
+  return res;
+}
+
 // fst node function
-FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
+
+
+// Build the in-memory view of the node stored at `addr` in `slice`; EMPTY_ADDRESS needs no data.
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) {
   FstNode *n = (FstNode *)malloc(sizeof(FstNode));
   if (n == NULL) { return NULL; }
   if (addr == EMPTY_ADDRESS) {
-    n->date = NULL;
+    n->data = fstSliceCreate(NULL, 0);
     n->version = version;
     n->state = EmptyFinal;
     n->start = EMPTY_ADDRESS;
@@ -29,20 +176,149 @@ FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
     n->isFinal = true;
     n->nTrans = 0;
     n->sizes = 0;
-    n->finalOutpu = 0;
-    return n;
+    n->finalOutput = 0;
+    return n;
+  }
+  if (slice == NULL || slice->data == NULL || addr >= slice->dLen) { free(n); return NULL; }
+  uint8_t v = slice->data[addr];
+  uint8_t s = (v & 0b11000000) >> 6;
+  if (s == 0b11) { // oneTransNext
+    n->data = fstSliceCopy(slice, 0, addr);
+    n->version = version;
+    n->state = OneTransNext;
+    n->start = addr;
+    n->end = addr; //? s.end_addr(data);
+    n->isFinal = false;
+    n->sizes = 0;
+    n->nTrans = 0;
+    n->finalOutput = 0;
+  } else if (s == 0b10) { // oneTrans
+    uint64_t sz = 0; // TODO: fetch sz from addr
+    n->data = fstSliceCopy(slice, 0, addr);
+    n->version = version;
+    n->state = OneTrans;
+    n->start = addr;
+    n->end = addr; // s.end_addr(data, sz);
+    n->isFinal = false;
+    n->nTrans = 1;
+    n->sizes = sz;
+    n->finalOutput = 0;
+  } else { // anyTrans
+    uint64_t sz = 0; // TODO: s.sizes(data)
+    uint32_t nTrans = 0; // TODO: s.ntrans(data)
+    n->data = *slice;
+    n->version = version;
+    n->state = AnyTrans;
+    n->start = addr;
+    n->end = addr; // s.end_addr(version, data, sz, ntrans);
+    n->isFinal = false; // s.is_final_state();
+    n->nTrans = nTrans;
+    n->sizes = sz;
+    n->finalOutput = 0; // s.final_output(version, data, sz, ntrans);
   }
-  uint8_t v = (data[addr] & 0b1100000) >> 6;
-  if (v == 0b11) {
-
-  } else if (v == 0b10) {
-
-  } else {
+  return n;
+}
+// Allocate an iterator over all transitions of `node`; the caller owns the returned object.
+FstTransitions* fstNodeTransitions(FstNode *node) {
+  FstTransitions *t = malloc(sizeof(FstTransitions));
+  if (NULL == t) {
+    return NULL;
+  }
+  FstRange range = {.start = 0, .end = FST_NODE_LEN(node)};
+  t->node = node;
+  t->range = range;
+  return t;
+}
+// Stub: returns false only for states without transitions; `res` is not filled in yet.
+bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) {
+  bool s = true;
+  if (node->state == OneTransNext) {
 
-  }
-
+  } else if (node->state == OneTrans) {
+
+  } else if (node->state == AnyTrans) {
+
+  } else {
+    s = false;
+  }
+  return s;
+}
+
+// Stub: returns false for the empty final node; `res` is not filled in yet.
+bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) {
+  bool s = true;
+  if (node->state == OneTransNext) {
+
+  } else if (node->state == OneTrans) {
+
+  } else if (node->state == AnyTrans) {
+
+  } else if (node->state == EmptyFinal){
+    s = false;
+  }
+  return s;
+}
+
+// Look up the transition index for input byte `b`; returns false when the single-transition states do not match.
+bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) {
+  bool s = true;
+  uint8_t input = 0; // TODO: decode s.input from the node data; 0 is a placeholder
+  if (node->state == OneTransNext) {
+    if (b == input) { *res = 0; }
+    else { return false; }
+  } else if (node->state == OneTrans) {
+    if (b == input) { *res = 0; }
+    else { return false; }
+  } else if (node->state == AnyTrans) {
+
+  } else if (node->state == EmptyFinal) {
+    s = false;
+  }
+  return s;
+}
+
+// Choose the encoding for `builderNode` from its transition count; the encoders themselves are still TODO.
+bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) {
+  size_t sz = taosArrayGetSize(builderNode->trans);
+  assert(sz < 256);
+  if (sz == 0 && builderNode->isFinal && builderNode->finalOutput == 0) {
+    return true;
+  } else if (sz != 1 || builderNode->isFinal) {
+    // AnyTrans->Compile(w, addr, node);
+  } else {
+    FstTransition *tran = taosArrayGet(builderNode->trans, 0);
+    if (tran->addr == lastAddr && tran->out == 0) {
+      //OneTransNext::compile(w, lastAddr, tran->inp);
+      return true;
+    } else {
+      //OneTrans::Compile(w, lastAddr, *tran);
+      return true;
+    }
+  }
+  return true;
+}
+
+
+
+
+// Allocate a builder writing to `w`; every field starts from a defined value.
+FstBuilder *fstBuilderCreate(void *w, FstType ty) {
+  FstBuilder *b = calloc(1, sizeof(FstBuilder)); // zero-fills registry, last and len
+  if (NULL == b) { return b; }
+
+  FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0};
+  b->wtr = wtr;
+  b->unfinished = fstUnFinishedNodesCreate(); // allocates the stack and pushes the root
+  if (b->unfinished == NULL) { free(b); return NULL; }
+  b->lastAddr = NONE_ADDRESS;
+  return b;
 }
 
+// Return the part of the node's backing buffer from the slice's `end` index to the last byte.
+FstSlice fstNodeAsSlice(FstNode *node) {
+  FstSlice *slice = &node->data;
+  FstSlice s = fstSliceCopy(slice, slice->end, slice->dLen - 1);
+  return s;
+}
 
-
diff --git a/source/libs/index/src/index_fst_automation.c b/source/libs/index/src/index_fst_automation.c
new file mode 100644
index 0000000000..f2f48bbc8a
--- /dev/null
+++ b/source/libs/index/src/index_fst_automation.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c index 39e5f89b35..4ab78cddc5 100644 --- a/source/libs/index/src/index_fst_common.c +++ b/source/libs/index/src/index_fst_common.c @@ -12,6 +12,8 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ + +#include "tutil.h" const uint8_t COMMON_INPUTS[] = { 84, // '\x00' 85, // '\x01' @@ -271,7 +273,7 @@ const uint8_t COMMON_INPUTS[] = { 255, // 'ΓΏ' }; -char const COMMON_INPUTS_INV[] = [ +char const COMMON_INPUTS_INV[] = { 't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w', '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=', ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6', @@ -300,5 +302,5 @@ char const COMMON_INPUTS_INV[] = [ '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', -]; +}; diff --git a/source/libs/index/src/index_fst_node.c b/source/libs/index/src/index_fst_node.c new file mode 100644 index 0000000000..3d5efd30f3 --- /dev/null +++ b/source/libs/index/src/index_fst_node.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */
+
diff --git a/source/libs/index/src/index_fst_registry.c b/source/libs/index/src/index_fst_registry.c
new file mode 100644
index 0000000000..940c5863f4
--- /dev/null
+++ b/source/libs/index/src/index_fst_registry.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+#include "index_fst_registry.h"
+
diff --git a/source/libs/index/src/index_fst_util.c b/source/libs/index/src/index_fst_util.c
new file mode 100644
index 0000000000..20751baf5f
--- /dev/null
+++ b/source/libs/index/src/index_fst_util.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+#include "index_fst_util.h"
+
+
+
+//A sentinel value used to indicate an empty final state
+const CompiledAddr EMPTY_ADDRESS = 0;
+/// A sentinel value used to indicate an invalid state.
+const CompiledAddr NONE_ADDRESS = 1;
+
+// This version number is written to every finite state transducer created by
+// this library. When a finite state transducer is read, its version number is
+// checked against this value.
+const uint64_t version = 3;
+// The threshold (in number of transitions) at which an index is created for
+// a node's transitions. This speeds up lookup time at the expense of FST size
+
+const uint64_t TRANS_INDEX_THRESHOLD = 32;
+
+
+//uint8_t commonInput(uint8_t idx) {
+//  if (idx == 0) { return -1; }
+//  else {
+//    return COMMON_INPUTS_INV[idx - 1];
+//  }
+//}
+//
+//uint8_t commonIdx(uint8_t v, uint8_t max) {
+//  uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256;
+//  return v > max ? 0: v;
+//}
+
+
+// Number of bytes (1..8) needed to store `n`.
+uint8_t packSize(uint64_t n) {
+  if (n < (1u << 8)) {
+    return 1;
+  } else if (n < (1u << 16)) {
+    return 2;
+  } else if (n < (1u << 24)) {
+    return 3;
+  } else if (n < ((uint64_t)(1) << 32)) {
+    return 4;
+  } else if (n < ((uint64_t)(1) << 40)) {
+    return 5;
+  } else if (n < ((uint64_t)(1) << 48)) {
+    return 6;
+  } else if (n < ((uint64_t)(1) << 56)) {
+    return 7;
+  } else {
+    return 8;
+  }
+}
+
+// Decode `sz` bytes (at most 8 are used) from `ch` as a little-endian unsigned integer.
+uint64_t unpackUint64(uint8_t *ch, uint8_t sz) {
+  uint64_t n = 0;
+  for (uint8_t i = 0; i < sz && i < 8; i++) {
+    n = n | ((uint64_t)ch[i] << (8 * i));
+  }
+  return n;
+}
+// Bytes needed to store the distance from a node to its transition target; EMPTY_ADDRESS packs like 0.
+uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) {
+  if (transAddr == EMPTY_ADDRESS) {
+    return packSize(EMPTY_ADDRESS);
+  } else {
+    return packSize(nodeAddr - transAddr);
+  }
+}
+// Read a `len`-byte delta from `data` and turn it back into an address; a zero delta means EMPTY_ADDRESS.
+CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) {
+  uint64_t delta = unpackUint64((uint8_t *)data, (uint8_t)len);
+  // delta_add = u64_to_usize
+  if (delta == EMPTY_ADDRESS) {
+    return EMPTY_ADDRESS;
+  } else {
+    return nodeAddr - delta;
+  }
+}
+
+// fst slice func
+// Wrap `data` without copying; for dLen == 0 `end` wraps around, so end - start + 1 is 0 in uint32_t arithmetic.
+FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) {
+  FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = (uint32_t)(dLen - 1)};
+  return slice;
+}
+// View [start, end] of the same buffer; an out-of-range request yields the empty slice (data == NULL).
+FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) {
+  FstSlice t = fstSliceCreate(NULL, 0); // every field defined, same shape as an empty slice
+  if (start >= slice->dLen || end >= slice->dLen || start > end) {
+    return t;
+  }
+
+  t.data = slice->data;
+  t.dLen = slice->dLen;
+  t.start = start;
+  t.end = end;
+  return t;
+}
+// True when the slice has no buffer or a zero-length buffer.
+bool fstSliceEmpty(FstSlice *slice) {
+  return slice->data == NULL || slice->dLen == 0;
+}
+
+
+