fst core struct
This commit is contained in:
parent
980ace09b5
commit
60e339b31d
|
@ -13,58 +13,73 @@
|
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef _INDEX_FST_H_
|
||||
#define _INDEX_FST_H_
|
||||
#include "index_fst.h"
|
||||
#ifndef __INDEX_FST_H__
|
||||
#define __INDEX_FST_H__
|
||||
|
||||
|
||||
#include "tarray.h"
|
||||
|
||||
typedef FstType uint64_t;
|
||||
typedef CompiledAddr uint64_t;
|
||||
typedef Output uint64_t;
|
||||
typedef PackSizes uint8_t;
|
||||
#include "index_fst_util.h"
|
||||
#include "index_fst_registry.h"
|
||||
|
||||
|
||||
//A sentinel value used to indicate an empty final state
|
||||
const CompileAddr EMPTY_ADDRESS = 0;
|
||||
/// A sentinel value used to indicate an invalid state.
|
||||
const CompileAddr NONE_ADDRESS = 1;
|
||||
typedef struct FstNode FstNode;
|
||||
// Yields the smaller of the two values; used as the shared "prefix" of two
// integer outputs. Note: each argument may be evaluated twice.
#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a))
|
||||
|
||||
// This version number is written to every finite state transducer created by
|
||||
// this crate. When a finite state transducer is read, its version number is
|
||||
// checked against this value.
|
||||
const uint64_t version = 3;
|
||||
// The threshold (in number of transitions) at which an index is created for
|
||||
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||
|
||||
const uint64_t TRANS_INDEX_THRESHOLD = 32;
|
||||
|
||||
typedef struct FstRange {
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
} FstRange;
|
||||
|
||||
enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal};
|
||||
enum FstBound { Included, Excluded, Unbounded};
|
||||
|
||||
typedef struct CheckSummer {
|
||||
uint32_t sum;
|
||||
};
|
||||
typedef struct FstBuilderNode {
|
||||
bool isFinal;
|
||||
Output finalOutput;
|
||||
SArray *trans; // <FstTransition>
|
||||
} FstBuilderNode;
|
||||
|
||||
typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State;
|
||||
typedef enum { Included, Excluded, Unbounded} FstBound;
|
||||
|
||||
typedef uint32_t CheckSummer;
|
||||
|
||||
|
||||
typedef struct FstBuilder {
|
||||
FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
|
||||
FstUnFinishedNodes unfinished // The stack of unfinished nodes
|
||||
Registry registry // A map of finished nodes.
|
||||
SArray* last // The last word added
|
||||
CompiledAddr lastAddr // The address of the last compiled node
|
||||
uint64_t len // num of keys added
|
||||
} FstBuilder;
|
||||
/*
|
||||
*
|
||||
* UnFinished node and helper function
|
||||
* TODO: simple function name
|
||||
*/
|
||||
typedef struct FstUnFinishedNodes {
|
||||
SArray *stack; // <FstBuilderNodeUnfinished> } FstUnFinishedNodes;
|
||||
} FstUnFinishedNodes;
|
||||
|
||||
// Number of entries on the unfinished-node stack. The argument is
// parenthesized so that any pointer expression can be passed safely.
#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize((nodes)->stack)
|
||||
|
||||
FstUnFinishedNodes *FstUnFinishedNodesCreate();
|
||||
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal);
|
||||
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes);
|
||||
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr);
|
||||
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes);
|
||||
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out);
|
||||
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr);
|
||||
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out);
|
||||
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs);
|
||||
uint64_t FstUnFinishedNodesFindCommPreifxAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out);
|
||||
|
||||
typedef struct FstCountingWriter {
|
||||
void* wtr; // wrap any writer that counts and checksum bytes written
|
||||
uint64_t count;
|
||||
CheckSummer summer;
|
||||
};
|
||||
} FstCountingWriter;
|
||||
|
||||
typedef struct FstBuilder {
|
||||
FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
|
||||
FstUnFinishedNodes *unfinished; // The stack of unfinished nodes
|
||||
FstRegistry registry; // A map of finished nodes.
|
||||
SArray* last; // The last word added
|
||||
CompiledAddr lastAddr; // The address of the last compiled node
|
||||
uint64_t len; // num of keys added
|
||||
} FstBuilder;
|
||||
|
||||
|
||||
|
||||
|
@ -80,16 +95,6 @@ typedef struct FstTransitions {
|
|||
FstRange range;
|
||||
} FstTransitions;
|
||||
|
||||
typedef struct FstUnFinishedNodes {
|
||||
SArray *stack; // <FstBuilderNodeUnfinished>
|
||||
} FstUnFinishedNodes;
|
||||
|
||||
typedef struct FstBuilderNode {
|
||||
bool isFinal;
|
||||
Output finalOutput;
|
||||
SArray *trans; // <FstTransition>
|
||||
} FstBuilderNode;
|
||||
|
||||
|
||||
|
||||
typedef struct FstLastTransition {
|
||||
|
@ -97,13 +102,23 @@ typedef struct FstLastTransition {
|
|||
Output out;
|
||||
} FstLastTransition;
|
||||
|
||||
/*
|
||||
* FstBuilderNodeUnfinished and helper function
|
||||
* TODO: simple function name
|
||||
*/
|
||||
typedef struct FstBuilderNodeUnfinished {
|
||||
FstBuilderNode node;
|
||||
FstLastTransition last;
|
||||
FstBuilderNode *node;
|
||||
FstLastTransition* last;
|
||||
} FstBuilderNodeUnfinished;
|
||||
|
||||
void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||
void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||
|
||||
/*
|
||||
* FstNode and helper function
|
||||
*/
|
||||
typedef struct FstNode {
|
||||
uint8_t* data;
|
||||
FstSlice data;
|
||||
uint64_t version;
|
||||
State state;
|
||||
CompiledAddr start;
|
||||
|
@ -114,6 +129,28 @@ typedef struct FstNode {
|
|||
Output finalOutput;
|
||||
} FstNode;
|
||||
|
||||
// If this node is final and has a terminal output value, then it is returned.
// Otherwise, a zero output is returned.
#define FST_NODE_FINAL_OUTPUT(node) ((node)->finalOutput)
// Returns true if and only if this node corresponds to a final, or "match",
// state in the finite state transducer.
#define FST_NODE_IS_FINAL(node) ((node)->isFinal)
// Returns the number of transitions in this node. The maximum number of
// transitions is 256.
#define FST_NODE_LEN(node) ((node)->nTrans)
// Returns true if and only if this node has zero transitions.
// (The historical spelling "EMPTYE" is kept for existing users.)
#define FST_NODE_IS_EMPTYE(node) ((node)->nTrans == 0)
// Correctly spelled alias of FST_NODE_IS_EMPTYE.
#define FST_NODE_IS_EMPTY(node) FST_NODE_IS_EMPTYE(node)
// Returns the address of this node.
#define FST_NODE_ADDR(node) ((node)->start)
|
||||
|
||||
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data);
|
||||
FstTransitions fstNodeTransitionIter(FstNode *node);
|
||||
FstTransitions* fstNodeTransitions(FstNode *node);
|
||||
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res);
|
||||
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res);
|
||||
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res);
|
||||
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
|
||||
FstSlice fstNodeAsSlice(FstNode *node);
|
||||
|
||||
|
||||
|
||||
typedef struct FstMeta {
|
||||
uint64_t version;
|
||||
CompiledAddr rootAddr;
|
||||
|
@ -125,42 +162,21 @@ typedef struct FstMeta {
|
|||
typedef struct Fst {
|
||||
FstMeta meta;
|
||||
void *data; //
|
||||
};
|
||||
} Fst;
|
||||
|
||||
// ops
|
||||
// ops
|
||||
|
||||
typedef struct FstIndexedValue {
|
||||
uint64_t index;
|
||||
uint64_t value;
|
||||
};
|
||||
} FstIndexedValue;
|
||||
|
||||
// relate to Regist
|
||||
typedef struct FstRegistry {
|
||||
SArray *table; // <Registtry cell>
|
||||
uint64_t tableSize; // num of rows
|
||||
uint64_t mruSize; // num of columns
|
||||
} FstRegistry;
|
||||
|
||||
typedef struct FstRegistryCache {
|
||||
SArray *cells; // <RegistryCell>
|
||||
} FstRegistryCache;
|
||||
|
||||
typedef struct FstRegistryCell {
|
||||
CompiledAddr addr;
|
||||
FstBuilderNode *node;
|
||||
} FstRegistryCell;
|
||||
|
||||
enum FstRegistryEntry {Found, NotFound, Rejected};
|
||||
|
||||
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data);
|
||||
FstTransitions fstNodeTransitionIter(FstNode *node);
|
||||
FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i);
|
||||
CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i);
|
||||
int64_t fstNodeFindInput(FstNode *node, int8_t b);
|
||||
Output fstNodeGetFinalOutput(FstNode *node);
|
||||
void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode);
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#ifndef __INDEX_FST_AUTAOMATION_H__
|
||||
#define __INDEX_FST_AUTAOMATION_H__
|
||||
|
||||
// Forward declaration. In C the bare tag does not introduce the name
// `AutomationCtx`, so a typedef is required before it is used as a type.
typedef struct AutomationCtx AutomationCtx;
|
||||
|
||||
typedef struct StartWith {
|
||||
AutomationCtx *autoSelf;
|
||||
} StartWith;
|
||||
|
||||
typedef struct Complement {
|
||||
AutomationCtx *autoSelf;
|
||||
} Complement;
|
||||
|
||||
// automation
|
||||
typedef struct AutomationCtx {
|
||||
void *data;
|
||||
} AutomationCtx;
|
||||
|
||||
// automation interface
|
||||
void (*start)(AutomationCtx *ctx);
|
||||
bool (*isMatch)(AutomationCtx *ctx);
|
||||
bool (*canMatch)(AutomationCtx *ctx, void *data);
|
||||
bool (*willAlwaysMatch)(AutomationCtx *ctx, void *state);
|
||||
void* (*accpet)(AutomationCtx *ctx, void *state, uint8_t byte);
|
||||
// Callback taking the context and an opaque state pointer; the second
// parameter previously had no type, which is not valid C.
void* (*accpetEof)(AutomationCtx *ctx, void *state);
|
||||
|
||||
|
||||
#endif
|
|
@ -0,0 +1,22 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef __INDEX_FST_NODE_H__
|
||||
#define __INDEX_FST_NODE_H__
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#ifndef __FST_REGISTRY_H__
|
||||
#define __FST_REGISTRY_H__
|
||||
|
||||
#include "index_fst_util.h"
|
||||
|
||||
|
||||
typedef struct FstRegistry {
|
||||
|
||||
} FstRegistry;
|
||||
#endif
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INDEX_FST_UTIL_H__
|
||||
#define __INDEX_FST_UTIL_H__
|
||||
|
||||
#include "tarray.h"
|
||||
|
||||
|
||||
typedef uint64_t FstType;
|
||||
typedef uint64_t CompiledAddr;
|
||||
typedef uint64_t Output;
|
||||
typedef uint8_t PackSizes;
|
||||
|
||||
|
||||
//A sentinel value used to indicate an empty final state
|
||||
extern const CompiledAddr EMPTY_ADDRESS;
|
||||
/// A sentinel value used to indicate an invalid state.
|
||||
extern const CompiledAddr NONE_ADDRESS;
|
||||
|
||||
// This version number is written to every finite state transducer created by
|
||||
// this library. When a finite state transducer is read, its version number is
|
||||
// checked against this value.
|
||||
extern const uint64_t version;
|
||||
// The threshold (in number of transitions) at which an index is created for
|
||||
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||
|
||||
extern const uint64_t TRANS_INDEX_THRESHOLD;
|
||||
// high 4 bits is transition address packed size.
|
||||
// low 4 bits is output value packed size.
|
||||
//
|
||||
// `0` is a legal value which means there are no transitions/outputs
|
||||
|
||||
// Stores `sz` in the high 4 bits of `v`, keeping the low 4 bits.
#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do { (v) = ((v) & 0x0F) | ((sz) << 4); } while (0)
// Reads the high 4 bits of `v`.
#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0xF0) >> 4)
// Stores `sz` in the low 4 bits of `v`, keeping the high 4 bits.
// `sz` is OR-ed in unmasked, so it must fit in 4 bits.
#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { (v) = ((v) & 0xF0) | (sz); } while (0)
// Reads the low 4 bits of `v`.
#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0x0F)
|
||||
|
||||
#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1]
|
||||
|
||||
// Looks up COMMON_INPUTS[v], adds one modulo 256, and stores the result in
// `val`; a result greater than `max` is replaced by 0.
// All arguments are parenthesized; `val` must be an lvalue.
#define COMMON_INDEX(v, max, val) do {                      \
  (val) = ((uint16_t)COMMON_INPUTS[(v)] + 1) % 256;         \
  (val) = (val) > (max) ? 0 : (val);                        \
} while (0)
|
||||
|
||||
|
||||
//uint8_t commonInput(uint8_t idx);
|
||||
//uint8_t commonIdx(uint8_t v, uint8_t max);
|
||||
|
||||
uint8_t packSize(uint64_t n);
|
||||
uint64_t unpackUint64(uint8_t *ch, uint8_t sz);
|
||||
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr);
|
||||
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr);
|
||||
|
||||
|
||||
|
||||
typedef struct FstSlice {
|
||||
uint8_t *data;
|
||||
uint64_t dLen;
|
||||
uint32_t start;
|
||||
uint32_t end;
|
||||
} FstSlice;
|
||||
|
||||
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end);
|
||||
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen);
|
||||
bool fstSliceEmpty(FstSlice *slice);
|
||||
|
||||
|
||||
#endif
|
|
@ -15,13 +15,143 @@
|
|||
|
||||
#include "index_fst.h"
|
||||
|
||||
|
||||
FstUnFinishedNodes *fstUnFinishedNodesCreate() {
|
||||
FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes));
|
||||
if (nodes == NULL) { return NULL; }
|
||||
|
||||
nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished));
|
||||
fstUnFinishedNodesPushEmpty(nodes, false);
|
||||
return nodes;
|
||||
}
|
||||
// Pushes a new entry whose builder node has no transitions, a zero final
// output and the given `isFinal` flag; the entry has no pending transition.
// If the node cannot be allocated nothing is pushed.
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) {
  FstBuilderNode *node = malloc(sizeof(*node));
  if (node == NULL) {
    // The function returns void, so failure cannot be reported; avoid the
    // NULL dereference at least.
    return;
  }
  node->isFinal     = isFinal;
  node->finalOutput = 0;
  node->trans       = NULL;

  FstBuilderNodeUnfinished un = {.node = node, .last = NULL};
  taosArrayPush(nodes->stack, &un);
}
|
||||
// Pops the single remaining entry of the stack and hands back its builder
// node. Asserts that exactly one entry is present and that it carries no
// pending transition.
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) {
  size_t depth = taosArrayGetSize(nodes->stack);
  assert(depth == 1);

  FstBuilderNodeUnfinished *root = taosArrayPop(nodes->stack);
  assert(root->last == NULL);

  FstBuilderNode *result = root->node;
  return result;
}
|
||||
|
||||
// Pops the top entry, compiles its pending transition against `addr`,
// releases that transition and returns the entry's builder node.
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
  fstBuilderNodeUnfinishedLastCompiled(un, addr);

  free(un->last);  // TODO add func FstLastTransitionFree()
  // Clear the field so the popped entry no longer holds a dangling pointer.
  un->last = NULL;

  return un->node;
}
|
||||
|
||||
// Removes the top entry, which must not carry a pending transition, and
// returns the builder node it held.
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) {
  FstBuilderNodeUnfinished *top = taosArrayPop(nodes->stack);
  assert(top->last == NULL);

  FstBuilderNode *result = top->node;
  return result;
}
|
||||
// Flags the bottom entry of the stack (index 0) as final and records `out`
// as its final output. The node's transitions are left untouched.
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) {
  FstBuilderNodeUnfinished *root = taosArrayGet(nodes->stack, 0);
  FstBuilderNode *rootNode = root->node;

  rootNode->isFinal     = true;
  rootNode->finalOutput = out;
}
|
||||
// Compiles the pending transition of the entry on top of the stack against
// `addr`. The entry itself stays on the stack.
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  size_t topIdx = taosArrayGetSize(nodes->stack) - 1;
  FstBuilderNodeUnfinished *top = taosArrayGet(nodes->stack, topIdx);

  fstBuilderNodeUnfinishedLastCompiled(top, addr);
}
|
||||
// Appends the bytes bs.data[bs.start .. bs.end] (inclusive) to the stack.
//
// The first byte becomes the pending transition of the current top entry and
// carries `out`. Every following byte gets its own new entry whose pending
// transition has a zero output. A final, empty entry is pushed last.
// Nothing happens for an empty or invalid slice.
//
// Fixes relative to the previous version:
//  - the loop starts at start + 1, so the first byte is not added twice;
//  - only the first transition carries `out`, so it is not counted once per byte;
//  - inner locals no longer shadow `un` / `trn`;
//  - allocation results are checked before use.
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) {
  FstSlice *s = &bs;
  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
    return;
  }

  size_t sz = taosArrayGetSize(nodes->stack) - 1;
  FstBuilderNodeUnfinished *top = taosArrayGet(nodes->stack, sz);
  assert(top->last == NULL);

  FstLastTransition *first = malloc(sizeof(*first));
  if (first == NULL) { return; }
  first->inp = s->data[s->start];
  first->out = out;
  // `top` points into the array; it is only used before the pushes below.
  top->last = first;

  for (uint64_t i = (uint64_t)s->start + 1; i <= s->end; i++) {
    FstBuilderNode    *n   = malloc(sizeof(*n));
    FstLastTransition *trn = malloc(sizeof(*trn));
    if (n == NULL || trn == NULL) {
      // NOTE(review): the suffix is left partially added; there is no error
      // channel in this signature.
      free(n);
      free(trn);
      return;
    }
    n->isFinal     = false;
    n->finalOutput = 0;
    n->trans       = NULL;

    trn->inp = s->data[i];
    trn->out = 0;

    FstBuilderNodeUnfinished next = {.node = n, .last = trn};
    taosArrayPush(nodes->stack, &next);
  }
  fstUnFinishedNodesPushEmpty(nodes, true);
}
|
||||
|
||||
|
||||
// Returns how many leading bytes of `bs` equal the pending transition inputs
// of the stack entries, compared position by position from the bottom.
// Counting stops at the first mismatch, at the first entry without a pending
// transition, or when either sequence is exhausted.
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) {
  FstSlice *s = &bs;

  size_t lsz = (size_t)(s->end - s->start + 1);  // data len
  size_t ssz = taosArrayGetSize(node->stack);    // stack size

  uint64_t count = 0;
  for (size_t i = 0; i < ssz && i < lsz; i++) {
    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
    // An entry pushed as "empty" has last == NULL; it cannot match anything.
    if (un->last == NULL || un->last->inp != s->data[s->start + i]) {
      break;
    }
    count++;
  }
  return count;
}
|
||||
// Walks the stack from the bottom while the pending transition inputs match
// the bytes of `bs`, and redistributes outputs along the matched path:
//   - each matched transition keeps min(its output, remaining `in`);
//   - what the transition loses is pushed onto the following entry;
//   - what the transition keeps is subtracted from the remaining `in`.
// Returns the number of matched bytes; the remaining output is stored in
// *out when `out` is non-NULL.
//
// Fixes relative to the previous version: `in` is now used, the remainder is
// written through `out` instead of moving the pointer, the match count is
// returned, entries without a pending transition end the walk, and the excess
// output goes to the next entry rather than the matched one.
uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) {
  FstSlice *s = &bs;

  size_t lsz = (size_t)(s->end - s->start + 1);  // data len
  size_t ssz = taosArrayGetSize(node->stack);    // stack size

  uint64_t res = 0;
  for (size_t i = 0; i < lsz && i < ssz; i++) {
    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
    FstLastTransition *last = un->last;
    if (last == NULL || last->inp != s->data[s->start + i]) {
      break;
    }
    res++;

    uint64_t commPrefix = last->out < in ? last->out : in;
    uint64_t addPrefix  = last->out - commPrefix;
    in        = in - commPrefix;
    last->out = commPrefix;

    if (addPrefix != 0 && i + 1 < ssz) {
      FstBuilderNodeUnfinished *next = taosArrayGet(node->stack, i + 1);
      fstBuilderNodeUnfinishedAddOutputPrefix(next, addPrefix);
    }
  }

  if (out != NULL) {
    *out = in;
  }
  return res;
}
|
||||
|
||||
// fst node function
|
||||
FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
|
||||
|
||||
|
||||
|
||||
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) {
|
||||
FstNode *n = (FstNode *)malloc(sizeof(FstNode));
|
||||
if (n == NULL) { return NULL; }
|
||||
|
||||
if (addr == EMPTY_ADDRESS) {
|
||||
n->date = NULL;
|
||||
n->data = fstSliceCreate(NULL, 0);
|
||||
n->version = version;
|
||||
n->state = EmptyFinal;
|
||||
n->start = EMPTY_ADDRESS;
|
||||
|
@ -29,20 +159,138 @@ FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
|
|||
n->isFinal = true;
|
||||
n->nTrans = 0;
|
||||
n->sizes = 0;
|
||||
n->finalOutpu = 0;
|
||||
return n;
|
||||
n->finalOutput = 0;
|
||||
}
|
||||
uint8_t v = slice->data[addr];
|
||||
uint8_t s = (v & 0b11000000) >> 6;
|
||||
if (s == 0b11) { // oneTransNext
|
||||
n->data = fstSliceCopy(slice, 0, addr);
|
||||
n->version = version;
|
||||
n->state = OneTransNext;
|
||||
n->start = addr;
|
||||
n->end = addr; //? s.end_addr(data);
|
||||
n->isFinal = false;
|
||||
n->sizes = 0;
|
||||
n->nTrans = 0;
|
||||
n->finalOutput = 0;
|
||||
} else if (v == 0b10) { // oneTrans
|
||||
uint64_t sz; // fetch sz from addr
|
||||
n->data = fstSliceCopy(slice, 0, addr);
|
||||
n->version = version;
|
||||
n->state = OneTrans;
|
||||
n->start = addr;
|
||||
n->end = addr; // s.end_addr(data, sz);
|
||||
n->isFinal = false;
|
||||
n->nTrans = 1;
|
||||
n->sizes = sz;
|
||||
n->finalOutput = 0;
|
||||
} else { // anyTrans
|
||||
uint64_t sz; // s.sizes(data)
|
||||
uint32_t nTrans; // s.ntrans(data)
|
||||
n->data = *slice;
|
||||
n->version = version;
|
||||
n->state = AnyTrans;
|
||||
n->start = addr;
|
||||
n->end = addr; // s.end_addr(version, data, sz, ntrans);
|
||||
n->isFinal = false; // s.is_final_state();
|
||||
n->nTrans = nTrans;
|
||||
n->sizes = sz;
|
||||
n->finalOutput = 0; // s.final_output(version, data, sz, ntrans);
|
||||
}
|
||||
uint8_t v = (data[addr] & 0b1100000) >> 6;
|
||||
if (v == 0b11) {
|
||||
|
||||
} else if (v == 0b10) {
|
||||
|
||||
} else {
|
||||
return n;
|
||||
}
|
||||
// Allocates a transition cursor for `node` covering the range
// [0, FST_NODE_LEN(node)]. Returns NULL when the allocation fails; the caller
// owns the returned object.
FstTransitions* fstNodeTransitions(FstNode *node) {
  FstTransitions *iter = malloc(sizeof(FstTransitions));
  if (iter != NULL) {
    iter->node        = node;
    iter->range.start = 0;
    iter->range.end   = FST_NODE_LEN(node);
  }
  return iter;
}
|
||||
// Reports whether `node` can provide a transition: true for OneTransNext,
// OneTrans and AnyTrans, false for any other state.
// The per-state decoding is not implemented yet, so `i` is ignored and `res`
// is never written.
// (A stray closing brace after the first branch was removed; it unbalanced
// the if/else chain.)
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) {
  bool s = true;
  if (node->state == OneTransNext) {
    // TODO: decode the single implicit transition into *res
  } else if (node->state == OneTrans) {
    // TODO: decode the single transition into *res
  } else if (node->state == AnyTrans) {
    // TODO: decode transition `i` into *res
  } else {
    s = false;
  }
  return s;
}
|
||||
|
||||
// Returns false when `node` is in the EmptyFinal state and true otherwise.
// Address decoding is not implemented yet: `i` is ignored and `res` is never
// written.
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) {
  switch (node->state) {
    case OneTransNext:
    case OneTrans:
    case AnyTrans:
      // placeholders: nothing is decoded yet
      return true;
    case EmptyFinal:
      return false;
    default:
      return true;
  }
}
|
||||
|
||||
// Looks for a transition of `node` whose input byte equals `b`.
// Returns true and stores the transition index in *res on a match, and false
// when the single-transition states do not match or the node is EmptyFinal.
//
// Fixes relative to the previous version: a mismatch in the single-transition
// states now returns false instead of true, and `input` is no longer read
// uninitialized.
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) {
  // TODO: decode the real input byte from the node data (s.input); 0 is a
  // placeholder so the comparison below has a defined operand.
  uint8_t input = 0;

  if (node->state == OneTransNext || node->state == OneTrans) {
    if (b != input) {
      return false;
    }
    *res = 0;
    return true;
  }
  if (node->state == EmptyFinal) {
    return false;
  }
  // NOTE(review): the AnyTrans lookup is not implemented; true is returned
  // without writing *res, exactly as before.
  return true;
}
|
||||
|
||||
// Picks the encoding for `builderNode` by its transition count and final
// flag. The encoders are not wired in yet, so every path returns true and
// nothing is written to `w`. Asserts that there are fewer than 256 transitions.
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) {
  size_t nTrans = taosArrayGetSize(builderNode->trans);
  assert(nTrans < 256);

  // Final node without transitions or output: nothing to emit.
  if (nTrans == 0 && builderNode->isFinal && builderNode->finalOutput == 0) {
    return true;
  }

  // Zero or several transitions, or a final node: general encoding.
  if (nTrans != 1 || builderNode->isFinal) {
    // AnyTrans->Compile(w, addr, node);
    return true;
  }

  // Exactly one transition on a non-final node.
  FstTransition *only = taosArrayGet(builderNode->trans, 0);
  bool pointsToLast = (only->addr == lastAddr && only->out == 0);
  if (pointsToLast) {
    //OneTransNext::compile(w, lastAddr, tran->inp);
  } else {
    //OneTrans::Compile(w, lastAddr, *tran);
  }
  return true;
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Creates a builder that writes through `w`.
// Returns NULL if the builder or its unfinished-node stack cannot be created.
//
// Fixes relative to the previous version: the unfinished-node stack is built
// with fstUnFinishedNodesCreate() so its array exists, and the remaining
// fields are initialized instead of being left indeterminate.
FstBuilder *fstBuilderCreate(void *w, FstType ty) {
  (void)ty;  // TODO: the FST type is not recorded anywhere yet

  // calloc zero-fills the whole struct, including `registry`.
  FstBuilder *b = calloc(1, sizeof(*b));
  if (NULL == b) { return b; }

  FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0};
  b->wtr = wtr;

  b->unfinished = fstUnFinishedNodesCreate();
  if (b->unfinished == NULL) {
    free(b);
    return NULL;
  }

  b->last     = NULL;
  b->lastAddr = NONE_ADDRESS;  // no node has been compiled yet
  b->len      = 0;
  return b;
}
|
||||
// Returns a view over the node's data that runs from the slice's current
// `end` index up to the last byte (dLen - 1). No bytes are copied.
FstSlice fstNodeAsSlice(FstNode *node) {
  return fstSliceCopy(&node->data, node->data.end, node->data.dLen - 1);
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
|
@ -12,6 +12,8 @@
|
|||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "tutil.h"
|
||||
const uint8_t COMMON_INPUTS[] = {
|
||||
84, // '\x00'
|
||||
85, // '\x01'
|
||||
|
@ -271,7 +273,7 @@ const uint8_t COMMON_INPUTS[] = {
|
|||
255, // 'ÿ'
|
||||
};
|
||||
|
||||
char const COMMON_INPUTS_INV[] = [
|
||||
char const COMMON_INPUTS_INV[] = {
|
||||
't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
|
||||
'.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
|
||||
':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
|
||||
|
@ -300,5 +302,5 @@ char const COMMON_INPUTS_INV[] = [
|
|||
'\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
|
||||
'\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
|
||||
'\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
|
||||
];
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "index_fst_registry.h"
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||
*
|
||||
* This program is free software: you can use, redistribute, and/or modify
|
||||
* it under the terms of the GNU Affero General Public License, version 3
|
||||
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include "index_fst_util.h"
|
||||
|
||||
|
||||
|
||||
//A sentinel value used to indicate an empty final state
|
||||
const CompiledAddr EMPTY_ADDRESS = 0;
|
||||
/// A sentinel value used to indicate an invalid state.
|
||||
const CompiledAddr NONE_ADDRESS = 1;
|
||||
|
||||
// This version number is written to every finite state transducer created by
|
||||
// this library. When a finite state transducer is read, its version number is
|
||||
// checked against this value.
|
||||
const uint64_t version = 3;
|
||||
// The threshold (in number of transitions) at which an index is created for
|
||||
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||
|
||||
const uint64_t TRANS_INDEX_THRESHOLD = 32;
|
||||
|
||||
|
||||
//uint8_t commonInput(uint8_t idx) {
|
||||
// if (idx == 0) { return -1; }
|
||||
// else {
|
||||
// return COMMON_INPUTS_INV[idx - 1];
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//uint8_t commonIdx(uint8_t v, uint8_t max) {
|
||||
// uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256;
|
||||
// return v > max ? 0: v;
|
||||
//}
|
||||
|
||||
|
||||
|
||||
// Returns the minimum number of bytes (1..8) needed to store `n`.
// Zero still needs one byte.
uint8_t packSize(uint64_t n) {
  uint8_t bytes = 1;
  // Grow while bits remain above the bytes counted so far; the shift never
  // exceeds 56 because the loop stops at 8 bytes.
  while (bytes < 8 && (n >> (8 * bytes)) != 0) {
    bytes++;
  }
  return bytes;
}
|
||||
|
||||
// Decodes `sz` bytes from `ch` as a little-endian unsigned integer: ch[0] is
// the least significant byte. At most 8 bytes are consumed; sz == 0 yields 0.
//
// Fixes relative to the previous version: the accumulator starts at zero
// instead of an indeterminate value, and each byte is widened to 64 bits
// before shifting. Shifting the promoted `int` lost or sign-extended the
// upper bytes.
uint64_t unpackUint64(uint8_t *ch, uint8_t sz) {
  uint64_t n = 0;
  for (uint8_t i = 0; i < sz && i < sizeof(n); i++) {
    n |= (uint64_t)ch[i] << (8 * i);
  }
  return n;
}
|
||||
// Returns the number of bytes needed to store the transition address as a
// delta from the node address (nodeAddr - transAddr). An EMPTY_ADDRESS
// target is stored as EMPTY_ADDRESS itself.
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) {
  CompiledAddr stored = (transAddr == EMPTY_ADDRESS) ? EMPTY_ADDRESS
                                                     : nodeAddr - transAddr;
  return packSize(stored);
}
|
||||
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) {
|
||||
uint64_t delta = unpackUint64(data, len);
|
||||
// delta_add = u64_to_usize
|
||||
if (delta == EMPTY_ADDRESS) {
|
||||
return EMPTY_ADDRESS;
|
||||
} else {
|
||||
return nodeAddr - delta;
|
||||
}
|
||||
}
|
||||
|
||||
// fst slice func
|
||||
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) {
|
||||
FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = dLen - 1};
|
||||
return slice;
|
||||
}
|
||||
// Returns a view of `slice` restricted to [start, end] (inclusive). The
// bytes are shared, not copied.
// When the range is out of bounds or inverted, a fully zeroed slice with
// data == NULL is returned. Previously only `data` was set on that path and
// the other fields were indeterminate.
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) {
  FstSlice t = {.data = NULL, .dLen = 0, .start = 0, .end = 0};
  if (start >= slice->dLen || end >= slice->dLen || start > end) {
    return t;
  }

  t.data  = slice->data;
  t.dLen  = slice->dLen;
  t.start = start;
  t.end   = end;
  return t;
}
|
||||
// A slice is empty when it has no backing buffer or a zero buffer length.
bool fstSliceEmpty(FstSlice *slice) {
  if (slice->data == NULL) {
    return true;
  }
  // dLen is unsigned, so "<= 0" can only mean "== 0".
  return slice->dLen == 0;
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue