fst core struct
This commit is contained in:
parent
980ace09b5
commit
60e339b31d
|
@ -13,58 +13,73 @@
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _INDEX_FST_H_
|
#ifndef __INDEX_FST_H__
|
||||||
#define _INDEX_FST_H_
|
#define __INDEX_FST_H__
|
||||||
#include "index_fst.h"
|
|
||||||
|
|
||||||
#include "tarray.h"
|
#include "tarray.h"
|
||||||
|
#include "index_fst_util.h"
|
||||||
typedef FstType uint64_t;
|
#include "index_fst_registry.h"
|
||||||
typedef CompiledAddr uint64_t;
|
|
||||||
typedef Output uint64_t;
|
|
||||||
typedef PackSizes uint8_t;
|
|
||||||
|
|
||||||
|
|
||||||
//A sentinel value used to indicate an empty final state
|
typedef struct FstNode FstNode;
|
||||||
const CompileAddr EMPTY_ADDRESS = 0;
|
// Common prefix of two outputs: the smaller of the two values.
// NOTE: each argument may be evaluated twice; do not pass expressions with side effects.
#define OUTPUT_PREFIX(a, b) ((a) > (b) ? (b) : (a))
|
||||||
/// A sentinel value used to indicate an invalid state.
|
|
||||||
const CompileAddr NONE_ADDRESS = 1;
|
|
||||||
|
|
||||||
// This version number is written to every finite state transducer created by
|
|
||||||
// this crate. When a finite state transducer is read, its version number is
|
|
||||||
// checked against this value.
|
|
||||||
const uint64_t version = 3;
|
|
||||||
// The threshold (in number of transitions) at which an index is created for
|
|
||||||
// a node's transitions. This speeds up lookup time at the expense of FST size
|
|
||||||
|
|
||||||
const uint64_t TRANS_INDEX_THRESHOLD = 32;
|
|
||||||
|
|
||||||
typedef struct FstRange {
|
typedef struct FstRange {
|
||||||
uint64_t start;
|
uint64_t start;
|
||||||
uint64_t end;
|
uint64_t end;
|
||||||
} FstRange;
|
} FstRange;
|
||||||
|
|
||||||
enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal};
|
|
||||||
enum FstBound { Included, Excluded, Unbounded};
|
|
||||||
|
|
||||||
typedef struct CheckSummer {
|
typedef struct FstBuilderNode {
|
||||||
uint32_t sum;
|
bool isFinal;
|
||||||
};
|
Output finalOutput;
|
||||||
|
SArray *trans; // <FstTransition>
|
||||||
|
} FstBuilderNode;
|
||||||
|
|
||||||
|
typedef enum { OneTransNext, OneTrans, AnyTrans, EmptyFinal} State;
|
||||||
|
typedef enum { Included, Excluded, Unbounded} FstBound;
|
||||||
|
|
||||||
|
typedef uint32_t CheckSummer;
|
||||||
|
|
||||||
|
|
||||||
typedef struct FstBuilder {
|
/*
|
||||||
FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
|
*
|
||||||
FstUnFinishedNodes unfinished // The stack of unfinished nodes
|
* UnFinished node and helper function
|
||||||
Registry registry // A map of finished nodes.
|
 * TODO: simplify function names
|
||||||
SArray* last // The last word added
|
*/
|
||||||
CompiledAddr lastAddr // The address of the last compiled node
|
typedef struct FstUnFinishedNodes {
|
||||||
uint64_t len // num of keys added
|
SArray *stack; // <FstBuilderNodeUnfinished> } FstUnFinishedNodes;
|
||||||
} FstBuilder;
|
} FstUnFinishedNodes;
|
||||||
|
|
||||||
|
#define FST_UNFINISHED_NODES_LEN(nodes) taosArrayGetSize(nodes->stack)
|
||||||
|
|
||||||
|
FstUnFinishedNodes *FstUnFinishedNodesCreate();
|
||||||
|
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr);
|
||||||
|
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes);
|
||||||
|
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *node, Output out);
|
||||||
|
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *node, CompiledAddr addr);
|
||||||
|
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *node, FstSlice bs, Output out);
|
||||||
|
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs);
|
||||||
|
uint64_t FstUnFinishedNodesFindCommPreifxAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out);
|
||||||
|
|
||||||
typedef struct FstCountingWriter {
|
typedef struct FstCountingWriter {
|
||||||
void* wtr; // wrap any writer that counts and checksum bytes written
|
void* wtr; // wrap any writer that counts and checksum bytes written
|
||||||
uint64_t count;
|
uint64_t count;
|
||||||
CheckSummer summer;
|
CheckSummer summer;
|
||||||
};
|
} FstCountingWriter;
|
||||||
|
|
||||||
|
typedef struct FstBuilder {
|
||||||
|
FstCountingWriter wtr; // The FST raw data is written directly to `wtr`.
|
||||||
|
FstUnFinishedNodes *unfinished; // The stack of unfinished nodes
|
||||||
|
FstRegistry registry; // A map of finished nodes.
|
||||||
|
SArray* last; // The last word added
|
||||||
|
CompiledAddr lastAddr; // The address of the last compiled node
|
||||||
|
uint64_t len; // num of keys added
|
||||||
|
} FstBuilder;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -80,16 +95,6 @@ typedef struct FstTransitions {
|
||||||
FstRange range;
|
FstRange range;
|
||||||
} FstTransitions;
|
} FstTransitions;
|
||||||
|
|
||||||
typedef struct FstUnFinishedNodes {
|
|
||||||
SArray *stack; // <FstBuilderNodeUnfinished>
|
|
||||||
} FstUnFinishedNodes;
|
|
||||||
|
|
||||||
typedef struct FstBuilderNode {
|
|
||||||
bool isFinal;
|
|
||||||
Output finalOutput;
|
|
||||||
SArray *trans; // <FstTransition>
|
|
||||||
} FstBuilderNode;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct FstLastTransition {
|
typedef struct FstLastTransition {
|
||||||
|
@ -97,13 +102,23 @@ typedef struct FstLastTransition {
|
||||||
Output out;
|
Output out;
|
||||||
} FstLastTransition;
|
} FstLastTransition;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FstBuilderNodeUnfinished and helper function
|
||||||
|
 * TODO: simplify function names
|
||||||
|
*/
|
||||||
typedef struct FstBuilderNodeUnfinished {
|
typedef struct FstBuilderNodeUnfinished {
|
||||||
FstBuilderNode node;
|
FstBuilderNode *node;
|
||||||
FstLastTransition last;
|
FstLastTransition* last;
|
||||||
} FstBuilderNodeUnfinished;
|
} FstBuilderNodeUnfinished;
|
||||||
|
|
||||||
|
void fstBuilderNodeUnfinishedLastCompiled(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||||
|
void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *node, CompiledAddr addr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FstNode and helper function
|
||||||
|
*/
|
||||||
typedef struct FstNode {
|
typedef struct FstNode {
|
||||||
uint8_t* data;
|
FstSlice data;
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
State state;
|
State state;
|
||||||
CompiledAddr start;
|
CompiledAddr start;
|
||||||
|
@ -114,6 +129,28 @@ typedef struct FstNode {
|
||||||
Output finalOutput;
|
Output finalOutput;
|
||||||
} FstNode;
|
} FstNode;
|
||||||
|
|
||||||
|
// If this node is final and has a terminal output value, then it is returned.
// Otherwise, a zero output is returned.
#define FST_NODE_FINAL_OUTPUT(node) ((node)->finalOutput)
// Returns true if and only if this node corresponds to a final or "match"
// state in the finite state transducer.
#define FST_NODE_IS_FINAL(node) ((node)->isFinal)
// Returns the number of transitions in this node. The maximum number of
// transitions is 256.
#define FST_NODE_LEN(node) ((node)->nTrans)
// Returns true if and only if this node has zero transitions.
// (The misspelled name is kept for existing callers; prefer FST_NODE_IS_EMPTY.)
#define FST_NODE_IS_EMPTYE(node) ((node)->nTrans == 0)
#define FST_NODE_IS_EMPTY(node) FST_NODE_IS_EMPTYE(node)
// Returns the address of this node.
#define FST_NODE_ADDR(node) ((node)->start)
|
||||||
|
|
||||||
|
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *data);
|
||||||
|
FstTransitions fstNodeTransitionIter(FstNode *node);
|
||||||
|
FstTransitions* fstNodeTransitions(FstNode *node);
|
||||||
|
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res);
|
||||||
|
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res);
|
||||||
|
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res);
|
||||||
|
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
|
||||||
|
FstSlice fstNodeAsSlice(FstNode *node);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct FstMeta {
|
typedef struct FstMeta {
|
||||||
uint64_t version;
|
uint64_t version;
|
||||||
CompiledAddr rootAddr;
|
CompiledAddr rootAddr;
|
||||||
|
@ -125,42 +162,21 @@ typedef struct FstMeta {
|
||||||
typedef struct Fst {
|
typedef struct Fst {
|
||||||
FstMeta meta;
|
FstMeta meta;
|
||||||
void *data; //
|
void *data; //
|
||||||
};
|
} Fst;
|
||||||
|
|
||||||
// ops
|
// ops
|
||||||
|
|
||||||
typedef struct FstIndexedValue {
|
typedef struct FstIndexedValue {
|
||||||
uint64_t index;
|
uint64_t index;
|
||||||
uint64_t value;
|
uint64_t value;
|
||||||
};
|
} FstIndexedValue;
|
||||||
|
|
||||||
// relate to Regist
|
|
||||||
typedef struct FstRegistry {
|
|
||||||
SArray *table; // <Registtry cell>
|
|
||||||
uint64_t tableSize; // num of rows
|
|
||||||
uint64_t mruSize; // num of columns
|
|
||||||
} FstRegistry;
|
|
||||||
|
|
||||||
typedef struct FstRegistryCache {
|
|
||||||
SArray *cells; // <RegistryCell>
|
|
||||||
} FstRegistryCache;
|
|
||||||
|
|
||||||
typedef struct FstRegistryCell {
|
typedef struct FstRegistryCell {
|
||||||
CompiledAddr addr;
|
CompiledAddr addr;
|
||||||
FstBuilderNode *node;
|
FstBuilderNode *node;
|
||||||
} FstRegistryCell;
|
} FstRegistryCell;
|
||||||
|
|
||||||
enum FstRegistryEntry {Found, NotFound, Rejected};
|
|
||||||
|
|
||||||
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data);
|
|
||||||
FstTransitions fstNodeTransitionIter(FstNode *node);
|
|
||||||
FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i);
|
|
||||||
CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i);
|
|
||||||
int64_t fstNodeFindInput(FstNode *node, int8_t b);
|
|
||||||
Output fstNodeGetFinalOutput(FstNode *node);
|
|
||||||
void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledArr addr, FstBuilderNode *builderNode);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef __INDEX_FST_AUTAOMATION_H__
|
||||||
|
#define __INDEX_FST_AUTAOMATION_H__
|
||||||
|
|
||||||
|
// Forward typedef so the wrapper structs below can name the type before the
// struct body is given.
typedef struct AutomationCtx AutomationCtx;

// Wrapper holding a pointer to an automaton context.
typedef struct StartWith {
  AutomationCtx *autoSelf;
} StartWith;

// Wrapper holding a pointer to an automaton context.
typedef struct Complement {
  AutomationCtx *autoSelf;
} Complement;

// automation
struct AutomationCtx {
  void *data;  // opaque payload
};
|
||||||
|
|
||||||
|
// automation interface
|
||||||
|
// Automaton callbacks, declared as file-scope function pointers.
// NOTE(review): these are object declarations in a header, so every including
// translation unit gets its own tentative definition -- confirm this is intended.
// The `accpet*` spellings are kept because they are the existing public names.
void  (*start)(AutomationCtx *ctx);
bool  (*isMatch)(AutomationCtx *ctx);
bool  (*canMatch)(AutomationCtx *ctx, void *data);
bool  (*willAlwaysMatch)(AutomationCtx *ctx, void *state);
void *(*accpet)(AutomationCtx *ctx, void *state, uint8_t byte);
// `state` previously had no type, which is a syntax error; it is `void *`
// like the `state` parameter of the sibling callbacks.
void *(*accpetEof)(AutomationCtx *ctx, void *state);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,22 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __INDEX_FST_NODE_H__
|
||||||
|
#define __INDEX_FST_NODE_H__
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,24 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef __FST_REGISTRY_H__
|
||||||
|
#define __FST_REGISTRY_H__
|
||||||
|
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct FstRegistry {
|
||||||
|
|
||||||
|
} FstRegistry;
|
||||||
|
#endif
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef __INDEX_FST_UTIL_H__
|
||||||
|
#define __INDEX_FST_UTIL_H__
|
||||||
|
|
||||||
|
#include "tarray.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef uint64_t FstType;
|
||||||
|
typedef uint64_t CompiledAddr;
|
||||||
|
typedef uint64_t Output;
|
||||||
|
typedef uint8_t PackSizes;
|
||||||
|
|
||||||
|
|
||||||
|
//A sentinel value used to indicate an empty final state
|
||||||
|
extern const CompiledAddr EMPTY_ADDRESS;
|
||||||
|
/// A sentinel value used to indicate an invalid state.
|
||||||
|
extern const CompiledAddr NONE_ADDRESS;
|
||||||
|
|
||||||
|
// This version number is written to every finite state transducer created by
|
||||||
|
// this crate. When a finite state transducer is read, its version number is
|
||||||
|
// checked against this value.
|
||||||
|
extern const uint64_t version;
|
||||||
|
// The threshold (in number of transitions) at which an index is created for
|
||||||
|
// a node's transitions. This speeds up lookup time at the expense of FST size
|
||||||
|
|
||||||
|
extern const uint64_t TRANS_INDEX_THRESHOLD;
|
||||||
|
// high 4 bits is transition address packed size.
|
||||||
|
// low 4 bits is output value packed size.
|
||||||
|
//
|
||||||
|
// `0` is a legal value which means there are no transitions/outputs
|
||||||
|
|
||||||
|
// Pack-size byte accessors: the high nibble holds the transition-address pack
// size, the low nibble holds the output pack size. `sz` is masked to 4 bits so
// an oversized value cannot spill into the other nibble.
#define FST_SET_TRANSITION_PACK_SIZE(v, sz) do { (v) = ((v) & 0x0F) | (((sz) & 0x0F) << 4); } while (0)
#define FST_GET_TRANSITION_PACK_SIZE(v) (((v) & 0xF0) >> 4)
#define FST_SET_OUTPUT_PACK_SIZE(v, sz) do { (v) = ((v) & 0xF0) | ((sz) & 0x0F); } while (0)
#define FST_GET_OUTPUT_PACK_SIZE(v) ((v) & 0x0F)
|
||||||
|
|
||||||
|
// Looks up the byte stored at 1-based position `idx` in COMMON_INPUTS_INV.
#define COMMON_INPUT(idx) COMMON_INPUTS_INV[(idx) - 1]

// Stores into `val` the value (COMMON_INPUTS[v] + 1) mod 256, or 0 when that
// value is greater than `max`. Arguments are parenthesized so expressions can
// be passed safely; `val` is evaluated more than once and must be a plain lvalue.
#define COMMON_INDEX(v, max, val)                        \
  do {                                                   \
    (val) = ((uint16_t)COMMON_INPUTS[(v)] + 1) % 256;    \
    (val) = (val) > (max) ? 0 : (val);                   \
  } while (0)
|
||||||
|
|
||||||
|
|
||||||
|
//uint8_t commonInput(uint8_t idx);
|
||||||
|
//uint8_t commonIdx(uint8_t v, uint8_t max);
|
||||||
|
|
||||||
|
uint8_t packSize(uint64_t n);
|
||||||
|
uint64_t unpackUint64(uint8_t *ch, uint8_t sz);
|
||||||
|
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr);
|
||||||
|
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct FstSlice {
|
||||||
|
uint8_t *data;
|
||||||
|
uint64_t dLen;
|
||||||
|
uint32_t start;
|
||||||
|
uint32_t end;
|
||||||
|
} FstSlice;
|
||||||
|
|
||||||
|
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end);
|
||||||
|
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen);
|
||||||
|
bool fstSliceEmpty(FstSlice *slice);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
|
@ -15,13 +15,143 @@
|
||||||
|
|
||||||
#include "index_fst.h"
|
#include "index_fst.h"
|
||||||
|
|
||||||
|
|
||||||
|
FstUnFinishedNodes *fstUnFinishedNodesCreate() {
|
||||||
|
FstUnFinishedNodes *nodes = malloc(sizeof(FstUnFinishedNodes));
|
||||||
|
if (nodes == NULL) { return NULL; }
|
||||||
|
|
||||||
|
nodes->stack = (SArray *)taosArrayInit(64, sizeof(FstBuilderNodeUnfinished));
|
||||||
|
fstUnFinishedNodesPushEmpty(nodes, false);
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
// Pushes a new unfinished entry whose node has no transitions, a zero final
// output and the given `isFinal` flag, and whose last transition is unset.
// On allocation failure nothing is pushed.
void fstUnFinishedNodesPushEmpty(FstUnFinishedNodes *nodes, bool isFinal) {
  FstBuilderNode *node = malloc(sizeof(*node));
  if (node == NULL) {
    // The void return type gives no way to report this to the caller.
    return;
  }
  node->isFinal     = isFinal;
  node->finalOutput = 0;
  node->trans       = NULL;

  FstBuilderNodeUnfinished un = {.node = node, .last = NULL};
  if (taosArrayPush(nodes->stack, &un) == NULL) {
    // The stack did not take the entry, so the node would otherwise leak.
    free(node);
  }
}
|
||||||
|
// Pops the single remaining entry off the stack and returns its node.
// Asserts that exactly one entry is left and that it has no pending last transition.
FstBuilderNode *fstUnFinishedNodesPopRoot(FstUnFinishedNodes *nodes) {
  assert(taosArrayGetSize(nodes->stack) == 1);

  FstBuilderNodeUnfinished *root = taosArrayPop(nodes->stack);
  assert(root->last == NULL);

  FstBuilderNode *rootNode = root->node;
  return rootNode;
}
|
||||||
|
|
||||||
|
// Pops the top entry, passes it and `addr` to
// fstBuilderNodeUnfinishedLastCompiled, releases its last-transition record,
// and returns the entry's node.
FstBuilderNode *fstUnFinishedNodesPopFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  FstBuilderNodeUnfinished *un = taosArrayPop(nodes->stack);
  fstBuilderNodeUnfinishedLastCompiled(un, addr);

  free(un->last);  // TODO add func FstLastTransitionFree()
  un->last = NULL; // do not leave a dangling pointer behind in the popped slot
  return un->node;
}
|
||||||
|
|
||||||
|
// Pops the top entry, which must have no pending last transition, and returns its node.
FstBuilderNode *fstUnFinishedNodesPopEmpty(FstUnFinishedNodes *nodes) {
  FstBuilderNodeUnfinished *top = taosArrayPop(nodes->stack);
  assert(top->last == NULL);

  FstBuilderNode *popped = top->node;
  return popped;
}
|
||||||
|
// Marks the node at the bottom of the stack (index 0) as final and records
// `out` as its final output. Its transitions are left untouched.
void fstUnFinishedNodesSetRootOutput(FstUnFinishedNodes *nodes, Output out) {
  FstBuilderNodeUnfinished *bottom = taosArrayGet(nodes->stack, 0);
  FstBuilderNode *rootNode = bottom->node;

  rootNode->isFinal     = true;
  rootNode->finalOutput = out;
}
|
||||||
|
// Passes the entry on top of the stack, together with `addr`, to
// fstBuilderNodeUnfinishedLastCompiled. The stack must not be empty.
void fstUnFinishedNodesTopLastFreeze(FstUnFinishedNodes *nodes, CompiledAddr addr) {
  size_t depth = taosArrayGetSize(nodes->stack);
  // `depth - 1` would wrap around to SIZE_MAX on an empty stack.
  assert(depth > 0);

  FstBuilderNodeUnfinished *top = taosArrayGet(nodes->stack, depth - 1);
  fstBuilderNodeUnfinishedLastCompiled(top, addr);
}
|
||||||
|
// Extends the stack with the bytes of `bs` (the inclusive range
// [bs.start, bs.end] of bs.data):
//   - the first byte becomes the pending last transition of the current top
//     entry and carries the whole output `out`;
//   - every following byte gets its own new entry with a zero output, so `out`
//     is attached to the suffix exactly once;
//   - finally an empty final node is pushed to terminate the key.
// An empty or invalid slice is a no-op. If an allocation fails the function
// stops early without pushing the terminating node.
void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output out) {
  FstSlice *s = &bs;
  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
    return;
  }

  size_t depth = taosArrayGetSize(nodes->stack);
  assert(depth > 0);
  FstBuilderNodeUnfinished *top = taosArrayGet(nodes->stack, depth - 1);
  assert(top->last == NULL);

  FstLastTransition *first = malloc(sizeof(*first));
  if (first == NULL) {
    return;
  }
  first->inp = s->data[s->start];
  first->out = out;
  top->last  = first;

  // Start after the first byte: it is already recorded on `top` above.
  for (uint64_t i = (uint64_t)s->start + 1; i <= s->end; i++) {
    FstBuilderNode    *n   = malloc(sizeof(*n));
    FstLastTransition *trn = malloc(sizeof(*trn));
    if (n == NULL || trn == NULL) {
      free(n);
      free(trn);
      return;
    }
    n->isFinal     = false;
    n->finalOutput = 0;
    n->trans       = NULL;

    trn->inp = s->data[i];
    trn->out = 0;  // the output lives on the first transition only

    FstBuilderNodeUnfinished next = {.node = n, .last = trn};
    if (taosArrayPush(nodes->stack, &next) == NULL) {
      free(n);
      free(trn);
      return;
    }
  }
  fstUnFinishedNodesPushEmpty(nodes, true);
}
|
||||||
|
|
||||||
|
|
||||||
|
// Counts how many leading bytes of `bs` (inclusive range [bs.start, bs.end])
// equal the pending last-transition inputs of the stack entries, comparing
// byte i with the entry at index i. Counting stops at the first mismatch, at
// an entry with no pending transition, or when either sequence runs out.
uint64_t fstUnFinishedNodesFindCommPrefix(FstUnFinishedNodes *node, FstSlice bs) {
  FstSlice *s = &bs;
  if (s->data == NULL || s->dLen == 0 || s->start > s->end) {
    return 0;  // nothing to compare
  }

  size_t lsz = (size_t)(s->end - s->start + 1);  // data len
  size_t ssz = taosArrayGetSize(node->stack);    // stack size

  uint64_t count = 0;
  for (size_t i = 0; i < ssz && i < lsz; i++) {
    FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
    // Entries pushed by fstUnFinishedNodesPushEmpty have last == NULL.
    if (un->last == NULL || un->last->inp != s->data[s->start + i]) {
      break;
    }
    count++;
  }
  return count;
}
|
||||||
|
// Walks the stack and `bs` in parallel like fstUnFinishedNodesFindCommPrefix,
// and redistributes outputs along the shared prefix:
//   - for every matching entry, the part of its transition output that is
//     common with the remaining new output (the smaller of the two) stays on
//     the transition;
//   - the rest of the old output is handed to the next stack entry through
//     fstBuilderNodeUnfinishedAddOutputPrefix;
//   - the common part is subtracted from the new output.
// `in` is the output of the key being inserted. The part of it that was not
// absorbed by the prefix is written to `*out` (if `out` is not NULL).
// Returns the number of matching prefix bytes.
uint64_t FstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, FstSlice bs, Output in, Output *out) {
  FstSlice *s       = &bs;
  Output    remain  = in;
  uint64_t  matched = 0;

  if (s->data != NULL && s->dLen != 0 && s->start <= s->end) {
    size_t lsz = (size_t)(s->end - s->start + 1);  // data len
    size_t ssz = taosArrayGetSize(node->stack);    // stack size

    for (size_t i = 0; i < lsz && i < ssz; i++) {
      FstBuilderNodeUnfinished *un   = taosArrayGet(node->stack, i);
      FstLastTransition        *last = un->last;
      if (last == NULL || last->inp != s->data[s->start + i]) {
        break;
      }
      matched++;

      uint64_t commPrefix = (last->out < remain) ? last->out : remain;
      uint64_t addPrefix  = last->out - commPrefix;
      remain   -= commPrefix;
      last->out = commPrefix;

      // Whatever the old transition carried beyond the common part moves to
      // the node that the transition leads to, i.e. the next stack entry.
      if (addPrefix != 0 && i + 1 < ssz) {
        FstBuilderNodeUnfinished *next = taosArrayGet(node->stack, i + 1);
        fstBuilderNodeUnfinishedAddOutputPrefix(next, addPrefix);
      }
    }
  }

  if (out != NULL) {
    *out = remain;
  }
  return matched;
}
|
||||||
|
|
||||||
// fst node function
|
// fst node function
|
||||||
FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
|
|
||||||
|
|
||||||
|
|
||||||
|
FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, FstSlice *slice) {
|
||||||
FstNode *n = (FstNode *)malloc(sizeof(FstNode));
|
FstNode *n = (FstNode *)malloc(sizeof(FstNode));
|
||||||
if (n == NULL) { return NULL; }
|
if (n == NULL) { return NULL; }
|
||||||
|
|
||||||
if (addr == EMPTY_ADDRESS) {
|
if (addr == EMPTY_ADDRESS) {
|
||||||
n->date = NULL;
|
n->data = fstSliceCreate(NULL, 0);
|
||||||
n->version = version;
|
n->version = version;
|
||||||
n->state = EmptyFinal;
|
n->state = EmptyFinal;
|
||||||
n->start = EMPTY_ADDRESS;
|
n->start = EMPTY_ADDRESS;
|
||||||
|
@ -29,20 +159,138 @@ FstNode *fstNodeCreate(int64_t version, ComiledAddr addr, uint8_t *data) {
|
||||||
n->isFinal = true;
|
n->isFinal = true;
|
||||||
n->nTrans = 0;
|
n->nTrans = 0;
|
||||||
n->sizes = 0;
|
n->sizes = 0;
|
||||||
n->finalOutpu = 0;
|
n->finalOutput = 0;
|
||||||
|
}
|
||||||
|
uint8_t v = slice->data[addr];
|
||||||
|
uint8_t s = (v & 0b11000000) >> 6;
|
||||||
|
if (s == 0b11) { // oneTransNext
|
||||||
|
n->data = fstSliceCopy(slice, 0, addr);
|
||||||
|
n->version = version;
|
||||||
|
n->state = OneTransNext;
|
||||||
|
n->start = addr;
|
||||||
|
n->end = addr; //? s.end_addr(data);
|
||||||
|
n->isFinal = false;
|
||||||
|
n->sizes = 0;
|
||||||
|
n->nTrans = 0;
|
||||||
|
n->finalOutput = 0;
|
||||||
|
} else if (v == 0b10) { // oneTrans
|
||||||
|
uint64_t sz; // fetch sz from addr
|
||||||
|
n->data = fstSliceCopy(slice, 0, addr);
|
||||||
|
n->version = version;
|
||||||
|
n->state = OneTrans;
|
||||||
|
n->start = addr;
|
||||||
|
n->end = addr; // s.end_addr(data, sz);
|
||||||
|
n->isFinal = false;
|
||||||
|
n->nTrans = 1;
|
||||||
|
n->sizes = sz;
|
||||||
|
n->finalOutput = 0;
|
||||||
|
} else { // anyTrans
|
||||||
|
uint64_t sz; // s.sizes(data)
|
||||||
|
uint32_t nTrans; // s.ntrans(data)
|
||||||
|
n->data = *slice;
|
||||||
|
n->version = version;
|
||||||
|
n->state = AnyTrans;
|
||||||
|
n->start = addr;
|
||||||
|
n->end = addr; // s.end_addr(version, data, sz, ntrans);
|
||||||
|
n->isFinal = false; // s.is_final_state();
|
||||||
|
n->nTrans = nTrans;
|
||||||
|
n->sizes = sz;
|
||||||
|
n->finalOutput = 0; // s.final_output(version, data, sz, ntrans);
|
||||||
|
}
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
uint8_t v = (data[addr] & 0b1100000) >> 6;
|
FstTransitions* fstNodeTransitions(FstNode *node) {
|
||||||
if (v == 0b11) {
|
FstTransitions *t = malloc(sizeof(FstTransitions));
|
||||||
|
if (NULL == t) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
FstRange range = {.start = 0, .end = FST_NODE_LEN(node)};
|
||||||
|
t->node = node;
|
||||||
|
t->range = range;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
bool fstNodeGetTransitionAt(FstNode *node, uint64_t i, FstTransition *res) {
|
||||||
|
bool s = true;
|
||||||
|
if (node->state == OneTransNext) {
|
||||||
|
|
||||||
} else if (v == 0b10) {
|
} else if (node->state == OneTrans) {
|
||||||
|
|
||||||
|
} else if (node->state == AnyTrans) {
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
s = false;
|
||||||
|
}
|
||||||
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Placeholder: reports whether the node's state can have a transition address.
// Returns false for EmptyFinal and true for every other state. `i` is not
// read and `*res` is not written yet.
bool fstNodeGetTransitionAddrAt(FstNode *node, uint64_t i, CompiledAddr *res) {
  switch (node->state) {
    case EmptyFinal:
      return false;
    case OneTransNext:  // not implemented yet
    case OneTrans:      // not implemented yet
    case AnyTrans:      // not implemented yet
    default:
      return true;
  }
}
|
||||||
|
|
||||||
|
// Looks for a transition of `node` labelled with byte `b`.
// Returns true and writes the transition index to `*res` when one is found,
// otherwise returns false and leaves `*res` untouched.
//
// TODO: decoding the node's input byte from its data is not implemented, so
// `input` is a fixed placeholder and the AnyTrans lookup reports "not found".
bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res) {
  uint8_t input = 0;  // TODO: s.input -- decode from node->data

  switch (node->state) {
    case OneTransNext:
    case OneTrans:
      if (b != input) {
        return false;
      }
      *res = 0;  // a single-transition node only has index 0
      return true;
    case AnyTrans:
      // TODO: search the transition table; nothing can be reported until then.
      return false;
    case EmptyFinal:
    default:
      return false;
  }
}
|
||||||
|
|
||||||
|
// Chooses the on-disk encoding for `builderNode`. The encoders themselves are
// not implemented yet, so every path currently returns true without writing
// to `w`.
//   - no transitions, final, zero final output: the empty final node;
//   - anything but exactly one transition, or a final node: AnyTrans;
//   - one transition to `lastAddr` with zero output: OneTransNext;
//   - otherwise: OneTrans.
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode) {
  // Nodes created by fstUnFinishedNodesPushEmpty have trans == NULL.
  size_t sz = (builderNode->trans == NULL) ? 0 : taosArrayGetSize(builderNode->trans);
  // A node may have up to 256 transitions, one per byte value.
  assert(sz <= 256);

  if (sz == 0 && builderNode->isFinal && builderNode->finalOutput == 0) {
    return true;
  }
  if (sz != 1 || builderNode->isFinal) {
    // AnyTrans->Compile(w, addr, node);
    return true;
  }

  FstTransition *tran = taosArrayGet(builderNode->trans, 0);
  if (tran->addr == lastAddr && tran->out == 0) {
    // OneTransNext::compile(w, lastAddr, tran->inp);
    return true;
  }
  // OneTrans::Compile(w, lastAddr, *tran);
  return true;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Creates a builder that writes through `w`.
// The counting writer starts at zero, the unfinished-node stack is built with
// fstUnFinishedNodesCreate (so it already holds the root node), no key has
// been added yet and no node has been compiled yet.
// Returns NULL on allocation failure. `ty` is not used yet.
FstBuilder *fstBuilderCreate(void *w, FstType ty) {
  (void)ty;

  // calloc so that fields not set below (e.g. registry) start zeroed.
  FstBuilder *b = calloc(1, sizeof(*b));
  if (NULL == b) { return NULL; }

  FstCountingWriter wtr = {.wtr = w, .count = 0, .summer = 0};
  b->wtr = wtr;

  // A bare malloc would leave the stack pointer inside uninitialised.
  b->unfinished = fstUnFinishedNodesCreate();
  if (b->unfinished == NULL) {
    free(b);
    return NULL;
  }

  b->last     = NULL;
  b->lastAddr = NONE_ADDRESS;  // sentinel: nothing compiled so far
  b->len      = 0;
  return b;
}
|
||||||
|
// Returns the result of fstSliceCopy over the node's data slice, from the
// slice's `end` offset through `dLen - 1`.
FstSlice fstNodeAsSlice(FstNode *node) {
  FstSlice *src = &node->data;
  return fstSliceCopy(src, src->end, src->dLen - 1);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
|
@ -12,6 +12,8 @@
|
||||||
* You should have received a copy of the GNU Affero General Public License
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "tutil.h"
|
||||||
const uint8_t COMMON_INPUTS[] = {
|
const uint8_t COMMON_INPUTS[] = {
|
||||||
84, // '\x00'
|
84, // '\x00'
|
||||||
85, // '\x01'
|
85, // '\x01'
|
||||||
|
@ -271,7 +273,7 @@ const uint8_t COMMON_INPUTS[] = {
|
||||||
255, // 'ÿ'
|
255, // 'ÿ'
|
||||||
};
|
};
|
||||||
|
|
||||||
char const COMMON_INPUTS_INV[] = [
|
char const COMMON_INPUTS_INV[] = {
|
||||||
't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
|
't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
|
||||||
'.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
|
'.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
|
||||||
':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
|
':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
|
||||||
|
@ -300,5 +302,5 @@ char const COMMON_INPUTS_INV[] = [
|
||||||
'\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
|
'\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
|
||||||
'\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
|
'\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
|
||||||
'\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
|
'\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
|
||||||
];
|
};
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "index_fst_registry.h"
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
||||||
|
*
|
||||||
|
* This program is free software: you can use, redistribute, and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License, version 3
|
||||||
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#include "index_fst_util.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// A sentinel value used to indicate an empty final state.
|
||||||
|
// packDeltaSize() and unpackDelta() in this file special-case this value: it
// is packed/unpacked as-is instead of as a delta from the node address.
const CompiledAddr EMPTY_ADDRESS = 0;
|
||||||
|
// A sentinel value used to indicate an invalid state.
|
||||||
|
// NOTE(review): CompiledAddr is not declared in this file; presumably it comes
// from index_fst_util.h — confirm.
const CompiledAddr NONE_ADDRESS = 1;
|
||||||
|
|
||||||
|
// This version number is written to every finite state transducer created by
// this library. When a finite state transducer is read, its version number is
// checked against this value.
|
||||||
|
// NOTE(review): defined without `static`, so this generic lowercase name has
// external linkage — confirm it cannot clash with another global `version`.
const uint64_t version = 3;
|
||||||
|
// The threshold (in number of transitions) at which an index is created for
// a node's transitions. This speeds up lookup time at the expense of FST size.
|
||||||
|
|
||||||
|
// NOTE(review): not used by any helper visible in this part of the file —
// presumably consumed by the node compiler; confirm.
const uint64_t TRANS_INDEX_THRESHOLD = 32;
|
||||||
|
|
||||||
|
|
||||||
|
//uint8_t commonInput(uint8_t idx) {
|
||||||
|
// if (idx == 0) { return -1; }
|
||||||
|
// else {
|
||||||
|
// return COMMON_INPUTS_INV[idx - 1];
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
//
|
||||||
|
//uint8_t commonIdx(uint8_t v, uint8_t max) {
|
||||||
|
// uint8_t v = ((uint16_t)tCOMMON_INPUTS[v] + 1)%256;
|
||||||
|
// return v > max ? 0: v;
|
||||||
|
//}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Return the minimum number of bytes (1..8) needed to hold the value n.
 *
 * Zero still needs one byte. The result grows by one each time n crosses
 * another power of 256, and is capped at 8 for a 64-bit value.
 */
uint8_t packSize(uint64_t n) {
  uint8_t bytes = 1;
  // Keep widening while bits remain above the low `bytes` bytes; the largest
  // shift performed is 56, so the shift count is always valid.
  while (bytes < 8 && (n >> (8 * bytes)) != 0) {
    bytes++;
  }
  return bytes;
}
|
||||||
|
|
||||||
|
/*
 * Decode an unsigned integer stored least-significant byte first.
 *
 * ch: buffer holding at least sz readable bytes.
 * sz: number of bytes to decode; values above 8 are clamped to 8 because a
 *     uint64_t cannot hold more (and shifting by >= 64 bits is undefined).
 *
 * Returns the decoded value; 0 when sz is 0.
 */
uint64_t unpackUint64(uint8_t *ch, uint8_t sz) {
  uint64_t n = 0;  // must start at zero: the loop only ORs bits in
  if (sz > sizeof(n)) {
    sz = (uint8_t)sizeof(n);
  }
  for (uint8_t i = 0; i < sz; i++) {
    // Widen before shifting: ch[i] would otherwise be promoted to int, which
    // cannot be shifted by 32..56 bits and sign-extends when bit 31 is set.
    n |= (uint64_t)ch[i] << (8 * i);
  }
  return n;
}
|
||||||
|
/*
 * Return the number of bytes needed to store the transition address
 * transAddr relative to the node at nodeAddr.
 *
 * An ordinary target is sized by the delta nodeAddr - transAddr. The
 * EMPTY_ADDRESS sentinel is sized as the sentinel value itself rather than
 * as a delta.
 */
uint8_t packDeltaSize(CompiledAddr nodeAddr, CompiledAddr transAddr) {
  if (transAddr != EMPTY_ADDRESS) {
    return packSize(nodeAddr - transAddr);
  }
  return packSize(EMPTY_ADDRESS);
}
|
||||||
|
CompiledAddr unpackDelta(char *data, uint64_t len, uint64_t nodeAddr) {
|
||||||
|
uint64_t delta = unpackUint64(data, len);
|
||||||
|
// delta_add = u64_to_usize
|
||||||
|
if (delta == EMPTY_ADDRESS) {
|
||||||
|
return EMPTY_ADDRESS;
|
||||||
|
} else {
|
||||||
|
return nodeAddr - delta;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fst slice func
|
||||||
|
FstSlice fstSliceCreate(uint8_t *data, uint64_t dLen) {
|
||||||
|
FstSlice slice = {.data = data, .dLen = dLen, .start = 0, .end = dLen - 1};
|
||||||
|
return slice;
|
||||||
|
}
|
||||||
|
/*
 * Create a sub-slice [start, end] that refers to the same buffer as `slice`.
 *
 * No bytes are copied: the result stores the same data pointer and dLen as
 * the source and only records the new start/end positions.
 *
 * If start or end is not below slice->dLen, or start > end, the returned
 * slice has data == NULL and every other field zeroed, so fstSliceEmpty()
 * reports it as empty and no field is left indeterminate.
 */
FstSlice fstSliceCopy(FstSlice *slice, uint32_t start, uint32_t end) {
  // The designated initializer zero-fills every field that is not named.
  FstSlice t = {.data = NULL};
  if (start >= slice->dLen || end >= slice->dLen || start > end) {
    return t;
  }

  t.data = slice->data;
  t.dLen = slice->dLen;
  t.start = start;
  t.end = end;
  return t;
}
|
||||||
|
/*
 * Report whether a slice has nothing to read: true when its data pointer is
 * NULL or its recorded buffer length is not positive.
 */
bool fstSliceEmpty(FstSlice *slice) {
  if (slice->data == NULL) {
    return true;
  }
  return slice->dLen <= 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue