diff --git a/source/libs/index/inc/index_fst.h b/source/libs/index/inc/index_fst.h
new file mode 100644
index 0000000000..de4c957e29
--- /dev/null
+++ b/source/libs/index/inc/index_fst.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _INDEX_FST_H_
+#define _INDEX_FST_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "tarray.h"
+
+// Integer aliases used throughout the FST code.
+typedef uint64_t FstType;
+typedef uint64_t CompiledAddr;
+typedef uint64_t Output;
+typedef uint8_t  PackSizes;
+
+// The constants below are `static` so that every translation unit including
+// this header gets its own copy instead of a duplicate external definition.
+
+// A sentinel value used to indicate an empty final state.
+static const CompiledAddr EMPTY_ADDRESS = 0;
+// A sentinel value used to indicate an invalid state.
+static const CompiledAddr NONE_ADDRESS = 1;
+
+// This version number is written to every finite state transducer created by
+// this library. When a finite state transducer is read, its version number is
+// checked against this value.
+static const uint64_t version = 3;
+
+// The threshold (in number of transitions) at which an index is created for
+// a node's transitions. This speeds up lookup time at the expense of FST size.
+static const uint64_t TRANS_INDEX_THRESHOLD = 32;
+
+typedef struct FstRange {
+  uint64_t start;
+  uint64_t end;
+} FstRange;
+
+typedef enum State { OneTransNext, OneTrans, AnyTrans, EmptyFinal } State;
+typedef enum FstBound { Included, Excluded, Unbounded } FstBound;
+
+typedef struct CheckSummer {
+  uint32_t sum;
+} CheckSummer;
+
+typedef struct FstCountingWriter {
+  void*       wtr;  // wrap any writer that counts and checksum bytes written
+  uint64_t    count;
+  CheckSummer summer;
+} FstCountingWriter;
+
+typedef struct FstTransition {
+  uint8_t      inp;   // The byte input associated with this transition.
+  Output       out;   // The output associated with this transition
+  CompiledAddr addr;  // The address of the node that this transition points to
+} FstTransition;
+
+// Declared ahead of FstTransitions, which holds a pointer to it.
+typedef struct FstNode {
+  uint8_t*     data;
+  uint64_t     version;
+  State        state;
+  CompiledAddr start;
+  CompiledAddr end;
+  bool         isFinal;
+  uint64_t     nTrans;
+  PackSizes    sizes;
+  Output       finalOutput;
+} FstNode;
+
+typedef struct FstTransitions {
+  FstNode *node;
+  FstRange range;
+} FstTransitions;
+
+typedef struct FstUnFinishedNodes {
+  SArray *stack;
+} FstUnFinishedNodes;
+
+typedef struct FstBuilderNode {
+  bool    isFinal;
+  Output  finalOutput;
+  SArray *trans;
+} FstBuilderNode;
+
+typedef struct FstLastTransition {
+  uint8_t inp;
+  Output  out;
+} FstLastTransition;
+
+typedef struct FstBuilderNodeUnfinished {
+  FstBuilderNode    node;
+  FstLastTransition last;
+} FstBuilderNodeUnfinished;
+
+typedef struct FstMeta {
+  uint64_t     version;
+  CompiledAddr rootAddr;
+  FstType      ty;
+  uint64_t     len;
+  uint32_t     checkSum;
+} FstMeta;
+
+typedef struct Fst {
+  FstMeta meta;
+  void   *data;
+} Fst;
+
+// ops
+
+typedef struct FstIndexedValue {
+  uint64_t index;
+  uint64_t value;
+} FstIndexedValue;
+
+// registry related types
+typedef struct FstRegistry {
+  SArray  *table;
+  uint64_t tableSize;  // num of rows
+  uint64_t mruSize;    // num of columns
+} FstRegistry;
+
+typedef struct FstRegistryCache {
+  SArray *cells;
+} FstRegistryCache;
+
+typedef struct FstRegistryCell {
+  CompiledAddr    addr;
+  FstBuilderNode *node;
+} FstRegistryCell;
+
+typedef enum FstRegistryEntry { Found, NotFound, Rejected } FstRegistryEntry;
+
+// Defined after the types it embeds by value so that they are complete here.
+typedef struct FstBuilder {
+  FstCountingWriter  wtr;         // The FST raw data is written directly to `wtr`.
+  FstUnFinishedNodes unfinished;  // The stack of unfinished nodes
+  FstRegistry        registry;    // A map of finished nodes.
+  SArray*            last;        // The last word added
+  CompiledAddr       lastAddr;    // The address of the last compiled node
+  uint64_t           len;         // num of keys added
+} FstBuilder;
+
+// fst node functions
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data);
+FstTransitions fstNodeTransitionIter(FstNode *node);
+FstTransition fstNodeGetTransitionAt(FstNode *node, uint64_t i);
+CompiledAddr fstNodeGetTransitionAddr(FstNode *node, uint64_t i);
+// NOTE(review): `b` is int8_t while FstTransition.inp is uint8_t - confirm.
+int64_t fstNodeFindInput(FstNode *node, int8_t b);
+Output fstNodeGetFinalOutput(FstNode *node);
+void* fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
+
+#endif
diff --git a/source/libs/index/src/index_fst.c b/source/libs/index/src/index_fst.c
new file mode 100644
index 0000000000..4c6e20a7d5
--- /dev/null
+++ b/source/libs/index/src/index_fst.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+
+#include "index_fst.h"
+
+/*
+ * Allocate an FstNode for the state stored at `addr` inside `data`.
+ *
+ * When `addr` is EMPTY_ADDRESS the result is a complete EmptyFinal node and
+ * `data` is not read. For any other address the state kind is taken from the
+ * two most significant bits of data[addr]; the remaining fields (end, isFinal,
+ * nTrans, sizes, finalOutput) are not decoded yet and stay zero.
+ *
+ * Returns NULL when the allocation fails, or when `data` is NULL for a
+ * non-empty address. The node is heap allocated; release it with free().
+ */
+FstNode *fstNodeCreate(int64_t version, CompiledAddr addr, uint8_t *data) {
+  FstNode *n = calloc(1, sizeof(*n));
+  if (n == NULL) { return NULL; }
+
+  n->version = version;
+
+  if (addr == EMPTY_ADDRESS) {
+    n->data = NULL;
+    n->state = EmptyFinal;
+    n->start = EMPTY_ADDRESS;
+    n->end = EMPTY_ADDRESS;
+    n->isFinal = true;
+    n->nTrans = 0;
+    n->sizes = 0;
+    n->finalOutput = 0;
+    return n;
+  }
+
+  if (data == NULL) {
+    free(n);
+    return NULL;
+  }
+
+  // The state tag occupies the top two bits of data[addr], hence the 0xC0
+  // mask. NOTE(review): tag-to-state mapping assumed from the branch order.
+  uint8_t v = (data[addr] & 0xC0) >> 6;
+  if (v == 0x3) {
+    n->state = OneTransNext;
+  } else if (v == 0x2) {
+    n->state = OneTrans;
+  } else {
+    n->state = AnyTrans;
+  }
+  n->data = data;
+  n->start = addr;
+  // TODO: decode end, isFinal, nTrans, sizes and finalOutput for these states.
+  return n;
+}
diff --git a/source/libs/index/src/index_fst_common.c b/source/libs/index/src/index_fst_common.c
new file mode 100644
index 0000000000..39e5f89b35
--- /dev/null
+++ b/source/libs/index/src/index_fst_common.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019 TAOS Data, Inc.
+ *
+ * This program is free software: you can use, redistribute, and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3
+ * or later ("AGPL"), as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdint.h>
+
+// Maps a byte value (the array index, named in each trailing comment) to a
+// code; COMMON_INPUTS_INV maps the code back. Presumably lower codes mark
+// more common bytes, as the name suggests - TODO confirm.
+const uint8_t COMMON_INPUTS[] = {
+  84, // '\x00'
+  85, // '\x01'
+  86, // '\x02'
+  87, // '\x03'
+  88, // '\x04'
+  89, // '\x05'
+  90, // '\x06'
+  91, // '\x07'
+  92, // '\x08'
+  93, // '\t'
+  94, // '\n'
+  95, // '\x0b'
+  96, // '\x0c'
+  97, // '\r'
+  98, // '\x0e'
+  99, // '\x0f'
+  100, // '\x10'
+  101, // '\x11'
+  102, // '\x12'
+  103, // '\x13'
+  104, // '\x14'
+  105, // '\x15'
+  106, // '\x16'
+  107, // '\x17'
+  108, // '\x18'
+  109, // '\x19'
+  110, // '\x1a'
+  111, // '\x1b'
+  112, // '\x1c'
+  113, // '\x1d'
+  114, // '\x1e'
+  115, // '\x1f'
+  116, // ' '
+  80, // '!'
+  117, // '"'
+  118, // '#'
+  79, // '$'
+  39, // '%'
+  30, // '&'
+  81, // "'"
+  75, // '('
+  74, // ')'
+  82, // '*'
+  57, // '+'
+  66, // ','
+  16, // '-'
+  12, // '.'
+  2, // '/'
+  19, // '0'
+  20, // '1'
+  21, // '2'
+  27, // '3'
+  32, // '4'
+  29, // '5'
+  35, // '6'
+  36, // '7'
+  37, // '8'
+  34, // '9'
+  24, // ':'
+  73, // ';'
+  119, // '<'
+  23, // '='
+  120, // '>'
+  40, // '?'
+  83, // '@'
+  44, // 'A'
+  48, // 'B'
+  42, // 'C'
+  43, // 'D'
+  49, // 'E'
+  46, // 'F'
+  62, // 'G'
+  61, // 'H'
+  47, // 'I'
+  69, // 'J'
+  68, // 'K'
+  58, // 'L'
+  56, // 'M'
+  55, // 'N'
+  59, // 'O'
+  51, // 'P'
+  72, // 'Q'
+  54, // 'R'
+  45, // 'S'
+  52, // 'T'
+  64, // 'U'
+  65, // 'V'
+  63, // 'W'
+  71, // 'X'
+  67, // 'Y'
+  70, // 'Z'
+  77, // '['
+  121, // '\\'
+  78, // ']'
+  122, // '^'
+  31, // '_'
+  123, // '`'
+  4, // 'a'
+  25, // 'b'
+  9, // 'c'
+  17, // 'd'
+  1, // 'e'
+  26, // 'f'
+  22, // 'g'
+  13, // 'h'
+  7, // 'i'
+  50, // 'j'
+  38, // 'k'
+  14, // 'l'
+  15, // 'm'
+  10, // 'n'
+  3, // 'o'
+  8, // 'p'
+  60, // 'q'
+  6, // 'r'
+  5, // 's'
+  0, // 't'
+  18, // 'u'
+  33, // 'v'
+  11, // 'w'
+  41, // 'x'
+  28, // 'y'
+  53, // 'z'
+  124, // '{'
+  125, // '|'
+  126, // '}'
+  76, // '~'
+  127, // '\x7f'
+  128, // '\x80'
+  129, // '\x81'
+  130, // '\x82'
+  131, // '\x83'
+  132, // '\x84'
+  133, // '\x85'
+  134, // '\x86'
+  135, // '\x87'
+  136, // '\x88'
+  137, // '\x89'
+  138, // '\x8a'
+  139, // '\x8b'
+  140, // '\x8c'
+  141, // '\x8d'
+  142, // '\x8e'
+  143, // '\x8f'
+  144, // '\x90'
+  145, // '\x91'
+  146, // '\x92'
+  147, // '\x93'
+  148, // '\x94'
+  149, // '\x95'
+  150, // '\x96'
+  151, // '\x97'
+  152, // '\x98'
+  153, // '\x99'
+  154, // '\x9a'
+  155, // '\x9b'
+  156, // '\x9c'
+  157, // '\x9d'
+  158, // '\x9e'
+  159, // '\x9f'
+  160, // '\xa0'
+  161, // '¡'
+  162, // '¢'
+  163, // '£'
+  164, // '¤'
+  165, // '¥'
+  166, // '¦'
+  167, // '§'
+  168, // '¨'
+  169, // '©'
+  170, // 'ª'
+  171, // '«'
+  172, // '¬'
+  173, // '\xad'
+  174, // '®'
+  175, // '¯'
+  176, // '°'
+  177, // '±'
+  178, // '²'
+  179, // '³'
+  180, // '´'
+  181, // 'µ'
+  182, // '¶'
+  183, // '·'
+  184, // '¸'
+  185, // '¹'
+  186, // 'º'
+  187, // '»'
+  188, // '¼'
+  189, // '½'
+  190, // '¾'
+  191, // '¿'
+  192, // 'À'
+  193, // 'Á'
+  194, // 'Â'
+  195, // 'Ã'
+  196, // 'Ä'
+  197, // 'Å'
+  198, // 'Æ'
+  199, // 'Ç'
+  200, // 'È'
+  201, // 'É'
+  202, // 'Ê'
+  203, // 'Ë'
+  204, // 'Ì'
+  205, // 'Í'
+  206, // 'Î'
+  207, // 'Ï'
+  208, // 'Ð'
+  209, // 'Ñ'
+  210, // 'Ò'
+  211, // 'Ó'
+  212, // 'Ô'
+  213, // 'Õ'
+  214, // 'Ö'
+  215, // '×'
+  216, // 'Ø'
+  217, // 'Ù'
+  218, // 'Ú'
+  219, // 'Û'
+  220, // 'Ü'
+  221, // 'Ý'
+  222, // 'Þ'
+  223, // 'ß'
+  224, // 'à'
+  225, // 'á'
+  226, // 'â'
+  227, // 'ã'
+  228, // 'ä'
+  229, // 'å'
+  230, // 'æ'
+  231, // 'ç'
+  232, // 'è'
+  233, // 'é'
+  234, // 'ê'
+  235, // 'ë'
+  236, // 'ì'
+  237, // 'í'
+  238, // 'î'
+  239, // 'ï'
+  240, // 'ð'
+  241, // 'ñ'
+  242, // 'ò'
+  243, // 'ó'
+  244, // 'ô'
+  245, // 'õ'
+  246, // 'ö'
+  247, // '÷'
+  248, // 'ø'
+  249, // 'ù'
+  250, // 'ú'
+  251, // 'û'
+  252, // 'ü'
+  253, // 'ý'
+  254, // 'þ'
+  255, // 'ÿ'
+};
+
+// Inverse of COMMON_INPUTS: for every byte b,
+// (uint8_t)COMMON_INPUTS_INV[COMMON_INPUTS[b]] == b.
+char const COMMON_INPUTS_INV[] = {
+  't', 'e', '/', 'o', 'a', 's', 'r', 'i', 'p', 'c', 'n', 'w',
+  '.', 'h', 'l', 'm', '-', 'd', 'u', '0', '1', '2', 'g', '=',
+  ':', 'b', 'f', '3', 'y', '5', '&', '_', '4', 'v', '9', '6',
+  '7', '8', 'k', '%', '?', 'x', 'C', 'D', 'A', 'S', 'F', 'I',
+  'B', 'E', 'j', 'P', 'T', 'z', 'R', 'N', 'M', '+', 'L', 'O',
+  'q', 'H', 'G', 'W', 'U', 'V', ',', 'Y', 'K', 'J', 'Z', 'X',
+  'Q', ';', ')', '(', '~', '[', ']', '$', '!', '\'', '*', '@',
+  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
+  '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10',
+  '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18',
+  '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '"',
+  '#', '<', '>', '\\', '^', '`', '{', '|', '}', '\x7f', '\x80',
+  '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88',
+  '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90',
+  '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98',
+  '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0',
+  '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8',
+  '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0',
+  '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', '\xb8',
+  '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', '\xc0',
+  '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8',
+  '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', '\xd0',
+  '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', '\xd8',
+  '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', '\xe0',
+  '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', '\xe8',
+  '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', '\xf0',
+  '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', '\xf8',
+  '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
+};