243 lines
7.5 KiB
TypeScript
243 lines
7.5 KiB
TypeScript
/*
|
|
* Copyright (c) 2023 Huawei Technologies Co.,Ltd.
|
|
*
|
|
* openInula is licensed under Mulan PSL v2.
|
|
* You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
* You may obtain a copy of Mulan PSL v2 at:
|
|
*
|
|
* http://license.coscl.org.cn/MulanPSL2
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
* See the Mulan PSL v2 for more details.
|
|
*/
|
|
|
|
import Lexer from './Lexer';
|
|
import { mappingRule } from './mappingRule';
|
|
import ruleUtils from '../utils/parseRuleUtils';
|
|
import { RawToken } from '../types/types';
|
|
|
|
const defaultErrorRule = ruleUtils.getRuleOptions('error', { lineBreaks: true, shouldThrow: true });
|
|
|
|
// 解析规则并生成词法分析器所需的数据结构,以便进行词法分析操作
|
|
function parseRules(rules: Record<string, any>, hasStates: boolean): Record<string, any> {
|
|
let errorRule: Record<string, any> | null = null;
|
|
const fast: Record<string, unknown> = {};
|
|
let enableFast = true;
|
|
let unicodeFlag: boolean | null = null;
|
|
const groups: Record<string, any>[] = [];
|
|
const parts: string[] = [];
|
|
|
|
// 检查是否存在 fallback 规则,若存在则禁用快速匹配
|
|
enableFast = isExistsFallback(rules, enableFast);
|
|
|
|
for (let i = 0; i < rules.length; i++) {
|
|
const options = rules[i];
|
|
if (options.include) {
|
|
throw new Error('Inheritance is not allowed in stateless lexers!');
|
|
}
|
|
|
|
errorRule = isOptionsErrorOrFallback(options, errorRule);
|
|
|
|
const match = options.match.slice();
|
|
if (enableFast) {
|
|
// 如果快速匹配允许,则将单字符的规则存入 fast 对象
|
|
processFast(match, fast, options);
|
|
}
|
|
|
|
// 检查规则中是否存在不适当的状态切换选项
|
|
if (options.pop || options.push || options.next) {
|
|
checkStateOptions(hasStates, options);
|
|
}
|
|
// 只有具有 .match 的规则才会被包含在正则表达式中
|
|
if (match.length === 0) {
|
|
continue;
|
|
}
|
|
enableFast = false;
|
|
|
|
groups.push(options);
|
|
|
|
// 检查是否所有规则都使用了 unicode 标志,或者都未使用
|
|
unicodeFlag = checkUnicode(match, unicodeFlag, options);
|
|
|
|
const pat = ruleUtils.getRegUnion(match.map(ruleUtils.getReg));
|
|
const regexp = new RegExp(pat);
|
|
if (regexp.test('')) {
|
|
throw new Error('The regex matched the empty string!');
|
|
}
|
|
const groupCount = ruleUtils.getRegGroups(pat);
|
|
if (groupCount > 0) {
|
|
throw new Error('The regular expression uses capture groups, use (?: … ) instead!');
|
|
}
|
|
|
|
// 检测规则是否匹配换行符
|
|
if (!options.lineBreaks && regexp.test('\n')) {
|
|
throw new Error('The matching rule must contain lineBreaks.');
|
|
}
|
|
|
|
parts.push(ruleUtils.getRegCapture(pat));
|
|
}
|
|
|
|
// 如果没有 fallback 规则,则使用 sticky 标志,只在当前索引位置寻找匹配项,如果不支持 sticky 标志,则使用无法被否定的空模式来模拟
|
|
const fallbackRule = errorRule && errorRule.fallback;
|
|
let flags = ruleUtils.checkSticky() && !fallbackRule ? 'ym' : 'gm';
|
|
const suffix = ruleUtils.checkSticky() || fallbackRule ? '' : '|';
|
|
|
|
if (unicodeFlag === true) {
|
|
flags += 'u';
|
|
}
|
|
const combined = new RegExp(ruleUtils.getRegUnion(parts) + suffix, flags);
|
|
|
|
return {
|
|
regexp: combined,
|
|
groups: groups,
|
|
fast: fast,
|
|
error: errorRule || defaultErrorRule,
|
|
};
|
|
}
|
|
|
|
export function checkStateGroup(group: Record<string, any>, name: string, map: Record<string, any>) {
|
|
const state = group && (group.push || group.next);
|
|
if (state && !map[state]) {
|
|
throw new Error('The state is missing.');
|
|
}
|
|
if (group && group.pop && +group.pop !== 1) {
|
|
throw new Error('The value of pop must be 1.');
|
|
}
|
|
}
|
|
|
|
// 将国际化解析规则注入分词器中
|
|
function parseMappingRule(mappingRule: Record<string, any>, startState?: string): Lexer<RawToken> {
|
|
const keys = Object.getOwnPropertyNames(mappingRule);
|
|
|
|
if (!startState) {
|
|
startState = keys[0];
|
|
}
|
|
|
|
// 将每个状态的规则解析为规则数组,并存储在 ruleMap 对象中
|
|
const ruleMap = keys.reduce((map, key) => {
|
|
map[key] = ruleUtils.getRules(mappingRule[key]);
|
|
return map;
|
|
}, {});
|
|
|
|
// 处理规则中的 include 声明,将被包含的规则添加到相应的状态中
|
|
for (let i = 0; i < keys.length; i++) {
|
|
const key = keys[i];
|
|
const rules = ruleMap[key];
|
|
const included = {};
|
|
|
|
for (let j = 0; j < rules.length; j++) {
|
|
const rule = rules[j];
|
|
if (!rule.include) {
|
|
continue;
|
|
}
|
|
|
|
const splice = [j, 1];
|
|
if (rule.include !== key && !included[rule.include]) {
|
|
included[rule.include] = true;
|
|
const newRules = ruleMap[rule.include];
|
|
|
|
if (!newRules) {
|
|
throw new Error('Cannot contain a state that does not exist!');
|
|
}
|
|
|
|
newRules.forEach(newRule => {
|
|
if (!rules.includes(newRule)) {
|
|
splice.push(newRule);
|
|
}
|
|
});
|
|
}
|
|
// eslint-disable-next-line
|
|
rules.splice.apply(rules, splice);
|
|
j--;
|
|
}
|
|
}
|
|
|
|
const mappingAllRules = {};
|
|
|
|
// 将规则映射为词法分析器数据结构,并存储在 mappingAllRules 对象中
|
|
keys.forEach(key => {
|
|
mappingAllRules[key] = parseRules(ruleMap[key], true);
|
|
});
|
|
|
|
// 检查状态组中的规则是否正确引用了其他状态
|
|
keys.forEach(name => {
|
|
const state = mappingAllRules[name];
|
|
const groups = state.groups;
|
|
groups.forEach(group => {
|
|
checkStateGroup(group, name, mappingAllRules);
|
|
});
|
|
const fastKeys = Object.getOwnPropertyNames(state.fast);
|
|
fastKeys.forEach(fastKey => {
|
|
checkStateGroup(state.fast[fastKey], name, mappingAllRules);
|
|
});
|
|
});
|
|
|
|
return new Lexer(mappingAllRules, startState);
|
|
}
|
|
|
|
function processFast(match, fast: Record<string, unknown>, options) {
|
|
while (match.length && typeof match[0] === 'string' && match[0].length === 1) {
|
|
const word = match.shift();
|
|
fast[word.charCodeAt(0)] = options;
|
|
}
|
|
}
|
|
|
|
function handleErrorRule(options, errorRule: Record<string, any>) {
|
|
if (!options.fallback === !errorRule.fallback) {
|
|
throw new Error('errorRule can only set one!');
|
|
} else {
|
|
throw new Error('fallback and error cannot be set at the same time!');
|
|
}
|
|
}
|
|
|
|
function checkUnicode(match, unicodeFlag, options) {
|
|
for (let j = 0; j < match.length; j++) {
|
|
const obj = match[j];
|
|
if (!ruleUtils.checkRegExp(obj)) {
|
|
continue;
|
|
}
|
|
|
|
if (unicodeFlag === null) {
|
|
unicodeFlag = obj.unicode;
|
|
} else if (unicodeFlag !== obj.unicode && options.fallback === false) {
|
|
throw new Error('If the /u flag is used, all!');
|
|
}
|
|
}
|
|
return unicodeFlag;
|
|
}
|
|
|
|
function checkStateOptions(hasStates: boolean, options) {
|
|
if (!hasStates) {
|
|
throw new Error('State toggle options are not allowed in stateless tokenizers!');
|
|
}
|
|
if (options.fallback) {
|
|
throw new Error('State toggle options are not allowed on fallback tokens!');
|
|
}
|
|
}
|
|
|
|
function isExistsFallback(rules: Record<string, any>, enableFast: boolean) {
|
|
for (let i = 0; i < rules.length; i++) {
|
|
if (rules[i].fallback) {
|
|
enableFast = false;
|
|
}
|
|
}
|
|
return enableFast;
|
|
}
|
|
|
|
function isOptionsErrorOrFallback(options, errorRule: Record<string, any> | null) {
|
|
if (options.error || options.fallback) {
|
|
// 只能设置一个 errorRule
|
|
if (errorRule) {
|
|
handleErrorRule(options, errorRule);
|
|
}
|
|
errorRule = options;
|
|
}
|
|
return errorRule;
|
|
}
|
|
|
|
export const lexer = parseMappingRule(mappingRule);
|
|
|
|
export default parseMappingRule;
|