Merge pull request #220 from lukhnos/custom-phrase-reader
Use a more tolerant parser for user phrases
This commit is contained in:
commit
c698c61432
|
@ -32,6 +32,7 @@
|
||||||
6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; };
|
6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; };
|
||||||
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
|
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
|
||||||
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
|
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
|
||||||
|
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; };
|
||||||
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
|
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
|
||||||
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
|
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
|
||||||
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
|
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
|
||||||
|
@ -155,6 +156,8 @@
|
||||||
6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = "<group>"; };
|
6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = "<group>"; };
|
||||||
6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = "<group>"; };
|
6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = "<group>"; };
|
||||||
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
||||||
|
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
|
||||||
|
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
|
||||||
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
|
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
|
||||||
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
|
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
|
||||||
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
|
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
|
||||||
|
@ -276,6 +279,8 @@
|
||||||
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
||||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
||||||
6A0421A715FEF3F50061ED63 /* FastLM.h */,
|
6A0421A715FEF3F50061ED63 /* FastLM.h */,
|
||||||
|
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */,
|
||||||
|
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */,
|
||||||
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
||||||
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
||||||
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
||||||
|
@ -579,6 +584,7 @@
|
||||||
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
||||||
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
||||||
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
||||||
|
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */,
|
||||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
|
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
|
||||||
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
||||||
);
|
);
|
||||||
|
@ -776,7 +782,7 @@
|
||||||
buildSettings = {
|
buildSettings = {
|
||||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
CLANG_CXX_LANGUAGE_STANDARD = "c++17";
|
||||||
CLANG_ENABLE_MODULES = YES;
|
CLANG_ENABLE_MODULES = YES;
|
||||||
CLANG_ENABLE_OBJC_ARC = YES;
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
|
@ -836,7 +842,7 @@
|
||||||
buildSettings = {
|
buildSettings = {
|
||||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
CLANG_CXX_LANGUAGE_STANDARD = "c++17";
|
||||||
CLANG_ENABLE_MODULES = YES;
|
CLANG_ENABLE_MODULES = YES;
|
||||||
CLANG_ENABLE_OBJC_ARC = YES;
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
|
@ -945,7 +951,7 @@
|
||||||
buildSettings = {
|
buildSettings = {
|
||||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
CLANG_CXX_LANGUAGE_STANDARD = "c++17";
|
||||||
CLANG_ENABLE_OBJC_ARC = YES;
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
@ -985,7 +991,7 @@
|
||||||
buildSettings = {
|
buildSettings = {
|
||||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
CLANG_CXX_LANGUAGE_STANDARD = "c++17";
|
||||||
CLANG_ENABLE_OBJC_ARC = YES;
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
cmake-build-debug
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Stand-alone build for the KeyValueBlobReader library and its unit test.
cmake_minimum_required(VERSION 3.17)
project(KeyValueBlobReader)

set(CMAKE_CXX_STANDARD 17)
# The sources use std::string_view, so C++17 is mandatory. Without this
# setting CMake may quietly fall back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)

# add_test() only generates tests when enable_testing() has been invoked;
# call it here so that `ctest` actually runs the test declared below.
enable_testing()

# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
include(FetchContent)

FetchContent_Declare(
  googletest
  # Specify the commit you depend on and update it regularly.
  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Test target declarations.
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)
|
|
@ -0,0 +1,140 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include "KeyValueBlobReader.h"
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// Scans forward to the next "key <blank> value" line of the blob.
//
// Returns HAS_PAIR when a pair was parsed (and stores it in *out when out is
// non-null), END when only whitespace and comment lines remain, and ERROR
// when a key is not followed by at least one blank and a value on the same
// line. ERROR is sticky: once reported, every later call reports it again
// without consuming more input. *out is left untouched unless HAS_PAIR is
// returned.
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
  // Character classes used by the scanner.
  static const auto is_blank = [](char ch) { return ch == ' ' || ch == '\t'; };
  static const auto is_line_break = [](char ch) {
    return ch == '\n' || ch == '\r';
  };
  static const auto is_whitespace = [](char ch) {
    return is_blank(ch) || is_line_break(ch);
  };
  static const auto is_content = [](char ch) { return !is_whitespace(ch); };

  if (state_ == State::ERROR) {
    return state_;
  }

  // Puts the reader into the ERROR state and reports it.
  const auto fail = [this]() {
    state_ = State::ERROR;
    return State::ERROR;
  };

  // Skip blank space, empty lines and whole comment lines until the first
  // character of a key is found or the input runs out.
  for (;;) {
    state_ = SkipUntilNot(is_whitespace);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }
    if (*current_ != '#') {
      break;
    }
    // A comment line: discard everything up to its line break.
    state_ = SkipUntil(is_line_break);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }
  }

  // *current_ is guaranteed to be a content character here, because the loop
  // above stopped on the first character that is neither blank nor line break.
  const char* const key_start = current_;
  state_ = SkipUntilNot(is_content);
  if (state_ != State::CAN_CONTINUE) {
    // The input ended right after the key, so there is no value.
    return fail();
  }
  const size_t key_size = current_ - key_start;

  // The key must be followed by at least one blank character...
  if (!is_blank(*current_)) {
    return fail();
  }
  state_ = SkipUntilNot(is_blank);
  if (state_ != State::CAN_CONTINUE) {
    return fail();
  }
  // ...and then by the value on the same line.
  if (!is_content(*current_)) {
    return fail();
  }

  // A value consists of content characters only; blanks are not allowed in
  // it. The returned state is deliberately ignored: the value is emitted
  // whether or not the input ends here, and stopping at the first blank keeps
  // trailing spaces out of the value.
  const char* const value_start = current_;
  SkipUntilNot(is_content);
  const size_t value_size = current_ - value_start;

  // Always discard the rest of the line, so that in "foo bar baz\n" the
  // trailing "baz" is not mistaken for the next key.
  SkipUntil(is_line_break);

  if (out != nullptr) {
    *out = KeyValue{std::string_view{key_start, key_size},
                    std::string_view{value_start, value_size}};
  }
  state_ = State::HAS_PAIR;
  return state_;
}
|
||||||
|
|
||||||
|
// Advances current_ past every character for which f returns true.
//
// Returns CAN_CONTINUE with current_ left on the first character that f
// rejects, or END when the end of the blob or a NUL character is reached
// first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    const bool keep_skipping = f(*current_);
    if (!keep_skipping) {
      return State::CAN_CONTINUE;
    }
  }
  return State::END;
}
|
||||||
|
|
||||||
|
// Advances current_ until it points at a character for which f returns true.
//
// Returns CAN_CONTINUE with current_ left on that character, or END when the
// end of the blob or a NUL character is reached first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    const bool found = f(*current_);
    if (found) {
      return State::CAN_CONTINUE;
    }
  }
  return State::END;
}
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& os,
|
||||||
|
const KeyValueBlobReader::KeyValue& kv) {
|
||||||
|
os << "(key: " << kv.key << ", value: " << kv.value << ")";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace McBopomofo
|
|
@ -0,0 +1,94 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#ifndef SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
||||||
|
#define SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string_view>
|
||||||
|
|
||||||
|
// A reader for text-based, blank-separated key-value pairs in a binary blob.
|
||||||
|
//
|
||||||
|
// This reader is suitable for reading language model files that entirely
|
||||||
|
// consist of key-value pairs. Leading or trailing spaces are ignored.
|
||||||
|
// Lines that start with "#" are treated as comments. Values cannot contain
|
||||||
|
// spaces. Any space after the value string is parsed is ignored. This implies
|
||||||
|
// that after a blank, anything that comes after the value can be used as
|
||||||
|
// comment. Both ' ' and '\t' are treated as blank characters, and the parser
|
||||||
|
// is agnostic to how lines are ended, and so LF, CR LF, and CR are all valid
|
||||||
|
// line endings.
|
||||||
|
//
|
||||||
|
// std::string_view is used to allow returning results efficiently. As a result,
|
||||||
|
// the blob is a const char* and will never be mutated. This implies, for
|
||||||
|
// example, read-only mmap can be used to parse large files.
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// Forward-only reader that yields the key-value pairs of a text blob one at a
// time. The blob is never copied or modified: every KeyValue produced holds
// std::string_view instances that point into the blob, so the blob must stay
// alive for as long as the produced pairs are used.
class KeyValueBlobReader {
 public:
  // Outcome of a call to Next().
  enum class State : int {
    // There are no more key-value pairs in this blob.
    END = 0,
    // The reader has produced a new key-value pair.
    HAS_PAIR = 1,
    // An error is encountered and the parsing stopped.
    ERROR = -1,
    // Internal-only state: the parser can continue parsing.
    CAN_CONTINUE = 2
  };

  // A parsed pair. Both members are non-owning views.
  struct KeyValue {
    // Creates a pair whose key and value are both empty.
    constexpr KeyValue() : key(""), value("") {}
    constexpr KeyValue(std::string_view k, std::string_view v)
        : key(k), value(v) {}

    // Two pairs are equal when their keys and their values compare equal as
    // character sequences (not by pointer identity).
    constexpr bool operator==(const KeyValue& another) const {
      return key == another.key && value == another.value;
    }

    // C++17 does not derive != from ==, so it is provided explicitly.
    constexpr bool operator!=(const KeyValue& another) const {
      return !(*this == another);
    }

    std::string_view key;
    std::string_view value;
  };

  // Creates a reader over the `size` bytes starting at `blob`. The blob does
  // not need to be NUL-terminated; if it does contain a NUL character,
  // parsing ends there.
  KeyValueBlobReader(const char* blob, std::size_t size)
      : current_(blob), end_(blob + size) {}

  // Parse the next key-value pair and return the state of the reader. If `out`
  // is passed, out will be set to the produced key-value pair if there is one.
  State Next(KeyValue* out = nullptr);

 private:
  // Advance current_ until `f` accepts the character under it.
  State SkipUntil(const std::function<bool(char)>& f);
  // Advance current_ until `f` rejects the character under it.
  State SkipUntilNot(const std::function<bool(char)>& f);

  // Next unread character of the blob.
  const char* current_;
  // One past the last character of the blob.
  const char* end_;
  // State reported by the most recent parsing step.
  State state_ = State::CAN_CONTINUE;
};
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&);
|
||||||
|
|
||||||
|
} // namespace McBopomofo
|
||||||
|
|
||||||
|
#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
|
@ -0,0 +1,235 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include "KeyValueBlobReader.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
using State = KeyValueBlobReader::State;
|
||||||
|
using KeyValue = KeyValueBlobReader::KeyValue;
|
||||||
|
|
||||||
|
// A zero-length blob has no pairs: the very first call reports END.
TEST(KeyValueBlobReaderTest, EmptyBlob) {
  const std::string blob;
  KeyValueBlobReader reader(blob.data(), blob.size());
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Calling Next() again after END keeps returning END.
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) {
  // ISO C++ forbids zero-length arrays (`char empty[0]` only builds as a
  // compiler extension), so use a one-byte buffer and pass a size of 0.
  const char empty[] = "";
  KeyValueBlobReader reader(empty, 0);
  EXPECT_EQ(reader.Next(), State::END);
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// A blob made only of blank characters contains no pairs.
TEST(KeyValueBlobReaderTest, BlankBlob) {
  const std::string blob = " ";
  KeyValueBlobReader reader(blob.data(), blob.size());
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// A key that is not followed by a value is a parse error.
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) {
  const std::string key_only = "hello";
  KeyValueBlobReader reader(key_only.data(), key_only.size());
  EXPECT_EQ(reader.Next(), State::ERROR);
}
|
||||||
|
|
||||||
|
// Once the reader has reported ERROR, every further call reports ERROR too.
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) {
  const std::string key_only = "hello";
  KeyValueBlobReader reader(key_only.data(), key_only.size());
  EXPECT_EQ(reader.Next(), State::ERROR);
  EXPECT_EQ(reader.Next(), State::ERROR);
}
|
||||||
|
|
||||||
|
// A NUL character is not a valid separator between a key and a value.
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) {
  const char blob[] = {'h', '\0', 'w'};
  KeyValueBlobReader reader(blob, sizeof(blob));
  EXPECT_EQ(reader.Next(), State::ERROR);
}
|
||||||
|
|
||||||
|
// One well-formed line yields exactly one pair, followed by END.
TEST(KeyValueBlobReaderTest, SingleKeyValuePair) {
  const std::string blob = "hello world\n";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// The reader honours the given size and does not need a terminating NUL.
// NOTE(review): "Butter" in the test name is presumably a typo for "Buffer";
// the name is left as is so existing test filters keep matching.
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) {
  const char blob[] = {'p', ' ', 'q'};
  KeyValueBlobReader reader(blob, sizeof(blob));
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("p", "q"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// A NUL character after a complete pair ends parsing; the "r s" that follows
// it is never produced.
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) {
  const char blob[] = {'p', ' ', 'q', ' ', '\0', '\n', 'r', ' ', 's'};
  KeyValueBlobReader reader(blob, sizeof(blob));
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("p", "q"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// The last line does not have to end with a line break.
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) {
  const std::string blob = "hello world";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Multi-byte characters in a value are passed through untouched.
TEST(KeyValueBlobReaderTest, EncodingAgnostic1) {
  const std::string blob = u8"smile ☺️";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("smile", u8"☺️"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Same as above, with the multi-byte value spelled out as raw byte escapes.
TEST(KeyValueBlobReaderTest, EncodingAgnostic2) {
  const char value_bytes[] =
      "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b";
  const std::string blob = std::string("Nobel-Laureate ") + value_bytes;
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("Nobel-Laureate", value_bytes));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// A value ends at the first blank; the remainder of the line is discarded and
// the next pair starts on the following line.
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) {
  const std::string blob = "hello world and all\nanother value";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("another", "value"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Leading blanks, trailing blanks and empty lines never end up in a pair.
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) {
  const std::string blob = "\thello world \n\n foo bar \t\t\t \n\n\n";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("foo", "bar"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// CR LF line endings separate pairs just like a plain LF does.
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) {
  const std::string blob = "lorem ipsum\r\nhello world";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("lorem", "ipsum"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Several pairs are returned in order; a tab between key and value and blank
// lines before the first pair are tolerated.
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) {
  const std::string blob = "\n \nhello world\n foo \t bar ";
  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("foo", "bar"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// A NUL character at the start of a line ends parsing; the "r s" after it is
// never produced.
TEST(KeyValueBlobReaderTest, ReadUntilNullChar) {
  const char blob[] = {'p', '\t', 'q', '\n', '\0', 'r', ' ', 's'};
  KeyValueBlobReader reader(blob, sizeof(blob));
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("p", "q"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Whole-line "#" comments before, between and after pairs are skipped.
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
  const std::string blob = R"(
# comment1
# comment2

# comment3
hello World
caffè latte

# another comment
foo bar

# comment4
# comment5
)";

  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "World"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("caffè", "latte"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("foo", "bar"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
// Anything after the blank that follows a value is ignored and can serve as a
// comment. A "#" glued to the value is part of the value.
TEST(KeyValueBlobReaderTest, ValueCommentSupported) {
  const std::string blob = R"(
# empty

hello world#peace
hello world#peace #peace
hello world#peace // peace
caffè latte # café au lait
foo bar
)";

  KeyValueBlobReader reader(blob.data(), blob.size());
  KeyValue pair;
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world#peace"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world#peace"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("hello", "world#peace"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("caffè", "latte"));
  EXPECT_EQ(reader.Next(&pair), State::HAS_PAIR);
  EXPECT_EQ(pair, KeyValue("foo", "bar"));
  EXPECT_EQ(reader.Next(), State::END);
}
|
||||||
|
|
||||||
|
} // namespace McBopomofo
|
|
@ -1,3 +1,26 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#include "McBopomofoLM.h"
|
#include "McBopomofoLM.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
|
@ -1,3 +1,26 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#ifndef MCBOPOMOFOLM_H
|
#ifndef MCBOPOMOFOLM_H
|
||||||
#define MCBOPOMOFOLM_H
|
#define MCBOPOMOFOLM_H
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,37 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#include "UserPhrasesLM.h"
|
#include "UserPhrasesLM.h"
|
||||||
|
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
using namespace Formosa::Gramambular;
|
#include "KeyValueBlobReader.h"
|
||||||
using namespace McBopomofo;
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
UserPhrasesLM::UserPhrasesLM()
|
UserPhrasesLM::UserPhrasesLM()
|
||||||
: fd(-1)
|
: fd(-1)
|
||||||
|
@ -42,113 +67,24 @@ bool UserPhrasesLM::open(const char *path)
|
||||||
|
|
||||||
length = (size_t)sb.st_size;
|
length = (size_t)sb.st_size;
|
||||||
|
|
||||||
data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0);
|
data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
::close(fd);
|
::close(fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *head = (char *)data;
|
KeyValueBlobReader reader(static_cast<char*>(data), length);
|
||||||
char *end = (char *)data + length;
|
KeyValueBlobReader::KeyValue keyValue;
|
||||||
char c;
|
KeyValueBlobReader::State state;
|
||||||
Row row;
|
while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
|
||||||
|
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
|
||||||
start:
|
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key );
|
||||||
// EOF -> end
|
|
||||||
if (head == end) {
|
|
||||||
goto end;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
c = *head;
|
if (state == KeyValueBlobReader::State::ERROR) {
|
||||||
// \s -> error
|
|
||||||
if (c == ' ') {
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
// \n -> start
|
|
||||||
else if (c == '\n') {
|
|
||||||
head++;
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> record column start, state1
|
|
||||||
row.value = head;
|
|
||||||
head++;
|
|
||||||
// fall through to state 1
|
|
||||||
|
|
||||||
state1:
|
|
||||||
// EOF -> error
|
|
||||||
if (head == end) {
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
// \n -> error
|
|
||||||
if (c == '\n') {
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
// \s -> state2 + zero out ending + record column start
|
|
||||||
else if (c == ' ') {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
row.key = head;
|
|
||||||
goto state2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> state1
|
|
||||||
head++;
|
|
||||||
goto state1;
|
|
||||||
|
|
||||||
state2:
|
|
||||||
if (head == end) {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
goto end;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
// \s or \n -> zero out ending + store the row, then state3 (\s) or start (\n)
|
|
||||||
if (c == ' ' || c == '\n') {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
if (c == ' ') {
|
|
||||||
goto state3;
|
|
||||||
}
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> state 2
|
|
||||||
head++;
|
|
||||||
goto state2;
|
|
||||||
|
|
||||||
state3:
|
|
||||||
if (head == end) {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
goto end;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
if (c == '\n') {
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
head++;
|
|
||||||
goto state3;
|
|
||||||
|
|
||||||
error:
|
|
||||||
close();
|
close();
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
end:
|
|
||||||
static const char *space = " ";
|
|
||||||
Row emptyRow;
|
|
||||||
emptyRow.key = space;
|
|
||||||
emptyRow.value = space;
|
|
||||||
keyRowMap[space].push_back(emptyRow);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -165,33 +101,29 @@ void UserPhrasesLM::close()
|
||||||
|
|
||||||
void UserPhrasesLM::dump()
|
void UserPhrasesLM::dump()
|
||||||
{
|
{
|
||||||
size_t rows = 0;
|
for (const auto& entry : keyRowMap) {
|
||||||
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
|
const std::vector<Row>& rows = entry.second;
|
||||||
const vector<Row>& r = (*i).second;
|
for (const auto& row : rows) {
|
||||||
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
|
std::cerr << row.key << " " << row.value << "\n";
|
||||||
const Row& row = *ri;
|
|
||||||
cerr << row.key << " " << row.value << "\n";
|
|
||||||
rows++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
|
const std::vector<Formosa::Gramambular::Bigram> UserPhrasesLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
|
||||||
{
|
{
|
||||||
return vector<Bigram>();
|
return std::vector<Formosa::Gramambular::Bigram>();
|
||||||
}
|
}
|
||||||
|
|
||||||
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
|
const std::vector<Formosa::Gramambular::Unigram> UserPhrasesLM::unigramsForKey(const std::string& key)
|
||||||
{
|
{
|
||||||
vector<Unigram> v;
|
std::vector<Formosa::Gramambular::Unigram> v;
|
||||||
map<const char *, vector<Row> >::const_iterator i = keyRowMap.find(key.c_str());
|
auto iter = keyRowMap.find(key);
|
||||||
|
if (iter != keyRowMap.end()) {
|
||||||
if (i != keyRowMap.end()) {
|
const std::vector<Row>& rows = iter->second;
|
||||||
for (vector<Row>::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) {
|
for (const auto& row : rows) {
|
||||||
Unigram g;
|
Formosa::Gramambular::Unigram g;
|
||||||
const Row& r = *ri;
|
g.keyValue.key = row.key;
|
||||||
g.keyValue.key = r.key;
|
g.keyValue.value = row.value;
|
||||||
g.keyValue.value = r.value;
|
|
||||||
g.score = 0.0;
|
g.score = 0.0;
|
||||||
v.push_back(g);
|
v.push_back(g);
|
||||||
}
|
}
|
||||||
|
@ -200,8 +132,9 @@ const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
|
bool UserPhrasesLM::hasUnigramsForKey(const std::string& key)
|
||||||
{
|
{
|
||||||
return keyRowMap.find(key.c_str()) != keyRowMap.end();
|
return keyRowMap.find(key) != keyRowMap.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
||||||
|
|
|
@ -1,8 +1,29 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#ifndef USERPHRASESLM_H
|
#ifndef USERPHRASESLM_H
|
||||||
#define USERPHRASESLM_H
|
#define USERPHRASESLM_H
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -10,9 +31,7 @@
|
||||||
|
|
||||||
namespace McBopomofo {
|
namespace McBopomofo {
|
||||||
|
|
||||||
using namespace Formosa::Gramambular;
|
class UserPhrasesLM : public Formosa::Gramambular::LanguageModel
|
||||||
|
|
||||||
class UserPhrasesLM : public LanguageModel
|
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
UserPhrasesLM();
|
UserPhrasesLM();
|
||||||
|
@ -22,25 +41,18 @@ public:
|
||||||
void close();
|
void close();
|
||||||
void dump();
|
void dump();
|
||||||
|
|
||||||
virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
|
virtual const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
|
||||||
virtual const vector<Unigram> unigramsForKey(const string& key);
|
virtual const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(const std::string& key);
|
||||||
virtual bool hasUnigramsForKey(const string& key);
|
virtual bool hasUnigramsForKey(const std::string& key);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
struct CStringCmp
|
|
||||||
{
|
|
||||||
bool operator()(const char* s1, const char* s2) const
|
|
||||||
{
|
|
||||||
return strcmp(s1, s2) < 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Row {
|
struct Row {
|
||||||
const char *key;
|
Row(std::string_view& k, std::string_view& v) : key(k), value(v) {}
|
||||||
const char *value;
|
std::string_view key;
|
||||||
|
std::string_view value;
|
||||||
};
|
};
|
||||||
|
|
||||||
map<const char *, vector<Row>, CStringCmp> keyRowMap;
|
std::map<std::string_view, std::vector<Row>> keyRowMap;
|
||||||
int fd;
|
int fd;
|
||||||
void *data;
|
void *data;
|
||||||
size_t length;
|
size_t length;
|
||||||
|
|
Loading…
Reference in New Issue