From d6cc5479f6dd0be9be0a53bb6577d8824408305e Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Thu, 13 Jan 2022 14:31:45 -0800 Subject: [PATCH] Use a more tolerant parser for user phrases A generic key-value blob reader, KeyValueBlobReader, is implemented to allow more flexibility in user-editable files. For example, this allows comments in the file, as well as tolerating leading or trailing spaces, tabs, or even Windows CR LF line endings. Unit tests are supplied for KeyValueBlobReader although they are not part of the Xcode project. A separate CMakeLists.txt is provided. UserPhrasesLM is refactored to use KeyValueBlobReader. A small stylistic change is applied to reduce "using namespace" uses, but otherwise no major style changes were applied to UserPhrasesLM. Please note that McBopomofo's user phrase LM uses the value in a key-value pair as the reading, and the key as the actual "value". We don't plan to change that order so that we don't have to migrate data. std::string_view is used to allow efficient reference to char buffers and interop with std::string (and so no c_str() is needed). C++17 is now enabled for the project to enable the use of std::string_view. Copyright headers are added to McBopomofoLM and UserPhrasesLM. 
--- McBopomofo.xcodeproj/project.pbxproj | 14 +- Source/Engine/.gitignore | 1 + Source/Engine/CMakeLists.txt | 24 +++ Source/Engine/KeyValueBlobReader.cpp | 140 ++++++++++++++ Source/Engine/KeyValueBlobReader.h | 94 +++++++++ Source/Engine/KeyValueBlobReaderTest.cpp | 235 +++++++++++++++++++++++ Source/Engine/McBopomofoLM.cpp | 23 +++ Source/Engine/McBopomofoLM.h | 23 +++ Source/Engine/UserPhrasesLM.cpp | 177 ++++++----------- Source/Engine/UserPhrasesLM.h | 50 +++-- 10 files changed, 636 insertions(+), 145 deletions(-) create mode 100644 Source/Engine/.gitignore create mode 100644 Source/Engine/CMakeLists.txt create mode 100644 Source/Engine/KeyValueBlobReader.cpp create mode 100644 Source/Engine/KeyValueBlobReader.h create mode 100644 Source/Engine/KeyValueBlobReaderTest.cpp diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index 70883d91..3b2f05bf 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -32,6 +32,7 @@ 6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; }; 6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; }; 6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; }; + 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; }; 6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; }; 6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; }; 6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; }; @@ -154,6 +155,8 @@ 6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = ""; }; 6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = ""; }; 6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = ""; }; + 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = ""; }; + 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = ""; }; 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = ""; }; 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = ""; }; 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = ""; }; @@ -271,6 +274,8 @@ 6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */, 6A0421A615FEF3F50061ED63 /* FastLM.cpp */, 6A0421A715FEF3F50061ED63 /* FastLM.h */, + 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */, + 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */, D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */, D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */, D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */, @@ -567,6 +572,7 @@ D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */, 6A0D4F4515FC0EB100ABF4B3 /* 
Mandarin.cpp in Sources */, D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */, + 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */, 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, ); @@ -765,7 +771,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; @@ -825,7 +831,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; @@ -934,7 +940,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; @@ -974,7 +980,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; diff --git a/Source/Engine/.gitignore b/Source/Engine/.gitignore new file mode 100644 index 00000000..2dff2f0c --- /dev/null +++ b/Source/Engine/.gitignore @@ -0,0 +1 @@ +cmake-build-debug diff --git a/Source/Engine/CMakeLists.txt b/Source/Engine/CMakeLists.txt new file mode 100644 index 00000000..7a97530f --- /dev/null +++ b/Source/Engine/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.17) +project(KeyValueBlobReader) + +set(CMAKE_CXX_STANDARD 17) + +add_library(KeyValueBlobReader KeyValueBlobReader.cpp 
KeyValueBlobReader.h) + +# Let CMake fetch Google Test for us. +# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project +include(FetchContent) + +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Test target declarations. +add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp) +target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader) +add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest) diff --git a/Source/Engine/KeyValueBlobReader.cpp b/Source/Engine/KeyValueBlobReader.cpp new file mode 100644 index 00000000..515412d2 --- /dev/null +++ b/Source/Engine/KeyValueBlobReader.cpp @@ -0,0 +1,140 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "KeyValueBlobReader.h" + +namespace McBopomofo { + +KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) { + static auto new_line = [](char c) { return c == '\n' || c == '\r'; }; + static auto blank = [](char c) { return c == ' ' || c == '\t'; }; + static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); }; + static auto content_char = [](char c) { + return !blank(c) && !new_line(c); + }; + + if (state_ == State::ERROR) { + return state_; + } + + const char* key_begin = nullptr; + size_t key_length = 0; + const char* value_begin = nullptr; + size_t value_length = 0; + + while (true) { + state_ = SkipUntilNot(blank_or_newline); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + + // Check if it's a comment line; if so, read until end of line. + if (*current_ != '#') { + break; + } + state_ = SkipUntil(new_line); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + } + + // No need to check whether* current_ is a content_char, since content_char + // is defined as not blank and not new_line. + + key_begin = current_; + state_ = SkipUntilNot(content_char); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + key_length = current_ - key_begin; + + // There should be at least one blank character after the key string. + if (!blank(*current_)) { + goto error; + } + + state_ = SkipUntilNot(blank); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + + if (!content_char(*current_)) { + goto error; + } + + value_begin = current_; + // value must only contain content characters, blanks not are allowed. + // also, there's no need to check the state after this, since we will always + // emit the value. 
This also avoids the situation where trailing spaces in a + // line would become part of the value. + SkipUntilNot(content_char); + value_length = current_ - value_begin; + + // Unconditionally skip until the end of the line. This prevents the case + // like "foo bar baz\n" where baz should not be treated as the Next key. + SkipUntil(new_line); + + if (out != nullptr) { + *out = KeyValue{ + std::string_view{key_begin, key_length}, + std::string_view{value_begin, value_length}}; + } + state_ = State::HAS_PAIR; + return state_; + +error: + state_ = State::ERROR; + return State::ERROR; +} + +KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot( + const std::function& f) { + while (current_ != end_ &&* current_) { + if (!f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; + } + + return State::END; +} + +KeyValueBlobReader::State KeyValueBlobReader::SkipUntil( + const std::function& f) { + while (current_ != end_ &&* current_) { + if (f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; + } + + return State::END; +} + +std::ostream& operator<<(std::ostream& os, + const KeyValueBlobReader::KeyValue& kv) { + os << "(key: " << kv.key << ", value: " << kv.value << ")"; + return os; +} + +} // namespace McBopomofo diff --git a/Source/Engine/KeyValueBlobReader.h b/Source/Engine/KeyValueBlobReader.h new file mode 100644 index 00000000..a6a5d897 --- /dev/null +++ b/Source/Engine/KeyValueBlobReader.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ +#define SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ + +#include +#include +#include +#include + +// A reader for text-based, blank-separated key-value pairs in a binary blob. +// +// This reader is suitable for reading language model files that entirely +// consist of key-value pairs. Leading or trailing spaces are ignored. +// Lines that start with "#" are treated as comments. Values cannot contain +// spaces. Any space after the value string is parsed is ignored. This implies +// that after a blank, anything that comes after the value can be used as +// comment. Both ' ' and '\t' are treated as blank characters, and the parser +// is agnostic to how lines are ended, and so LF, CR LF, and CR are all valid +// line endings. +// +// std::string_view is used to allow returning results efficiently. 
As a result, +// the blob is a const char* and will never be mutated. This implies, for +// example, read-only mmap can be used to parse large files. +namespace McBopomofo { + +class KeyValueBlobReader { + public: + enum class State : int { + // There are no more key-value pairs in this blob. + END = 0, + // The reader has produced a new key-value pair. + HAS_PAIR = 1, + // An error is encountered and the parsing stopped. + ERROR = -1, + // Internal-only state: the parser can continue parsing. + CAN_CONTINUE = 2 + }; + + struct KeyValue { + constexpr KeyValue() : key(""), value("") {} + constexpr KeyValue(std::string_view k, std::string_view v) + : key(k), value(v) {} + + bool operator==(const KeyValue& another) const { + return key == another.key && value == another.value; + } + + std::string_view key; + std::string_view value; + }; + + KeyValueBlobReader(const char* blob, size_t size) + : current_(blob), end_(blob + size) {} + + // Parse the next key-value pair and return the state of the reader. If `out` + // is passed, out will be set to the produced key-value pair if there is one. + State Next(KeyValue* out = nullptr); + + private: + State SkipUntil(const std::function& f); + State SkipUntilNot(const std::function& f); + + const char* current_; + const char* end_; + State state_ = State::CAN_CONTINUE; +}; + +std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&); + +} // namespace McBopomofo + +#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ diff --git a/Source/Engine/KeyValueBlobReaderTest.cpp b/Source/Engine/KeyValueBlobReaderTest.cpp new file mode 100644 index 00000000..581e95bd --- /dev/null +++ b/Source/Engine/KeyValueBlobReaderTest.cpp @@ -0,0 +1,235 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "KeyValueBlobReader.h" + +#include +#include "gtest/gtest.h" + +namespace McBopomofo { + +using State = KeyValueBlobReader::State; +using KeyValue = KeyValueBlobReader::KeyValue; + +TEST(KeyValueBlobReaderTest, EmptyBlob) { + std::string empty; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) { + char empty[0]; + KeyValueBlobReader reader(empty, 0); + EXPECT_EQ(reader.Next(), State::END); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, BlankBlob) { + std::string blank = " "; + KeyValueBlobReader reader(blank.c_str(), blank.length()); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) { + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) { + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) { + char bad[] = {'h', 0, 'w'}; + KeyValueBlobReader reader(bad, sizeof(bad)); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePair) { + std::string empty = "hello world\n"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) { + char small[] = {'p', ' ', 'q'}; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), 
State::END); +} + +TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) { + char small[] = {'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's'}; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) { + std::string simple = "hello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EncodingAgnostic1) { + std::string simple = u8"smile ☺️"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EncodingAgnostic2) { + std::string simple = "Nobel-Laureate " + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, ( + KeyValue{"Nobel-Laureate", + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) { + std::string simple = "hello world and all\nanother value"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"another", "value"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) { + 
std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) { + std::string simple = "lorem ipsum\r\nhello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) { + std::string multi = "\n \nhello world\n foo \t bar "; + KeyValueBlobReader reader(multi.c_str(), multi.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, ReadUntilNullChar) { + char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'}; + KeyValueBlobReader reader(buf, sizeof(buf)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) { + std::string text = R"( +# comment1 +# comment2 + +# comment3 + hello World + caffè latte + + # another comment + foo bar + +# comment4 +# comment5 +)"; + + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, 
(KeyValue{"hello", "World"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, ValueCommentSupported) { + std::string text = R"( + # empty + + hello world#peace + hello world#peace #peace +hello world#peace // peace + caffè latte # café au lait + foo bar +)"; + + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +} // namespace McBopomofo diff --git a/Source/Engine/McBopomofoLM.cpp b/Source/Engine/McBopomofoLM.cpp index 0ffdaddd..3d577fd2 100644 --- a/Source/Engine/McBopomofoLM.cpp +++ b/Source/Engine/McBopomofoLM.cpp @@ -1,3 +1,26 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + #include "McBopomofoLM.h" #include #include diff --git a/Source/Engine/McBopomofoLM.h b/Source/Engine/McBopomofoLM.h index 521d86dc..82b02f0d 100644 --- a/Source/Engine/McBopomofoLM.h +++ b/Source/Engine/McBopomofoLM.h @@ -1,3 +1,26 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + #ifndef MCBOPOMOFOLM_H #define MCBOPOMOFOLM_H diff --git a/Source/Engine/UserPhrasesLM.cpp b/Source/Engine/UserPhrasesLM.cpp index 30e7b240..cbb6b9cd 100644 --- a/Source/Engine/UserPhrasesLM.cpp +++ b/Source/Engine/UserPhrasesLM.cpp @@ -1,12 +1,37 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ #include "UserPhrasesLM.h" + #include #include #include #include #include -using namespace Formosa::Gramambular; -using namespace McBopomofo; +#include "KeyValueBlobReader.h" + +namespace McBopomofo { UserPhrasesLM::UserPhrasesLM() : fd(-1) @@ -42,113 +67,24 @@ bool UserPhrasesLM::open(const char *path) length = (size_t)sb.st_size; - data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0); + data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); if (!data) { ::close(fd); return false; } - char *head = (char *)data; - char *end = (char *)data + length; - char c; - Row row; - -start: - // EOF -> end - if (head == end) { - goto end; + KeyValueBlobReader reader(static_cast(data), length); + KeyValueBlobReader::KeyValue keyValue; + KeyValueBlobReader::State state; + while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) { + // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading. + keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key ); } - c = *head; - // \s -> error - if (c == ' ') { - goto error; + if (state == KeyValueBlobReader::State::ERROR) { + close(); + return false; } - // \n -> start - else if (c == '\n') { - head++; - goto start; - } - - // \w -> record column star, state1 - row.value = head; - head++; - // fall through to state 1 - -state1: - // EOF -> error - if (head == end) { - goto error; - } - - c = *head; - // \n -> error - if (c == '\n') { - goto error; - } - // \s -> state2 + zero out ending + record column start - else if (c == ' ') { - *head = 0; - head++; - row.key = head; - goto state2; - } - - // \w -> state1 - head++; - goto state1; - -state2: - if (head == end) { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - goto end; - } - - c = *head; - // \s -> error - if (c == ' ' || c == '\n') { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - if (c == ' ') { - goto state3; - } - goto start; - } - - // \w -> state 2 - 
head++; - goto state2; - -state3: - if (head == end) { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - goto end; - } - - c = *head; - if (c == '\n') { - goto start; - } - - head++; - goto state3; - -error: - close(); - return false; - -end: - static const char *space = " "; - Row emptyRow; - emptyRow.key = space; - emptyRow.value = space; - keyRowMap[space].push_back(emptyRow); - return true; } @@ -165,33 +101,29 @@ void UserPhrasesLM::close() void UserPhrasesLM::dump() { - size_t rows = 0; - for (map >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) { - const vector& r = (*i).second; - for (vector::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) { - const Row& row = *ri; - cerr << row.key << " " << row.value << "\n"; - rows++; + for (const auto& entry : keyRowMap) { + const std::vector& rows = entry.second; + for (const auto& row : rows) { + std::cerr << row.key << " " << row.value << "\n"; } } } -const vector UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key) +const std::vector UserPhrasesLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key) { - return vector(); + return std::vector(); } -const vector UserPhrasesLM::unigramsForKey(const string& key) +const std::vector UserPhrasesLM::unigramsForKey(const std::string& key) { - vector v; - map >::const_iterator i = keyRowMap.find(key.c_str()); - - if (i != keyRowMap.end()) { - for (vector::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) { - Unigram g; - const Row& r = *ri; - g.keyValue.key = r.key; - g.keyValue.value = r.value; + std::vector v; + auto iter = keyRowMap.find(key); + if (iter != keyRowMap.end()) { + const std::vector& rows = iter->second; + for (const auto& row : rows) { + Formosa::Gramambular::Unigram g; + g.keyValue.key = row.key; + g.keyValue.value = row.value; g.score = 0.0; v.push_back(g); } @@ -200,8 +132,9 @@ const vector UserPhrasesLM::unigramsForKey(const 
string& key) return v; } -bool UserPhrasesLM::hasUnigramsForKey(const string& key) +bool UserPhrasesLM::hasUnigramsForKey(const std::string& key) { - return keyRowMap.find(key.c_str()) != keyRowMap.end(); + return keyRowMap.find(key) != keyRowMap.end(); } +}; // namespace McBopomofo diff --git a/Source/Engine/UserPhrasesLM.h b/Source/Engine/UserPhrasesLM.h index 4dc81d66..7fdb37e9 100644 --- a/Source/Engine/UserPhrasesLM.h +++ b/Source/Engine/UserPhrasesLM.h @@ -1,8 +1,29 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ #ifndef USERPHRASESLM_H #define USERPHRASESLM_H -#include - #include #include #include @@ -10,9 +31,7 @@ namespace McBopomofo { -using namespace Formosa::Gramambular; - -class UserPhrasesLM : public LanguageModel +class UserPhrasesLM : public Formosa::Gramambular::LanguageModel { public: UserPhrasesLM(); @@ -22,25 +41,18 @@ public: void close(); void dump(); - virtual const vector bigramsForKeys(const string& preceedingKey, const string& key); - virtual const vector unigramsForKey(const string& key); - virtual bool hasUnigramsForKey(const string& key); + virtual const std::vector bigramsForKeys(const std::string& preceedingKey, const std::string& key); + virtual const std::vector unigramsForKey(const std::string& key); + virtual bool hasUnigramsForKey(const std::string& key); protected: - struct CStringCmp - { - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } - }; - struct Row { - const char *key; - const char *value; + Row(std::string_view& k, std::string_view& v) : key(k), value(v) {} + std::string_view key; + std::string_view value; }; - map, CStringCmp> keyRowMap; + std::map> keyRowMap; int fd; void *data; size_t length;