diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index 6633cae7..4d65989e 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -32,6 +32,7 @@ 6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; }; 6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; }; 6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; }; + 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; }; 6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; }; 6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; }; 6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; }; @@ -155,6 +156,8 @@ 6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = ""; }; 6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = ""; }; 6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = ""; }; + 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = ""; }; + 
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = ""; }; 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = ""; }; 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = ""; }; 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = ""; }; @@ -276,6 +279,8 @@ 6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */, 6A0421A615FEF3F50061ED63 /* FastLM.cpp */, 6A0421A715FEF3F50061ED63 /* FastLM.h */, + 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */, + 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */, D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */, D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */, D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */, @@ -579,6 +584,7 @@ D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */, 6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */, D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */, + 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */, 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, ); @@ -776,7 +782,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; @@ -836,7 +842,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_MODULES = YES; 
CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; @@ -945,7 +951,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; @@ -985,7 +991,7 @@ buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; CLANG_ENABLE_OBJC_ARC = YES; CLANG_ENABLE_OBJC_WEAK = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; diff --git a/Source/Engine/.gitignore b/Source/Engine/.gitignore new file mode 100644 index 00000000..2dff2f0c --- /dev/null +++ b/Source/Engine/.gitignore @@ -0,0 +1 @@ +cmake-build-debug diff --git a/Source/Engine/CMakeLists.txt b/Source/Engine/CMakeLists.txt new file mode 100644 index 00000000..7a97530f --- /dev/null +++ b/Source/Engine/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.17) +project(KeyValueBlobReader) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h) + +# Let CMake fetch Google Test for us. +# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project +include(FetchContent) + +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Test target declarations. enable_testing() makes add_test() take effect. 
+enable_testing() +add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp) +target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader) +add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest) diff --git a/Source/Engine/KeyValueBlobReader.cpp b/Source/Engine/KeyValueBlobReader.cpp new file mode 100644 index 00000000..515412d2 --- /dev/null +++ b/Source/Engine/KeyValueBlobReader.cpp @@ -0,0 +1,140 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "KeyValueBlobReader.h" + +namespace McBopomofo { + +KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) { + static auto new_line = [](char c) { return c == '\n' || c == '\r'; }; + static auto blank = [](char c) { return c == ' ' || c == '\t'; }; + static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); }; + static auto content_char = [](char c) { + return !blank(c) && !new_line(c); + }; + + if (state_ == State::ERROR) { + return state_; + } + + const char* key_begin = nullptr; + size_t key_length = 0; + const char* value_begin = nullptr; + size_t value_length = 0; + + while (true) { + state_ = SkipUntilNot(blank_or_newline); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + + // Check if it's a comment line; if so, read until end of line. + if (*current_ != '#') { + break; + } + state_ = SkipUntil(new_line); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + } + + // No need to check whether *current_ is a content_char, since content_char + // is defined as not blank and not new_line. + + key_begin = current_; + state_ = SkipUntilNot(content_char); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + key_length = current_ - key_begin; + + // There should be at least one blank character after the key string. + if (!blank(*current_)) { + goto error; + } + + state_ = SkipUntilNot(blank); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + + if (!content_char(*current_)) { + goto error; + } + + value_begin = current_; + // value must only contain content characters, blanks are not allowed. + // also, there's no need to check the state after this, since we will always + // emit the value. This also avoids the situation where trailing spaces in a + // line would become part of the value. + SkipUntilNot(content_char); + value_length = current_ - value_begin; + + // Unconditionally skip until the end of the line. 
This prevents the case + // like "foo bar baz\n" where baz should not be treated as the next key. + SkipUntil(new_line); + + if (out != nullptr) { + *out = KeyValue{ + std::string_view{key_begin, key_length}, + std::string_view{value_begin, value_length}}; + } + state_ = State::HAS_PAIR; + return state_; + +error: + state_ = State::ERROR; + return State::ERROR; +} + +KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot( + const std::function& f) { + while (current_ != end_ &&* current_) { + if (!f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; + } + + return State::END; +} + +KeyValueBlobReader::State KeyValueBlobReader::SkipUntil( + const std::function& f) { + while (current_ != end_ &&* current_) { + if (f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; + } + + return State::END; +} + +std::ostream& operator<<(std::ostream& os, + const KeyValueBlobReader::KeyValue& kv) { + os << "(key: " << kv.key << ", value: " << kv.value << ")"; + return os; +} + +} // namespace McBopomofo diff --git a/Source/Engine/KeyValueBlobReader.h b/Source/Engine/KeyValueBlobReader.h new file mode 100644 index 00000000..a6a5d897 --- /dev/null +++ b/Source/Engine/KeyValueBlobReader.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ +#define SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ + +#include +#include +#include +#include + +// A reader for text-based, blank-separated key-value pairs in a binary blob. +// +// This reader is suitable for reading language model files that entirely +// consist of key-value pairs. Leading or trailing spaces are ignored. +// Lines that start with "#" are treated as comments. Values cannot contain +// spaces. Everything after the value up to the end of the line is ignored, so +// any text that follows the value (after a blank) can be used as a trailing +// comment. Both ' ' and '\t' are treated as blank characters, and the parser +// is agnostic to how lines are ended, and so LF, CR LF, and CR are all valid +// line endings. Parsing does not continue past the first NUL byte in the blob. +// +// std::string_view is used to allow returning results efficiently. As a result, +// the blob is a const char* and will never be mutated, but it must outlive the +// returned views. For example, read-only mmap can be used to parse large files. +namespace McBopomofo { + +class KeyValueBlobReader { + public: + enum class State : int { + // There are no more key-value pairs in this blob. + END = 0, + // The reader has produced a new key-value pair. + HAS_PAIR = 1, + // An error is encountered and the parsing stopped. + ERROR = -1, + // Internal-only state: the parser can continue parsing. 
+ CAN_CONTINUE = 2 + }; + + struct KeyValue { + constexpr KeyValue() : key(""), value("") {} + constexpr KeyValue(std::string_view k, std::string_view v) + : key(k), value(v) {} + + bool operator==(const KeyValue& another) const { + return key == another.key && value == another.value; + } + + std::string_view key; + std::string_view value; + }; + + KeyValueBlobReader(const char* blob, size_t size) + : current_(blob), end_(blob + size) {} + + // Parse the next key-value pair and return the state of the reader. If `out` + // is non-null, it receives the produced pair when HAS_PAIR is returned. + State Next(KeyValue* out = nullptr); + + private: + State SkipUntil(const std::function& f); + State SkipUntilNot(const std::function& f); + + const char* current_; + const char* end_; + State state_ = State::CAN_CONTINUE; +}; + +std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&); + +} // namespace McBopomofo + +#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ diff --git a/Source/Engine/KeyValueBlobReaderTest.cpp b/Source/Engine/KeyValueBlobReaderTest.cpp new file mode 100644 index 00000000..581e95bd --- /dev/null +++ b/Source/Engine/KeyValueBlobReaderTest.cpp @@ -0,0 +1,235 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "KeyValueBlobReader.h" + +#include +#include "gtest/gtest.h" + +namespace McBopomofo { + +using State = KeyValueBlobReader::State; +using KeyValue = KeyValueBlobReader::KeyValue; + +TEST(KeyValueBlobReaderTest, EmptyBlob) { + std::string empty; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) { + const char* empty = ""; + KeyValueBlobReader reader(empty, 0); + EXPECT_EQ(reader.Next(), State::END); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, BlankBlob) { + std::string blank = " "; + KeyValueBlobReader reader(blank.c_str(), blank.length()); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) { + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) { + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) { + char bad[] = {'h', 0, 'w'}; + KeyValueBlobReader reader(bad, sizeof(bad)); + EXPECT_EQ(reader.Next(), State::ERROR); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePair) { + std::string empty = "hello world\n"; + KeyValueBlobReader reader(empty.c_str(), 
empty.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromBufferWithoutNullEnding) { + char small[] = {'p', ' ', 'q'}; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) { + char small[] = {'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's'}; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) { + std::string simple = "hello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EncodingAgnostic1) { + std::string simple = u8"smile ☺️"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, EncodingAgnostic2) { + std::string simple = "Nobel-Laureate " + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, ( + KeyValue{"Nobel-Laureate", + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, 
ValueDoesNotIncludeSpace) { + std::string simple = "hello world and all\nanother value"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"another", "value"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) { + std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) { + std::string simple = "lorem ipsum\r\nhello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) { + std::string multi = "\n \nhello world\n foo \t bar "; + KeyValueBlobReader reader(multi.c_str(), multi.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, ReadUntilNullChar) { + char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'}; + KeyValueBlobReader reader(buf, sizeof(buf)); + KeyValue keyValue; + 
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) { + std::string text = R"( +# comment1 +# comment2 + +# comment3 + hello World + caffè latte + + # another comment + foo bar + +# comment4 +# comment5 +)"; + + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "World"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +TEST(KeyValueBlobReaderTest, ValueCommentSupported) { + std::string text = R"( + # empty + + hello world#peace + hello world#peace #peace +hello world#peace // peace + caffè latte # café au lait + foo bar +)"; + + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); + EXPECT_EQ(reader.Next(), State::END); +} + +} // namespace McBopomofo diff --git a/Source/Engine/McBopomofoLM.cpp b/Source/Engine/McBopomofoLM.cpp index 0ffdaddd..3d577fd2 100644 --- a/Source/Engine/McBopomofoLM.cpp +++ b/Source/Engine/McBopomofoLM.cpp @@ -1,3 +1,26 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + #include "McBopomofoLM.h" #include #include diff --git a/Source/Engine/McBopomofoLM.h b/Source/Engine/McBopomofoLM.h index 521d86dc..82b02f0d 100644 --- a/Source/Engine/McBopomofoLM.h +++ b/Source/Engine/McBopomofoLM.h @@ -1,3 +1,26 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + #ifndef MCBOPOMOFOLM_H #define MCBOPOMOFOLM_H diff --git a/Source/Engine/UserPhrasesLM.cpp b/Source/Engine/UserPhrasesLM.cpp index 30e7b240..cbb6b9cd 100644 --- a/Source/Engine/UserPhrasesLM.cpp +++ b/Source/Engine/UserPhrasesLM.cpp @@ -1,12 +1,37 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ #include "UserPhrasesLM.h" + #include #include #include #include #include -using namespace Formosa::Gramambular; -using namespace McBopomofo; +#include "KeyValueBlobReader.h" + +namespace McBopomofo { UserPhrasesLM::UserPhrasesLM() : fd(-1) @@ -42,113 +67,24 @@ bool UserPhrasesLM::open(const char *path) length = (size_t)sb.st_size; - data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0); - if (!data) { + data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + if (data == MAP_FAILED) { ::close(fd); return false; } - char *head = (char *)data; - char *end = (char *)data + length; - char c; - Row row; - -start: - // EOF -> end - if (head == end) { - goto end; + KeyValueBlobReader reader(static_cast(data), length); + KeyValueBlobReader::KeyValue keyValue; + KeyValueBlobReader::State state; + while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) { + // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading. + keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key ); } - c = *head; - // \s -> error - if (c == ' ') { - goto error; + if (state == KeyValueBlobReader::State::ERROR) { + close(); + return false; } - // \n -> start - else if (c == '\n') { - head++; - goto start; - } - - // \w -> record column star, state1 - row.value = head; - head++; - // fall through to state 1 - -state1: - // EOF -> error - if (head == end) { - goto error; - } - - c = *head; - // \n -> error - if (c == '\n') { - goto error; - } - // \s -> state2 + zero out ending + record column start - else if (c == ' ') { - *head = 0; - head++; - row.key = head; - goto state2; - } - - // \w -> state1 - head++; - goto state1; - -state2: - if (head == end) { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - goto end; - } - - c = *head; - // \s -> error - if (c == ' ' || c == '\n') { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - if (c == ' ') { - goto state3; - } - goto start; - } - - // \w -> state 2 - 
head++; - goto state2; - -state3: - if (head == end) { - *head = 0; - head++; - keyRowMap[row.key].push_back(row); - goto end; - } - - c = *head; - if (c == '\n') { - goto start; - } - - head++; - goto state3; - -error: - close(); - return false; - -end: - static const char *space = " "; - Row emptyRow; - emptyRow.key = space; - emptyRow.value = space; - keyRowMap[space].push_back(emptyRow); - return true; } @@ -165,33 +101,29 @@ void UserPhrasesLM::close() void UserPhrasesLM::dump() { - size_t rows = 0; - for (map >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) { - const vector& r = (*i).second; - for (vector::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) { - const Row& row = *ri; - cerr << row.key << " " << row.value << "\n"; - rows++; + for (const auto& entry : keyRowMap) { + const std::vector& rows = entry.second; + for (const auto& row : rows) { + std::cerr << row.key << " " << row.value << "\n"; } } } -const vector UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key) +const std::vector UserPhrasesLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key) { - return vector(); + return std::vector(); } -const vector UserPhrasesLM::unigramsForKey(const string& key) +const std::vector UserPhrasesLM::unigramsForKey(const std::string& key) { - vector v; - map >::const_iterator i = keyRowMap.find(key.c_str()); - - if (i != keyRowMap.end()) { - for (vector::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) { - Unigram g; - const Row& r = *ri; - g.keyValue.key = r.key; - g.keyValue.value = r.value; + std::vector v; + auto iter = keyRowMap.find(key); + if (iter != keyRowMap.end()) { + const std::vector& rows = iter->second; + for (const auto& row : rows) { + Formosa::Gramambular::Unigram g; + g.keyValue.key = row.key; + g.keyValue.value = row.value; g.score = 0.0; v.push_back(g); } @@ -200,8 +132,9 @@ const vector UserPhrasesLM::unigramsForKey(const 
string& key) return v; } -bool UserPhrasesLM::hasUnigramsForKey(const string& key) +bool UserPhrasesLM::hasUnigramsForKey(const std::string& key) { - return keyRowMap.find(key.c_str()) != keyRowMap.end(); + return keyRowMap.find(key) != keyRowMap.end(); } +} // namespace McBopomofo diff --git a/Source/Engine/UserPhrasesLM.h b/Source/Engine/UserPhrasesLM.h index 4dc81d66..7fdb37e9 100644 --- a/Source/Engine/UserPhrasesLM.h +++ b/Source/Engine/UserPhrasesLM.h @@ -1,8 +1,29 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ #ifndef USERPHRASESLM_H #define USERPHRASESLM_H -#include - #include #include #include @@ -10,9 +31,7 @@ namespace McBopomofo { -using namespace Formosa::Gramambular; - -class UserPhrasesLM : public LanguageModel +class UserPhrasesLM : public Formosa::Gramambular::LanguageModel { public: UserPhrasesLM(); @@ -22,25 +41,18 @@ public: void close(); void dump(); - virtual const vector bigramsForKeys(const string& preceedingKey, const string& key); - virtual const vector unigramsForKey(const string& key); - virtual bool hasUnigramsForKey(const string& key); + virtual const std::vector bigramsForKeys(const std::string& preceedingKey, const std::string& key); + virtual const std::vector unigramsForKey(const std::string& key); + virtual bool hasUnigramsForKey(const std::string& key); protected: - struct CStringCmp - { - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } - }; - struct Row { - const char *key; - const char *value; + Row(std::string_view k, std::string_view v) : key(k), value(v) {} + std::string_view key; + std::string_view value; }; - map, CStringCmp> keyRowMap; + std::map> keyRowMap; int fd; void *data; size_t length;