diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index abeca958..64f9430c 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -7,7 +7,6 @@ objects = { /* Begin PBXBuildFile section */ - 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A0421A615FEF3F50061ED63 /* FastLM.cpp */; }; 6A0D4EA715FC0D2D00ABF4B3 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */; }; 6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */; }; 6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC815FC0D6400ABF4B3 /* main.m */; }; @@ -33,6 +32,8 @@ 6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; }; 6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; }; 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; }; + 6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */; }; + 6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D422793701600F1B140 /* ParselessLM.cpp */; }; 6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; }; 6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; }; 6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* 
PlainBopomofo@2x.tiff */; }; @@ -74,8 +75,6 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ - 6A0421A615FEF3F50061ED63 /* FastLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FastLM.cpp; sourceTree = ""; }; - 6A0421A715FEF3F50061ED63 /* FastLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FastLM.h; sourceTree = ""; }; 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofo.app; sourceTree = BUILT_PRODUCTS_DIR; }; 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; }; 6A0D4EA915FC0D2D00ABF4B3 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; }; @@ -162,6 +161,10 @@ 6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = ""; }; 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = ""; }; 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = ""; }; + 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessPhraseDB.cpp; sourceTree = ""; }; + 6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessPhraseDB.h; sourceTree = 
""; }; + 6ACC3D422793701600F1B140 /* ParselessLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessLM.cpp; sourceTree = ""; }; + 6ACC3D432793701600F1B140 /* ParselessLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessLM.h; sourceTree = ""; }; 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = ""; }; 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = ""; }; 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = ""; }; @@ -289,12 +292,14 @@ 6A0D4F1315FC0EB100ABF4B3 /* Gramambular */, 6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */, 6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */, - 6A0421A615FEF3F50061ED63 /* FastLM.cpp */, - 6A0421A715FEF3F50061ED63 /* FastLM.h */, 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */, 6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */, D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */, D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */, + 6ACC3D422793701600F1B140 /* ParselessLM.cpp */, + 6ACC3D432793701600F1B140 /* ParselessLM.h */, + 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */, + 6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */, D44FB74B2792189A003C80A6 /* PhraseReplacementMap.cpp */, D44FB74C2792189A003C80A6 /* PhraseReplacementMap.h */, D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */, @@ -592,6 +597,7 @@ buildActionMask = 2147483647; files = ( D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */, + 6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */, D44FB74727919D35003C80A6 /* EmacsKeyHelper.swift in Sources */, 6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */, D44FB74D2792189A003C80A6 /* 
PhraseReplacementMap.cpp in Sources */, @@ -602,9 +608,9 @@ D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */, D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */, 6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */, + 6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */, D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */, 6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */, - 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/Source/Engine/CMakeLists.txt b/Source/Engine/CMakeLists.txt index 7a97530f..6e06f2ad 100644 --- a/Source/Engine/CMakeLists.txt +++ b/Source/Engine/CMakeLists.txt @@ -1,9 +1,16 @@ cmake_minimum_required(VERSION 3.17) -project(KeyValueBlobReader) +project(McBopomofoLMLib) set(CMAKE_CXX_STANDARD 17) -add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h) +include_directories("Gramambular") +add_library(McBopomofoLMLib + KeyValueBlobReader.cpp + KeyValueBlobReader.h + ParselessPhraseDB.cpp + ParselessPhraseDB.h + ParselessLM.cpp + ParselessLM.h) # Let CMake fetch Google Test for us. # https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project @@ -19,6 +26,17 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) # Test target declarations. -add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp) -target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader) -add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest) +add_executable(McBopomofoLMLibTest + KeyValueBlobReaderTest.cpp + ParselessLMTest.cpp + ParselessPhraseDBTest.cpp) +target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib) +include(GoogleTest) +gtest_discover_tests(McBopomofoLMLibTest) + +# Benchmark target. 
+find_package(benchmark REQUIRED) +add_executable(ParselessLMBenchmark + FastLM.cpp + ParselessLMBenchmark.cpp) +target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark) \ No newline at end of file diff --git a/Source/Engine/KeyValueBlobReader.cpp b/Source/Engine/KeyValueBlobReader.cpp index 515412d2..e8cad3c2 100644 --- a/Source/Engine/KeyValueBlobReader.cpp +++ b/Source/Engine/KeyValueBlobReader.cpp @@ -25,116 +25,118 @@ namespace McBopomofo { -KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) { - static auto new_line = [](char c) { return c == '\n' || c == '\r'; }; - static auto blank = [](char c) { return c == ' ' || c == '\t'; }; - static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); }; - static auto content_char = [](char c) { - return !blank(c) && !new_line(c); - }; +KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) +{ + static auto new_line = [](char c) { return c == '\n' || c == '\r'; }; + static auto blank = [](char c) { return c == ' ' || c == '\t'; }; + static auto blank_or_newline + = [](char c) { return blank(c) || new_line(c); }; + static auto content_char = [](char c) { return !blank(c) && !new_line(c); }; - if (state_ == State::ERROR) { + if (state_ == State::ERROR) { + return state_; + } + + const char* key_begin = nullptr; + size_t key_length = 0; + const char* value_begin = nullptr; + size_t value_length = 0; + + while (true) { + state_ = SkipUntilNot(blank_or_newline); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + + // Check if it's a comment line; if so, read until end of line. + if (*current_ != '#') { + break; + } + state_ = SkipUntil(new_line); + if (state_ != State::CAN_CONTINUE) { + return state_; + } + } + + // No need to check whether* current_ is a content_char, since content_char + // is defined as not blank and not new_line. 
+ + key_begin = current_; + state_ = SkipUntilNot(content_char); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + key_length = current_ - key_begin; + + // There should be at least one blank character after the key string. + if (!blank(*current_)) { + goto error; + } + + state_ = SkipUntilNot(blank); + if (state_ != State::CAN_CONTINUE) { + goto error; + } + + if (!content_char(*current_)) { + goto error; + } + + value_begin = current_; + // value must only contain content characters, blanks not are allowed. + // also, there's no need to check the state after this, since we will always + // emit the value. This also avoids the situation where trailing spaces in a + // line would become part of the value. + SkipUntilNot(content_char); + value_length = current_ - value_begin; + + // Unconditionally skip until the end of the line. This prevents the case + // like "foo bar baz\n" where baz should not be treated as the Next key. + SkipUntil(new_line); + + if (out != nullptr) { + *out = KeyValue { std::string_view { key_begin, key_length }, + std::string_view { value_begin, value_length } }; + } + state_ = State::HAS_PAIR; return state_; - } - - const char* key_begin = nullptr; - size_t key_length = 0; - const char* value_begin = nullptr; - size_t value_length = 0; - - while (true) { - state_ = SkipUntilNot(blank_or_newline); - if (state_ != State::CAN_CONTINUE) { - return state_; - } - - // Check if it's a comment line; if so, read until end of line. - if (*current_ != '#') { - break; - } - state_ = SkipUntil(new_line); - if (state_ != State::CAN_CONTINUE) { - return state_; - } - } - - // No need to check whether* current_ is a content_char, since content_char - // is defined as not blank and not new_line. - - key_begin = current_; - state_ = SkipUntilNot(content_char); - if (state_ != State::CAN_CONTINUE) { - goto error; - } - key_length = current_ - key_begin; - - // There should be at least one blank character after the key string. 
- if (!blank(*current_)) { - goto error; - } - - state_ = SkipUntilNot(blank); - if (state_ != State::CAN_CONTINUE) { - goto error; - } - - if (!content_char(*current_)) { - goto error; - } - - value_begin = current_; - // value must only contain content characters, blanks not are allowed. - // also, there's no need to check the state after this, since we will always - // emit the value. This also avoids the situation where trailing spaces in a - // line would become part of the value. - SkipUntilNot(content_char); - value_length = current_ - value_begin; - - // Unconditionally skip until the end of the line. This prevents the case - // like "foo bar baz\n" where baz should not be treated as the Next key. - SkipUntil(new_line); - - if (out != nullptr) { - *out = KeyValue{ - std::string_view{key_begin, key_length}, - std::string_view{value_begin, value_length}}; - } - state_ = State::HAS_PAIR; - return state_; error: - state_ = State::ERROR; - return State::ERROR; + state_ = State::ERROR; + return state_; } KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot( - const std::function& f) { - while (current_ != end_ &&* current_) { - if (!f(*current_)) { - return State::CAN_CONTINUE; + const std::function& f) +{ + while (current_ != end_ && *current_) { + if (!f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; } - ++current_; - } - return State::END; + return State::END; } KeyValueBlobReader::State KeyValueBlobReader::SkipUntil( - const std::function& f) { - while (current_ != end_ &&* current_) { - if (f(*current_)) { - return State::CAN_CONTINUE; + const std::function& f) +{ + while (current_ != end_ && *current_) { + if (f(*current_)) { + return State::CAN_CONTINUE; + } + ++current_; } - ++current_; - } - return State::END; + return State::END; } -std::ostream& operator<<(std::ostream& os, - const KeyValueBlobReader::KeyValue& kv) { - os << "(key: " << kv.key << ", value: " << kv.value << ")"; - return os; +std::ostream& operator<<( + 
std::ostream& os, const KeyValueBlobReader::KeyValue& kv) +{ + os << "(key: " << kv.key << ", value: " << kv.value << ")"; + return os; } -} // namespace McBopomofo +} // namespace McBopomofo diff --git a/Source/Engine/KeyValueBlobReader.h b/Source/Engine/KeyValueBlobReader.h index a6a5d897..aa88421c 100644 --- a/Source/Engine/KeyValueBlobReader.h +++ b/Source/Engine/KeyValueBlobReader.h @@ -46,49 +46,61 @@ namespace McBopomofo { class KeyValueBlobReader { - public: - enum class State : int { - // There are no more key-value pairs in this blob. - END = 0, - // The reader has produced a new key-value pair. - HAS_PAIR = 1, - // An error is encountered and the parsing stopped. - ERROR = -1, - // Internal-only state: the parser can continue parsing. - CAN_CONTINUE = 2 - }; +public: + enum class State : int { + // There are no more key-value pairs in this blob. + END = 0, + // The reader has produced a new key-value pair. + HAS_PAIR = 1, + // An error is encountered and the parsing stopped. + ERROR = -1, + // Internal-only state: the parser can continue parsing. + CAN_CONTINUE = 2 + }; - struct KeyValue { - constexpr KeyValue() : key(""), value("") {} - constexpr KeyValue(std::string_view k, std::string_view v) - : key(k), value(v) {} + struct KeyValue { + constexpr KeyValue() + : key("") + , value("") + { + } + constexpr KeyValue(std::string_view k, std::string_view v) + : key(k) + , value(v) + { + } - bool operator==(const KeyValue& another) const { - return key == another.key && value == another.value; + bool operator==(const KeyValue& another) const + { + return key == another.key && value == another.value; + } + + std::string_view key; + std::string_view value; + }; + + KeyValueBlobReader(const char* blob, size_t size) + : current_(blob) + , end_(blob + size) + { } - std::string_view key; - std::string_view value; - }; + // Parse the next key-value pair and return the state of the reader. 
If + // `out` is passed, out will be set to the produced key-value pair if there + // is one. + State Next(KeyValue* out = nullptr); - KeyValueBlobReader(const char* blob, size_t size) - : current_(blob), end_(blob + size) {} +private: + State SkipUntil(const std::function& f); + State SkipUntilNot(const std::function& f); - // Parse the next key-value pair and return the state of the reader. If `out` - // is passed, out will be set to the produced key-value pair if there is one. - State Next(KeyValue* out = nullptr); - - private: - State SkipUntil(const std::function& f); - State SkipUntilNot(const std::function& f); - - const char* current_; - const char* end_; - State state_ = State::CAN_CONTINUE; + const char* current_; + const char* end_; + State state_ = State::CAN_CONTINUE; }; std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&); -} // namespace McBopomofo +} // namespace McBopomofo -#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ +#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_ diff --git a/Source/Engine/KeyValueBlobReaderTest.cpp b/Source/Engine/KeyValueBlobReaderTest.cpp index 581e95bd..44718c9c 100644 --- a/Source/Engine/KeyValueBlobReaderTest.cpp +++ b/Source/Engine/KeyValueBlobReaderTest.cpp @@ -21,9 +21,9 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. 
-#include "KeyValueBlobReader.h" - #include + +#include "KeyValueBlobReader.h" #include "gtest/gtest.h" namespace McBopomofo { @@ -31,156 +31,175 @@ namespace McBopomofo { using State = KeyValueBlobReader::State; using KeyValue = KeyValueBlobReader::KeyValue; -TEST(KeyValueBlobReaderTest, EmptyBlob) { - std::string empty; - KeyValueBlobReader reader(empty.c_str(), empty.length()); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, EmptyBlob) +{ + std::string empty; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) { - char empty[0]; - KeyValueBlobReader reader(empty, 0); - EXPECT_EQ(reader.Next(), State::END); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) +{ + char empty[0]; + KeyValueBlobReader reader(empty, 0); + EXPECT_EQ(reader.Next(), State::END); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, BlankBlob) { - std::string blank = " "; - KeyValueBlobReader reader(blank.c_str(), blank.length()); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, BlankBlob) +{ + std::string blank = " "; + KeyValueBlobReader reader(blank.c_str(), blank.length()); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) { - std::string empty = "hello"; - KeyValueBlobReader reader(empty.c_str(), empty.length()); - EXPECT_EQ(reader.Next(), State::ERROR); +TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) +{ + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); } -TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) { - std::string empty = "hello"; - KeyValueBlobReader reader(empty.c_str(), empty.length()); - EXPECT_EQ(reader.Next(), State::ERROR); - EXPECT_EQ(reader.Next(), State::ERROR); +TEST(KeyValueBlobReaderTest, 
ErrorStateMakesNoMoreProgress) +{ + std::string empty = "hello"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + EXPECT_EQ(reader.Next(), State::ERROR); + EXPECT_EQ(reader.Next(), State::ERROR); } -TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) { - char bad[] = {'h', 0, 'w'}; - KeyValueBlobReader reader(bad, sizeof(bad)); - EXPECT_EQ(reader.Next(), State::ERROR); +TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) +{ + char bad[] = { 'h', 0, 'w' }; + KeyValueBlobReader reader(bad, sizeof(bad)); + EXPECT_EQ(reader.Next(), State::ERROR); } -TEST(KeyValueBlobReaderTest, SingleKeyValuePair) { - std::string empty = "hello world\n"; - KeyValueBlobReader reader(empty.c_str(), empty.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, SingleKeyValuePair) +{ + std::string empty = "hello world\n"; + KeyValueBlobReader reader(empty.c_str(), empty.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) { - char small[] = {'p', ' ', 'q'}; - KeyValueBlobReader reader(small, sizeof(small)); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) +{ + char small[] = { 'p', ' ', 'q' }; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "p", "q" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) { - char small[] = {'p', ' ', 'q', ' 
', 0, '\n', 'r', ' ', 's'}; - KeyValueBlobReader reader(small, sizeof(small)); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) +{ + char small[] = { 'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's' }; + KeyValueBlobReader reader(small, sizeof(small)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "p", "q" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) { - std::string simple = "hello world"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) +{ + std::string simple = "hello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, EncodingAgnostic1) { - std::string simple = u8"smile ☺️"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, EncodingAgnostic1) +{ + std::string simple = u8"smile ☺️"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "smile", u8"☺️" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, EncodingAgnostic2) { - std::string simple = "Nobel-Laureate " - 
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, ( - KeyValue{"Nobel-Laureate", - "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, EncodingAgnostic2) +{ + std::string simple = "Nobel-Laureate " + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, + (KeyValue { "Nobel-Laureate", + "\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) { - std::string simple = "hello world and all\nanother value"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"another", "value"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) +{ + std::string simple = "hello world and all\nanother value"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "another", "value" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) { - std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, 
(KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) +{ + std::string simple + = "\thello world \n\n foo bar \t\t\t \n\n\n"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) { - std::string simple = "lorem ipsum\r\nhello world"; - KeyValueBlobReader reader(simple.c_str(), simple.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) +{ + std::string simple = "lorem ipsum\r\nhello world"; + KeyValueBlobReader reader(simple.c_str(), simple.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "lorem", "ipsum" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) { - std::string multi = "\n \nhello world\n foo \t bar "; - KeyValueBlobReader reader(multi.c_str(), multi.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); - EXPECT_EQ(reader.Next(), State::END); 
+TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) +{ + std::string multi = "\n \nhello world\n foo \t bar "; + KeyValueBlobReader reader(multi.c_str(), multi.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, ReadUntilNullChar) { - char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'}; - KeyValueBlobReader reader(buf, sizeof(buf)); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"p", "q"})); - EXPECT_EQ(reader.Next(), State::END); +TEST(KeyValueBlobReaderTest, ReadUntilNullChar) +{ + char buf[] = { 'p', '\t', 'q', '\n', 0, 'r', ' ', 's' }; + KeyValueBlobReader reader(buf, sizeof(buf)); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "p", "q" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) { - std::string text = R"( +TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) +{ + std::string text = R"( # comment1 # comment2 @@ -195,19 +214,20 @@ TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) { # comment5 )"; - KeyValueBlobReader reader(text.c_str(), text.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "World"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); - EXPECT_EQ(reader.Next(), State::END); + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { 
"hello", "World" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" })); + EXPECT_EQ(reader.Next(), State::END); } -TEST(KeyValueBlobReaderTest, ValueCommentSupported) { - std::string text = R"( +TEST(KeyValueBlobReaderTest, ValueCommentSupported) +{ + std::string text = R"( # empty hello world#peace @@ -217,19 +237,19 @@ hello world#peace // peace foo bar )"; - KeyValueBlobReader reader(text.c_str(), text.length()); - KeyValue keyValue; - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"})); - EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); - EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"})); - EXPECT_EQ(reader.Next(), State::END); + KeyValueBlobReader reader(text.c_str(), text.length()); + KeyValue keyValue; + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" })); + EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR); + EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" })); + EXPECT_EQ(reader.Next(), State::END); } -} // namespace McBopomofo +} // namespace McBopomofo diff --git a/Source/Engine/McBopomofoLM.h 
b/Source/Engine/McBopomofoLM.h
index 46e28e2a..00babc01 100644
--- a/Source/Engine/McBopomofoLM.h
+++ b/Source/Engine/McBopomofoLM.h
@@ -25,8 +25,8 @@
 #define MCBOPOMOFOLM_H
 
 #include
-#include "FastLM.h"
 #include "UserPhrasesLM.h"
+#include "ParselessLM.h"
 #include "PhraseReplacementMap.h"
 
 namespace McBopomofo {
@@ -51,7 +51,7 @@ public:
     bool phraseReplacementEnabled();
 
 protected:
-    FastLM m_languageModel;
+    ParselessLM m_languageModel;
     UserPhrasesLM m_userPhrases;
     UserPhrasesLM m_excludedPhrases;
     PhraseReplacementMap m_phraseReplacement;
diff --git a/Source/Engine/ParselessLM.cpp b/Source/Engine/ParselessLM.cpp
new file mode 100644
index 00000000..c0de5c00
--- /dev/null
+++ b/Source/Engine/ParselessLM.cpp
@@ -0,0 +1,167 @@
+// Copyright (c) 2022 and onwards The McBopomofo Authors.
+//
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "ParselessLM.h"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+
+// Releases the mapping and file descriptor, if any, via close().
+McBopomofo::ParselessLM::~ParselessLM() { close(); }
+
+// Opens the database file at `path` and maps it read-only into memory.
+//
+// Returns false, holding no file descriptor or mapping, when a database is
+// already open or when the file cannot be opened, stat'ed or mapped. Returns
+// true once the mapping has been wrapped in a ParselessPhraseDB.
+bool McBopomofo::ParselessLM::open(const std::string_view& path)
+{
+    if (data_) {
+        return false;
+    }
+
+    // A string_view need not be NUL-terminated, but ::open() requires a C
+    // string, so copy the path into an owned std::string first.
+    const std::string pathString(path);
+    fd_ = ::open(pathString.c_str(), O_RDONLY);
+    if (fd_ == -1) {
+        return false;
+    }
+
+    struct stat sb;
+    if (fstat(fd_, &sb) == -1) {
+        ::close(fd_);
+        fd_ = -1;
+        return false;
+    }
+
+    length_ = static_cast<size_t>(sb.st_size);
+
+    // mmap() signals failure with MAP_FAILED, not with a null pointer. An
+    // empty file fails here too, since a zero-length mapping is rejected.
+    data_ = mmap(nullptr, length_, PROT_READ, MAP_SHARED, fd_, 0);
+    if (data_ == MAP_FAILED) {
+        ::close(fd_);
+        fd_ = -1;
+        length_ = 0;
+        // Reset to null so that a later open() or close() does not mistake
+        // MAP_FAILED for a live mapping.
+        data_ = nullptr;
+        return false;
+    }
+
+    db_ = std::unique_ptr<ParselessPhraseDB>(new ParselessPhraseDB(
+        static_cast<char*>(data_), length_, /*validate_pragma=*/
+        true));
+    return true;
+}
+
+// Tears down an open database; does nothing when no database is open.
+void McBopomofo::ParselessLM::close()
+{
+    if (data_ != nullptr) {
+        // Destroy the database before unmapping: it was constructed over the
+        // mapped bytes, so it must not outlive them.
+        db_.reset();
+        munmap(data_, length_);
+        ::close(fd_);
+        fd_ = -1;
+        length_ = 0;
+        data_ = nullptr;
+    }
+}
+
+// Always returns an empty vector; both arguments are ignored.
+const std::vector<Formosa::Gramambular::Bigram>
+McBopomofo::ParselessLM::bigramsForKeys(
+    const std::string& preceedingKey, const std::string& key)
+{
+    return std::vector<Formosa::Gramambular::Bigram>();
+}
+
+// Returns one unigram per row that db_->findRows() yields for `key` followed
+// by a space. Each row is read as "key value score", with single spaces as
+// separators. Returns an empty vector when no database is open.
+const std::vector<Formosa::Gramambular::Unigram>
+McBopomofo::ParselessLM::unigramsForKey(const std::string& key)
+{
+    if (db_ == nullptr) {
+        return std::vector<Formosa::Gramambular::Unigram>();
+    }
+
+    std::vector<Formosa::Gramambular::Unigram> results;
+    for (const auto& row : db_->findRows(key + " ")) {
+        Formosa::Gramambular::Unigram unigram;
+
+        // Move ahead until we encounter the first space. This is the key.
+        auto it = row.begin();
+        while (it != row.end() && *it != ' ') {
+            ++it;
+        }
+
+        unigram.keyValue.key = std::string(row.begin(), it);
+
+        // Read past the space.
+        if (it != row.end()) {
+            ++it;
+        }
+
+        if (it != row.end()) {
+            // Now it is the start of the value portion.
+            auto value_begin = it;
+
+            // Move ahead until we encounter the second space. This is the
+            // value.
+            while (it != row.end() && *it != ' ') {
+                ++it;
+            }
+            unigram.keyValue.value = std::string(value_begin, it);
+        }
+
+        // Read past the space. The remainder, if it exists, is the score.
+        if (it != row.end()) {
+            ++it;
+        }
+
+        if (it != row.end()) {
+            // NOTE(review): std::stod throws on a non-numeric score; rows are
+            // presumably always well-formed -- confirm against the data file.
+            unigram.score = std::stod(std::string(it, row.end()));
+        }
+        results.push_back(unigram);
+    }
+    return results;
+}
+
+// Returns true when a database is open and findFirstMatchingLine() finds a
+// line for `key` followed by a space.
+bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key)
+{
+    if (db_ == nullptr) {
+        return false;
+    }
+
+    return db_->findFirstMatchingLine(key + " ") != nullptr;
+}
diff --git a/Source/Engine/ParselessLM.h b/Source/Engine/ParselessLM.h
new file mode 100644
index 00000000..8d2c0b88
--- /dev/null
+++ b/Source/Engine/ParselessLM.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2022 and onwards The McBopomofo Authors.
+//
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef SOURCE_ENGINE_PARSELESSLM_H_ +#define SOURCE_ENGINE_PARSELESSLM_H_ + +#include +#include +#include + +#include "LanguageModel.h" +#include "ParselessPhraseDB.h" + +namespace McBopomofo { + +class ParselessLM : public Formosa::Gramambular::LanguageModel { +public: + ~ParselessLM() override; + + bool open(const std::string_view& path); + void close(); + + const std::vector bigramsForKeys( + const std::string& preceedingKey, const std::string& key) override; + const std::vector unigramsForKey( + const std::string& key) override; + bool hasUnigramsForKey(const std::string& key) override; + +private: + int fd_ = -1; + void* data_ = nullptr; + size_t length_ = 0; + std::unique_ptr db_; +}; + +}; // namespace McBopomofo + +#endif // SOURCE_ENGINE_PARSELESSLM_H_ diff --git a/Source/Engine/ParselessLMBenchmark.cpp b/Source/Engine/ParselessLMBenchmark.cpp new file mode 100644 index 00000000..d47b27ea --- /dev/null +++ b/Source/Engine/ParselessLMBenchmark.cpp @@ -0,0 +1,89 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include +#include + +#include "FastLM.h" +#include "ParselessLM.h" + +namespace { + +using FastLM = Formosa::Gramambular::FastLM; +using ParselessLM = McBopomofo::ParselessLM; + +static const char* kDataPath = "data.txt"; +static const char* kLegacyDataPath = "data-legacy.txt"; +static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ"; + +static void BM_ParselessLMOpenClose(benchmark::State& state) +{ + assert(std::filesystem::exists(kDataPath)); + for (auto _ : state) { + ParselessLM lm; + lm.open(kDataPath); + lm.close(); + } +} +BENCHMARK(BM_ParselessLMOpenClose); + +static void BM_FastLMOpenClose(benchmark::State& state) +{ + assert(std::filesystem::exists(kLegacyDataPath)); + for (auto _ : state) { + FastLM lm; + lm.open(kLegacyDataPath); + lm.close(); + } +} +BENCHMARK(BM_FastLMOpenClose); + +static void BM_ParselessLMFindUnigrams(benchmark::State& state) +{ + assert(std::filesystem::exists(kDataPath)); + ParselessLM lm; + lm.open(kDataPath); + for (auto _ : state) { + lm.unigramsForKey(kUnigramSearchKey); + } + lm.close(); +} +BENCHMARK(BM_ParselessLMFindUnigrams); + +static void BM_FastLMFindUnigrams(benchmark::State& state) +{ + assert(std::filesystem::exists(kLegacyDataPath)); + FastLM lm; + lm.open(kLegacyDataPath); + for (auto _ : state) { + lm.unigramsForKey(kUnigramSearchKey); + } + lm.close(); +} +BENCHMARK(BM_FastLMFindUnigrams); + +}; // namespace + +BENCHMARK_MAIN(); diff --git a/Source/Engine/ParselessLMTest.cpp b/Source/Engine/ParselessLMTest.cpp new file mode 100644 index 00000000..e43bbc82 --- /dev/null +++ b/Source/Engine/ParselessLMTest.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
+// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include +#include + +#include "ParselessLM.h" +#include "gtest/gtest.h" + +namespace McBopomofo { + +TEST(ParselessLMTest, SanityCheckTest) +{ + constexpr const char* data_path = "data.txt"; + if (!std::filesystem::exists(data_path)) { + GTEST_SKIP(); + } + + ParselessLM lm; + bool status = lm.open(data_path); + ASSERT_TRUE(status); + + ASSERT_TRUE(lm.hasUnigramsForKey("ㄕ")); + ASSERT_TRUE(lm.hasUnigramsForKey("ㄕˋ-ㄕˊ")); + ASSERT_TRUE(lm.hasUnigramsForKey("_punctuation_list")); + + auto unigrams = lm.unigramsForKey("ㄕ"); + ASSERT_GT(unigrams.size(), 0); + + unigrams = lm.unigramsForKey("ㄕˋ-ㄕˊ"); + ASSERT_GT(unigrams.size(), 0); + + unigrams = lm.unigramsForKey("_punctuation_list"); + ASSERT_GT(unigrams.size(), 0); + + lm.close(); +} + +}; // namespace McBopomofo diff --git a/Source/Engine/ParselessPhraseDB.cpp b/Source/Engine/ParselessPhraseDB.cpp new file mode 100644 index 00000000..735cc983 --- /dev/null +++ b/Source/Engine/ParselessPhraseDB.cpp @@ -0,0 +1,166 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "ParselessPhraseDB.h" + +#include +#include + +namespace McBopomofo { + +ParselessPhraseDB::ParselessPhraseDB( + const char* buf, size_t length, bool validate_pragma) + : begin_(buf) + , end_(buf + length) +{ + assert(buf != nullptr); + assert(length > 0); + + if (validate_pragma) { + assert(length > SORTED_PRAGMA_HEADER.length()); + + std::string_view header(buf, SORTED_PRAGMA_HEADER.length()); + assert(header == SORTED_PRAGMA_HEADER); + + uint32_t x = 5381; + for (const auto& i : header) { + x = x * 33 + i; + } + + assert(x == uint32_t { 3012373384 }); + + begin_ += header.length(); + } +} + +std::vector ParselessPhraseDB::findRows( + const std::string_view& key) +{ + std::vector rows; + + const char* ptr = findFirstMatchingLine(key); + if (ptr == nullptr) { + return rows; + } + + while (ptr + key.length() <= end_ + && memcmp(ptr, key.data(), key.length()) == 0) { + const char* eol = ptr; + + while (eol != end_ && *eol != '\n') { + ++eol; + } + + rows.emplace_back(ptr, eol - ptr); + if (eol == end_) { + break; + } + + ptr = ++eol; + } + + return rows; +} + +// Implements a binary search that returns the pointer to the first matching +// row. In its core it's just a standard binary search, but we use backtracking +// to locate the line start. We also check the previous line to see if the +// current line is actually the first matching line: if the previous line is +// less to the key and the current line starts exactly with the key, then +// the current line is the first matching line. 
+const char* ParselessPhraseDB::findFirstMatchingLine( + const std::string_view& key) +{ + if (key.empty()) { + return begin_; + } + + const char* top = begin_; + const char* bottom = end_; + + while (top < bottom) { + const char* mid = top + (bottom - top) / 2; + const char* ptr = mid; + + if (ptr != begin_) { + --ptr; + } + + while (ptr != begin_ && *ptr != '\n') { + --ptr; + } + + const char* prev = nullptr; + if (*ptr == '\n') { + prev = ptr; + ++ptr; + } + + // ptr is now in the "current" line we're interested in. + if (ptr + key.length() > end_) { + // not enough data to compare at this point, bail. + break; + } + + int current_cmp = memcmp(ptr, key.data(), key.length()); + + if (current_cmp > 0) { + bottom = mid - 1; + continue; + } + + if (current_cmp < 0) { + top = mid + 1; + continue; + } + + if (!prev) { + return ptr; + } + + // Move the prev so that it reaches the previous line. + if (prev != begin_) { + --prev; + } + while (prev != begin_ && *prev != '\n') { + --prev; + } + if (*prev == '\n') { + ++prev; + } + + int prev_cmp = memcmp(prev, key.data(), key.length()); + + // This is the first occurrence. + if (prev_cmp < 0 && current_cmp == 0) { + return ptr; + } + + // This is not, which means ptr is "larger" than the keyData. + bottom = mid - 1; + } + + return nullptr; +} + +}; // namespace McBopomofo diff --git a/Source/Engine/ParselessPhraseDB.h b/Source/Engine/ParselessPhraseDB.h new file mode 100644 index 00000000..db5f29d2 --- /dev/null +++ b/Source/Engine/ParselessPhraseDB.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. 
#ifndef SOURCE_ENGINE_PARSELESSPHRASEDB_H_
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_

// NOTE(review): the three system header names were lost in this view; the
// ones below are what the declarations in this file require.
#include <cstddef>
#include <string_view>
#include <vector>

namespace McBopomofo {

// First line that a sorted phrase database must start with. Declared
// `inline` so that every translation unit including this header shares a
// single definition.
inline constexpr std::string_view SORTED_PRAGMA_HEADER
    = "# format org.openvanilla.mcbopomofo.sorted\n";

// Defines phrase database that consists of (key, value, score) rows that are
// pre-sorted by the byte value of the keys. It is way faster than FastLM
// because it does not need to parse anything. Instead, it relies on the fact
// that the database is already sorted, and binary search is used to find the
// rows.
class ParselessPhraseDB {
public:
    // Wraps `length` bytes starting at `buf`. When `validate_pragma` is true
    // the buffer is expected to start with SORTED_PRAGMA_HEADER.
    ParselessPhraseDB(
        const char* buf, size_t length, bool validate_pragma = false);

    // Find the rows that match the key. Note that prefix match is used. If you
    // need exact match, the key will need to have a delimiter (usually a space)
    // at the end.
    std::vector<std::string_view> findRows(const std::string_view& key);

    // Returns a pointer to the start of the first row that begins with `key`,
    // or nullptr when there is none.
    const char* findFirstMatchingLine(const std::string_view& key);

private:
    const char* begin_; // first byte that lookups consider
    const char* end_; // one past the last byte
};

} // namespace McBopomofo

#endif // SOURCE_ENGINE_PARSELESSPHRASEDB_H_
+ +#include +#include +#include +#include +#include + +#include "ParselessPhraseDB.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest.h" + +using StringViews = std::vector; + +namespace McBopomofo { + +static bool VectorsEqual( + const std::vector& a, const std::vector& b) +{ + if (a.size() != b.size()) { + return false; + } + + size_t s = a.size(); + for (size_t i = 0; i < s; i++) { + if (a[i] != b[i]) { + return false; + } + } + return true; +} + +TEST(ParselessPhraseDBTest, Simple) +{ + std::string data = "a 1"; + ParselessPhraseDB db(data.c_str(), data.length()); + + const char* first = db.findFirstMatchingLine("a"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "a 1", 3), 0); + + first = db.findFirstMatchingLine("a "); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "a 1", 3), 0); + + first = db.findFirstMatchingLine("a 1"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "a 1", 3), 0); +} + +TEST(ParselessPhraseDBTest, NotFound) +{ + std::string data = "a 1\na 2\na 3\nb 1"; + ParselessPhraseDB db(data.c_str(), data.length()); + EXPECT_EQ(db.findFirstMatchingLine("c"), nullptr); + EXPECT_EQ(db.findFirstMatchingLine("A"), nullptr); +} + +TEST(ParselessPhraseDBTest, FindRowsLongerExample) +{ + std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1"; + ParselessPhraseDB db(data.c_str(), data.length()); + + EXPECT_EQ(db.findRows("a"), (StringViews { "a 1", "a 2", "a 3" })); + EXPECT_EQ(db.findRows("b"), (StringViews { "b 42", "b 1", "b 2" })); + EXPECT_EQ(db.findRows("c"), (StringViews { "c 7" })); + EXPECT_EQ(db.findRows("d"), (StringViews { "d 1" })); + EXPECT_EQ(db.findRows("e"), (StringViews {})); + EXPECT_EQ(db.findRows("A"), (StringViews {})); +} + +TEST(ParselessPhraseDBTest, FindFirstMatchingLineLongerExample) +{ + std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1"; + ParselessPhraseDB db(data.c_str(), data.length()); + + const char* first = db.findFirstMatchingLine("a"); + EXPECT_NE(first, nullptr); + 
EXPECT_EQ(memcmp(first, "a 1", 3), 0); + + db.findFirstMatchingLine("a 1"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "a 1", 3), 0); + + first = db.findFirstMatchingLine("b"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "b 42", 4), 0); + + first = db.findFirstMatchingLine("c"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "c 7", 3), 0); + + first = db.findFirstMatchingLine("d"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "d 1", 3), 0); + + first = db.findFirstMatchingLine("d 1"); + EXPECT_NE(first, nullptr); + EXPECT_EQ(memcmp(first, "d 1", 3), 0); + + first = db.findFirstMatchingLine("d 2"); + EXPECT_EQ(first, nullptr); +} + +TEST(ParselessPhraseDBTest, InvalidConstructorArguments) +{ + EXPECT_DEATH((ParselessPhraseDB { nullptr, 1 }), "buf != nullptr"); + EXPECT_DEATH((ParselessPhraseDB { nullptr, 0 }), "buf != nullptr"); + EXPECT_DEATH((ParselessPhraseDB { "", 0 }), "length > 0"); + EXPECT_DEATH((ParselessPhraseDB { "a", 1, /*validate_pragma=*/true }), + "length > SORTED_PRAGMA_HEADER\\.length\\(\\)"); +} + +TEST(ParselessPhraseDBTest, PragmaGuard) +{ + std::string buf1 = std::string(SORTED_PRAGMA_HEADER) + "a"; + std::string buf2 = "#" + buf1; + std::string buf3 = buf1; + buf3[3] = 'x'; + + ParselessPhraseDB { buf1.c_str(), buf1.length(), /*validate_pragma=*/true }; + EXPECT_DEATH( + (ParselessPhraseDB { buf2.c_str(), buf2.length(), /*validate_pragma=*/ + true }), + "=="); + EXPECT_DEATH( + (ParselessPhraseDB { buf3.c_str(), buf3.length(), /*validate_pragma=*/ + true }), + "=="); +} + +TEST(ParselessPhraseDBTest, StressTest) +{ + constexpr const char* data_path = "data.txt"; + if (!std::filesystem::exists(data_path)) { + GTEST_SKIP(); + } + + FILE* f = fopen(data_path, "r"); + ASSERT_NE(f, nullptr); + int status = fseek(f, 0L, SEEK_END); + ASSERT_EQ(status, 0); + size_t length = ftell(f); + std::unique_ptr buf(new char[length]); + status = fseek(f, 0L, SEEK_SET); + ASSERT_EQ(status, 0); + size_t items_read = 
fread(buf.get(), length, 1, f); + ASSERT_EQ(items_read, 1); + fclose(f); + + std::stringstream sstr(std::string(buf.get(), length)); + std::string line; + std::map> key_to_lines; + + // Skip the pragma line. + std::getline(sstr, line); + + while (!sstr.eof()) { + std::getline(sstr, line); + if (line == "") { + continue; + } + + std::stringstream linest(line); + std::string key; + linest >> key; + key_to_lines[key].push_back(line); + } + + ParselessPhraseDB db(buf.get(), length, /*validate_pragma=*/true); + for (const auto& it : key_to_lines) { + std::vector rows = db.findRows(it.first + " "); + ASSERT_TRUE(VectorsEqual(rows, it.second)); + } +} + +}; // namespace McBopomofo diff --git a/Source/LanguageModelManager.h b/Source/LanguageModelManager.h index 455d9b18..6a82c47a 100644 --- a/Source/LanguageModelManager.h +++ b/Source/LanguageModelManager.h @@ -1,5 +1,4 @@ #import -#import "FastLM.h" #import "UserOverrideModel.h" #import "McBopomofoLM.h"