From c8f65580bb01e3484be5bf758fb88ffef2aab8cb Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Tue, 18 Jan 2022 15:52:02 -0800 Subject: [PATCH] Make UserPhrasesLM more tolerant This lets UserPhrasesLM consumes as much user data as possible before bailing. This makes it more tolerant to data errors and will not fail entirely just because the user has one faulty line in a data file. Also removes FastFM from the benchmarking suite. This also runs the CMake-based C++ tests as part of the GitHub CI. --- .../continuous-integration-workflow.yml | 7 ++- Source/Engine/CMakeLists.txt | 25 +++++--- Source/Engine/ParselessLMBenchmark.cpp | 26 --------- Source/Engine/UserPhrasesLM.cpp | 7 +-- Source/Engine/UserPhrasesLMTest.cpp | 57 +++++++++++++++++++ 5 files changed, 81 insertions(+), 41 deletions(-) create mode 100644 Source/Engine/UserPhrasesLMTest.cpp diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index b70b3abe..4b4304c5 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -9,6 +9,12 @@ jobs: DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer steps: - uses: actions/checkout@v1 + - name: Build McBopomofoLMLibTest + run: cmake -S . -B build + working-directory: Source/Engine + - name: Run McBopomofoLMLibTest + run: make runTest + working-directory: Source/Engine/build - name: Clean run: xcodebuild -scheme McBopomofo -configuration Release clean - name: Clean @@ -17,4 +23,3 @@ jobs: run: xcodebuild -scheme McBopomofo -configuration Release build - name: Build run: xcodebuild -scheme McBopomofoInstaller -configuration Release build - diff --git a/Source/Engine/CMakeLists.txt b/Source/Engine/CMakeLists.txt index 6e06f2ad..5778bacb 100644 --- a/Source/Engine/CMakeLists.txt +++ b/Source/Engine/CMakeLists.txt @@ -10,7 +10,9 @@ add_library(McBopomofoLMLib ParselessPhraseDB.cpp ParselessPhraseDB.h ParselessLM.cpp - ParselessLM.h) + ParselessLM.h + UserPhrasesLM.h + UserPhrasesLM.cpp) # Let CMake fetch Google Test for us. # https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project @@ -29,14 +31,21 @@ FetchContent_MakeAvailable(googletest) add_executable(McBopomofoLMLibTest KeyValueBlobReaderTest.cpp ParselessLMTest.cpp - ParselessPhraseDBTest.cpp) + ParselessPhraseDBTest.cpp + UserPhrasesLMTest.cpp) target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib) include(GoogleTest) gtest_discover_tests(McBopomofoLMLibTest) -# Benchmark target. -find_package(benchmark REQUIRED) -add_executable(ParselessLMBenchmark - FastLM.cpp - ParselessLMBenchmark.cpp) -target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark) \ No newline at end of file +add_custom_target( + runTest + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/McBopomofoLMLibTest +) +add_dependencies(runTest McBopomofoLMLibTest) + +# Benchmark target; to run, manually uncomment the lines below. +# +# find_package(benchmark) +# add_executable(ParselessLMBenchmark +# ParselessLMBenchmark.cpp) +# target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark) diff --git a/Source/Engine/ParselessLMBenchmark.cpp b/Source/Engine/ParselessLMBenchmark.cpp index d47b27ea..7664feaf 100644 --- a/Source/Engine/ParselessLMBenchmark.cpp +++ b/Source/Engine/ParselessLMBenchmark.cpp @@ -26,16 +26,13 @@ #include #include -#include "FastLM.h" #include "ParselessLM.h" namespace { -using FastLM = Formosa::Gramambular::FastLM; using ParselessLM = McBopomofo::ParselessLM; static const char* kDataPath = "data.txt"; -static const char* kLegacyDataPath = "data-legacy.txt"; static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ"; static void BM_ParselessLMOpenClose(benchmark::State& state) @@ -49,17 +46,6 @@ static void BM_ParselessLMOpenClose(benchmark::State& state) } BENCHMARK(BM_ParselessLMOpenClose); -static void BM_FastLMOpenClose(benchmark::State& state) -{ - assert(std::filesystem::exists(kLegacyDataPath)); - for (auto _ : state) { - FastLM lm; - lm.open(kLegacyDataPath); - lm.close(); - } -} -BENCHMARK(BM_FastLMOpenClose); - static void BM_ParselessLMFindUnigrams(benchmark::State& state) { assert(std::filesystem::exists(kDataPath)); @@ -72,18 +58,6 @@ static void BM_ParselessLMFindUnigrams(benchmark::State& state) } BENCHMARK(BM_ParselessLMFindUnigrams); -static void BM_FastLMFindUnigrams(benchmark::State& state) -{ - assert(std::filesystem::exists(kLegacyDataPath)); - FastLM lm; - lm.open(kLegacyDataPath); - for (auto _ : state) { - lm.unigramsForKey(kUnigramSearchKey); - } - lm.close(); -} -BENCHMARK(BM_FastLMFindUnigrams); - }; // namespace BENCHMARK_MAIN(); diff --git a/Source/Engine/UserPhrasesLM.cpp b/Source/Engine/UserPhrasesLM.cpp index cbb6b9cd..31f2f1e7 100644 --- a/Source/Engine/UserPhrasesLM.cpp +++ b/Source/Engine/UserPhrasesLM.cpp @@ -78,12 +78,7 @@ bool UserPhrasesLM::open(const char *path) KeyValueBlobReader::State state; while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) { // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading. - keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key ); - } - - if (state == KeyValueBlobReader::State::ERROR) { - close(); - return false; + keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key); } return true; } diff --git a/Source/Engine/UserPhrasesLMTest.cpp b/Source/Engine/UserPhrasesLMTest.cpp new file mode 100644 index 00000000..b923fd97 --- /dev/null +++ b/Source/Engine/UserPhrasesLMTest.cpp @@ -0,0 +1,57 @@ +// Copyright (c) 2022 and onwards The McBopomofo Authors. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include + +#include "UserPhrasesLM.h" +#include "gtest/gtest.h" + +namespace McBopomofo { + +TEST(UserPhreasesLMTest, LenientReading) +{ + std::string tmp_name + = std::string(std::filesystem::temp_directory_path()) + "test.txt"; + + FILE* f = fopen(tmp_name.c_str(), "w"); + ASSERT_NE(f, nullptr); + + fprintf(f, "bar foo\n"); + fprintf(f, "bar \n"); // error line + fprintf(f, "argh baz\n"); + int r = fclose(f); + ASSERT_EQ(r, 0); + + UserPhrasesLM lm; + lm.open(tmp_name.c_str()); + ASSERT_TRUE(lm.hasUnigramsForKey("foo")); + ASSERT_FALSE(lm.hasUnigramsForKey("bar")); + ASSERT_FALSE(lm.hasUnigramsForKey("baz")); + + r = remove(tmp_name.c_str()); + ASSERT_EQ(r, 0); +} + +} // namespace McBopomofo