Make UserPhrasesLM more tolerant

This lets UserPhrasesLM consume as much user data as possible before
bailing. This makes it more tolerant of data errors, so loading no longer
fails entirely just because the user has one faulty line in a data file.

Also removes FastLM from the benchmarking suite.

This also runs the CMake-based C++ tests as part of the GitHub CI.
This commit is contained in:
Lukhnos Liu 2022-01-18 15:52:02 -08:00
parent 00f110f101
commit c8f65580bb
5 changed files with 81 additions and 41 deletions

View File

@ -9,6 +9,12 @@ jobs:
DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer
steps: steps:
- uses: actions/checkout@v1 - uses: actions/checkout@v1
- name: Build McBopomofoLMLibTest
run: cmake -S . -B build
working-directory: Source/Engine
- name: Run McBopomofoLMLibTest
run: make runTest
working-directory: Source/Engine/build
- name: Clean - name: Clean
run: xcodebuild -scheme McBopomofo -configuration Release clean run: xcodebuild -scheme McBopomofo -configuration Release clean
- name: Clean - name: Clean
@ -17,4 +23,3 @@ jobs:
run: xcodebuild -scheme McBopomofo -configuration Release build run: xcodebuild -scheme McBopomofo -configuration Release build
- name: Build - name: Build
run: xcodebuild -scheme McBopomofoInstaller -configuration Release build run: xcodebuild -scheme McBopomofoInstaller -configuration Release build

View File

@ -10,7 +10,9 @@ add_library(McBopomofoLMLib
ParselessPhraseDB.cpp ParselessPhraseDB.cpp
ParselessPhraseDB.h ParselessPhraseDB.h
ParselessLM.cpp ParselessLM.cpp
ParselessLM.h) ParselessLM.h
UserPhrasesLM.h
UserPhrasesLM.cpp)
# Let CMake fetch Google Test for us. # Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project # https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
@ -29,14 +31,21 @@ FetchContent_MakeAvailable(googletest)
add_executable(McBopomofoLMLibTest add_executable(McBopomofoLMLibTest
KeyValueBlobReaderTest.cpp KeyValueBlobReaderTest.cpp
ParselessLMTest.cpp ParselessLMTest.cpp
ParselessPhraseDBTest.cpp) ParselessPhraseDBTest.cpp
UserPhrasesLMTest.cpp)
target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib) target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib)
include(GoogleTest) include(GoogleTest)
gtest_discover_tests(McBopomofoLMLibTest) gtest_discover_tests(McBopomofoLMLibTest)
# Benchmark target. add_custom_target(
find_package(benchmark REQUIRED) runTest
add_executable(ParselessLMBenchmark COMMAND ${CMAKE_CURRENT_BINARY_DIR}/McBopomofoLMLibTest
FastLM.cpp )
ParselessLMBenchmark.cpp) add_dependencies(runTest McBopomofoLMLibTest)
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
# Benchmark target; to run, manually uncomment the lines below.
#
# find_package(benchmark)
# add_executable(ParselessLMBenchmark
# ParselessLMBenchmark.cpp)
# target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)

View File

@ -26,16 +26,13 @@
#include <cassert> #include <cassert>
#include <filesystem> #include <filesystem>
#include "FastLM.h"
#include "ParselessLM.h" #include "ParselessLM.h"
namespace { namespace {
using FastLM = Formosa::Gramambular::FastLM;
using ParselessLM = McBopomofo::ParselessLM; using ParselessLM = McBopomofo::ParselessLM;
static const char* kDataPath = "data.txt"; static const char* kDataPath = "data.txt";
static const char* kLegacyDataPath = "data-legacy.txt";
static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ"; static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ";
static void BM_ParselessLMOpenClose(benchmark::State& state) static void BM_ParselessLMOpenClose(benchmark::State& state)
@ -49,17 +46,6 @@ static void BM_ParselessLMOpenClose(benchmark::State& state)
} }
BENCHMARK(BM_ParselessLMOpenClose); BENCHMARK(BM_ParselessLMOpenClose);
static void BM_FastLMOpenClose(benchmark::State& state)
{
assert(std::filesystem::exists(kLegacyDataPath));
for (auto _ : state) {
FastLM lm;
lm.open(kLegacyDataPath);
lm.close();
}
}
BENCHMARK(BM_FastLMOpenClose);
static void BM_ParselessLMFindUnigrams(benchmark::State& state) static void BM_ParselessLMFindUnigrams(benchmark::State& state)
{ {
assert(std::filesystem::exists(kDataPath)); assert(std::filesystem::exists(kDataPath));
@ -72,18 +58,6 @@ static void BM_ParselessLMFindUnigrams(benchmark::State& state)
} }
BENCHMARK(BM_ParselessLMFindUnigrams); BENCHMARK(BM_ParselessLMFindUnigrams);
static void BM_FastLMFindUnigrams(benchmark::State& state)
{
assert(std::filesystem::exists(kLegacyDataPath));
FastLM lm;
lm.open(kLegacyDataPath);
for (auto _ : state) {
lm.unigramsForKey(kUnigramSearchKey);
}
lm.close();
}
BENCHMARK(BM_FastLMFindUnigrams);
}; // namespace }; // namespace
BENCHMARK_MAIN(); BENCHMARK_MAIN();

View File

@ -80,11 +80,6 @@ bool UserPhrasesLM::open(const char *path)
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading. // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key); keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key);
} }
if (state == KeyValueBlobReader::State::ERROR) {
close();
return false;
}
return true; return true;
} }

View File

@ -0,0 +1,57 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <cstdio>
#include <filesystem>
#include <string>
#include "UserPhrasesLM.h"
#include "gtest/gtest.h"
namespace McBopomofo {

// Verifies that UserPhrasesLM::open() is lenient about malformed input: the
// entries read before a faulty line must remain available instead of the
// whole file being rejected.
//
// The fixture has three lines: a valid pair, a line with a missing value
// (the error line), and another valid pair after it. User phrase files store
// "phrase reading", and the LM indexes by reading, so the lookup keys are the
// second column ("foo", "baz"), never the first ("bar", "argh").
TEST(UserPhreasesLMTest, LenientReading)
{
    // Build the path with operator/ so a separator is inserted when the temp
    // directory has no trailing slash (e.g. "/tmp"); plain string
    // concatenation would yield "/tmptest.txt" there.
    const std::string tmp_name
        = (std::filesystem::temp_directory_path() / "test.txt").string();

    FILE* f = fopen(tmp_name.c_str(), "w");
    ASSERT_NE(f, nullptr);

    fprintf(f, "bar foo\n");
    fprintf(f, "bar \n"); // error line
    fprintf(f, "argh baz\n");

    // From here on only non-fatal EXPECT_* checks are used, so the temp file
    // is always removed at the end even when an expectation fails.
    int r = fclose(f);
    EXPECT_EQ(r, 0);

    UserPhrasesLM lm;
    // A data error in the file must not make open() itself fail.
    EXPECT_TRUE(lm.open(tmp_name.c_str()));

    // The pair before the error line was consumed and is keyed by reading.
    EXPECT_TRUE(lm.hasUnigramsForKey("foo"));
    // "bar" is a phrase value, not a reading, so it is never a key.
    EXPECT_FALSE(lm.hasUnigramsForKey("bar"));
    // Reading stops at the error line, so the pair after it is not loaded.
    EXPECT_FALSE(lm.hasUnigramsForKey("baz"));

    r = remove(tmp_name.c_str());
    EXPECT_EQ(r, 0);
}

} // namespace McBopomofo