Make UserPhrasesLM more tolerant

This lets UserPhrasesLM consume as much user data as possible before
bailing. This makes it more tolerant to data errors and will not fail
entirely just because the user has one faulty line in a data file.

Also removes FastLM from the benchmarking suite.

This also runs the CMake-based C++ tests as part of the GitHub CI.
This commit is contained in:
Lukhnos Liu 2022-01-18 15:52:02 -08:00
parent 00f110f101
commit c8f65580bb
5 changed files with 81 additions and 41 deletions

View File

@ -9,6 +9,12 @@ jobs:
DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer
steps:
- uses: actions/checkout@v1
- name: Build McBopomofoLMLibTest
run: cmake -S . -B build
working-directory: Source/Engine
- name: Run McBopomofoLMLibTest
run: make runTest
working-directory: Source/Engine/build
- name: Clean
run: xcodebuild -scheme McBopomofo -configuration Release clean
- name: Clean
@ -17,4 +23,3 @@ jobs:
run: xcodebuild -scheme McBopomofo -configuration Release build
- name: Build
run: xcodebuild -scheme McBopomofoInstaller -configuration Release build

View File

@ -10,7 +10,9 @@ add_library(McBopomofoLMLib
ParselessPhraseDB.cpp
ParselessPhraseDB.h
ParselessLM.cpp
ParselessLM.h)
ParselessLM.h
UserPhrasesLM.h
UserPhrasesLM.cpp)
# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
@ -29,14 +31,21 @@ FetchContent_MakeAvailable(googletest)
add_executable(McBopomofoLMLibTest
KeyValueBlobReaderTest.cpp
ParselessLMTest.cpp
ParselessPhraseDBTest.cpp)
ParselessPhraseDBTest.cpp
UserPhrasesLMTest.cpp)
target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib)
include(GoogleTest)
gtest_discover_tests(McBopomofoLMLibTest)
# Benchmark target.
find_package(benchmark REQUIRED)
add_executable(ParselessLMBenchmark
FastLM.cpp
ParselessLMBenchmark.cpp)
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
# Convenience target: `make runTest` executes the McBopomofoLMLibTest binary
# directly from the build directory.
add_custom_target(
runTest
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/McBopomofoLMLibTest
)
# Make sure the test executable is built before runTest tries to run it.
add_dependencies(runTest McBopomofoLMLibTest)
# Benchmark target; to run, manually uncomment the lines below.
#
# find_package(benchmark)
# add_executable(ParselessLMBenchmark
# ParselessLMBenchmark.cpp)
# target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)

View File

@ -26,16 +26,13 @@
#include <cassert>
#include <filesystem>
#include "FastLM.h"
#include "ParselessLM.h"
namespace {
using FastLM = Formosa::Gramambular::FastLM;
using ParselessLM = McBopomofo::ParselessLM;
static const char* kDataPath = "data.txt";
static const char* kLegacyDataPath = "data-legacy.txt";
static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ";
static void BM_ParselessLMOpenClose(benchmark::State& state)
@ -49,17 +46,6 @@ static void BM_ParselessLMOpenClose(benchmark::State& state)
}
BENCHMARK(BM_ParselessLMOpenClose);
// Benchmarks one full open()/close() cycle of FastLM on the legacy data file.
//
// The data file must exist at kLegacyDataPath. The check is done at runtime
// rather than with assert() because benchmarks are normally built with
// NDEBUG, which compiles assert() out and would let a missing file be
// "measured" silently.
static void BM_FastLMOpenClose(benchmark::State& state)
{
    if (!std::filesystem::exists(kLegacyDataPath)) {
        state.SkipWithError("legacy data file not found");
        return;
    }

    for (auto _ : state) {
        FastLM lm;
        lm.open(kLegacyDataPath);
        lm.close();
    }
}
BENCHMARK(BM_FastLMOpenClose);
static void BM_ParselessLMFindUnigrams(benchmark::State& state)
{
assert(std::filesystem::exists(kDataPath));
@ -72,18 +58,6 @@ static void BM_ParselessLMFindUnigrams(benchmark::State& state)
}
BENCHMARK(BM_ParselessLMFindUnigrams);
// Benchmarks FastLM::unigramsForKey() for kUnigramSearchKey on an already
// opened legacy data file; open() and close() are outside the timed loop.
//
// The data file must exist at kLegacyDataPath. The check is done at runtime
// rather than with assert() because benchmarks are normally built with
// NDEBUG, which compiles assert() out.
static void BM_FastLMFindUnigrams(benchmark::State& state)
{
    if (!std::filesystem::exists(kLegacyDataPath)) {
        state.SkipWithError("legacy data file not found");
        return;
    }

    FastLM lm;
    lm.open(kLegacyDataPath);
    for (auto _ : state) {
        // Keep the result observable so the optimizer cannot discard the
        // lookup being measured.
        benchmark::DoNotOptimize(lm.unigramsForKey(kUnigramSearchKey));
    }
    lm.close();
}
BENCHMARK(BM_FastLMFindUnigrams);
}; // namespace
BENCHMARK_MAIN();

View File

@ -78,12 +78,7 @@ bool UserPhrasesLM::open(const char *path)
KeyValueBlobReader::State state;
while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key );
}
if (state == KeyValueBlobReader::State::ERROR) {
close();
return false;
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key);
}
return true;
}

View File

@ -0,0 +1,57 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <cstdio>
#include <filesystem>
#include <string>
#include "UserPhrasesLM.h"
#include "gtest/gtest.h"
namespace McBopomofo {

// Verifies that UserPhrasesLM::open() is lenient: pairs read before a
// malformed line are kept instead of the whole file being rejected.
TEST(UserPhreasesLMTest, LenientReading)
{
    // Join with operator/ so the result is a valid path whether or not the
    // temp directory ends with a separator (plain concatenation would yield
    // e.g. "/tmptest.txt"), and use string() for a portable conversion.
    const std::string tmp_name
        = (std::filesystem::temp_directory_path() / "test.txt").string();

    FILE* f = fopen(tmp_name.c_str(), "w");
    ASSERT_NE(f, nullptr);
    fprintf(f, "bar foo\n");
    fprintf(f, "bar \n"); // error line
    fprintf(f, "argh baz\n");
    int r = fclose(f);
    ASSERT_EQ(r, 0);

    UserPhrasesLM lm;
    // A faulty line must no longer make open() fail.
    EXPECT_TRUE(lm.open(tmp_name.c_str()));

    // User phrase lines are "<phrase> <reading>" and lookups are keyed by the
    // reading, so "foo" is found while the phrase text "bar" is not a key.
    ASSERT_TRUE(lm.hasUnigramsForKey("foo"));
    ASSERT_FALSE(lm.hasUnigramsForKey("bar"));
    // "baz" comes after the error line, where reading stops.
    ASSERT_FALSE(lm.hasUnigramsForKey("baz"));

    r = remove(tmp_name.c_str());
    ASSERT_EQ(r, 0);
}

} // namespace McBopomofo