Use a parseless phrase db to speed up LM loading
We take advantage of the fact that no one is able to modify the phrase databases shipped with the binary (guaranteed by macOS's integrity check for notarized apps), and we can simply pre-sort the phrases in the database files. With this change, we can speed up McBopomofo's language model loading during the app initialization by about 500-800x on a 2018 Intel MacBook Pro. The LM loading used to take 300-400 ms, but now it's done within a sub-millisecond range (0.5-0.6 ms). Microbenchmarking shows that ParselessLM is about 16000x faster than FastLM. We amortize the latency during the query time, and even by deferring the parsing, ParselessLM is only ~1.5x slower than FastLM, and both LM classes serve queries under 6 microseconds (that's 0.006 ms), which means the tradeoff only contributes to negligible overall latency. This PR requires some small changes to the phrase db cooking scripts. Python 3 is now used and the (value, reading, score) tuples are rearranged to (reading, value, score) and sorted by reading ("key"). A header is added to the phrase databases to call out the fact that these are pre-sorted. clang-format is used to apply WebKit C++ style to the new code. This also applies to KeyValueBlobReader that was added recently. Microbenchmark result below: ``` --------------------------------------------------------------------- Benchmark Time CPU Iterations --------------------------------------------------------------------- BM_ParselessLMOpenClose 17710 ns 17199 ns 33422 BM_FastLMOpenClose 376520248 ns 367526500 ns 2 BM_ParselessLMFindUnigrams 5967 ns 5899 ns 113729 BM_FastLMFindUnigrams 2268 ns 2265 ns 307038 ```
This commit is contained in:
parent
136ac34f22
commit
d064f420e4
|
@ -7,7 +7,6 @@
|
||||||
objects = {
|
objects = {
|
||||||
|
|
||||||
/* Begin PBXBuildFile section */
|
/* Begin PBXBuildFile section */
|
||||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A0421A615FEF3F50061ED63 /* FastLM.cpp */; };
|
|
||||||
6A0D4EA715FC0D2D00ABF4B3 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */; };
|
6A0D4EA715FC0D2D00ABF4B3 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */; };
|
||||||
6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */; };
|
6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */; };
|
||||||
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC815FC0D6400ABF4B3 /* main.m */; };
|
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC815FC0D6400ABF4B3 /* main.m */; };
|
||||||
|
@ -33,6 +32,8 @@
|
||||||
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
|
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
|
||||||
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
|
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
|
||||||
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; };
|
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; };
|
||||||
|
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */; };
|
||||||
|
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D422793701600F1B140 /* ParselessLM.cpp */; };
|
||||||
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
|
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
|
||||||
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
|
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
|
||||||
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
|
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
|
||||||
|
@ -74,8 +75,6 @@
|
||||||
/* End PBXContainerItemProxy section */
|
/* End PBXContainerItemProxy section */
|
||||||
|
|
||||||
/* Begin PBXFileReference section */
|
/* Begin PBXFileReference section */
|
||||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FastLM.cpp; sourceTree = "<group>"; };
|
|
||||||
6A0421A715FEF3F50061ED63 /* FastLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FastLM.h; sourceTree = "<group>"; };
|
|
||||||
6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofo.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofo.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; };
|
6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; };
|
||||||
6A0D4EA915FC0D2D00ABF4B3 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; };
|
6A0D4EA915FC0D2D00ABF4B3 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; };
|
||||||
|
@ -162,6 +161,10 @@
|
||||||
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
||||||
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
|
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
|
||||||
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
|
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
|
||||||
|
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessPhraseDB.cpp; sourceTree = "<group>"; };
|
||||||
|
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessPhraseDB.h; sourceTree = "<group>"; };
|
||||||
|
6ACC3D422793701600F1B140 /* ParselessLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessLM.cpp; sourceTree = "<group>"; };
|
||||||
|
6ACC3D432793701600F1B140 /* ParselessLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessLM.h; sourceTree = "<group>"; };
|
||||||
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
|
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
|
||||||
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
|
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
|
||||||
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
|
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
|
||||||
|
@ -289,12 +292,14 @@
|
||||||
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
|
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
|
||||||
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
|
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
|
||||||
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
||||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
|
||||||
6A0421A715FEF3F50061ED63 /* FastLM.h */,
|
|
||||||
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */,
|
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */,
|
||||||
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */,
|
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */,
|
||||||
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
||||||
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
||||||
|
6ACC3D422793701600F1B140 /* ParselessLM.cpp */,
|
||||||
|
6ACC3D432793701600F1B140 /* ParselessLM.h */,
|
||||||
|
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */,
|
||||||
|
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */,
|
||||||
D44FB74B2792189A003C80A6 /* PhraseReplacementMap.cpp */,
|
D44FB74B2792189A003C80A6 /* PhraseReplacementMap.cpp */,
|
||||||
D44FB74C2792189A003C80A6 /* PhraseReplacementMap.h */,
|
D44FB74C2792189A003C80A6 /* PhraseReplacementMap.h */,
|
||||||
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
||||||
|
@ -592,6 +597,7 @@
|
||||||
buildActionMask = 2147483647;
|
buildActionMask = 2147483647;
|
||||||
files = (
|
files = (
|
||||||
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */,
|
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */,
|
||||||
|
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */,
|
||||||
D44FB74727919D35003C80A6 /* EmacsKeyHelper.swift in Sources */,
|
D44FB74727919D35003C80A6 /* EmacsKeyHelper.swift in Sources */,
|
||||||
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
|
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
|
||||||
D44FB74D2792189A003C80A6 /* PhraseReplacementMap.cpp in Sources */,
|
D44FB74D2792189A003C80A6 /* PhraseReplacementMap.cpp in Sources */,
|
||||||
|
@ -602,9 +608,9 @@
|
||||||
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
|
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
|
||||||
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
||||||
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
||||||
|
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */,
|
||||||
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
||||||
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */,
|
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */,
|
||||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
|
|
||||||
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
||||||
);
|
);
|
||||||
runOnlyForDeploymentPostprocessing = 0;
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
|
|
@ -1,9 +1,16 @@
|
||||||
cmake_minimum_required(VERSION 3.17)
|
cmake_minimum_required(VERSION 3.17)
|
||||||
project(KeyValueBlobReader)
|
project(McBopomofoLMLib)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)
|
include_directories("Gramambular")
|
||||||
|
add_library(McBopomofoLMLib
|
||||||
|
KeyValueBlobReader.cpp
|
||||||
|
KeyValueBlobReader.h
|
||||||
|
ParselessPhraseDB.cpp
|
||||||
|
ParselessPhraseDB.h
|
||||||
|
ParselessLM.cpp
|
||||||
|
ParselessLM.h)
|
||||||
|
|
||||||
# Let CMake fetch Google Test for us.
|
# Let CMake fetch Google Test for us.
|
||||||
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
|
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
|
||||||
|
@ -19,6 +26,17 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
||||||
FetchContent_MakeAvailable(googletest)
|
FetchContent_MakeAvailable(googletest)
|
||||||
|
|
||||||
# Test target declarations.
|
# Test target declarations.
|
||||||
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
|
add_executable(McBopomofoLMLibTest
|
||||||
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
|
KeyValueBlobReaderTest.cpp
|
||||||
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)
|
ParselessLMTest.cpp
|
||||||
|
ParselessPhraseDBTest.cpp)
|
||||||
|
target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib)
|
||||||
|
include(GoogleTest)
|
||||||
|
gtest_discover_tests(McBopomofoLMLibTest)
|
||||||
|
|
||||||
|
# Benchmark target.
|
||||||
|
find_package(benchmark REQUIRED)
|
||||||
|
add_executable(ParselessLMBenchmark
|
||||||
|
FastLM.cpp
|
||||||
|
ParselessLMBenchmark.cpp)
|
||||||
|
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
|
|
@ -25,13 +25,13 @@
|
||||||
|
|
||||||
namespace McBopomofo {
|
namespace McBopomofo {
|
||||||
|
|
||||||
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
|
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out)
|
||||||
|
{
|
||||||
static auto new_line = [](char c) { return c == '\n' || c == '\r'; };
|
static auto new_line = [](char c) { return c == '\n' || c == '\r'; };
|
||||||
static auto blank = [](char c) { return c == ' ' || c == '\t'; };
|
static auto blank = [](char c) { return c == ' ' || c == '\t'; };
|
||||||
static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); };
|
static auto blank_or_newline
|
||||||
static auto content_char = [](char c) {
|
= [](char c) { return blank(c) || new_line(c); };
|
||||||
return !blank(c) && !new_line(c);
|
static auto content_char = [](char c) { return !blank(c) && !new_line(c); };
|
||||||
};
|
|
||||||
|
|
||||||
if (state_ == State::ERROR) {
|
if (state_ == State::ERROR) {
|
||||||
return state_;
|
return state_;
|
||||||
|
@ -95,21 +95,21 @@ KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
|
||||||
SkipUntil(new_line);
|
SkipUntil(new_line);
|
||||||
|
|
||||||
if (out != nullptr) {
|
if (out != nullptr) {
|
||||||
*out = KeyValue{
|
*out = KeyValue { std::string_view { key_begin, key_length },
|
||||||
std::string_view{key_begin, key_length},
|
std::string_view { value_begin, value_length } };
|
||||||
std::string_view{value_begin, value_length}};
|
|
||||||
}
|
}
|
||||||
state_ = State::HAS_PAIR;
|
state_ = State::HAS_PAIR;
|
||||||
return state_;
|
return state_;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
state_ = State::ERROR;
|
state_ = State::ERROR;
|
||||||
return State::ERROR;
|
return state_;
|
||||||
}
|
}
|
||||||
|
|
||||||
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
|
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
|
||||||
const std::function<bool(char)>& f) {
|
const std::function<bool(char)>& f)
|
||||||
while (current_ != end_ &&* current_) {
|
{
|
||||||
|
while (current_ != end_ && *current_) {
|
||||||
if (!f(*current_)) {
|
if (!f(*current_)) {
|
||||||
return State::CAN_CONTINUE;
|
return State::CAN_CONTINUE;
|
||||||
}
|
}
|
||||||
|
@ -120,8 +120,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
|
||||||
}
|
}
|
||||||
|
|
||||||
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
|
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
|
||||||
const std::function<bool(char)>& f) {
|
const std::function<bool(char)>& f)
|
||||||
while (current_ != end_ &&* current_) {
|
{
|
||||||
|
while (current_ != end_ && *current_) {
|
||||||
if (f(*current_)) {
|
if (f(*current_)) {
|
||||||
return State::CAN_CONTINUE;
|
return State::CAN_CONTINUE;
|
||||||
}
|
}
|
||||||
|
@ -131,8 +132,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
|
||||||
return State::END;
|
return State::END;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os,
|
std::ostream& operator<<(
|
||||||
const KeyValueBlobReader::KeyValue& kv) {
|
std::ostream& os, const KeyValueBlobReader::KeyValue& kv)
|
||||||
|
{
|
||||||
os << "(key: " << kv.key << ", value: " << kv.value << ")";
|
os << "(key: " << kv.key << ", value: " << kv.value << ")";
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
namespace McBopomofo {
|
namespace McBopomofo {
|
||||||
|
|
||||||
class KeyValueBlobReader {
|
class KeyValueBlobReader {
|
||||||
public:
|
public:
|
||||||
enum class State : int {
|
enum class State : int {
|
||||||
// There are no more key-value pairs in this blob.
|
// There are no more key-value pairs in this blob.
|
||||||
END = 0,
|
END = 0,
|
||||||
|
@ -59,11 +59,19 @@ class KeyValueBlobReader {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct KeyValue {
|
struct KeyValue {
|
||||||
constexpr KeyValue() : key(""), value("") {}
|
constexpr KeyValue()
|
||||||
|
: key("")
|
||||||
|
, value("")
|
||||||
|
{
|
||||||
|
}
|
||||||
constexpr KeyValue(std::string_view k, std::string_view v)
|
constexpr KeyValue(std::string_view k, std::string_view v)
|
||||||
: key(k), value(v) {}
|
: key(k)
|
||||||
|
, value(v)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
bool operator==(const KeyValue& another) const {
|
bool operator==(const KeyValue& another) const
|
||||||
|
{
|
||||||
return key == another.key && value == another.value;
|
return key == another.key && value == another.value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,13 +80,17 @@ class KeyValueBlobReader {
|
||||||
};
|
};
|
||||||
|
|
||||||
KeyValueBlobReader(const char* blob, size_t size)
|
KeyValueBlobReader(const char* blob, size_t size)
|
||||||
: current_(blob), end_(blob + size) {}
|
: current_(blob)
|
||||||
|
, end_(blob + size)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the next key-value pair and return the state of the reader. If `out`
|
// Parse the next key-value pair and return the state of the reader. If
|
||||||
// is passed, out will be set to the produced key-value pair if there is one.
|
// `out` is passed, out will be set to the produced key-value pair if there
|
||||||
|
// is one.
|
||||||
State Next(KeyValue* out = nullptr);
|
State Next(KeyValue* out = nullptr);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
State SkipUntil(const std::function<bool(char)>& f);
|
State SkipUntil(const std::function<bool(char)>& f);
|
||||||
State SkipUntilNot(const std::function<bool(char)>& f);
|
State SkipUntilNot(const std::function<bool(char)>& f);
|
||||||
|
|
||||||
|
|
|
@ -21,9 +21,9 @@
|
||||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
// OTHER DEALINGS IN THE SOFTWARE.
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#include "KeyValueBlobReader.h"
|
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "KeyValueBlobReader.h"
|
||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
namespace McBopomofo {
|
namespace McBopomofo {
|
||||||
|
@ -31,155 +31,174 @@ namespace McBopomofo {
|
||||||
using State = KeyValueBlobReader::State;
|
using State = KeyValueBlobReader::State;
|
||||||
using KeyValue = KeyValueBlobReader::KeyValue;
|
using KeyValue = KeyValueBlobReader::KeyValue;
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, EmptyBlob) {
|
TEST(KeyValueBlobReaderTest, EmptyBlob)
|
||||||
|
{
|
||||||
std::string empty;
|
std::string empty;
|
||||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) {
|
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency)
|
||||||
|
{
|
||||||
char empty[0];
|
char empty[0];
|
||||||
KeyValueBlobReader reader(empty, 0);
|
KeyValueBlobReader reader(empty, 0);
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, BlankBlob) {
|
TEST(KeyValueBlobReaderTest, BlankBlob)
|
||||||
|
{
|
||||||
std::string blank = " ";
|
std::string blank = " ";
|
||||||
KeyValueBlobReader reader(blank.c_str(), blank.length());
|
KeyValueBlobReader reader(blank.c_str(), blank.length());
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) {
|
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid)
|
||||||
|
{
|
||||||
std::string empty = "hello";
|
std::string empty = "hello";
|
||||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) {
|
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress)
|
||||||
|
{
|
||||||
std::string empty = "hello";
|
std::string empty = "hello";
|
||||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) {
|
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid)
|
||||||
char bad[] = {'h', 0, 'w'};
|
{
|
||||||
|
char bad[] = { 'h', 0, 'w' };
|
||||||
KeyValueBlobReader reader(bad, sizeof(bad));
|
KeyValueBlobReader reader(bad, sizeof(bad));
|
||||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePair) {
|
TEST(KeyValueBlobReaderTest, SingleKeyValuePair)
|
||||||
|
{
|
||||||
std::string empty = "hello world\n";
|
std::string empty = "hello world\n";
|
||||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) {
|
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding)
|
||||||
char small[] = {'p', ' ', 'q'};
|
{
|
||||||
|
char small[] = { 'p', ' ', 'q' };
|
||||||
KeyValueBlobReader reader(small, sizeof(small));
|
KeyValueBlobReader reader(small, sizeof(small));
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) {
|
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing)
|
||||||
char small[] = {'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's'};
|
{
|
||||||
|
char small[] = { 'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's' };
|
||||||
KeyValueBlobReader reader(small, sizeof(small));
|
KeyValueBlobReader reader(small, sizeof(small));
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) {
|
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd)
|
||||||
|
{
|
||||||
std::string simple = "hello world";
|
std::string simple = "hello world";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic1) {
|
TEST(KeyValueBlobReaderTest, EncodingAgnostic1)
|
||||||
|
{
|
||||||
std::string simple = u8"smile ☺️";
|
std::string simple = u8"smile ☺️";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"}));
|
EXPECT_EQ(keyValue, (KeyValue { "smile", u8"☺️" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic2) {
|
TEST(KeyValueBlobReaderTest, EncodingAgnostic2)
|
||||||
|
{
|
||||||
std::string simple = "Nobel-Laureate "
|
std::string simple = "Nobel-Laureate "
|
||||||
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b";
|
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (
|
EXPECT_EQ(keyValue,
|
||||||
KeyValue{"Nobel-Laureate",
|
(KeyValue { "Nobel-Laureate",
|
||||||
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"}));
|
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) {
|
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace)
|
||||||
|
{
|
||||||
std::string simple = "hello world and all\nanother value";
|
std::string simple = "hello world and all\nanother value";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"another", "value"}));
|
EXPECT_EQ(keyValue, (KeyValue { "another", "value" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) {
|
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored)
|
||||||
std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n";
|
{
|
||||||
|
std::string simple
|
||||||
|
= "\thello world \n\n foo bar \t\t\t \n\n\n";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) {
|
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported)
|
||||||
|
{
|
||||||
std::string simple = "lorem ipsum\r\nhello world";
|
std::string simple = "lorem ipsum\r\nhello world";
|
||||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"}));
|
EXPECT_EQ(keyValue, (KeyValue { "lorem", "ipsum" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) {
|
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair)
|
||||||
|
{
|
||||||
std::string multi = "\n \nhello world\n foo \t bar ";
|
std::string multi = "\n \nhello world\n foo \t bar ";
|
||||||
KeyValueBlobReader reader(multi.c_str(), multi.length());
|
KeyValueBlobReader reader(multi.c_str(), multi.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, ReadUntilNullChar) {
|
TEST(KeyValueBlobReaderTest, ReadUntilNullChar)
|
||||||
char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'};
|
{
|
||||||
|
char buf[] = { 'p', '\t', 'q', '\n', 0, 'r', ' ', 's' };
|
||||||
KeyValueBlobReader reader(buf, sizeof(buf));
|
KeyValueBlobReader reader(buf, sizeof(buf));
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
|
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments)
|
||||||
|
{
|
||||||
std::string text = R"(
|
std::string text = R"(
|
||||||
# comment1
|
# comment1
|
||||||
# comment2
|
# comment2
|
||||||
|
@ -198,15 +217,16 @@ TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
|
||||||
KeyValueBlobReader reader(text.c_str(), text.length());
|
KeyValueBlobReader reader(text.c_str(), text.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "World"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "World" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
|
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeyValueBlobReaderTest, ValueCommentSupported) {
|
TEST(KeyValueBlobReaderTest, ValueCommentSupported)
|
||||||
|
{
|
||||||
std::string text = R"(
|
std::string text = R"(
|
||||||
# empty
|
# empty
|
||||||
|
|
||||||
|
@ -220,15 +240,15 @@ hello world#peace // peace
|
||||||
KeyValueBlobReader reader(text.c_str(), text.length());
|
KeyValueBlobReader reader(text.c_str(), text.length());
|
||||||
KeyValue keyValue;
|
KeyValue keyValue;
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
|
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
|
||||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||||
EXPECT_EQ(reader.Next(), State::END);
|
EXPECT_EQ(reader.Next(), State::END);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,8 +25,8 @@
|
||||||
#define MCBOPOMOFOLM_H
|
#define MCBOPOMOFOLM_H
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "FastLM.h"
|
|
||||||
#include "UserPhrasesLM.h"
|
#include "UserPhrasesLM.h"
|
||||||
|
#include "ParselessLM.h"
|
||||||
#include "PhraseReplacementMap.h"
|
#include "PhraseReplacementMap.h"
|
||||||
|
|
||||||
namespace McBopomofo {
|
namespace McBopomofo {
|
||||||
|
@ -51,7 +51,7 @@ public:
|
||||||
bool phraseReplacementEnabled();
|
bool phraseReplacementEnabled();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
FastLM m_languageModel;
|
ParselessLM m_languageModel;
|
||||||
UserPhrasesLM m_userPhrases;
|
UserPhrasesLM m_userPhrases;
|
||||||
UserPhrasesLM m_excludedPhrases;
|
UserPhrasesLM m_excludedPhrases;
|
||||||
PhraseReplacementMap m_phraseReplacement;
|
PhraseReplacementMap m_phraseReplacement;
|
||||||
|
|
|
@ -0,0 +1,143 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include "ParselessLM.h"
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
// Releases whatever open() acquired by delegating to close().
McBopomofo::ParselessLM::~ParselessLM() { close(); }
|
||||||
|
|
||||||
|
bool McBopomofo::ParselessLM::open(const std::string_view& path)
|
||||||
|
{
|
||||||
|
if (data_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fd_ = ::open(path.data(), O_RDONLY);
|
||||||
|
if (fd_ == -1) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct stat sb;
|
||||||
|
if (fstat(fd_, &sb) == -1) {
|
||||||
|
::close(fd_);
|
||||||
|
fd_ = -1;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
length_ = static_cast<size_t>(sb.st_size);
|
||||||
|
|
||||||
|
data_ = mmap(NULL, length_, PROT_READ, MAP_SHARED, fd_, 0);
|
||||||
|
if (data_ == nullptr) {
|
||||||
|
::close(fd_);
|
||||||
|
fd_ = -1;
|
||||||
|
length_ = 0;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
db_ = std::unique_ptr<ParselessPhraseDB>(new ParselessPhraseDB(
|
||||||
|
static_cast<char*>(data_), length_, /*validate_pragme=*/
|
||||||
|
true));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void McBopomofo::ParselessLM::close()
|
||||||
|
{
|
||||||
|
if (data_ != nullptr) {
|
||||||
|
munmap(data_, length_);
|
||||||
|
::close(fd_);
|
||||||
|
fd_ = -1;
|
||||||
|
length_ = 0;
|
||||||
|
data_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<Formosa::Gramambular::Bigram>
|
||||||
|
McBopomofo::ParselessLM::bigramsForKeys(
|
||||||
|
const std::string& preceedingKey, const std::string& key)
|
||||||
|
{
|
||||||
|
return std::vector<Formosa::Gramambular::Bigram>();
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<Formosa::Gramambular::Unigram>
|
||||||
|
McBopomofo::ParselessLM::unigramsForKey(const std::string& key)
|
||||||
|
{
|
||||||
|
if (db_ == nullptr) {
|
||||||
|
return std::vector<Formosa::Gramambular::Unigram>();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Formosa::Gramambular::Unigram> results;
|
||||||
|
for (const auto& row : db_->findRows(key + " ")) {
|
||||||
|
Formosa::Gramambular::Unigram unigram;
|
||||||
|
|
||||||
|
// Move ahead until we encounter the first space. This is the key.
|
||||||
|
auto it = row.begin();
|
||||||
|
while (it != row.end() && *it != ' ') {
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
|
||||||
|
unigram.keyValue.key = std::string(row.begin(), it);
|
||||||
|
|
||||||
|
// Read past the space.
|
||||||
|
if (it != row.end()) {
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (it != row.end()) {
|
||||||
|
// Now it is the start of the value portion.
|
||||||
|
auto value_begin = it;
|
||||||
|
|
||||||
|
// Move ahead until we encounter the second space. This is the
|
||||||
|
// value.
|
||||||
|
while (it != row.end() && *it != ' ') {
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
unigram.keyValue.value = std::string(value_begin, it);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read past the space. The remainder, if it exists, is the score.
|
||||||
|
if (it != row.end()) {
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (it != row.end()) {
|
||||||
|
unigram.score = std::stod(std::string(it, row.end()));
|
||||||
|
}
|
||||||
|
results.push_back(unigram);
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key)
|
||||||
|
{
|
||||||
|
if (db_ == nullptr) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return db_->findFirstMatchingLine(key + " ") != nullptr;
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#ifndef SOURCE_ENGINE_PARSELESSLM_H_
|
||||||
|
#define SOURCE_ENGINE_PARSELESSLM_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "LanguageModel.h"
|
||||||
|
#include "ParselessPhraseDB.h"
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
class ParselessLM : public Formosa::Gramambular::LanguageModel {
|
||||||
|
public:
|
||||||
|
~ParselessLM() override;
|
||||||
|
|
||||||
|
bool open(const std::string_view& path);
|
||||||
|
void close();
|
||||||
|
|
||||||
|
const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(
|
||||||
|
const std::string& preceedingKey, const std::string& key) override;
|
||||||
|
const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(
|
||||||
|
const std::string& key) override;
|
||||||
|
bool hasUnigramsForKey(const std::string& key) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
int fd_ = -1;
|
||||||
|
void* data_ = nullptr;
|
||||||
|
size_t length_ = 0;
|
||||||
|
std::unique_ptr<ParselessPhraseDB> db_;
|
||||||
|
};
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
||||||
|
|
||||||
|
#endif // SOURCE_ENGINE_PARSELESSLM_H_
|
|
@ -0,0 +1,89 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <filesystem>
|
||||||
|
|
||||||
|
#include "FastLM.h"
|
||||||
|
#include "ParselessLM.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using FastLM = Formosa::Gramambular::FastLM;
|
||||||
|
using ParselessLM = McBopomofo::ParselessLM;
|
||||||
|
|
||||||
|
static const char* kDataPath = "data.txt";
|
||||||
|
static const char* kLegacyDataPath = "data-legacy.txt";
|
||||||
|
static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ";
|
||||||
|
|
||||||
|
static void BM_ParselessLMOpenClose(benchmark::State& state)
|
||||||
|
{
|
||||||
|
assert(std::filesystem::exists(kDataPath));
|
||||||
|
for (auto _ : state) {
|
||||||
|
ParselessLM lm;
|
||||||
|
lm.open(kDataPath);
|
||||||
|
lm.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BENCHMARK(BM_ParselessLMOpenClose);
|
||||||
|
|
||||||
|
static void BM_FastLMOpenClose(benchmark::State& state)
|
||||||
|
{
|
||||||
|
assert(std::filesystem::exists(kLegacyDataPath));
|
||||||
|
for (auto _ : state) {
|
||||||
|
FastLM lm;
|
||||||
|
lm.open(kLegacyDataPath);
|
||||||
|
lm.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BENCHMARK(BM_FastLMOpenClose);
|
||||||
|
|
||||||
|
static void BM_ParselessLMFindUnigrams(benchmark::State& state)
|
||||||
|
{
|
||||||
|
assert(std::filesystem::exists(kDataPath));
|
||||||
|
ParselessLM lm;
|
||||||
|
lm.open(kDataPath);
|
||||||
|
for (auto _ : state) {
|
||||||
|
lm.unigramsForKey(kUnigramSearchKey);
|
||||||
|
}
|
||||||
|
lm.close();
|
||||||
|
}
|
||||||
|
BENCHMARK(BM_ParselessLMFindUnigrams);
|
||||||
|
|
||||||
|
static void BM_FastLMFindUnigrams(benchmark::State& state)
|
||||||
|
{
|
||||||
|
assert(std::filesystem::exists(kLegacyDataPath));
|
||||||
|
FastLM lm;
|
||||||
|
lm.open(kLegacyDataPath);
|
||||||
|
for (auto _ : state) {
|
||||||
|
lm.unigramsForKey(kUnigramSearchKey);
|
||||||
|
}
|
||||||
|
lm.close();
|
||||||
|
}
|
||||||
|
BENCHMARK(BM_FastLMFindUnigrams);
|
||||||
|
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
BENCHMARK_MAIN();
|
|
@ -0,0 +1,59 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "ParselessLM.h"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// Smoke test against the real phrase database: a few well-known keys must be
// present and must yield at least one unigram. Skipped when data.txt is not
// in the working directory.
TEST(ParselessLMTest, SanityCheckTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    ParselessLM lm;
    ASSERT_TRUE(lm.open(data_path));

    for (const char* key : { "ㄕ", "ㄕˋ-ㄕˊ", "_punctuation_list" }) {
        ASSERT_TRUE(lm.hasUnigramsForKey(key)) << key;
        // empty() avoids comparing the unsigned size() with a signed literal.
        ASSERT_FALSE(lm.unigramsForKey(key).empty()) << key;
    }

    lm.close();
}
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
|
@ -0,0 +1,166 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include "ParselessPhraseDB.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// Wraps the byte range [buf, buf + length) as a phrase database. The buffer
// is not copied; only pointers into it are stored.
//
// When validate_pragma is true the buffer must start with
// SORTED_PRAGMA_HEADER, and lookups then begin right after that header. All
// preconditions are enforced with assert() only.
ParselessPhraseDB::ParselessPhraseDB(
    const char* buf, size_t length, bool validate_pragma)
    : begin_(buf)
    , end_(buf + length)
{
    assert(buf != nullptr);
    assert(length > 0);

    if (!validate_pragma) {
        return;
    }

    assert(length > SORTED_PRAGMA_HEADER.length());

    std::string_view header(buf, SORTED_PRAGMA_HEADER.length());
    assert(header == SORTED_PRAGMA_HEADER);

    // Second guard: a multiply-by-33 rolling hash of the header bytes must
    // equal the value hard-coded below.
    uint32_t checksum = 5381;
    for (const char c : header) {
        checksum = checksum * 33 + c;
    }
    assert(checksum == uint32_t { 3012373384 });

    // Skip the header so that searches only ever see data rows.
    begin_ += header.length();
}
|
||||||
|
|
||||||
|
std::vector<std::string_view> ParselessPhraseDB::findRows(
|
||||||
|
const std::string_view& key)
|
||||||
|
{
|
||||||
|
std::vector<std::string_view> rows;
|
||||||
|
|
||||||
|
const char* ptr = findFirstMatchingLine(key);
|
||||||
|
if (ptr == nullptr) {
|
||||||
|
return rows;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (ptr + key.length() <= end_
|
||||||
|
&& memcmp(ptr, key.data(), key.length()) == 0) {
|
||||||
|
const char* eol = ptr;
|
||||||
|
|
||||||
|
while (eol != end_ && *eol != '\n') {
|
||||||
|
++eol;
|
||||||
|
}
|
||||||
|
|
||||||
|
rows.emplace_back(ptr, eol - ptr);
|
||||||
|
if (eol == end_) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ptr = ++eol;
|
||||||
|
}
|
||||||
|
|
||||||
|
return rows;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Implements a binary search that returns the pointer to the first matching
|
||||||
|
// row. In its core it's just a standard binary search, but we use backtracking
|
||||||
|
// to locate the line start. We also check the previous line to see if the
|
||||||
|
// current line is actually the first matching line: if the previous line is
|
||||||
|
// less to the key and the current line starts exactly with the key, then
|
||||||
|
// the current line is the first matching line.
|
||||||
|
const char* ParselessPhraseDB::findFirstMatchingLine(
|
||||||
|
const std::string_view& key)
|
||||||
|
{
|
||||||
|
if (key.empty()) {
|
||||||
|
return begin_;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* top = begin_;
|
||||||
|
const char* bottom = end_;
|
||||||
|
|
||||||
|
while (top < bottom) {
|
||||||
|
const char* mid = top + (bottom - top) / 2;
|
||||||
|
const char* ptr = mid;
|
||||||
|
|
||||||
|
if (ptr != begin_) {
|
||||||
|
--ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (ptr != begin_ && *ptr != '\n') {
|
||||||
|
--ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* prev = nullptr;
|
||||||
|
if (*ptr == '\n') {
|
||||||
|
prev = ptr;
|
||||||
|
++ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ptr is now in the "current" line we're interested in.
|
||||||
|
if (ptr + key.length() > end_) {
|
||||||
|
// not enough data to compare at this point, bail.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int current_cmp = memcmp(ptr, key.data(), key.length());
|
||||||
|
|
||||||
|
if (current_cmp > 0) {
|
||||||
|
bottom = mid - 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_cmp < 0) {
|
||||||
|
top = mid + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!prev) {
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move the prev so that it reaches the previous line.
|
||||||
|
if (prev != begin_) {
|
||||||
|
--prev;
|
||||||
|
}
|
||||||
|
while (prev != begin_ && *prev != '\n') {
|
||||||
|
--prev;
|
||||||
|
}
|
||||||
|
if (*prev == '\n') {
|
||||||
|
++prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
int prev_cmp = memcmp(prev, key.data(), key.length());
|
||||||
|
|
||||||
|
// This is the first occurrence.
|
||||||
|
if (prev_cmp < 0 && current_cmp == 0) {
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is not, which means ptr is "larger" than the keyData.
|
||||||
|
bottom = mid - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
|
@ -0,0 +1,60 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#ifndef SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||||
|
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// First line of every pre-sorted phrase database file. `inline` gives the
// constant a single definition shared by all translation units that include
// this header.
inline constexpr std::string_view SORTED_PRAGMA_HEADER
    = "# format org.openvanilla.mcbopomofo.sorted\n";
|
||||||
|
|
||||||
|
// A read-only phrase database over a text buffer of (key, value, score) rows,
// one row per line, pre-sorted by the byte value of the keys. Nothing is
// parsed up front: because the rows are already sorted, lookups are done with
// a binary search directly on the buffer, which is what makes it much faster
// to set up than FastLM.
//
// The buffer is not copied, so it has to stay valid for as long as the
// database, or any row view it returned, is in use.
class ParselessPhraseDB {
public:
    // Wraps `length` bytes starting at `buf`. With validate_pragma set, the
    // buffer must begin with SORTED_PRAGMA_HEADER.
    ParselessPhraseDB(const char* buf, size_t length, bool validate_pragma = false);

    // Returns the rows that start with `key`. This is a prefix match; for an
    // exact key match, end the key with the delimiter (usually a space).
    std::vector<std::string_view> findRows(const std::string_view& key);

    // Returns a pointer to the first row that starts with `key`, or nullptr
    // when no row does.
    const char* findFirstMatchingLine(const std::string_view& key);

private:
    const char* begin_; // first byte of the searchable rows
    const char* end_; // one past the last byte of the buffer
};
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
||||||
|
|
||||||
|
#endif // SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
|
@ -0,0 +1,198 @@
|
||||||
|
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <map>
|
||||||
|
#include <sstream>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "ParselessPhraseDB.h"
|
||||||
|
#include "gtest/gtest-death-test.h"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
using StringViews = std::vector<std::string_view>;
|
||||||
|
|
||||||
|
namespace McBopomofo {
|
||||||
|
|
||||||
|
// Element-wise comparison of a vector of string views with a vector of
// strings: true only when both have the same size and every pair of elements
// at the same index compares equal.
static bool VectorsEqual(
    const std::vector<std::string_view>& a, const std::vector<std::string>& b)
{
    const size_t count = a.size();
    if (count != b.size()) {
        return false;
    }

    // Advance while the elements agree; reaching the end means no mismatch.
    size_t index = 0;
    while (index < count && a[index] == b[index]) {
        ++index;
    }
    return index == count;
}
|
||||||
|
|
||||||
|
// A single-row database must be found through progressively longer prefixes
// of that row.
TEST(ParselessPhraseDBTest, Simple)
{
    std::string data = "a 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    // ASSERT_NE rather than EXPECT_NE: the memcmp that follows dereferences
    // the pointer, so the test has to stop when the lookup returns nullptr.
    const char* first = db.findFirstMatchingLine("a");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    first = db.findFirstMatchingLine("a ");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    first = db.findFirstMatchingLine("a 1");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);
}
|
||||||
|
|
||||||
|
// Keys sorting after every row ("c") or before every row ("A") have no match.
TEST(ParselessPhraseDBTest, NotFound)
{
    const std::string rows = "a 1\na 2\na 3\nb 1";
    ParselessPhraseDB db(rows.c_str(), rows.length());

    const char* afterLastRow = db.findFirstMatchingLine("c");
    EXPECT_EQ(afterLastRow, nullptr);

    const char* beforeFirstRow = db.findFirstMatchingLine("A");
    EXPECT_EQ(beforeFirstRow, nullptr);
}
|
||||||
|
|
||||||
|
// findRows must return all rows sharing a key prefix, in file order, and an
// empty result for keys that are not present.
TEST(ParselessPhraseDBTest, FindRowsLongerExample)
{
    const std::string rows = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(rows.c_str(), rows.length());

    // Keys with several rows.
    EXPECT_EQ(db.findRows("a"), (StringViews { "a 1", "a 2", "a 3" }));
    EXPECT_EQ(db.findRows("b"), (StringViews { "b 42", "b 1", "b 2" }));

    // Keys with exactly one row.
    EXPECT_EQ(db.findRows("c"), (StringViews { "c 7" }));
    EXPECT_EQ(db.findRows("d"), (StringViews { "d 1" }));

    // Keys that do not occur at all.
    EXPECT_EQ(db.findRows("e"), (StringViews {}));
    EXPECT_EQ(db.findRows("A"), (StringViews {}));
}
|
||||||
|
|
||||||
|
// findFirstMatchingLine must return the first row of each key group in a
// multi-row database, and nullptr for a row that is not present.
TEST(ParselessPhraseDBTest, FindFirstMatchingLineLongerExample)
{
    std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    // ASSERT_NE throughout: the memcmp after each lookup dereferences the
    // returned pointer.
    const char* first = db.findFirstMatchingLine("a");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    // The result has to be stored, otherwise the checks below would only
    // re-examine the pointer from the previous lookup.
    first = db.findFirstMatchingLine("a 1");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    first = db.findFirstMatchingLine("b");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "b 42", 4), 0);

    first = db.findFirstMatchingLine("c");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "c 7", 3), 0);

    first = db.findFirstMatchingLine("d");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 1");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 2");
    EXPECT_EQ(first, nullptr);
}
|
||||||
|
|
||||||
|
// Constructing the database from unusable arguments must terminate the
// process; each regex is matched against the output of the dying process.
TEST(ParselessPhraseDBTest, InvalidConstructorArguments)
{
    // Null buffer, with a non-zero and with a zero length.
    EXPECT_DEATH((ParselessPhraseDB(nullptr, 1)), "buf != nullptr");
    EXPECT_DEATH((ParselessPhraseDB(nullptr, 0)), "buf != nullptr");

    // Non-null buffer of length zero.
    EXPECT_DEATH((ParselessPhraseDB("", 0)), "length > 0");

    // Pragma validation requested on a one-byte buffer.
    EXPECT_DEATH(
        (ParselessPhraseDB("a", 1, /*validate_pragma=*/true)),
        "length > SORTED_PRAGMA_HEADER\\.length\\(\\)");
}
|
||||||
|
|
||||||
|
// With pragma validation enabled, a buffer that begins with the sorted-pragma
// header is accepted, while buffers whose beginning deviates from it must
// terminate the process.
TEST(ParselessPhraseDBTest, PragmaGuard)
{
    // Header followed by a single payload byte.
    const std::string withHeader = std::string(SORTED_PRAGMA_HEADER) + "a";

    // Same content shifted by one leading '#'.
    const std::string shifted = "#" + withHeader;

    // Same content with the character at index 3 overwritten.
    std::string altered = withHeader;
    altered[3] = 'x';

    // Must construct (and destruct) without dying.
    ParselessPhraseDB { withHeader.c_str(), withHeader.length(), /*validate_pragma=*/true };

    EXPECT_DEATH(
        (ParselessPhraseDB { shifted.c_str(), shifted.length(), /*validate_pragma=*/true }),
        "==");
    EXPECT_DEATH(
        (ParselessPhraseDB { altered.c_str(), altered.length(), /*validate_pragma=*/true }),
        "==");
}
|
||||||
|
|
||||||
|
// Cross-checks findRows() against a straightforward line-by-line parse of a
// real phrase database. The test is skipped when "data.txt" is not present in
// the working directory.
TEST(ParselessPhraseDBTest, StressTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    // The FILE handle is owned by a unique_ptr so it is closed even when one
    // of the ASSERT_* macros below returns from the test early. Binary mode
    // keeps the byte count reported by ftell() consistent with what fread()
    // delivers on platforms that translate line endings in text mode.
    std::unique_ptr<FILE, decltype(&fclose)> file(fopen(data_path, "rb"), &fclose);
    ASSERT_NE(file.get(), nullptr);

    ASSERT_EQ(fseek(file.get(), 0L, SEEK_END), 0);
    // ftell() reports errors as -1; check before converting to an unsigned
    // size so a failure cannot turn into an enormous allocation.
    const long file_size = ftell(file.get());
    ASSERT_GT(file_size, 0L);
    const size_t length = static_cast<size_t>(file_size);

    std::unique_ptr<char[]> buf(new char[length]);
    ASSERT_EQ(fseek(file.get(), 0L, SEEK_SET), 0);
    const size_t items_read = fread(buf.get(), length, 1, file.get());
    ASSERT_EQ(items_read, size_t { 1 });
    file.reset();

    std::stringstream sstr(std::string(buf.get(), length));
    std::string line;
    std::map<std::string, std::vector<std::string>> key_to_lines;

    // Skip the pragma line.
    std::getline(sstr, line);

    // Group every non-empty line under its first whitespace-delimited token.
    // Looping on getline()'s result avoids processing a stale or empty line
    // after the final read fails.
    while (std::getline(sstr, line)) {
        if (line.empty()) {
            continue;
        }

        std::stringstream linest(line);
        std::string key;
        linest >> key;
        key_to_lines[key].push_back(line);
    }

    ParselessPhraseDB db(buf.get(), length, /*validate_pragma=*/true);
    for (const auto& [key, lines] : key_to_lines) {
        // The trailing space is appended so the query is the key followed by
        // the separator rather than the bare key.
        std::vector<std::string_view> rows = db.findRows(key + " ");
        ASSERT_TRUE(VectorsEqual(rows, lines));
    }
}
|
||||||
|
|
||||||
|
}; // namespace McBopomofo
|
|
@ -1,5 +1,4 @@
|
||||||
#import <Foundation/Foundation.h>
|
#import <Foundation/Foundation.h>
|
||||||
#import "FastLM.h"
|
|
||||||
#import "UserOverrideModel.h"
|
#import "UserOverrideModel.h"
|
||||||
#import "McBopomofoLM.h"
|
#import "McBopomofoLM.h"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue