Merge pull request #230 from lukhnos/parseless-lm

Use a parseless phrase db to speed up LM loading
This commit is contained in:
Weizhong Yang a.k.a zonble 2022-01-16 14:39:08 +08:00 committed by GitHub
commit 8584f5c4b3
14 changed files with 1117 additions and 287 deletions

View File

@ -7,7 +7,6 @@
objects = {
/* Begin PBXBuildFile section */
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A0421A615FEF3F50061ED63 /* FastLM.cpp */; };
6A0D4EA715FC0D2D00ABF4B3 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */; };
6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */; };
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC815FC0D6400ABF4B3 /* main.m */; };
@ -33,6 +32,8 @@
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; };
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */; };
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D422793701600F1B140 /* ParselessLM.cpp */; };
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
@ -74,8 +75,6 @@
/* End PBXContainerItemProxy section */
/* Begin PBXFileReference section */
6A0421A615FEF3F50061ED63 /* FastLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FastLM.cpp; sourceTree = "<group>"; };
6A0421A715FEF3F50061ED63 /* FastLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FastLM.h; sourceTree = "<group>"; };
6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofo.app; sourceTree = BUILT_PRODUCTS_DIR; };
6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; };
6A0D4EA915FC0D2D00ABF4B3 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; };
@ -162,6 +161,10 @@
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessPhraseDB.cpp; sourceTree = "<group>"; };
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessPhraseDB.h; sourceTree = "<group>"; };
6ACC3D422793701600F1B140 /* ParselessLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessLM.cpp; sourceTree = "<group>"; };
6ACC3D432793701600F1B140 /* ParselessLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessLM.h; sourceTree = "<group>"; };
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
@ -289,12 +292,14 @@
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
6A0421A715FEF3F50061ED63 /* FastLM.h */,
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */,
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */,
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
6ACC3D422793701600F1B140 /* ParselessLM.cpp */,
6ACC3D432793701600F1B140 /* ParselessLM.h */,
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */,
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */,
D44FB74B2792189A003C80A6 /* PhraseReplacementMap.cpp */,
D44FB74C2792189A003C80A6 /* PhraseReplacementMap.h */,
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
@ -592,6 +597,7 @@
buildActionMask = 2147483647;
files = (
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */,
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */,
D44FB74727919D35003C80A6 /* EmacsKeyHelper.swift in Sources */,
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
D44FB74D2792189A003C80A6 /* PhraseReplacementMap.cpp in Sources */,
@ -602,9 +608,9 @@
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */,
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */,
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;

View File

@ -1,9 +1,16 @@
cmake_minimum_required(VERSION 3.17)
project(KeyValueBlobReader)
project(McBopomofoLMLib)
set(CMAKE_CXX_STANDARD 17)
add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)
include_directories("Gramambular")
add_library(McBopomofoLMLib
KeyValueBlobReader.cpp
KeyValueBlobReader.h
ParselessPhraseDB.cpp
ParselessPhraseDB.h
ParselessLM.cpp
ParselessLM.h)
# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
@ -19,6 +26,17 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
# Test target declarations.
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)
add_executable(McBopomofoLMLibTest
KeyValueBlobReaderTest.cpp
ParselessLMTest.cpp
ParselessPhraseDBTest.cpp)
target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib)
include(GoogleTest)
gtest_discover_tests(McBopomofoLMLibTest)
# Benchmark target.
find_package(benchmark REQUIRED)
add_executable(ParselessLMBenchmark
FastLM.cpp
ParselessLMBenchmark.cpp)
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)

View File

@ -25,13 +25,13 @@
namespace McBopomofo {
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out)
{
static auto new_line = [](char c) { return c == '\n' || c == '\r'; };
static auto blank = [](char c) { return c == ' ' || c == '\t'; };
static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); };
static auto content_char = [](char c) {
return !blank(c) && !new_line(c);
};
static auto blank_or_newline
= [](char c) { return blank(c) || new_line(c); };
static auto content_char = [](char c) { return !blank(c) && !new_line(c); };
if (state_ == State::ERROR) {
return state_;
@ -95,21 +95,21 @@ KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
SkipUntil(new_line);
if (out != nullptr) {
*out = KeyValue{
std::string_view{key_begin, key_length},
std::string_view{value_begin, value_length}};
*out = KeyValue { std::string_view { key_begin, key_length },
std::string_view { value_begin, value_length } };
}
state_ = State::HAS_PAIR;
return state_;
error:
state_ = State::ERROR;
return State::ERROR;
return state_;
}
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
const std::function<bool(char)>& f) {
while (current_ != end_ &&* current_) {
const std::function<bool(char)>& f)
{
while (current_ != end_ && *current_) {
if (!f(*current_)) {
return State::CAN_CONTINUE;
}
@ -120,8 +120,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
}
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
const std::function<bool(char)>& f) {
while (current_ != end_ &&* current_) {
const std::function<bool(char)>& f)
{
while (current_ != end_ && *current_) {
if (f(*current_)) {
return State::CAN_CONTINUE;
}
@ -131,8 +132,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
return State::END;
}
std::ostream& operator<<(std::ostream& os,
const KeyValueBlobReader::KeyValue& kv) {
std::ostream& operator<<(
std::ostream& os, const KeyValueBlobReader::KeyValue& kv)
{
os << "(key: " << kv.key << ", value: " << kv.value << ")";
return os;
}

View File

@ -46,7 +46,7 @@
namespace McBopomofo {
class KeyValueBlobReader {
public:
public:
enum class State : int {
// There are no more key-value pairs in this blob.
END = 0,
@ -59,11 +59,19 @@ class KeyValueBlobReader {
};
struct KeyValue {
constexpr KeyValue() : key(""), value("") {}
constexpr KeyValue()
: key("")
, value("")
{
}
constexpr KeyValue(std::string_view k, std::string_view v)
: key(k), value(v) {}
: key(k)
, value(v)
{
}
bool operator==(const KeyValue& another) const {
bool operator==(const KeyValue& another) const
{
return key == another.key && value == another.value;
}
@ -72,13 +80,17 @@ class KeyValueBlobReader {
};
KeyValueBlobReader(const char* blob, size_t size)
: current_(blob), end_(blob + size) {}
: current_(blob)
, end_(blob + size)
{
}
// Parse the next key-value pair and return the state of the reader. If `out`
// is passed, out will be set to the produced key-value pair if there is one.
// Parse the next key-value pair and return the state of the reader. If
// `out` is passed, out will be set to the produced key-value pair if there
// is one.
State Next(KeyValue* out = nullptr);
private:
private:
State SkipUntil(const std::function<bool(char)>& f);
State SkipUntilNot(const std::function<bool(char)>& f);

View File

@ -21,9 +21,9 @@
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "KeyValueBlobReader.h"
#include <string>
#include "KeyValueBlobReader.h"
#include "gtest/gtest.h"
namespace McBopomofo {
@ -31,155 +31,174 @@ namespace McBopomofo {
using State = KeyValueBlobReader::State;
using KeyValue = KeyValueBlobReader::KeyValue;
TEST(KeyValueBlobReaderTest, EmptyBlob) {
TEST(KeyValueBlobReaderTest, EmptyBlob)
{
std::string empty;
KeyValueBlobReader reader(empty.c_str(), empty.length());
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) {
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency)
{
char empty[0];
KeyValueBlobReader reader(empty, 0);
EXPECT_EQ(reader.Next(), State::END);
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, BlankBlob) {
TEST(KeyValueBlobReaderTest, BlankBlob)
{
std::string blank = " ";
KeyValueBlobReader reader(blank.c_str(), blank.length());
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) {
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid)
{
std::string empty = "hello";
KeyValueBlobReader reader(empty.c_str(), empty.length());
EXPECT_EQ(reader.Next(), State::ERROR);
}
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) {
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress)
{
std::string empty = "hello";
KeyValueBlobReader reader(empty.c_str(), empty.length());
EXPECT_EQ(reader.Next(), State::ERROR);
EXPECT_EQ(reader.Next(), State::ERROR);
}
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) {
char bad[] = {'h', 0, 'w'};
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid)
{
char bad[] = { 'h', 0, 'w' };
KeyValueBlobReader reader(bad, sizeof(bad));
EXPECT_EQ(reader.Next(), State::ERROR);
}
TEST(KeyValueBlobReaderTest, SingleKeyValuePair) {
TEST(KeyValueBlobReaderTest, SingleKeyValuePair)
{
std::string empty = "hello world\n";
KeyValueBlobReader reader(empty.c_str(), empty.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) {
char small[] = {'p', ' ', 'q'};
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding)
{
char small[] = { 'p', ' ', 'q' };
KeyValueBlobReader reader(small, sizeof(small));
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) {
char small[] = {'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's'};
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing)
{
char small[] = { 'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's' };
KeyValueBlobReader reader(small, sizeof(small));
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) {
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd)
{
std::string simple = "hello world";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, EncodingAgnostic1) {
TEST(KeyValueBlobReaderTest, EncodingAgnostic1)
{
std::string simple = u8"smile ☺️";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"}));
EXPECT_EQ(keyValue, (KeyValue { "smile", u8"☺️" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, EncodingAgnostic2) {
TEST(KeyValueBlobReaderTest, EncodingAgnostic2)
{
std::string simple = "Nobel-Laureate "
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (
KeyValue{"Nobel-Laureate",
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"}));
EXPECT_EQ(keyValue,
(KeyValue { "Nobel-Laureate",
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) {
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace)
{
std::string simple = "hello world and all\nanother value";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"another", "value"}));
EXPECT_EQ(keyValue, (KeyValue { "another", "value" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) {
std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n";
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored)
{
std::string simple
= "\thello world \n\n foo bar \t\t\t \n\n\n";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) {
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported)
{
std::string simple = "lorem ipsum\r\nhello world";
KeyValueBlobReader reader(simple.c_str(), simple.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"}));
EXPECT_EQ(keyValue, (KeyValue { "lorem", "ipsum" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) {
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair)
{
std::string multi = "\n \nhello world\n foo \t bar ";
KeyValueBlobReader reader(multi.c_str(), multi.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, ReadUntilNullChar) {
char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'};
TEST(KeyValueBlobReaderTest, ReadUntilNullChar)
{
char buf[] = { 'p', '\t', 'q', '\n', 0, 'r', ' ', 's' };
KeyValueBlobReader reader(buf, sizeof(buf));
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments)
{
std::string text = R"(
# comment1
# comment2
@ -198,15 +217,16 @@ TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
KeyValueBlobReader reader(text.c_str(), text.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "World"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "World" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
EXPECT_EQ(reader.Next(), State::END);
}
TEST(KeyValueBlobReaderTest, ValueCommentSupported) {
TEST(KeyValueBlobReaderTest, ValueCommentSupported)
{
std::string text = R"(
# empty
@ -220,15 +240,15 @@ hello world#peace // peace
KeyValueBlobReader reader(text.c_str(), text.length());
KeyValue keyValue;
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
EXPECT_EQ(reader.Next(), State::END);
}

View File

@ -25,8 +25,8 @@
#define MCBOPOMOFOLM_H
#include <stdio.h>
#include "FastLM.h"
#include "UserPhrasesLM.h"
#include "ParselessLM.h"
#include "PhraseReplacementMap.h"
namespace McBopomofo {
@ -51,7 +51,7 @@ public:
bool phraseReplacementEnabled();
protected:
FastLM m_languageModel;
ParselessLM m_languageModel;
UserPhrasesLM m_userPhrases;
UserPhrasesLM m_excludedPhrases;
PhraseReplacementMap m_phraseReplacement;

View File

@ -0,0 +1,143 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "ParselessLM.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <memory>
// Tears the model down by delegating to close(), which releases whatever
// open() acquired.
McBopomofo::ParselessLM::~ParselessLM()
{
    close();
}
// Opens the phrase database file at `path`, maps it read-only into memory and
// builds the ParselessPhraseDB on top of the mapping.
//
// Returns true on success. Returns false if this instance already has a
// mapping, or if the file cannot be opened, stat'ed or mapped. When opening,
// stat'ing or mapping fails, the file descriptor is closed again and the
// members are reset so that open() may be retried.
bool McBopomofo::ParselessLM::open(const std::string_view& path)
{
    if (data_ != nullptr) {
        return false;
    }

    // A string_view is not guaranteed to be null-terminated, but ::open()
    // expects a C string, so take an owned, terminated copy first.
    const std::string terminatedPath(path);
    fd_ = ::open(terminatedPath.c_str(), O_RDONLY);
    if (fd_ == -1) {
        return false;
    }

    struct stat sb;
    if (fstat(fd_, &sb) == -1) {
        ::close(fd_);
        fd_ = -1;
        return false;
    }

    length_ = static_cast<size_t>(sb.st_size);

    // mmap() signals failure with MAP_FAILED, not with a null pointer. Only
    // publish the mapping into data_ once it is known to be valid, so that
    // data_ stays nullptr (the "closed" marker) on failure.
    void* mapping = mmap(nullptr, length_, PROT_READ, MAP_SHARED, fd_, 0);
    if (mapping == MAP_FAILED) {
        ::close(fd_);
        fd_ = -1;
        length_ = 0;
        return false;
    }
    data_ = mapping;

    db_ = std::make_unique<ParselessPhraseDB>(
        static_cast<char*>(data_), length_, /*validate_pragma=*/true);
    return true;
}
void McBopomofo::ParselessLM::close()
{
if (data_ != nullptr) {
munmap(data_, length_);
::close(fd_);
fd_ = -1;
length_ = 0;
data_ = nullptr;
}
}
// This model does not supply bigrams: both arguments are ignored and the
// result is always an empty vector.
const std::vector<Formosa::Gramambular::Bigram>
McBopomofo::ParselessLM::bigramsForKeys(
    const std::string& preceedingKey, const std::string& key)
{
    return {};
}
const std::vector<Formosa::Gramambular::Unigram>
McBopomofo::ParselessLM::unigramsForKey(const std::string& key)
{
if (db_ == nullptr) {
return std::vector<Formosa::Gramambular::Unigram>();
}
std::vector<Formosa::Gramambular::Unigram> results;
for (const auto& row : db_->findRows(key + " ")) {
Formosa::Gramambular::Unigram unigram;
// Move ahead until we encounter the first space. This is the key.
auto it = row.begin();
while (it != row.end() && *it != ' ') {
++it;
}
unigram.keyValue.key = std::string(row.begin(), it);
// Read past the space.
if (it != row.end()) {
++it;
}
if (it != row.end()) {
// Now it is the start of the value portion.
auto value_begin = it;
// Move ahead until we encounter the second space. This is the
// value.
while (it != row.end() && *it != ' ') {
++it;
}
unigram.keyValue.value = std::string(value_begin, it);
}
// Read past the space. The remainder, if it exists, is the score.
if (it != row.end()) {
++it;
}
if (it != row.end()) {
unigram.score = std::stod(std::string(it, row.end()));
}
results.push_back(unigram);
}
return results;
}
// Reports whether the database holds at least one row whose key field is
// `key`. Always false when no database is open.
bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key)
{
    if (!db_) {
        return false;
    }

    // Same "<key> " prefix that unigramsForKey() searches for.
    const std::string prefix = key + " ";
    const char* firstMatch = db_->findFirstMatchingLine(prefix);
    return firstMatch != nullptr;
}

View File

@ -0,0 +1,58 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#ifndef SOURCE_ENGINE_PARSELESSLM_H_
#define SOURCE_ENGINE_PARSELESSLM_H_
#include <memory>
#include <string>
#include <vector>
#include "LanguageModel.h"
#include "ParselessPhraseDB.h"
namespace McBopomofo {
// A Gramambular language model that answers unigram queries from a
// ParselessPhraseDB built over a read-only memory mapping of a data file.
class ParselessLM : public Formosa::Gramambular::LanguageModel {
public:
// Calls close().
~ParselessLM() override;
// Opens and maps the file at `path`. Returns true on success; returns false
// if a file is already open or the file cannot be opened or mapped.
bool open(const std::string_view& path);
// Unmaps the file and closes its descriptor. Does nothing if nothing is open.
void close();
// Always returns an empty vector; this model supplies no bigrams.
const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(
const std::string& preceedingKey, const std::string& key) override;
// Returns the unigrams parsed from every row whose key field equals `key`,
// or an empty vector if no database is open.
const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(
const std::string& key) override;
// Returns true if at least one row has `key` as its key field.
bool hasUnigramsForKey(const std::string& key) override;
private:
// Descriptor of the opened data file; -1 when no file is open.
int fd_ = -1;
// Start of the memory mapping; nullptr when no file is open.
void* data_ = nullptr;
// Size in bytes of the mapping, taken from the file's st_size.
size_t length_ = 0;
// Phrase database constructed over [data_, data_ + length_).
std::unique_ptr<ParselessPhraseDB> db_;
};
}; // namespace McBopomofo
#endif // SOURCE_ENGINE_PARSELESSLM_H_

View File

@ -0,0 +1,89 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <benchmark/benchmark.h>
#include <cassert>
#include <filesystem>
#include "FastLM.h"
#include "ParselessLM.h"
namespace {
// Short aliases for the two language model implementations being compared.
using FastLM = Formosa::Gramambular::FastLM;
using ParselessLM = McBopomofo::ParselessLM;
// Data file opened by the ParselessLM benchmarks (relative path).
static const char* kDataPath = "data.txt";
// Data file opened by the FastLM benchmarks (relative path).
static const char* kLegacyDataPath = "data-legacy.txt";
// Key looked up by both FindUnigrams benchmarks.
static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ";
// Measures one ParselessLM open()/close() cycle on kDataPath per iteration.
// The benchmark is skipped with an error when the data file does not exist.
static void BM_ParselessLMOpenClose(benchmark::State& state)
{
    // Checked at run time instead of with assert(): benchmarks are normally
    // built with NDEBUG, which compiles assert() away and would let a missing
    // file silently benchmark the failure path.
    if (!std::filesystem::exists(kDataPath)) {
        state.SkipWithError("ParselessLM data file not found");
        return;
    }

    for (auto _ : state) {
        ParselessLM lm;
        lm.open(kDataPath);
        lm.close();
    }
}
BENCHMARK(BM_ParselessLMOpenClose);
// Measures one FastLM open()/close() cycle on kLegacyDataPath per iteration.
// The benchmark is skipped with an error when the data file does not exist.
static void BM_FastLMOpenClose(benchmark::State& state)
{
    // Checked at run time instead of with assert(): benchmarks are normally
    // built with NDEBUG, which compiles assert() away and would let a missing
    // file silently benchmark the failure path.
    if (!std::filesystem::exists(kLegacyDataPath)) {
        state.SkipWithError("FastLM data file not found");
        return;
    }

    for (auto _ : state) {
        FastLM lm;
        lm.open(kLegacyDataPath);
        lm.close();
    }
}
BENCHMARK(BM_FastLMOpenClose);
// Measures ParselessLM::unigramsForKey() for kUnigramSearchKey on an already
// opened model. The benchmark is skipped with an error when the data file is
// missing or cannot be opened.
static void BM_ParselessLMFindUnigrams(benchmark::State& state)
{
    // Checked at run time instead of with assert(): benchmarks are normally
    // built with NDEBUG, which compiles assert() away.
    if (!std::filesystem::exists(kDataPath)) {
        state.SkipWithError("ParselessLM data file not found");
        return;
    }

    ParselessLM lm;
    if (!lm.open(kDataPath)) {
        // Without an open database every lookup returns immediately, which
        // would produce a meaningless timing.
        state.SkipWithError("failed to open ParselessLM data file");
        return;
    }

    for (auto _ : state) {
        auto unigrams = lm.unigramsForKey(kUnigramSearchKey);
        // Keep the result observable so the lookup cannot be optimized out.
        benchmark::DoNotOptimize(unigrams);
    }
    lm.close();
}
BENCHMARK(BM_ParselessLMFindUnigrams);
// Measures FastLM::unigramsForKey() for kUnigramSearchKey on an already
// opened model. The benchmark is skipped with an error when the data file
// does not exist.
static void BM_FastLMFindUnigrams(benchmark::State& state)
{
    // Checked at run time instead of with assert(): benchmarks are normally
    // built with NDEBUG, which compiles assert() away.
    if (!std::filesystem::exists(kLegacyDataPath)) {
        state.SkipWithError("FastLM data file not found");
        return;
    }

    FastLM lm;
    lm.open(kLegacyDataPath);

    for (auto _ : state) {
        auto unigrams = lm.unigramsForKey(kUnigramSearchKey);
        // Keep the result observable so the lookup cannot be optimized out.
        benchmark::DoNotOptimize(unigrams);
    }
    lm.close();
}
BENCHMARK(BM_FastLMFindUnigrams);
}; // namespace
BENCHMARK_MAIN();

View File

@ -0,0 +1,59 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <filesystem>
#include <iostream>
#include "ParselessLM.h"
#include "gtest/gtest.h"
namespace McBopomofo {
// Smoke test against a real data file: the model must open, and each probe
// key must both be reported as present and yield at least one unigram.
// Skipped when "data.txt" is not available.
TEST(ParselessLMTest, SanityCheckTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    ParselessLM lm;
    const bool opened = lm.open(data_path);
    ASSERT_TRUE(opened);

    const char* const probeKeys[] = { "", "ㄕˋ-ㄕˊ", "_punctuation_list" };

    // First pass: presence checks for every key.
    for (const char* probe : probeKeys) {
        ASSERT_TRUE(lm.hasUnigramsForKey(probe));
    }

    // Second pass: every key must produce a non-empty result.
    for (const char* probe : probeKeys) {
        const auto unigrams = lm.unigramsForKey(probe);
        ASSERT_GT(unigrams.size(), 0);
    }

    lm.close();
}
}; // namespace McBopomofo

View File

@ -0,0 +1,166 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "ParselessPhraseDB.h"
#include <cassert>
#include <cstring>
namespace McBopomofo {
// Wraps the bytes [buf, buf + length) without copying them; only the two
// boundary pointers are stored. All argument checks are done with assert().
//
// When validate_pragma is true, the buffer must begin with
// SORTED_PRAGMA_HEADER. The header is checked by direct comparison and again
// through a multiply-by-33 hash seeded with 5381, and is then skipped so that
// lookups start at the first data row.
ParselessPhraseDB::ParselessPhraseDB(
    const char* buf, size_t length, bool validate_pragma)
    : begin_(buf)
    , end_(buf + length)
{
    assert(buf != nullptr);
    assert(length > 0);

    if (!validate_pragma) {
        return;
    }

    assert(length > SORTED_PRAGMA_HEADER.length());
    std::string_view header(buf, SORTED_PRAGMA_HEADER.length());
    assert(header == SORTED_PRAGMA_HEADER);

    // Second, independent check of the header bytes.
    uint32_t x = 5381;
    for (size_t pos = 0; pos < header.length(); ++pos) {
        x = x * 33 + header[pos];
    }
    assert(x == uint32_t { 3012373384 });

    // Data rows start right after the pragma line.
    begin_ += header.length();
}
// Returns every consecutive row that starts with `key`, beginning at the row
// located by findFirstMatchingLine(). Each returned view points into the
// underlying buffer and excludes the trailing '\n'. The result is empty when
// no row matches.
std::vector<std::string_view> ParselessPhraseDB::findRows(
    const std::string_view& key)
{
    std::vector<std::string_view> rows;

    const char* line = findFirstMatchingLine(key);
    if (line == nullptr) {
        return rows;
    }

    const size_t key_len = key.length();
    for (;;) {
        // Stop once the remaining bytes cannot hold the key, or the row no
        // longer starts with it.
        if (line + key_len > end_ || memcmp(line, key.data(), key_len) != 0) {
            break;
        }

        // The row ends at the next '\n', or at the end of the buffer.
        const void* newline
            = memchr(line, '\n', static_cast<size_t>(end_ - line));
        const char* eol
            = newline != nullptr ? static_cast<const char*>(newline) : end_;

        rows.emplace_back(line, eol - line);

        if (eol == end_) {
            break;
        }
        line = eol + 1;
    }

    return rows;
}
// Binary-searches the buffer for the first line that starts with `key` and
// returns a pointer to the first byte of that line, or nullptr if no such line
// is found. An empty key returns begin_.
//
// The search runs over byte offsets, not line numbers: each probe lands on an
// arbitrary byte and is walked backwards to the start of the line it falls in.
// When the probed line starts with the key, the previous line is compared as
// well: if the previous line compares less than the key, the probed line is the
// first match; otherwise the search continues toward the start of the buffer.
//
// All comparisons are bytewise (memcmp over key.length() bytes), which relies
// on the rows being sorted by the byte value of their keys.
const char* ParselessPhraseDB::findFirstMatchingLine(
const std::string_view& key)
{
if (key.empty()) {
return begin_;
}
// top and bottom delimit the byte range that is still being searched.
const char* top = begin_;
const char* bottom = end_;
while (top < bottom) {
const char* mid = top + (bottom - top) / 2;
const char* ptr = mid;
// Step back one byte before scanning, so that a probe sitting exactly on the
// first byte of a line stops on the preceding '\n' and selects that line.
if (ptr != begin_) {
--ptr;
}
while (ptr != begin_ && *ptr != '\n') {
--ptr;
}
// If the scan stopped on a '\n', remember it (it terminates the previous
// line) and advance to the first byte of the current line. prev stays nullptr
// when the scan ended at begin_ on a non-newline byte, i.e. the current line
// is the first line of the buffer.
const char* prev = nullptr;
if (*ptr == '\n') {
prev = ptr;
++ptr;
}
// ptr is now at the start of the "current" line we're interested in.
if (ptr + key.length() > end_) {
// Fewer than key.length() bytes remain, so the prefix cannot be compared.
// Give up; the function then returns nullptr.
break;
}
int current_cmp = memcmp(ptr, key.data(), key.length());
if (current_cmp > 0) {
// The current line sorts after the key: continue in the lower part.
bottom = mid - 1;
continue;
}
if (current_cmp < 0) {
// The current line sorts before the key: continue in the upper part.
top = mid + 1;
continue;
}
// The current line starts with the key. With no previous line it must be the
// first match.
if (!prev) {
return ptr;
}
// Move prev from the newline that ends the previous line back to the first
// byte of that line.
if (prev != begin_) {
--prev;
}
while (prev != begin_ && *prev != '\n') {
--prev;
}
if (*prev == '\n') {
++prev;
}
// Compare key.length() bytes starting at the previous line, regardless of
// where that line ends.
int prev_cmp = memcmp(prev, key.data(), key.length());
// The previous line is less than the key, so this is the first occurrence.
if (prev_cmp < 0 && current_cmp == 0) {
return ptr;
}
// The previous line is not less than the key, so the current line is not the
// first match; keep searching toward the start of the buffer.
bottom = mid - 1;
}
return nullptr;
}
}; // namespace McBopomofo

View File

@ -0,0 +1,60 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#ifndef SOURCE_ENGINE_PARSELESSPHRASEDB_H_
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
#include <cstddef>
#include <string>
#include <string_view>
#include <vector>
namespace McBopomofo {
// First line of a sorted phrase database file, including its trailing
// newline. ParselessPhraseDB checks for it when asked to validate the pragma.
// Declared `inline` so that every translation unit including this header
// shares a single definition instead of getting its own internal-linkage copy.
inline constexpr std::string_view SORTED_PRAGMA_HEADER
    = "# format org.openvanilla.mcbopomofo.sorted\n";
// Defines a phrase database that consists of (key, value, score) rows that are
// pre-sorted by the byte value of the keys. It is way faster than FastLM
// because it does not need to parse anything. Instead, it relies on the fact
// that the database is already sorted, and binary search is used to find the
// rows.
//
// The database does not copy or own the buffer it is given: it only keeps
// pointers to the buffer's first byte and one past its last byte, and the rows
// it returns are views into that buffer. The buffer therefore has to stay
// alive for as long as the database or any returned row is in use.
class ParselessPhraseDB {
public:
// Wraps the `length` bytes starting at `buf`. `buf` must not be null and
// `length` must be greater than zero. If `validate_pragma` is true, the buffer
// must start with SORTED_PRAGMA_HEADER, which is then skipped. These
// requirements are enforced with assert().
ParselessPhraseDB(
const char* buf, size_t length, bool validate_pragma = false);
// Find the rows that match the key. Note that prefix match is used. If you
// need exact match, the key will need to have a delimiter (usually a space)
// at the end. Returns an empty vector when nothing matches.
std::vector<std::string_view> findRows(const std::string_view& key);
// Returns a pointer to the start of the first row that begins with the key,
// or nullptr if there is no such row.
const char* findFirstMatchingLine(const std::string_view& key);
private:
// First byte of the searchable data (after the pragma header, if validated).
const char* begin_;
// One past the last byte of the buffer.
const char* end_;
};
}; // namespace McBopomofo
#endif // SOURCE_ENGINE_PARSELESSPHRASEDB_H_

View File

@ -0,0 +1,198 @@
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <cstdio>
#include <filesystem>
#include <map>
#include <sstream>
#include <vector>
#include "ParselessPhraseDB.h"
#include "gtest/gtest-death-test.h"
#include "gtest/gtest.h"
using StringViews = std::vector<std::string_view>;
namespace McBopomofo {
// Returns true when `a` and `b` have the same number of elements and every
// pair of elements at the same index holds the same characters.
static bool VectorsEqual(
    const std::vector<std::string_view>& a, const std::vector<std::string>& b)
{
    const size_t count = a.size();
    if (count != b.size()) {
        return false;
    }

    // Advance past the common prefix; equality means it covers everything.
    size_t idx = 0;
    while (idx < count && a[idx] == b[idx]) {
        ++idx;
    }
    return idx == count;
}
// A single-row database: every prefix of the row, up to the full row, must
// resolve to that row.
TEST(ParselessPhraseDBTest, Simple)
{
    std::string data = "a 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    for (const char* key : { "a", "a ", "a 1" }) {
        const char* first = db.findFirstMatchingLine(key);
        EXPECT_NE(first, nullptr);
        EXPECT_EQ(memcmp(first, "a 1", 3), 0);
    }
}
// Keys that sort after ("c") or before ("A") every row must not be found.
TEST(ParselessPhraseDBTest, NotFound)
{
    std::string data = "a 1\na 2\na 3\nb 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    for (const char* missing_key : { "c", "A" }) {
        EXPECT_EQ(db.findFirstMatchingLine(missing_key), nullptr);
    }
}
// findRows() must return all rows sharing a key prefix, in file order, and an
// empty result for keys that are absent.
TEST(ParselessPhraseDBTest, FindRowsLongerExample)
{
    std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    struct Expectation {
        const char* key;
        StringViews rows;
    };
    const Expectation expectations[] = {
        { "a", { "a 1", "a 2", "a 3" } },
        { "b", { "b 42", "b 1", "b 2" } },
        { "c", { "c 7" } },
        { "d", { "d 1" } },
        { "e", {} },
        { "A", {} },
    };

    for (const Expectation& expectation : expectations) {
        EXPECT_EQ(db.findRows(expectation.key), expectation.rows);
    }
}
// Checks that findFirstMatchingLine() lands on the first row of each key group
// in a multi-row database, and returns nullptr for a prefix that is absent.
TEST(ParselessPhraseDBTest, FindFirstMatchingLineLongerExample)
{
    std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    const char* first = db.findFirstMatchingLine("a");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    // The result has to be stored; otherwise the checks below would only
    // re-verify the pointer from the "a" lookup above.
    first = db.findFirstMatchingLine("a 1");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    first = db.findFirstMatchingLine("b");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "b 42", 4), 0);

    first = db.findFirstMatchingLine("c");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "c 7", 3), 0);

    first = db.findFirstMatchingLine("d");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 1");
    EXPECT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 2");
    EXPECT_EQ(first, nullptr);
}
// The constructor rejects bad arguments through assert(), so each case below
// is expected to abort with the failed assertion's expression in the message.
//
// assert() is compiled out when NDEBUG is defined. In that configuration the
// constructor would not abort, and the last case would read past its two-byte
// literal, so the test is skipped instead.
// NOTE(review): this assumes the test is built with the same NDEBUG setting as
// ParselessPhraseDB.cpp -- confirm against the build configuration.
TEST(ParselessPhraseDBTest, InvalidConstructorArguments)
{
#ifdef NDEBUG
    GTEST_SKIP() << "assert() is disabled by NDEBUG";
#else
    EXPECT_DEATH((ParselessPhraseDB { nullptr, 1 }), "buf != nullptr");
    EXPECT_DEATH((ParselessPhraseDB { nullptr, 0 }), "buf != nullptr");
    EXPECT_DEATH((ParselessPhraseDB { "", 0 }), "length > 0");
    EXPECT_DEATH((ParselessPhraseDB { "a", 1, /*validate_pragma=*/true }),
        "length > SORTED_PRAGMA_HEADER\\.length\\(\\)");
#endif
}
// With validate_pragma set, a buffer that starts with the exact pragma header
// is accepted, while a buffer with an extra leading byte (buf2) or a corrupted
// header byte (buf3) trips an equality assertion.
//
// The rejection cases depend on assert(), which is compiled out when NDEBUG is
// defined; they are skipped in that configuration.
// NOTE(review): this assumes the test is built with the same NDEBUG setting as
// ParselessPhraseDB.cpp -- confirm against the build configuration.
TEST(ParselessPhraseDBTest, PragmaGuard)
{
    std::string buf1 = std::string(SORTED_PRAGMA_HEADER) + "a";
    std::string buf2 = "#" + buf1;
    std::string buf3 = buf1;
    buf3[3] = 'x';

    // Valid header: construction must simply succeed.
    ParselessPhraseDB { buf1.c_str(), buf1.length(), /*validate_pragma=*/true };

#ifdef NDEBUG
    GTEST_SKIP() << "assert() is disabled by NDEBUG";
#else
    EXPECT_DEATH(
        (ParselessPhraseDB { buf2.c_str(), buf2.length(), /*validate_pragma=*/
            true }),
        "==");
    EXPECT_DEATH(
        (ParselessPhraseDB { buf3.c_str(), buf3.length(), /*validate_pragma=*/
            true }),
        "==");
#endif
}
// Cross-checks findRows() against a straightforward parse of a real data file:
// every key found by splitting the file into lines must yield exactly the same
// lines, in the same order, from the database. Skipped when "data.txt" is not
// present in the working directory.
TEST(ParselessPhraseDBTest, StressTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    // Binary mode keeps the size reported by ftell() equal to the number of
    // bytes fread() delivers. The unique_ptr closes the file even when one of
    // the ASSERT_* macros returns early.
    std::unique_ptr<FILE, decltype(&fclose)> file(
        fopen(data_path, "rb"), &fclose);
    ASSERT_NE(file.get(), nullptr);

    ASSERT_EQ(fseek(file.get(), 0L, SEEK_END), 0);
    const long file_size = ftell(file.get());
    // ftell() returns -1 on error, and an empty file has nothing to test.
    ASSERT_GT(file_size, 0L);
    ASSERT_EQ(fseek(file.get(), 0L, SEEK_SET), 0);

    std::string contents(static_cast<size_t>(file_size), '\0');
    const size_t items_read
        = fread(contents.data(), contents.size(), 1, file.get());
    ASSERT_EQ(items_read, size_t { 1 });
    file.reset();

    std::map<std::string, std::vector<std::string>> key_to_lines;
    std::stringstream sstr(contents);
    std::string line;

    // Skip the pragma line.
    std::getline(sstr, line);

    // Group the remaining non-empty lines by their first whitespace-delimited
    // token, which is the key.
    while (std::getline(sstr, line)) {
        if (line.empty()) {
            continue;
        }
        std::stringstream linest(line);
        std::string key;
        linest >> key;
        key_to_lines[key].push_back(line);
    }

    ParselessPhraseDB db(
        contents.data(), contents.size(), /*validate_pragma=*/true);
    for (const auto& [key, lines] : key_to_lines) {
        // The trailing space turns the prefix match into an exact key match.
        std::vector<std::string_view> rows = db.findRows(key + " ");
        ASSERT_TRUE(VectorsEqual(rows, lines));
    }
}
}; // namespace McBopomofo

View File

@ -1,5 +1,4 @@
#import <Foundation/Foundation.h>
#import "FastLM.h"
#import "UserOverrideModel.h"
#import "McBopomofoLM.h"