Merge pull request #230 from lukhnos/parseless-lm
Use a parseless phrase db to speed up LM loading
This commit is contained in:
commit
8584f5c4b3
|
@ -7,7 +7,6 @@
|
|||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A0421A615FEF3F50061ED63 /* FastLM.cpp */; };
|
||||
6A0D4EA715FC0D2D00ABF4B3 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */; };
|
||||
6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */; };
|
||||
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4EC815FC0D6400ABF4B3 /* main.m */; };
|
||||
|
@ -33,6 +32,8 @@
|
|||
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
|
||||
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
|
||||
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */; };
|
||||
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */; };
|
||||
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6ACC3D422793701600F1B140 /* ParselessLM.cpp */; };
|
||||
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
|
||||
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
|
||||
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
|
||||
|
@ -74,8 +75,6 @@
|
|||
/* End PBXContainerItemProxy section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FastLM.cpp; sourceTree = "<group>"; };
|
||||
6A0421A715FEF3F50061ED63 /* FastLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FastLM.h; sourceTree = "<group>"; };
|
||||
6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = McBopomofo.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
6A0D4EA615FC0D2D00ABF4B3 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; };
|
||||
6A0D4EA915FC0D2D00ABF4B3 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; };
|
||||
|
@ -162,6 +161,10 @@
|
|||
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
||||
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
|
||||
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
|
||||
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessPhraseDB.cpp; sourceTree = "<group>"; };
|
||||
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessPhraseDB.h; sourceTree = "<group>"; };
|
||||
6ACC3D422793701600F1B140 /* ParselessLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParselessLM.cpp; sourceTree = "<group>"; };
|
||||
6ACC3D432793701600F1B140 /* ParselessLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParselessLM.h; sourceTree = "<group>"; };
|
||||
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
|
||||
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
|
||||
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
|
||||
|
@ -289,12 +292,14 @@
|
|||
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
|
||||
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
|
||||
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
||||
6A0421A715FEF3F50061ED63 /* FastLM.h */,
|
||||
6ACC3D3E27914F2400F1B140 /* KeyValueBlobReader.cpp */,
|
||||
6ACC3D3C27914AAB00F1B140 /* KeyValueBlobReader.h */,
|
||||
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
||||
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
||||
6ACC3D422793701600F1B140 /* ParselessLM.cpp */,
|
||||
6ACC3D432793701600F1B140 /* ParselessLM.h */,
|
||||
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */,
|
||||
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */,
|
||||
D44FB74B2792189A003C80A6 /* PhraseReplacementMap.cpp */,
|
||||
D44FB74C2792189A003C80A6 /* PhraseReplacementMap.h */,
|
||||
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
||||
|
@ -592,6 +597,7 @@
|
|||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */,
|
||||
6ACC3D442793701600F1B140 /* ParselessPhraseDB.cpp in Sources */,
|
||||
D44FB74727919D35003C80A6 /* EmacsKeyHelper.swift in Sources */,
|
||||
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
|
||||
D44FB74D2792189A003C80A6 /* PhraseReplacementMap.cpp in Sources */,
|
||||
|
@ -602,9 +608,9 @@
|
|||
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
|
||||
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
||||
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
||||
6ACC3D452793701600F1B140 /* ParselessLM.cpp in Sources */,
|
||||
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
||||
6ACC3D3F27914F2400F1B140 /* KeyValueBlobReader.cpp in Sources */,
|
||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
|
||||
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
|
|
|
@ -1,9 +1,16 @@
|
|||
cmake_minimum_required(VERSION 3.17)
|
||||
project(KeyValueBlobReader)
|
||||
project(McBopomofoLMLib)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
|
||||
add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)
|
||||
include_directories("Gramambular")
|
||||
add_library(McBopomofoLMLib
|
||||
KeyValueBlobReader.cpp
|
||||
KeyValueBlobReader.h
|
||||
ParselessPhraseDB.cpp
|
||||
ParselessPhraseDB.h
|
||||
ParselessLM.cpp
|
||||
ParselessLM.h)
|
||||
|
||||
# Let CMake fetch Google Test for us.
|
||||
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
|
||||
|
@ -19,6 +26,17 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
|||
FetchContent_MakeAvailable(googletest)
|
||||
|
||||
# Test target declarations.
|
||||
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
|
||||
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
|
||||
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)
|
||||
add_executable(McBopomofoLMLibTest
|
||||
KeyValueBlobReaderTest.cpp
|
||||
ParselessLMTest.cpp
|
||||
ParselessPhraseDBTest.cpp)
|
||||
target_link_libraries(McBopomofoLMLibTest gtest_main McBopomofoLMLib)
|
||||
include(GoogleTest)
|
||||
gtest_discover_tests(McBopomofoLMLibTest)
|
||||
|
||||
# Benchmark target.
|
||||
find_package(benchmark REQUIRED)
|
||||
add_executable(ParselessLMBenchmark
|
||||
FastLM.cpp
|
||||
ParselessLMBenchmark.cpp)
|
||||
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
|
|
@ -25,13 +25,13 @@
|
|||
|
||||
namespace McBopomofo {
|
||||
|
||||
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
|
||||
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out)
|
||||
{
|
||||
static auto new_line = [](char c) { return c == '\n' || c == '\r'; };
|
||||
static auto blank = [](char c) { return c == ' ' || c == '\t'; };
|
||||
static auto blank_or_newline = [](char c) { return blank(c) || new_line(c); };
|
||||
static auto content_char = [](char c) {
|
||||
return !blank(c) && !new_line(c);
|
||||
};
|
||||
static auto blank_or_newline
|
||||
= [](char c) { return blank(c) || new_line(c); };
|
||||
static auto content_char = [](char c) { return !blank(c) && !new_line(c); };
|
||||
|
||||
if (state_ == State::ERROR) {
|
||||
return state_;
|
||||
|
@ -95,21 +95,21 @@ KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
|
|||
SkipUntil(new_line);
|
||||
|
||||
if (out != nullptr) {
|
||||
*out = KeyValue{
|
||||
std::string_view{key_begin, key_length},
|
||||
std::string_view{value_begin, value_length}};
|
||||
*out = KeyValue { std::string_view { key_begin, key_length },
|
||||
std::string_view { value_begin, value_length } };
|
||||
}
|
||||
state_ = State::HAS_PAIR;
|
||||
return state_;
|
||||
|
||||
error:
|
||||
state_ = State::ERROR;
|
||||
return State::ERROR;
|
||||
return state_;
|
||||
}
|
||||
|
||||
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
|
||||
const std::function<bool(char)>& f) {
|
||||
while (current_ != end_ &&* current_) {
|
||||
const std::function<bool(char)>& f)
|
||||
{
|
||||
while (current_ != end_ && *current_) {
|
||||
if (!f(*current_)) {
|
||||
return State::CAN_CONTINUE;
|
||||
}
|
||||
|
@ -120,8 +120,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
|
|||
}
|
||||
|
||||
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
|
||||
const std::function<bool(char)>& f) {
|
||||
while (current_ != end_ &&* current_) {
|
||||
const std::function<bool(char)>& f)
|
||||
{
|
||||
while (current_ != end_ && *current_) {
|
||||
if (f(*current_)) {
|
||||
return State::CAN_CONTINUE;
|
||||
}
|
||||
|
@ -131,8 +132,9 @@ KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
|
|||
return State::END;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os,
|
||||
const KeyValueBlobReader::KeyValue& kv) {
|
||||
std::ostream& operator<<(
|
||||
std::ostream& os, const KeyValueBlobReader::KeyValue& kv)
|
||||
{
|
||||
os << "(key: " << kv.key << ", value: " << kv.value << ")";
|
||||
return os;
|
||||
}
|
||||
|
|
|
@ -46,7 +46,7 @@
|
|||
namespace McBopomofo {
|
||||
|
||||
class KeyValueBlobReader {
|
||||
public:
|
||||
public:
|
||||
enum class State : int {
|
||||
// There are no more key-value pairs in this blob.
|
||||
END = 0,
|
||||
|
@ -59,11 +59,19 @@ class KeyValueBlobReader {
|
|||
};
|
||||
|
||||
struct KeyValue {
|
||||
constexpr KeyValue() : key(""), value("") {}
|
||||
constexpr KeyValue()
|
||||
: key("")
|
||||
, value("")
|
||||
{
|
||||
}
|
||||
constexpr KeyValue(std::string_view k, std::string_view v)
|
||||
: key(k), value(v) {}
|
||||
: key(k)
|
||||
, value(v)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator==(const KeyValue& another) const {
|
||||
bool operator==(const KeyValue& another) const
|
||||
{
|
||||
return key == another.key && value == another.value;
|
||||
}
|
||||
|
||||
|
@ -72,13 +80,17 @@ class KeyValueBlobReader {
|
|||
};
|
||||
|
||||
KeyValueBlobReader(const char* blob, size_t size)
|
||||
: current_(blob), end_(blob + size) {}
|
||||
: current_(blob)
|
||||
, end_(blob + size)
|
||||
{
|
||||
}
|
||||
|
||||
// Parse the next key-value pair and return the state of the reader. If `out`
|
||||
// is passed, out will be set to the produced key-value pair if there is one.
|
||||
// Parse the next key-value pair and return the state of the reader. If
|
||||
// `out` is passed, out will be set to the produced key-value pair if there
|
||||
// is one.
|
||||
State Next(KeyValue* out = nullptr);
|
||||
|
||||
private:
|
||||
private:
|
||||
State SkipUntil(const std::function<bool(char)>& f);
|
||||
State SkipUntilNot(const std::function<bool(char)>& f);
|
||||
|
||||
|
|
|
@ -21,9 +21,9 @@
|
|||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include "KeyValueBlobReader.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "KeyValueBlobReader.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace McBopomofo {
|
||||
|
@ -31,155 +31,174 @@ namespace McBopomofo {
|
|||
using State = KeyValueBlobReader::State;
|
||||
using KeyValue = KeyValueBlobReader::KeyValue;
|
||||
|
||||
TEST(KeyValueBlobReaderTest, EmptyBlob) {
|
||||
TEST(KeyValueBlobReaderTest, EmptyBlob)
|
||||
{
|
||||
std::string empty;
|
||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency) {
|
||||
TEST(KeyValueBlobReaderTest, EmptyBlobIdempotency)
|
||||
{
|
||||
char empty[0];
|
||||
KeyValueBlobReader reader(empty, 0);
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, BlankBlob) {
|
||||
TEST(KeyValueBlobReaderTest, BlankBlob)
|
||||
{
|
||||
std::string blank = " ";
|
||||
KeyValueBlobReader reader(blank.c_str(), blank.length());
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid) {
|
||||
TEST(KeyValueBlobReaderTest, KeyWithoutValueIsInvalid)
|
||||
{
|
||||
std::string empty = "hello";
|
||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress) {
|
||||
TEST(KeyValueBlobReaderTest, ErrorStateMakesNoMoreProgress)
|
||||
{
|
||||
std::string empty = "hello";
|
||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid) {
|
||||
char bad[] = {'h', 0, 'w'};
|
||||
TEST(KeyValueBlobReaderTest, KeyValueSeparatedByNullCharIsInvalid)
|
||||
{
|
||||
char bad[] = { 'h', 0, 'w' };
|
||||
KeyValueBlobReader reader(bad, sizeof(bad));
|
||||
EXPECT_EQ(reader.Next(), State::ERROR);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePair) {
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePair)
|
||||
{
|
||||
std::string empty = "hello world\n";
|
||||
KeyValueBlobReader reader(empty.c_str(), empty.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding) {
|
||||
char small[] = {'p', ' ', 'q'};
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairFromButterWithoutNullEnding)
|
||||
{
|
||||
char small[] = { 'p', ' ', 'q' };
|
||||
KeyValueBlobReader reader(small, sizeof(small));
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing) {
|
||||
char small[] = {'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's'};
|
||||
TEST(KeyValueBlobReaderTest, NullCharInTheMiddleTerminatesParsing)
|
||||
{
|
||||
char small[] = { 'p', ' ', 'q', ' ', 0, '\n', 'r', ' ', 's' };
|
||||
KeyValueBlobReader reader(small, sizeof(small));
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd) {
|
||||
TEST(KeyValueBlobReaderTest, SingleKeyValuePairWithoutLFAtEnd)
|
||||
{
|
||||
std::string simple = "hello world";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic1) {
|
||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic1)
|
||||
{
|
||||
std::string simple = u8"smile ☺️";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"smile", u8"☺️"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "smile", u8"☺️" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic2) {
|
||||
TEST(KeyValueBlobReaderTest, EncodingAgnostic2)
|
||||
{
|
||||
std::string simple = "Nobel-Laureate "
|
||||
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (
|
||||
KeyValue{"Nobel-Laureate",
|
||||
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b"}));
|
||||
EXPECT_EQ(keyValue,
|
||||
(KeyValue { "Nobel-Laureate",
|
||||
"\xe9\x81\x94\xe8\xb3\xb4\xe5\x96\x87\xe5\x98\x9b" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace) {
|
||||
TEST(KeyValueBlobReaderTest, ValueDoesNotIncludeSpace)
|
||||
{
|
||||
std::string simple = "hello world and all\nanother value";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"another", "value"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "another", "value" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored) {
|
||||
std::string simple = "\thello world \n\n foo bar \t\t\t \n\n\n";
|
||||
TEST(KeyValueBlobReaderTest, TrailingSpaceInValueIsIgnored)
|
||||
{
|
||||
std::string simple
|
||||
= "\thello world \n\n foo bar \t\t\t \n\n\n";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported) {
|
||||
TEST(KeyValueBlobReaderTest, WindowsCRLFSupported)
|
||||
{
|
||||
std::string simple = "lorem ipsum\r\nhello world";
|
||||
KeyValueBlobReader reader(simple.c_str(), simple.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"lorem", "ipsum"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "lorem", "ipsum" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair) {
|
||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePair)
|
||||
{
|
||||
std::string multi = "\n \nhello world\n foo \t bar ";
|
||||
KeyValueBlobReader reader(multi.c_str(), multi.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, ReadUntilNullChar) {
|
||||
char buf[] = {'p', '\t', 'q', '\n', 0, 'r', ' ', 's'};
|
||||
TEST(KeyValueBlobReaderTest, ReadUntilNullChar)
|
||||
{
|
||||
char buf[] = { 'p', '\t', 'q', '\n', 0, 'r', ' ', 's' };
|
||||
KeyValueBlobReader reader(buf, sizeof(buf));
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"p", "q"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "p", "q" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
|
||||
TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments)
|
||||
{
|
||||
std::string text = R"(
|
||||
# comment1
|
||||
# comment2
|
||||
|
@ -198,15 +217,16 @@ TEST(KeyValueBlobReaderTest, MultipleKeyValuePairWithComments) {
|
|||
KeyValueBlobReader reader(text.c_str(), text.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "World"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "World" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
TEST(KeyValueBlobReaderTest, ValueCommentSupported) {
|
||||
TEST(KeyValueBlobReaderTest, ValueCommentSupported)
|
||||
{
|
||||
std::string text = R"(
|
||||
# empty
|
||||
|
||||
|
@ -220,15 +240,15 @@ hello world#peace // peace
|
|||
KeyValueBlobReader reader(text.c_str(), text.length());
|
||||
KeyValue keyValue;
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"hello", "world#peace"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "hello", "world#peace" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"caffè", "latte"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "caffè", "latte" }));
|
||||
EXPECT_EQ(reader.Next(&keyValue), State::HAS_PAIR);
|
||||
EXPECT_EQ(keyValue, (KeyValue{"foo", "bar"}));
|
||||
EXPECT_EQ(keyValue, (KeyValue { "foo", "bar" }));
|
||||
EXPECT_EQ(reader.Next(), State::END);
|
||||
}
|
||||
|
||||
|
|
|
@ -25,8 +25,8 @@
|
|||
#define MCBOPOMOFOLM_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include "FastLM.h"
|
||||
#include "UserPhrasesLM.h"
|
||||
#include "ParselessLM.h"
|
||||
#include "PhraseReplacementMap.h"
|
||||
|
||||
namespace McBopomofo {
|
||||
|
@ -51,7 +51,7 @@ public:
|
|||
bool phraseReplacementEnabled();
|
||||
|
||||
protected:
|
||||
FastLM m_languageModel;
|
||||
ParselessLM m_languageModel;
|
||||
UserPhrasesLM m_userPhrases;
|
||||
UserPhrasesLM m_excludedPhrases;
|
||||
PhraseReplacementMap m_phraseReplacement;
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include "ParselessLM.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
// Releases whatever open() acquired by delegating to close(); close() does
// nothing when no file is currently mapped.
McBopomofo::ParselessLM::~ParselessLM() { close(); }
|
||||
|
||||
bool McBopomofo::ParselessLM::open(const std::string_view& path)
|
||||
{
|
||||
if (data_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
fd_ = ::open(path.data(), O_RDONLY);
|
||||
if (fd_ == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd_, &sb) == -1) {
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
return false;
|
||||
}
|
||||
|
||||
length_ = static_cast<size_t>(sb.st_size);
|
||||
|
||||
data_ = mmap(NULL, length_, PROT_READ, MAP_SHARED, fd_, 0);
|
||||
if (data_ == nullptr) {
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
length_ = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
db_ = std::unique_ptr<ParselessPhraseDB>(new ParselessPhraseDB(
|
||||
static_cast<char*>(data_), length_, /*validate_pragme=*/
|
||||
true));
|
||||
return true;
|
||||
}
|
||||
|
||||
void McBopomofo::ParselessLM::close()
|
||||
{
|
||||
if (data_ != nullptr) {
|
||||
munmap(data_, length_);
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
length_ = 0;
|
||||
data_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<Formosa::Gramambular::Bigram>
|
||||
McBopomofo::ParselessLM::bigramsForKeys(
|
||||
const std::string& preceedingKey, const std::string& key)
|
||||
{
|
||||
return std::vector<Formosa::Gramambular::Bigram>();
|
||||
}
|
||||
|
||||
const std::vector<Formosa::Gramambular::Unigram>
|
||||
McBopomofo::ParselessLM::unigramsForKey(const std::string& key)
|
||||
{
|
||||
if (db_ == nullptr) {
|
||||
return std::vector<Formosa::Gramambular::Unigram>();
|
||||
}
|
||||
|
||||
std::vector<Formosa::Gramambular::Unigram> results;
|
||||
for (const auto& row : db_->findRows(key + " ")) {
|
||||
Formosa::Gramambular::Unigram unigram;
|
||||
|
||||
// Move ahead until we encounter the first space. This is the key.
|
||||
auto it = row.begin();
|
||||
while (it != row.end() && *it != ' ') {
|
||||
++it;
|
||||
}
|
||||
|
||||
unigram.keyValue.key = std::string(row.begin(), it);
|
||||
|
||||
// Read past the space.
|
||||
if (it != row.end()) {
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != row.end()) {
|
||||
// Now it is the start of the value portion.
|
||||
auto value_begin = it;
|
||||
|
||||
// Move ahead until we encounter the second space. This is the
|
||||
// value.
|
||||
while (it != row.end() && *it != ' ') {
|
||||
++it;
|
||||
}
|
||||
unigram.keyValue.value = std::string(value_begin, it);
|
||||
}
|
||||
|
||||
// Read past the space. The remainder, if it exists, is the score.
|
||||
if (it != row.end()) {
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != row.end()) {
|
||||
unigram.score = std::stod(std::string(it, row.end()));
|
||||
}
|
||||
results.push_back(unigram);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key)
|
||||
{
|
||||
if (db_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return db_->findFirstMatchingLine(key + " ") != nullptr;
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#ifndef SOURCE_ENGINE_PARSELESSLM_H_
|
||||
#define SOURCE_ENGINE_PARSELESSLM_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "LanguageModel.h"
|
||||
#include "ParselessPhraseDB.h"
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
class ParselessLM : public Formosa::Gramambular::LanguageModel {
|
||||
public:
|
||||
~ParselessLM() override;
|
||||
|
||||
bool open(const std::string_view& path);
|
||||
void close();
|
||||
|
||||
const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(
|
||||
const std::string& preceedingKey, const std::string& key) override;
|
||||
const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(
|
||||
const std::string& key) override;
|
||||
bool hasUnigramsForKey(const std::string& key) override;
|
||||
|
||||
private:
|
||||
int fd_ = -1;
|
||||
void* data_ = nullptr;
|
||||
size_t length_ = 0;
|
||||
std::unique_ptr<ParselessPhraseDB> db_;
|
||||
};
|
||||
|
||||
}; // namespace McBopomofo
|
||||
|
||||
#endif // SOURCE_ENGINE_PARSELESSLM_H_
|
|
@ -0,0 +1,89 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <filesystem>
|
||||
|
||||
#include "FastLM.h"
|
||||
#include "ParselessLM.h"
|
||||
|
||||
namespace {
|
||||
|
||||
using FastLM = Formosa::Gramambular::FastLM;
|
||||
using ParselessLM = McBopomofo::ParselessLM;
|
||||
|
||||
static const char* kDataPath = "data.txt";
|
||||
static const char* kLegacyDataPath = "data-legacy.txt";
|
||||
static const char* kUnigramSearchKey = "ㄕˋ-ㄕˊ";
|
||||
|
||||
static void BM_ParselessLMOpenClose(benchmark::State& state)
|
||||
{
|
||||
assert(std::filesystem::exists(kDataPath));
|
||||
for (auto _ : state) {
|
||||
ParselessLM lm;
|
||||
lm.open(kDataPath);
|
||||
lm.close();
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_ParselessLMOpenClose);
|
||||
|
||||
static void BM_FastLMOpenClose(benchmark::State& state)
|
||||
{
|
||||
assert(std::filesystem::exists(kLegacyDataPath));
|
||||
for (auto _ : state) {
|
||||
FastLM lm;
|
||||
lm.open(kLegacyDataPath);
|
||||
lm.close();
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_FastLMOpenClose);
|
||||
|
||||
static void BM_ParselessLMFindUnigrams(benchmark::State& state)
|
||||
{
|
||||
assert(std::filesystem::exists(kDataPath));
|
||||
ParselessLM lm;
|
||||
lm.open(kDataPath);
|
||||
for (auto _ : state) {
|
||||
lm.unigramsForKey(kUnigramSearchKey);
|
||||
}
|
||||
lm.close();
|
||||
}
|
||||
BENCHMARK(BM_ParselessLMFindUnigrams);
|
||||
|
||||
static void BM_FastLMFindUnigrams(benchmark::State& state)
|
||||
{
|
||||
assert(std::filesystem::exists(kLegacyDataPath));
|
||||
FastLM lm;
|
||||
lm.open(kLegacyDataPath);
|
||||
for (auto _ : state) {
|
||||
lm.unigramsForKey(kUnigramSearchKey);
|
||||
}
|
||||
lm.close();
|
||||
}
|
||||
BENCHMARK(BM_FastLMFindUnigrams);
|
||||
|
||||
}; // namespace
|
||||
|
||||
BENCHMARK_MAIN();
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
|
||||
#include "ParselessLM.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
// Opens the real data file (skipping when it is not available) and checks
// that a few well-known keys are present and yield at least one unigram.
TEST(ParselessLMTest, SanityCheckTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    ParselessLM lm;
    ASSERT_TRUE(lm.open(data_path));

    const char* const keys[] = { "ㄕ", "ㄕˋ-ㄕˊ", "_punctuation_list" };

    // Every key must be reported as present...
    for (const char* key : keys) {
        SCOPED_TRACE(key);
        ASSERT_TRUE(lm.hasUnigramsForKey(key));
    }

    // ...and must actually produce unigrams.
    for (const char* key : keys) {
        SCOPED_TRACE(key);
        auto unigrams = lm.unigramsForKey(key);
        ASSERT_GT(unigrams.size(), 0);
    }

    lm.close();
}
|
||||
|
||||
}; // namespace McBopomofo
|
|
@ -0,0 +1,166 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include "ParselessPhraseDB.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
// Wraps the bytes [buf, buf + length) as a phrase database. The buffer is
// only referenced, never copied.
//
// When validate_pragma is true, the buffer must start with
// SORTED_PRAGMA_HEADER; the header is then excluded from the searchable
// range. All checks here are assert()s, so they only take effect in builds
// where assertions are enabled.
ParselessPhraseDB::ParselessPhraseDB(
    const char* buf, size_t length, bool validate_pragma)
    : begin_(buf)
    , end_(buf + length)
{
    assert(buf != nullptr);
    assert(length > 0);

    if (!validate_pragma) {
        return;
    }

    assert(length > SORTED_PRAGMA_HEADER.length());

    std::string_view header(buf, SORTED_PRAGMA_HEADER.length());
    assert(header == SORTED_PRAGMA_HEADER);

    // Hash the header bytes (seed 5381, multiply by 33, add each byte) and
    // compare against the expected constant as a second guard.
    uint32_t x = 5381;
    for (size_t pos = 0; pos < header.length(); ++pos) {
        x = x * 33 + header[pos];
    }

    assert(x == uint32_t { 3012373384 });

    // Searches start right after the pragma line.
    begin_ += header.length();
}
|
||||
|
||||
std::vector<std::string_view> ParselessPhraseDB::findRows(
|
||||
const std::string_view& key)
|
||||
{
|
||||
std::vector<std::string_view> rows;
|
||||
|
||||
const char* ptr = findFirstMatchingLine(key);
|
||||
if (ptr == nullptr) {
|
||||
return rows;
|
||||
}
|
||||
|
||||
while (ptr + key.length() <= end_
|
||||
&& memcmp(ptr, key.data(), key.length()) == 0) {
|
||||
const char* eol = ptr;
|
||||
|
||||
while (eol != end_ && *eol != '\n') {
|
||||
++eol;
|
||||
}
|
||||
|
||||
rows.emplace_back(ptr, eol - ptr);
|
||||
if (eol == end_) {
|
||||
break;
|
||||
}
|
||||
|
||||
ptr = ++eol;
|
||||
}
|
||||
|
||||
return rows;
|
||||
}
|
||||
|
||||
// Implements a binary search that returns the pointer to the first matching
|
||||
// row. In its core it's just a standard binary search, but we use backtracking
|
||||
// to locate the line start. We also check the previous line to see if the
|
||||
// current line is actually the first matching line: if the previous line is
|
||||
// less to the key and the current line starts exactly with the key, then
|
||||
// the current line is the first matching line.
|
||||
const char* ParselessPhraseDB::findFirstMatchingLine(
|
||||
const std::string_view& key)
|
||||
{
|
||||
if (key.empty()) {
|
||||
return begin_;
|
||||
}
|
||||
|
||||
const char* top = begin_;
|
||||
const char* bottom = end_;
|
||||
|
||||
while (top < bottom) {
|
||||
const char* mid = top + (bottom - top) / 2;
|
||||
const char* ptr = mid;
|
||||
|
||||
if (ptr != begin_) {
|
||||
--ptr;
|
||||
}
|
||||
|
||||
while (ptr != begin_ && *ptr != '\n') {
|
||||
--ptr;
|
||||
}
|
||||
|
||||
const char* prev = nullptr;
|
||||
if (*ptr == '\n') {
|
||||
prev = ptr;
|
||||
++ptr;
|
||||
}
|
||||
|
||||
// ptr is now in the "current" line we're interested in.
|
||||
if (ptr + key.length() > end_) {
|
||||
// not enough data to compare at this point, bail.
|
||||
break;
|
||||
}
|
||||
|
||||
int current_cmp = memcmp(ptr, key.data(), key.length());
|
||||
|
||||
if (current_cmp > 0) {
|
||||
bottom = mid - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current_cmp < 0) {
|
||||
top = mid + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!prev) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Move the prev so that it reaches the previous line.
|
||||
if (prev != begin_) {
|
||||
--prev;
|
||||
}
|
||||
while (prev != begin_ && *prev != '\n') {
|
||||
--prev;
|
||||
}
|
||||
if (*prev == '\n') {
|
||||
++prev;
|
||||
}
|
||||
|
||||
int prev_cmp = memcmp(prev, key.data(), key.length());
|
||||
|
||||
// This is the first occurrence.
|
||||
if (prev_cmp < 0 && current_cmp == 0) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// This is not, which means ptr is "larger" than the keyData.
|
||||
bottom = mid - 1;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
}; // namespace McBopomofo
|
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#ifndef SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
// First line of a database file in the pre-sorted format. Declared `inline`
// so that every translation unit including this header refers to the same
// entity instead of carrying its own internal copy.
inline constexpr std::string_view SORTED_PRAGMA_HEADER
    = "# format org.openvanilla.mcbopomofo.sorted\n";
|
||||
|
||||
// A read-only phrase database made of text rows, one row per line, that are
// pre-sorted by the byte values of their keys. Nothing is parsed up front:
// lookups binary-search the raw bytes directly, which is what makes loading
// much faster than FastLM.
//
// The database only keeps pointers into the caller's buffer, so the buffer
// must stay valid for as long as the database and any returned rows are in
// use.
class ParselessPhraseDB {
public:
    // Wraps `length` bytes starting at `buf`. When `validate_pragma` is true,
    // the buffer is expected to begin with SORTED_PRAGMA_HEADER, which is
    // then skipped for lookups.
    ParselessPhraseDB(
        const char* buf, size_t length, bool validate_pragma = false);

    // Returns the rows that start with `key`. This is a prefix match: for an
    // exact key match, append the delimiter (usually a space) to the key.
    std::vector<std::string_view> findRows(const std::string_view& key);

    // Returns a pointer to the start of the first line that begins with
    // `key`, or nullptr when no line does. An empty key yields the start of
    // the searchable data.
    const char* findFirstMatchingLine(const std::string_view& key);

private:
    const char* begin_; // Start of the searchable data.
    const char* end_; // One past the last byte of the data.
};
|
||||
|
||||
}; // namespace McBopomofo
|
||||
|
||||
#endif // SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
|
@ -0,0 +1,198 @@
|
|||
// Copyright (c) 2022 and onwards The McBopomofo Authors.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#include <cstdio>
|
||||
#include <filesystem>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "ParselessPhraseDB.h"
|
||||
#include "gtest/gtest-death-test.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using StringViews = std::vector<std::string_view>;
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
// Returns true when `a` and `b` have the same number of elements and every
// pair of elements at the same index compares equal.
static bool VectorsEqual(
    const std::vector<std::string_view>& a, const std::vector<std::string>& b)
{
    const size_t count = a.size();
    if (count != b.size()) {
        return false;
    }

    // Advance past the common prefix; the vectors are equal exactly when the
    // prefix covers everything.
    size_t index = 0;
    while (index < count && a[index] == b[index]) {
        ++index;
    }
    return index == count;
}
|
||||
|
||||
// A single-row database: any prefix of the row, up to the full row, must
// resolve to that row.
TEST(ParselessPhraseDBTest, Simple)
{
    const std::string data = "a 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    const char* const keys[] = { "a", "a ", "a 1" };
    for (const char* key : keys) {
        SCOPED_TRACE(key);
        const char* first = db.findFirstMatchingLine(key);
        EXPECT_NE(first, nullptr);
        EXPECT_EQ(memcmp(first, "a 1", 3), 0);
    }
}
|
||||
|
||||
// Keys that sort after every row ("c") or before every row ("A") must not be
// found.
TEST(ParselessPhraseDBTest, NotFound)
{
    const std::string data = "a 1\na 2\na 3\nb 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    const char* const missing_keys[] = { "c", "A" };
    for (const char* key : missing_keys) {
        SCOPED_TRACE(key);
        EXPECT_EQ(db.findFirstMatchingLine(key), nullptr);
    }
}
|
||||
|
||||
// findRows() must return all rows sharing a key prefix, in file order, and
// nothing for keys that are absent.
TEST(ParselessPhraseDBTest, FindRowsLongerExample)
{
    const std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    struct Case {
        const char* key;
        StringViews expected;
    };
    const Case cases[] = {
        { "a", { "a 1", "a 2", "a 3" } },
        { "b", { "b 42", "b 1", "b 2" } },
        { "c", { "c 7" } },
        { "d", { "d 1" } },
        { "e", {} },
        { "A", {} },
    };

    for (const Case& c : cases) {
        SCOPED_TRACE(c.key);
        EXPECT_EQ(db.findRows(c.key), c.expected);
    }
}
|
||||
|
||||
// findFirstMatchingLine() must return the first row of each key group, also
// when the key spells out a whole row, and nullptr for a row that is absent.
TEST(ParselessPhraseDBTest, FindFirstMatchingLineLongerExample)
{
    std::string data = "a 1\na 2\na 3\nb 42\nb 1\nb 2\nc 7\nd 1";
    ParselessPhraseDB db(data.c_str(), data.length());

    // ASSERT (not EXPECT) on the pointer so that memcmp() never runs on a
    // null pointer when a lookup unexpectedly fails.
    const char* first = db.findFirstMatchingLine("a");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    // The result must be assigned; otherwise the checks below would only
    // re-examine the pointer returned by the previous lookup.
    first = db.findFirstMatchingLine("a 1");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "a 1", 3), 0);

    first = db.findFirstMatchingLine("b");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "b 42", 4), 0);

    first = db.findFirstMatchingLine("c");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "c 7", 3), 0);

    first = db.findFirstMatchingLine("d");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 1");
    ASSERT_NE(first, nullptr);
    EXPECT_EQ(memcmp(first, "d 1", 3), 0);

    first = db.findFirstMatchingLine("d 2");
    EXPECT_EQ(first, nullptr);
}
|
||||
|
||||
// The constructor rejects invalid arguments through assert(), so each bad
// call is expected to abort with the failed assertion's expression in the
// message.
TEST(ParselessPhraseDBTest, InvalidConstructorArguments)
{
#ifdef NDEBUG
    // assert() is compiled out when NDEBUG is defined, so none of the calls
    // below would die; skip rather than report spurious failures.
    GTEST_SKIP() << "assertions are disabled in this build";
#else
    EXPECT_DEATH((ParselessPhraseDB { nullptr, 1 }), "buf != nullptr");
    EXPECT_DEATH((ParselessPhraseDB { nullptr, 0 }), "buf != nullptr");
    EXPECT_DEATH((ParselessPhraseDB { "", 0 }), "length > 0");
    EXPECT_DEATH((ParselessPhraseDB { "a", 1, /*validate_pragma=*/true }),
        "length > SORTED_PRAGMA_HEADER\\.length\\(\\)");
#endif
}
|
||||
|
||||
// A buffer that starts with the exact pragma header is accepted; a buffer
// whose header is shifted or altered must trip the header assertion.
TEST(ParselessPhraseDBTest, PragmaGuard)
{
    std::string buf1 = std::string(SORTED_PRAGMA_HEADER) + "a";
    std::string buf2 = "#" + buf1; // header shifted by one byte
    std::string buf3 = buf1;
    buf3[3] = 'x'; // header with one byte altered

    // The well-formed buffer must construct without dying.
    ParselessPhraseDB { buf1.c_str(), buf1.length(), /*validate_pragma=*/true };

#ifdef NDEBUG
    // The header check is an assert(), which is compiled out when NDEBUG is
    // defined, so the death expectations cannot hold in such builds.
    GTEST_SKIP() << "assertions are disabled in this build";
#else
    EXPECT_DEATH(
        (ParselessPhraseDB { buf2.c_str(), buf2.length(), /*validate_pragma=*/
            true }),
        "==");
    EXPECT_DEATH(
        (ParselessPhraseDB { buf3.c_str(), buf3.length(), /*validate_pragma=*/
            true }),
        "==");
#endif
}
|
||||
|
||||
// Cross-checks findRows() against a straightforward parse of the real data
// file: for every key in the file, the rows returned for "<key> " must be
// exactly the lines that start with that key, in file order. Skipped when the
// data file is not available.
TEST(ParselessPhraseDBTest, StressTest)
{
    constexpr const char* data_path = "data.txt";
    if (!std::filesystem::exists(data_path)) {
        GTEST_SKIP();
    }

    // Own the FILE* so it is closed even when an ASSERT_* returns early.
    // Binary mode keeps the size reported by ftell() equal to the number of
    // bytes fread() delivers.
    auto close_file = [](FILE* fp) { fclose(fp); };
    std::unique_ptr<FILE, decltype(close_file)> f(
        fopen(data_path, "rb"), close_file);
    ASSERT_NE(f.get(), nullptr);

    ASSERT_EQ(fseek(f.get(), 0L, SEEK_END), 0);
    // ftell() returns -1 on failure; check before converting to size_t so an
    // error cannot turn into an enormous allocation.
    const long file_size = ftell(f.get());
    ASSERT_GT(file_size, 0L);
    const size_t length = static_cast<size_t>(file_size);

    std::unique_ptr<char[]> buf(new char[length]);
    ASSERT_EQ(fseek(f.get(), 0L, SEEK_SET), 0);
    const size_t items_read = fread(buf.get(), length, 1, f.get());
    ASSERT_EQ(items_read, 1u);
    f.reset();

    std::stringstream sstr(std::string(buf.get(), length));
    std::string line;
    std::map<std::string, std::vector<std::string>> key_to_lines;

    // Skip the pragma line.
    std::getline(sstr, line);

    // Group the remaining non-empty lines by their first
    // whitespace-delimited token, which is the key.
    while (std::getline(sstr, line)) {
        if (line.empty()) {
            continue;
        }

        std::stringstream linest(line);
        std::string key;
        linest >> key;
        key_to_lines[key].push_back(line);
    }

    ParselessPhraseDB db(buf.get(), length, /*validate_pragma=*/true);
    for (const auto& [key, lines] : key_to_lines) {
        // The trailing space turns the prefix match into an exact key match.
        std::vector<std::string_view> rows = db.findRows(key + " ");
        ASSERT_TRUE(VectorsEqual(rows, lines)) << "key: " << key;
    }
}
|
||||
|
||||
}; // namespace McBopomofo
|
|
@ -1,5 +1,4 @@
|
|||
#import <Foundation/Foundation.h>
|
||||
#import "FastLM.h"
|
||||
#import "UserOverrideModel.h"
|
||||
#import "McBopomofoLM.h"
|
||||
|
||||
|
|
Loading…
Reference in New Issue