Lukhnos: Use a more tolerant parser for user phrases

This commit is contained in:
ShikiSuen 2022-01-16 10:03:55 +08:00
parent 9944d4ce9b
commit 5f0a0bad6f
6 changed files with 338 additions and 145 deletions

View File

@ -4,6 +4,7 @@
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Lukhnos Liu (@lukhnos) @ OpenVanilla
// Weizhong Yang (@zonble) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
@ -29,14 +30,16 @@
//
#include "UserPhrasesLM.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <unistd.h>
using namespace Formosa::Gramambular;
using namespace vChewing;
#include "KeyValueBlobReader.h"
namespace vChewing {
UserPhrasesLM::UserPhrasesLM()
: fd(-1)
@ -72,113 +75,24 @@ bool UserPhrasesLM::open(const char *path)
length = (size_t)sb.st_size;
data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0);
data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
if (!data) {
::close(fd);
return false;
}
char *head = (char *)data;
char *end = (char *)data + length;
char c;
Row row;
start:
// EOF -> end
if (head == end) {
goto end;
KeyValueBlobReader reader(static_cast<char*>(data), length);
KeyValueBlobReader::KeyValue keyValue;
KeyValueBlobReader::State state;
while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key );
}
c = *head;
// \s -> error
if (c == ' ') {
goto error;
if (state == KeyValueBlobReader::State::ERROR) {
close();
return false;
}
// \n -> start
else if (c == '\n') {
head++;
goto start;
}
// \w -> record column star, state1
row.value = head;
head++;
// fall through to state 1
state1:
// EOF -> error
if (head == end) {
goto error;
}
c = *head;
// \n -> error
if (c == '\n') {
goto error;
}
// \s -> state2 + zero out ending + record column start
else if (c == ' ') {
*head = 0;
head++;
row.key = head;
goto state2;
}
// \w -> state1
head++;
goto state1;
state2:
if (head == end) {
*head = 0;
head++;
keyRowMap[row.key].push_back(row);
goto end;
}
c = *head;
// \s -> error
if (c == ' ' || c == '\n') {
*head = 0;
head++;
keyRowMap[row.key].push_back(row);
if (c == ' ') {
goto state3;
}
goto start;
}
// \w -> state 2
head++;
goto state2;
state3:
if (head == end) {
*head = 0;
head++;
keyRowMap[row.key].push_back(row);
goto end;
}
c = *head;
if (c == '\n') {
goto start;
}
head++;
goto state3;
error:
close();
return false;
end:
static const char *space = " ";
Row emptyRow;
emptyRow.key = space;
emptyRow.value = space;
keyRowMap[space].push_back(emptyRow);
return true;
}
@ -195,33 +109,29 @@ void UserPhrasesLM::close()
void UserPhrasesLM::dump()
{
size_t rows = 0;
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
const vector<Row>& r = (*i).second;
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
const Row& row = *ri;
cerr << row.key << " " << row.value << "\n";
rows++;
for (const auto& entry : keyRowMap) {
const std::vector<Row>& rows = entry.second;
for (const auto& row : rows) {
std::cerr << row.key << " " << row.value << "\n";
}
}
}
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
const std::vector<Formosa::Gramambular::Bigram> UserPhrasesLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
{
return vector<Bigram>();
return std::vector<Formosa::Gramambular::Bigram>();
}
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
const std::vector<Formosa::Gramambular::Unigram> UserPhrasesLM::unigramsForKey(const std::string& key)
{
vector<Unigram> v;
map<const char *, vector<Row> >::const_iterator i = keyRowMap.find(key.c_str());
if (i != keyRowMap.end()) {
for (vector<Row>::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) {
Unigram g;
const Row& r = *ri;
g.keyValue.key = r.key;
g.keyValue.value = r.value;
std::vector<Formosa::Gramambular::Unigram> v;
auto iter = keyRowMap.find(key);
if (iter != keyRowMap.end()) {
const std::vector<Row>& rows = iter->second;
for (const auto& row : rows) {
Formosa::Gramambular::Unigram g;
g.keyValue.key = row.key;
g.keyValue.value = row.value;
g.score = 0.0;
v.push_back(g);
}
@ -230,7 +140,9 @@ const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
return v;
}
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
bool UserPhrasesLM::hasUnigramsForKey(const std::string& key)
{
return keyRowMap.find(key.c_str()) != keyRowMap.end();
return keyRowMap.find(key) != keyRowMap.end();
}
}; // namespace vChewing

View File

@ -4,6 +4,7 @@
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Lukhnos Liu (@lukhnos) @ OpenVanilla
// Weizhong Yang (@zonble) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
@ -28,11 +29,10 @@
// OTHER DEALINGS IN THE SOFTWARE.
//
#ifndef USERPHRASESLM_H
#define USERPHRASESLM_H
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
@ -40,37 +40,28 @@
namespace vChewing {
using namespace Formosa::Gramambular;
class UserPhrasesLM : public LanguageModel
class UserPhrasesLM : public Formosa::Gramambular::LanguageModel
{
public:
UserPhrasesLM();
~UserPhrasesLM();
bool open(const char *path);
void close();
void dump();
virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
virtual const vector<Unigram> unigramsForKey(const string& key);
virtual bool hasUnigramsForKey(const string& key);
virtual const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
virtual const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(const std::string& key);
virtual bool hasUnigramsForKey(const std::string& key);
protected:
struct CStringCmp
{
bool operator()(const char* s1, const char* s2) const
{
return strcmp(s1, s2) < 0;
}
};
struct Row {
const char *key;
const char *value;
Row(std::string_view& k, std::string_view& v) : key(k), value(v) {}
std::string_view key;
std::string_view value;
};
map<const char *, vector<Row>, CStringCmp> keyRowMap;
std::map<std::string_view, std::vector<Row>> keyRowMap;
int fd;
void *data;
size_t length;

View File

@ -0,0 +1,24 @@
# Stand-alone build for the KeyValueBlobReader library and its unit test.
cmake_minimum_required(VERSION 3.17)
project(KeyValueBlobReader)

# The reader uses std::string_view, so C++17 is mandatory; without
# CMAKE_CXX_STANDARD_REQUIRED CMake may silently fall back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)

# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
include(FetchContent)
FetchContent_Declare(
  googletest
  # Specify the commit you depend on and update it regularly.
  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Test target declarations.
# add_test() only registers a test when testing has been enabled for this
# directory, so enable_testing() must be called before it.
enable_testing()
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)

View File

@ -0,0 +1,147 @@
//
// KeyValueBlobReader.cpp
//
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Lukhnos Liu (@lukhnos) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#include "KeyValueBlobReader.h"
namespace vChewing {
// Scans forward for the next "key <blank(s)> value" line.
//
// Returns HAS_PAIR when a pair was read (and stored in *out if out is not
// null), END when the blob is exhausted before another key starts, and ERROR
// when a key is not followed by a blank and a value. Once ERROR has been
// returned, every later call returns ERROR without scanning further.
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
  // Character classes used by the scanner.
  const auto is_line_break = [](char ch) { return ch == '\r' || ch == '\n'; };
  const auto is_blank = [](char ch) { return ch == '\t' || ch == ' '; };
  const auto is_blank_or_line_break = [](char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
  };
  // A content character is anything that is neither a blank nor a line break.
  const auto is_content = [](char ch) {
    return ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r';
  };
  // Records the failure in state_ so that it is reported again on later calls.
  const auto fail = [this]() -> State {
    state_ = State::ERROR;
    return State::ERROR;
  };

  if (state_ == State::ERROR) {
    return state_;
  }

  // Skip blanks, empty lines and whole comment lines until a key starts.
  for (;;) {
    state_ = SkipUntilNot(is_blank_or_line_break);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }

    // Anything other than '#' is the first character of a key.
    if (*current_ != '#') {
      break;
    }

    // Comment line: discard everything up to the end of the line.
    state_ = SkipUntil(is_line_break);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }
  }

  // No need to check whether *current_ is a content character here: the loop
  // above only stops on a character that is neither blank nor a line break.
  const char* const key_start = current_;
  state_ = SkipUntilNot(is_content);
  if (state_ != State::CAN_CONTINUE) {
    return fail();  // The blob ended right after the key.
  }
  const size_t key_size = static_cast<size_t>(current_ - key_start);

  // At least one blank must separate the key from its value.
  if (!is_blank(*current_)) {
    return fail();
  }
  state_ = SkipUntilNot(is_blank);
  if (state_ != State::CAN_CONTINUE) {
    return fail();
  }
  if (!is_content(*current_)) {
    return fail();  // The line ended before any value appeared.
  }

  // The value consists of content characters only; blanks are not allowed.
  // The scan result is deliberately ignored because the value is always
  // emitted, and stopping at the first blank keeps trailing spaces out of it.
  const char* const value_start = current_;
  SkipUntilNot(is_content);
  const size_t value_size = static_cast<size_t>(current_ - value_start);

  // Unconditionally drop the rest of the line, so that in "foo bar baz\n" the
  // trailing "baz" is not mistaken for the next key.
  SkipUntil(is_line_break);

  if (out != nullptr) {
    *out = KeyValue(std::string_view(key_start, key_size),
                    std::string_view(value_start, value_size));
  }

  state_ = State::HAS_PAIR;
  return state_;
}
// Advances current_ past every character for which `f` returns true.
// Returns CAN_CONTINUE with current_ on the first character rejected by `f`,
// or END if the end of the blob or a NUL byte is reached first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    const bool keep_skipping = f(*current_);
    if (!keep_skipping) {
      return State::CAN_CONTINUE;
    }
  }
  return State::END;
}
// Advances current_ until `f` returns true for the character it points at.
// Returns CAN_CONTINUE with current_ on that character, or END if the end of
// the blob or a NUL byte is reached first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    const bool found = f(*current_);
    if (found) {
      return State::CAN_CONTINUE;
    }
  }
  return State::END;
}
// Writes a pair as "(key: <key>, value: <value>)" and returns the stream so
// that insertions can be chained.
std::ostream& operator<<(std::ostream& os,
                         const KeyValueBlobReader::KeyValue& kv) {
  os << "(key: " << kv.key;
  os << ", value: " << kv.value;
  return os << ")";
}
} // namespace vChewing

View File

@ -0,0 +1,101 @@
//
// KeyValueBlobReader.h
//
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Lukhnos Liu (@lukhnos) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#ifndef SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
#define SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
#include <cstddef>
#include <functional>
#include <iostream>
#include <string_view>
// A reader for text-based, blank-separated key-value pairs in a binary blob.
//
// This reader is suitable for reading language model files that entirely
// consist of key-value pairs. Leading or trailing spaces are ignored.
// Lines that start with "#" are treated as comments. Values cannot contain
// blanks. Everything between the end of the value and the end of the line is
// ignored, so any text that follows the value after a blank can serve as a
// comment. Both ' ' and '\t' are treated as blank characters, and the parser
// is agnostic to how lines are ended: LF, CR LF, and CR are all valid line
// endings.
//
// std::string_view is used to allow returning results efficiently. As a result,
// the blob is a const char* and will never be mutated. This implies, for
// example, read-only mmap can be used to parse large files.
namespace vChewing {
// Forward-only reader that yields one key-value pair per Next() call from a
// caller-owned character buffer. The buffer is only read through const char
// pointers; the string_views handed out point into it, so it must outlive
// them.
class KeyValueBlobReader {
 public:
  // Outcome of a Next() call.
  enum class State : int {
    // There are no more key-value pairs in this blob.
    END = 0,
    // The reader has produced a new key-value pair.
    HAS_PAIR = 1,
    // An error is encountered and the parsing stopped.
    ERROR = -1,
    // Internal-only state: the parser can continue parsing.
    CAN_CONTINUE = 2
  };

  // One parsed pair. Both views are empty (but non-null) by default.
  struct KeyValue {
    constexpr KeyValue() = default;
    constexpr KeyValue(std::string_view k, std::string_view v)
        : key{k}, value{v} {}

    // Two pairs are equal when both their keys and their values match.
    bool operator==(const KeyValue& another) const {
      return !(key != another.key || value != another.value);
    }

    std::string_view key{""};
    std::string_view value{""};
  };

  // Creates a reader over the `size` bytes starting at `blob`.
  KeyValueBlobReader(const char* blob, size_t size)
      : current_{blob}, end_{blob + size} {}

  // Parse the next key-value pair and return the state of the reader. If `out`
  // is passed, out will be set to the produced key-value pair if there is one.
  State Next(KeyValue* out = nullptr);

 private:
  // Move current_ forward until `f` accepts a character.
  State SkipUntil(const std::function<bool(char)>& f);
  // Move current_ forward until `f` rejects a character.
  State SkipUntilNot(const std::function<bool(char)>& f);

  // Next unread character of the blob.
  const char* current_;
  // One past the last character of the blob.
  const char* end_;
  // State produced by the most recent scanning step.
  State state_ = State::CAN_CONTINUE;
};
std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&);
} // namespace vChewing
#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_

View File

@ -15,6 +15,8 @@
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */; };
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */; };
5BC2D2872793B434002C0BEC /* CMakeLists.txt in Resources */ = {isa = PBXBuildFile; fileRef = 5BC2D2852793B434002C0BEC /* CMakeLists.txt */; };
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */; };
5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */; };
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */; };
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */; };
@ -100,6 +102,9 @@
5B9781D82763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
5B9781D92763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hans"; path = "zh-Hans.lproj/MainMenu.xib"; sourceTree = "<group>"; };
5BA923AC2791B7C20001323A /* vChewingInstaller-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "vChewingInstaller-Bridging-Header.h"; sourceTree = "<group>"; };
5BC2D2842793B434002C0BEC /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
5BC2D2852793B434002C0BEC /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = "<group>"; };
5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VerticalCandidateController.swift; sourceTree = "<group>"; };
5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VTCandidateController.swift; sourceTree = "<group>"; };
5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = HorizontalCandidateController.swift; sourceTree = "<group>"; };
@ -257,6 +262,16 @@
path = LanguageModel;
sourceTree = "<group>";
};
5BC2D2832793B434002C0BEC /* vChewing */ = {
isa = PBXGroup;
children = (
5BC2D2842793B434002C0BEC /* KeyValueBlobReader.h */,
5BC2D2852793B434002C0BEC /* CMakeLists.txt */,
5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */,
);
path = vChewing;
sourceTree = "<group>";
};
5BE798A12792E50F00337FF9 /* UI */ = {
isa = PBXGroup;
children = (
@ -365,6 +380,7 @@
6A0D4F1215FC0EB100ABF4B3 /* Engine */ = {
isa = PBXGroup;
children = (
5BC2D2832793B434002C0BEC /* vChewing */,
5BA8DAFE27928120009C9FFF /* LanguageModel */,
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
@ -626,6 +642,7 @@
5BF4A70027844DC5007DC6E7 /* frmAboutWindow.xib in Resources */,
5BC3FB83278492DE0022E99A /* data-chs.txt in Resources */,
5B000FC4278495AD004F02AC /* SimpBopomofo@2x.tiff in Resources */,
5BC2D2872793B434002C0BEC /* CMakeLists.txt in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -677,6 +694,7 @@
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
5BF4A6FE27844738007DC6E7 /* frmAboutWindow.m in Sources */,
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */,
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */,
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */,
5BE798A42792E58A00337FF9 /* TooltipController.swift in Sources */,
5BDF2D062791DFF200838ADB /* AppDelegate.swift in Sources */,