Lukhnos: Use a more tolerant parser for user phrases
This commit is contained in:
parent
9944d4ce9b
commit
5f0a0bad6f
|
@ -4,6 +4,7 @@
|
||||||
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
||||||
//
|
//
|
||||||
// Contributors:
|
// Contributors:
|
||||||
|
// Lukhnos Liu (@lukhnos) @ OpenVanilla
|
||||||
// Weizhong Yang (@zonble) @ OpenVanilla
|
// Weizhong Yang (@zonble) @ OpenVanilla
|
||||||
//
|
//
|
||||||
// Permission is hereby granted, free of charge, to any person
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
@ -29,14 +30,16 @@
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "UserPhrasesLM.h"
|
#include "UserPhrasesLM.h"
|
||||||
|
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
using namespace Formosa::Gramambular;
|
#include "KeyValueBlobReader.h"
|
||||||
using namespace vChewing;
|
|
||||||
|
namespace vChewing {
|
||||||
|
|
||||||
UserPhrasesLM::UserPhrasesLM()
|
UserPhrasesLM::UserPhrasesLM()
|
||||||
: fd(-1)
|
: fd(-1)
|
||||||
|
@ -72,113 +75,24 @@ bool UserPhrasesLM::open(const char *path)
|
||||||
|
|
||||||
length = (size_t)sb.st_size;
|
length = (size_t)sb.st_size;
|
||||||
|
|
||||||
data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0);
|
data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
::close(fd);
|
::close(fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *head = (char *)data;
|
KeyValueBlobReader reader(static_cast<char*>(data), length);
|
||||||
char *end = (char *)data + length;
|
KeyValueBlobReader::KeyValue keyValue;
|
||||||
char c;
|
KeyValueBlobReader::State state;
|
||||||
Row row;
|
while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
|
||||||
|
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
|
||||||
start:
|
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key );
|
||||||
// EOF -> end
|
|
||||||
if (head == end) {
|
|
||||||
goto end;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
c = *head;
|
if (state == KeyValueBlobReader::State::ERROR) {
|
||||||
// \s -> error
|
close();
|
||||||
if (c == ' ') {
|
return false;
|
||||||
goto error;
|
|
||||||
}
|
}
|
||||||
// \n -> start
|
|
||||||
else if (c == '\n') {
|
|
||||||
head++;
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> record column star, state1
|
|
||||||
row.value = head;
|
|
||||||
head++;
|
|
||||||
// fall through to state 1
|
|
||||||
|
|
||||||
state1:
|
|
||||||
// EOF -> error
|
|
||||||
if (head == end) {
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
// \n -> error
|
|
||||||
if (c == '\n') {
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
// \s -> state2 + zero out ending + record column start
|
|
||||||
else if (c == ' ') {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
row.key = head;
|
|
||||||
goto state2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> state1
|
|
||||||
head++;
|
|
||||||
goto state1;
|
|
||||||
|
|
||||||
state2:
|
|
||||||
if (head == end) {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
goto end;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
// \s -> error
|
|
||||||
if (c == ' ' || c == '\n') {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
if (c == ' ') {
|
|
||||||
goto state3;
|
|
||||||
}
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
// \w -> state 2
|
|
||||||
head++;
|
|
||||||
goto state2;
|
|
||||||
|
|
||||||
state3:
|
|
||||||
if (head == end) {
|
|
||||||
*head = 0;
|
|
||||||
head++;
|
|
||||||
keyRowMap[row.key].push_back(row);
|
|
||||||
goto end;
|
|
||||||
}
|
|
||||||
|
|
||||||
c = *head;
|
|
||||||
if (c == '\n') {
|
|
||||||
goto start;
|
|
||||||
}
|
|
||||||
|
|
||||||
head++;
|
|
||||||
goto state3;
|
|
||||||
|
|
||||||
error:
|
|
||||||
close();
|
|
||||||
return false;
|
|
||||||
|
|
||||||
end:
|
|
||||||
static const char *space = " ";
|
|
||||||
Row emptyRow;
|
|
||||||
emptyRow.key = space;
|
|
||||||
emptyRow.value = space;
|
|
||||||
keyRowMap[space].push_back(emptyRow);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -195,33 +109,29 @@ void UserPhrasesLM::close()
|
||||||
|
|
||||||
void UserPhrasesLM::dump()
|
void UserPhrasesLM::dump()
|
||||||
{
|
{
|
||||||
size_t rows = 0;
|
for (const auto& entry : keyRowMap) {
|
||||||
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
|
const std::vector<Row>& rows = entry.second;
|
||||||
const vector<Row>& r = (*i).second;
|
for (const auto& row : rows) {
|
||||||
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
|
std::cerr << row.key << " " << row.value << "\n";
|
||||||
const Row& row = *ri;
|
|
||||||
cerr << row.key << " " << row.value << "\n";
|
|
||||||
rows++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
|
const std::vector<Formosa::Gramambular::Bigram> UserPhrasesLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
|
||||||
{
|
{
|
||||||
return vector<Bigram>();
|
return std::vector<Formosa::Gramambular::Bigram>();
|
||||||
}
|
}
|
||||||
|
|
||||||
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
|
const std::vector<Formosa::Gramambular::Unigram> UserPhrasesLM::unigramsForKey(const std::string& key)
|
||||||
{
|
{
|
||||||
vector<Unigram> v;
|
std::vector<Formosa::Gramambular::Unigram> v;
|
||||||
map<const char *, vector<Row> >::const_iterator i = keyRowMap.find(key.c_str());
|
auto iter = keyRowMap.find(key);
|
||||||
|
if (iter != keyRowMap.end()) {
|
||||||
if (i != keyRowMap.end()) {
|
const std::vector<Row>& rows = iter->second;
|
||||||
for (vector<Row>::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) {
|
for (const auto& row : rows) {
|
||||||
Unigram g;
|
Formosa::Gramambular::Unigram g;
|
||||||
const Row& r = *ri;
|
g.keyValue.key = row.key;
|
||||||
g.keyValue.key = r.key;
|
g.keyValue.value = row.value;
|
||||||
g.keyValue.value = r.value;
|
|
||||||
g.score = 0.0;
|
g.score = 0.0;
|
||||||
v.push_back(g);
|
v.push_back(g);
|
||||||
}
|
}
|
||||||
|
@ -230,7 +140,9 @@ const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
|
bool UserPhrasesLM::hasUnigramsForKey(const std::string& key)
|
||||||
{
|
{
|
||||||
return keyRowMap.find(key.c_str()) != keyRowMap.end();
|
return keyRowMap.find(key) != keyRowMap.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}; // namespace vChewing
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
||||||
//
|
//
|
||||||
// Contributors:
|
// Contributors:
|
||||||
|
// Lukhnos Liu (@lukhnos) @ OpenVanilla
|
||||||
// Weizhong Yang (@zonble) @ OpenVanilla
|
// Weizhong Yang (@zonble) @ OpenVanilla
|
||||||
//
|
//
|
||||||
// Permission is hereby granted, free of charge, to any person
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
@ -28,11 +29,10 @@
|
||||||
// OTHER DEALINGS IN THE SOFTWARE.
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
||||||
#ifndef USERPHRASESLM_H
|
#ifndef USERPHRASESLM_H
|
||||||
#define USERPHRASESLM_H
|
#define USERPHRASESLM_H
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -40,37 +40,28 @@
|
||||||
|
|
||||||
namespace vChewing {
|
namespace vChewing {
|
||||||
|
|
||||||
using namespace Formosa::Gramambular;
|
class UserPhrasesLM : public Formosa::Gramambular::LanguageModel
|
||||||
|
|
||||||
class UserPhrasesLM : public LanguageModel
|
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
UserPhrasesLM();
|
UserPhrasesLM();
|
||||||
~UserPhrasesLM();
|
~UserPhrasesLM();
|
||||||
|
|
||||||
bool open(const char *path);
|
bool open(const char *path);
|
||||||
void close();
|
void close();
|
||||||
void dump();
|
void dump();
|
||||||
|
|
||||||
virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
|
virtual const std::vector<Formosa::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
|
||||||
virtual const vector<Unigram> unigramsForKey(const string& key);
|
virtual const std::vector<Formosa::Gramambular::Unigram> unigramsForKey(const std::string& key);
|
||||||
virtual bool hasUnigramsForKey(const string& key);
|
virtual bool hasUnigramsForKey(const std::string& key);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
struct CStringCmp
|
|
||||||
{
|
|
||||||
bool operator()(const char* s1, const char* s2) const
|
|
||||||
{
|
|
||||||
return strcmp(s1, s2) < 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Row {
|
struct Row {
|
||||||
const char *key;
|
Row(std::string_view& k, std::string_view& v) : key(k), value(v) {}
|
||||||
const char *value;
|
std::string_view key;
|
||||||
|
std::string_view value;
|
||||||
};
|
};
|
||||||
|
|
||||||
map<const char *, vector<Row>, CStringCmp> keyRowMap;
|
std::map<std::string_view, std::vector<Row>> keyRowMap;
|
||||||
int fd;
|
int fd;
|
||||||
void *data;
|
void *data;
|
||||||
size_t length;
|
size_t length;
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Build script for the KeyValueBlobReader library and its unit test.
cmake_minimum_required(VERSION 3.17)
project(KeyValueBlobReader)

# The reader relies on std::string_view, so C++17 is a hard requirement;
# without *_REQUIRED, CMake may silently fall back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_library(KeyValueBlobReader KeyValueBlobReader.cpp KeyValueBlobReader.h)

# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
include(FetchContent)

FetchContent_Declare(
  googletest
  # Specify the commit you depend on and update it regularly.
  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# add_test() only registers tests with CTest after enable_testing() has been
# invoked; without it `ctest` reports that no tests were found.
enable_testing()

# Test target declarations.
add_executable(KeyValueBlobReadTest KeyValueBlobReaderTest.cpp)
target_link_libraries(KeyValueBlobReadTest gtest_main KeyValueBlobReader)
add_test(NAME KeyValueBlobReadTest COMMAND KeyValueBlobReadTest)
|
|
@ -0,0 +1,147 @@
|
||||||
|
//
|
||||||
|
// KeyValueBlobReader.cpp
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
||||||
|
//
|
||||||
|
// Contributors:
|
||||||
|
// Lukhnos Liu (@lukhnos) @ OpenVanilla
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "KeyValueBlobReader.h"
|
||||||
|
|
||||||
|
namespace vChewing {
|
||||||
|
|
||||||
|
// Parses the next "key<blank>value" pair from the blob.
//
// Returns HAS_PAIR when a pair was read (and stored in *out if out is not
// null), END when no further pairs exist, and ERROR when the input is
// malformed. Once ERROR has been returned, every later call returns ERROR
// without consuming more input.
KeyValueBlobReader::State KeyValueBlobReader::Next(KeyValue* out) {
  // Character classes used by the scanner. Content is anything that is
  // neither a blank nor a line terminator.
  const auto is_newline = [](char c) { return c == '\n' || c == '\r'; };
  const auto is_blank = [](char c) { return c == ' ' || c == '\t'; };
  const auto is_blank_or_newline = [](char c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
  };
  const auto is_content = [](char c) {
    return !(c == ' ' || c == '\t' || c == '\n' || c == '\r');
  };

  // A previous error is sticky.
  if (state_ == State::ERROR) {
    return state_;
  }

  // Skip leading whitespace, empty lines and whole comment lines until the
  // first character of a key is found or the input is exhausted.
  for (;;) {
    state_ = SkipUntilNot(is_blank_or_newline);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }

    if (*current_ != '#') {
      break;
    }

    // Comment line: discard everything up to the line terminator.
    state_ = SkipUntil(is_newline);
    if (state_ != State::CAN_CONTINUE) {
      return state_;
    }
  }

  // There is no need to check whether *current_ is a content character here:
  // the skip above stopped on something that is neither blank nor newline,
  // which is exactly the definition of a content character.
  const char* const key_begin = current_;
  state_ = SkipUntilNot(is_content);
  if (state_ != State::CAN_CONTINUE) {
    // Input ended right after the key, so there is no value.
    return state_ = State::ERROR;
  }
  const size_t key_length = static_cast<size_t>(current_ - key_begin);

  // The key must be followed by at least one blank character.
  if (!is_blank(*current_)) {
    return state_ = State::ERROR;
  }

  state_ = SkipUntilNot(is_blank);
  if (state_ != State::CAN_CONTINUE) {
    return state_ = State::ERROR;
  }

  // After the blanks the value has to start on the same line.
  if (!is_content(*current_)) {
    return state_ = State::ERROR;
  }

  // The value consists of content characters only; blanks are not allowed.
  // The scan result is intentionally ignored because the value is emitted
  // whether or not the input ends here. Stopping at the first blank also
  // keeps trailing spaces on the line out of the value.
  const char* const value_begin = current_;
  SkipUntilNot(is_content);
  const size_t value_length = static_cast<size_t>(current_ - value_begin);

  // Unconditionally discard the remainder of the line, so that in a line such
  // as "foo bar baz\n" the trailing "baz" is not mistaken for the next key.
  SkipUntil(is_newline);

  if (out != nullptr) {
    *out = KeyValue{std::string_view{key_begin, key_length},
                    std::string_view{value_begin, value_length}};
  }

  state_ = State::HAS_PAIR;
  return state_;
}
|
||||||
|
|
||||||
|
// Advances current_ while f(*current_) holds. Returns CAN_CONTINUE with
// current_ left on the first character that fails f, or END if the end of the
// blob or a NUL character is reached first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntilNot(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    if (!f(*current_)) {
      return State::CAN_CONTINUE;
    }
  }

  return State::END;
}
|
||||||
|
|
||||||
|
// Advances current_ until f(*current_) holds. Returns CAN_CONTINUE with
// current_ left on the first character that satisfies f, or END if the end of
// the blob or a NUL character is reached first.
KeyValueBlobReader::State KeyValueBlobReader::SkipUntil(
    const std::function<bool(char)>& f) {
  for (; current_ != end_ && *current_ != '\0'; ++current_) {
    if (f(*current_)) {
      return State::CAN_CONTINUE;
    }
  }

  return State::END;
}
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& os,
|
||||||
|
const KeyValueBlobReader::KeyValue& kv) {
|
||||||
|
os << "(key: " << kv.key << ", value: " << kv.value << ")";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vChewing
|
|
@ -0,0 +1,101 @@
|
||||||
|
//
|
||||||
|
// KeyValueBlobReader.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011-2022 The OpenVanilla Project.
|
||||||
|
//
|
||||||
|
// Contributors:
|
||||||
|
// Lukhnos Liu (@lukhnos) @ OpenVanilla
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person
|
||||||
|
// obtaining a copy of this software and associated documentation
|
||||||
|
// files (the "Software"), to deal in the Software without
|
||||||
|
// restriction, including without limitation the rights to use,
|
||||||
|
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the
|
||||||
|
// Software is furnished to do so, subject to the following
|
||||||
|
// conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be
|
||||||
|
// included in all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
||||||
|
#define SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string_view>
|
||||||
|
|
||||||
|
// A reader for text-based, blank-separated key-value pairs in a binary blob.
|
||||||
|
//
|
||||||
|
// This reader is suitable for reading language model files that entirely
|
||||||
|
// consist of key-value pairs. Leading or trailing spaces are ignored.
|
||||||
|
// Lines that start with "#" are treated as comments. Values cannot contain
|
||||||
|
// spaces. Any space after the value string is parsed is ignored. This implies
|
||||||
|
// that after a blank, anything that comes after the value can be used as
|
||||||
|
// comment. Both ' ' and '\t' are treated as blank characters, and the parser
|
||||||
|
// is agnostic to how lines are ended, and so LF, CR LF, and CR are all valid
|
||||||
|
// line endings.
|
||||||
|
//
|
||||||
|
// std::string_view is used to allow returning results efficiently. As a result,
|
||||||
|
// the blob is a const char* and will never be mutated. This implies, for
|
||||||
|
// example, read-only mmap can be used to parse large files.
|
||||||
|
namespace vChewing {
|
||||||
|
|
||||||
|
// Incremental reader over a blob of blank-separated key-value text lines.
// The reader only keeps pointers into the blob it is given and hands out
// std::string_view results, so it neither copies nor modifies the blob.
class KeyValueBlobReader {
 public:
  // Result of a call to Next().
  enum class State : int {
    // There are no more key-value pairs in this blob.
    END = 0,
    // The reader has produced a new key-value pair.
    HAS_PAIR = 1,
    // An error is encountered and the parsing stopped.
    ERROR = -1,
    // Internal-only state: the parser can continue parsing.
    CAN_CONTINUE = 2
  };

  // A pair of views; both default to the empty string.
  struct KeyValue {
    constexpr KeyValue() = default;
    constexpr KeyValue(std::string_view k, std::string_view v)
        : key(k), value(v) {}

    // Two pairs are equal when both keys and both values compare equal.
    bool operator==(const KeyValue& another) const {
      return key == another.key && value == another.value;
    }

    std::string_view key{""};
    std::string_view value{""};
  };

  // Creates a reader over the `size` bytes starting at `blob`.
  KeyValueBlobReader(const char* blob, size_t size)
      : current_(blob), end_(blob + size) {}

  // Parse the next key-value pair and return the state of the reader. If `out`
  // is passed, out will be set to the produced key-value pair if there is one.
  State Next(KeyValue* out = nullptr);

 private:
  // Move forward until the predicate becomes true.
  State SkipUntil(const std::function<bool(char)>& f);
  // Move forward until the predicate becomes false.
  State SkipUntilNot(const std::function<bool(char)>& f);

  // Current read position within the blob.
  const char* current_;
  // One past the last byte of the blob.
  const char* end_;
  // State produced by the most recent parsing step.
  State state_ = State::CAN_CONTINUE;
};
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream&, const KeyValueBlobReader::KeyValue&);
|
||||||
|
|
||||||
|
} // namespace vChewing
|
||||||
|
|
||||||
|
#endif // SOURCE_ENGINE_KEYVALUEBLOBREADER_H_
|
|
@ -15,6 +15,8 @@
|
||||||
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */; };
|
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */; };
|
||||||
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
|
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
|
||||||
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */; };
|
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */; };
|
||||||
|
5BC2D2872793B434002C0BEC /* CMakeLists.txt in Resources */ = {isa = PBXBuildFile; fileRef = 5BC2D2852793B434002C0BEC /* CMakeLists.txt */; };
|
||||||
|
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */; };
|
||||||
5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */; };
|
5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */; };
|
||||||
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */; };
|
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */; };
|
||||||
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */; };
|
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */; };
|
||||||
|
@ -100,6 +102,9 @@
|
||||||
5B9781D82763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
|
5B9781D82763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
|
||||||
5B9781D92763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hans"; path = "zh-Hans.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
5B9781D92763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hans"; path = "zh-Hans.lproj/MainMenu.xib"; sourceTree = "<group>"; };
|
||||||
5BA923AC2791B7C20001323A /* vChewingInstaller-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "vChewingInstaller-Bridging-Header.h"; sourceTree = "<group>"; };
|
5BA923AC2791B7C20001323A /* vChewingInstaller-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "vChewingInstaller-Bridging-Header.h"; sourceTree = "<group>"; };
|
||||||
|
5BC2D2842793B434002C0BEC /* KeyValueBlobReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KeyValueBlobReader.h; sourceTree = "<group>"; };
|
||||||
|
5BC2D2852793B434002C0BEC /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = "<group>"; };
|
||||||
|
5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KeyValueBlobReader.cpp; sourceTree = "<group>"; };
|
||||||
5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VerticalCandidateController.swift; sourceTree = "<group>"; };
|
5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VerticalCandidateController.swift; sourceTree = "<group>"; };
|
||||||
5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VTCandidateController.swift; sourceTree = "<group>"; };
|
5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VTCandidateController.swift; sourceTree = "<group>"; };
|
||||||
5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = HorizontalCandidateController.swift; sourceTree = "<group>"; };
|
5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = HorizontalCandidateController.swift; sourceTree = "<group>"; };
|
||||||
|
@ -257,6 +262,16 @@
|
||||||
path = LanguageModel;
|
path = LanguageModel;
|
||||||
sourceTree = "<group>";
|
sourceTree = "<group>";
|
||||||
};
|
};
|
||||||
|
5BC2D2832793B434002C0BEC /* vChewing */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
5BC2D2842793B434002C0BEC /* KeyValueBlobReader.h */,
|
||||||
|
5BC2D2852793B434002C0BEC /* CMakeLists.txt */,
|
||||||
|
5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */,
|
||||||
|
);
|
||||||
|
path = vChewing;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
5BE798A12792E50F00337FF9 /* UI */ = {
|
5BE798A12792E50F00337FF9 /* UI */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
@ -365,6 +380,7 @@
|
||||||
6A0D4F1215FC0EB100ABF4B3 /* Engine */ = {
|
6A0D4F1215FC0EB100ABF4B3 /* Engine */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
5BC2D2832793B434002C0BEC /* vChewing */,
|
||||||
5BA8DAFE27928120009C9FFF /* LanguageModel */,
|
5BA8DAFE27928120009C9FFF /* LanguageModel */,
|
||||||
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
|
6A0D4F1315FC0EB100ABF4B3 /* Gramambular */,
|
||||||
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
|
6A0D4F1F15FC0EB100ABF4B3 /* Mandarin */,
|
||||||
|
@ -626,6 +642,7 @@
|
||||||
5BF4A70027844DC5007DC6E7 /* frmAboutWindow.xib in Resources */,
|
5BF4A70027844DC5007DC6E7 /* frmAboutWindow.xib in Resources */,
|
||||||
5BC3FB83278492DE0022E99A /* data-chs.txt in Resources */,
|
5BC3FB83278492DE0022E99A /* data-chs.txt in Resources */,
|
||||||
5B000FC4278495AD004F02AC /* SimpBopomofo@2x.tiff in Resources */,
|
5B000FC4278495AD004F02AC /* SimpBopomofo@2x.tiff in Resources */,
|
||||||
|
5BC2D2872793B434002C0BEC /* CMakeLists.txt in Resources */,
|
||||||
);
|
);
|
||||||
runOnlyForDeploymentPostprocessing = 0;
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
};
|
};
|
||||||
|
@ -677,6 +694,7 @@
|
||||||
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
|
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
|
||||||
5BF4A6FE27844738007DC6E7 /* frmAboutWindow.m in Sources */,
|
5BF4A6FE27844738007DC6E7 /* frmAboutWindow.m in Sources */,
|
||||||
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */,
|
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */,
|
||||||
|
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */,
|
||||||
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */,
|
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */,
|
||||||
5BE798A42792E58A00337FF9 /* TooltipController.swift in Sources */,
|
5BE798A42792E58A00337FF9 /* TooltipController.swift in Sources */,
|
||||||
5BDF2D062791DFF200838ADB /* AppDelegate.swift in Sources */,
|
5BDF2D062791DFF200838ADB /* AppDelegate.swift in Sources */,
|
||||||
|
|
Loading…
Reference in New Issue