152 lines
4.1 KiB
C++
152 lines
4.1 KiB
C++
// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
|
|
#include "ParselessLM.h"
|
|
|
|
#include <fcntl.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
|
|
#include <memory>
|
|
|
|
// Destructor: delegates to close() so that whatever close() releases is
// released when the language model object goes away.
McBopomofo::ParselessLM::~ParselessLM()
{
    this->close();
}
|
|
|
|
bool McBopomofo::ParselessLM::isLoaded()
|
|
{
|
|
if (data_) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool McBopomofo::ParselessLM::open(const std::string_view& path)
|
|
{
|
|
if (data_) {
|
|
return false;
|
|
}
|
|
|
|
fd_ = ::open(path.data(), O_RDONLY);
|
|
if (fd_ == -1) {
|
|
return false;
|
|
}
|
|
|
|
struct stat sb;
|
|
if (fstat(fd_, &sb) == -1) {
|
|
::close(fd_);
|
|
fd_ = -1;
|
|
return false;
|
|
}
|
|
|
|
length_ = static_cast<size_t>(sb.st_size);
|
|
|
|
data_ = mmap(NULL, length_, PROT_READ, MAP_SHARED, fd_, 0);
|
|
if (data_ == nullptr) {
|
|
::close(fd_);
|
|
fd_ = -1;
|
|
length_ = 0;
|
|
return false;
|
|
}
|
|
|
|
db_ = std::unique_ptr<ParselessPhraseDB>(new ParselessPhraseDB(
|
|
static_cast<char*>(data_), length_, /*validate_pragme=*/
|
|
true));
|
|
return true;
|
|
}
|
|
|
|
void McBopomofo::ParselessLM::close()
|
|
{
|
|
if (data_ != nullptr) {
|
|
munmap(data_, length_);
|
|
::close(fd_);
|
|
fd_ = -1;
|
|
length_ = 0;
|
|
data_ = nullptr;
|
|
}
|
|
}
|
|
|
|
const std::vector<Formosa::Gramambular::Bigram>
|
|
McBopomofo::ParselessLM::bigramsForKeys(
|
|
const std::string& preceedingKey, const std::string& key)
|
|
{
|
|
return std::vector<Formosa::Gramambular::Bigram>();
|
|
}
|
|
|
|
const std::vector<Formosa::Gramambular::Unigram>
|
|
McBopomofo::ParselessLM::unigramsForKey(const std::string& key)
|
|
{
|
|
if (db_ == nullptr) {
|
|
return std::vector<Formosa::Gramambular::Unigram>();
|
|
}
|
|
|
|
std::vector<Formosa::Gramambular::Unigram> results;
|
|
for (const auto& row : db_->findRows(key + " ")) {
|
|
Formosa::Gramambular::Unigram unigram;
|
|
|
|
// Move ahead until we encounter the first space. This is the key.
|
|
auto it = row.begin();
|
|
while (it != row.end() && *it != ' ') {
|
|
++it;
|
|
}
|
|
|
|
unigram.keyValue.key = std::string(row.begin(), it);
|
|
|
|
// Read past the space.
|
|
if (it != row.end()) {
|
|
++it;
|
|
}
|
|
|
|
if (it != row.end()) {
|
|
// Now it is the start of the value portion.
|
|
auto value_begin = it;
|
|
|
|
// Move ahead until we encounter the second space. This is the
|
|
// value.
|
|
while (it != row.end() && *it != ' ') {
|
|
++it;
|
|
}
|
|
unigram.keyValue.value = std::string(value_begin, it);
|
|
}
|
|
|
|
// Read past the space. The remainder, if it exists, is the score.
|
|
if (it != row.end()) {
|
|
++it;
|
|
}
|
|
|
|
if (it != row.end()) {
|
|
unigram.score = std::stod(std::string(it, row.end()));
|
|
}
|
|
results.push_back(unigram);
|
|
}
|
|
return results;
|
|
}
|
|
|
|
bool McBopomofo::ParselessLM::hasUnigramsForKey(const std::string& key)
|
|
{
|
|
if (db_ == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
return db_->findFirstMatchingLine(key + " ") != nullptr;
|
|
}
|