vChewing-macOS/Source/Engine/ParselessPhraseDB.cpp

167 lines
4.4 KiB
C++

// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "ParselessPhraseDB.h"
#include <cassert>
#include <cstring>
namespace McBopomofo {
ParselessPhraseDB::ParselessPhraseDB(
const char* buf, size_t length, bool validate_pragma)
: begin_(buf)
, end_(buf + length)
{
assert(buf != nullptr);
assert(length > 0);
if (validate_pragma) {
assert(length > SORTED_PRAGMA_HEADER.length());
std::string_view header(buf, SORTED_PRAGMA_HEADER.length());
assert(header == SORTED_PRAGMA_HEADER);
uint32_t x = 5381;
for (const auto& i : header) {
x = x * 33 + i;
}
assert(x == uint32_t { 3012373384 });
begin_ += header.length();
}
}
std::vector<std::string_view> ParselessPhraseDB::findRows(
const std::string_view& key)
{
std::vector<std::string_view> rows;
const char* ptr = findFirstMatchingLine(key);
if (ptr == nullptr) {
return rows;
}
while (ptr + key.length() <= end_
&& memcmp(ptr, key.data(), key.length()) == 0) {
const char* eol = ptr;
while (eol != end_ && *eol != '\n') {
++eol;
}
rows.emplace_back(ptr, eol - ptr);
if (eol == end_) {
break;
}
ptr = ++eol;
}
return rows;
}
// Implements a binary search that returns the pointer to the first matching
// row. In its core it's just a standard binary search, but we use backtracking
// to locate the line start. We also check the previous line to see if the
// current line is actually the first matching line: if the previous line is
// less to the key and the current line starts exactly with the key, then
// the current line is the first matching line.
const char* ParselessPhraseDB::findFirstMatchingLine(
const std::string_view& key)
{
if (key.empty()) {
return begin_;
}
const char* top = begin_;
const char* bottom = end_;
while (top < bottom) {
const char* mid = top + (bottom - top) / 2;
const char* ptr = mid;
if (ptr != begin_) {
--ptr;
}
while (ptr != begin_ && *ptr != '\n') {
--ptr;
}
const char* prev = nullptr;
if (*ptr == '\n') {
prev = ptr;
++ptr;
}
// ptr is now in the "current" line we're interested in.
if (ptr + key.length() > end_) {
// not enough data to compare at this point, bail.
break;
}
int current_cmp = memcmp(ptr, key.data(), key.length());
if (current_cmp > 0) {
bottom = mid - 1;
continue;
}
if (current_cmp < 0) {
top = mid + 1;
continue;
}
if (!prev) {
return ptr;
}
// Move the prev so that it reaches the previous line.
if (prev != begin_) {
--prev;
}
while (prev != begin_ && *prev != '\n') {
--prev;
}
if (*prev == '\n') {
++prev;
}
int prev_cmp = memcmp(prev, key.data(), key.length());
// This is the first occurrence.
if (prev_cmp < 0 && current_cmp == 0) {
return ptr;
}
// This is not, which means ptr is "larger" than the keyData.
bottom = mid - 1;
}
return nullptr;
}
}; // namespace McBopomofo