vChewing-macOS/Source/Engine/McBopomofoLM.cpp

187 lines
5.7 KiB
C++

// Copyright (c) 2022 and onwards The McBopomofo Authors.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "McBopomofoLM.h"
#include <algorithm>
#include <iterator>
using namespace McBopomofo;
McBopomofoLM::McBopomofoLM()
{
}
McBopomofoLM::~McBopomofoLM()
{
m_languageModel.close();
m_userPhrases.close();
m_excludedPhrases.close();
m_phraseReplacement.close();
}
void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath)
{
if (languageModelDataPath) {
m_languageModel.close();
m_languageModel.open(languageModelDataPath);
}
}
void McBopomofoLM::loadUserPhrases(const char* userPhrasesDataPath,
const char* excludedPhrasesDataPath)
{
if (userPhrasesDataPath) {
m_userPhrases.close();
m_userPhrases.open(userPhrasesDataPath);
}
if (excludedPhrasesDataPath) {
m_excludedPhrases.close();
m_excludedPhrases.open(excludedPhrasesDataPath);
}
}
void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath)
{
if (phraseReplacementPath) {
m_phraseReplacement.close();
m_phraseReplacement.open(phraseReplacementPath);
}
}
const vector<Bigram> McBopomofoLM::bigramsForKeys(const string& preceedingKey, const string& key)
{
return vector<Bigram>();
}
const vector<Unigram> McBopomofoLM::unigramsForKey(const string& key)
{
if (key == " ") {
vector<Unigram> spaceUnigrams;
Unigram g;
g.keyValue.key = " ";
g.keyValue.value= " ";
g.score = 0;
spaceUnigrams.push_back(g);
return spaceUnigrams;
}
vector<Unigram> allUnigrams;
vector<Unigram> userUnigrams;
unordered_set<string> excludedValues;
unordered_set<string> insertedValues;
if (m_excludedPhrases.hasUnigramsForKey(key)) {
vector<Unigram> excludedUnigrams = m_excludedPhrases.unigramsForKey(key);
transform(excludedUnigrams.begin(), excludedUnigrams.end(),
inserter(excludedValues, excludedValues.end()),
[](const Unigram& u) { return u.keyValue.value; });
}
if (m_userPhrases.hasUnigramsForKey(key)) {
vector<Unigram> rawUserUnigrams = m_userPhrases.unigramsForKey(key);
userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues);
}
if (m_languageModel.hasUnigramsForKey(key)) {
vector<Unigram> rawGlobalUnigrams = m_languageModel.unigramsForKey(key);
allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues);
}
allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end());
return allUnigrams;
}
bool McBopomofoLM::hasUnigramsForKey(const string& key)
{
if (key == " ") {
return true;
}
if (!m_excludedPhrases.hasUnigramsForKey(key)) {
return m_userPhrases.hasUnigramsForKey(key) || m_languageModel.hasUnigramsForKey(key);
}
return unigramsForKey(key).size() > 0;
}
void McBopomofoLM::setPhraseReplacementEnabled(bool enabled)
{
m_phraseReplacementEnabled = enabled;
}
bool McBopomofoLM::phraseReplacementEnabled()
{
return m_phraseReplacementEnabled;
}
void McBopomofoLM::setExternalConverterEnabled(bool enabled)
{
m_externalConverterEnabled = enabled;
}
bool McBopomofoLM::externalConverterEnabled()
{
return m_externalConverterEnabled;
}
void McBopomofoLM::setExternalConverter(std::function<string(string)> externalConverter)
{
m_externalConverter = externalConverter;
}
const vector<Unigram> McBopomofoLM::filterAndTransformUnigrams(const vector<Unigram> unigrams, const unordered_set<string>& excludedValues, unordered_set<string>& insertedValues)
{
vector<Unigram> results;
for (auto&& unigram : unigrams) {
// excludedValues filters out the unigrams with the original value.
// insertedValues filters out the ones with the converted value
string originalValue = unigram.keyValue.value;
if (excludedValues.find(originalValue) != excludedValues.end()) {
continue;
}
string value = originalValue;
if (m_phraseReplacementEnabled) {
string replacement = m_phraseReplacement.valueForKey(value);
if (replacement != "") {
value = replacement;
}
}
if (m_externalConverterEnabled && m_externalConverter) {
string replacement = m_externalConverter(value);
value = replacement;
}
if (insertedValues.find(value) == insertedValues.end()) {
Unigram g;
g.keyValue.value = value;
g.keyValue.key = unigram.keyValue.key;
g.score = unigram.score;
results.push_back(g);
insertedValues.insert(value);
}
}
return results;
}