From b348a057359e3bd13825f973a567f3fd04f3f70e Mon Sep 17 00:00:00 2001 From: zonble Date: Sat, 15 Jan 2022 18:23:52 +0800 Subject: [PATCH] Filters duplicated unigram values properly. --- Source/Engine/McBopomofoLM.cpp | 84 +++++++++++++--------------------- Source/Engine/McBopomofoLM.h | 10 ++-- 2 files changed, 40 insertions(+), 54 deletions(-) diff --git a/Source/Engine/McBopomofoLM.cpp b/Source/Engine/McBopomofoLM.cpp index ea85c2dc..3fba9fc9 100644 --- a/Source/Engine/McBopomofoLM.cpp +++ b/Source/Engine/McBopomofoLM.cpp @@ -24,7 +24,6 @@ #include "McBopomofoLM.h" #include #include -#include using namespace McBopomofo; @@ -49,7 +48,7 @@ void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath) } void McBopomofoLM::loadUserPhrases(const char* userPhrasesDataPath, - const char* excludedPhrasesDataPath) + const char* excludedPhrasesDataPath) { if (userPhrasesDataPath) { m_userPhrases.close(); @@ -61,7 +60,8 @@ void McBopomofoLM::loadUserPhrases(const char* userPhrasesDataPath, } } -void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath) { +void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath) +{ if (phraseReplacementPath) { m_phraseReplacement.close(); m_phraseReplacement.open(phraseReplacementPath); @@ -75,75 +75,37 @@ const vector McBopomofoLM::bigramsForKeys(const string& preceedingKey, c const vector McBopomofoLM::unigramsForKey(const string& key) { - vector unigrams; + vector allUnigrams; vector userUnigrams; - // Use unordered_set so that you don't have to do O(n*m) unordered_set excludedValues; - unordered_set userValues; + unordered_set insertedValues; if (m_excludedPhrases.hasUnigramsForKey(key)) { vector excludedUnigrams = m_excludedPhrases.unigramsForKey(key); transform(excludedUnigrams.begin(), excludedUnigrams.end(), - inserter(excludedValues, excludedValues.end()), - [](const Unigram &u) { return u.keyValue.value; }); + inserter(excludedValues, excludedValues.end()), + [](const Unigram& u) { return u.keyValue.value; }); } if (m_userPhrases.hasUnigramsForKey(key)) { vector rawUserUnigrams = m_userPhrases.unigramsForKey(key); - vector filterredUserUnigrams = m_userPhrases.unigramsForKey(key); - - for (auto&& unigram : rawUserUnigrams) { - if (excludedValues.find(unigram.keyValue.value) == excludedValues.end()) { - filterredUserUnigrams.push_back(unigram); - } - } - - transform(filterredUserUnigrams.begin(), filterredUserUnigrams.end(), - inserter(userValues, userValues.end()), - [](const Unigram &u) { return u.keyValue.value; }); - - if (m_phraseReplacementEnabled) { - for (auto&& unigram : filterredUserUnigrams) { - string value = unigram.keyValue.value; - string replacement = m_phraseReplacement.valueForKey(value); - if (replacement != "") { - unigram.keyValue.value = replacement; - } - unigrams.push_back(unigram); - } - } else { - unigrams = filterredUserUnigrams; - } + userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues); } if (m_languageModel.hasUnigramsForKey(key)) { - vector globalUnigrams = m_languageModel.unigramsForKey(key); - - for (auto&& unigram : globalUnigrams) { - string value = unigram.keyValue.value; - if (excludedValues.find(value) == excludedValues.end() && - userValues.find(value) == userValues.end()) { - if (m_phraseReplacementEnabled) { - string replacement = m_phraseReplacement.valueForKey(value); - if (replacement != "") { - unigram.keyValue.value = replacement; - } - } - unigrams.push_back(unigram); - } - } + vector rawGlobalUnigrams = m_languageModel.unigramsForKey(key); + allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); } - unigrams.insert(unigrams.begin(), userUnigrams.begin(), userUnigrams.end()); - return unigrams; + allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); + return allUnigrams; } bool McBopomofoLM::hasUnigramsForKey(const string& key) { if (!m_excludedPhrases.hasUnigramsForKey(key)) { - return m_userPhrases.hasUnigramsForKey(key) || - m_languageModel.hasUnigramsForKey(key); + return m_userPhrases.hasUnigramsForKey(key) || m_languageModel.hasUnigramsForKey(key); } return unigramsForKey(key).size() > 0; @@ -159,3 +121,23 @@ bool McBopomofoLM::phraseReplacementEnabled() return m_phraseReplacementEnabled; } +const vector McBopomofoLM::filterAndTransformUnigrams(vector unigrams, const unordered_set& excludedValues, unordered_set& insertedValues) +{ + vector results; + + for (auto&& unigram : unigrams) { + string value = unigram.keyValue.value; + if (m_phraseReplacementEnabled) { + string replacement = m_phraseReplacement.valueForKey(value); + if (replacement != "") { + value = replacement; + unigram.keyValue.value = value; + } + } + if (excludedValues.find(value) == excludedValues.end() && insertedValues.find(value) == insertedValues.end()) { + results.push_back(unigram); + insertedValues.insert(value); + } + } + return results; +} diff --git a/Source/Engine/McBopomofoLM.h b/Source/Engine/McBopomofoLM.h index 00babc01..00dbc360 100644 --- a/Source/Engine/McBopomofoLM.h +++ b/Source/Engine/McBopomofoLM.h @@ -28,6 +28,7 @@ #include "UserPhrasesLM.h" #include "ParselessLM.h" #include "PhraseReplacementMap.h" +#include namespace McBopomofo { @@ -38,9 +39,8 @@ public: McBopomofoLM(); ~McBopomofoLM(); - void loadLanguageModel(const char* languageModelDataPath); - void loadUserPhrases(const char* userPhrasesDataPath, - const char* excludedPhrasesDataPath); + void loadLanguageModel(const char* languageModelPath); + void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath); void loadPhraseReplacementMap(const char* phraseReplacementPath); const vector bigramsForKeys(const string& preceedingKey, const string& key); @@ -51,6 +51,10 @@ public: bool phraseReplacementEnabled(); protected: + const vector filterAndTransformUnigrams(vector unigrams, + const std::unordered_set& excludedValues, + std::unordered_set& insertedValues); + ParselessLM m_languageModel; UserPhrasesLM m_userPhrases; UserPhrasesLM m_excludedPhrases;