diff --git a/Source/Engine/LanguageModel/vChewingLM.cpp b/Source/Engine/LanguageModel/vChewingLM.cpp index 15bf43a6..8fd5b82a 100644 --- a/Source/Engine/LanguageModel/vChewingLM.cpp +++ b/Source/Engine/LanguageModel/vChewingLM.cpp @@ -9,7 +9,6 @@ #include "vChewingLM.h" #include #include -#include using namespace vChewing; @@ -46,7 +45,8 @@ void vChewingLM::loadUserPhrases(const char* userPhrasesDataPath, } } -void vChewingLM::loadPhraseReplacementMap(const char* phraseReplacementPath) { +void vChewingLM::loadPhraseReplacementMap(const char* phraseReplacementPath) +{ if (phraseReplacementPath) { m_phraseReplacement.close(); m_phraseReplacement.open(phraseReplacementPath); @@ -60,90 +60,69 @@ const vector vChewingLM::bigramsForKeys(const string& preceedingKey, con const vector vChewingLM::unigramsForKey(const string& key) { - vector unigrams; + vector allUnigrams; vector userUnigrams; - - // Use unordered_set so that you don't have to do O(n*m) + unordered_set excludedValues; - unordered_set userValues; - + unordered_set insertedValues; + if (m_excludedPhrases.hasUnigramsForKey(key)) { vector excludedUnigrams = m_excludedPhrases.unigramsForKey(key); transform(excludedUnigrams.begin(), excludedUnigrams.end(), inserter(excludedValues, excludedValues.end()), - [](const Unigram &u) { return u.keyValue.value; }); + [](const Unigram& u) { return u.keyValue.value; }); } - + if (m_userPhrases.hasUnigramsForKey(key)) { vector rawUserUnigrams = m_userPhrases.unigramsForKey(key); - vector filterredUserUnigrams; - - for (auto&& unigram : rawUserUnigrams) { - if (excludedValues.find(unigram.keyValue.value) == excludedValues.end()) { - filterredUserUnigrams.push_back(unigram); - } - } - - transform(filterredUserUnigrams.begin(), filterredUserUnigrams.end(), - inserter(userValues, userValues.end()), - [](const Unigram &u) { return u.keyValue.value; }); - - if (m_phraseReplacementEnabled) { - for (auto&& unigram : filterredUserUnigrams) { - string value = unigram.keyValue.value; - string replacement = m_phraseReplacement.valueForKey(value); - if (replacement != "") { - unigram.keyValue.value = replacement; - } - unigrams.push_back(unigram); - } - } else { - unigrams = filterredUserUnigrams; - } + userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues); } if (m_languageModel.hasUnigramsForKey(key)) { - vector globalUnigrams = m_languageModel.unigramsForKey(key); - - for (auto&& unigram : globalUnigrams) { - string value = unigram.keyValue.value; - if (excludedValues.find(value) == excludedValues.end() && - userValues.find(value) == userValues.end()) { - if (m_phraseReplacementEnabled) { - string replacement = m_phraseReplacement.valueForKey(value); - if (replacement != "") { - unigram.keyValue.value = replacement; - } - } - unigrams.push_back(unigram); - } - } + vector rawGlobalUnigrams = m_languageModel.unigramsForKey(key); + allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); } - - unigrams.insert(unigrams.begin(), userUnigrams.begin(), userUnigrams.end()); - return unigrams; + + allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end()); + return allUnigrams; } bool vChewingLM::hasUnigramsForKey(const string& key) { - if (key == " ") { - return true; + if (!m_excludedPhrases.hasUnigramsForKey(key)) { + return m_userPhrases.hasUnigramsForKey(key) || m_languageModel.hasUnigramsForKey(key); } - if (!m_excludedPhrases.hasUnigramsForKey(key)) { - return m_userPhrases.hasUnigramsForKey(key) || - m_languageModel.hasUnigramsForKey(key); - } - return unigramsForKey(key).size() > 0; } - + void vChewingLM::setPhraseReplacementEnabled(bool enabled) { - m_phraseReplacementEnabled = enabled; + m_phraseReplacementEnabled = enabled; } - + bool vChewingLM::phraseReplacementEnabled() { return m_phraseReplacementEnabled; } + +const vector vChewingLM::filterAndTransformUnigrams(vector unigrams, const unordered_set& excludedValues, unordered_set& insertedValues) +{ + vector results; + + for (auto&& unigram : unigrams) { + string value = unigram.keyValue.value; + if (m_phraseReplacementEnabled) { + string replacement = m_phraseReplacement.valueForKey(value); + if (replacement != "") { + value = replacement; + unigram.keyValue.value = value; + } + } + if (excludedValues.find(value) == excludedValues.end() && insertedValues.find(value) == insertedValues.end()) { + results.push_back(unigram); + insertedValues.insert(value); + } + } + return results; +} diff --git a/Source/Engine/LanguageModel/vChewingLM.h b/Source/Engine/LanguageModel/vChewingLM.h index 95bf0d09..ce339db5 100644 --- a/Source/Engine/LanguageModel/vChewingLM.h +++ b/Source/Engine/LanguageModel/vChewingLM.h @@ -10,9 +10,10 @@ #define VCHEWINGLM_H #include -#include "FastLM.h" #include "UserPhrasesLM.h" +#include "FastLM.h" #include "PhraseReplacementMap.h" +#include namespace vChewing { @@ -22,20 +23,23 @@ class vChewingLM : public LanguageModel { public: vChewingLM(); ~vChewingLM(); - - void loadLanguageModel(const char* languageModelDataPath); - void loadUserPhrases(const char* userPhrasesDataPath, - const char* excludedPhrasesDataPath); + + void loadLanguageModel(const char* languageModelPath); + void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath); void loadPhraseReplacementMap(const char* phraseReplacementPath); - + const vector bigramsForKeys(const string& preceedingKey, const string& key); const vector unigramsForKey(const string& key); bool hasUnigramsForKey(const string& key); - + void setPhraseReplacementEnabled(bool enabled); bool phraseReplacementEnabled(); - + protected: + const vector filterAndTransformUnigrams(vector unigrams, + const std::unordered_set& excludedValues, + std::unordered_set& insertedValues); + FastLM m_languageModel; UserPhrasesLM m_userPhrases; UserPhrasesLM m_excludedPhrases;