Filters duplicated unigram values properly.

2022-01-15 18:23:52 +08:00 · 2022-01-15 18:23:52 +08:00 · b348a05735
parent 8584f5c4b3
commit b348a05735
2 changed files with 40 additions and 54 deletions
--- a/Source/Engine/McBopomofoLM.cpp
+++ b/Source/Engine/McBopomofoLM.cpp
@ -24,7 +24,6 @@
 #include "McBopomofoLM.h"
 #include <algorithm>
 #include <iterator>
 #include <unordered_set>
 using namespace McBopomofo;
@ -61,7 +60,8 @@ void McBopomofoLM::loadUserPhrases(const char* userPhrasesDataPath,
    }
 }
-void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath) {
+void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath)
 {
    if (phraseReplacementPath) {
        m_phraseReplacement.close();
        m_phraseReplacement.open(phraseReplacementPath);
@ -75,75 +75,37 @@ const vector<Bigram> McBopomofoLM::bigramsForKeys(const string& preceedingKey, c
 // Returns the unigrams for `key`, combining user phrases with the language
 // model. Values that m_excludedPhrases holds for this key are collected into
 // `excludedValues`; the user-phrase unigrams and then the language-model
 // unigrams are each passed through filterAndTransformUnigrams() with a shared
 // `insertedValues` set, and the user unigrams are inserted at the front of
 // the returned vector.
 // NOTE(review): this hunk interleaves lines from before and after the change
 // (`unigrams`, `userValues` and `filterredUserUnigrams` are declared only on
 // removed lines); the description above covers the `+` side only.
 const vector<Unigram> McBopomofoLM::unigramsForKey(const string& key)
 {
-    vector<Unigram> unigrams;
+    vector<Unigram> allUnigrams;
    vector<Unigram> userUnigrams;
    // Hash sets are used for the value lookups so that filtering does not
    // need an O(n*m) scan over the candidate and excluded lists.
    unordered_set<string> excludedValues;
-    unordered_set<string> userValues;
+    unordered_set<string> insertedValues;
    if (m_excludedPhrases.hasUnigramsForKey(key)) {
        vector<Unigram> excludedUnigrams = m_excludedPhrases.unigramsForKey(key);
        transform(excludedUnigrams.begin(), excludedUnigrams.end(),
            inserter(excludedValues, excludedValues.end()),
-                  [](const Unigram &u) { return u.keyValue.value; });
+            [](const Unigram& u) { return u.keyValue.value; });
    }
    if (m_userPhrases.hasUnigramsForKey(key)) {
        vector<Unigram> rawUserUnigrams = m_userPhrases.unigramsForKey(key);
-        vector<Unigram> filterredUserUnigrams = m_userPhrases.unigramsForKey(key);
+        userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues);
        for (auto&& unigram : rawUserUnigrams) {
            if (excludedValues.find(unigram.keyValue.value) == excludedValues.end()) {
                filterredUserUnigrams.push_back(unigram);
            }
        }
        transform(filterredUserUnigrams.begin(), filterredUserUnigrams.end(),
                  inserter(userValues, userValues.end()),
                  [](const Unigram &u) { return u.keyValue.value; });
        if (m_phraseReplacementEnabled) {
            for (auto&& unigram : filterredUserUnigrams) {
                string value = unigram.keyValue.value;
                string replacement = m_phraseReplacement.valueForKey(value);
                if (replacement != "") {
                    unigram.keyValue.value = replacement;
                }
                unigrams.push_back(unigram);
            }
        } else {
            unigrams = filterredUserUnigrams;
        }
    }
    if (m_languageModel.hasUnigramsForKey(key)) {
-        vector<Unigram> globalUnigrams = m_languageModel.unigramsForKey(key);
+        vector<Unigram> rawGlobalUnigrams = m_languageModel.unigramsForKey(key);
-
+        allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues);
        for (auto&& unigram : globalUnigrams) {
            string value = unigram.keyValue.value;
            if (excludedValues.find(value) == excludedValues.end() &&
                userValues.find(value) == userValues.end()) {
                if (m_phraseReplacementEnabled) {
                    string replacement = m_phraseReplacement.valueForKey(value);
                    if (replacement != "") {
                        unigram.keyValue.value = replacement;
                    }
                }
                unigrams.push_back(unigram);
            }
        }
    }
    // User unigrams are placed ahead of the language-model unigrams.
-    unigrams.insert(unigrams.begin(), userUnigrams.begin(), userUnigrams.end());
+    allUnigrams.insert(allUnigrams.begin(), userUnigrams.begin(), userUnigrams.end());
-    return unigrams;
+    return allUnigrams;
 }
 bool McBopomofoLM::hasUnigramsForKey(const string& key)
 {
    if (!m_excludedPhrases.hasUnigramsForKey(key)) {
-        return m_userPhrases.hasUnigramsForKey(key) ||
+        return m_userPhrases.hasUnigramsForKey(key) || m_languageModel.hasUnigramsForKey(key);
        m_languageModel.hasUnigramsForKey(key);
    }
    return unigramsForKey(key).size() > 0;
@ -159,3 +121,23 @@ bool McBopomofoLM::phraseReplacementEnabled()
    return m_phraseReplacementEnabled;
 }
 // Produces the subset of `unigrams` that should be offered to the caller.
 //
 // For each unigram, in input order:
 //   1. If phrase replacement is enabled and the replacement map yields a
 //      non-empty string for the unigram's value, the value is rewritten.
 //   2. The (possibly rewritten) value is dropped if it appears in
 //      `excludedValues` or has already been recorded in `insertedValues`.
 //   3. Otherwise the unigram is kept and its value is recorded in
 //      `insertedValues`, so later calls sharing the same set skip it.
 //
 // `unigrams` is taken by value because entries may be rewritten in place.
 const vector<Unigram> McBopomofoLM::filterAndTransformUnigrams(vector<Unigram> unigrams, const unordered_set<string>& excludedValues, unordered_set<string>& insertedValues)
 {
    vector<Unigram> kept;
    for (auto& candidate : unigrams) {
        string finalValue = candidate.keyValue.value;

        // Replacement runs before the checks below, so exclusion and
        // de-duplication are decided on the replaced value.
        if (m_phraseReplacementEnabled) {
            const string substitute = m_phraseReplacement.valueForKey(finalValue);
            if (!substitute.empty()) {
                finalValue = substitute;
                candidate.keyValue.value = finalValue;
            }
        }

        const bool isExcluded = excludedValues.count(finalValue) > 0;
        const bool isDuplicate = insertedValues.count(finalValue) > 0;
        if (isExcluded || isDuplicate) {
            continue;
        }

        kept.push_back(candidate);
        insertedValues.insert(finalValue);
    }
    return kept;
 }
--- a/Source/Engine/McBopomofoLM.h
+++ b/Source/Engine/McBopomofoLM.h
@ -28,6 +28,7 @@
 #include "UserPhrasesLM.h"
 #include "ParselessLM.h"
 #include "PhraseReplacementMap.h"
 #include <unordered_set>
 namespace McBopomofo {
@ -38,9 +39,8 @@ public:
    McBopomofoLM();
    ~McBopomofoLM();
-    void loadLanguageModel(const char* languageModelDataPath);
+    void loadLanguageModel(const char* languageModelPath);
-    void loadUserPhrases(const char* userPhrasesDataPath,
+    void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath);
                         const char* excludedPhrasesDataPath);
    void loadPhraseReplacementMap(const char* phraseReplacementPath);
    const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
@ -51,6 +51,10 @@ public:
    bool phraseReplacementEnabled();
 protected:
    // Filters `unigrams` and returns the ones that pass, in input order.
    // When phrase replacement is enabled, a unigram's value is first replaced
    // by its non-empty entry in the replacement map. A unigram is then kept
    // only if its resulting value is in neither `excludedValues` nor
    // `insertedValues`; each kept value is added to `insertedValues`, which
    // the caller can pass to subsequent calls to suppress duplicates.
    const vector<Unigram> filterAndTransformUnigrams(vector<Unigram> unigrams,
        const std::unordered_set<string>& excludedValues,
        std::unordered_set<string>& insertedValues);
    ParselessLM m_languageModel;
    UserPhrasesLM m_userPhrases;
    UserPhrasesLM m_excludedPhrases;