LMInstantiator // Dealing with Namespace Pollution.

This commit is contained in:
ShikiSuen 2022-02-20 22:18:25 +08:00
parent 878b5270a0
commit 26e79fe2d4
2 changed files with 47 additions and 45 deletions

View File

@ -20,12 +20,12 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR TH
#ifndef LMInstantiator_H #ifndef LMInstantiator_H
#define LMInstantiator_H #define LMInstantiator_H
#include <stdio.h>
#include "UserPhrasesLM.h"
#include "ParselessLM.h"
#include "CNSLM.h"
#include "PhraseReplacementMap.h"
#include "AssociatedPhrases.h" #include "AssociatedPhrases.h"
#include "CNSLM.h"
#include "ParselessLM.h"
#include "PhraseReplacementMap.h"
#include "UserPhrasesLM.h"
#include <stdio.h>
#include <unordered_set> #include <unordered_set>
namespace vChewing { namespace vChewing {
@ -54,7 +54,7 @@ using namespace Taiyan::Gramambular;
/// model while launching and to load the user phrases anytime if the custom /// model while launching and to load the user phrases anytime if the custom
/// files are modified. It does not keep the reference of the data paths but /// files are modified. It does not keep the reference of the data paths but
/// you have to pass the paths when you ask it to do loading. /// you have to pass the paths when you ask it to do loading.
class LMInstantiator : public LanguageModel { class LMInstantiator : public Taiyan::Gramambular::LanguageModel {
public: public:
LMInstantiator(); LMInstantiator();
~LMInstantiator(); ~LMInstantiator();
@ -83,14 +83,14 @@ public:
void loadPhraseReplacementMap(const char* phraseReplacementPath); void loadPhraseReplacementMap(const char* phraseReplacementPath);
/// Not implemented since we do not have data to provide bigram function. /// Not implemented since we do not have data to provide bigram function.
const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key); const std::vector<Taiyan::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
/// Returns a list of available unigram for the given key. /// Returns a list of available unigram for the given key.
/// @param key A string represents the BPMF reading or a symbol key. For /// @param key A std::string represents the BPMF reading or a symbol key. For
/// example, if you pass "ㄇㄚ", it returns "嗎", "媽", and so on. /// example, if you pass "ㄇㄚ", it returns "嗎", "媽", and so on.
const vector<Unigram> unigramsForKey(const string& key); const std::vector<Taiyan::Gramambular::Unigram> unigramsForKey(const std::string& key);
/// If the model has unigrams for the given key. /// If the model has unigrams for the given key.
/// @param key The key. /// @param key The key.
bool hasUnigramsForKey(const string& key); bool hasUnigramsForKey(const std::string& key);
/// Enables or disables phrase replacement. /// Enables or disables phrase replacement.
void setPhraseReplacementEnabled(bool enabled); void setPhraseReplacementEnabled(bool enabled);
@ -107,10 +107,10 @@ public:
/// If the external converter is enabled or not. /// If the external converter is enabled or not.
bool externalConverterEnabled(); bool externalConverterEnabled();
/// Sets a lambda that is used to convert the values of unigrams. /// Sets a lambda that is used to convert the values of unigrams.
void setExternalConverter(std::function<string(string)> externalConverter); void setExternalConverter(std::function<std::string(std::string)> externalConverter);
const vector<std::string> associatedPhrasesForKey(const string& key); const std::vector<std::string> associatedPhrasesForKey(const std::string& key);
bool hasAssociatedPhrasesForKey(const string& key); bool hasAssociatedPhrasesForKey(const std::string& key);
protected: protected:
@ -121,9 +121,9 @@ protected:
/// @param insertedValues The values for unigrams already in the results. /// @param insertedValues The values for unigrams already in the results.
/// It helps to prevent duplicated unigrams. Please note that the method /// It helps to prevent duplicated unigrams. Please note that the method
/// has a side effect that it inserts values to `insertedValues`. /// has a side effect that it inserts values to `insertedValues`.
const vector<Unigram> filterAndTransformUnigrams(const vector<Unigram> unigrams, const std::vector<Taiyan::Gramambular::Unigram> filterAndTransformUnigrams(const std::vector<Taiyan::Gramambular::Unigram> unigrams,
const std::unordered_set<string>& excludedValues, const std::unordered_set<std::string>& excludedValues,
std::unordered_set<string>& insertedValues); std::unordered_set<std::string>& insertedValues);
ParselessLM m_languageModel; ParselessLM m_languageModel;
CNSLM m_cnsModel; CNSLM m_cnsModel;
@ -134,7 +134,7 @@ protected:
bool m_phraseReplacementEnabled; bool m_phraseReplacementEnabled;
bool m_cnsEnabled; bool m_cnsEnabled;
bool m_externalConverterEnabled; bool m_externalConverterEnabled;
std::function<string(string)> m_externalConverter; std::function<std::string(std::string)> m_externalConverter;
}; };
}; };

View File

@ -21,7 +21,7 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR TH
#include <algorithm> #include <algorithm>
#include <iterator> #include <iterator>
using namespace vChewing; namespace vChewing {
LMInstantiator::LMInstantiator() LMInstantiator::LMInstantiator()
{ {
@ -92,49 +92,49 @@ void LMInstantiator::loadPhraseReplacementMap(const char* phraseReplacementPath)
} }
} }
const vector<Bigram> LMInstantiator::bigramsForKeys(const string& preceedingKey, const string& key) const std::vector<Taiyan::Gramambular::Bigram> LMInstantiator::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
{ {
return vector<Bigram>(); return std::vector<Taiyan::Gramambular::Bigram>();
} }
const vector<Unigram> LMInstantiator::unigramsForKey(const string& key) const std::vector<Taiyan::Gramambular::Unigram> LMInstantiator::unigramsForKey(const std::string& key)
{ {
if (key == " ") { if (key == " ") {
vector<Unigram> spaceUnigrams; std::vector<Taiyan::Gramambular::Unigram> spaceUnigrams;
Unigram g; Taiyan::Gramambular::Unigram g;
g.keyValue.key = " "; g.keyValue.key = " ";
g.keyValue.value= " "; g.keyValue.value = " ";
g.score = 0; g.score = 0;
spaceUnigrams.push_back(g); spaceUnigrams.push_back(g);
return spaceUnigrams; return spaceUnigrams;
} }
vector<Unigram> allUnigrams; std::vector<Taiyan::Gramambular::Unigram> allUnigrams;
vector<Unigram> userUnigrams; std::vector<Taiyan::Gramambular::Unigram> userUnigrams;
vector<Unigram> cnsUnigrams; std::vector<Taiyan::Gramambular::Unigram> cnsUnigrams;
unordered_set<string> excludedValues; std::unordered_set<std::string> excludedValues;
unordered_set<string> insertedValues; std::unordered_set<std::string> insertedValues;
if (m_excludedPhrases.hasUnigramsForKey(key)) { if (m_excludedPhrases.hasUnigramsForKey(key)) {
vector<Unigram> excludedUnigrams = m_excludedPhrases.unigramsForKey(key); std::vector<Taiyan::Gramambular::Unigram> excludedUnigrams = m_excludedPhrases.unigramsForKey(key);
transform(excludedUnigrams.begin(), excludedUnigrams.end(), transform(excludedUnigrams.begin(), excludedUnigrams.end(),
inserter(excludedValues, excludedValues.end()), inserter(excludedValues, excludedValues.end()),
[](const Unigram& u) { return u.keyValue.value; }); [](const Taiyan::Gramambular::Unigram& u) { return u.keyValue.value; });
} }
if (m_userPhrases.hasUnigramsForKey(key)) { if (m_userPhrases.hasUnigramsForKey(key)) {
vector<Unigram> rawUserUnigrams = m_userPhrases.unigramsForKey(key); std::vector<Taiyan::Gramambular::Unigram> rawUserUnigrams = m_userPhrases.unigramsForKey(key);
userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues); userUnigrams = filterAndTransformUnigrams(rawUserUnigrams, excludedValues, insertedValues);
} }
if (m_languageModel.hasUnigramsForKey(key)) { if (m_languageModel.hasUnigramsForKey(key)) {
vector<Unigram> rawGlobalUnigrams = m_languageModel.unigramsForKey(key); std::vector<Taiyan::Gramambular::Unigram> rawGlobalUnigrams = m_languageModel.unigramsForKey(key);
allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues); allUnigrams = filterAndTransformUnigrams(rawGlobalUnigrams, excludedValues, insertedValues);
} }
if (m_cnsModel.hasUnigramsForKey(key) && m_cnsEnabled) { if (m_cnsModel.hasUnigramsForKey(key) && m_cnsEnabled) {
vector<Unigram> rawCNSUnigrams = m_cnsModel.unigramsForKey(key); std::vector<Taiyan::Gramambular::Unigram> rawCNSUnigrams = m_cnsModel.unigramsForKey(key);
cnsUnigrams = filterAndTransformUnigrams(rawCNSUnigrams, excludedValues, insertedValues); cnsUnigrams = filterAndTransformUnigrams(rawCNSUnigrams, excludedValues, insertedValues);
} }
@ -143,7 +143,7 @@ const vector<Unigram> LMInstantiator::unigramsForKey(const string& key)
return allUnigrams; return allUnigrams;
} }
bool LMInstantiator::hasUnigramsForKey(const string& key) bool LMInstantiator::hasUnigramsForKey(const std::string& key)
{ {
if (key == " ") { if (key == " ") {
return true; return true;
@ -185,36 +185,36 @@ bool LMInstantiator::externalConverterEnabled()
return m_externalConverterEnabled; return m_externalConverterEnabled;
} }
void LMInstantiator::setExternalConverter(std::function<string(string)> externalConverter) void LMInstantiator::setExternalConverter(std::function<std::string(std::string)> externalConverter)
{ {
m_externalConverter = externalConverter; m_externalConverter = externalConverter;
} }
const vector<Unigram> LMInstantiator::filterAndTransformUnigrams(const vector<Unigram> unigrams, const unordered_set<string>& excludedValues, unordered_set<string>& insertedValues) const std::vector<Taiyan::Gramambular::Unigram> LMInstantiator::filterAndTransformUnigrams(const std::vector<Taiyan::Gramambular::Unigram> unigrams, const std::unordered_set<std::string>& excludedValues, std::unordered_set<std::string>& insertedValues)
{ {
vector<Unigram> results; std::vector<Taiyan::Gramambular::Unigram> results;
for (auto&& unigram : unigrams) { for (auto&& unigram : unigrams) {
// excludedValues filters out the unigrams with the original value. // excludedValues filters out the unigrams with the original value.
// insertedValues filters out the ones with the converted value // insertedValues filters out the ones with the converted value
string originalValue = unigram.keyValue.value; std::string originalValue = unigram.keyValue.value;
if (excludedValues.find(originalValue) != excludedValues.end()) { if (excludedValues.find(originalValue) != excludedValues.end()) {
continue; continue;
} }
string value = originalValue; std::string value = originalValue;
if (m_phraseReplacementEnabled) { if (m_phraseReplacementEnabled) {
string replacement = m_phraseReplacement.valueForKey(value); std::string replacement = m_phraseReplacement.valueForKey(value);
if (replacement != "") { if (replacement != "") {
value = replacement; value = replacement;
} }
} }
if (m_externalConverterEnabled && m_externalConverter) { if (m_externalConverterEnabled && m_externalConverter) {
string replacement = m_externalConverter(value); std::string replacement = m_externalConverter(value);
value = replacement; value = replacement;
} }
if (insertedValues.find(value) == insertedValues.end()) { if (insertedValues.find(value) == insertedValues.end()) {
Unigram g; Taiyan::Gramambular::Unigram g;
g.keyValue.value = value; g.keyValue.value = value;
g.keyValue.key = unigram.keyValue.key; g.keyValue.key = unigram.keyValue.key;
g.score = unigram.score; g.score = unigram.score;
@ -225,12 +225,14 @@ const vector<Unigram> LMInstantiator::filterAndTransformUnigrams(const vector<Un
return results; return results;
} }
const vector<std::string> LMInstantiator::associatedPhrasesForKey(const string& key) const std::vector<std::string> LMInstantiator::associatedPhrasesForKey(const std::string& key)
{ {
return m_associatedPhrases.valuesForKey(key); return m_associatedPhrases.valuesForKey(key);
} }
bool LMInstantiator::hasAssociatedPhrasesForKey(const string& key) bool LMInstantiator::hasAssociatedPhrasesForKey(const std::string& key)
{ {
return m_associatedPhrases.hasValuesForKey(key); return m_associatedPhrases.hasValuesForKey(key);
} }
} // namespace vChewing