LMInstantiator // Simplifying result-generation process.
This commit is contained in:
parent
cb8bb2a7bb
commit
ca970e46c9
|
@ -45,13 +45,12 @@ extension vChewing {
|
||||||
/// 1) Get the original unigrams.
|
/// 1) Get the original unigrams.
|
||||||
/// 2) Drop the unigrams whose value is contained in the exclusion map.
|
/// 2) Drop the unigrams whose value is contained in the exclusion map.
|
||||||
/// 3) Replace the values of the unigrams using the phrase replacement map.
|
/// 3) Replace the values of the unigrams using the phrase replacement map.
|
||||||
/// 4) Replace the values of the unigrams using an external converter lambda.
|
/// 4) Drop the duplicated phrases from the generated unigram array.
|
||||||
/// 5) Drop the duplicated phrases.
|
|
||||||
///
|
///
|
||||||
/// The controller can ask the model to load the primary input method language
|
/// The controller can ask the model to load the primary input method language
|
||||||
/// model while launching and to load the user phrases anytime if the custom
|
/// model while launching and to load the user phrases anytime if the custom
|
||||||
/// files are modified. It does not keep the reference of the data pathes but
|
/// files are modified. It does not keep the reference of the data pathes but
|
||||||
/// you have to pass the paths when you ask it to do loading.
|
/// you have to pass the paths when you ask it to load.
|
||||||
public class LMInstantiator: Megrez.LanguageModel {
|
public class LMInstantiator: Megrez.LanguageModel {
|
||||||
// 在函數內部用以記錄狀態的開關。
|
// 在函數內部用以記錄狀態的開關。
|
||||||
public var isPhraseReplacementEnabled = false
|
public var isPhraseReplacementEnabled = false
|
||||||
|
@ -63,13 +62,13 @@ extension vChewing {
|
||||||
/// 不 Reverse 的話,第一欄是漢字,第二欄是對應的注音,第三欄是可能的權重。
|
/// 不 Reverse 的話,第一欄是漢字,第二欄是對應的注音,第三欄是可能的權重。
|
||||||
let lmCore = LMCore(reverse: false, consolidate: false, defaultScore: -9.5, forceDefaultScore: false)
|
let lmCore = LMCore(reverse: false, consolidate: false, defaultScore: -9.5, forceDefaultScore: false)
|
||||||
let lmMisc = LMCore(reverse: true, consolidate: false, defaultScore: -1, forceDefaultScore: false)
|
let lmMisc = LMCore(reverse: true, consolidate: false, defaultScore: -1, forceDefaultScore: false)
|
||||||
let lmSymbols = LMLite(defaultScore: -13.0, consolidate: true)
|
let lmSymbols = LMLite(consolidate: true)
|
||||||
let lmCNS = LMLite(defaultScore: -11.0, consolidate: true)
|
let lmCNS = LMLite(consolidate: true)
|
||||||
|
|
||||||
// 聲明使用者語言模組
|
// 聲明使用者語言模組
|
||||||
let lmUserPhrases = LMLite(defaultScore: 0.0, consolidate: true)
|
let lmUserPhrases = LMLite(consolidate: true)
|
||||||
let lmFiltered = LMLite(defaultScore: 0.0, consolidate: true)
|
let lmFiltered = LMLite(consolidate: true)
|
||||||
let lmUserSymbols = LMLite(defaultScore: -12.0, consolidate: true)
|
let lmUserSymbols = LMLite(consolidate: true)
|
||||||
let lmReplacements = LMReplacments()
|
let lmReplacements = LMReplacments()
|
||||||
let lmAssociates = LMAssociates()
|
let lmAssociates = LMAssociates()
|
||||||
|
|
||||||
|
@ -173,87 +172,47 @@ extension vChewing {
|
||||||
return [spaceUnigram]
|
return [spaceUnigram]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 準備不同的語言模組容器。
|
/// 準備不同的語言模組容器,開始逐漸往容器陣列內塞入資料。
|
||||||
var coreUnigrams: [Megrez.Unigram] = []
|
var rawAllUnigrams: [Megrez.Unigram] = []
|
||||||
var miscUnigrams: [Megrez.Unigram] = []
|
|
||||||
var symbolUnigrams: [Megrez.Unigram] = []
|
|
||||||
var userUnigrams: [Megrez.Unigram] = []
|
|
||||||
var userSymbolUnigrams: [Megrez.Unigram] = []
|
|
||||||
var cnsUnigrams: [Megrez.Unigram] = []
|
|
||||||
|
|
||||||
var insertedPairs: Set<Megrez.KeyValuePair> = [] // 具體用途有待商榷
|
// 用 reversed 指令讓使用者語彙檔案內的詞條優先順序隨著行數增加而逐漸增高。
|
||||||
var filteredPairs: Set<Megrez.KeyValuePair> = []
|
// 這樣一來就可以在就地新增語彙時徹底複寫優先權。
|
||||||
|
// 將兩句差分也是為了讓 rawUserUnigrams 的類型不受可能的影響。
|
||||||
|
rawAllUnigrams += lmUserPhrases.unigramsFor(key: key, score: 0.0).reversed()
|
||||||
|
|
||||||
// 開始逐漸往容器陣列內塞入資料
|
// LMMisc 與 LMCore 的 score 在 (-10.0, 0.0) 這個區間內。
|
||||||
let filteredUnigrams: [Megrez.Unigram] =
|
rawAllUnigrams += lmMisc.unigramsFor(key: key)
|
||||||
lmFiltered.hasUnigramsFor(key: key) ? lmFiltered.unigramsFor(key: key) : []
|
rawAllUnigrams += lmCore.unigramsFor(key: key)
|
||||||
for unigram in filteredUnigrams {
|
|
||||||
filteredPairs.insert(unigram.keyValue)
|
|
||||||
}
|
|
||||||
|
|
||||||
if lmUserPhrases.hasUnigramsFor(key: key) {
|
if isCNSEnabled {
|
||||||
var rawUserUnigrams: [Megrez.Unigram] = []
|
rawAllUnigrams += lmCNS.unigramsFor(key: key, score: -11)
|
||||||
// 用 reversed 指令讓使用者語彙檔案內的詞條優先順序隨著行數增加而逐漸增高。
|
|
||||||
// 這樣一來就可以在就地新增語彙時徹底複寫優先權。
|
|
||||||
// 將兩句差分也是為了讓 rawUserUnigrams 的類型不受可能的影響。
|
|
||||||
rawUserUnigrams.append(contentsOf: lmUserPhrases.unigramsFor(key: key).reversed())
|
|
||||||
userUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawUserUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if lmUserPhrases.hasUnigramsFor(key: key) {
|
|
||||||
let rawUserUnigrams: [Megrez.Unigram] = lmUserPhrases.unigramsFor(key: key)
|
|
||||||
userUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawUserUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if lmMisc.hasUnigramsFor(key: key) {
|
|
||||||
let rawMiscUnigrams: [Megrez.Unigram] = lmMisc.unigramsFor(key: key)
|
|
||||||
miscUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawMiscUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if lmCore.hasUnigramsFor(key: key) {
|
|
||||||
let rawCoreUnigrams: [Megrez.Unigram] = lmCore.unigramsFor(key: key)
|
|
||||||
coreUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawCoreUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if isSymbolEnabled {
|
if isSymbolEnabled {
|
||||||
if lmUserSymbols.hasUnigramsFor(key: key) {
|
rawAllUnigrams += lmUserSymbols.unigramsFor(key: key, score: -12.0)
|
||||||
let rawUserSymbolUnigrams: [Megrez.Unigram] = lmUserSymbols.unigramsFor(key: key)
|
if lmUserSymbols.unigramsFor(key: key).isEmpty {
|
||||||
userSymbolUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawUserSymbolUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
IME.prtDebugIntel("Not found in UserSymbolUnigram: \(key)")
|
IME.prtDebugIntel("Not found in UserSymbolUnigram: \(key)")
|
||||||
}
|
}
|
||||||
|
|
||||||
if lmSymbols.hasUnigramsFor(key: key) {
|
rawAllUnigrams += lmSymbols.unigramsFor(key: key, score: -11.0)
|
||||||
let rawSymbolUnigrams: [Megrez.Unigram] = lmSymbols.unigramsFor(key: key)
|
if lmSymbols.unigramsFor(key: key).isEmpty {
|
||||||
symbolUnigrams = filterAndTransform(
|
|
||||||
unigrams: rawSymbolUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
IME.prtDebugIntel("Not found in UserUnigram: \(key)")
|
IME.prtDebugIntel("Not found in UserUnigram: \(key)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if lmCNS.hasUnigramsFor(key: key), isCNSEnabled {
|
// 準備過濾清單與統計清單
|
||||||
let rawCNSUnigrams: [Megrez.Unigram] = lmCNS.unigramsFor(key: key)
|
var insertedPairs: Set<Megrez.KeyValuePair> = [] // 統計清單
|
||||||
cnsUnigrams = filterAndTransform(
|
var filteredPairs: Set<Megrez.KeyValuePair> = [] // 過濾清單
|
||||||
unigrams: rawCNSUnigrams, filter: filteredPairs, inserted: &insertedPairs
|
|
||||||
)
|
// 載入要過濾的 KeyValuePair 清單。
|
||||||
|
for unigram in lmFiltered.unigramsFor(key: key) {
|
||||||
|
filteredPairs.insert(unigram.keyValue)
|
||||||
}
|
}
|
||||||
|
|
||||||
let allUnigrams: [Megrez.Unigram] =
|
return filterAndTransform(
|
||||||
userUnigrams + miscUnigrams + coreUnigrams + cnsUnigrams + userSymbolUnigrams + symbolUnigrams
|
unigrams: rawAllUnigrams,
|
||||||
|
filter: filteredPairs, inserted: &insertedPairs
|
||||||
return allUnigrams
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If the model has unigrams for the given key.
|
/// If the model has unigrams for the given key.
|
||||||
|
@ -286,16 +245,15 @@ extension vChewing {
|
||||||
var results: [Megrez.Unigram] = []
|
var results: [Megrez.Unigram] = []
|
||||||
|
|
||||||
for unigram in unigrams {
|
for unigram in unigrams {
|
||||||
let pairToDealWith: Megrez.KeyValuePair = unigram.keyValue
|
var pair: Megrez.KeyValuePair = unigram.keyValue
|
||||||
if filteredPairs.contains(pairToDealWith) {
|
if filteredPairs.contains(pair) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
var pair: Megrez.KeyValuePair = pairToDealWith
|
|
||||||
if isPhraseReplacementEnabled {
|
if isPhraseReplacementEnabled {
|
||||||
let replacement = lmReplacements.valuesFor(key: pair.key)
|
let replacement = lmReplacements.valuesFor(key: pair.value)
|
||||||
if !replacement.isEmpty {
|
if !replacement.isEmpty, pair.value.count == replacement.count {
|
||||||
IME.prtDebugIntel(replacement)
|
IME.prtDebugIntel("\(pair.value) -> \(replacement)")
|
||||||
pair.value = replacement
|
pair.value = replacement
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue