LM // Swiftify: LMInstantiator.

2022-04-26 09:45:54 +08:00 · 2022-04-26 09:45:54 +08:00 · 887907fb11
parent 1b4b4149a0
commit 887907fb11
1 changed files with 311 additions and 0 deletions
--- a/Source/Modules/LangModelRelated/LMInstantiator.swift
+++ b/Source/Modules/LangModelRelated/LMInstantiator.swift
@ -0,0 +1,311 @@
 // Copyright (c) 2021 and onwards The vChewing Project (MIT-NTL License).
 // Refactored from the ObjCpp-version of this class by:
 // (c) 2011 and onwards The OpenVanilla Project (MIT License).
 /*
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 the Software, and to permit persons to whom the Software is furnished to do so,
 subject to the following conditions:
 1. The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 2. No trademark license is granted to use the trade names, trademarks, service
 marks, or product names of Contributor, except as required to fulfill notice
 requirements above.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 // NOTE: We still keep some of the comments left by Zonble,
 // even though he is not in charge of this Swift module.
 import Foundation
 extension vChewing {
 	/// LMInstantiator is a facade for managing a set of models, including
 	/// the input method language model, user phrases and excluded phrases.
 	///
 	/// It is the primary model class that the input controller and the grammar
 	/// builder of vChewing talk to. When the grammar builder starts to build a
 	/// sentence from a series of BPMF readings, it passes the readings to this
 	/// model to see whether there are valid unigrams, and uses the returned
 	/// unigrams to produce the final results.
 	///
 	/// LMInstantiator combines and transforms the unigrams from the primary
 	/// language model and the user phrases. The process is:
 	///
 	/// 1) Get the original unigrams.
 	/// 2) Drop the unigrams whose key-value pair is contained in the exclusion map.
 	/// 3) Replace the values of the unigrams using the phrase replacement map.
 	/// 4) Replace the values of the unigrams using an external converter lambda
 	///    (described by the ObjCpp-era documentation; no such step is present
 	///    in `filterAndTransform` below).
 	/// 5) Drop the duplicated phrases.
 	///
 	/// The controller can ask the model to load the primary input method language
 	/// model while launching, and to load the user phrases at any time if the
 	/// custom files are modified. The model does not keep a reference to the
 	/// data paths, so you have to pass the paths whenever you ask it to load.
 	public class LMInstantiator: Megrez.LanguageModel {
 		// Switches that record state; they are consulted by the functions below.
 		public var isPhraseReplacementEnabled = false
 		public var isCNSEnabled = false
 		public var isSymbolEnabled = false
 		// Factory (built-in) language models.
 		/// With `reverse: true`, column 1 is the BPMF reading, column 2 is the
 		/// corresponding Han characters, and column 3 is the optional weight.
 		/// With `reverse: false`, column 1 is the Han characters, column 2 is the
 		/// corresponding BPMF reading, and column 3 is the optional weight.
 		let lmCore = LMCore(reverse: false, consolidate: false, defaultScore: -9.5, forceDefaultScore: false)
 		let lmMisc = LMCore(reverse: true, consolidate: false, defaultScore: -1, forceDefaultScore: false)
 		let lmSymbols = LMLite(defaultScore: -13.0, consolidate: true)
 		let lmCNS = LMLite(defaultScore: -11.0, consolidate: true)
 		// User language models.
 		let lmUserPhrases = LMLite(defaultScore: 0.0, consolidate: true)
 		let lmFiltered = LMLite(defaultScore: 0.0, consolidate: true)
 		let lmUserSymbols = LMLite(defaultScore: -12.0, consolidate: true)
 		let lmReplacements = LMReplacments()
 		let lmAssociates = LMAssociates()
 		// The (empty) initializer is kept for now.
 		override init() {}
 		// Close every language model before this instance goes away.
 		deinit {
 			lmCore.close()
 			lmMisc.close()
 			lmSymbols.close()
 			lmCNS.close()
 			lmUserPhrases.close()
 			lmFiltered.close()
 			lmUserSymbols.close()
 			lmReplacements.close()
 			lmAssociates.close()
 		}
 		// MARK: - Loaders
 		// The names of the following functions are kept unchanged for now; the
 		// original note says they will be adjusted once the project's ongoing
 		// "deicide" campaign is completely finished.
 		// Every loader silently does nothing when the file is not readable, so
 		// that a previously loaded model stays in place.
 		/// Whether the core (primary) language model reports itself as loaded.
 		public func isDataModelLoaded() -> Bool { lmCore.isLoaded() }
 		/// Reloads the core (primary) language model from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadLanguageModel(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmCore.close()
 			lmCore.open(path)
 		}
 		/// Whether the CNS data model reports itself as loaded.
 		public func isCNSDataLoaded() -> Bool { lmCNS.isLoaded() }
 		/// Reloads the CNS data model from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadCNSData(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmCNS.close()
 			lmCNS.open(path)
 		}
 		/// Whether the miscellaneous data model reports itself as loaded.
 		public func isMiscDataLoaded() -> Bool { lmMisc.isLoaded() }
 		/// Reloads the miscellaneous data model from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadMiscData(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmMisc.close()
 			lmMisc.open(path)
 		}
 		/// Whether the built-in symbol data model reports itself as loaded.
 		public func isSymbolDataLoaded() -> Bool { lmSymbols.isLoaded() }
 		/// Reloads the built-in symbol data model from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadSymbolData(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmSymbols.close()
 			lmSymbols.open(path)
 		}
 		/// Reloads the user phrases and the excluded (filtered) phrases.
 		///
 		/// The two files are handled independently: an unreadable file only
 		/// skips the reload of its own model.
 		/// - Parameters:
 		///   - path: The path of the user phrase file.
 		///   - filterPath: The path of the excluded phrase file.
 		public func loadUserPhrases(path: String, filterPath: String) {
 			if FileManager.default.isReadableFile(atPath: path) {
 				lmUserPhrases.close()
 				lmUserPhrases.open(path)
 			}
 			if FileManager.default.isReadableFile(atPath: filterPath) {
 				lmFiltered.close()
 				lmFiltered.open(filterPath)
 			}
 		}
 		/// Reloads the user symbol data model from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadUserSymbolData(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmUserSymbols.close()
 			lmUserSymbols.open(path)
 		}
 		/// Reloads the user associated-phrase data from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadUserAssociatedPhrases(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmAssociates.close()
 			lmAssociates.open(path)
 		}
 		/// Reloads the phrase replacement map from `path`.
 		/// - Parameter path: The path of the data file to open.
 		public func loadPhraseReplacementMap(path: String) {
 			guard FileManager.default.isReadableFile(atPath: path) else { return }
 			lmReplacements.close()
 			lmReplacements.open(path)
 		}
 		// MARK: - Core Functions (Public)
 		// Bigrams are not implemented since we do not have data to provide them.
 		// public func bigramsForKeys(preceedingKey: String, key: String) -> [Megrez.Bigram] { }
 		/// Returns the list of available unigrams for the given key.
 		///
 		/// For instance, if you pass "ㄉㄨㄟˇ", it returns "㨃" and other possible
 		/// candidates. The result is ordered as: user phrases, misc data, core
 		/// data, CNS data, user symbols, built-in symbols. Pairs listed in the
 		/// excluded-phrase model are dropped, and a pair that has already been
 		/// emitted by an earlier model is not emitted again.
 		/// - Parameter key: The BPMF reading or a symbol key.
 		/// - Returns: The combined unigrams; a single space unigram for `" "`.
 		override open func unigramsFor(key: String) -> [Megrez.Unigram] {
 			if key == " " {
 				// Assign a fixed output value to the space key.
 				let spacePair = Megrez.KeyValuePair(key: " ", value: " ")
 				return [Megrez.Unigram(keyValue: spacePair, score: 0)]
 			}
 			// Collect the key-value pairs that the user asked to exclude.
 			let filteredUnigrams: [Megrez.Unigram] =
 				lmFiltered.hasUnigramsFor(key: key) ? lmFiltered.unigramsFor(key: key) : []
 			let filteredPairs = Set<Megrez.KeyValuePair>(filteredUnigrams.map { $0.keyValue })
 			// Pairs emitted so far. This set is shared by every filterAndTransform
 			// call below, so the call order decides which model wins a duplicate.
 			var insertedPairs: Set<Megrez.KeyValuePair> = []
 			// Containers for each language model.
 			var userUnigrams: [Megrez.Unigram] = []
 			var miscUnigrams: [Megrez.Unigram] = []
 			var coreUnigrams: [Megrez.Unigram] = []
 			var userSymbolUnigrams: [Megrez.Unigram] = []
 			var symbolUnigrams: [Megrez.Unigram] = []
 			var cnsUnigrams: [Megrez.Unigram] = []
 			if lmUserPhrases.hasUnigramsFor(key: key) {
 				// reversed() makes entries of the user phrase file gain priority as
 				// their line number grows, so a phrase appended in place takes
 				// precedence over the earlier ones.
 				// This must run exactly once: a second pass over the same unigrams
 				// would find every pair already in insertedPairs and yield nothing.
 				let rawUserUnigrams: [Megrez.Unigram] = lmUserPhrases.unigramsFor(key: key).reversed()
 				userUnigrams = filterAndTransform(
 					unigrams: rawUserUnigrams, filter: filteredPairs, inserted: &insertedPairs
 				)
 			}
 			if lmMisc.hasUnigramsFor(key: key) {
 				miscUnigrams = filterAndTransform(
 					unigrams: lmMisc.unigramsFor(key: key), filter: filteredPairs, inserted: &insertedPairs
 				)
 			}
 			if lmCore.hasUnigramsFor(key: key) {
 				coreUnigrams = filterAndTransform(
 					unigrams: lmCore.unigramsFor(key: key), filter: filteredPairs, inserted: &insertedPairs
 				)
 			}
 			if isSymbolEnabled {
 				if lmUserSymbols.hasUnigramsFor(key: key) {
 					userSymbolUnigrams = filterAndTransform(
 						unigrams: lmUserSymbols.unigramsFor(key: key), filter: filteredPairs, inserted: &insertedPairs
 					)
 				} else {
 					IME.prtDebugIntel("Not found in UserSymbolUnigram: \(key)")
 				}
 				if lmSymbols.hasUnigramsFor(key: key) {
 					symbolUnigrams = filterAndTransform(
 						unigrams: lmSymbols.unigramsFor(key: key), filter: filteredPairs, inserted: &insertedPairs
 					)
 				} else {
 					IME.prtDebugIntel("Not found in SymbolUnigram: \(key)")
 				}
 			}
 			// Test the cheap flag first so that the lookup is skipped when CNS is off.
 			if isCNSEnabled, lmCNS.hasUnigramsFor(key: key) {
 				cnsUnigrams = filterAndTransform(
 					unigrams: lmCNS.unigramsFor(key: key), filter: filteredPairs, inserted: &insertedPairs
 				)
 			}
 			let allUnigrams: [Megrez.Unigram] =
 				userUnigrams + miscUnigrams + coreUnigrams + cnsUnigrams + userSymbolUnigrams + symbolUnigrams
 			return allUnigrams
 		}
 		/// Tells whether the model has unigrams for the given key.
 		///
 		/// When the excluded-phrase model has no entry for the key, only the
 		/// user phrases and the core model are consulted; otherwise the full
 		/// `unigramsFor(key:)` result is computed to see what survives.
 		/// - Parameter key: The key.
 		/// - Returns: `true` for `" "` or when unigrams are available.
 		override open func hasUnigramsFor(key: String) -> Bool {
 			if key == " " { return true }
 			guard lmFiltered.hasUnigramsFor(key: key) else {
 				return lmUserPhrases.hasUnigramsFor(key: key) || lmCore.hasUnigramsFor(key: key)
 			}
 			return !unigramsFor(key: key).isEmpty
 		}
 		/// Returns the associated phrases for the given key.
 		/// - Parameter key: The key to look up in the associated-phrase data.
 		/// - Returns: The associated phrases, or an empty array when none exist.
 		public func associatedPhrasesForKey(_ key: String) -> [String] {
 			lmAssociates.valuesFor(key: key) ?? []
 		}
 		/// Tells whether the associated-phrase data has values for the given key.
 		/// - Parameter key: The key to look up in the associated-phrase data.
 		public func hasAssociatedPhrasesForKey(_ key: String) -> Bool {
 			lmAssociates.hasValuesFor(key: key)
 		}
 		// MARK: - Core Functions (Private)
 		/// Drops excluded pairs, applies phrase replacement and removes duplicates.
 		/// - Parameters:
 		///   - unigrams: The raw unigrams of one language model.
 		///   - filteredPairs: The key-value pairs to exclude; matched against the
 		///     pair before any replacement is applied.
 		///   - insertedPairs: The pairs emitted so far; every pair appended to the
 		///     result is also recorded here.
 		/// - Returns: The surviving unigrams in their incoming order, keeping the
 		///   original scores.
 		func filterAndTransform(
 			unigrams: [Megrez.Unigram],
 			filter filteredPairs: Set<Megrez.KeyValuePair>,
 			inserted insertedPairs: inout Set<Megrez.KeyValuePair>
 		) -> [Megrez.Unigram] {
 			var results: [Megrez.Unigram] = []
 			for unigram in unigrams where !filteredPairs.contains(unigram.keyValue) {
 				var pair: Megrez.KeyValuePair = unigram.keyValue
 				if isPhraseReplacementEnabled {
 					// The replacement map is looked up by the phrase (the value), not
 					// by the reading (the key): it replaces one phrase with another.
 					// NOTE(review): assumes LMReplacments is keyed by the original
 					// phrase, as in the ObjCpp predecessor — confirm.
 					let replacement = lmReplacements.valuesFor(key: pair.value)
 					if !replacement.isEmpty {
 						IME.prtDebugIntel(replacement)
 						pair.value = replacement
 					}
 				}
 				// insert(_:) reports whether the pair was new, which merges the
 				// membership test and the bookkeeping into one step.
 				if insertedPairs.insert(pair).inserted {
 					results.append(Megrez.Unigram(keyValue: pair, score: unigram.score))
 				}
 			}
 			return results
 		}
 	}
 }