// Source path: vChewing-macOS/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift
// (Repository web-viewer residue, kept as a comment: "364 lines, 16 KiB, Swift". The viewer also warned
// that this file contains Unicode characters that might be confused with other characters.)


// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import Megrez
import Shared
public extension vChewingLM {
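/// LMInstantiator (referred to below as "LMI") conforms to `LangModelProtocol` and gathers the data
/// of several sub-models: factory dictionaries, user phrases, symbols, the filter list,
/// replacements, associates, cassette data and plain-bopomofo sequences.
///
/// When asked for the unigrams of a reading key, LMI queries each applicable sub-model in turn
/// and returns the merged results as a single array.
///
/// While building that array, LMI performs the following steps:
///
/// 1. Collect the raw results from every enabled sub-model.
/// 2. Append the date-time unigrams for the key.
/// 3. If phrase replacement is enabled, apply it to the collected results.
/// 4. Consolidate the results against the filter list.
///
/// LMI itself stores no file locations; every loader receives the path (or the parsed data)
/// through its parameters.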
class LMInstantiator: LangModelProtocol {
// Feature switches consulted when unigrams are queried.
/// When true, `unigramsFor(keyArray:)` includes results from the shared `lmCassette`, consults the
/// factory models (lmMisc / lmCore / lmCNS) only for keys whose first character component is "_",
/// and skips the factory symbol model.
public var isCassetteEnabled = false
/// When true, `unigramsFor(keyArray:)` swaps each unigram value for its non-empty match in `lmReplacements`.
public var isPhraseReplacementEnabled = false
/// When true, `unigramsFor(keyArray:)` appends results from the shared `lmCNS` model alongside the factory models.
public var isCNSEnabled = false
/// When true, `unigramsFor(keyArray:)` appends symbol results (`lmUserSymbols`, plus `lmSymbols` unless the cassette is enabled).
public var isSymbolEnabled = false
/// When true, `unigramsFor(keyArray:)` appends `lmPlainBopomofo` values as score-0 unigrams.
public var isSCPCEnabled = false
/// Set by `init(isCHS:)`; not read in this part of the file.
/// NOTE(review): presumably marks Simplified Chinese mode — confirm.
public var isCHS = false
/// Not read in this part of the file.
/// NOTE(review): presumably a year offset consumed by the date-time unigram query — confirm.
public var deltaOfCalendarYears: Int = -2000
// NOTE(review): the original comment mentions "package" — presumably this explicit public
// initializer is what keeps the type constructible from outside the package; confirm before removing.
/// Creates an instantiator.
/// - Parameter isCHS: Initial value of the `isCHS` flag; defaults to `false`.
public init(isCHS: Bool = false) {
  self.isCHS = isCHS
}
/// Overview of the sub-model types used below.
/// ----------------------
/// NOTE(review): the original overview lost its non-ASCII text. Only these keywords survive, so
/// everything here should be confirmed against the type definitions:
/// - "LMCoreEX key [Unigram]": presumably LMCoreEX maps a key to an array of unigrams.
/// - "LMReplacements LMAssociates": presumably these two use dedicated types instead of LMCoreEX.
/// - "LMCoreEX 2010-2013 mac" / "LMCoreNS plist": presumably LMCoreNS is the plist-backed variant
///   chosen for the factory data because of a concern about 2010-2013 Mac hardware.
// Factory models.
// NOTE(review): the original note described what the `reverse` flag means for the data layout;
// that text was lost — see the LMCoreNS / LMCoreEX initializers.
var lmCore = LMCoreNS(
reverse: false, consolidate: false, defaultScore: -9.9, forceDefaultScore: false
)
var lmMisc = LMCoreNS(
reverse: true, consolidate: false, defaultScore: -1.0, forceDefaultScore: false
)
// Static: one copy of each of these models is shared by every LMInstantiator instance.
// NOTE(review): the original note mentions 100MB — presumably the memory this sharing saves; confirm.
static var lmCNS = vChewingLM.LMCoreNS(
reverse: true, consolidate: false, defaultScore: -11.0, forceDefaultScore: false
)
static var lmSymbols = vChewingLM.LMCoreNS(
reverse: true, consolidate: false, defaultScore: -13.0, forceDefaultScore: false
)
// Cassette model; also static and therefore shared by every instance.
// NOTE(review): the original comment mentions `currentCassetteMetadata`, which is not visible in this part of the file.
static var lmCassette = LMCassette()
// User-data models, filled by the `load…Data(path:)` methods of this class.
// `lmFiltered` and `lmUserSymbols` are created with `forceDefaultScore: true`; `lmUserPhrases` is not.
var lmUserPhrases = LMCoreEX(
reverse: true, consolidate: true, defaultScore: 0, forceDefaultScore: false
)
var lmFiltered = LMCoreEX(
reverse: true, consolidate: true, defaultScore: 0, forceDefaultScore: true
)
var lmUserSymbols = LMCoreEX(
reverse: true, consolidate: true, defaultScore: -12.0, forceDefaultScore: true
)
var lmReplacements = LMReplacements()
var lmAssociates = LMAssociates()
var lmPlainBopomofo = LMPlainBopomofo()
// MARK: - Loading and resetting sub-model data
/// Clears the four plist-backed factory models: the two per-instance ones (`lmCore`, `lmMisc`)
/// and the two shared static ones (`lmCNS`, `lmSymbols`).
public func resetFactoryPlistModels() {
  // Per-instance models.
  lmCore.clear()
  lmMisc.clear()
  // Models shared by every instance.
  Self.lmCNS.clear()
  Self.lmSymbols.clear()
}
/// Whether the primary factory model (`lmCore`) reports itself as loaded.
public var isCoreLMLoaded: Bool {
  return lmCore.isLoaded
}

/// Feeds already-parsed plist data into the primary factory model (`lmCore`).
/// - Parameter plist: The parsed dictionary together with the path it came from. A `nil`
///   dictionary is logged as a file access failure and nothing is loaded.
public func loadLanguageModel(plist: (dict: [String: [Data]]?, path: String)) {
  if let parsedDict = plist.dict {
    lmCore.load((dict: parsedDict, path: plist.path))
    vCLog("lmCore: \(lmCore.count) entries of data loaded from: \(plist.path)")
  } else {
    vCLog("lmCore: File access failure: \(plist.path)")
  }
}
/// Whether the shared CNS model (`lmCNS`) reports itself as loaded.
public var isCNSDataLoaded: Bool {
  return Self.lmCNS.isLoaded
}

/// Feeds already-parsed plist data into the shared CNS model (`lmCNS`).
/// - Parameter plist: The parsed dictionary together with the path it came from. A `nil`
///   dictionary is logged as a file access failure and nothing is loaded.
public func loadCNSData(plist: (dict: [String: [Data]]?, path: String)) {
  if let parsedDict = plist.dict {
    Self.lmCNS.load((dict: parsedDict, path: plist.path))
    vCLog("lmCNS: \(Self.lmCNS.count) entries of data loaded from: \(plist.path)")
  } else {
    vCLog("lmCNS: File access failure: \(plist.path)")
  }
}
/// Whether the miscellaneous factory model (`lmMisc`) reports itself as loaded.
public var isMiscDataLoaded: Bool { lmMisc.isLoaded }

/// Feeds already-parsed plist data into the miscellaneous factory model (`lmMisc`).
/// - Parameter plist: The parsed dictionary together with the path it came from. A `nil`
///   dictionary is logged as a file access failure and nothing is loaded.
public func loadMiscData(plist: (dict: [String: [Data]]?, path: String)) {
  guard let plistDict = plist.dict else {
    // The failure is attributed to lmMisc (the log previously said "lmCore").
    vCLog("lmMisc: File access failure: \(plist.path)")
    return
  }
  lmMisc.load((dict: plistDict, path: plist.path))
  vCLog("lmMisc: \(lmMisc.count) entries of data loaded from: \(plist.path)")
}
/// Whether the shared factory symbol model (`lmSymbols`) reports itself as loaded.
public var isSymbolDataLoaded: Bool { Self.lmSymbols.isLoaded }

/// Feeds already-parsed plist data into the shared factory symbol model (`lmSymbols`).
/// - Parameter plist: The parsed dictionary together with the path it came from. A `nil`
///   dictionary is logged as a file access failure and nothing is loaded.
public func loadSymbolData(plist: (dict: [String: [Data]]?, path: String)) {
  guard let plistDict = plist.dict else {
    // The failure is attributed to lmSymbols (the log previously said "lmCore").
    vCLog("lmSymbols: File access failure: \(plist.path)")
    return
  }
  Self.lmSymbols.load((dict: plistDict, path: plist.path))
  vCLog("lmSymbols: \(Self.lmSymbols.count) entries of data loaded from: \(plist.path)")
}
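// NOTE(review): the original comment lost its non-ASCII text; its surviving keywords are
// "Async", "LMMgr" and "GCD". Presumably it says that the plist loaders above are deliberately
// not asynchronous because LMMgr already dispatches them via GCD — confirm against LMMgr.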
/// Reloads the user phrase model and the filter model from their files.
///
/// Each reload is queued as its own block on the main dispatch queue, so this method returns
/// before the data has actually been loaded.
/// - Parameters:
///   - path: File path of the user phrase data.
///   - filterPath: File path of the filter data.
public func loadUserPhrasesData(path: String, filterPath: String) {
  DispatchQueue.main.async {
    if FileManager.default.isReadableFile(atPath: path) {
      self.lmUserPhrases.clear()
      self.lmUserPhrases.open(path)
      vCLog("lmUserPhrases: \(self.lmUserPhrases.count) entries of data loaded from: \(path)")
    } else {
      vCLog("lmUserPhrases: File access failure: \(path)")
    }
  }
  DispatchQueue.main.async {
    if FileManager.default.isReadableFile(atPath: filterPath) {
      self.lmFiltered.clear()
      self.lmFiltered.open(filterPath)
      // Report the filter file's own path (the log previously printed the phrase file's path).
      vCLog("lmFiltered: \(self.lmFiltered.count) entries of data loaded from: \(filterPath)")
    } else {
      vCLog("lmFiltered: File access failure: \(filterPath)")
    }
  }
}
/// Reloads the user symbol model from a file.
///
/// The reload is queued on the main dispatch queue, so this method returns before the data
/// has actually been loaded.
/// - Parameter path: File path of the user symbol data.
public func loadUserSymbolData(path: String) {
  DispatchQueue.main.async {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLog("lmUserSymbol: File access failure: \(path)")
      return
    }
    self.lmUserSymbols.clear()
    self.lmUserSymbols.open(path)
    vCLog("lmUserSymbol: \(self.lmUserSymbols.count) entries of data loaded from: \(path)")
  }
}
/// Reloads the associated-phrases model from a file.
///
/// The reload is queued on the main dispatch queue, so this method returns before the data
/// has actually been loaded.
/// - Parameter path: File path of the associates data.
public func loadUserAssociatesData(path: String) {
  DispatchQueue.main.async {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLog("lmAssociates: File access failure: \(path)")
      return
    }
    self.lmAssociates.clear()
    self.lmAssociates.open(path)
    vCLog("lmAssociates: \(self.lmAssociates.count) entries of data loaded from: \(path)")
  }
}
/// Reloads the phrase-replacement model from a file.
///
/// The reload is queued on the main dispatch queue, so this method returns before the data
/// has actually been loaded.
/// - Parameter path: File path of the replacements data.
public func loadReplacementsData(path: String) {
  DispatchQueue.main.async {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLog("lmReplacements: File access failure: \(path)")
      return
    }
    self.lmReplacements.clear()
    self.lmReplacements.open(path)
    vCLog("lmReplacements: \(self.lmReplacements.count) entries of data loaded from: \(path)")
  }
}
/// Reloads the plain-bopomofo (SCPC) sequence model from a file.
///
/// The reload is queued on the main dispatch queue, so this method returns before the data
/// has actually been loaded.
/// - Parameter path: File path of the SCPC sequence data.
public func loadUserSCPCSequencesData(path: String) {
  DispatchQueue.main.async {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLog("lmPlainBopomofo: File access failure: \(path)")
      return
    }
    self.lmPlainBopomofo.clear()
    self.lmPlainBopomofo.open(path)
    vCLog("lmPlainBopomofo: \(self.lmPlainBopomofo.count) entries of data loaded from: \(path)")
  }
}
/// Whether the shared cassette model (`lmCassette`) reports itself as loaded.
public var isCassetteDataLoaded: Bool {
  return Self.lmCassette.isLoaded
}

/// Reloads the shared cassette model from a file.
///
/// The reload is queued on the main dispatch queue, so this method returns before the data
/// has actually been loaded. Because the model is static, the reload affects every instance.
/// - Parameter path: File path of the cassette data.
public static func loadCassetteData(path: String) {
  DispatchQueue.main.async {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLog("lmCassette: File access failure: \(path)")
      return
    }
    Self.lmCassette.clear()
    Self.lmCassette.open(path)
    vCLog("lmCassette: \(Self.lmCassette.count) entries of data loaded from: \(path)")
  }
}
// MARK: - Queries and user-data editing
/// Asks the associates model whether it holds any value for the given key-value pair.
/// - Parameter pair: The key-value pair to look up.
/// - Returns: Whatever `lmAssociates.hasValuesFor(pair:)` reports for `pair`.
public func hasAssociatedPhrasesFor(pair: Megrez.Compositor.KeyValuePaired) -> Bool {
  return lmAssociates.hasValuesFor(pair: pair)
}
/// Fetches the associated phrases stored for the given key-value pair.
/// - Parameter pair: The key-value pair to look up.
/// - Returns: The values returned by `lmAssociates.valuesFor(pair:)`.
public func associatedPhrasesFor(pair: Megrez.Compositor.KeyValuePaired) -> [String] {
  return lmAssociates.valuesFor(pair: pair)
}
/// Appends a unigram to the temporary map of either the filter model or the user phrase model.
/// - Parameters:
///   - keyArray: Reading keys; joined with "-" to form the map key.
///   - unigram: The unigram to append under that key.
///   - isFiltering: `true` targets `lmFiltered`; `false` targets `lmUserPhrases`.
public func insertTemporaryData(keyArray: [String], unigram: Megrez.Unigram, isFiltering: Bool) {
  let joinedKey = keyArray.joined(separator: "-")
  if isFiltering {
    lmFiltered.temporaryMap[joinedKey, default: []].append(unigram)
  } else {
    lmUserPhrases.temporaryMap[joinedKey, default: []].append(unigram)
  }
}
/// Returns the raw text (`strData`) currently held by one of the user-data sub-models.
/// - Parameter targetType: Which user-data sub-model to read from.
/// - Returns: The `strData` of the selected sub-model.
public func retrieveData(from targetType: ReplacableUserDataType) -> String {
  let content: String
  switch targetType {
  case .thePhrases:
    content = lmUserPhrases.strData
  case .theFilter:
    content = lmFiltered.strData
  case .theReplacements:
    content = lmReplacements.strData
  case .theAssociates:
    content = lmAssociates.strData
  case .theSymbols:
    content = lmUserSymbols.strData
  }
  return content
}
/// Replaces the in-memory text of one user-data sub-model and optionally persists it.
///
/// The incoming text is first passed through `LMConsolidator.consolidate(text:pragma:)`.
/// - Parameters:
///   - rawStrData: The new raw text content.
///   - targetType: Which user-data sub-model receives the new content.
///   - save: When `true` (the default), the sub-model that was just updated is asked to save itself.
public func replaceData(textData rawStrData: String, for targetType: ReplacableUserDataType, save: Bool = true) {
  var rawText = rawStrData
  LMConsolidator.consolidate(text: &rawText, pragma: true)
  // Each branch saves the model it has just modified. Previously every branch saved
  // `lmAssociates`, so edits to the other four models were never written out.
  // NOTE(review): this relies on every sub-model offering `saveData()` next to
  // `replaceData(textData:)`, as `lmAssociates` does — confirm.
  switch targetType {
  case .theAssociates:
    lmAssociates.replaceData(textData: rawText)
    if save { lmAssociates.saveData() }
  case .theFilter:
    lmFiltered.replaceData(textData: rawText)
    if save { lmFiltered.saveData() }
  case .theReplacements:
    lmReplacements.replaceData(textData: rawText)
    if save { lmReplacements.saveData() }
  case .thePhrases:
    lmUserPhrases.replaceData(textData: rawText)
    if save { lmUserPhrases.saveData() }
  case .theSymbols:
    lmUserSymbols.replaceData(textData: rawText)
    if save { lmUserSymbols.saveData() }
  }
}
/// Tells whether any unigram is available for the given reading keys.
/// - Parameter keyArray: Reading keys; joined with "-" to form the lookup key.
/// - Returns: `true` when the joined key is a single space, or when the joined key is non-empty
///   and `unigramsFor(keyArray:)` yields at least one unigram.
public func hasUnigramsFor(keyArray: [String]) -> Bool {
  let joinedKey = keyArray.joined(separator: "-")
  if joinedKey == " " { return true }
  let hasAnyUnigram = !unigramsFor(keyArray: keyArray).isEmpty
  return hasAnyUnigram && !joinedKey.isEmpty
}
/// Tells whether a unigram with the given value exists for the given reading keys.
/// - Parameters:
///   - keyArray: Reading keys; joined with "-" when the factory model is queried directly.
///   - value: The unigram value to look for.
///   - factoryDictionaryOnly: When `true`, only `lmCore` is searched; otherwise the full
///     `unigramsFor(keyArray:)` result is searched. Defaults to `false`.
/// - Returns: `true` when a unigram carrying `value` is found.
public func hasKeyValuePairFor(keyArray: [String], value: String, factoryDictionaryOnly: Bool = false) -> Bool {
  if factoryDictionaryOnly {
    let factoryKey = keyArray.joined(separator: "-")
    return lmCore.unigramsFor(key: factoryKey).contains { $0.value == value }
  }
  return unigramsFor(keyArray: keyArray).contains { $0.value == value }
}
/// Collects every unigram this instantiator can offer for the given reading keys.
///
/// Sources are appended in this order: cassette (if enabled), plain-bopomofo sequences (if SCPC
/// is enabled), user phrases (in reversed order), factory models (misc, core, then CNS if
/// enabled), symbols (user symbols, then factory symbols) and date-time unigrams. Phrase
/// replacement is applied afterwards when enabled, and the list is finally consolidated
/// against the values found in the filter model.
/// - Parameter keyArray: Reading keys; joined with "-" to form the lookup key.
/// - Returns: The consolidated unigrams. An empty joined key yields an empty array, and a
///   single-space key yields exactly one " " unigram.
public func unigramsFor(keyArray: [String]) -> [Megrez.Unigram] {
  let keyChain = keyArray.joined(separator: "-")
  if keyChain.isEmpty { return [] }
  // A lone space key short-circuits every lookup.
  if keyChain == " " { return [.init(value: " ")] }

  var collected: [Megrez.Unigram] = []

  if isCassetteEnabled {
    collected += Self.lmCassette.unigramsFor(key: keyChain)
  }

  // Plain-bopomofo values are wrapped as unigrams with a score of 0.
  if isSCPCEnabled {
    collected += lmPlainBopomofo.valuesFor(key: keyChain).map { Megrez.Unigram(value: $0, score: 0) }
  }

  // User phrases are appended in reversed order.
  collected += lmUserPhrases.unigramsFor(key: keyChain).reversed()

  // Factory data is used when the cassette is off, or for keys whose first character
  // component is "_" while the cassette is on. `keyChain` is non-empty here, so index 0 exists.
  if !isCassetteEnabled || keyChain.charComponents[0] == "_" {
    collected += lmMisc.unigramsFor(key: keyChain)
    collected += lmCore.unigramsFor(key: keyChain)
    if isCNSEnabled {
      collected += Self.lmCNS.unigramsFor(key: keyChain)
    }
  }

  if isSymbolEnabled {
    collected += lmUserSymbols.unigramsFor(key: keyChain)
    if !isCassetteEnabled {
      collected += Self.lmSymbols.unigramsFor(key: keyChain)
    }
  }

  // Date-time unigrams for this key.
  collected += queryDateTimeUnigrams(with: keyChain)

  // Swap in replacement values wherever the replacement model has a non-empty match.
  if isPhraseReplacementEnabled {
    for index in collected.indices {
      let replacement = lmReplacements.valuesFor(key: collected[index].value)
      if replacement.isEmpty { continue }
      collected[index].value = replacement
    }
  }

  // Consolidate against the values listed for this key in the filter model.
  let filteredValues = lmFiltered.unigramsFor(key: keyChain).map(\.value)
  collected.consolidate(filter: .init(filteredValues))
  return collected
}
}
}