vChewing-macOS/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift

438 lines
18 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import Megrez
public extension LMAssembly {
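/// The language model instantiator (LMInstantiator, abbreviated "LMI" below).
/// It conforms to LangModelProtocol and gathers the data supplied by its
/// sub-models: user phrases, the user filter list, user symbols, phrase
/// replacements, associated phrases, the cassette model and the factory data.
///
/// When it is queried with an array of reading keys, the LMI asks each enabled
/// source for matching unigrams and hands the merged results back as one array.
///
/// While assembling that array, the LMI performs these steps:
///
/// 1. Collect the raw unigrams from every enabled source.
/// 2. If phrase replacement is enabled, apply the replacement table to the results.
/// 3. Drop every result whose value appears in the user filter list for that key.
/// 4. Consolidate what remains (delegated to `consolidate(filter:)`).
///
/// Data file paths are not stored by the LMI in this section; they are supplied
/// as parameters of the loading methods.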
class LMInstantiator: LangModelProtocol {
/// Options that control which data sources the instantiator consults and how
/// results are post-processed. Every flag starts out disabled.
public struct Config {
  /// Tri-state numeric-keypad setting; `nil` means it is not set.
  /// NOTE(review): presumably true = full-width, false = half-width (from the "FWHW" name) — confirm.
  public var numPadFWHWStatus: Bool?
  /// Include cassette results; factory data is then only consulted for keys starting with "_".
  public var isCassetteEnabled: Bool = false
  /// Run every result value through the phrase-replacement table.
  public var isPhraseReplacementEnabled: Bool = false
  /// Include the CNS column of the factory data.
  public var isCNSEnabled: Bool = false
  /// Include user symbols (and the factory symbol column when the cassette is off).
  public var isSymbolEnabled: Bool = false
  /// Include values from the plain-Bopomofo model.
  public var isSCPCEnabled: Bool = false
  /// Remove core unigrams that fail the CNS conformation check (non-CHS instances only).
  public var filterNonCNSReadings: Bool = false
  /// Year offset. NOTE(review): not consumed in this section; presumably used by the date-time unigram query — confirm.
  public var deltaOfCalendarYears: Int = -2_000
}
/// When true, the `load…Data` methods enqueue their file loading on
/// `DispatchQueue.main`; when false they load synchronously.
public static var asyncLoadingUserData: Bool = true
// SQLite database handle. NOTE(review): not used within this section; presumably the factory dictionary connection — confirm.
static var ptrSQL: OpaquePointer?
// Whether the SQLite database is connected; only settable from inside the module.
public internal(set) static var isSQLDBConnected: Bool = false
// Fixed at init time. Forwarded to the plain-Bopomofo lookup and the input-token parser,
// and disables CNS reading filtering when true. NOTE(review): presumably "Simplified Chinese" — confirm.
public let isCHS: Bool
// The active options; readable from outside, mutated through `setOptions(handler:)`.
public private(set) var config = Config()
/// Creates an instantiator. The explicit `public init` is what allows the class
/// to be constructed from outside the package.
/// - Parameters:
///   - isCHS: Stored in `isCHS`; defaults to `false`.
///   - uomDataURL: Data location handed to the user-override model; defaults to `nil`.
public init(isCHS: Bool = false, uomDataURL: URL? = nil) {
  lmUserOverride = LMUserOverride(dataURL: uomDataURL)
  self.isCHS = isCHS
}
/// Mutates the current options in place through `handler`.
/// - Parameter handler: Receives the live `Config` as an `inout` value.
/// - Returns: This instance, so calls can be chained; the result may be discarded.
@discardableResult public func setOptions(handler: (inout Config) -> Void) -> LMInstantiator {
// The handler edits `config` directly; no copy is made.
handler(&config)
return self
}
/// Installs the closure that the shared cassette model uses as its candidate-key validator.
/// - Parameter validator: Takes a key string and returns whether it is acceptable.
public static func setCassetCandidateKeyValidator(_ validator: @escaping (String) -> Bool) {
  lmCassette.candidateKeysValidator = validator
}
/// Overview of the sub-model types declared below (as far as this section shows):
/// ----------------------
/// LMCoreEX backs the user phrases, the user filter list and the user symbols;
/// looking a key up in it yields an array of unigrams.
/// LMReplacements and LMAssociates carry the phrase-replacement table and the
/// associated-phrase data respectively.
/// NOTE(review): the original remarks about LMCoreEX on 2010-2013 Mac hardware and
/// about LMCoreJSON (JSON data) were lost to character stripping — restore from upstream.
// Cassette model shared by all instances.
// NOTE(review): the original note referred to currentCassetteMetadata — confirm the intent upstream.
static var lmCassette = LMCassette()
// Plain-Bopomofo model shared by all instances; consulted when `config.isSCPCEnabled` is true.
static var lmPlainBopomofo = LMPlainBopomofo()
// Per-instance user data models follow.
// User phrases: default score 0, not forced.
var lmUserPhrases = LMCoreEX(
reverse: true, consolidate: true, defaultScore: 0, forceDefaultScore: false
)
// User filter list: default score 0, forced.
var lmFiltered = LMCoreEX(
reverse: true, consolidate: true, defaultScore: 0, forceDefaultScore: true
)
// User symbols: default score -12.0, forced.
var lmUserSymbols = LMCoreEX(
reverse: true, consolidate: true, defaultScore: -12.0, forceDefaultScore: true
)
var lmReplacements = LMReplacements()
var lmAssociates = LMAssociates()
// User-override model; created in `init` from the supplied data URL.
var lmUserOverride: LMUserOverride
// MARK: - Utility functions (data loading)
/// Does nothing in this implementation.
/// NOTE(review): presumably kept so existing callers keep compiling — confirm.
public func resetFactoryJSONModels() {
  // Intentionally empty.
}
/// Loads the user phrase file and, when a filter path is given, the user filter file.
///
/// Each load runs immediately when `asyncLoadingUserData` is false; otherwise it is
/// enqueued on the main dispatch queue. An unreadable file is logged and leaves the
/// corresponding model untouched.
/// - Parameters:
///   - path: File path of the user phrase data.
///   - filterPath: File path of the user filter data; `nil` skips the filter load.
public func loadUserPhrasesData(path: String, filterPath: String?) {
  func loadMain() {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLMLog("lmUserPhrases: File access failure: \(path)")
      return
    }
    lmUserPhrases.clear()
    lmUserPhrases.open(path)
    vCLMLog("lmUserPhrases: \(lmUserPhrases.count) entries of data loaded from: \(path)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { loadMain() }
  } else {
    loadMain()
  }

  guard let filterPath = filterPath else { return }
  func loadFilter() {
    // Log the filter file's own path, not the phrase file's path.
    guard FileManager.default.isReadableFile(atPath: filterPath) else {
      vCLMLog("lmFiltered: File access failure: \(filterPath)")
      return
    }
    lmFiltered.clear()
    lmFiltered.open(filterPath)
    vCLMLog("lmFiltered: \(lmFiltered.count) entries of data loaded from: \(filterPath)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { loadFilter() }
  } else {
    loadFilter()
  }
}
/// Reloads the user filter list synchronously, without going through GCD.
/// An unreadable file is logged and leaves the current filter data untouched.
/// - Parameter path: File path of the user filter data.
public func reloadUserFilterDirectly(path: String) {
  guard FileManager.default.isReadableFile(atPath: path) else {
    vCLMLog("lmFiltered: File access failure: \(path)")
    return
  }
  lmFiltered.clear()
  lmFiltered.open(path)
  vCLMLog("lmFiltered: \(lmFiltered.count) entries of data loaded from: \(path)")
}
/// Loads the user symbol file into `lmUserSymbols`.
///
/// The load runs immediately when `asyncLoadingUserData` is false; otherwise it is
/// enqueued on the main dispatch queue. An unreadable file is only logged.
/// - Parameter path: File path of the user symbol data.
public func loadUserSymbolData(path: String) {
  func reloadSymbols() {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLMLog("lmUserSymbol: File access failure: \(path)")
      return
    }
    lmUserSymbols.clear()
    lmUserSymbols.open(path)
    vCLMLog("lmUserSymbol: \(lmUserSymbols.count) entries of data loaded from: \(path)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { reloadSymbols() }
  } else {
    reloadSymbols()
  }
}
/// Loads the associated-phrase file into `lmAssociates`.
///
/// The load runs immediately when `asyncLoadingUserData` is false; otherwise it is
/// enqueued on the main dispatch queue. An unreadable file is only logged.
/// - Parameter path: File path of the associated-phrase data.
public func loadUserAssociatesData(path: String) {
  func reloadAssociates() {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLMLog("lmAssociates: File access failure: \(path)")
      return
    }
    lmAssociates.clear()
    lmAssociates.open(path)
    vCLMLog("lmAssociates: \(lmAssociates.count) entries of data loaded from: \(path)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { reloadAssociates() }
  } else {
    reloadAssociates()
  }
}
/// Loads the phrase-replacement file into `lmReplacements`.
///
/// The load runs immediately when `asyncLoadingUserData` is false; otherwise it is
/// enqueued on the main dispatch queue. An unreadable file is only logged.
/// - Parameter path: File path of the phrase-replacement data.
public func loadReplacementsData(path: String) {
  func reloadReplacements() {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLMLog("lmReplacements: File access failure: \(path)")
      return
    }
    lmReplacements.clear()
    lmReplacements.open(path)
    vCLMLog("lmReplacements: \(lmReplacements.count) entries of data loaded from: \(path)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { reloadReplacements() }
  } else {
    reloadReplacements()
  }
}
/// Whether the shared cassette model reports itself as loaded.
public var isCassetteDataLoaded: Bool {
  return Self.lmCassette.isLoaded
}
/// Loads a cassette file into the shared cassette model.
///
/// The load runs immediately when `asyncLoadingUserData` is false; otherwise it is
/// enqueued on the main dispatch queue. An unreadable file is only logged.
/// - Parameter path: File path of the cassette data.
public static func loadCassetteData(path: String) {
  func reloadCassette() {
    guard FileManager.default.isReadableFile(atPath: path) else {
      vCLMLog("lmCassette: File access failure: \(path)")
      return
    }
    Self.lmCassette.clear()
    Self.lmCassette.open(path)
    vCLMLog("lmCassette: \(Self.lmCassette.count) entries of data loaded from: \(path)")
  }
  if Self.asyncLoadingUserData {
    DispatchQueue.main.async { reloadCassette() }
  } else {
    reloadCassette()
  }
}
// MARK: - Core functions (queries)
/// Asks the associated-phrase model whether it holds any values for `pair`.
/// - Parameter pair: The key-value pair to look up.
/// - Returns: Whatever `lmAssociates.hasValuesFor(pair:)` reports.
public func hasAssociatedPhrasesFor(pair: Megrez.KeyValuePaired) -> Bool {
  return lmAssociates.hasValuesFor(pair: pair)
}
/// Fetches the associated phrases recorded for `pair`.
/// - Parameter pair: The key-value pair to look up.
/// - Returns: The values supplied by `lmAssociates.valuesFor(pair:)`.
public func associatedPhrasesFor(pair: Megrez.KeyValuePaired) -> [String] {
  return lmAssociates.valuesFor(pair: pair)
}
/// Looks `key` up in the phrase-replacement table.
/// - Parameter key: The phrase to look up.
/// - Returns: The replacement text, or `nil` when the table yields an empty result.
public func queryReplacementValue(key: String) -> String? {
  let replacement = lmReplacements.valuesFor(key: key)
  guard !replacement.isEmpty else { return nil }
  return replacement
}
/// Tells whether `pair` is listed in the user filter.
/// - Parameter pair: The key-value pair to check.
/// - Returns: `true` when the filter entries stored under the pair's joined key
///   include an entry with the pair's value.
public func isPairFiltered(pair: Megrez.KeyValuePaired) -> Bool {
  let filteredEntries = lmFiltered.unigramsFor(key: pair.joinedKey())
  return filteredEntries.contains { $0.value == pair.value }
}
/// Appends a unigram to the temporary map of either the filter model or the
/// user phrase model.
/// - Parameters:
///   - keyArray: Reading keys; they are joined with "-" to form the map key.
///   - unigram: The unigram to append.
///   - isFiltering: `true` targets `lmFiltered`; `false` targets `lmUserPhrases`.
public func insertTemporaryData(keyArray: [String], unigram: Megrez.Unigram, isFiltering: Bool) {
  let joinedKey = keyArray.joined(separator: "-")
  if isFiltering {
    lmFiltered.temporaryMap[joinedKey, default: []].append(unigram)
  } else {
    lmUserPhrases.temporaryMap[joinedKey, default: []].append(unigram)
  }
}
/// Returns the text data currently held by one of the user data models.
/// - Parameter targetType: Selects which model's `strData` to return.
/// - Returns: The selected model's `strData`.
public func retrieveData(from targetType: ReplacableUserDataType) -> String {
  let textData: String
  switch targetType {
  case .thePhrases:
    textData = lmUserPhrases.strData
  case .theFilter:
    textData = lmFiltered.strData
  case .theReplacements:
    textData = lmReplacements.strData
  case .theAssociates:
    textData = lmAssociates.strData
  case .theSymbols:
    textData = lmUserSymbols.strData
  }
  return textData
}
/// Replaces the text data of one of the user data models and optionally saves it.
///
/// The incoming text is first passed through `LMConsolidator.consolidate(text:pragma:)`.
/// - Parameters:
///   - rawStrData: The new text data.
///   - targetType: Selects which model receives the data.
///   - save: When `true` (the default), the model that was just replaced is saved.
public func replaceData(textData rawStrData: String, for targetType: ReplacableUserDataType, save: Bool = true) {
  var rawText = rawStrData
  LMConsolidator.consolidate(text: &rawText, pragma: true)
  // Each branch must save the same model it has just replaced; saving
  // `lmAssociates` in every branch would leave the other models unsaved.
  switch targetType {
  case .theAssociates:
    lmAssociates.replaceData(textData: rawText)
    if save { lmAssociates.saveData() }
  case .theFilter:
    lmFiltered.replaceData(textData: rawText)
    if save { lmFiltered.saveData() }
  case .theReplacements:
    lmReplacements.replaceData(textData: rawText)
    if save { lmReplacements.saveData() }
  case .thePhrases:
    lmUserPhrases.replaceData(textData: rawText)
    if save { lmUserPhrases.saveData() }
  case .theSymbols:
    lmUserSymbols.replaceData(textData: rawText)
    if save { lmUserSymbols.saveData() }
  }
}
/// Tells whether any unigram is available for the given reading keys.
/// - Parameter keyArray: Reading keys; they are joined with "-" to form the lookup key.
/// - Returns: `true` for the lone-space key, or when the joined key is non-empty
///   and `unigramsFor(keyArray:)` yields at least one result.
public func hasUnigramsFor(keyArray: [String]) -> Bool {
  let joinedKey = keyArray.joined(separator: "-")
  if joinedKey == " " { return true }
  // This goes through the full `unigramsFor(keyArray:)` pipeline on purpose,
  // so the answer reflects the filter list and every other processing step.
  let hasResults = !unigramsFor(keyArray: keyArray).isEmpty
  return hasResults && !joinedKey.isEmpty
}
/// Tells whether a unigram with the given value exists for the given reading keys.
/// - Parameters:
///   - keyArray: Reading keys.
///   - value: The unigram value to look for.
///   - factoryDictionaryOnly: When `true`, only the factory core data is searched
///     (keys joined with "-"); otherwise the full `unigramsFor(keyArray:)` result is searched.
/// - Returns: `true` when a matching value is found.
public func hasKeyValuePairFor(keyArray: [String], value: String, factoryDictionaryOnly: Bool = false) -> Bool {
  let candidates: [Megrez.Unigram]
  if factoryDictionaryOnly {
    candidates = factoryCoreUnigramsFor(key: keyArray.joined(separator: "-"))
  } else {
    candidates = unigramsFor(keyArray: keyArray)
  }
  return candidates.contains { $0.value == value }
}
/// Counts the unigrams available for the given reading keys.
/// - Parameters:
///   - keyArray: Reading keys.
///   - factoryDictionaryOnly: When `true`, only the factory core data is counted
///     (keys joined with "-"); otherwise the full `unigramsFor(keyArray:)` result is counted.
/// - Returns: The number of unigrams found.
public func countKeyValuePairs(keyArray: [String], factoryDictionaryOnly: Bool = false) -> Int {
  if factoryDictionaryOnly {
    return factoryCoreUnigramsFor(key: keyArray.joined(separator: "-")).count
  }
  return unigramsFor(keyArray: keyArray).count
}
/// Gathers, post-processes and returns every unigram the LMI can offer for the
/// given reading keys.
///
/// Sources are consulted in a fixed order (cassette, plain Bopomofo, factory data,
/// symbols), user phrases are placed in front, input tokens are expanded, date-time
/// and punctuation-menu entries are appended, phrase replacement is applied, and the
/// result is finally consolidated against the user filter list.
/// - Parameter keyArray: Reading keys; they are joined with "-" to form the lookup key.
/// - Returns: The processed unigrams; empty when the joined key is empty.
public func unigramsFor(keyArray: [String]) -> [Megrez.Unigram] {
  let keyChain = keyArray.joined(separator: "-")
  if keyChain.isEmpty { return [] }
  // A lone space key maps straight to a single space unigram.
  if keyChain == " " { return [Megrez.Unigram(value: " ")] }

  // Results from every source except the user phrases.
  var collected: [Megrez.Unigram] = []
  if config.isCassetteEnabled {
    collected += Self.lmCassette.unigramsFor(key: keyChain)
  }
  // Plain-Bopomofo values enter with score 0.
  if config.isSCPCEnabled {
    let plainValues = Self.lmPlainBopomofo.valuesFor(key: keyChain, isCHS: isCHS)
    collected += plainValues.map { Megrez.Unigram(value: $0, score: 0) }
  }

  // Factory data is skipped in cassette mode unless the key starts with "_".
  if !config.isCassetteEnabled || keyChain.first == "_" {
    // Numeric keypad entries.
    collected += supplyNumPadUnigrams(key: keyChain)
    // Miscellaneous factory column.
    collected += factoryUnigramsFor(key: keyChain, column: .theDataCHEW)
    // Core factory data, optionally restricted to CNS-conforming readings.
    var coreUnigrams: [Megrez.Unigram] = factoryCoreUnigramsFor(key: keyChain)
    if config.filterNonCNSReadings, !isCHS {
      coreUnigrams = coreUnigrams.filter { candidate in
        checkCNSConformation(for: candidate, keyArray: keyArray)
      }
    }
    collected += coreUnigrams
    if config.isCNSEnabled {
      collected += factoryUnigramsFor(key: keyChain, column: .theDataCNS)
    }
  }

  if config.isSymbolEnabled {
    collected += lmUserSymbols.unigramsFor(key: keyChain)
    if !config.isCassetteEnabled {
      collected += factoryUnigramsFor(key: keyChain, column: .theDataSYMB)
    }
  }

  // User phrases are taken in reversed order and placed ahead of everything else.
  var userUnigrams: [Megrez.Unigram] = Array(lmUserPhrases.unigramsFor(key: keyChain).reversed())
  if keyArray.count == 1, let topScore = collected.map(\.score).max() {
    // For single-key queries, cap user phrase scores just above the best
    // score found among the other sources.
    let scoreCap = topScore + 0.000_114_514
    userUnigrams = userUnigrams.map { entry in
      Megrez.Unigram(value: entry.value, score: Swift.min(scoreCap, entry.score))
    }
  }
  collected = userUnigrams + collected

  // Expand input tokens: a value that parses into one or more converted values
  // is replaced by them, scored from -80 downwards in steps of 0.01.
  collected = collected.flatMap { unigram -> [Megrez.Unigram] in
    let expansions = unigram.value.parseAsInputToken(isCHS: isCHS)
    if expansions.isEmpty { return [unigram] }
    return expansions.enumerated().map { offset, expanded in
      let expandedScore: Double = -80 - Double(offset) * 0.01
      return Megrez.Unigram(value: expanded, score: expandedScore)
    }
  }

  // Date-time entries, plus the symbol menu for the punctuation-list key.
  collected += queryDateTimeUnigrams(with: keyChain)
  if keyChain == "_punctuation_list" {
    collected += getHaninSymbolMenuUnigrams()
  }

  // Phrase replacement: swap in the replacement wherever the table has one.
  if config.isPhraseReplacementEnabled {
    for index in collected.indices {
      let replacement = lmReplacements.valuesFor(key: collected[index].value)
      if replacement.isEmpty { continue }
      collected[index].value = replacement
    }
  }

  // Consolidate, removing every value listed in the user filter for this key.
  let filteredValues = lmFiltered.unigramsFor(key: keyChain).map(\.value)
  collected.consolidate(filter: .init(filteredValues))
  return collected
}
}
}