vChewing-macOS/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift

330 lines
15 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import LineReader
import Megrez
extension LMAssembly {
  /// In-memory representation of a CIN-format cassette file.
  ///
  /// All tables are filled by `open(_:)` and are read-only from outside the type.
  struct LMCassette {
    /// Path handed to the most recent successful `open(_:)` call.
    public private(set) var filePath: String?
    /// Value of the `%sname` directive.
    public private(set) var nameShort = ""
    /// Name taken from the `%ename` directive.
    public private(set) var nameENG = ""
    /// Value of the `%cname` directive.
    public private(set) var nameCJK = ""
    /// Value of the `%intlname` directive, with underscores turned into spaces.
    public private(set) var nameIntl = ""
    /// Value of the `%nullcandidate` directive.
    public private(set) var nullCandidate = ""
    /// Longest key seen in the quick, chardef and symboldef sections (at least 1).
    public private(set) var maxKeyLength = 1
    /// Candidate-selection keys from `%selkey`; replaced by `1234567890` when
    /// `candidateKeysValidator` rejects them.
    public private(set) var selectionKeys = ""
    /// Individual characters of the `%endkey` directive, without duplicates.
    public private(set) var endKeys = [String]()
    /// First character of the `%wildcardkey` directive. `open(_:)` blanks it when a
    /// quick / chardef / symboldef section turns out to use it as a literal key.
    public private(set) var wildcardKey = ""
    /// Value of the `%keys_to_directly_commit` directive.
    public private(set) var keysToDirectlyCommit = ""
    /// Key -> display name, from the `%keyname` section.
    public private(set) var keyNameMap = [String: String]()
    /// Key -> concatenated values, from the `%quick` section.
    public private(set) var quickDefMap = [String: String]()
    /// Key -> values, from the `%chardef` section.
    public private(set) var charDefMap = [String: [String]]()
    /// Truncated chardef key followed by `wildcard` -> values.
    public private(set) var charDefWildcardMap = [String: [String]]()
    /// Key -> values, from the `%symboldef` section.
    public private(set) var symbolDefMap = [String: [String]]()
    /// Value -> keys, collected from both the chardef and symboldef sections.
    public private(set) var reverseLookupMap = [String: [String]]()
    /// [value: count], from two-column rows of the `%octagram` section.
    public private(set) var octagramMap = [String: Int]()
    /// [value: (count, key)], from three-column rows of the `%octagram` section.
    public private(set) var octagramDividedMap: [String: (Int, String)] = [:]
    /// Set by `open(_:)` when a selection key is also used inside a multi-character
    /// chardef key.
    public private(set) var areCandidateKeysShiftHeld = false
    /// Whether `quickSetsFor(key:)` may fall back to chardef data.
    public private(set) var supplyQuickResults = false
    /// Whether that fallback also accepts keys that merely start with the typed key.
    public private(set) var supplyPartiallyMatchedResults = false
    /// Injected check for the selection keys; the default rejects everything.
    public var candidateKeysValidator: (String) -> Bool = { _ in false }
    /// Normalization term (NORM) accumulated from the octagram rows and used by
    /// `calculateWeight`.
    private var norm: Double = 0
  }
}
extension LMAssembly.LMCassette {
/// Base of the length-dependent scaling factor (fscale) used when accumulating
/// `norm` and when computing weights in `calculateWeight`.
private static let fscale = 2.7
/// Wildcard marker used when building and querying `charDefWildcardMap`:
/// `wildcardKey` when it is set, otherwise the fallback literal below.
/// NOTE(review): the fallback literal is an empty string as seen here, which makes
/// this property equivalent to `wildcardKey` — confirm no placeholder character
/// was lost from the literal.
var wildcard: String { wildcardKey.isEmpty ? "" : wildcardKey }
/// Number of distinct keys held in `charDefMap`.
var count: Int { charDefMap.keys.count }
/// `true` once `charDefMap` holds at least one entry.
var isLoaded: Bool { charDefMap.isEmpty == false }
/// Every key listed in `keyNameMap` plus the space key, without duplicates.
var allowedKeys: [String] {
  let candidates = Array(keyNameMap.keys) + [" "]
  return candidates.deduplicated
}
/// Looks up the display name of a key.
/// - Parameter char: The key to look up in `keyNameMap`.
/// - Returns: The mapped display name, or `char` itself when no mapping exists.
func convertKeyToDisplay(char: String) -> String {
  guard let displayName = keyNameMap[char] else { return char }
  return displayName
}
/// Loads a CIN-format cassette file and fills this instance's tables from it.
///
/// Each line is split on tabs when it contains one, otherwise on spaces. Lines that
/// start with `%` are directives:
/// - `%flag_disp_partial_match` turns on both partial-match and quick results.
/// - `%keyname`, `%quick`, `%chardef`, `%symboldef` and `%octagram`, each followed
///   by `begin` or `end`, open or close the matching data section.
/// - `%ename`, `%cname`, `%sname` and `%intlname` set the cassette names; only the
///   first occurrence of each is kept. `%ename` may hold `name:locale` pairs joined
///   by `;`, in which case the first pair whose locale contains `en` wins; failing
///   that, the whole cell is used.
/// - `%selkey`, `%endkey`, `%wildcardkey`, `%nullcandidate` and
///   `%keys_to_directly_commit` set key-related options; first occurrence wins.
/// - Any other directive is ignored.
///
/// All other lines need at least two cells and are stored into whichever section
/// is currently open.
///
/// - Parameter path: File-system path of the cassette file.
/// - Returns: `true` when the file was read. `false` when data is already loaded,
///   the file does not exist, or it could not be opened; in the last two cases
///   `filePath` is restored to its previous value.
@discardableResult mutating func open(_ path: String) -> Bool {
  // Never load on top of data that is already present.
  if isLoaded { return false }
  let oldPath = filePath
  filePath = nil
  if FileManager.default.fileExists(atPath: path) {
    do {
      guard let fileHandle = FileHandle(forReadingAtPath: path) else {
        throw LMAssembly.FileErrors.fileHandleError("")
      }
      let lineReader = try LineReader(file: fileHandle)
      var theMaxKeyLength = 1
      var loadingKeys = false
      // Any `%quick begin` / `%quick end` marker enables quick results. When a
      // section is closed, the wildcard key is dropped if that section already
      // uses the same string as a literal key.
      var loadingQuickSets = false {
        willSet {
          supplyQuickResults = true
          if !newValue, quickDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
        }
      }
      var loadingCharDefinitions = false {
        willSet {
          if !newValue, charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
        }
      }
      var loadingSymbolDefinitions = false {
        willSet {
          if !newValue, symbolDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
        }
      }
      var loadingOctagramData = false
      var keysUsedInCharDef: Set<String> = .init()
      for strLine in lineReader {
        let isTabDelimiting = strLine.contains("\t")
        let cells = isTabDelimiting ? strLine.split(separator: "\t") : strLine.split(separator: " ")
        guard cells.count >= 1 else { continue }
        let strFirstCell = cells[0].trimmingCharacters(in: .newlines)
        let strSecondCell = cells.count >= 2 ? cells[1].trimmingCharacters(in: .newlines) : nil
        // Directive lines.
        if strLine.first == "%", strFirstCell != "%" {
          // Compare the trimmed line rather than the raw one: the cells above are
          // stripped of newline characters, so the raw line may still end with a
          // CR / newline (or trailing blanks) that would defeat an exact match.
          if strLine.trimmingCharacters(in: .whitespacesAndNewlines) == "%flag_disp_partial_match" {
            supplyPartiallyMatchedResults = true
            supplyQuickResults = true
          }
          guard let strSecondCell = strSecondCell else { continue }
          processTags: switch strFirstCell {
          case "%keyname" where strSecondCell == "begin": loadingKeys = true
          case "%keyname" where strSecondCell == "end": loadingKeys = false
          case "%quick" where strSecondCell == "begin": loadingQuickSets = true
          case "%quick" where strSecondCell == "end": loadingQuickSets = false
          case "%chardef" where strSecondCell == "begin": loadingCharDefinitions = true
          case "%chardef" where strSecondCell == "end": loadingCharDefinitions = false
          case "%symboldef" where strSecondCell == "begin": loadingSymbolDefinitions = true
          case "%symboldef" where strSecondCell == "end": loadingSymbolDefinitions = false
          case "%octagram" where strSecondCell == "begin": loadingOctagramData = true
          case "%octagram" where strSecondCell == "end": loadingOctagramData = false
          case "%ename" where nameENG.isEmpty:
            // Prefer the first `name:locale` pair whose locale mentions "en".
            parseSubCells: for neta in strSecondCell.components(separatedBy: ";") {
              let subNetaGroup = neta.components(separatedBy: ":")
              guard subNetaGroup.count == 2, subNetaGroup[1].contains("en") else { continue }
              nameENG = String(subNetaGroup[0])
              break parseSubCells
            }
            // No such pair: use the whole cell as the name.
            guard nameENG.isEmpty else { break processTags }
            nameENG = strSecondCell
          case "%intlname" where nameIntl.isEmpty: nameIntl = strSecondCell.replacingOccurrences(of: "_", with: " ")
          case "%cname" where nameCJK.isEmpty: nameCJK = strSecondCell
          case "%sname" where nameShort.isEmpty: nameShort = strSecondCell
          case "%nullcandidate" where nullCandidate.isEmpty: nullCandidate = strSecondCell
          case "%selkey" where selectionKeys.isEmpty: selectionKeys = strSecondCell.map(\.description).deduplicated.joined()
          case "%endkey" where endKeys.isEmpty: endKeys = strSecondCell.map(\.description).deduplicated
          case "%wildcardkey" where wildcardKey.isEmpty: wildcardKey = strSecondCell.first?.description ?? ""
          case "%keys_to_directly_commit" where keysToDirectlyCommit.isEmpty: keysToDirectlyCommit = strSecondCell
          default: break processTags
          }
          continue
        }
        // Data lines: a key and a value are both required.
        guard let strSecondCell = strSecondCell else { continue }
        if loadingKeys {
          keyNameMap[strFirstCell] = strSecondCell.trimmingCharacters(in: .newlines)
        } else if loadingQuickSets {
          theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
          quickDefMap[strFirstCell, default: .init()].append(strSecondCell)
        } else if loadingCharDefinitions, !loadingSymbolDefinitions {
          theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
          charDefMap[strFirstCell, default: []].append(strSecondCell)
          // Only multi-character keys contribute to the set that is later compared
          // against the selection keys.
          if strFirstCell.count > 1 {
            strFirstCell.map(\.description).forEach { keyChar in
              keysUsedInCharDef.insert(keyChar.description)
            }
          }
          reverseLookupMap[strSecondCell, default: []].append(strFirstCell)
          // Register the value under every shorter prefix of the key (down to the
          // empty prefix), each followed by the wildcard marker.
          var keyComps = strFirstCell.map(\.description)
          while !keyComps.isEmpty {
            keyComps.removeLast()
            charDefWildcardMap[keyComps.joined() + wildcard, default: []].append(strSecondCell)
          }
        } else if loadingSymbolDefinitions {
          theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
          symbolDefMap[strFirstCell, default: []].append(strSecondCell)
          reverseLookupMap[strSecondCell, default: []].append(strFirstCell)
        } else if loadingOctagramData {
          guard let countValue = Int(strSecondCell) else { continue }
          switch cells.count {
          case 2: octagramMap[strFirstCell] = countValue
          case 3: octagramDividedMap[strFirstCell] = (countValue, cells[2].trimmingCharacters(in: .newlines))
          default: break
          }
          // Every row with a numeric count feeds the normalization term, whatever
          // its column count.
          norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
        }
      }
      // Post process.
      // The selection keys are vetted by the injected validator; if it rejects
      // them, fall back to the digit row.
      if !candidateKeysValidator(selectionKeys) { selectionKeys = "1234567890" }
      // A selection key that is also a typing key inside a multi-character chardef
      // key sets the shift-held flag.
      if !keysUsedInCharDef.intersection(selectionKeys.map(\.description)).isEmpty {
        areCandidateKeysShiftHeld = true
      }
      maxKeyLength = theMaxKeyLength
      // Make sure the wildcard key has a display-name entry.
      // NOTE(review): the fallback literal is an empty string as seen here — confirm
      // that this is the intended display name.
      keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? ""
      filePath = path
      return true
    } catch {
      vCLMLog("CIN Loading Failed: File Access Error.")
    }
  } else {
    vCLMLog("CIN Loading Failed: File Missing.")
  }
  filePath = oldPath
  return false
}
/// Resets this instance to a freshly-initialized value, discarding all loaded
/// data. This also restores `candidateKeysValidator` to its default closure.
mutating func clear() {
  self = Self()
}
/// Builds the quick-candidate list for a typed key.
///
/// The `%quick` entry for `key` is used when it exists, split into single
/// characters. Otherwise, if quick results are enabled, single-character values
/// are taken from `charDefMap`: either from the exact key, or (with partial
/// matching on) from every key starting with `key`, shortest keys first. The
/// fallback is capped at six times the number of selection keys.
///
/// - Parameter key: The typed key; an empty key yields `nil`.
/// - Returns: The candidates joined by tabs, or `nil` when there are none.
func quickSetsFor(key: String) -> String? {
  if key.isEmpty { return nil }
  var collected: [String] = []
  if let predefined = quickDefMap[key], !predefined.isEmpty {
    collected += predefined.map(\.description)
  }
  if supplyQuickResults, collected.isEmpty {
    let quota = selectionKeys.count * 6
    let singles: [String]
    if supplyPartiallyMatchedResults {
      // Gather every chardef entry whose key begins with the typed key.
      var prefixed: [(key: String, value: [String])] = []
      for entry in charDefMap where entry.key.starts(with: key) {
        prefixed.append(entry)
      }
      singles = prefixed
        .stableSort { $0.key.count < $1.key.count }
        .flatMap(\.value)
        .filter { $0.count == 1 }
    } else {
      singles = (charDefMap[key] ?? [String]()).filter { $0.count == 1 }
    }
    collected.append(contentsOf: singles.deduplicated.prefix(quota))
  }
  guard !collected.isEmpty else { return nil }
  return collected.joined(separator: "\t")
}
/// Produces the unigrams registered for a key.
///
/// Exact matches from `charDefMap` come first. When `key` contains the wildcard
/// marker (but does not start with it), matches from `charDefWildcardMap` follow,
/// each with the lowest exact-match score (at most -9.5) added to its own score.
/// Values without octagram data get a position-based fallback score.
///
/// - parameters:
///   - key: The key to look up.
/// - Returns: The unigrams found; empty when nothing matches.
func unigramsFor(key: String) -> [Megrez.Unigram] {
  let exactValues = charDefMap[key]?.deduplicated ?? []
  var wildcardValues: [String] = []
  if let matched = charDefWildcardMap[key]?.deduplicated,
     key.contains(wildcard), key.first?.description != wildcard
  {
    wildcardValues.append(contentsOf: matched)
  }
  var unigrams = [Megrez.Unigram]()

  /// Score of one value: octagram-based when data exists (a key-specific row is
  /// preferred over a plain one), otherwise derived from the number of unigrams
  /// collected so far.
  func score(of value: String, fallbackOffset: Double) -> Double {
    if let divided = octagramDividedMap[value], key == divided.1 {
      return calculateWeight(count: divided.0, phraseLength: value.count)
    }
    if let frequency = octagramMap[value] {
      return calculateWeight(count: frequency, phraseLength: value.count)
    }
    return Double(unigrams.count) * -0.001 - fallbackOffset
  }

  var lowestScore: Double = 0
  for value in exactValues {
    let valueScore = score(of: value, fallbackOffset: 9.5)
    lowestScore = min(valueScore, lowestScore)
    unigrams.append(.init(value: value, score: valueScore))
  }
  lowestScore = min(-9.5, lowestScore)
  for value in wildcardValues {
    let shiftedScore = score(of: value, fallbackOffset: 9.7) + lowestScore
    unigrams.append(.init(value: value, score: shiftedScore))
  }
  return unigrams
}
/// Tells whether `unigramsFor(key:)` could find anything for a key.
/// - parameters:
///   - key: The key to check.
/// - Returns: `true` when `charDefMap` has the key, or when `charDefWildcardMap`
///   has it and the key contains the wildcard marker without starting with it.
func hasUnigramsFor(key: String) -> Bool {
  if charDefMap[key] != nil { return true }
  guard charDefWildcardMap[key] != nil else { return false }
  return key.contains(wildcard) && key.first?.description != wildcard
}
// MARK: - Private Functions.
/// Converts an octagram count into a log-scale weight.
/// - Parameters:
///   - theCount: The stored count. `-2` and `-1` are special markers that map to
///     a fixed weight of -13; `0` is treated as a count of 0.25.
///   - phraseLength: Character count of the value, used for length scaling.
/// - Returns: `log10(fscale ** (phraseLength / 3 - 1) * count / norm)`, or -13
///   for the special markers.
private func calculateWeight(count theCount: Int, phraseLength: Int) -> Double {
  let lengthFactor = Self.fscale ** (Double(phraseLength) / 3.0 - 1.0)
  switch theCount {
  case -2, -1:
    return -13
  case 0:
    return log10(lengthFactor * 0.25 / norm)
  default:
    return log10(lengthFactor * Double(theCount) / norm)
  }
}
}
// MARK: -
// Ref: https://stackoverflow.com/a/41581695/4162914
/// Precedence for the power operator: binds tighter than multiplication and
/// groups from the right, so `a ** b ** c` is `a ** (b ** c)`.
precedencegroup ExponentiationPrecedence {
  associativity: right
  higherThan: MultiplicationPrecedence
}

infix operator **: ExponentiationPrecedence

/// Raises `radix` to `power`; a thin wrapper over `pow`.
private func ** (_ radix: Double, _ power: Double) -> Double {
  return Foundation.pow(radix, power)
}