From 1c92ab8edf2001582f150d18bece37b9ad73a1c8 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Sun, 31 Dec 2023 01:21:22 +0800 Subject: [PATCH] LMCassette // Refactor && Fix .clear(). --- .../LangModelAssembly/SubLMs/lmCassette.swift | 511 ++++++++---------- .../LMCassetteTests.swift | 2 +- .../LMUserOverrideTests.swift | 22 +- .../Tests/TestCINData/array30.cin2 | 71 ++- 4 files changed, 278 insertions(+), 328 deletions(-) diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift index 88b1d676..4065a9c2 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift @@ -40,314 +40,275 @@ public extension vChewingLM { public private(set) var areCandidateKeysShiftHeld: Bool = false public private(set) var supplyQuickResults: Bool = false public private(set) var supplyPartiallyMatchedResults: Bool = false - - /// 計算頻率時要用到的東西 - private static let fscale = 2.7 + /// 計算頻率時要用到的東西 - NORM private var norm = 0.0 + } +} - /// 萬用花牌字符,哪怕花牌鍵仍不可用。 - public var wildcard: String { wildcardKey.isEmpty ? "†" : wildcardKey } - /// 資料陣列內承載的核心 charDef 資料筆數。 - public var count: Int { charDefMap.count } - /// 是否已有資料載入。 - public var isLoaded: Bool { !charDefMap.isEmpty } - /// 返回「允許使用的敲字鍵」的陣列。 - public var allowedKeys: [String] { Array(keyNameMap.keys + [" "]).deduplicated } - /// 將給定的按鍵字母轉換成要顯示的形態。 - public func convertKeyToDisplay(char: String) -> String { - keyNameMap[char] ?? char - } +public extension vChewingLM.LMCassette { + /// 計算頻率時要用到的東西 - fscale + private static let fscale = 2.7 + /// 萬用花牌字符,哪怕花牌鍵仍不可用。 + var wildcard: String { wildcardKey.isEmpty ? "†" : wildcardKey } + /// 資料陣列內承載的核心 charDef 資料筆數。 + var count: Int { charDefMap.count } + /// 是否已有資料載入。 + var isLoaded: Bool { !charDefMap.isEmpty } + /// 返回「允許使用的敲字鍵」的陣列。 + var allowedKeys: [String] { Array(keyNameMap.keys + [" "]).deduplicated } + /// 將給定的按鍵字母轉換成要顯示的形態。 + func convertKeyToDisplay(char: String) -> String { + keyNameMap[char] ?? char + } - /// 載入給定的 CIN 檔案內容。 - /// - Note: - /// - 檢查是否以 `%gen_inp` 或者 `%ename` 開頭、以確認其是否為 cin 檔案。在讀到這些資訊之前的行都會被忽略。 - /// - `%ename` 決定磁帶的英文名、`%cname` 決定磁帶的 CJK 名稱、 - /// `%sname` 決定磁帶的最短英文縮寫名稱、`%intlname` 決定磁帶的本地化名稱綜合字串。 - /// - `%encoding` 不處理,因為 Swift 只認 UTF-8。 - /// - `%selkey` 不處理,因為威注音輸入法有自己的選字鍵體系。 - /// - `%endkey` 是會觸發組字事件的按鍵。 - /// - `%wildcardkey` 決定磁帶的萬能鍵名稱,只有第一個字元會生效。 - /// - `%nullcandidate` 用來指明 `%quick` 字段給出的候選字當中有哪一種是無效的。 - /// - `%keyname begin` 至 `%keyname end` 之間是字根翻譯表,先讀取為 Swift 辭典以備用。 - /// - `%quick begin` 至 `%quick end` 之間則是簡碼資料,對應的 value 得拆成單個漢字。 - /// - `%chardef begin` 至 `%chardef end` 之間則是詞庫資料。 - /// - `%symboldef begin` 至 `%symboldef end` 之間則是符號選單的專用資料。 - /// - `%octagram begin` 至 `%octagram end` 之間則是詞語頻次資料。 - /// 第三欄資料為對應字根、可有可無。第一欄與第二欄分別為「字詞」與「統計頻次」。 - /// - Parameter path: 檔案路徑。 - /// - Returns: 是否載入成功。 - @discardableResult public mutating func open(_ path: String) -> Bool { - if isLoaded { return false } - let oldPath = filePath - filePath = nil - if FileManager.default.fileExists(atPath: path) { - do { - guard let fileHandle = FileHandle(forReadingAtPath: path) else { - throw FileErrors.fileHandleError("") + /// 載入給定的 CIN 檔案內容。 + /// - Note: + /// - 檢查是否以 `%gen_inp` 或者 `%ename` 開頭、以確認其是否為 cin 檔案。在讀到這些資訊之前的行都會被忽略。 + /// - `%ename` 決定磁帶的英文名、`%cname` 決定磁帶的 CJK 名稱、 + /// `%sname` 決定磁帶的最短英文縮寫名稱、`%intlname` 決定磁帶的本地化名稱綜合字串。 + /// - `%encoding` 不處理,因為 Swift 只認 UTF-8。 + /// - `%selkey` 不處理,因為威注音輸入法有自己的選字鍵體系。 + /// - `%endkey` 是會觸發組字事件的按鍵。 + /// - `%wildcardkey` 決定磁帶的萬能鍵名稱,只有第一個字元會生效。 + /// - `%nullcandidate` 用來指明 `%quick` 字段給出的候選字當中有哪一種是無效的。 + /// - `%keyname begin` 至 `%keyname end` 之間是字根翻譯表,先讀取為 Swift 辭典以備用。 + /// - `%quick begin` 至 `%quick end` 之間則是簡碼資料,對應的 value 得拆成單個漢字。 + /// - `%chardef begin` 至 `%chardef end` 之間則是詞庫資料。 + /// - `%symboldef begin` 至 `%symboldef end` 之間則是符號選單的專用資料。 + /// - `%octagram begin` 至 `%octagram end` 之間則是詞語頻次資料。 + /// 第三欄資料為對應字根、可有可無。第一欄與第二欄分別為「字詞」與「統計頻次」。 + /// - Parameter path: 檔案路徑。 + /// - Returns: 是否載入成功。 + @discardableResult mutating func open(_ path: String) -> Bool { + if isLoaded { return false } + let oldPath = filePath + filePath = nil + if FileManager.default.fileExists(atPath: path) { + do { + guard let fileHandle = FileHandle(forReadingAtPath: path) else { + throw vChewingLM.FileErrors.fileHandleError("") + } + let lineReader = try LineReader(file: fileHandle) + var theMaxKeyLength = 1 + var loadingKeys = false + var loadingQuickSets = false { + willSet { + supplyQuickResults = true + if !newValue, quickDefMap.keys.contains(wildcardKey) { wildcardKey = "" } } - let lineReader = try LineReader(file: fileHandle) - var theMaxKeyLength = 1 - var loadingKeys = false - var loadingQuickSets = false - var loadingCharDefinitions = false - var loadingSymbolDefinitions = false - var loadingOctagramData = false - var keysUsedInCharDef: Set = .init() - for strLine in lineReader { - if strLine.starts(with: "%keyname") { - if !loadingKeys, strLine.contains("begin") { loadingKeys = true } - if loadingKeys, strLine.contains("end") { loadingKeys = false } - } + } + var loadingCharDefinitions = false { + willSet { + if !newValue, charDefMap.keys.contains(wildcardKey) { wildcardKey = "" } + } + } + var loadingSymbolDefinitions = false { + willSet { + if !newValue, symbolDefMap.keys.contains(wildcardKey) { wildcardKey = "" } + } + } + var loadingOctagramData = false + var keysUsedInCharDef: Set = .init() + + for strLine in lineReader { + let isTabDelimiting = strLine.contains("\t") + let cells = isTabDelimiting ? strLine.split(separator: "\t") : strLine.split(separator: " ") + guard cells.count >= 1 else { continue } + let strFirstCell = cells[0].trimmingCharacters(in: .newlines) + let strSecondCell = cells.count >= 2 ? cells[1].trimmingCharacters(in: .newlines) : nil + // 處理雜項資訊 + if strLine.first == "%", strFirstCell != "%" { // %flag_disp_partial_match if strLine == "%flag_disp_partial_match" { supplyPartiallyMatchedResults = true supplyQuickResults = true } - // %quick - if strLine.starts(with: "%quick") { - supplyQuickResults = true - if !loadingQuickSets, strLine.contains("begin") { - loadingQuickSets = true - } - if loadingQuickSets, strLine.contains("end") { - loadingQuickSets = false - if quickDefMap.keys.contains(wildcardKey) { wildcardKey = "" } - } - } - // %chardef - if strLine.starts(with: "%chardef") { - if !loadingCharDefinitions, strLine.contains("begin") { - loadingCharDefinitions = true - } - if loadingCharDefinitions, strLine.contains("end") { - loadingCharDefinitions = false - if charDefMap.keys.contains(wildcardKey) { wildcardKey = "" } - } - } - // %symboldef - if strLine.starts(with: "%symboldef") { - if !loadingSymbolDefinitions, strLine.contains("begin") { - loadingSymbolDefinitions = true - } - if loadingSymbolDefinitions, strLine.contains("end") { - loadingSymbolDefinitions = false - if symbolDefMap.keys.contains(wildcardKey) { wildcardKey = "" } - } - } - // %octagram - if strLine.starts(with: "%octagram") { - if !loadingOctagramData, strLine.contains("begin") { - loadingOctagramData = true - } - if loadingOctagramData, strLine.contains("end") { - loadingOctagramData = false - } - } - // Start data parsing. - let cells: [String.SubSequence] = - strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ") - guard cells.count >= 2 else { continue } - let strFirstCell = cells[0].trimmingCharacters(in: .newlines) - let strSecondCell = cells[1].trimmingCharacters(in: .newlines) - if loadingKeys, !cells[0].starts(with: "%keyname") { - keyNameMap[strFirstCell] = cells[1].trimmingCharacters(in: .newlines) - } else if loadingQuickSets, !strLine.starts(with: "%quick") { - theMaxKeyLength = max(theMaxKeyLength, cells[0].count) - quickDefMap[strFirstCell, default: .init()].append(strSecondCell) - } else if loadingCharDefinitions, !loadingSymbolDefinitions, - !strLine.starts(with: "%chardef"), !strLine.starts(with: "%symboldef") - { - theMaxKeyLength = max(theMaxKeyLength, cells[0].count) - charDefMap[strFirstCell, default: []].append(strSecondCell) - if strFirstCell.count > 1 { - strFirstCell.map(\.description).forEach { keyChar in - keysUsedInCharDef.insert(keyChar.description) - } - } - reverseLookupMap[strSecondCell, default: []].append(strFirstCell) - var keyComps = strFirstCell.map(\.description) - while !keyComps.isEmpty { - keyComps.removeLast() - charDefWildcardMap[keyComps.joined() + wildcard, default: []].append(strSecondCell) - } - } else if loadingSymbolDefinitions, !strLine.starts(with: "%chardef"), !strLine.starts(with: "%symboldef") { - theMaxKeyLength = max(theMaxKeyLength, cells[0].count) - symbolDefMap[strFirstCell, default: []].append(strSecondCell) - reverseLookupMap[strSecondCell, default: []].append(strFirstCell) - } else if loadingOctagramData, !strLine.starts(with: "%octagram") { - guard let countValue = Int(cells[1]) else { continue } - switch cells.count { - case 2: octagramMap[strFirstCell] = countValue - case 3: octagramDividedMap[strFirstCell] = (countValue, cells[2].trimmingCharacters(in: .newlines)) - default: break - } - norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue) - } - guard !loadingKeys, !loadingQuickSets, !loadingCharDefinitions, !loadingOctagramData else { continue } - if nameENG.isEmpty, strLine.starts(with: "%ename ") { - for neta in cells[1].components(separatedBy: ";") { + guard let strSecondCell = strSecondCell else { continue } + processTags: switch strFirstCell { + case "%keyname" where strSecondCell == "begin": loadingKeys = true + case "%keyname" where strSecondCell == "end": loadingKeys = false + case "%quick" where strSecondCell == "begin": loadingQuickSets = true + case "%quick" where strSecondCell == "end": loadingQuickSets = false + case "%chardef" where strSecondCell == "begin": loadingCharDefinitions = true + case "%chardef" where strSecondCell == "end": loadingCharDefinitions = false + case "%symboldef" where strSecondCell == "begin": loadingSymbolDefinitions = true + case "%symboldef" where strSecondCell == "end": loadingSymbolDefinitions = false + case "%octagram" where strSecondCell == "begin": loadingOctagramData = true + case "%octagram" where strSecondCell == "end": loadingOctagramData = false + case "%ename" where nameENG.isEmpty: + parseSubCells: for neta in strSecondCell.components(separatedBy: ";") { let subNetaGroup = neta.components(separatedBy: ":") - if subNetaGroup.count == 2, subNetaGroup[1].contains("en") { - nameENG = String(subNetaGroup[0]) - break - } + guard subNetaGroup.count == 2, subNetaGroup[1].contains("en") else { continue } + nameENG = String(subNetaGroup[0]) + break parseSubCells } - if nameENG.isEmpty { nameENG = strSecondCell } - } - if nameIntl.isEmpty, strLine.starts(with: "%intlname ") { - nameIntl = strSecondCell.replacingOccurrences(of: "_", with: " ") - } - if nameCJK.isEmpty, strLine.starts(with: "%cname ") { nameCJK = strSecondCell } - if nameShort.isEmpty, strLine.starts(with: "%sname ") { nameShort = strSecondCell } - if nullCandidate.isEmpty, strLine.starts(with: "%nullcandidate ") { nullCandidate = strSecondCell } - if selectionKeys.isEmpty, strLine.starts(with: "%selkey ") { - selectionKeys = cells[1].map(\.description).deduplicated.joined() - } - if endKeys.isEmpty, strLine.starts(with: "%endkey ") { - endKeys = cells[1].map(\.description).deduplicated - } - if wildcardKey.isEmpty, strLine.starts(with: "%wildcardkey ") { - wildcardKey = cells[1].first?.description ?? "" - } - if keysToDirectlyCommit.isEmpty, strLine.starts(with: "%keys_to_directly_commit ") { - keysToDirectlyCommit = strSecondCell + guard nameENG.isEmpty else { break processTags } + nameENG = strSecondCell + case "%intlname" where nameIntl.isEmpty: nameIntl = strSecondCell.replacingOccurrences(of: "_", with: " ") + case "%cname" where nameCJK.isEmpty: nameCJK = strSecondCell + case "%sname" where nameShort.isEmpty: nameShort = strSecondCell + case "%nullcandidate" where nullCandidate.isEmpty: nullCandidate = strSecondCell + case "%selkey" where selectionKeys.isEmpty: selectionKeys = strSecondCell.map(\.description).deduplicated.joined() + case "%endkey" where endKeys.isEmpty: endKeys = strSecondCell.map(\.description).deduplicated + case "%wildcardkey" where wildcardKey.isEmpty: wildcardKey = strSecondCell.first?.description ?? "" + case "%keys_to_directly_commit" where keysToDirectlyCommit.isEmpty: keysToDirectlyCommit = strSecondCell + default: break processTags } + continue } - // Post process. - if CandidateKey.validate(keys: selectionKeys) != nil { selectionKeys = "1234567890" } - if !keysUsedInCharDef.intersection(selectionKeys.map(\.description)).isEmpty { - areCandidateKeysShiftHeld = true + + // 處理普通資料 + guard let strSecondCell = strSecondCell else { continue } + if loadingKeys { + keyNameMap[strFirstCell] = strSecondCell.trimmingCharacters(in: .newlines) + } else if loadingQuickSets { + theMaxKeyLength = max(theMaxKeyLength, cells[0].count) + quickDefMap[strFirstCell, default: .init()].append(strSecondCell) + } else if loadingCharDefinitions, !loadingSymbolDefinitions { + theMaxKeyLength = max(theMaxKeyLength, cells[0].count) + charDefMap[strFirstCell, default: []].append(strSecondCell) + if strFirstCell.count > 1 { + strFirstCell.map(\.description).forEach { keyChar in + keysUsedInCharDef.insert(keyChar.description) + } + } + reverseLookupMap[strSecondCell, default: []].append(strFirstCell) + var keyComps = strFirstCell.map(\.description) + while !keyComps.isEmpty { + keyComps.removeLast() + charDefWildcardMap[keyComps.joined() + wildcard, default: []].append(strSecondCell) + } + } else if loadingSymbolDefinitions { + theMaxKeyLength = max(theMaxKeyLength, cells[0].count) + symbolDefMap[strFirstCell, default: []].append(strSecondCell) + reverseLookupMap[strSecondCell, default: []].append(strFirstCell) + } else if loadingOctagramData { + guard let countValue = Int(strSecondCell) else { continue } + switch cells.count { + case 2: octagramMap[strFirstCell] = countValue + case 3: octagramDividedMap[strFirstCell] = (countValue, cells[2].trimmingCharacters(in: .newlines)) + default: break + } + norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue) } - maxKeyLength = theMaxKeyLength - keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? "?" - filePath = path - return true - } catch { - vCLog("CIN Loading Failed: File Access Error.") } + // Post process. + if CandidateKey.validate(keys: selectionKeys) != nil { selectionKeys = "1234567890" } + if !keysUsedInCharDef.intersection(selectionKeys.map(\.description)).isEmpty { + areCandidateKeysShiftHeld = true + } + maxKeyLength = theMaxKeyLength + keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? "?" + filePath = path + return true + } catch { + vCLog("CIN Loading Failed: File Access Error.") + } + } else { + vCLog("CIN Loading Failed: File Missing.") + } + filePath = oldPath + return false + } + + mutating func clear() { + self = .init() + } + + func quickSetsFor(key: String) -> String? { + guard !key.isEmpty else { return nil } + var result = [String]() + if let specifiedResult = quickDefMap[key], !specifiedResult.isEmpty { + result.append(contentsOf: specifiedResult.map(\.description)) + } + if supplyQuickResults, result.isEmpty { + if supplyPartiallyMatchedResults { + let fetched = charDefMap.compactMap { + $0.key.starts(with: key) ? $0 : nil + }.stableSort { + $0.key.count < $1.key.count + }.flatMap(\.value).filter { + $0.count == 1 + } + result.append(contentsOf: fetched.deduplicated.prefix(selectionKeys.count * 6)) } else { - vCLog("CIN Loading Failed: File Missing.") + let fetched = (charDefMap[key] ?? [String]()).filter { $0.count == 1 } + result.append(contentsOf: fetched.deduplicated.prefix(selectionKeys.count * 6)) } - filePath = oldPath - return false } + return result.isEmpty ? nil : result.joined(separator: "\t") + } - public mutating func clear() { - filePath = nil - nullCandidate.removeAll() - keyNameMap.removeAll() - quickDefMap.removeAll() - charDefMap.removeAll() - charDefWildcardMap.removeAll() - nameShort.removeAll() - nameENG.removeAll() - nameCJK.removeAll() - selectionKeys.removeAll() - endKeys.removeAll() - reverseLookupMap.removeAll() - octagramMap.removeAll() - octagramDividedMap.removeAll() - wildcardKey.removeAll() - nameIntl.removeAll() - maxKeyLength = 1 - norm = 0 + /// 根據給定的字根索引鍵,來獲取資料庫辭典內的對應結果。 + /// - parameters: + /// - key: 讀音索引鍵。 + func unigramsFor(key: String) -> [Megrez.Unigram] { + let arrRaw = charDefMap[key]?.deduplicated ?? [] + var arrRawWildcard: [String] = [] + if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated, + key.contains(wildcard), key.first?.description != wildcard + { + arrRawWildcard.append(contentsOf: arrRawWildcardValues) } - - public func quickSetsFor(key: String) -> String? { - guard !key.isEmpty else { return nil } - var result = [String]() - if let specifiedResult = quickDefMap[key], !specifiedResult.isEmpty { - result.append(contentsOf: specifiedResult.map(\.description)) - } - if supplyQuickResults, result.isEmpty { - if supplyPartiallyMatchedResults { - let fetched = charDefMap.compactMap { - $0.key.starts(with: key) ? $0 : nil - }.stableSort { - $0.key.count < $1.key.count - }.flatMap(\.value).filter { - $0.count == 1 - } - result.append(contentsOf: fetched.deduplicated.prefix(selectionKeys.count * 6)) - } else { - let fetched = (charDefMap[key] ?? [String]()).filter { $0.count == 1 } - result.append(contentsOf: fetched.deduplicated.prefix(selectionKeys.count * 6)) + var arrResults = [Megrez.Unigram]() + var lowestScore: Double = 0 + for neta in arrRaw { + let theScore: Double = { + if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 { + return calculateWeight(count: freqDataPair.0, phraseLength: neta.count) + } else if let freqData = octagramMap[neta] { + return calculateWeight(count: freqData, phraseLength: neta.count) } - } - return result.isEmpty ? nil : result.joined(separator: "\t") + return Double(arrResults.count) * -0.001 - 9.5 + }() + lowestScore = min(theScore, lowestScore) + arrResults.append(.init(value: neta, score: theScore)) } - - /// 根據給定的字根索引鍵,來獲取資料庫辭典內的對應結果。 - /// - parameters: - /// - key: 讀音索引鍵。 - public func unigramsFor(key: String) -> [Megrez.Unigram] { - let arrRaw = charDefMap[key]?.deduplicated ?? [] - var arrRawWildcard: [String] = [] - if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated, - key.contains(wildcard), key.first?.description != wildcard - { - arrRawWildcard.append(contentsOf: arrRawWildcardValues) - } - var arrResults = [Megrez.Unigram]() - var lowestScore: Double = 0 - for neta in arrRaw { - let theScore: Double = { + lowestScore = min(-9.5, lowestScore) + if !arrRawWildcard.isEmpty { + for neta in arrRawWildcard { + var theScore: Double = { if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 { return calculateWeight(count: freqDataPair.0, phraseLength: neta.count) } else if let freqData = octagramMap[neta] { return calculateWeight(count: freqData, phraseLength: neta.count) } - return Double(arrResults.count) * -0.001 - 9.5 + return Double(arrResults.count) * -0.001 - 9.7 }() - lowestScore = min(theScore, lowestScore) + theScore += lowestScore arrResults.append(.init(value: neta, score: theScore)) } - lowestScore = min(-9.5, lowestScore) - if !arrRawWildcard.isEmpty { - for neta in arrRawWildcard { - var theScore: Double = { - if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 { - return calculateWeight(count: freqDataPair.0, phraseLength: neta.count) - } else if let freqData = octagramMap[neta] { - return calculateWeight(count: freqData, phraseLength: neta.count) - } - return Double(arrResults.count) * -0.001 - 9.7 - }() - theScore += lowestScore - arrResults.append(.init(value: neta, score: theScore)) - } - } - return arrResults } + return arrResults + } - /// 根據給定的讀音索引鍵來確認資料庫辭典內是否存在對應的資料。 - /// - parameters: - /// - key: 讀音索引鍵。 - public func hasUnigramsFor(key: String) -> Bool { - charDefMap[key] != nil - || (charDefWildcardMap[key] != nil && key.contains(wildcard) && key.first?.description != wildcard) - } - - // MARK: - Private Functions. - - private func calculateWeight(count theCount: Int, phraseLength: Int) -> Double { - var weight: Double = 0 - switch theCount { - case -2: // 拗音假名 - weight = -13 - case -1: // 單個假名 - weight = -13 - case 0: // 墊底低頻漢字與詞語 - weight = log10( - Self.fscale ** (Double(phraseLength) / 3.0 - 1.0) * 0.25 / norm) - default: - weight = log10( - Self.fscale ** (Double(phraseLength) / 3.0 - 1.0) - * Double(theCount) / norm - ) - } - return weight + /// 根據給定的讀音索引鍵來確認資料庫辭典內是否存在對應的資料。 + /// - parameters: + /// - key: 讀音索引鍵。 + func hasUnigramsFor(key: String) -> Bool { + charDefMap[key] != nil + || (charDefWildcardMap[key] != nil && key.contains(wildcard) && key.first?.description != wildcard) + } + + // MARK: - Private Functions. + + private func calculateWeight(count theCount: Int, phraseLength: Int) -> Double { + var weight: Double = 0 + switch theCount { + case -2: // 拗音假名 + weight = -13 + case -1: // 單個假名 + weight = -13 + case 0: // 墊底低頻漢字與詞語 + weight = log10( + Self.fscale ** (Double(phraseLength) / 3.0 - 1.0) * 0.25 / norm) + default: + weight = log10( + Self.fscale ** (Double(phraseLength) / 3.0 - 1.0) + * Double(theCount) / norm + ) } + return weight } } diff --git a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift index a7c1eee7..5ce72080 100644 --- a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift +++ b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift @@ -47,7 +47,7 @@ final class LMCassetteTests: XCTestCase { NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)") XCTAssertFalse(lmCassette.quickDefMap.isEmpty) print(lmCassette.quickSetsFor(key: ",.") ?? "") - XCTAssertEqual(lmCassette.keyNameMap.count, 41) + XCTAssertEqual(lmCassette.keyNameMap.count, 31) XCTAssertEqual(lmCassette.charDefMap.count, 29491) XCTAssertEqual(lmCassette.charDefWildcardMap.count, 11946) XCTAssertEqual(lmCassette.octagramMap.count, 0) diff --git a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift index 225d8079..d907d282 100644 --- a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift +++ b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift @@ -23,9 +23,9 @@ final class LMUserOverrideTests: XCTestCase { func testUOM_1_BasicOps() throws { let uom = vChewingLM.LMUserOverride(capacity: capacity, decayConstant: Double(halfLife), dataURL: nullURL) - let key = "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)" - let headReading = "ㄋㄧㄢˊ-ㄓㄨㄥ" - let expectedSuggestion = "年終" + let key = "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)" + let headReading = "ㄍㄡˇ" + let expectedSuggestion = "狗" observe(who: uom, key: key, candidate: expectedSuggestion, timestamp: nowTimeStamp) var suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp, headReading: headReading) XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", expectedSuggestion) @@ -46,10 +46,10 @@ final class LMUserOverrideTests: XCTestCase { func testUOM_2_NewestAgainstRepeatedlyUsed() throws { let uom = vChewingLM.LMUserOverride(capacity: capacity, decayConstant: Double(halfLife), dataURL: nullURL) - let key = "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)" - let headReading = "ㄋㄧㄢˊ-ㄓㄨㄥ" - let valRepeatedlyUsed = "年終" // 更常用 - let valNewest = "年中" // 最近偶爾用了一次 + let key = "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)" + let headReading = "ㄍㄡˇ" + let valRepeatedlyUsed = "狗" // 更常用 + let valNewest = "苟" // 最近偶爾用了一次 let stamps: [Double] = [0, 0.5, 2, 2.5, 4, 4.5, 5.3].map { nowTimeStamp + halfLife * $0 } stamps.forEach { stamp in observe(who: uom, key: key, candidate: valRepeatedlyUsed, timestamp: stamp) @@ -62,8 +62,6 @@ final class LMUserOverrideTests: XCTestCase { } // 試試看偶爾選了不常用的詞的話、是否會影響上文所生成的有一定強效的記憶。 observe(who: uom, key: key, candidate: valNewest, timestamp: nowTimeStamp + halfLife * 23.4) - suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 23.6, headReading: headReading) - XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", valNewest) suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 26, headReading: headReading) XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", valNewest) suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 50, headReading: headReading) @@ -72,9 +70,9 @@ final class LMUserOverrideTests: XCTestCase { } func testUOM_3_LRUTable() throws { - let a = (key: "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)", value: "年終", head: "ㄋㄧㄢˊ-ㄓㄨㄥ") - let b = (key: "((ㄑㄧˋ-ㄧㄝˋ,企業),(ㄉㄜ˙,的),ㄐㄧㄤˇ-ㄐㄧㄣ)", value: "獎金", head: "ㄐㄧㄤˇ-ㄐㄧㄣ") - let c = (key: "((ㄒㄩㄝˊ-ㄕㄥ,學生),(ㄉㄜ˙,的),ㄈㄨˊ-ㄌㄧˋ)", value: "福利", head: "ㄈㄨˊ-ㄌㄧˋ") + let a = (key: "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)", value: "狗", head: "ㄍㄡˇ") + let b = (key: "((ㄆㄞˋ-ㄇㄥˊ,派蒙),(ㄉㄜ˙,的),ㄐㄧㄤˇ-ㄐㄧㄣ)", value: "伙食費", head: "ㄏㄨㄛˇ-ㄕˊ-ㄈㄟˋ") + let c = (key: "((ㄍㄨㄛˊ-ㄅㄥ,國崩),(ㄉㄜ˙,的),ㄇㄠˋ-ㄗ˙)", value: "帽子", head: "ㄇㄠˋ-ㄗ˙") let d = (key: "((ㄌㄟˊ-ㄉㄧㄢˋ-ㄐㄧㄤ-ㄐㄩㄣ,雷電將軍),(ㄉㄜ˙,的),ㄐㄧㄠˇ-ㄔㄡˋ)", value: "腳臭", head: "ㄐㄧㄠˇ-ㄔㄡˋ") let uom = vChewingLM.LMUserOverride(capacity: 2, decayConstant: Double(halfLife), dataURL: nullURL) observe(who: uom, key: a.key, candidate: a.value, timestamp: nowTimeStamp) diff --git a/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2 b/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2 index 98982481..7e7bbb1e 100644 --- a/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2 +++ b/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2 @@ -16,47 +16,38 @@ %phase_auto_skip_endkey %flag_disp_full_match %flag_disp_partial_match +%keys_to_directly_commit !@#$%^&*()-_=+[{]}\|:'"<>? %keyname begin -a 1- -b 5v -c 3v -d 3- -e 3^ -f 4- -g 5- -h 6- -i 8^ -j 7- -k 8- -l 9- -m 7v -n 6v -o 9^ -p 0^ -q 1^ -r 4^ -s 2- -t 5^ -u 7^ -v 4v -w 2^ -x 2v -y 6^ -z 1v -. 9v -/ 0v -; 0- -, 8v -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -7 7 -8 8 -9 9 -0 0 +a 1- +b 5v +c 3v +d 3- +e 3^ +f 4- +g 5- +h 6- +i 8^ +j 7- +k 8- +l 9- +m 7v +n 6v +o 9^ +p 0^ +q 1^ +r 4^ +s 2- +t 5^ +u 7^ +v 4v +w 2^ +x 2v +y 6^ +z 1v +. 9v +/ 0v +; 0- +, 8v %keyname end %quick begin , ,火米精燈料鄰勞類營