From e0532738154ed1a546eb66ca4db29d408becb156 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Mon, 8 Aug 2022 09:39:22 +0800 Subject: [PATCH] UOM // Implementing new function sets from upstream. - Keeping previous methods for generating keys. --- .../ControllerModules/KeyHandler_Core.swift | 79 ++-- .../KeyHandler_HandleComposition.swift | 2 +- .../SubLMs/lmUserOverride.swift | 392 ++++++++++-------- 3 files changed, 269 insertions(+), 204 deletions(-) diff --git a/Source/Modules/ControllerModules/KeyHandler_Core.swift b/Source/Modules/ControllerModules/KeyHandler_Core.swift index 621853be..85f98a89 100644 --- a/Source/Modules/ControllerModules/KeyHandler_Core.swift +++ b/Source/Modules/ControllerModules/KeyHandler_Core.swift @@ -126,31 +126,26 @@ public class KeyHandler { return arrResult } - /// 在組字器內,以給定之候選字字串、來試圖在給定游標位置所在之處指定選字處理過程。 + /// 在組字器內,以給定之候選字(詞音配對)、來試圖在給定游標位置所在之處指定選字處理過程。 /// 然後再將對應的節錨內的節點標記為「已經手動選字過」。 /// - Parameters: - /// - value: 給定之候選字字串。 + /// - value: 給定之候選字(詞音配對)。 /// - respectCursorPushing: 若該選項為 true,則會在選字之後始終將游標推送至選字後的節錨的前方。 func fixNode(candidate: (String, String), respectCursorPushing: Bool = true) { let actualCursor = actualCandidateCursor let theCandidate: Megrez.Compositor.Candidate = .init(key: candidate.0, value: candidate.1) - if !compositor.overrideCandidate(theCandidate, at: actualCursor) { return } + if !compositor.overrideCandidate(theCandidate, at: actualCursor, overrideType: .withHighScore) { return } + let previousWalk = compositor.walkedNodes // 開始爬軌。 walk() + let currentWalk = compositor.walkedNodes // 在可行的情況下更新使用者半衰記憶模組。 var accumulatedCursor = 0 - var currentNode: Megrez.Compositor.Node? - for node in compositor.walkedNodes { - accumulatedCursor += node.spanLength - if accumulatedCursor > actualCursor { - currentNode = node - break - } - } + let currentNode = currentWalk.findNode(at: actualCandidateCursor, target: &accumulatedCursor) guard let currentNode = currentNode else { return } - if currentNode.currentUnigram.score > -12 { + if currentNode.currentUnigram.score > -12, mgrPrefs.fetchSuggestionsFromUserOverrideModel { IME.prtDebugIntel("UOM: Start Observation.") // 這個過程可能會因為使用者半衰記憶模組內部資料錯亂、而導致輸入法在選字時崩潰。 // 於是在這裡引入災後狀況察覺專用變數,且先開啟該開關。順利執行完觀察後會關閉。 @@ -158,9 +153,9 @@ public class KeyHandler { mgrPrefs.failureFlagForUOMObservation = true // 令半衰記憶模組觀測給定的三元圖。 // 這個過程會讓半衰引擎根據當前上下文生成三元圖索引鍵。 - currentUOM.observe( - walkedNodes: compositor.walkedNodes, cursorIndex: actualCursor, candidate: theCandidate.value, - timestamp: NSDate().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() } + currentUOM.performObservation( + walkedBefore: previousWalk, walkedAfter: currentWalk, cursor: actualCandidateCursor, + timestamp: Date().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() } ) // 如果沒有出現崩框的話,那就將這個開關復位。 mgrPrefs.failureFlagForUOMObservation = false @@ -196,7 +191,7 @@ public class KeyHandler { return arrCandidates.map { ($0.key, $0.value) } } - let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestedCandidates() + let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestionsFromUOM(apply: false) let arrSuggestedCandidates: [Megrez.Compositor.Candidate] = arrSuggestedUnigrams.map { Megrez.Compositor.Candidate(key: $0.0, value: $0.1.value) } @@ -206,32 +201,40 @@ public class KeyHandler { return arrCandidates.map { ($0.key, $0.value) } } - /// 向半衰引擎詢問可能的選字建議。拿到的結果會是一個單元圖陣列,會自動按權重排序。 - func fetchSuggestedCandidates() -> [(String, Megrez.Unigram)] { - currentUOM.suggest( - walkedNodes: compositor.walkedNodes, cursorIndex: compositor.cursor, - timestamp: NSDate().timeIntervalSince1970 - ).stableSort { $0.1.score > $1.1.score } - } - /// 向半衰引擎詢問可能的選字建議、且套用給組字器內的當前游標位置。 - func fetchAndApplySuggestionsFromUserOverrideModel() { + @discardableResult func fetchSuggestionsFromUOM(apply: Bool) -> [(String, Megrez.Unigram)] { + var arrResult = [(String, Megrez.Unigram)]() /// 如果逐字選字模式有啟用的話,直接放棄執行這個函式。 - if mgrPrefs.useSCPCTypingMode { return } + if mgrPrefs.useSCPCTypingMode { return arrResult } /// 如果這個開關沒打開的話,直接放棄執行這個函式。 - if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return } - /// 先就當前上下文讓半衰引擎重新生成三元圖索引鍵。 - let overrideValue = fetchSuggestedCandidates().first?.1.value ?? "" - - /// 再拿著索引鍵去問半衰模組有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。 - if !overrideValue.isEmpty { - IME.prtDebugIntel( - "UOM: Suggestion retrieved, overriding the node score of the selected candidate.") - // TODO: 這裡回頭改成用詞音配對來覆寫的形式。 - compositor.overrideCandidateLiteral(overrideValue, at: actualCandidateCursor, overrideType: .withTopUnigramScore) - } else { - IME.prtDebugIntel("UOM: Blank suggestion retrieved, dismissing.") + if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return arrResult } + /// 獲取來自半衰記憶模組的建議結果 + let suggestion = currentUOM.fetchSuggestion( + currentWalk: compositor.walkedNodes, cursor: actualCandidateCursor, timestamp: Date().timeIntervalSince1970 + ) + arrResult.append(contentsOf: suggestion.candidates) + if apply { + /// 再看有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。 + if !suggestion.isEmpty, let newestSuggestedCandidate = suggestion.candidates.last { + let overrideBehavior: Megrez.Compositor.Node.OverrideType = + suggestion.forceHighScoreOverride ? .withHighScore : .withTopUnigramScore + let suggestedPair: Megrez.Compositor.Candidate = .init( + key: newestSuggestedCandidate.0, value: newestSuggestedCandidate.1.value + ) + IME.prtDebugIntel( + "UOM: Suggestion retrieved, overriding the node score of the selected candidate: \(suggestedPair.toNGramKey)") + if !compositor.overrideCandidate(suggestedPair, at: actualCandidateCursor, overrideType: overrideBehavior) { + compositor.overrideCandidateLiteral( + newestSuggestedCandidate.1.value, at: actualCandidateCursor, overrideType: overrideBehavior + ) + } + walk() + } else { + IME.prtDebugIntel("UOM: Blank suggestion retrieved, dismissing.") + } } + arrResult = arrResult.stableSort { $0.1.score > $1.1.score } + return arrResult } // MARK: - Extracted methods and functions (Tekkon). diff --git a/Source/Modules/ControllerModules/KeyHandler_HandleComposition.swift b/Source/Modules/ControllerModules/KeyHandler_HandleComposition.swift index 43d58638..eb57165b 100644 --- a/Source/Modules/ControllerModules/KeyHandler_HandleComposition.swift +++ b/Source/Modules/ControllerModules/KeyHandler_HandleComposition.swift @@ -87,7 +87,7 @@ extension KeyHandler { walk() // 看看半衰記憶模組是否會對目前的狀態給出自動選字建議。 - fetchAndApplySuggestionsFromUserOverrideModel() + fetchSuggestionsFromUOM(apply: true) // 之後就是更新組字區了。先清空注拼槽的內容。 composer.clear() diff --git a/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift b/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift index eb731d14..3a6da698 100644 --- a/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift +++ b/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift @@ -26,174 +26,46 @@ extension vChewing { mutDecayExponent = log(0.5) / decayConstant } - public func observe( - walkedNodes: [Megrez.Compositor.Node], - cursorIndex: Int, - candidate: String, - timestamp: Double, - saveCallback: @escaping () -> Void + public func performObservation( + walkedBefore: [Megrez.Compositor.Node], walkedAfter: [Megrez.Compositor.Node], + cursor: Int, timestamp: Double, saveCallback: @escaping () -> Void ) { - let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex) + // 參數合規性檢查。 + guard !walkedAfter.isEmpty, !walkedBefore.isEmpty else { return } + guard walkedBefore.totalReadingsCount == walkedAfter.totalReadingsCount else { return } + // 先判斷用哪種覆寫方法。 + var actualCursor = 0 + guard let currentNode = walkedAfter.findNode(at: cursor, target: &actualCursor) else { return } + // 當前節點超過三個字的話,就不記憶了。在這種情形下,使用者可以考慮新增自訂語彙。 + guard currentNode.spanLength <= 3 else { return } + // 前一個節點得從前一次爬軌結果當中來找。 + guard actualCursor > 0 else { return } // 該情況應該不會出現。 + let currentNodeIndex = actualCursor + actualCursor -= 1 + var prevNodeIndex = 0 + guard let prevNode = walkedBefore.findNode(at: actualCursor, target: &prevNodeIndex) else { return } + + let forceHighScoreOverride: Bool = currentNode.spanLength > prevNode.spanLength + let breakingUp = currentNode.spanLength == 1 && prevNode.spanLength > 1 + + let targetNodeIndex = breakingUp ? currentNodeIndex : prevNodeIndex + let key: String = vChewing.LMUserOverride.formObservationKey( + walkedNodes: walkedAfter, headIndex: targetNodeIndex + ) guard !key.isEmpty else { return } - - guard mutLRUMap[key] != nil else { - var observation: Observation = .init() - observation.update(candidate: candidate, timestamp: timestamp) - let koPair = KeyObservationPair(key: key, observation: observation) - // 先移除 key 再設定 key 的話,就可以影響這個 key 在辭典內的順位。 - // Swift 原生的辭典是沒有數字索引排序的,但資料的插入順序卻有保存著。 - mutLRUMap.removeValue(forKey: key) - mutLRUMap[key] = koPair - mutLRUList.insert(koPair, at: 0) - - if mutLRUList.count > mutCapacity { - mutLRUMap.removeValue(forKey: mutLRUList[mutLRUList.endIndex].key) - mutLRUList.removeLast() - } - IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)") - saveCallback() - return - } - // 降低磁碟寫入次數。唯有失憶的情況下才會更新觀察且記憶。 - if var theNeta = mutLRUMap[key] { - _ = suggest( - walkedNodes: walkedNodes, cursorIndex: cursorIndex, timestamp: timestamp, - decayCallback: { - theNeta.observation.update(candidate: candidate, timestamp: timestamp) - self.mutLRUList.insert(theNeta, at: 0) - self.mutLRUMap[key] = theNeta - IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)") - saveCallback() - } - ) - } + doObservation( + key: key, candidate: currentNode.currentUnigram.value, timestamp: timestamp, + forceHighScoreOverride: forceHighScoreOverride, saveCallback: { saveCallback() } + ) } - public func suggest( - walkedNodes: [Megrez.Compositor.Node], - cursorIndex: Int, - timestamp: Double, - decayCallback: @escaping () -> Void = {} - ) -> [(String, Megrez.Unigram)] { - let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex) - guard !key.isEmpty else { - IME.prtDebugIntel("UOM: Blank key generated on suggestion, aborting suggestion.") - return .init() - } - let currentReadingKey = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex, readingOnly: true) - guard let koPair = mutLRUMap[key] else { - IME.prtDebugIntel("UOM: mutLRUMap[key] is nil, throwing blank suggestion for key: \(key).") - return .init() - } - - let observation = koPair.observation - - var arrResults = [(String, Megrez.Unigram)]() - var currentHighScore = 0.0 - for overrideNeta in Array(observation.overrides) { - let override: Override = overrideNeta.value - - let overrideScore: Double = getScore( - eventCount: override.count, - totalCount: observation.count, - eventTimestamp: override.timestamp, - timestamp: timestamp, - lambda: mutDecayExponent - ) - if (0...currentHighScore).contains(overrideScore) { continue } - - let overrideDetectionScore: Double = getScore( - eventCount: override.count, - totalCount: observation.count, - eventTimestamp: override.timestamp, - timestamp: timestamp, - lambda: mutDecayExponent * 2 - ) - if (0...currentHighScore).contains(overrideDetectionScore) { decayCallback() } - - let newUnigram = Megrez.Unigram(value: overrideNeta.key, score: overrideScore) - arrResults.insert((currentReadingKey, newUnigram), at: 0) - currentHighScore = overrideScore - } - if arrResults.isEmpty { - IME.prtDebugIntel("UOM: No usable suggestions in the result for key: \(key).") - } - return arrResults - } - - private func getScore( - eventCount: Int, - totalCount: Int, - eventTimestamp: Double, - timestamp: Double, - lambda: Double - ) -> Double { - let decay = exp((timestamp - eventTimestamp) * lambda) - if decay < kDecayThreshold { return 0.0 } - let prob = Double(eventCount) / Double(totalCount) - return prob * decay - } - - func convertKeyFrom( - walkedNodes: [Megrez.Compositor.Node], cursorIndex: Int, readingOnly: Bool = false - ) -> String { - let whiteList = "你他妳她祢衪它牠再在" - var arrNodes: [Megrez.Compositor.Node] = [] - var intLength = 0 - for theNodeAnchor in walkedNodes { - arrNodes.append(theNodeAnchor) - intLength += theNodeAnchor.spanLength - if intLength >= cursorIndex { - break - } - } - - if arrNodes.isEmpty { return "" } - - arrNodes = Array(arrNodes.reversed()) - - let kvCurrent = arrNodes[0].currentPair - guard !kvCurrent.key.contains("_") else { - return "" - } - - // 字音數與字數不一致的內容會被拋棄。 - if kvCurrent.key.split(separator: "-").count != kvCurrent.value.count { return "" } - - // 前置單元只記錄讀音,在其後的單元則同時記錄讀音與字詞 - let strCurrent = kvCurrent.key - var kvPrevious = Megrez.KeyValuePaired() - var kvAnterior = Megrez.KeyValuePaired() - var readingStack = "" - var trigramKey: String { "(\(kvAnterior.toNGramKey),\(kvPrevious.toNGramKey),\(strCurrent))" } - var result: String { - // 不要把單個漢字的 kvCurrent 當前鍵值領頭的單元圖記入資料庫,不然對敲字體驗破壞太大。 - if readingStack.contains("_") - || (!kvPrevious.isValid && kvCurrent.value.count == 1 && !whiteList.contains(kvCurrent.value)) - { - return "" - } else { - return (readingOnly ? strCurrent : trigramKey) - } - } - - if arrNodes.count >= 2, - !kvPrevious.key.contains("_"), - kvPrevious.key.split(separator: "-").count == kvPrevious.value.count - { - kvPrevious = arrNodes[1].currentPair - readingStack = kvPrevious.key + readingStack - } - - if arrNodes.count >= 3, - !kvAnterior.key.contains("_"), - kvAnterior.key.split(separator: "-").count == kvAnterior.value.count - { - kvAnterior = arrNodes[2].currentPair - readingStack = kvAnterior.key + readingStack - } - - return result + public func fetchSuggestion( + currentWalk: [Megrez.Compositor.Node], cursor: Int, timestamp: Double + ) -> Suggestion { + var headIndex = 0 + guard let nodeIter = currentWalk.findNode(at: cursor, target: &headIndex) else { return .init() } + let key = vChewing.LMUserOverride.formObservationKey(walkedNodes: currentWalk, headIndex: headIndex) + return getSuggestion(key: key, timestamp: timestamp, headReading: nodeIter.key) } } } @@ -208,6 +80,7 @@ extension vChewing.LMUserOverride { struct Override: Hashable, Encodable, Decodable { var count: Int = 0 var timestamp: Double = 0.0 + var forceHighScoreOverride = false static func == (lhs: Override, rhs: Override) -> Bool { lhs.count == rhs.count && lhs.timestamp == rhs.timestamp } @@ -242,11 +115,12 @@ extension vChewing.LMUserOverride { hasher.combine(overrides) } - mutating func update(candidate: String, timestamp: Double) { + mutating func update(candidate: String, timestamp: Double, forceHighScoreOverride: Bool = false) { count += 1 if overrides.keys.contains(candidate) { overrides[candidate]?.timestamp = timestamp overrides[candidate]?.count += 1 + overrides[candidate]?.forceHighScoreOverride = forceHighScoreOverride } else { overrides[candidate] = .init(count: 1, timestamp: timestamp) } @@ -331,4 +205,192 @@ extension vChewing.LMUserOverride { return } } + + public struct Suggestion { + var candidates = [(String, Megrez.Unigram)]() + var forceHighScoreOverride = false + var isEmpty: Bool { candidates.isEmpty } + } +} + +// MARK: - Array Extensions. + +extension Array where Element == Megrez.Compositor.Node { + public var totalReadingsCount: Int { + var counter = 0 + for node in self { + counter += node.keyArray.count + } + return counter + } + + public func findNode(at cursor: Int, target outCursorPastNode: inout Int) -> Megrez.Compositor.Node? { + guard !isEmpty else { return nil } + let cursor = Swift.max(0, Swift.min(cursor, keys.count)) + + if cursor == 0, let theFirst = first { + outCursorPastNode = theFirst.spanLength + return theFirst + } + + // 同時應對「游標在右端」與「游標離右端還差一個位置」的情形。 + if cursor >= keys.count - 1, let theLast = last { + outCursorPastNode = keys.count + return theLast + } + + var accumulated = 0 + for neta in self { + accumulated += neta.spanLength + if accumulated > cursor { + outCursorPastNode = accumulated + return neta + } + } + + // 下述情形本不應該出現。 + return nil + } +} + +// MARK: - Private Methods + +extension vChewing.LMUserOverride { + private func doObservation( + key: String, candidate: String, timestamp: Double, forceHighScoreOverride: Bool, + saveCallback: @escaping () -> Void + ) { + guard mutLRUMap[key] != nil else { + var observation: Observation = .init() + observation.update(candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride) + let koPair = KeyObservationPair(key: key, observation: observation) + // 先移除 key 再設定 key 的話,就可以影響這個 key 在辭典內的順位。 + // Swift 原生的辭典是沒有數字索引排序的,但資料的插入順序卻有保存著。 + mutLRUMap.removeValue(forKey: key) + mutLRUMap[key] = koPair + mutLRUList.insert(koPair, at: 0) + + if mutLRUList.count > mutCapacity { + mutLRUMap.removeValue(forKey: mutLRUList[mutLRUList.endIndex].key) + mutLRUList.removeLast() + } + IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)") + saveCallback() + return + } + // TODO: 降低磁碟寫入次數。唯有失憶的情況下才會更新觀察且記憶。 + if var theNeta = mutLRUMap[key] { + theNeta.observation.update( + candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride + ) + mutLRUList.insert(theNeta, at: 0) + mutLRUMap[key] = theNeta + IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)") + saveCallback() + } + } + + private func getSuggestion(key: String, timestamp: Double, headReading: String) -> Suggestion { + guard !key.isEmpty, let kvPair = mutLRUMap[key] else { return .init() } + let observation: Observation = kvPair.observation + var candidates: [(String, Megrez.Unigram)] = .init() + var forceHighScoreOverride = false + var score: Double = 0 + for (i, theObservation) in observation.overrides { + let overrideScore = getScore( + eventCount: theObservation.count, totalCount: observation.count, + eventTimestamp: theObservation.timestamp, timestamp: timestamp, lambda: mutDecayExponent + ) + if overrideScore == 0.0 { continue } + if overrideScore > score { + candidates.append((headReading, .init(value: i, score: overrideScore))) + forceHighScoreOverride = theObservation.forceHighScoreOverride + score = overrideScore + } + } + return .init(candidates: candidates, forceHighScoreOverride: forceHighScoreOverride) + } + + private func getScore( + eventCount: Int, + totalCount: Int, + eventTimestamp: Double, + timestamp: Double, + lambda: Double + ) -> Double { + let decay = exp((timestamp - eventTimestamp) * lambda) + if decay < kDecayThreshold { return 0.0 } + let prob = Double(eventCount) / Double(totalCount) + return prob * decay + } + + private static func isPunctuation(_ node: Megrez.Compositor.Node) -> Bool { + for key in node.keyArray { + guard let firstChar = key.first else { continue } + return String(firstChar) == "_" + } + return false + } + + private static func formObservationKey( + walkedNodes: [Megrez.Compositor.Node], headIndex cursorIndex: Int, readingOnly: Bool = false + ) -> String { + let whiteList = "你他妳她祢衪它牠再在" + var arrNodes: [Megrez.Compositor.Node] = [] + var intLength = 0 + for theNodeAnchor in walkedNodes { + arrNodes.append(theNodeAnchor) + intLength += theNodeAnchor.spanLength + if intLength >= cursorIndex { + break + } + } + + if arrNodes.isEmpty { return "" } + + arrNodes = Array(arrNodes.reversed()) + + let kvCurrent = arrNodes[0].currentPair + guard !kvCurrent.key.contains("_") else { + return "" + } + + // 字音數與字數不一致的內容會被拋棄。 + if kvCurrent.key.split(separator: "-").count != kvCurrent.value.count { return "" } + + // 前置單元只記錄讀音,在其後的單元則同時記錄讀音與字詞 + let strCurrent = kvCurrent.key + var kvPrevious = Megrez.KeyValuePaired() + var kvAnterior = Megrez.KeyValuePaired() + var readingStack = "" + var trigramKey: String { "(\(kvAnterior.toNGramKey),\(kvPrevious.toNGramKey),\(strCurrent))" } + var result: String { + // 不要把單個漢字的 kvCurrent 當前鍵值領頭的單元圖記入資料庫,不然對敲字體驗破壞太大。 + if readingStack.contains("_") + || (!kvPrevious.isValid && kvCurrent.value.count == 1 && !whiteList.contains(kvCurrent.value)) + { + return "" + } else { + return (readingOnly ? strCurrent : trigramKey) + } + } + + if arrNodes.count >= 2, + !kvPrevious.key.contains("_"), + kvPrevious.key.split(separator: "-").count == kvPrevious.value.count + { + kvPrevious = arrNodes[1].currentPair + readingStack = kvPrevious.key + readingStack + } + + if arrNodes.count >= 3, + !kvAnterior.key.contains("_"), + kvAnterior.key.split(separator: "-").count == kvAnterior.value.count + { + kvAnterior = arrNodes[2].currentPair + readingStack = kvAnterior.key + readingStack + } + + return result + } }