diff --git a/Source/Modules/ControllerModules/KeyHandler_Core.swift b/Source/Modules/ControllerModules/KeyHandler_Core.swift index 28c1d288..79123ef2 100644 --- a/Source/Modules/ControllerModules/KeyHandler_Core.swift +++ b/Source/Modules/ControllerModules/KeyHandler_Core.swift @@ -56,7 +56,7 @@ class KeyHandler { var compositor: Megrez.Compositor // 組字器 var currentLM: vChewing.LMInstantiator = .init() // 當前主語言模組 var currentUOM: vChewing.LMUserOverride = .init() // 當前半衰記憶模組 - var walkedAnchors: [Megrez.NodeAnchor] = [] // 用以記錄爬過的節錨的陣列 + var walkedAnchors: [Megrez.NodeAnchor] { compositor.walkedAnchors } // 用以記錄爬過的節錨的陣列 /// 委任物件 (ctlInputMethod),以便呼叫其中的函式。 var delegate: KeyHandlerDelegate? @@ -95,7 +95,6 @@ class KeyHandler { func clear() { composer.clear() compositor.clear() - walkedAnchors.removeAll() } // MARK: - Functions dealing with Megrez. @@ -103,7 +102,7 @@ class KeyHandler { /// 實際上要拿給 Megrez 使用的的滑鼠游標位址,以方便在組字器最開頭或者最末尾的時候始終能抓取候選字節點陣列。 /// /// 威注音對游標前置與游標後置模式採取的候選字節點陣列抓取方法是分離的,且不使用 Node Crossing。 - var actualCandidateCursorIndex: Int { + var actualCandidateCursor: Int { mgrPrefs.useRearCursorMode ? 
min(compositorCursorIndex, compositorLength - 1) : max(compositorCursorIndex, 1) } @@ -113,11 +112,11 @@ class KeyHandler { /// /// 該函式的爬取順序是從頭到尾。 func walk() { - walkedAnchors = compositor.walk() + compositor.walk() // 在偵錯模式開啟時,將 GraphViz 資料寫入至指定位置。 if mgrPrefs.isDebugModeEnabled { - let result = compositor.grid.dumpDOT + let result = compositor.dumpDOT do { try result.write( toFile: "/private/var/tmp/vChewing-visualization.dot", @@ -137,12 +136,10 @@ class KeyHandler { /// 估算對象範圍。用比較形象且生動卻有點噁心的解釋的話,蒼蠅一邊吃一邊屙。 var commitOverflownCompositionAndWalk: String { var textToCommit = "" - if compositor.grid.width > mgrPrefs.composingBufferSize, !walkedAnchors.isEmpty { + if compositor.width > mgrPrefs.composingBufferSize, !walkedAnchors.isEmpty { let anchor: Megrez.NodeAnchor = walkedAnchors[0] - if let theNode = anchor.node { - textToCommit = theNode.currentKeyValue.value - } - compositor.removeHeadReadings(count: anchor.spanningLength) + textToCommit = anchor.node.currentPair.value + compositor.removeHeadReadings(count: anchor.spanLength) } walk() return textToCommit @@ -166,26 +163,22 @@ class KeyHandler { /// - value: 給定之候選字字串。 /// - respectCursorPushing: 若該選項為 true,則會在選字之後始終將游標推送至選字厚的節錨的前方。 func fixNode(value: String, respectCursorPushing: Bool = true) { - let adjustedIndex = max(0, min(actualCandidateCursorIndex + (mgrPrefs.useRearCursorMode ? 1 : 0), compositorLength)) + let adjustedCursor = max(0, min(actualCandidateCursor + (mgrPrefs.useRearCursorMode ? 
1 : 0), compositorLength)) // 開始讓半衰模組觀察目前的狀況。 - let selectedNode: Megrez.NodeAnchor = compositor.grid.fixNodeSelectedCandidate( - location: adjustedIndex, value: value - ) + let selectedNode: Megrez.NodeAnchor = compositor.fixNodeSelectedCandidate(value, at: adjustedCursor) // 不要針對逐字選字模式啟用臨時半衰記憶模型。 if !mgrPrefs.useSCPCTypingMode { var addToUserOverrideModel = true // 所有讀音數與字符數不匹配的情況均不得塞入半衰記憶模組。 - if selectedNode.spanningLength != value.count { + if selectedNode.spanLength != value.count { IME.prtDebugIntel("UOM: SpanningLength != value.count, dismissing.") addToUserOverrideModel = false } if addToUserOverrideModel { - if let theNode = selectedNode.node { - // 威注音的 SymbolLM 的 Score 是 -12,符合該條件的內容不得塞入半衰記憶模組。 - if theNode.scoreFor(candidate: value) <= -12 { - IME.prtDebugIntel("UOM: Score <= -12, dismissing.") - addToUserOverrideModel = false - } + // 威注音的 SymbolLM 的 Score 是 -12,符合該條件的內容不得塞入半衰記憶模組。 + if selectedNode.node.scoreFor(candidate: value) <= -12 { + IME.prtDebugIntel("UOM: Score <= -12, dismissing.") + addToUserOverrideModel = false } } if addToUserOverrideModel { @@ -193,7 +186,7 @@ class KeyHandler { // 令半衰記憶模組觀測給定的三元圖。 // 這個過程會讓半衰引擎根據當前上下文生成三元圖索引鍵。 currentUOM.observe( - walkedAnchors: walkedAnchors, cursorIndex: adjustedIndex, candidate: value, + walkedAnchors: walkedAnchors, cursorIndex: adjustedCursor, candidate: value, timestamp: NSDate().timeIntervalSince1970 ) } @@ -206,8 +199,8 @@ class KeyHandler { if mgrPrefs.moveCursorAfterSelectingCandidate, respectCursorPushing { var nextPosition = 0 for theAnchor in walkedAnchors { - if nextPosition >= adjustedIndex { break } - nextPosition += theAnchor.spanningLength + if nextPosition >= adjustedCursor { break } + nextPosition += theAnchor.spanLength } if nextPosition <= compositorLength { compositorCursorIndex = nextPosition @@ -217,20 +210,17 @@ class KeyHandler { /// 組字器內超出最大動態爬軌範圍的節錨都會被自動標記為「已經手動選字過」,減少爬軌運算負擔。 func markNodesFixedIfNecessary() { - let width = compositor.grid.width + let width = 
compositor.width if width <= kMaxComposingBufferNeedsToWalkSize { return } var index = 0 for anchor in walkedAnchors { - guard let node = anchor.node else { break } if index >= width - kMaxComposingBufferNeedsToWalkSize { break } - if node.score < node.kSelectedCandidateScore { - compositor.grid.fixNodeSelectedCandidate( - location: index + anchor.spanningLength, value: node.currentKeyValue.value - ) + if anchor.node.score < Megrez.Node.kSelectedCandidateScore { + compositor.fixNodeSelectedCandidate(anchor.node.currentPair.value, at: index + anchor.spanLength) } - index += anchor.spanningLength + index += anchor.spanLength } } @@ -248,14 +238,11 @@ class KeyHandler { arrAnchors = arrAnchors.stableSort { $0.keyLength > $1.keyLength } // 將節錨內的候選字詞資料拓印到輸出陣列內。 - for currentNodeAnchor in arrAnchors { - guard let currentNode = currentNodeAnchor.node else { continue } - for currentCandidate in currentNode.candidates { - // 選字窗的內容的康熙轉換 / JIS 轉換不能放在這裡處理,會影響選字有效性。 - // 選字的原理是拿著具體的候選字詞的字串去當前的節錨下找出對應的候選字詞(X元圖)。 - // 一旦在這裡轉換了,節錨內的某些元圖就無法被選中。 - arrCandidates.append(currentCandidate.value) - } + for currentCandidate in arrAnchors.map(\.node.candidates).joined() { + // 選字窗的內容的康熙轉換 / JIS 轉換不能放在這裡處理,會影響選字有效性。 + // 選字的原理是拿著具體的候選字詞的字串去當前的節錨下找出對應的候選字詞(X元圖)。 + // 一旦在這裡轉換了,節錨內的某些元圖就無法被選中。 + arrCandidates.append(currentCandidate.value) } // 決定是否根據半衰記憶模組的建議來調整候選字詞的順序。 if !mgrPrefs.fetchSuggestionsFromUserOverrideModel || mgrPrefs.useSCPCTypingMode || fixOrder { @@ -291,8 +278,8 @@ class KeyHandler { if !overrideValue.isEmpty { IME.prtDebugIntel( "UOM: Suggestion retrieved, overriding the node score of the selected candidate.") - compositor.grid.overrideNodeScoreForSelectedCandidate( - location: min(actualCandidateCursorIndex + (mgrPrefs.useRearCursorMode ? 1 : 0), compositorLength), + compositor.overrideNodeScoreForSelectedCandidate( + location: min(actualCandidateCursor + (mgrPrefs.useRearCursorMode ? 
1 : 0), compositorLength), value: overrideValue, overridingScore: findHighestScore(nodeAnchors: rawAnchorsOfNodes, epsilon: kEpsilon) ) @@ -307,7 +294,7 @@ class KeyHandler { /// - epsilon: 半衰模組的衰減指數。 /// - Returns: 尋獲的最高權重數值。 func findHighestScore(nodeAnchors: [Megrez.NodeAnchor], epsilon: Double) -> Double { - return nodeAnchors.compactMap(\.node?.highestUnigramScore).max() ?? 0 + epsilon + return (nodeAnchors.map(\.node.highestUnigramScore).max() ?? 0) + epsilon } // MARK: - Extracted methods and functions (Tekkon). @@ -363,8 +350,8 @@ class KeyHandler { /// 警告:不要對游標前置風格使用 nodesCrossing,否則會導致游標行為與 macOS 內建注音輸入法不一致。 /// 微軟新注音輸入法的游標後置風格也是不允許 nodeCrossing 的。 mgrPrefs.useRearCursorMode - ? compositor.grid.nodesBeginningAt(location: actualCandidateCursorIndex) - : compositor.grid.nodesEndingAt(location: actualCandidateCursorIndex) + ? compositor.nodesBeginningAt(location: actualCandidateCursor) + : compositor.nodesEndingAt(location: actualCandidateCursor) } /// 將輸入法偏好設定同步至語言模組內。 @@ -390,7 +377,7 @@ class KeyHandler { /// 在組字器的給定游標位置內插入讀音。 func insertToCompositorAtCursor(reading: String) { - compositor.insertReadingAtCursor(reading: reading) + compositor.insertReading(reading) } /// 組字器的游標位置。 @@ -408,28 +395,27 @@ class KeyHandler { /// /// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear)。 func deleteCompositorReadingAtTheRearOfCursor() { - compositor.deleteReadingAtTheRearOfCursor() + compositor.dropReading(direction: .rear) } /// 在組字器內,朝著往文字輸入方向、砍掉一個與游標相鄰的讀音。 /// /// 在威注音的術語體系當中,「文字輸入方向」為向前(Front)。 func deleteCompositorReadingToTheFrontOfCursor() { - compositor.deleteReadingToTheFrontOfCursor() + compositor.dropReading(direction: .front) } /// 獲取指定游標位置的鍵值長度。 /// - Returns: 指定游標位置的鍵值長度。 var keyLengthAtCurrentIndex: Int { - guard let node = walkedAnchors[compositorCursorIndex].node else { return 0 } - return node.key.split(separator: "-").count + walkedAnchors[compositorCursorIndex].node.key.split(separator: "-").count } var nextPhrasePosition: Int { var nextPosition = 0 for 
theAnchor in walkedAnchors { - if nextPosition > actualCandidateCursorIndex { break } - nextPosition += theAnchor.spanningLength + if nextPosition > actualCandidateCursor { break } + nextPosition += theAnchor.spanLength } return min(nextPosition, compositorLength) } diff --git a/Source/Modules/ControllerModules/KeyHandler_States.swift b/Source/Modules/ControllerModules/KeyHandler_States.swift index affc3bc3..ae662528 100644 --- a/Source/Modules/ControllerModules/KeyHandler_States.swift +++ b/Source/Modules/ControllerModules/KeyHandler_States.swift @@ -45,15 +45,15 @@ extension KeyHandler { /// 所以在這裡必須做糾偏處理。因為在用 Swift,所以可以用「.utf16」取代「NSString.length()」。 /// 這樣就可以免除不必要的類型轉換。 for theAnchor in walkedAnchors { - guard let theNode = theAnchor.node else { continue } - let strNodeValue = theNode.currentKeyValue.value + let theNode = theAnchor.node + let strNodeValue = theNode.currentPair.value composingBuffer += strNodeValue let arrSplit: [String] = Array(strNodeValue).map { String($0) } let codepointCount = arrSplit.count /// 藉下述步驟重新將「可見游標位置」對齊至「組字器內的游標所在的讀音位置」。 /// 每個節錨(NodeAnchor)都有自身的幅位長度(spanningLength),可以用來 /// 累加、以此為依據,來校正「可見游標位置」。 - let spanningLength: Int = theAnchor.spanningLength + let spanningLength: Int = theAnchor.spanLength if readingCursorIndex + spanningLength <= compositorCursorIndex { composedStringCursorIndex += strNodeValue.utf16.count readingCursorIndex += spanningLength @@ -406,22 +406,20 @@ extension KeyHandler { var composed = "" - for theAnchor in walkedAnchors { - if let node = theAnchor.node { - var key = node.key - if mgrPrefs.inlineDumpPinyinInLieuOfZhuyin { - key = Tekkon.restoreToneOneInZhuyinKey(target: key) // 恢復陰平標記 - key = Tekkon.cnvPhonaToHanyuPinyin(target: key) // 注音轉拼音 - key = Tekkon.cnvHanyuPinyinToTextbookStyle(target: key) // 轉教科書式標調 - key = key.replacingOccurrences(of: "-", with: " ") - } else { - key = Tekkon.cnvZhuyinChainToTextbookReading(target: key, newSeparator: " ") - } - - let value = node.currentKeyValue.value - // 
不要給標點符號等特殊元素加注音 - composed += key.contains("_") ? value : "\(value)(\(key))" + for node in walkedAnchors.map(\.node) { + var key = node.key + if mgrPrefs.inlineDumpPinyinInLieuOfZhuyin { + key = Tekkon.restoreToneOneInZhuyinKey(target: key) // 恢復陰平標記 + key = Tekkon.cnvPhonaToHanyuPinyin(target: key) // 注音轉拼音 + key = Tekkon.cnvHanyuPinyinToTextbookStyle(target: key) // 轉教科書式標調 + key = key.replacingOccurrences(of: "-", with: " ") + } else { + key = Tekkon.cnvZhuyinChainToTextbookReading(target: key, newSeparator: " ") } + + let value = node.currentPair.value + // 不要給標點符號等特殊元素加注音 + composed += key.contains("_") ? value : "\(value)(\(key))" } clear() @@ -796,26 +794,21 @@ extension KeyHandler { var length = 0 var currentAnchor = Megrez.NodeAnchor() let cursorIndex = min( - actualCandidateCursorIndex + (mgrPrefs.useRearCursorMode ? 1 : 0), compositorLength + actualCandidateCursor + (mgrPrefs.useRearCursorMode ? 1 : 0), compositorLength ) for anchor in walkedAnchors { - length += anchor.spanningLength + length += anchor.spanLength if length >= cursorIndex { currentAnchor = anchor break } } - guard let currentNode = currentAnchor.node else { - IME.prtDebugIntel("4F2DEC2F") - errorCallback() - return true - } - - let currentValue = currentNode.currentKeyValue.value + let currentNode = currentAnchor.node + let currentValue = currentNode.currentPair.value var currentIndex = 0 - if currentNode.score < currentNode.kSelectedCandidateScore { + if currentNode.score < Megrez.Node.kSelectedCandidateScore { /// 只要是沒有被使用者手動選字過的(節錨下的)節點, /// 就從第一個候選字詞開始,這樣使用者在敲字時就會優先匹配 /// 那些字詞長度不小於 2 的單元圖。換言之,如果使用者敲了兩個 diff --git a/Source/Modules/LangModelRelated/LMInstantiator.swift b/Source/Modules/LangModelRelated/LMInstantiator.swift index a8f81404..d13705f4 100644 --- a/Source/Modules/LangModelRelated/LMInstantiator.swift +++ b/Source/Modules/LangModelRelated/LMInstantiator.swift @@ -28,7 +28,7 @@ import Foundation extension vChewing { /// 語言模組副本化模組(LMInstantiator,下稱「LMI」)自身為符合天權星組字引擎內 - /// 的 
LanguageModelProtocol 協定的模組、統籌且整理來自其它子模組的資料(包括使 + /// 的 LangModelProtocol 協定的模組、統籌且整理來自其它子模組的資料(包括使 /// 用者語彙、繪文字模組、語彙濾除表、原廠語言模組等)。 /// /// LMI 型別為與輸入法按鍵調度模組直接溝通之唯一語言模組。當組字器開始根據給定的 @@ -44,7 +44,7 @@ extension vChewing { /// /// LMI 會根據需要分別載入原廠語言模組和其他個別的子語言模組。LMI 本身不會記錄這些 /// 語言模組的相關資料的存放位置,僅藉由參數來讀取相關訊息。 - public class LMInstantiator: LanguageModelProtocol { + public class LMInstantiator: LangModelProtocol { // 在函式內部用以記錄狀態的開關。 public var isPhraseReplacementEnabled = false public var isCNSEnabled = false @@ -256,7 +256,7 @@ extension vChewing { lmAssociates.hasValuesFor(key: key) } - /// 該函式不起作用,僅用來滿足 LanguageModelProtocol 協定的要求。 + /// 該函式不起作用,僅用來滿足 LangModelProtocol 協定的要求。 public func bigramsForKeys(precedingKey _: String, key _: String) -> [Megrez.Bigram] { .init() } // MARK: - 核心函式(對內) diff --git a/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift b/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift index 8d50fb04..c5f3b96a 100644 --- a/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift +++ b/Source/Modules/LangModelRelated/SubLMs/lmUserOverride.swift @@ -130,13 +130,12 @@ extension vChewing { func convertKeyFrom( walkedAnchors: [Megrez.NodeAnchor], cursorIndex: Int, readingOnly: Bool = false ) -> String { - let arrEndingPunctuation = [",", "。", "!", "?", "」", "』", "”", "’"] let whiteList = "你他妳她祢衪它牠再在" var arrNodes: [Megrez.NodeAnchor] = [] var intLength = 0 for theNodeAnchor in walkedAnchors { arrNodes.append(theNodeAnchor) - intLength += theNodeAnchor.spanningLength + intLength += theNodeAnchor.spanLength if intLength >= cursorIndex { break } @@ -146,9 +145,8 @@ extension vChewing { arrNodes = Array(arrNodes.reversed()) - guard let kvCurrent = arrNodes[0].node?.currentKeyValue, - !arrEndingPunctuation.contains(kvCurrent.value) - else { + let kvCurrent = arrNodes[0].node.currentPair + guard !kvCurrent.key.contains("_") else { return "" } @@ -173,20 +171,18 @@ extension vChewing { } if arrNodes.count >= 2, - let kvPreviousThisOne = 
arrNodes[1].node?.currentKeyValue, - !arrEndingPunctuation.contains(kvPrevious.value), + !kvPrevious.key.contains("_"), kvPrevious.key.split(separator: "-").count == kvPrevious.value.count { - kvPrevious = kvPreviousThisOne + kvPrevious = arrNodes[1].node.currentPair readingStack = kvPrevious.key + readingStack } if arrNodes.count >= 3, - let kvAnteriorThisOne = arrNodes[2].node?.currentKeyValue, - !arrEndingPunctuation.contains(kvAnterior.value), + !kvAnterior.key.contains("_"), kvAnterior.key.split(separator: "-").count == kvAnterior.value.count { - kvAnterior = kvAnteriorThisOne + kvAnterior = arrNodes[2].node.currentPair readingStack = kvAnterior.key + readingStack } diff --git a/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift b/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift index 72a3284b..76a8f6f1 100644 --- a/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift +++ b/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift @@ -25,89 +25,106 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
extension Megrez { /// 組字器。 - public class Compositor { + public class Compositor: Grid { + /// 文字輸入方向 + public enum TypingDirection { case front, rear } /// 給被丟掉的節點路徑施加的負權重。 private let kDroppedPathScore: Double = -999 /// 該組字器的游標位置。 - private var mutCursorIndex: Int = 0 + public var cursor: Int = 0 { didSet { cursor = max(0, min(cursor, readings.count)) } } /// 該組字器的讀音陣列。 - private var mutReadings: [String] = [] - /// 該組字器的軌格。 - private var mutGrid: Grid = .init() + private(set) var readings: [String] = [] /// 該組字器所使用的語言模型。 - private var mutLM: LanguageModelProtocol + private var langModel: LangModelProtocol + /// 允許查詢當前游標位置屬於第幾個幅位座標(從 0 開始算)。 + private(set) var cursorRegionMap: [Int: Int] = .init() + private(set) var walkedAnchors: [Megrez.NodeAnchor] = [] // 用以記錄爬過的節錨的陣列 - /// 公開:該組字器內可以允許的最大詞長。 - public var maxBuildSpanLength: Int { mutGrid.maxBuildSpanLength } /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 - public var joinSeparator: String = "" - /// 公開:該組字器的游標位置。 - public var cursorIndex: Int { - get { mutCursorIndex } - set { mutCursorIndex = (newValue < 0) ? 0 : min(newValue, mutReadings.count) } - } + public var joinSeparator: String = "-" - /// 公開:該組字器是否為空。 - public var isEmpty: Bool { mutGrid.isEmpty } - - /// 公開:該組字器的軌格(唯讀)。 - public var grid: Grid { mutGrid } /// 公開:該組字器的長度,也就是內建漢字讀音的數量(唯讀)。 - public var length: Int { mutReadings.count } - /// 公開:該組字器的讀音陣列(唯讀)。 - public var readings: [String] { mutReadings } + public var length: Int { readings.count } + + /// 按幅位來前後移動游標。 + /// - Parameter direction: 移動方向 + /// - Returns: 該操作是否順利完成。 + @discardableResult public func jumpCursorBySpan(to direction: TypingDirection) -> Bool { + switch direction { + case .front: + if cursor == width { return false } + case .rear: + if cursor == 0 { return false } + } + guard let currentRegion = cursorRegionMap[cursor] else { return false } + + let aRegionForward = max(currentRegion - 1, 0) + let currentRegionBorderRear: Int = walkedAnchors[0.. walkedAnchors.count) + ? 
readings.count : walkedAnchors[0...currentRegion].map(\.spanLength).reduce(0, +) + case .rear: + cursor = walkedAnchors[0.. Bool { - if mutCursorIndex == 0 { - return false - } - - mutReadings.remove(at: mutCursorIndex - 1) - mutCursorIndex -= 1 - mutGrid.shrinkGridByOneAt(location: mutCursorIndex) + @discardableResult public func insertReading(_ reading: String) -> Bool { + guard !reading.isEmpty, langModel.hasUnigramsFor(key: reading) else { return false } + readings.insert(reading, at: cursor) + resizeGridByOneAt(location: cursor, to: .expand) build() + cursor += 1 return true } - /// 朝著往文字輸入方向、砍掉一個與游標相鄰的讀音。 - /// 在威注音的術語體系當中,「文字輸入方向」為向前(Front)。 - @discardableResult public func deleteReadingToTheFrontOfCursor() -> Bool { - if mutCursorIndex == mutReadings.count { + /// 朝著指定方向砍掉一個與游標相鄰的讀音。 + /// + /// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear),反之則為向前(Front)。 + /// - Parameter direction: 指定方向。 + /// - Returns: 該操作是否順利完成。 + @discardableResult public func dropReading(direction: TypingDirection) -> Bool { + let isBackSpace = direction == .rear + if cursor == (isBackSpace ? 0 : readings.count) { return false } - - mutReadings.remove(at: mutCursorIndex) - mutGrid.shrinkGridByOneAt(location: mutCursorIndex) + readings.remove(at: cursor - (isBackSpace ? 1 : 0)) + cursor -= (isBackSpace ? 1 : 0) + resizeGridByOneAt(location: cursor, to: .shrink) build() return true } @@ -118,98 +135,84 @@ extension Megrez { /// 將該位置要溢出的敲字內容遞交之後、再執行這個函式。 @discardableResult public func removeHeadReadings(count: Int) -> Bool { let count = abs(count) // 防呆 - if count > length { - return false - } - + if count > length { return false } for _ in 0.. 
0 { - mutCursorIndex -= 1 - } - if !mutReadings.isEmpty { - mutReadings.removeFirst() - mutGrid.shrinkGridByOneAt(location: 0) + cursor = max(cursor - 1, 0) + if !readings.isEmpty { + readings.removeFirst() + resizeGridByOneAt(location: 0, to: .shrink) } build() } - return true } - // MARK: - Walker - /// 對已給定的軌格按照給定的位置與條件進行正向爬軌。 - /// - Parameters: - /// - location: 開始爬軌的位置。 - /// - accumulatedScore: 給定累計權重,非必填參數。預設值為 0。 - /// - joinedPhrase: 用以統計累計長詞的內部參數,請勿主動使用。 - /// - longPhrases: 用以統計累計長詞的內部參數,請勿主動使用。 - public func walk( - at location: Int = 0, - score accumulatedScore: Double = 0.0, - joinedPhrase: String = "", - longPhrases: [String] = .init() - ) -> [NodeAnchor] { - let newLocation = (mutGrid.width) - abs(location) // 防呆 - return Array( - reverseWalk( - at: newLocation, score: accumulatedScore, - joinedPhrase: joinedPhrase, longPhrases: longPhrases - ).reversed()) + /// - Returns: 一個包含有效結果的節錨陣列。 + @discardableResult public func walk() -> [NodeAnchor] { + let newLocation = width + // 這裡把所有空節點都過濾掉。 + walkedAnchors = Array( + reverseWalk(at: newLocation).reversed() + ).lazy.filter { !$0.isEmpty } + updateCursorJumpingTables(walkedAnchors) + return walkedAnchors } - /// 對已給定的軌格按照給定的位置與條件進行反向爬軌。 + // MARK: - Private functions + + /// 內部專用反芻函式,對已給定的軌格按照給定的位置與條件進行反向爬軌。 /// - Parameters: /// - location: 開始爬軌的位置。 - /// - accumulatedScore: 給定累計權重,非必填參數。預設值為 0。 + /// - mass: 給定累計權重,非必填參數。預設值為 0。 /// - joinedPhrase: 用以統計累計長詞的內部參數,請勿主動使用。 /// - longPhrases: 用以統計累計長詞的內部參數,請勿主動使用。 - public func reverseWalk( + /// - Returns: 一個包含結果的節錨陣列。 + private func reverseWalk( at location: Int, - score accumulatedScore: Double = 0.0, + mass: Double = 0.0, joinedPhrase: String = "", longPhrases: [String] = .init() ) -> [NodeAnchor] { let location = abs(location) // 防呆 - if location == 0 || location > mutGrid.width { + if location == 0 || location > width { return .init() } var paths = [[NodeAnchor]]() - var nodes = mutGrid.nodesEndingAt(location: location) - - nodes = 
nodes.stableSorted { + let nodes = nodesEndingAt(location: location).stableSorted { $0.scoreForSort > $1.scoreForSort } - if let nodeZero = nodes[0].node, nodeZero.score >= nodeZero.kSelectedCandidateScore { + guard !nodes.isEmpty else { return .init() } // 防止下文出現範圍外索引的錯誤 + + if nodes[0].node.score >= Node.kSelectedCandidateScore { // 在使用者有選過候選字詞的情況下,摒棄非依此據而成的節點路徑。 - var anchorZero = nodes[0] - anchorZero.accumulatedScore = accumulatedScore + nodeZero.score + var theAnchor = nodes[0] + theAnchor.mass = mass + nodes[0].node.score var path: [NodeAnchor] = reverseWalk( - at: location - anchorZero.spanningLength, score: anchorZero.accumulatedScore + at: location - theAnchor.spanLength, mass: theAnchor.mass ) - path.insert(anchorZero, at: 0) + path.insert(theAnchor, at: 0) paths.append(path) } else if !longPhrases.isEmpty { var path = [NodeAnchor]() for theAnchor in nodes { - guard let theNode = theAnchor.node else { continue } var theAnchor = theAnchor - let joinedValue = theNode.currentKeyValue.value + joinedPhrase + let joinedValue = theAnchor.node.currentPair.value + joinedPhrase // 如果只是一堆單漢字的節點組成了同樣的長詞的話,直接棄用這個節點路徑。 // 打比方說「八/月/中/秋/山/林/涼」與「八月/中秋/山林/涼」在使用者來看 // 是「結果等價」的,那就扔掉前者。 if longPhrases.contains(joinedValue) { - theAnchor.accumulatedScore = kDroppedPathScore + theAnchor.mass = kDroppedPathScore path.insert(theAnchor, at: 0) paths.append(path) continue } - theAnchor.accumulatedScore = accumulatedScore + theNode.score + theAnchor.mass = mass + theAnchor.node.score path = reverseWalk( - at: location - theAnchor.spanningLength, - score: theAnchor.accumulatedScore, + at: location - theAnchor.spanLength, + mass: theAnchor.mass, joinedPhrase: (joinedValue.count >= longPhrases[0].count) ? 
"" : joinedValue, longPhrases: .init() ) @@ -219,9 +222,8 @@ extension Megrez { } else { // 看看當前格位有沒有更長的候選字詞。 var longPhrases = [String]() - for theAnchor in nodes.lazy.filter({ $0.spanningLength > 1 }) { - guard let theNode = theAnchor.node else { continue } - longPhrases.append(theNode.currentKeyValue.value) + for theAnchor in nodes.lazy.filter({ $0.spanLength > 1 }) { + longPhrases.append(theAnchor.node.currentPair.value) } longPhrases = longPhrases.stableSorted { @@ -229,12 +231,11 @@ extension Megrez { } for theAnchor in nodes { var theAnchor = theAnchor - guard let theNode = theAnchor.node else { continue } - theAnchor.accumulatedScore = accumulatedScore + theNode.score + theAnchor.mass = mass + theAnchor.node.score var path = [NodeAnchor]() path = reverseWalk( - at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, - joinedPhrase: (theAnchor.spanningLength > 1) ? "" : theNode.currentKeyValue.value, + at: location - theAnchor.spanLength, mass: theAnchor.mass, + joinedPhrase: (theAnchor.spanLength > 1) ? "" : theAnchor.node.currentPair.value, longPhrases: .init() ) path.insert(theAnchor, at: 0) @@ -248,31 +249,29 @@ extension Megrez { var result: [NodeAnchor] = paths[0] for neta in paths.lazy.filter({ - $0.last!.accumulatedScore > result.last!.accumulatedScore + $0.last!.mass > result.last!.mass }) { result = neta } - return result + return result // 空節點過濾的步驟交給 walk() 這個對外函式,以避免重複執行清理步驟。 } - // MARK: - Private functions - private func build() { let itrBegin: Int = - (mutCursorIndex < maxBuildSpanLength) ? 0 : mutCursorIndex - maxBuildSpanLength - let itrEnd: Int = min(mutCursorIndex + maxBuildSpanLength, mutReadings.count) + (cursor < maxBuildSpanLength) ? 0 : cursor - maxBuildSpanLength + let itrEnd: Int = min(cursor + maxBuildSpanLength, readings.count) for p in itrBegin.. 
itrEnd { break } - let arrSlice = mutReadings[p..<(p + q)] + let arrSlice = readings[p..<(p + q)] let combinedReading: String = join(slice: arrSlice, separator: joinSeparator) - if mutGrid.hasMatchedNode(location: p, spanningLength: q, key: combinedReading) { continue } - let unigrams: [Unigram] = mutLM.unigramsFor(key: combinedReading) + if hasMatchedNode(location: p, spanLength: q, key: combinedReading) { continue } + let unigrams: [Unigram] = langModel.unigramsFor(key: combinedReading) if unigrams.isEmpty { continue } let n = Node(key: combinedReading, unigrams: unigrams) - mutGrid.insertNode(node: n, location: p, spanningLength: q) + insertNode(node: n, location: p, spanLength: q) } } } @@ -280,6 +279,20 @@ extension Megrez { private func join(slice arrSlice: ArraySlice, separator: String) -> String { arrSlice.joined(separator: separator) } + + internal func updateCursorJumpingTables(_ anchors: [NodeAnchor]) { + var cursorRegionMapDict = [Int: Int]() + var counter = 0 + for (i, anchor) in anchors.enumerated() { + for _ in 0..= mutSpans.count { - let diff = location - mutSpans.count + 1 + let spanLength = abs(spanLength) // 防呆 + if location >= spans.count { + let diff = location - spans.count + 1 for _ in 0.. 
Bool { + public func hasMatchedNode(location: Int, spanLength: Int, key: String) -> Bool { let location = abs(location) // 防呆 - let spanningLength = abs(spanningLength) // 防呆 - if location > mutSpans.count { + let spanLength = abs(spanLength) // 防呆 + if location > spans.count { return false } - let n = mutSpans[location].node(length: spanningLength) + let n = spans[location].nodeOf(length: spanLength) return n != nil && key == n?.key } - /// 在該軌格的指定位置擴增一個幅位。 + /// 在該軌格的指定位置擴增或減少一個幅位。 /// - Parameters: /// - location: 位置。 - public func expandGridByOneAt(location: Int) { - let location = abs(location) // 防呆 - mutSpans.insert(Span(), at: location) - if location == 0 || location == mutSpans.count { return } + public func resizeGridByOneAt(location: Int, to behavior: ResizeBehavior) { + let location = max(0, min(width, location)) // 防呆 + switch behavior { + case .expand: + spans.insert(SpanUnit(), at: location) + if [spans.count, 0].contains(location) { return } + case .shrink: + if location >= spans.count { return } + spans.remove(at: location) + } for i in 0..= mutSpans.count { - return - } - - mutSpans.remove(at: location) - for i in 0.. 
[NodeAnchor] { let location = abs(location) // 防呆 var results = [NodeAnchor]() - if location >= mutSpans.count { return results } - // 此時 mutSpans 必然不為空,因為 location 不可能小於 0。 - let span = mutSpans[location] + if location >= spans.count { return results } + // 此時 spans 必然不為空,因為 location 不可能小於 0。 + let span = spans[location] for i in 1...maxBuildSpanLength { - if let np = span.node(length: i) { + if let np = span.nodeOf(length: i) { results.append( .init( node: np, location: location, - spanningLength: i + spanLength: i ) ) } } - return results + return results // 已證實不會有空節點產生。 } /// 給定位置,枚舉出所有在這個位置結尾的節點。 @@ -143,21 +132,21 @@ extension Megrez { public func nodesEndingAt(location: Int) -> [NodeAnchor] { let location = abs(location) // 防呆 var results = [NodeAnchor]() - if mutSpans.isEmpty || location > mutSpans.count { return results } + if spans.isEmpty || location > spans.count { return results } for i in 0.. [NodeAnchor] { let location = abs(location) // 防呆 var results = [NodeAnchor]() - if mutSpans.isEmpty || location > mutSpans.count { return results } + if spans.isEmpty || location > spans.count { return results } for i in 0.. 
[NodeAnchor] { + Array(Set(nodesBeginningAt(location: location) + nodesCrossingOrEndingAt(location: location))) + } + + /// 使用給定的候選字字串,將給定位置的節點的候選字詞改為與給定的字串一致的候選字詞。 + /// + /// 該函式可以僅用作過程函式,但準確度不如用於處理候選字鍵值配對的 fixNodeWithCandidate()。 + /// - Parameters: + /// - location: 位置。 + /// - value: 給定字串。 + @discardableResult public func fixNodeWithCandidateLiteral(_ value: String, at location: Int) -> NodeAnchor { + let location = abs(location) // 防呆 + var node = NodeAnchor() + for theAnchor in nodesOverlappedAt(location: location) { + let candidates = theAnchor.node.candidates + // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 + theAnchor.node.resetCandidate() + for (i, candidate) in candidates.enumerated() { + if candidate.value == value { + theAnchor.node.selectCandidateAt(index: i) + node = theAnchor + break + } + } + } + return node + } + + /// 使用給定的候選字鍵值配對,將給定位置的節點的候選字詞改為與給定的字串一致的候選字詞。 /// /// 該函式可以僅用作過程函式。 /// - Parameters: /// - location: 位置。 - /// - value: 給定字串。 - @discardableResult public func fixNodeSelectedCandidate(location: Int, value: String) -> NodeAnchor { + /// - value: 給定候選字鍵值配對。 + @discardableResult public func fixNodeWithCandidate(_ pair: KeyValuePaired, at location: Int) -> NodeAnchor { let location = abs(location) // 防呆 var node = NodeAnchor() - for nodeAnchor in nodesCrossingOrEndingAt(location: location) { - guard let theNode = nodeAnchor.node else { - continue - } - let candidates = theNode.candidates + for theAnchor in nodesOverlappedAt(location: location) { + let candidates = theAnchor.node.candidates // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 - theNode.resetCandidate() + theAnchor.node.resetCandidate() for (i, candidate) in candidates.enumerated() { - if candidate.value == value { - theNode.selectCandidateAt(index: i) - node = nodeAnchor + if candidate == pair { + theAnchor.node.selectCandidateAt(index: i) + node = theAnchor break } } @@ -220,16 +239,13 @@ extension Megrez { /// - overridingScore: 給定權重數值。 public func overrideNodeScoreForSelectedCandidate(location: Int, value: String, 
overridingScore: Double) { let location = abs(location) // 防呆 - for nodeAnchor in nodesCrossingOrEndingAt(location: location) { - guard let theNode = nodeAnchor.node else { - continue - } - let candidates = theNode.candidates + for theAnchor in nodesOverlappedAt(location: location) { + let candidates = theAnchor.node.candidates // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 - theNode.resetCandidate() + theAnchor.node.resetCandidate() for (i, candidate) in candidates.enumerated() { if candidate.value == value { - theNode.selectFloatingCandidateAt(index: i, score: overridingScore) + theAnchor.node.selectFloatingCandidateAt(index: i, score: overridingScore) break } } @@ -244,29 +260,22 @@ extension Megrez.Grid { /// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。 public var dumpDOT: String { var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n" - for (p, span) in mutSpans.enumerated() { - for ni in 0...(span.maximumLength) { - guard let np: Megrez.Node = span.node(length: ni) else { - continue - } + for (p, span) in spans.enumerated() { + for ni in 0...(span.maxLength) { + guard let np = span.nodeOf(length: ni) else { continue } if p == 0 { - strOutput += "BOS -> \(np.currentKeyValue.value);\n" + strOutput += "BOS -> \(np.currentPair.value);\n" } - - strOutput += "\(np.currentKeyValue.value);\n" - - if (p + ni) < mutSpans.count { - let destinationSpan = mutSpans[p + ni] - for q in 0...(destinationSpan.maximumLength) { - if let dn = destinationSpan.node(length: q) { - strOutput += np.currentKeyValue.value + " -> " + dn.currentKeyValue.value + ";\n" - } + strOutput += "\(np.currentPair.value);\n" + if (p + ni) < spans.count { + let destinationSpan = spans[p + ni] + for q in 0...(destinationSpan.maxLength) { + guard let dn = destinationSpan.nodeOf(length: q) else { continue } + strOutput += np.currentPair.value + " -> " + dn.currentPair.value + ";\n" } } - - if (p + ni) == mutSpans.count { - strOutput += np.currentKeyValue.value + " -> EOS;\n" - } + guard (p + ni) == spans.count else { continue } + 
strOutput += np.currentPair.value + " -> EOS;\n" } } strOutput += "EOS;\n}\n" diff --git a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift index 4cdaa64b..40c9c89d 100644 --- a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift +++ b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift @@ -25,25 +25,34 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 節锚。 - @frozen public struct NodeAnchor: CustomStringConvertible { + @frozen public struct NodeAnchor: Hashable { + /// 用來判斷該節錨是否為空。 + public var isEmpty: Bool { node.key.isEmpty } /// 節點。一個節锚內不一定有節點。 - public var node: Node? + public var node: Node = .init() /// 節锚所在的位置。 public var location: Int = 0 - /// 幅位長度。 - public var spanningLength: Int = 0 + /// 指定的幅位長度。 + public var spanLength: Int = 0 /// 累計權重。 - public var accumulatedScore: Double = 0.0 + public var mass: Double = 0.0 /// 索引鍵的長度。 public var keyLength: Int { - node?.key.count ?? 0 + isEmpty ? node.key.count : 0 + } + + public func hash(into hasher: inout Hasher) { + hasher.combine(node) + hasher.combine(location) + hasher.combine(spanLength) + hasher.combine(mass) } /// 將當前節锚列印成一個字串。 public var description: String { var stream = "" - stream += "{@(" + String(location) + "," + String(spanningLength) + ")," - if let node = node { + stream += "{@(" + String(location) + "," + String(spanLength) + ")," + if node.key.isEmpty { stream += node.description } else { stream += "null" @@ -54,12 +63,12 @@ extension Megrez { /// 獲取用來比較的權重。 public var scoreForSort: Double { - node?.score ?? 0 + isEmpty ? node.score : 0 } } } -// MARK: - DumpDOT-related functions. +// MARK: - Array Extensions. 
extension Array where Element == Megrez.NodeAnchor { /// 將節锚陣列列印成一個字串。 @@ -70,4 +79,14 @@ extension Array where Element == Megrez.NodeAnchor { } return arrOutputContent.joined(separator: "<-") } + + /// 從一個節錨陣列當中取出目前的自動選字字串陣列。 + public var values: [String] { + map(\.node.currentPair.value) + } + + /// 從一個節錨陣列當中取出目前的索引鍵陣列。 + public var keys: [String] { + map(\.node.currentPair.key) + } } diff --git a/Source/Modules/LanguageParsers/Megrez/3_Span.swift b/Source/Modules/LanguageParsers/Megrez/3_Span.swift index c59c3eaf..a6c71e81 100644 --- a/Source/Modules/LanguageParsers/Megrez/3_Span.swift +++ b/Source/Modules/LanguageParsers/Megrez/3_Span.swift @@ -25,21 +25,16 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 幅位。 - @frozen public struct Span { + @frozen public struct SpanUnit { /// 辭典:以節點長度為索引,以節點為資料值。 - private var mutLengthNodeMap: [Int: Megrez.Node] = [:] - /// 最大節點長度。 - private var mutMaximumLength: Int = 0 - - /// 公開:最長幅距(唯讀)。 - public var maximumLength: Int { - mutMaximumLength - } + private var lengthNodeMap: [Int: Megrez.Node] = [:] + /// 最長幅距。 + private(set) var maxLength: Int = 0 /// 自我清空,各項參數歸零。 mutating func clear() { - mutLengthNodeMap.removeAll() - mutMaximumLength = 0 + lengthNodeMap.removeAll() + maxLength = 0 } /// 往自身插入一個節點、及給定的節點長度。 @@ -48,37 +43,37 @@ extension Megrez { /// - length: 給定的節點長度。 mutating func insert(node: Node, length: Int) { let length = abs(length) // 防呆 - mutLengthNodeMap[length] = node - mutMaximumLength = max(mutMaximumLength, length) + lengthNodeMap[length] = node + maxLength = max(maxLength, length) } /// 移除任何比給定的長度更長的節點。 /// - Parameters: /// - length: 給定的節點長度。 - mutating func removeNodeOfLengthGreaterThan(_ length: Int) { + mutating func dropNodesBeyond(length: Int) { let length = abs(length) // 防呆 - if length > mutMaximumLength { return } + if length > maxLength { return } var lenMax = 0 var removalList: [Int: Megrez.Node] = [:] - for key in mutLengthNodeMap.keys { + for 
key in lengthNodeMap.keys { if key > length { - removalList[key] = mutLengthNodeMap[key] + removalList[key] = lengthNodeMap[key] } else { lenMax = max(lenMax, key) } } for key in removalList.keys { - mutLengthNodeMap.removeValue(forKey: key) + lengthNodeMap.removeValue(forKey: key) } - mutMaximumLength = lenMax + maxLength = lenMax } /// 給定節點長度,獲取節點。 /// - Parameters: /// - length: 給定的節點長度。 - public func node(length: Int) -> Node? { + public func nodeOf(length: Int) -> Node? { // 防呆 Abs() - mutLengthNodeMap.keys.contains(abs(length)) ? mutLengthNodeMap[abs(length)] : nil + lengthNodeMap.keys.contains(abs(length)) ? lengthNodeMap[abs(length)] : nil } } } diff --git a/Source/Modules/LanguageParsers/Megrez/4_Node.swift b/Source/Modules/LanguageParsers/Megrez/4_Node.swift index 1672b2b9..f5fc0d63 100644 --- a/Source/Modules/LanguageParsers/Megrez/4_Node.swift +++ b/Source/Modules/LanguageParsers/Megrez/4_Node.swift @@ -25,76 +25,86 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 節點。 - public class Node { - /// 鍵。 - private var mutKey: String = "" - /// 當前節點的當前被選中的候選字詞「在該節點內的」目前的權重。 - private var mutScore: Double = 0 - /// 單元圖陣列。 - private var mutUnigrams: [Unigram] - /// 雙元圖陣列。 - private var mutBigrams: [Bigram] - /// 候選字詞陣列,以鍵值陣列的形式存在。 - private var mutCandidates: [KeyValuePaired] = [] - /// 專門「用單元圖資料值來調查索引值」的辭典。 - private var mutValueUnigramIndexMap: [String: Int] = [:] - /// 專門「用給定鍵值來取對應的雙元圖陣列」的辭典。 - private var mutPrecedingBigramMap: [KeyValuePaired: [Megrez.Bigram]] = [:] - /// 狀態標記變數,用來記載當前節點是否處於候選字詞鎖定狀態。 - private var mutCandidateFixed: Bool = false - /// 用來登記「當前選中的單元圖」的索引值的變數。 - private var mutSelectedUnigramIndex: Int = 0 - /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。 - public let kSelectedCandidateScore: Double = 99 - /// 將當前節點列印成一個字串。 - public var description: String { - "(node,key:\(mutKey),fixed:\(mutCandidateFixed ? 
"true" : "false"),selected:\(mutSelectedUnigramIndex),\(mutUnigrams))" + public class Node: Equatable, Hashable { + public static func == (lhs: Megrez.Node, rhs: Megrez.Node) -> Bool { + lhs.key == rhs.key && lhs.score == rhs.score && lhs.unigrams == rhs.unigrams && lhs.bigrams == rhs.bigrams + && lhs.candidates == rhs.candidates && lhs.valueUnigramIndexMap == rhs.valueUnigramIndexMap + && lhs.precedingBigramMap == rhs.precedingBigramMap && lhs.isCandidateFixed == rhs.isCandidateFixed + && lhs.selectedUnigramIndex == rhs.selectedUnigramIndex } - /// 公開:候選字詞陣列(唯讀),以鍵值陣列的形式存在。 - public var candidates: [KeyValuePaired] { mutCandidates } - /// 公開:用來登記「當前選中的單元圖」的索引值的變數(唯讀)。 - public var isCandidateFixed: Bool { mutCandidateFixed } + public func hash(into hasher: inout Hasher) { + hasher.combine(key) + hasher.combine(score) + hasher.combine(unigrams) + hasher.combine(bigrams) + hasher.combine(candidates) + hasher.combine(valueUnigramIndexMap) + hasher.combine(precedingBigramMap) + hasher.combine(isCandidateFixed) + hasher.combine(selectedUnigramIndex) + } + + /// 鍵。 + private(set) var key: String = "" + /// 當前節點的當前被選中的候選字詞「在該節點內的」目前的權重。 + private(set) var score: Double = 0 + /// 單元圖陣列。 + private var unigrams: [Unigram] + /// 雙元圖陣列。 + private var bigrams: [Bigram] + /// 候選字詞陣列,以鍵值陣列的形式存在。 + private(set) var candidates: [KeyValuePaired] = [] + /// 專門「用單元圖資料值來調查索引值」的辭典。 + private var valueUnigramIndexMap: [String: Int] = [:] + /// 專門「用給定鍵值來取對應的雙元圖陣列」的辭典。 + private var precedingBigramMap: [KeyValuePaired: [Megrez.Bigram]] = [:] + /// 狀態標記變數,用來記載當前節點是否處於候選字詞鎖定狀態。 + private(set) var isCandidateFixed: Bool = false + /// 用來登記「當前選中的單元圖」的索引值的變數。 + private var selectedUnigramIndex: Int = 0 + /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。 + public static let kSelectedCandidateScore: Double = 99 + /// 將當前節點列印成一個字串。 + public var description: String { + "(node,key:\(key),fixed:\(isCandidateFixed ? 
"true" : "false"),selected:\(selectedUnigramIndex),\(unigrams))" + } - /// 公開:鍵(唯讀)。 - public var key: String { mutKey } - /// 公開:當前節點的當前被選中的候選字詞「在該節點內的」目前的權重(唯讀)。 - public var score: Double { mutScore } /// 公開:當前被選中的候選字詞的鍵值配對。 - public var currentKeyValue: KeyValuePaired { - mutSelectedUnigramIndex >= mutUnigrams.count ? KeyValuePaired() : mutCandidates[mutSelectedUnigramIndex] + public var currentPair: KeyValuePaired { + selectedUnigramIndex >= unigrams.count ? KeyValuePaired() : candidates[selectedUnigramIndex] } /// 公開:給出當前單元圖陣列內最高的權重數值。 - public var highestUnigramScore: Double { mutUnigrams.isEmpty ? 0.0 : mutUnigrams[0].score } + public var highestUnigramScore: Double { unigrams.isEmpty ? 0.0 : unigrams[0].score } /// 初期化一個節點。 /// - Parameters: /// - key: 索引鍵。 /// - unigrams: 單元圖陣列。 /// - bigrams: 雙元圖陣列(非必填)。 - public init(key: String, unigrams: [Megrez.Unigram], bigrams: [Megrez.Bigram] = []) { - mutKey = key - mutUnigrams = unigrams - mutBigrams = bigrams + public init(key: String = "", unigrams: [Megrez.Unigram] = [], bigrams: [Megrez.Bigram] = []) { + self.key = key + self.unigrams = unigrams + self.bigrams = bigrams - mutUnigrams.sort { + self.unigrams.sort { $0.score > $1.score } - if !mutUnigrams.isEmpty { - mutScore = mutUnigrams[0].score + if !self.unigrams.isEmpty { + score = unigrams[0].score } - for (i, gram) in mutUnigrams.enumerated() { - mutValueUnigramIndexMap[gram.keyValue.value] = i - mutCandidates.append(gram.keyValue) + for (i, gram) in self.unigrams.enumerated() { + valueUnigramIndexMap[gram.keyValue.value] = i + candidates.append(gram.keyValue) } for gram in bigrams.lazy.filter({ [self] in - mutPrecedingBigramMap.keys.contains($0.precedingKeyValue) + precedingBigramMap.keys.contains($0.precedingKeyValue) }) { - mutPrecedingBigramMap[gram.precedingKeyValue]?.append(gram) + precedingBigramMap[gram.precedingKeyValue]?.append(gram) } } @@ -102,22 +112,22 @@ extension Megrez { /// - Parameters: /// - precedingKeyValues: 前述鍵值陣列。 public func 
primeNodeWith(precedingKeyValues: [KeyValuePaired]) { - var newIndex = mutSelectedUnigramIndex - var max = mutScore + var newIndex = selectedUnigramIndex + var max = score if !isCandidateFixed { for neta in precedingKeyValues { - let bigrams = mutPrecedingBigramMap[neta] ?? [] + let bigrams = precedingBigramMap[neta] ?? [] for bigram in bigrams.lazy.filter({ [self] in - $0.score > max && mutValueUnigramIndexMap.keys.contains($0.keyValue.value) + $0.score > max && valueUnigramIndexMap.keys.contains($0.keyValue.value) }) { - newIndex = mutValueUnigramIndexMap[bigram.keyValue.value] ?? newIndex + newIndex = valueUnigramIndexMap[bigram.keyValue.value] ?? newIndex max = bigram.score } } } - mutScore = max - mutSelectedUnigramIndex = newIndex + score = max + selectedUnigramIndex = newIndex } /// 選中位於給定索引位置的候選字詞。 @@ -126,17 +136,17 @@ extension Megrez { /// - fix: 是否將當前解點標記為「候選詞已鎖定」的狀態。 public func selectCandidateAt(index: Int = 0, fix: Bool = false) { let index = abs(index) - mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index - mutCandidateFixed = fix - mutScore = kSelectedCandidateScore + selectedUnigramIndex = index >= unigrams.count ? 0 : index + isCandidateFixed = fix + score = Megrez.Node.kSelectedCandidateScore } /// 重設該節點的候選字詞狀態。 public func resetCandidate() { - mutSelectedUnigramIndex = 0 - mutCandidateFixed = false - if !mutUnigrams.isEmpty { - mutScore = mutUnigrams[0].score + selectedUnigramIndex = 0 + isCandidateFixed = false + if !unigrams.isEmpty { + score = unigrams[0].score } } @@ -146,16 +156,26 @@ extension Megrez { /// - score: 給定權重條件。 public func selectFloatingCandidateAt(index: Int, score: Double) { let index = abs(index) // 防呆 - mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index - mutCandidateFixed = false - mutScore = score + selectedUnigramIndex = index >= unigrams.count ? 
0 : index + isCandidateFixed = false + self.score = score } /// 藉由給定的候選字詞字串,找出在庫的單元圖權重數值。沒有的話就找零。 /// - Parameters: /// - candidate: 給定的候選字詞字串。 public func scoreFor(candidate: String) -> Double { - for unigram in mutUnigrams.lazy.filter({ $0.keyValue.value == candidate }) { + for unigram in unigrams.lazy.filter({ $0.keyValue.value == candidate }) { + return unigram.score + } + return 0.0 + } + + /// 藉由給定的候選字詞鍵值配對,找出在庫的單元圖權重數值。沒有的話就找零。 + /// - Parameters: + /// - candidate: 給定的候選字詞字串。 + public func scoreForPaired(candidate: KeyValuePaired) -> Double { + for unigram in unigrams.lazy.filter({ $0.keyValue == candidate }) { return unigram.score } return 0.0 diff --git a/Source/Modules/LanguageParsers/Megrez/5_LanguageModel.swift b/Source/Modules/LanguageParsers/Megrez/5_LanguageModel.swift index abe8c822..75ee404e 100644 --- a/Source/Modules/LanguageParsers/Megrez/5_LanguageModel.swift +++ b/Source/Modules/LanguageParsers/Megrez/5_LanguageModel.swift @@ -23,7 +23,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -public protocol LanguageModelProtocol { +public protocol LangModelProtocol { /// 給定鍵,讓語言模型找給一組單元圖陣列。 func unigramsFor(key: String) -> [Megrez.Unigram] @@ -36,7 +36,7 @@ public protocol LanguageModelProtocol { extension Megrez { /// 語言模型框架,回頭實際使用時需要派生一個型別、且重寫相關函式。 - open class LanguageModel: LanguageModelProtocol { + open class LangModel: LangModelProtocol { public init() {} // 這裡寫了一點假內容,不然有些 Swift 格式化工具會破壞掉函式的參數設計。 diff --git a/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift b/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift index b64e2658..fdf15d51 100644 --- a/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift +++ b/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift @@ -25,7 +25,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
extension Megrez { /// 雙元圖。 - @frozen public struct Bigram: Equatable, CustomStringConvertible { + @frozen public struct Bigram: Equatable, CustomStringConvertible, Hashable { /// 當前鍵值。 public var keyValue: KeyValuePaired /// 前述鍵值。 @@ -61,7 +61,7 @@ extension Megrez { public static func < (lhs: Bigram, rhs: Bigram) -> Bool { lhs.precedingKeyValue < rhs.precedingKeyValue - || (lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.keyValue < rhs.keyValue)) + || (lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.score < rhs.score)) } } } diff --git a/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift b/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift index 4bcd894e..c716fe18 100644 --- a/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift +++ b/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift @@ -25,7 +25,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 單元圖。 - @frozen public struct Unigram: Equatable, CustomStringConvertible { + @frozen public struct Unigram: Equatable, CustomStringConvertible, Hashable { /// 鍵值。 public var keyValue: KeyValuePaired /// 權重。 @@ -54,7 +54,7 @@ extension Megrez { } public static func < (lhs: Unigram, rhs: Unigram) -> Bool { - lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.keyValue < rhs.keyValue) + lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.score < rhs.score) } } } diff --git a/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift b/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift index 3e9dee80..ac07ad4d 100644 --- a/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift +++ b/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift @@ -52,7 +52,7 @@ extension Megrez { } public static func == (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - lhs.key.count == rhs.key.count && lhs.value == rhs.value + lhs.key == rhs.key && lhs.value == rhs.value } public static 
func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {