diff --git a/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift b/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift index 0e795924..366bc137 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift @@ -18,39 +18,27 @@ public extension Megrez.Compositor { defer { walkedNodes = result } guard !spans.isEmpty else { return (result, true) } - var vertexSpans = [[Vertex]]() - spans.forEach { _ in - vertexSpans.append(.init()) - } - - spans.enumerated().forEach { i, span in - (1 ... max(span.maxLength, 1)).forEach { j in - guard let theNode = span[j] else { return } - vertexSpans[i].append(.init(node: theNode)) - } - } + var vertexSpans: [[Int: Vertex]] = spans.map(\.asVertexSpan) let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"])) var root = Vertex(node: .init(keyArray: ["_ROOT_"])) + root.distance = 0 - vertexSpans.enumerated().forEach { i, vertexSpan in - vertexSpan.forEach { vertex in - let nextVertexPosition = i + vertex.node.spanLength + vertexSpans.enumerated().forEach { location, vertexSpan in + vertexSpan.values.forEach { vertex in + let nextVertexPosition = location + vertex.node.spanLength if nextVertexPosition == vertexSpans.count { vertex.edges.append(terminal) return } - vertexSpans[nextVertexPosition].forEach { vertex.edges.append($0) } + vertexSpans[nextVertexPosition].values.forEach { vertex.edges.append($0) } } } - root.distance = 0 - root.edges.append(contentsOf: vertexSpans[0]) + root.edges.append(contentsOf: vertexSpans[0].values) - var ordered = topologicalSort(root: &root) - ordered.reversed().enumerated().forEach { j, neta in - neta.edges.indices.forEach { relax(u: neta, v: &neta.edges[$0]) } - ordered[j] = neta + topologicalSort(root: &root).reversed().forEach { neta in + neta.edges.indices.forEach { neta.relax(target: &neta.edges[$0]) } } var iterated = terminal @@ -64,7 +52,6 @@ public extension Megrez.Compositor { } // 清理內容,否則會有記憶體洩漏。 - ordered.removeAll() vertexSpans.removeAll() iterated.destroy() root.destroy() @@ -84,3 +71,14 @@ public extension Megrez.Compositor { return (result, true) } } + +extension Megrez.SpanUnit { + /// 將當前幅位單元由節點辭典轉為頂點辭典。 + var asVertexSpan: [Int: Megrez.Compositor.Vertex] { + var result = [Int: Megrez.Compositor.Vertex]() + forEach { theKey, theValue in + result[theKey] = .init(node: theValue) + } + return result + } +} diff --git a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift index 1b0de1ba..bf20e4cb 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift @@ -107,10 +107,7 @@ public extension Megrez.Compositor { location -= 1 } location = max(min(location, keys.count - 1), 0) - let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted { - // 按照讀音的長度(幅位長度)來給節點排序。 - $0.spanLength > $1.spanLength - } + let anchors: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: location) let keyAtCursor = keys[location] anchors.forEach { theAnchor in let theNode = theAnchor.node @@ -120,11 +117,11 @@ public extension Megrez.Compositor { // 得加上這道篩選,不然會出現很多無效結果。 if !theNode.keyArray.contains(keyAtCursor) { return } case .beginAt: - guard theAnchor.spanIndex == location else { return } + guard theAnchor.location == location else { return } case .endAt: guard theNode.keyArray.last == keyAtCursor else { return } switch theNode.spanLength { - case 2... where theAnchor.spanIndex + theAnchor.spanLength - 1 != location: return + case 2... where theAnchor.location + theAnchor.node.spanLength - 1 != location: return default: break } } @@ -178,8 +175,8 @@ public extension Megrez.Compositor { -> Bool { let location = max(min(location, keys.count), 0) // 防呆 - var arrOverlappedNodes: [NodeAnchor] = fetchOverlappingNodes(at: min(keys.count - 1, location)) - var overridden: NodeAnchor? + var arrOverlappedNodes: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: min(keys.count - 1, location)) + var overridden: (location: Int, node: Megrez.Node)? for anchor in arrOverlappedNodes { if keyArray != nil, anchor.node.keyArray != keyArray { continue } if !anchor.node.selectOverrideUnigram(value: value, type: type) { continue } @@ -189,7 +186,7 @@ public extension Megrez.Compositor { guard let overridden = overridden else { return false } // 啥也不覆寫。 - (overridden.spanIndex ..< min(spans.count, overridden.spanIndex + overridden.node.spanLength)).forEach { i in + (overridden.location ..< min(spans.count, overridden.location + overridden.node.spanLength)).forEach { i in /// 咱們還得弱化所有在相同的幅位座標的節點的複寫權重。舉例說之前爬軌的結果是「A BC」 /// 且 A 與 BC 都是被覆寫的結果,然後使用者現在在與 A 相同的幅位座標位置 /// 選了「DEF」,那麼 BC 的覆寫狀態就有必要重設(但 A 不用重設)。 @@ -208,32 +205,3 @@ public extension Megrez.Compositor { return true } } - -// MARK: - Stable Sort Extension - -// Reference: https://stackoverflow.com/a/50545761/4162914 - -private extension Sequence { - /// Return a stable-sorted collection. - /// - /// - Parameter areInIncreasingOrder: Return nil when two element are equal. - /// - Returns: The sorted collection. - func stableSorted( - by areInIncreasingOrder: (Element, Element) throws -> Bool - ) - rethrows -> [Element] - { - try enumerated() - .sorted { a, b -> Bool in - try areInIncreasingOrder(a.element, b.element) - || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) - } - .map(\.element) - } -} - -// MARK: - Bool Extension (Private) - -extension Bool { - var negative: Bool { !self } -} diff --git a/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift b/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift index 60fe6834..757780d6 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift @@ -64,16 +64,15 @@ extension Megrez.Compositor { /// 找出所有與該位置重疊的節點。其返回值為一個節錨陣列(包含節點、以及其起始位置)。 /// - Parameter location: 游標位置。 /// - Returns: 一個包含所有與該位置重疊的節點的陣列。 - func fetchOverlappingNodes(at givenLocation: Int) -> [NodeAnchor] { - var results = [NodeAnchor]() + public func fetchOverlappingNodes(at givenLocation: Int) -> [(location: Int, node: Megrez.Node)] { + var results = [(location: Int, node: Megrez.Node)]() let givenLocation = max(0, min(givenLocation, keys.count - 1)) guard !spans.isEmpty else { return results } // 先獲取該位置的所有單字節點。 (1 ... max(spans[givenLocation].maxLength, 1)).forEach { theSpanLength in guard let node = spans[givenLocation][theSpanLength] else { return } - guard !node.keyArray.joined().isEmpty else { return } - results.append(.init(node: node, spanIndex: givenLocation)) + Self.insertAnchor(spanIndex: givenLocation, node: node, to: &results) } // 再獲取以當前位置結尾或開頭的節點。 @@ -83,11 +82,27 @@ extension Megrez.Compositor { guard A <= B else { return } (A ... B).forEach { theLength in guard let node = spans[theLocation][theLength] else { return } - guard !node.keyArray.joined().isEmpty else { return } - results.append(.init(node: node, spanIndex: theLocation)) + Self.insertAnchor(spanIndex: theLocation, node: node, to: &results) } } return results } + + /// 要在 fetchOverlappingNodes() 內使用的一個工具函式。 + private static func insertAnchor( + spanIndex location: Int, node: Megrez.Node, + to targetContainer: inout [(location: Int, node: Megrez.Node)] + ) { + guard !node.keyArray.joined().isEmpty else { return } + let anchor = (location: location, node: node) + for i in 0 ... targetContainer.count { + guard !targetContainer.isEmpty else { break } + guard targetContainer[i].node.spanLength <= anchor.node.spanLength else { continue } + targetContainer.insert(anchor, at: i) + return + } + guard targetContainer.isEmpty else { return } + targetContainer.append(anchor) + } } diff --git a/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift b/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift index a101d65b..b60ea371 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift @@ -39,23 +39,23 @@ extension Megrez.Compositor { edges.removeAll() node = .init() } - } - /// 卸勁函式。 - /// - /// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。 - /// - Parameters: - /// - u: 參照頂點,會在必要時成為 v 的前述頂點。 - /// - v: 要影響的頂點。 - func relax(u: Vertex, v: inout Vertex) { - // 從 u 到 w 的距離,也就是 v 的權重。 - let w: Double = v.node.score - // 這裡計算最大權重: - // 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」, - // 我們就更新 v 的距離及其前述頂點。 - if v.distance >= u.distance + w { return } - v.distance = u.distance + w - v.prev = u + /// 卸勁函式。 + /// + /// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。 + /// - Remark: 自己就是參照頂點 (u),會在必要時成為 target (v) 的前述頂點。 + /// - Parameters: + /// - target: 要影響的頂點。 + public func relax(target: inout Vertex) { + // 從 u 到 w 的距離,也就是 v 的權重。 + let w: Double = target.node.score + // 這裡計算最大權重: + // 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」, + // 我們就更新 v 的距離及其前述頂點。 + if target.distance >= distance + w { return } + target.distance = distance + w + target.prev = self + } } /// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological @@ -65,7 +65,7 @@ extension Megrez.Compositor { /// 這樣我們就不會受到當前線程的堆棧大小的限制。以下是等價的原始算法。 /// ``` /// func topologicalSort(vertex: Vertex) { - /// vertex.edges.forEach {vertexNode in + /// vertex.edges.forEach { vertexNode in /// if !vertexNode.topologicallySorted { /// dfs(vertexNode, result) /// vertexNode.topologicallySorted = true diff --git a/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift b/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift index a502adc0..c4c466e5 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift @@ -176,31 +176,6 @@ public extension Megrez { } } -public extension Megrez.Compositor { - /// 節錨。在 Gramambular 2 當中又被稱為「NodeInSpan」。 - struct NodeAnchor: Hashable { - /// 節點。 - let node: Megrez.Node - /// 幅位座標。 - let spanIndex: Int - /// 幅位長度。 - var spanLength: Int { node.spanLength } - /// 單元圖陣列。 - var unigrams: [Megrez.Unigram] { node.unigrams } - /// 索引鍵陣列。 - var keyArray: [String] { node.keyArray } - /// 給出該節點內部單元圖陣列內目前被索引位置所指向的單元圖的資料值。 - var value: String { node.value } - - /// 做為預設雜湊函式。 - /// - Parameter hasher: 目前物件的雜湊碼。 - public func hash(into hasher: inout Hasher) { - hasher.combine(node) - hasher.combine(spanIndex) - } - } -} - // MARK: - Array Extensions. public extension Array where Element == Megrez.Node { diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezImplForTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezImplForTests.swift new file mode 100644 index 00000000..4cba0c8e --- /dev/null +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezImplForTests.swift @@ -0,0 +1,40 @@ +// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License). +// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +import Megrez + +// MARK: - Megrez Extensions for Test Purposes Only. + +public extension Megrez.Compositor { + /// 返回在當前位置的所有候選字詞(以詞音配對的形式)。如果組字器內有幅位、且游標 + /// 位於組字器的(文字輸入順序的)最前方(也就是游標位置的數值是最大合規數值)的 + /// 話,那麼這裡會用到 location - 1、以免去在呼叫該函式後再處理的麻煩。 + /// - Remark: 該函式已被淘汰,因為有「無法徹底清除 node-crossing 內容」的故障。 + /// 現僅用於單元測試、以確認其繼任者是否有給出所有該給出的正常結果。 + /// - Parameter location: 游標位置。 + /// - Returns: 候選字音配對陣列。 + func fetchCandidatesDeprecated(at location: Int, filter: CandidateFetchFilter = .all) -> [Megrez.KeyValuePaired] { + var result = [Megrez.KeyValuePaired]() + guard !keys.isEmpty else { return result } + let location = max(min(location, keys.count - 1), 0) // 防呆 + let anchors: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: location) + let keyAtCursor = keys[location] + anchors.map(\.node).forEach { theNode in + theNode.unigrams.forEach { gram in + switch filter { + case .all: + // 得加上這道篩選,不然會出現很多無效結果。 + if !theNode.keyArray.contains(keyAtCursor) { return } + case .beginAt: + if theNode.keyArray[0] != keyAtCursor { return } + case .endAt: + if theNode.keyArray.reversed()[0] != keyAtCursor { return } + } + result.append(.init(keyArray: theNode.keyArray, value: gram.value)) + } + } + return result + } +} diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift index 97764791..1315fd01 100644 --- a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift @@ -522,7 +522,7 @@ final class MegrezTests: XCTestCase { XCTAssertEqual(result.values, ["高熱", "🔥", "危險"]) } - func test20_Compositor_updateUnigramData() throws { + func test20_Compositor_UpdateUnigramData() throws { let theLM = SimpleLM(input: strSampleData) var compositor = Megrez.Compositor(with: theLM) compositor.separator = "" @@ -547,7 +547,7 @@ final class MegrezTests: XCTestCase { XCTAssertEqual(newResult2, ["年", "中"]) } - func test21_Compositor_hardCopy() throws { + func test21_Compositor_HardCopy() throws { let theLM = SimpleLM(input: strSampleData) let rawReadings = "gao1 ke1 ji4 gong1 si1 de5 nian2 zhong1 jiang3 jin1" var compositorA = Megrez.Compositor(with: theLM) @@ -580,4 +580,27 @@ final class MegrezTests: XCTestCase { d = compositor.fetchCandidates(at: 2, filter: .endAt).map(\.keyArray.count).max() ?? 0 XCTAssertEqual("\(a) \(b) \(c) \(d)", "1 1 2 2") } + + func test23_Compositor_CheckGetCandidates() throws { + let theLM = SimpleLM(input: strSampleData) + let rawReadings = "gao1 ke1 ji4 gong1 si1 de5 nian2 zhong1 jiang3 jin1" + var compositor = Megrez.Compositor(with: theLM) + rawReadings.split(separator: " ").forEach { key in + compositor.insertKey(key.description) + } + var stack1A = [String]() + var stack1B = [String]() + var stack2A = [String]() + var stack2B = [String]() + for i in 0 ... compositor.keys.count { + stack1A.append(compositor.fetchCandidates(at: i, filter: .beginAt).map(\.value).joined(separator: "-")) + stack1B.append(compositor.fetchCandidates(at: i, filter: .endAt).map(\.value).joined(separator: "-")) + stack2A.append(compositor.fetchCandidatesDeprecated(at: i, filter: .beginAt).map(\.value).joined(separator: "-")) + stack2B.append(compositor.fetchCandidatesDeprecated(at: i, filter: .endAt).map(\.value).joined(separator: "-")) + } + stack1B.removeFirst() + stack2B.removeLast() + XCTAssertEqual(stack1A, stack2A) + XCTAssertEqual(stack1B, stack2B) + } }