From 2063b0e50b18b51613827423679f87fe5dcf1d8e Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Thu, 16 Mar 2023 20:00:36 +0800 Subject: [PATCH] Megrez // Sanitizing node-crossed candidates. --- .../Sources/Megrez/3_KeyValuePaired.swift | 27 ++++++++++++++----- .../Sources/Megrez/4_SpanUnit.swift | 5 +++- .../Tests/MegrezTests/LMDataForTests.swift | 1 + .../Tests/MegrezTests/MegrezTests.swift | 21 +++++++++++++++ 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift index 983cd079..1b0de1ba 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift @@ -92,28 +92,41 @@ public extension Megrez.Compositor { /// 返回在當前位置的所有候選字詞(以詞音配對的形式)。如果組字器內有幅位、且游標 /// 位於組字器的(文字輸入順序的)最前方(也就是游標位置的數值是最大合規數值)的 - /// 話,那麼這裡會用到 location - 1、以免去在呼叫該函式後再處理的麻煩。 - /// - Parameter location: 游標位置。 + /// 話,那麼這裡會對 location 的位置自動減去 1、以免去在呼叫該函式後再處理的麻煩。 + /// - Parameter location: 游標位置,必須是顯示的游標位置、不得做任何事先糾偏處理。 /// - Returns: 候選字音配對陣列。 - func fetchCandidates(at location: Int, filter: CandidateFetchFilter = .all) -> [Megrez.KeyValuePaired] { + func fetchCandidates( + at givenLocation: Int? = nil, filter givenFilter: CandidateFetchFilter = .all + ) -> [Megrez.KeyValuePaired] { var result = [Megrez.KeyValuePaired]() guard !keys.isEmpty else { return result } - let location = max(min(location, keys.count - 1), 0) // 防呆 + var location = max(min(givenLocation ?? cursor, keys.count), 0) + var filter = givenFilter + if filter == .endAt { + if location == keys.count { filter = .all } + location -= 1 + } + location = max(min(location, keys.count - 1), 0) let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted { // 按照讀音的長度(幅位長度)來給節點排序。 $0.spanLength > $1.spanLength } let keyAtCursor = keys[location] - anchors.map(\.node).filter(\.keyArray.isEmpty.negative).forEach { theNode in + anchors.forEach { theAnchor in + let theNode = theAnchor.node theNode.unigrams.forEach { gram in switch filter { case .all: // 得加上這道篩選,不然會出現很多無效結果。 if !theNode.keyArray.contains(keyAtCursor) { return } case .beginAt: - if theNode.keyArray[0] != keyAtCursor { return } + guard theAnchor.spanIndex == location else { return } case .endAt: - if theNode.keyArray.reversed()[0] != keyAtCursor { return } + guard theNode.keyArray.last == keyAtCursor else { return } + switch theNode.spanLength { + case 2... where theAnchor.spanIndex + theAnchor.spanLength - 1 != location: return + default: break + } } result.append(.init(keyArray: theNode.keyArray, value: gram.value)) } diff --git a/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift b/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift index 7f46810f..60fe6834 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/4_SpanUnit.swift @@ -66,11 +66,13 @@ extension Megrez.Compositor { /// - Returns: 一個包含所有與該位置重疊的節點的陣列。 func fetchOverlappingNodes(at givenLocation: Int) -> [NodeAnchor] { var results = [NodeAnchor]() - guard !spans.isEmpty, givenLocation < spans.count else { return results } + let givenLocation = max(0, min(givenLocation, keys.count - 1)) + guard !spans.isEmpty else { return results } // 先獲取該位置的所有單字節點。 (1 ... max(spans[givenLocation].maxLength, 1)).forEach { theSpanLength in guard let node = spans[givenLocation][theSpanLength] else { return } + guard !node.keyArray.joined().isEmpty else { return } results.append(.init(node: node, spanIndex: givenLocation)) } @@ -81,6 +83,7 @@ extension Megrez.Compositor { guard A <= B else { return } (A ... B).forEach { theLength in guard let node = spans[theLocation][theLength] else { return } + guard !node.keyArray.joined().isEmpty else { return } results.append(.init(node: node, spanIndex: theLocation)) } } diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift index 70a9e61b..b79ba9a1 100644 --- a/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift @@ -173,6 +173,7 @@ nian2zhong1 年中 -11.373044 gao1ke1ji4 高科技 -9.842421 zhe4yang4 這樣 -6.000000 // Non-LibTaBE ni3zhe4 你這 -9.000000 // Non-LibTaBE +ke1ke1 顆顆 -8.000000 // Non-LibTaBE jiao4 教 -3.676169 jiao4 較 -3.24869962 jiao4yu4 教育 -3.32220565 diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift index fe641775..97764791 100644 --- a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift @@ -559,4 +559,25 @@ final class MegrezTests: XCTestCase { let resultB = compositorB.walk().walkedNodes XCTAssertEqual(resultA, resultB) } + + func test22_Compositor_SanitizingNodeCrossing() throws { + let theLM = SimpleLM(input: strSampleData) + let rawReadings = "ke1 ke1" + var compositor = Megrez.Compositor(with: theLM) + rawReadings.split(separator: " ").forEach { key in + compositor.insertKey(key.description) + } + var a = compositor.fetchCandidates(at: 1, filter: .beginAt).map(\.keyArray.count).max() ?? 0 + var b = compositor.fetchCandidates(at: 1, filter: .endAt).map(\.keyArray.count).max() ?? 0 + var c = compositor.fetchCandidates(at: 0, filter: .beginAt).map(\.keyArray.count).max() ?? 0 + var d = compositor.fetchCandidates(at: 2, filter: .endAt).map(\.keyArray.count).max() ?? 0 + XCTAssertEqual("\(a) \(b) \(c) \(d)", "1 1 2 2") + compositor.cursor = compositor.length + compositor.insertKey("jin1") + a = compositor.fetchCandidates(at: 1, filter: .beginAt).map(\.keyArray.count).max() ?? 0 + b = compositor.fetchCandidates(at: 1, filter: .endAt).map(\.keyArray.count).max() ?? 0 + c = compositor.fetchCandidates(at: 0, filter: .beginAt).map(\.keyArray.count).max() ?? 0 + d = compositor.fetchCandidates(at: 2, filter: .endAt).map(\.keyArray.count).max() ?? 0 + XCTAssertEqual("\(a) \(b) \(c) \(d)", "1 1 2 2") + } }