From 87e39bf943f356e786ffd760051f617405cc5132 Mon Sep 17 00:00:00 2001
From: ShikiSuen <shikisuen@outlook.com>
Date: Mon, 30 May 2022 15:38:19 +0800
Subject: [PATCH] Megrez v1.1.8 // Add nodesBeginningAt().

---
 .../Megrez/1_BlockReadingBuilder.swift        | 217 +++++++++++-------
 .../LanguageParsers/Megrez/2_Grid.swift       |  59 ++++-
 .../LanguageParsers/Megrez/3_NodeAnchor.swift |  16 +-
 .../LanguageParsers/Megrez/3_Span.swift       |   4 +-
 .../LanguageParsers/Megrez/4_Node.swift       |   6 +-
 5 files changed, 198 insertions(+), 104 deletions(-)

diff --git a/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift b/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift
index 992aa068..c8f5f1e4 100644
--- a/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift
+++ b/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift
@@ -26,8 +26,8 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 extension Megrez {
   /// 分節讀音槽。
   public class BlockReadingBuilder {
-    /// 該分節讀音曹內可以允許的最大詞長。
-    private var mutMaximumBuildSpanLength = 10
+    /// 給被丟掉的節點路徑施加的負權重。
+    private let kDroppedPathScore: Double = -999
     /// 該分節讀音槽的游標位置。
     private var mutCursorIndex: Int = 0
     /// 該分節讀音槽的讀音陣列。
@@ -37,6 +37,8 @@ extension Megrez {
     /// 該分節讀音槽所使用的語言模型。
     private var mutLM: LanguageModel
 
+    /// 公開該分節讀音槽內可以允許的最大詞長。
+    public var maxBuildSpanLength: Int { mutGrid.maxBuildSpanLength }
     /// 公開：多字讀音鍵當中用以分割漢字讀音的記號，預設為空。
     public var joinSeparator: String = ""
     /// 公開：該分節讀音槽的游標位置。
@@ -55,11 +57,11 @@ extension Megrez {
     /// 分節讀音槽。
     /// - Parameters:
     ///   - lm: 語言模型。可以是任何基於 Megrez.LanguageModel 的衍生型別。
-    ///   - length: 指定該分節讀音曹內可以允許的最大詞長，預設為 10 字。
+    ///   - length: 指定該分節讀音槽內可以允許的最大詞長，預設為 10 字。
     ///   - separator: 多字讀音鍵當中用以分割漢字讀音的記號，預設為空。
     public init(lm: LanguageModel, length: Int = 10, separator: String = "") {
       mutLM = lm
-      mutMaximumBuildSpanLength = length
+      mutGrid = .init(spanLength: abs(length))  // 防呆
       joinSeparator = separator
     }
 
@@ -112,6 +114,7 @@ extension Megrez {
     /// 用於輸入法組字區長度上限處理：
     /// 將該位置要溢出的敲字內容遞交之後、再執行這個函數。
     @discardableResult public func removeHeadReadings(count: Int) -> Bool {
+      let count = abs(count)  // 防呆
       if count > length {
         return false
       }
@@ -120,8 +123,10 @@ extension Megrez {
         if mutCursorIndex > 0 {
           mutCursorIndex -= 1
         }
-        mutReadings.removeFirst()
-        mutGrid.shrinkGridByOneAt(location: 0)
+        if !mutReadings.isEmpty {
+          mutReadings.removeFirst()
+          mutGrid.shrinkGridByOneAt(location: 0)
+        }
         build()
       }
 
@@ -131,23 +136,22 @@ extension Megrez {
     // MARK: - Walker
 
     /// 對已給定的軌格按照給定的位置與條件進行正向爬軌。
-    ///
-    /// 其實就是將反向爬軌的結果顛倒順序再給出來而已，省得使用者自己再顛倒一遍。
     /// - Parameters:
     ///   - at: 開始爬軌的位置。
     ///   - score: 給定累計權重，非必填參數。預設值為 0。
-    ///   - nodesLimit: 限定最多只爬多少個節點。
-    ///   - balanced: 啟用平衡權重，在節點權重的基礎上根據節點幅位長度來加權。
+    ///   - joinedPhrase: 用以統計累計長詞的內部參數，請勿主動使用。
+    ///   - longPhrases: 用以統計累計長詞的內部參數，請勿主動使用。
     public func walk(
-      at location: Int,
+      at location: Int = 0,
       score accumulatedScore: Double = 0.0,
-      nodesLimit: Int = 0,
-      balanced: Bool = false
+      joinedPhrase: String = "",
+      longPhrases: [String] = .init()
     ) -> [NodeAnchor] {
-      Array(
+      let newLocation = (mutGrid.width) - abs(location)  // 防呆
+      return Array(
         reverseWalk(
-          at: location, score: accumulatedScore,
-          nodesLimit: nodesLimit, balanced: balanced
+          at: newLocation, score: accumulatedScore,
+          joinedPhrase: joinedPhrase, longPhrases: longPhrases
         ).reversed())
     }
 
@@ -155,91 +159,125 @@ extension Megrez {
     /// - Parameters:
     ///   - at: 開始爬軌的位置。
     ///   - score: 給定累計權重，非必填參數。預設值為 0。
-    ///   - nodesLimit: 限定最多只爬多少個節點。
-    ///   - balanced: 啟用平衡權重，在節點權重的基礎上根據節點幅位長度來加權。
+    ///   - joinedPhrase: 用以統計累計長詞的內部參數，請勿主動使用。
+    ///   - longPhrases: 用以統計累計長詞的內部參數，請勿主動使用。
     public func reverseWalk(
       at location: Int,
       score accumulatedScore: Double = 0.0,
-      nodesLimit: Int = 0,
-      balanced: Bool = false
+      joinedPhrase: String = "",
+      longPhrases: [String] = .init()
     ) -> [NodeAnchor] {
+      let location = abs(location)  // 防呆
       if location == 0 || location > mutGrid.width {
-        return [] as [NodeAnchor]
+        return .init()
       }
 
-      var paths: [[NodeAnchor]] = []
-      var nodes: [NodeAnchor] = mutGrid.nodesEndingAt(location: location)
+      var paths = [[NodeAnchor]]()
+      var nodes = mutGrid.nodesEndingAt(location: location)
 
-      if balanced {
-        nodes.sort {
-          $0.balancedScore > $1.balancedScore
-        }
+      nodes = nodes.stableSorted {
+        $0.scoreForSort > $1.scoreForSort
       }
 
-      for (i, n) in nodes.enumerated() {
-        // 只檢查前 X 個 NodeAnchor 是否有 node。
-        // 這裡有 abs 是為了防止有白癡填負數。
-        if abs(nodesLimit) > 0, i == abs(nodesLimit) {
-          break
-        }
-
-        var n = n
-        guard let nNode = n.node else {
-          continue
-        }
-
-        n.accumulatedScore = accumulatedScore + nNode.score
-
-        // 利用幅位長度來決定權重。
-        // 這樣一來，例：「再見」比「在」與「見」的權重更高。
-        if balanced {
-          n.accumulatedScore += n.additionalWeights
-        }
-
-        var path: [NodeAnchor] = reverseWalk(
-          at: location - n.spanningLength,
-          score: n.accumulatedScore
-        )
-
-        path.insert(n, at: 0)
-
+      // `nodes` can be empty if nothing ends at this location; `first` avoids the trap `nodes[0]` would hit.
+      if var anchorZero = nodes.first, let nodeZero = anchorZero.node, nodeZero.score >= nodeZero.kSelectedCandidateScore {
+        // The top-ranked node carries a user-selected candidate, so only the path built on it is kept.
+        anchorZero.accumulatedScore = accumulatedScore + nodeZero.score
+        var path = reverseWalk(at: location - anchorZero.spanningLength, score: anchorZero.accumulatedScore)
+        path.insert(anchorZero, at: 0)
+        paths.append(path)
-
-        // 始終使用固定的候選字詞
-        if balanced, nNode.score >= 0 {
-          break
-        }
-      }
-
-      if !paths.isEmpty {
-        if var result = paths.first {
-          for value in paths {
-            if let vLast = value.last, let rLast = result.last {
-              if vLast.accumulatedScore > rLast.accumulatedScore {
-                result = value
-              }
-            }
+      } else if !longPhrases.isEmpty {
+        var path = [NodeAnchor]()
+        for theAnchor in nodes {
+          guard let theNode = theAnchor.node else { continue }
+          var theAnchor = theAnchor
+          let joinedValue = theNode.currentKeyValue.value + joinedPhrase
+          // 如果只是一堆單漢字的節點組成了同樣的長詞的話，直接棄用這個節點路徑。
+          // 打比方說「八/月/中/秋/山/林/涼」與「八月/中秋/山林/涼」在使用者來看
+          // 是「結果等價」的，那就扔掉前者。
+          if longPhrases.contains(joinedValue) {
+            theAnchor.accumulatedScore = kDroppedPathScore
+            path.insert(theAnchor, at: 0)
+            paths.append(path)
+            continue
           }
-          return result
+          theAnchor.accumulatedScore = accumulatedScore + theNode.score
+          if joinedValue.count >= longPhrases[0].count {
+            path = reverseWalk(
+              at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: "",
+              longPhrases: .init()
+            )
+          } else {
+            path = reverseWalk(
+              at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: joinedValue,
+              longPhrases: longPhrases
+            )
+          }
+          path.insert(theAnchor, at: 0)
+          paths.append(path)
+        }
+      } else {
+        // 看看當前格位有沒有更長的候選字詞。
+        var longPhrases = [String]()
+        for theAnchor in nodes {
+          guard let theNode = theAnchor.node else { continue }
+          if theAnchor.spanningLength > 1 {
+            longPhrases.append(theNode.currentKeyValue.value)
+          }
+        }
+
+        longPhrases = longPhrases.stableSorted {
+          $0.count > $1.count
+        }
+        for theAnchor in nodes {
+          var theAnchor = theAnchor
+          guard let theNode = theAnchor.node else { continue }
+          theAnchor.accumulatedScore = accumulatedScore + theNode.score
+          var path = [NodeAnchor]()
+          if theAnchor.spanningLength > 1 {
+            path = reverseWalk(
+              at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: "",
+              longPhrases: .init()
+            )
+          } else {
+            path = reverseWalk(
+              at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore,
+              joinedPhrase: theNode.currentKeyValue.value, longPhrases: longPhrases
+            )
+          }
+          path.insert(theAnchor, at: 0)
+          paths.append(path)
         }
       }
-      return [] as [NodeAnchor]
+
+      guard !paths.isEmpty else {
+        return .init()
+      }
+
+      var result: [NodeAnchor] = paths[0]
+      for neta in paths {
+        if neta.last!.accumulatedScore > result.last!.accumulatedScore {
+          result = neta
+        }
+      }
+
+      return result
     }
 
     // MARK: - Private functions
 
     private func build() {
       let itrBegin: Int =
-        (mutCursorIndex < mutMaximumBuildSpanLength) ? 0 : mutCursorIndex - mutMaximumBuildSpanLength
-      let itrEnd: Int = min(mutCursorIndex + mutMaximumBuildSpanLength, mutReadings.count)
+        (mutCursorIndex < maxBuildSpanLength) ? 0 : mutCursorIndex - maxBuildSpanLength
+      let itrEnd: Int = min(mutCursorIndex + maxBuildSpanLength, mutReadings.count)
 
       for p in itrBegin..<itrEnd {
-        for q in 1..<mutMaximumBuildSpanLength {
+        for q in 1..<maxBuildSpanLength {
           if p + q > itrEnd {
             break
           }
-          let strSlice = mutReadings[p..<(p + q)]
-          let combinedReading: String = join(slice: strSlice, separator: joinSeparator)
+          let arrSlice = mutReadings[p..<(p + q)]
+          let combinedReading: String = join(slice: arrSlice, separator: joinSeparator)
 
           if !mutGrid.hasMatchedNode(location: p, spanningLength: q, key: combinedReading) {
             let unigrams: [Unigram] = mutLM.unigramsFor(key: combinedReading)
@@ -252,12 +290,35 @@ extension Megrez {
       }
     }
 
-    private func join(slice strSlice: ArraySlice<String>, separator: String) -> String {
+    private func join(slice arrSlice: ArraySlice<String>, separator: String) -> String {
       var arrResult: [String] = []
-      for value in strSlice {
+      for value in arrSlice {
         arrResult.append(value)
       }
       return arrResult.joined(separator: separator)
     }
   }
 }
+
+// MARK: - Stable Sort Extension
+
+// Reference: https://stackoverflow.com/a/50545761/4162914
+
+extension Sequence {
+  /// Returns the elements sorted by the given predicate, keeping the original relative order of equal elements.
+  ///
+  /// - Parameter areInIncreasingOrder: Returns `true` if its first argument should be ordered before its second.
+  /// - Returns: An array of the sequence's elements in stable sorted order.
+  func stableSorted(
+    by areInIncreasingOrder: (Element, Element) throws -> Bool
+  )
+    rethrows -> [Element]
+  {
+    try enumerated()
+      .sorted { a, b -> Bool in
+        try areInIncreasingOrder(a.element, b.element)
+          || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element))
+      }
+      .map(\.element)
+  }
+}
diff --git a/Source/Modules/LanguageParsers/Megrez/2_Grid.swift b/Source/Modules/LanguageParsers/Megrez/2_Grid.swift
index 61410e8f..3eec69ba 100644
--- a/Source/Modules/LanguageParsers/Megrez/2_Grid.swift
+++ b/Source/Modules/LanguageParsers/Megrez/2_Grid.swift
@@ -29,16 +29,23 @@ extension Megrez {
     /// 幅位陣列。
     private var mutSpans: [Megrez.Span]
 
+    /// 該幅位內可以允許的最大詞長。
+    private var mutMaxBuildSpanLength = 10
+
+    /// 公開：該幅位內可以允許的最大詞長。
+    public var maxBuildSpanLength: Int { mutMaxBuildSpanLength }
+
     /// 軌格的寬度，也就是其內的幅位陣列當中的幅位數量。
     var width: Int { mutSpans.count }
 
-    public init() {
+    public init(spanLength: Int = 10) {
+      mutMaxBuildSpanLength = max(1, abs(spanLength))  // Clamp to >= 1: `1...maxBuildSpanLength` traps otherwise.
       mutSpans = [Megrez.Span]()
     }
 
     /// 自我清空該軌格的內容。
     public func clear() {
-      mutSpans = [Megrez.Span]()
+      mutSpans.removeAll()
     }
 
     /// 往該軌格的指定位置插入指定幅位長度的指定節點。
@@ -47,6 +54,8 @@ extension Megrez {
     ///   - location: 位置。
     ///   - spanningLength: 給定的幅位長度。
     public func insertNode(node: Node, location: Int, spanningLength: Int) {
+      let location = abs(location)  // 防呆
+      let spanningLength = abs(spanningLength)  // 防呆
       if location >= mutSpans.count {
         let diff = location - mutSpans.count + 1
         for _ in 0..<diff {
@@ -62,24 +71,26 @@ extension Megrez {
     ///   - spanningLength: 給定的幅位長度。
     ///   - key: 索引鍵。
     public func hasMatchedNode(location: Int, spanningLength: Int, key: String) -> Bool {
+      let location = abs(location)  // 防呆
+      let spanningLength = abs(spanningLength)  // 防呆
       if location > mutSpans.count {
         return false
       }
 
       let n = mutSpans[location].node(length: spanningLength)
-      return n == nil ? false : key == n?.key
+      return n != nil && key == n?.key
     }
 
     /// 在該軌格的指定位置擴增一個幅位。
     /// - Parameters:
     ///   - location: 位置。
     public func expandGridByOneAt(location: Int) {
-      // 這裡加入 abs 完全是一個防呆設計
-      mutSpans.insert(Span(), at: abs(location))
-      if location != 0, abs(location) != mutSpans.count {
-        for i in 0..<abs(location) {
+      let location = abs(location)  // 防呆
+      mutSpans.insert(Span(), at: location)
+      if location != 0, location != mutSpans.count {
+        for i in 0..<location {
           // zaps overlapping spans
-          mutSpans[i].removeNodeOfLengthGreaterThan(abs(location) - i)
+          mutSpans[i].removeNodeOfLengthGreaterThan(location - i)
         }
       }
     }
@@ -88,6 +99,7 @@ extension Megrez {
     /// - Parameters:
     ///   - location: 位置。
     public func shrinkGridByOneAt(location: Int) {
+      let location = abs(location)  // 防呆
       if location >= mutSpans.count {
         return
       }
@@ -99,11 +111,35 @@ extension Megrez {
       }
     }
 
+    /// 給定位置，枚舉出所有在這個位置開始的節點。
+    /// - Parameters:
+    ///   - location: 位置。
+    public func nodesBeginningAt(location: Int) -> [NodeAnchor] {
+      let location = abs(location)  // 防呆
+      var results = [NodeAnchor]()
+      if location < mutSpans.count {  // 此時 mutSpans 必然不為空
+        let span = mutSpans[location]
+        for i in 1...maxBuildSpanLength {
+          if let np = span.node(length: i) {
+            results.append(
+              NodeAnchor(
+                node: np,
+                location: location,
+                spanningLength: i
+              )
+            )
+          }
+        }
+      }
+      return results
+    }
+
     /// 給定位置，枚舉出所有在這個位置結尾的節點。
     /// - Parameters:
     ///   - location: 位置。
     public func nodesEndingAt(location: Int) -> [NodeAnchor] {
-      var results: [NodeAnchor] = []
+      let location = abs(location)  // 防呆
+      var results = [NodeAnchor]()
       if !mutSpans.isEmpty, location <= mutSpans.count {
         for i in 0..<location {
           let span = mutSpans[i]
@@ -127,7 +163,8 @@ extension Megrez {
     /// - Parameters:
     ///   - location: 位置。
     public func nodesCrossingOrEndingAt(location: Int) -> [NodeAnchor] {
-      var results: [NodeAnchor] = []
+      let location = abs(location)  // 防呆
+      var results = [NodeAnchor]()
       if !mutSpans.isEmpty, location <= mutSpans.count {
         for i in 0..<location {
           let span = mutSpans[i]
@@ -157,6 +194,7 @@ extension Megrez {
     ///   - location: 位置。
     ///   - value: 給定字串。
     @discardableResult public func fixNodeSelectedCandidate(location: Int, value: String) -> NodeAnchor {
+      let location = abs(location)  // 防呆
       var node = NodeAnchor()
       for nodeAnchor in nodesCrossingOrEndingAt(location: location) {
         guard let theNode = nodeAnchor.node else {
@@ -182,6 +220,7 @@ extension Megrez {
     ///   - value: 給定字串。
     ///   - overridingScore: 給定權重數值。
     public func overrideNodeScoreForSelectedCandidate(location: Int, value: String, overridingScore: Double) {
+      let location = abs(location)  // 防呆
       for nodeAnchor in nodesCrossingOrEndingAt(location: location) {
         guard let theNode = nodeAnchor.node else {
           continue
diff --git a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift
index 11b47258..4cdaa64b 100644
--- a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift
+++ b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift
@@ -52,19 +52,9 @@ extension Megrez {
       return stream
     }
 
-    /// 獲取加權量。
-    public var additionalWeights: Double {
-      (Double(spanningLength) - 1) * 0.75
-    }
-
-    /// 獲取平衡權重。
-    public var balancedScore: Double {
-      (node?.score ?? 0) + additionalWeights
-    }
-
-    /// 獲取平衡累計權重。
-    public var balancedAccumulatedScore: Double {
-      accumulatedScore + additionalWeights
+    /// 獲取用來比較的權重。
+    public var scoreForSort: Double {
+      node?.score ?? 0
     }
   }
 }
diff --git a/Source/Modules/LanguageParsers/Megrez/3_Span.swift b/Source/Modules/LanguageParsers/Megrez/3_Span.swift
index d99238ad..6ea9d45a 100644
--- a/Source/Modules/LanguageParsers/Megrez/3_Span.swift
+++ b/Source/Modules/LanguageParsers/Megrez/3_Span.swift
@@ -47,6 +47,7 @@ extension Megrez {
     ///   - node: 節點。
     ///   - length: 給定的節點長度。
     mutating func insert(node: Node, length: Int) {
+      let length = abs(length)  // 防呆
       mutLengthNodeMap[length] = node
       if length > mutMaximumLength {
         mutMaximumLength = length
@@ -57,6 +58,7 @@ extension Megrez {
     /// - Parameters:
     ///   - length: 給定的節點長度。
     mutating func removeNodeOfLengthGreaterThan(_ length: Int) {
+      let length = abs(length)  // 防呆
       if length > mutMaximumLength { return }
       var max = 0
       var removalList: [Int: Megrez.Node] = [:]
@@ -79,7 +81,7 @@ extension Megrez {
     /// - Parameters:
     ///   - length: 給定的節點長度。
     public func node(length: Int) -> Node? {
-      mutLengthNodeMap[length]
+      mutLengthNodeMap[abs(length)]  // 防呆
     }
   }
 }
diff --git a/Source/Modules/LanguageParsers/Megrez/4_Node.swift b/Source/Modules/LanguageParsers/Megrez/4_Node.swift
index 01f6c6d9..813cc30c 100644
--- a/Source/Modules/LanguageParsers/Megrez/4_Node.swift
+++ b/Source/Modules/LanguageParsers/Megrez/4_Node.swift
@@ -47,7 +47,7 @@ extension Megrez {
     /// 用來登記「當前選中的單元圖」的索引值的變數。
     private var mutSelectedUnigramIndex: Int = 0
     /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。
-    private let kSelectedCandidateScore: Double = 99
+    public let kSelectedCandidateScore: Double = 99
     /// 將當前節點列印成一個字串。
     public var description: String {
       "(node,key:\(mutKey),fixed:\(mutCandidateFixed ? "true" : "false"),selected:\(mutSelectedUnigramIndex),\(mutUnigrams))"
@@ -84,7 +84,7 @@ extension Megrez {
         $0.score > $1.score
       }
 
-      if mutUnigrams.count > 0 {
+      if !mutUnigrams.isEmpty {
         mutScore = mutUnigrams[0].score
       }
 
@@ -133,6 +133,7 @@ extension Megrez {
     ///   - index: 索引位置。
     ///   - fix: 是否將當前解點標記為「候選詞已鎖定」的狀態。
     public func selectCandidateAt(index: Int = 0, fix: Bool = false) {
+      let index = abs(index)
       mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index
       mutCandidateFixed = fix
       mutScore = kSelectedCandidateScore
@@ -152,6 +153,7 @@ extension Megrez {
     ///   - index: 索引位置。
     ///   - score: 給定權重條件。
     public func selectFloatingCandidateAt(index: Int, score: Double) {
+      let index = abs(index)  // 防呆
       mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index
       mutCandidateFixed = false
       mutScore = score