From 87e39bf943f356e786ffd760051f617405cc5132 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Mon, 30 May 2022 15:38:19 +0800 Subject: [PATCH] Megrez v1.1.8 // Add nodesBeginningAt(). --- .../Megrez/1_BlockReadingBuilder.swift | 217 +++++++++++------- .../LanguageParsers/Megrez/2_Grid.swift | 59 ++++- .../LanguageParsers/Megrez/3_NodeAnchor.swift | 16 +- .../LanguageParsers/Megrez/3_Span.swift | 4 +- .../LanguageParsers/Megrez/4_Node.swift | 6 +- 5 files changed, 198 insertions(+), 104 deletions(-) diff --git a/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift b/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift index 992aa068..c8f5f1e4 100644 --- a/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift +++ b/Source/Modules/LanguageParsers/Megrez/1_BlockReadingBuilder.swift @@ -26,8 +26,8 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extension Megrez { /// 分節讀音槽。 public class BlockReadingBuilder { - /// 該分節讀音曹內可以允許的最大詞長。 - private var mutMaximumBuildSpanLength = 10 + /// 給被丟掉的節點路徑施加的負權重。 + private let kDroppedPathScore: Double = -999 /// 該分節讀音槽的游標位置。 private var mutCursorIndex: Int = 0 /// 該分節讀音槽的讀音陣列。 @@ -37,6 +37,8 @@ extension Megrez { /// 該分節讀音槽所使用的語言模型。 private var mutLM: LanguageModel + /// 公開該分節讀音槽內可以允許的最大詞長。 + public var maxBuildSpanLength: Int { mutGrid.maxBuildSpanLength } /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 public var joinSeparator: String = "" /// 公開:該分節讀音槽的游標位置。 @@ -55,11 +57,11 @@ extension Megrez { /// 分節讀音槽。 /// - Parameters: /// - lm: 語言模型。可以是任何基於 Megrez.LanguageModel 的衍生型別。 - /// - length: 指定該分節讀音曹內可以允許的最大詞長,預設為 10 字。 + /// - length: 指定該分節讀音槽內可以允許的最大詞長,預設為 10 字。 /// - separator: 多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 public init(lm: LanguageModel, length: Int = 10, separator: String = "") { mutLM = lm - mutMaximumBuildSpanLength = length + mutGrid = .init(spanLength: abs(length)) // 防呆 joinSeparator = separator } @@ -112,6 +114,7 @@ extension Megrez { /// 用於輸入法組字區長度上限處理: /// 將該位置要溢出的敲字內容遞交之後、再執行這個函數。 @discardableResult public func removeHeadReadings(count: Int) -> Bool { + let count = abs(count) // 防呆 if count > length { return false } @@ -120,8 +123,10 @@ extension Megrez { if mutCursorIndex > 0 { mutCursorIndex -= 1 } - mutReadings.removeFirst() - mutGrid.shrinkGridByOneAt(location: 0) + if !mutReadings.isEmpty { + mutReadings.removeFirst() + mutGrid.shrinkGridByOneAt(location: 0) + } build() } @@ -131,23 +136,22 @@ extension Megrez { // MARK: - Walker /// 對已給定的軌格按照給定的位置與條件進行正向爬軌。 - /// - /// 其實就是將反向爬軌的結果顛倒順序再給出來而已,省得使用者自己再顛倒一遍。 /// - Parameters: /// - at: 開始爬軌的位置。 /// - score: 給定累計權重,非必填參數。預設值為 0。 - /// - nodesLimit: 限定最多只爬多少個節點。 - /// - balanced: 啟用平衡權重,在節點權重的基礎上根據節點幅位長度來加權。 + /// - joinedPhrase: 用以統計累計長詞的內部參數,請勿主動使用。 + /// - longPhrases: 用以統計累計長詞的內部參數,請勿主動使用。 public func walk( - at location: Int, + at location: Int = 0, score accumulatedScore: Double = 0.0, - nodesLimit: Int = 0, - balanced: Bool = false + joinedPhrase: String = "", + longPhrases: [String] = .init() ) -> [NodeAnchor] { - Array( + let newLocation = (mutGrid.width) - abs(location) // 防呆 + return Array( reverseWalk( - at: location, score: accumulatedScore, - nodesLimit: nodesLimit, balanced: balanced + at: newLocation, score: accumulatedScore, + joinedPhrase: joinedPhrase, longPhrases: longPhrases ).reversed()) } @@ -155,91 +159,125 @@ extension Megrez { /// - Parameters: /// - at: 開始爬軌的位置。 /// - score: 給定累計權重,非必填參數。預設值為 0。 - /// - nodesLimit: 限定最多只爬多少個節點。 - /// - balanced: 啟用平衡權重,在節點權重的基礎上根據節點幅位長度來加權。 + /// - joinedPhrase: 用以統計累計長詞的內部參數,請勿主動使用。 + /// - longPhrases: 用以統計累計長詞的內部參數,請勿主動使用。 public func reverseWalk( at location: Int, score accumulatedScore: Double = 0.0, - nodesLimit: Int = 0, - balanced: Bool = false + joinedPhrase: String = "", + longPhrases: [String] = .init() ) -> [NodeAnchor] { + let location = abs(location) // 防呆 if location == 0 || location > mutGrid.width { - return [] as [NodeAnchor] + return .init() } - var paths: [[NodeAnchor]] = [] - var nodes: [NodeAnchor] = mutGrid.nodesEndingAt(location: location) + var paths = [[NodeAnchor]]() + var nodes = mutGrid.nodesEndingAt(location: location) - if balanced { - nodes.sort { - $0.balancedScore > $1.balancedScore - } + nodes = nodes.stableSorted { + $0.scoreForSort > $1.scoreForSort } - for (i, n) in nodes.enumerated() { - // 只檢查前 X 個 NodeAnchor 是否有 node。 - // 這裡有 abs 是為了防止有白癡填負數。 - if abs(nodesLimit) > 0, i == abs(nodesLimit) { - break - } - - var n = n - guard let nNode = n.node else { - continue - } - - n.accumulatedScore = accumulatedScore + nNode.score - - // 利用幅位長度來決定權重。 - // 這樣一來,例:「再見」比「在」與「見」的權重更高。 - if balanced { - n.accumulatedScore += n.additionalWeights - } - - var path: [NodeAnchor] = reverseWalk( - at: location - n.spanningLength, - score: n.accumulatedScore - ) - - path.insert(n, at: 0) - + if let nodeOfNodeZero = nodes[0].node, nodeOfNodeZero.score >= nodeOfNodeZero.kSelectedCandidateScore { + // 在使用者有選過候選字詞的情況下,摒棄非依此據而成的節點路徑。 + var nodeZero = nodes[0] + nodeZero.accumulatedScore = accumulatedScore + nodeOfNodeZero.score + var path: [NodeAnchor] = reverseWalk(at: location - nodeZero.spanningLength, score: nodeZero.accumulatedScore) + path.insert(nodeZero, at: 0) paths.append(path) - - // 始終使用固定的候選字詞 - if balanced, nNode.score >= 0 { - break - } - } - - if !paths.isEmpty { - if var result = paths.first { - for value in paths { - if let vLast = value.last, let rLast = result.last { - if vLast.accumulatedScore > rLast.accumulatedScore { - result = value - } - } + } else if !longPhrases.isEmpty { + var path = [NodeAnchor]() + for theAnchor in nodes { + guard let theNode = theAnchor.node else { continue } + var theAnchor = theAnchor + let joinedValue = theNode.currentKeyValue.value + joinedPhrase + // 如果只是一堆單漢字的節點組成了同樣的長詞的話,直接棄用這個節點路徑。 + // 打比方說「八/月/中/秋/山/林/涼」與「八月/中秋/山林/涼」在使用者來看 + // 是「結果等價」的,那就扔掉前者。 + if longPhrases.contains(joinedValue) { + theAnchor.accumulatedScore = kDroppedPathScore + path.insert(theAnchor, at: 0) + paths.append(path) + continue } - return result + theAnchor.accumulatedScore = accumulatedScore + theNode.score + if joinedValue.count >= longPhrases[0].count { + path = reverseWalk( + at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: "", + longPhrases: .init() + ) + } else { + path = reverseWalk( + at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: joinedValue, + longPhrases: longPhrases + ) + } + path.insert(theAnchor, at: 0) + paths.append(path) + } + } else { + // 看看當前格位有沒有更長的候選字詞。 + var longPhrases = [String]() + for theAnchor in nodes { + guard let theNode = theAnchor.node else { continue } + if theAnchor.spanningLength > 1 { + longPhrases.append(theNode.currentKeyValue.value) + } + } + + longPhrases = longPhrases.stableSorted { + $0.count > $1.count + } + for theAnchor in nodes { + var theAnchor = theAnchor + guard let theNode = theAnchor.node else { continue } + theAnchor.accumulatedScore = accumulatedScore + theNode.score + var path = [NodeAnchor]() + if theAnchor.spanningLength > 1 { + path = reverseWalk( + at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, joinedPhrase: "", + longPhrases: .init() + ) + } else { + path = reverseWalk( + at: location - theAnchor.spanningLength, score: theAnchor.accumulatedScore, + joinedPhrase: theNode.currentKeyValue.value, longPhrases: longPhrases + ) + } + path.insert(theAnchor, at: 0) + paths.append(path) } } - return [] as [NodeAnchor] + + guard !paths.isEmpty else { + return .init() + } + + var result: [NodeAnchor] = paths[0] + for neta in paths { + if neta.last!.accumulatedScore > result.last!.accumulatedScore { + result = neta + } + } + + return result } // MARK: - Private functions private func build() { let itrBegin: Int = - (mutCursorIndex < mutMaximumBuildSpanLength) ? 0 : mutCursorIndex - mutMaximumBuildSpanLength - let itrEnd: Int = min(mutCursorIndex + mutMaximumBuildSpanLength, mutReadings.count) + (mutCursorIndex < maxBuildSpanLength) ? 0 : mutCursorIndex - maxBuildSpanLength + let itrEnd: Int = min(mutCursorIndex + maxBuildSpanLength, mutReadings.count) for p in itrBegin.. itrEnd { break } - let strSlice = mutReadings[p..<(p + q)] - let combinedReading: String = join(slice: strSlice, separator: joinSeparator) + let arrSlice = mutReadings[p..<(p + q)] + let combinedReading: String = join(slice: arrSlice, separator: joinSeparator) if !mutGrid.hasMatchedNode(location: p, spanningLength: q, key: combinedReading) { let unigrams: [Unigram] = mutLM.unigramsFor(key: combinedReading) @@ -252,12 +290,35 @@ extension Megrez { } } - private func join(slice strSlice: ArraySlice, separator: String) -> String { + private func join(slice arrSlice: ArraySlice, separator: String) -> String { var arrResult: [String] = [] - for value in strSlice { + for value in arrSlice { arrResult.append(value) } return arrResult.joined(separator: separator) } } } + +// MARK: - Stable Sort Extension + +// Reference: https://stackoverflow.com/a/50545761/4162914 + +extension Sequence { + /// Return a stable-sorted collection. + /// + /// - Parameter areInIncreasingOrder: Return nil when two element are equal. + /// - Returns: The sorted collection. + func stableSorted( + by areInIncreasingOrder: (Element, Element) throws -> Bool + ) + rethrows -> [Element] + { + try enumerated() + .sorted { a, b -> Bool in + try areInIncreasingOrder(a.element, b.element) + || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) + } + .map(\.element) + } +} diff --git a/Source/Modules/LanguageParsers/Megrez/2_Grid.swift b/Source/Modules/LanguageParsers/Megrez/2_Grid.swift index 61410e8f..3eec69ba 100644 --- a/Source/Modules/LanguageParsers/Megrez/2_Grid.swift +++ b/Source/Modules/LanguageParsers/Megrez/2_Grid.swift @@ -29,16 +29,23 @@ extension Megrez { /// 幅位陣列。 private var mutSpans: [Megrez.Span] + /// 該幅位內可以允許的最大詞長。 + private var mutMaxBuildSpanLength = 10 + + /// 公開:該幅位內可以允許的最大詞長。 + public var maxBuildSpanLength: Int { mutMaxBuildSpanLength } + /// 軌格的寬度,也就是其內的幅位陣列當中的幅位數量。 var width: Int { mutSpans.count } - public init() { + public init(spanLength: Int = 10) { + mutMaxBuildSpanLength = spanLength mutSpans = [Megrez.Span]() } /// 自我清空該軌格的內容。 public func clear() { - mutSpans = [Megrez.Span]() + mutSpans.removeAll() } /// 往該軌格的指定位置插入指定幅位長度的指定節點。 @@ -47,6 +54,8 @@ extension Megrez { /// - location: 位置。 /// - spanningLength: 給定的幅位長度。 public func insertNode(node: Node, location: Int, spanningLength: Int) { + let location = abs(location) // 防呆 + let spanningLength = abs(spanningLength) // 防呆 if location >= mutSpans.count { let diff = location - mutSpans.count + 1 for _ in 0.. Bool { + let location = abs(location) // 防呆 + let spanningLength = abs(spanningLength) // 防呆 if location > mutSpans.count { return false } let n = mutSpans[location].node(length: spanningLength) - return n == nil ? false : key == n?.key + return n != nil && key == n?.key } /// 在該軌格的指定位置擴增一個幅位。 /// - Parameters: /// - location: 位置。 public func expandGridByOneAt(location: Int) { - // 這裡加入 abs 完全是一個防呆設計 - mutSpans.insert(Span(), at: abs(location)) - if location != 0, abs(location) != mutSpans.count { - for i in 0..= mutSpans.count { return } @@ -99,11 +111,35 @@ extension Megrez { } } + /// 給定位置,枚舉出所有在這個位置開始的節點。 + /// - Parameters: + /// - location: 位置。 + public func nodesBeginningAt(location: Int) -> [NodeAnchor] { + let location = abs(location) // 防呆 + var results = [NodeAnchor]() + if location < mutSpans.count { // 此時 mutSpans 必然不為空 + let span = mutSpans[location] + for i in 1...maxBuildSpanLength { + if let np = span.node(length: i) { + results.append( + NodeAnchor( + node: np, + location: location, + spanningLength: i + ) + ) + } + } + } + return results + } + /// 給定位置,枚舉出所有在這個位置結尾的節點。 /// - Parameters: /// - location: 位置。 public func nodesEndingAt(location: Int) -> [NodeAnchor] { - var results: [NodeAnchor] = [] + let location = abs(location) // 防呆 + var results = [NodeAnchor]() if !mutSpans.isEmpty, location <= mutSpans.count { for i in 0.. [NodeAnchor] { - var results: [NodeAnchor] = [] + let location = abs(location) // 防呆 + var results = [NodeAnchor]() if !mutSpans.isEmpty, location <= mutSpans.count { for i in 0.. NodeAnchor { + let location = abs(location) // 防呆 var node = NodeAnchor() for nodeAnchor in nodesCrossingOrEndingAt(location: location) { guard let theNode = nodeAnchor.node else { @@ -182,6 +220,7 @@ extension Megrez { /// - value: 給定字串。 /// - overridingScore: 給定權重數值。 public func overrideNodeScoreForSelectedCandidate(location: Int, value: String, overridingScore: Double) { + let location = abs(location) // 防呆 for nodeAnchor in nodesCrossingOrEndingAt(location: location) { guard let theNode = nodeAnchor.node else { continue diff --git a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift index 11b47258..4cdaa64b 100644 --- a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift +++ b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift @@ -52,19 +52,9 @@ extension Megrez { return stream } - /// 獲取加權量。 - public var additionalWeights: Double { - (Double(spanningLength) - 1) * 0.75 - } - - /// 獲取平衡權重。 - public var balancedScore: Double { - (node?.score ?? 0) + additionalWeights - } - - /// 獲取平衡累計權重。 - public var balancedAccumulatedScore: Double { - accumulatedScore + additionalWeights + /// 獲取用來比較的權重。 + public var scoreForSort: Double { + node?.score ?? 0 } } } diff --git a/Source/Modules/LanguageParsers/Megrez/3_Span.swift b/Source/Modules/LanguageParsers/Megrez/3_Span.swift index d99238ad..6ea9d45a 100644 --- a/Source/Modules/LanguageParsers/Megrez/3_Span.swift +++ b/Source/Modules/LanguageParsers/Megrez/3_Span.swift @@ -47,6 +47,7 @@ extension Megrez { /// - node: 節點。 /// - length: 給定的節點長度。 mutating func insert(node: Node, length: Int) { + let length = abs(length) // 防呆 mutLengthNodeMap[length] = node if length > mutMaximumLength { mutMaximumLength = length @@ -57,6 +58,7 @@ extension Megrez { /// - Parameters: /// - length: 給定的節點長度。 mutating func removeNodeOfLengthGreaterThan(_ length: Int) { + let length = abs(length) // 防呆 if length > mutMaximumLength { return } var max = 0 var removalList: [Int: Megrez.Node] = [:] @@ -79,7 +81,7 @@ extension Megrez { /// - Parameters: /// - length: 給定的節點長度。 public func node(length: Int) -> Node? { - mutLengthNodeMap[length] + mutLengthNodeMap[abs(length)] // 防呆 } } } diff --git a/Source/Modules/LanguageParsers/Megrez/4_Node.swift b/Source/Modules/LanguageParsers/Megrez/4_Node.swift index 01f6c6d9..813cc30c 100644 --- a/Source/Modules/LanguageParsers/Megrez/4_Node.swift +++ b/Source/Modules/LanguageParsers/Megrez/4_Node.swift @@ -47,7 +47,7 @@ extension Megrez { /// 用來登記「當前選中的單元圖」的索引值的變數。 private var mutSelectedUnigramIndex: Int = 0 /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。 - private let kSelectedCandidateScore: Double = 99 + public let kSelectedCandidateScore: Double = 99 /// 將當前節點列印成一個字串。 public var description: String { "(node,key:\(mutKey),fixed:\(mutCandidateFixed ? "true" : "false"),selected:\(mutSelectedUnigramIndex),\(mutUnigrams))" @@ -84,7 +84,7 @@ extension Megrez { $0.score > $1.score } - if mutUnigrams.count > 0 { + if !mutUnigrams.isEmpty { mutScore = mutUnigrams[0].score } @@ -133,6 +133,7 @@ extension Megrez { /// - index: 索引位置。 /// - fix: 是否將當前解點標記為「候選詞已鎖定」的狀態。 public func selectCandidateAt(index: Int = 0, fix: Bool = false) { + let index = abs(index) mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index mutCandidateFixed = fix mutScore = kSelectedCandidateScore @@ -152,6 +153,7 @@ extension Megrez { /// - index: 索引位置。 /// - score: 給定權重條件。 public func selectFloatingCandidateAt(index: Int, score: Double) { + let index = abs(index) // 防呆 mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index mutCandidateFixed = false mutScore = score