diff --git a/Packages/vChewing_Megrez/Sources/Megrez/1_Compositor.swift b/Packages/vChewing_Megrez/Sources/Megrez/1_Compositor.swift index 73bc5469..68ebb30e 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/1_Compositor.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/1_Compositor.swift @@ -3,8 +3,6 @@ // ==================== // This code is released under the MIT license (SPDX-License-Identifier: MIT) -import Foundation - public extension Megrez { /// 一個組字器用來在給定一系列的索引鍵的情況下(藉由一系列的觀測行為)返回一套資料值。 /// @@ -186,7 +184,7 @@ public extension Megrez { /// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。 public var dumpDOT: String { // C# StringBuilder 與 Swift NSMutableString 能提供爆發性的效能。 - let strOutput: NSMutableString = .init(string: "digraph {\ngraph [ rankdir=LR ];\nBOS;\n") + var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n" spans.enumerated().forEach { p, span in (0 ... span.maxLength).forEach { ni in guard let np = span[ni] else { return } diff --git a/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift b/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift index 366bc137..d3f771f4 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/2_Walker.swift @@ -12,73 +12,118 @@ public extension Megrez.Compositor { /// 的路徑(所以要找最大的權重),因為在對數概率下,較大的數值意味著較大的概率。 /// 對於 `G = (V, E)`,該算法的運行次數為 `O(|V|+|E|)`,其中 `G` 是一個有向無環圖。 /// 這意味著,即使軌格很大,也可以用很少的算力就可以爬軌。 + /// + /// - Remark: 利用該數學方法進行輸入法智能組句的(已知可考的)最開始的案例是 + /// 郭家寶(ByVoid)的《[基於統計語言模型的拼音輸入法](https://byvoid.com/zht/blog/slm_based_pinyin_ime/) 》; + /// 再後來則是 2022 年中時期劉燈的 Gramambular 2 組字引擎。 /// - Returns: 爬軌結果+該過程是否順利執行。 - @discardableResult mutating func walk() -> (walkedNodes: [Megrez.Node], succeeded: Bool) { - var result = [Megrez.Node]() - defer { walkedNodes = result } - guard !spans.isEmpty else { return (result, true) } + @discardableResult mutating func walk() -> [Megrez.Node] { + defer { Self.reinitVertexNetwork() } + sortAndRelax() + guard !spans.isEmpty else { return [] } + var iterated: Megrez.Node? = Megrez.Node.leadingNode + walkedNodes.removeAll() + while let itPrev = iterated?.prev { + // 此處必須得是 Copy,讓組字器外部對此的操作影響不到組字器內部的節點。 + walkedNodes.insert(itPrev.copy, at: 0) + iterated = itPrev + } + iterated?.destroyVertex() + iterated = nil + walkedNodes.removeFirst() + return walkedNodes + } - var vertexSpans: [[Int: Vertex]] = spans.map(\.asVertexSpan) - - let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"])) - var root = Vertex(node: .init(keyArray: ["_ROOT_"])) - root.distance = 0 - - vertexSpans.enumerated().forEach { location, vertexSpan in - vertexSpan.values.forEach { vertex in - let nextVertexPosition = location + vertex.node.spanLength - if nextVertexPosition == vertexSpans.count { - vertex.edges.append(terminal) + /// 先進行位相幾何排序、再卸勁。 + internal func sortAndRelax() { + Self.reinitVertexNetwork() + guard !spans.isEmpty else { return } + Megrez.Node.trailingNode.distance = 0 + spans.enumerated().forEach { location, theSpan in + theSpan.values.forEach { theNode in + let nextVertexPosition = location + theNode.spanLength + if nextVertexPosition == spans.count { + theNode.edges.append(.leadingNode) return } - vertexSpans[nextVertexPosition].values.forEach { vertex.edges.append($0) } + spans[nextVertexPosition].values.forEach { theNode.edges.append($0) } } } - - root.edges.append(contentsOf: vertexSpans[0].values) - - topologicalSort(root: &root).reversed().forEach { neta in - neta.edges.indices.forEach { neta.relax(target: &neta.edges[$0]) } + Megrez.Node.trailingNode.edges.append(contentsOf: spans[0].values) + Self.topologicalSort().reversed().forEach { neta in + neta.edges.indices.forEach { Self.relax(u: neta, v: &neta.edges[$0]) } } - - var iterated = terminal - var walked = [Megrez.Node]() - var totalLengthOfKeys = 0 - - while let itPrev = iterated.prev { - walked.append(itPrev.node) - iterated = itPrev - totalLengthOfKeys += iterated.node.spanLength - } - - // 清理內容,否則會有記憶體洩漏。 - vertexSpans.removeAll() - iterated.destroy() - root.destroy() - terminal.destroy() - - guard totalLengthOfKeys == keys.count else { - print("!!! ERROR A") - return (result, false) - } - guard walked.count >= 2 else { - print("!!! ERROR B") - return (result, false) - } - walked = walked.reversed() - walked.removeFirst() - result = walked - return (result, true) } -} -extension Megrez.SpanUnit { - /// 將當前幅位單元由節點辭典轉為頂點辭典。 - var asVertexSpan: [Int: Megrez.Compositor.Vertex] { - var result = [Int: Megrez.Compositor.Vertex]() - forEach { theKey, theValue in - result[theKey] = .init(node: theValue) + /// 摧毀所有與共用起始虛擬節點有牽涉的節點自身的 Vertex 特性資料。 + internal static func reinitVertexNetwork() { + Megrez.Node.trailingNode.destroyVertex() + Megrez.Node.leadingNode.destroyVertex() + } + + /// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological + /// sort)、且將排序結果以頂點陣列的形式給出。 + /// + /// 這裡使用我們自己的堆棧和狀態定義實現了一個非遞迴版本, + /// 這樣我們就不會受到當前線程的堆棧大小的限制。以下是等價的原始算法。 + /// ``` + /// func topologicalSort(node: Node) { + /// node.edges.forEach { nodeNode in + /// if !nodeNode.topologicallySorted { + /// dfs(nodeNode, result) + /// nodeNode.topologicallySorted = true + /// } + /// result.append(nodeNode) + /// } + /// } + /// ``` + /// 至於其遞迴版本,則類似於 Cormen 在 2001 年的著作「Introduction to Algorithms」當中的樣子。 + /// - Returns: 排序結果(頂點陣列)。 + private static func topologicalSort() -> [Megrez.Node] { + class State { + var iterIndex: Int + let node: Megrez.Node + init(node: Megrez.Node, iterIndex: Int = 0) { + self.node = node + self.iterIndex = iterIndex + } + } + var result = [Megrez.Node]() + var stack = [State]() + stack.append(.init(node: .trailingNode)) + while !stack.isEmpty { + let state = stack[stack.count - 1] + let theNode = state.node + if state.iterIndex < state.node.edges.count { + let newNode = state.node.edges[state.iterIndex] + state.iterIndex += 1 + if !newNode.topologicallySorted { + stack.append(.init(node: newNode)) + continue + } + } + theNode.topologicallySorted = true + result.append(theNode) + stack.removeLast() } return result } + + /// 卸勁函式。 + /// + /// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。 + /// - Remark: 自己就是參照頂點 (u),會在必要時成為 target (v) 的前述頂點。 + /// - Parameters: + /// - u: 基準頂點。 + /// - v: 要影響的頂點。 + private static func relax(u: Megrez.Node, v: inout Megrez.Node) { + // 從 u 到 w 的距離,也就是 v 的權重。 + let w: Double = v.score + // 這裡計算最大權重: + // 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」, + // 我們就更新 v 的距離及其前述頂點。 + guard v.distance < u.distance + w else { return } + v.distance = u.distance + w + v.prev = u + } } diff --git a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift index bf20e4cb..19d839a2 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/3_KeyValuePaired.swift @@ -3,54 +3,64 @@ // ==================== // This code is released under the MIT license (SPDX-License-Identifier: MIT) -import Foundation - public extension Megrez { /// 鍵值配對,乃索引鍵陣列與讀音的配對單元。 - struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible { + class KeyValuePaired: Unigram, Comparable { /// 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。 - public var keyArray: [String] - /// 資料值。 - public var value: String + public var keyArray: [String] = [] /// 將當前鍵值列印成一個字串。 - public var description: String { "(" + keyArray.description + "," + value + ")" } + override public var description: String { "(\(keyArray.description),\(value),\(score))" } /// 判斷當前鍵值配對是否合規。如果鍵與值有任一為空,則結果為 false。 public var isValid: Bool { !keyArray.joined().isEmpty && !value.isEmpty } /// 將當前鍵值列印成一個字串,但如果該鍵值配對為空的話則僅列印「()」。 - public var toNGramKey: String { !isValid ? "()" : "(" + joinedKey() + "," + value + ")" } + public var toNGramKey: String { !isValid ? "()" : "(\(joinedKey()),\(value))" } /// 通用陣列表達形式。 - public var tupletExpression: (keyArray: [String], value: String) { (keyArray, value) } + public var keyValueTuplet: (keyArray: [String], value: String) { (keyArray, value) } + /// 通用陣列表達形式。 + public var triplet: (keyArray: [String], value: String, score: Double) { (keyArray, value, score) } /// 初期化一組鍵值配對。 /// - Parameters: /// - keyArray: 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。 /// - value: 資料值。 - public init(keyArray: [String], value: String = "N/A") { + /// - score: 權重(雙精度小數)。 + public init(keyArray: [String], value: String = "N/A", score: Double = 0) { + super.init(value: value.isEmpty ? "N/A" : value, score: score) self.keyArray = keyArray.isEmpty ? ["N/A"] : keyArray - self.value = value.isEmpty ? "N/A" : value } /// 初期化一組鍵值配對。 - /// - Parameter tupletExpression: 傳入的通用陣列表達形式。 + /// - Parameter tripletExpression: 傳入的通用陣列表達形式。 + public init(_ tripletExpression: (keyArray: [String], value: String, score: Double)) { + let theValue = tripletExpression.value.isEmpty ? "N/A" : tripletExpression.value + super.init(value: theValue, score: tripletExpression.score) + keyArray = tripletExpression.keyArray.isEmpty ? ["N/A"] : tripletExpression.keyArray + } + + /// 初期化一組鍵值配對。 + /// - Parameter tuplet: 傳入的通用陣列表達形式。 public init(_ tupletExpression: (keyArray: [String], value: String)) { + let theValue = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value + super.init(value: theValue, score: 0) keyArray = tupletExpression.keyArray.isEmpty ? ["N/A"] : tupletExpression.keyArray - value = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value } /// 初期化一組鍵值配對。 /// - Parameters: /// - key: 索引鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 /// - value: 資料值。 - public init(key: String = "N/A", value: String = "N/A") { - keyArray = key.isEmpty ? ["N/A"] : key.components(separatedBy: Megrez.Compositor.theSeparator) - self.value = value.isEmpty ? "N/A" : value + /// - score: 權重(雙精度小數)。 + public init(key: String = "N/A", value: String = "N/A", score: Double = 0) { + super.init(value: value.isEmpty ? "N/A" : value, score: score) + keyArray = key.isEmpty ? ["N/A"] : key.sliced(by: Megrez.Compositor.theSeparator) } /// 做為預設雜湊函式。 /// - Parameter hasher: 目前物件的雜湊碼。 - public func hash(into hasher: inout Hasher) { + override public func hash(into hasher: inout Hasher) { hasher.combine(keyArray) hasher.combine(value) + hasher.combine(score) } public func joinedKey(by separator: String = Megrez.Compositor.theSeparator) -> String { @@ -58,7 +68,7 @@ public extension Megrez { } public static func == (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - lhs.keyArray == rhs.keyArray && lhs.value == rhs.value + lhs.score == rhs.score && lhs.keyArray == rhs.keyArray && lhs.value == rhs.value } public static func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { @@ -193,9 +203,9 @@ public extension Megrez.Compositor { arrOverlappedNodes = fetchOverlappingNodes(at: i) arrOverlappedNodes.forEach { anchor in if anchor.node == overridden.node { return } - if !overridden.node.joinedKey(by: "\t").contains(anchor.node.joinedKey(by: "\t")) - || !overridden.node.value.contains(anchor.node.value) - { + let anchorNodeKeyJoined = anchor.node.joinedKey(by: "\t") + let overriddenNodeKeyJoined = overridden.node.joinedKey(by: "\t") + if !overriddenNodeKeyJoined.has(string: anchorNodeKeyJoined) || !overridden.node.value.has(string: anchor.node.value) { anchor.node.reset() return } diff --git a/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift b/Packages/vChewing_Megrez/Sources/Megrez/5_Node.swift similarity index 89% rename from Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift rename to Packages/vChewing_Megrez/Sources/Megrez/5_Node.swift index c4c466e5..6f008a38 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/6_Node.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/5_Node.swift @@ -3,8 +3,6 @@ // ==================== // This code is released under the MIT license (SPDX-License-Identifier: MIT) -import Foundation - public extension Megrez { /// 字詞節點。 /// @@ -173,6 +171,39 @@ public extension Megrez { } return false } + + // MARK: - Vertex Extensions. + + // 注意:這一段的任何參數都不參與 Hash。 + + /// 組字器「文字輸入方向上的」最後方的虛擬節點。 + internal static let trailingNode = Megrez.Node(keyArray: ["$TRAILING"]) + /// 組字器「文字輸入方向上的」最前方的虛擬節點,也是根頂點。 + internal static let leadingNode = Megrez.Node(keyArray: ["$LEADING"]) + + /// 前述頂點。 + internal var prev: Node? + /// 自身屬下的頂點陣列。 + internal var edges = [Node]() + /// 該變數用於最短路徑的計算。 + /// + /// 我們實際上是在計算具有最大權重的路徑,因此距離的初始值是負無窮的。 + /// 如果我們要計算最短的權重/距離,我們會將其初期值設為正無窮。 + internal var distance = -(Double.infinity) + /// 在進行進行位相幾何排序時會用到的狀態標記。 + internal var topologicallySorted = false + + /// 摧毀一個字詞節點本身的 Vertex 特性資料。 + /// 讓一個 Vertex 順藤摸瓜地將自己的所有的連帶的 Vertex 都摧毀,再摧毀自己。 + /// 此過程必須在一套 Vertex 全部使用完畢之後執行一次,可防止記憶體洩漏。 + internal func destroyVertex() { + while prev?.prev != nil { prev?.destroyVertex() } + prev = nil + edges.forEach { $0.destroyVertex() } + edges.removeAll() + distance = -(Double.infinity) + topologicallySorted = false + } } } @@ -256,14 +287,9 @@ public extension Array where Element == Megrez.Node { /// 提供一組逐字的字音配對陣列(不使用 Megrez 的 KeyValuePaired 類型),但字音不匹配的節點除外。 var smashedPairs: [(key: String, value: String)] { var arrData = [(key: String, value: String)]() - let separator = Megrez.Compositor.theSeparator forEach { node in - if node.isReadingMismatched { - var newKey = node.joinedKey() - if !separator.isEmpty, newKey != separator, newKey.contains(separator) { - newKey = newKey.replacingOccurrences(of: separator, with: "\t") - } - arrData.append((key: newKey, value: node.value)) + if node.isReadingMismatched, !node.keyArray.joined().isEmpty { + arrData.append((key: node.keyArray.joined(separator: "\t"), value: node.value)) return } let arrValueChars = node.value.map(\.description) diff --git a/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift b/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift deleted file mode 100644 index b60ea371..00000000 --- a/Packages/vChewing_Megrez/Sources/Megrez/5_Vertex.swift +++ /dev/null @@ -1,109 +0,0 @@ -// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License). -// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) - -extension Megrez.Compositor { - /// 一個「有向無環圖的」的頂點單位。 - /// - /// 這是一個可變的數據結構,用於有向無環圖的構建和單源最短路徑的計算。 - class Vertex { - /// 前述頂點。 - public var prev: Vertex? - /// 自身屬下的頂點陣列。 - public var edges = [Vertex]() - /// 該變數用於最短路徑的計算。 - /// - /// 我們實際上是在計算具有最大權重的路徑,因此距離的初始值是負無窮的。 - /// 如果我們要計算最短的權重/距離,我們會將其初期值設為正無窮。 - public var distance = -(Double.infinity) - /// 在進行進行位相幾何排序時會用到的狀態標記。 - public var topologicallySorted = false - /// 字詞節點。 - public var node: Megrez.Node - - /// 初期化一個「有向無環圖的」的頂點單位。 - /// - /// 這是一個可變的數據結構,用於有向無環圖的構建和單源最短路徑的計算。 - /// - Parameter node: 字詞節點。 - public init(node: Megrez.Node) { - self.node = node - } - - /// 讓一個 Vertex 順藤摸瓜地將自己的所有的連帶的 Vertex 都摧毀,再摧毀自己。 - /// 此過程必須在一套 Vertex 全部使用完畢之後執行一次,可防止記憶體洩漏。 - public func destroy() { - while prev?.prev != nil { prev?.destroy() } - prev = nil - edges.forEach { $0.destroy() } - edges.removeAll() - node = .init() - } - - /// 卸勁函式。 - /// - /// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。 - /// - Remark: 自己就是參照頂點 (u),會在必要時成為 target (v) 的前述頂點。 - /// - Parameters: - /// - target: 要影響的頂點。 - public func relax(target: inout Vertex) { - // 從 u 到 w 的距離,也就是 v 的權重。 - let w: Double = target.node.score - // 這裡計算最大權重: - // 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」, - // 我們就更新 v 的距離及其前述頂點。 - if target.distance >= distance + w { return } - target.distance = distance + w - target.prev = self - } - } - - /// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological - /// sort)、且將排序結果以頂點陣列的形式給出。 - /// - /// 這裡使用我們自己的堆棧和狀態定義實現了一個非遞迴版本, - /// 這樣我們就不會受到當前線程的堆棧大小的限制。以下是等價的原始算法。 - /// ``` - /// func topologicalSort(vertex: Vertex) { - /// vertex.edges.forEach { vertexNode in - /// if !vertexNode.topologicallySorted { - /// dfs(vertexNode, result) - /// vertexNode.topologicallySorted = true - /// } - /// result.append(vertexNode) - /// } - /// } - /// ``` - /// 至於其遞迴版本,則類似於 Cormen 在 2001 年的著作「Introduction to Algorithms」當中的樣子。 - /// - Parameter root: 根頂點。 - /// - Returns: 排序結果(頂點陣列)。 - func topologicalSort(root: inout Vertex) -> [Vertex] { - class State { - var iterIndex: Int - let vertex: Vertex - init(vertex: Vertex, iterIndex: Int = 0) { - self.vertex = vertex - self.iterIndex = iterIndex - } - } - var result = [Vertex]() - var stack = [State]() - stack.append(.init(vertex: root)) - while !stack.isEmpty { - let state = stack[stack.count - 1] - let theVertex = state.vertex - if state.iterIndex < state.vertex.edges.count { - let newVertex = state.vertex.edges[state.iterIndex] - state.iterIndex += 1 - if !newVertex.topologicallySorted { - stack.append(.init(vertex: newVertex)) - continue - } - } - theVertex.topologicallySorted = true - result.append(theVertex) - stack.removeLast() - } - return result - } -} diff --git a/Packages/vChewing_Megrez/Sources/Megrez/7_LangModel.swift b/Packages/vChewing_Megrez/Sources/Megrez/6_LangModel.swift similarity index 100% rename from Packages/vChewing_Megrez/Sources/Megrez/7_LangModel.swift rename to Packages/vChewing_Megrez/Sources/Megrez/6_LangModel.swift diff --git a/Packages/vChewing_Megrez/Sources/Megrez/8_Unigram.swift b/Packages/vChewing_Megrez/Sources/Megrez/7_Unigram.swift similarity index 96% rename from Packages/vChewing_Megrez/Sources/Megrez/8_Unigram.swift rename to Packages/vChewing_Megrez/Sources/Megrez/7_Unigram.swift index cfccde05..0596fbcb 100644 --- a/Packages/vChewing_Megrez/Sources/Megrez/8_Unigram.swift +++ b/Packages/vChewing_Megrez/Sources/Megrez/7_Unigram.swift @@ -5,7 +5,7 @@ public extension Megrez { /// 單元圖。 - @frozen struct Unigram: Equatable, CustomStringConvertible, Hashable { + class Unigram: Equatable, CustomStringConvertible, Hashable { /// 資料值,通常是詞語或單個字。 public var value: String /// 權重。 diff --git a/Packages/vChewing_Megrez/Sources/Megrez/8_SwiftImpl_Internals.swift b/Packages/vChewing_Megrez/Sources/Megrez/8_SwiftImpl_Internals.swift new file mode 100644 index 00000000..65997e0e --- /dev/null +++ b/Packages/vChewing_Megrez/Sources/Megrez/8_SwiftImpl_Internals.swift @@ -0,0 +1,74 @@ +// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License). +// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +// This package is trying to deprecate its dependency of Foundation, hence this file. + +extension StringProtocol { + func has(string target: any StringProtocol) -> Bool { + let selfArray = Array(unicodeScalars) + let targetArray = Array(target.description.unicodeScalars) + guard !target.isEmpty else { return isEmpty } + guard count >= target.count else { return false } + for index in 0 ..< selfArray.count { + let range = index ..< (Swift.min(index + targetArray.count, selfArray.count)) + let ripped = Array(selfArray[range]) + if ripped == targetArray { return true } + } + return false + } + + func sliced(by separator: any StringProtocol = "") -> [String] { + let selfArray = Array(unicodeScalars) + let arrSeparator = Array(separator.description.unicodeScalars) + var result: [String] = [] + var buffer: [Unicode.Scalar] = [] + var sleepCount = 0 + for index in 0 ..< selfArray.count { + let currentChar = selfArray[index] + let range = index ..< (Swift.min(index + arrSeparator.count, selfArray.count)) + let ripped = Array(selfArray[range]) + if ripped.isEmpty { continue } + if ripped == arrSeparator { + sleepCount = range.count + result.append(buffer.map { String($0) }.joined()) + buffer.removeAll() + } + if sleepCount < 1 { + buffer.append(currentChar) + } + sleepCount -= 1 + } + result.append(buffer.map { String($0) }.joined()) + buffer.removeAll() + return result + } + + func swapping(_ target: String, with newString: String) -> String { + let selfArray = Array(unicodeScalars) + let arrTarget = Array(target.description.unicodeScalars) + var result = "" + var buffer: [Unicode.Scalar] = [] + var sleepCount = 0 + for index in 0 ..< selfArray.count { + let currentChar = selfArray[index] + let range = index ..< (Swift.min(index + arrTarget.count, selfArray.count)) + let ripped = Array(selfArray[range]) + if ripped.isEmpty { continue } + if ripped == arrTarget { + sleepCount = ripped.count + result.append(buffer.map { String($0) }.joined()) + result.append(newString) + buffer.removeAll() + } + if sleepCount < 1 { + buffer.append(currentChar) + } + sleepCount -= 1 + } + result.append(buffer.map { String($0) }.joined()) + buffer.removeAll() + return result + } +} diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift index b79ba9a1..50e181ba 100644 --- a/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/LMDataForTests.swift @@ -19,7 +19,7 @@ class SimpleLM: LangModelProtocol { let col0 = String(linestream[0]) let col1 = String(linestream[1]) let col2 = Double(linestream[2]) ?? 0.0 - var u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0) + let u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0) u.score = col2 mutDatabase[swapKeyValue ? col1 : col0, default: []].append(u) } diff --git a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift index 1315fd01..d8823f37 100644 --- a/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift +++ b/Packages/vChewing_Megrez/Tests/MegrezTests/MegrezTests.swift @@ -351,7 +351,7 @@ final class MegrezTests: XCTestCase { "高科技公司的年終獎金".forEach { i in compositor.insertKey(i.description) } - let result = compositor.walk().0 + let result = compositor.walk() XCTAssertEqual(result.joinedKeys(by: ""), ["高科技", "公司", "的", "年終", "獎金"]) } @@ -384,7 +384,7 @@ final class MegrezTests: XCTestCase { compositor.insertKey("jiang3") compositor.walk() compositor.insertKey("jin1") - var result = compositor.walk().0 + var result = compositor.walk() XCTAssertEqual(result.values, ["高科技", "公司", "的", "年中", "獎金"]) XCTAssertEqual(compositor.length, 10) compositor.cursor = 7 @@ -394,7 +394,7 @@ final class MegrezTests: XCTestCase { XCTAssertTrue(candidates.contains("中")) XCTAssertTrue(candidates.contains("鍾")) XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 7)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高科技", "公司", "的", "年終", "獎金"]) let candidatesBeginAt = compositor.fetchCandidates(at: 3, filter: .beginAt).map(\.value) let candidatesEndAt = compositor.fetchCandidates(at: 3, filter: .endAt).map(\.value) @@ -436,11 +436,11 @@ final class MegrezTests: XCTestCase { compositor.insertKey("gao1") compositor.insertKey("ke1") compositor.insertKey("ji4") - var result = compositor.walk().0 + var result = compositor.walk() XCTAssertEqual(result.values, ["高科技"]) compositor.insertKey("gong1") compositor.insertKey("si1") - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高科技", "公司"]) } @@ -450,29 +450,29 @@ final class MegrezTests: XCTestCase { compositor.insertKey("gao1") compositor.insertKey("ke1") compositor.insertKey("ji4") - var result = compositor.walk().0 + var result = compositor.walk() XCTAssertEqual(result.values, ["高科技"]) compositor.cursor = 0 XCTAssertTrue(compositor.overrideCandidateLiteral("膏", at: compositor.cursor)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["膏", "科技"]) XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 1)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高科技"]) XCTAssertTrue(compositor.overrideCandidateLiteral("膏", at: 0)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["膏", "科技"]) XCTAssertTrue(compositor.overrideCandidateLiteral("柯", at: 1)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["膏", "柯", "際"]) XCTAssertTrue(compositor.overrideCandidateLiteral("暨", at: 2)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["膏", "柯", "暨"]) XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 3)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高科技"]) } @@ -484,19 +484,19 @@ final class MegrezTests: XCTestCase { compositor.insertKey("zhong1") compositor.insertKey("jiang3") compositor.insertKey("jin1") - var result = compositor.walk().0 + var result = compositor.walk() XCTAssertEqual(result.values, ["年中", "獎金"]) XCTAssertTrue(compositor.overrideCandidateLiteral("終講", at: 1)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["年", "終講", "金"]) XCTAssertTrue(compositor.overrideCandidateLiteral("槳襟", at: 2)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["年中", "槳襟"]) XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 0)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["年終", "槳襟"]) } @@ -509,16 +509,16 @@ final class MegrezTests: XCTestCase { compositor.insertKey("yan4") compositor.insertKey("wei2") compositor.insertKey("xian3") - var result = compositor.walk().0 + var result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "火焰", "危險"]) let location = 2 XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3"], value: "🔥"), at: location)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "🔥", "焰", "危險"]) XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3", "yan4"], value: "🔥"), at: location)) - result = compositor.walk().0 + result = compositor.walk() XCTAssertEqual(result.values, ["高熱", "🔥", "危險"]) } @@ -530,11 +530,11 @@ final class MegrezTests: XCTestCase { compositor.insertKey("zhong1") compositor.insertKey("jiang3") compositor.insertKey("jin1") - let oldResult = compositor.walk().0.values.joined() + let oldResult = compositor.walk().values.joined() print(oldResult) theLM.trim(key: "nian2zhong1", value: "年中") compositor.update(updateExisting: true) - let newResult = compositor.walk().0.values.joined() + let newResult = compositor.walk().values.joined() print(newResult) XCTAssertEqual([oldResult, newResult], ["年中獎金", "年終獎金"]) compositor.cursor = 4 @@ -542,7 +542,7 @@ final class MegrezTests: XCTestCase { compositor.dropKey(direction: .rear) theLM.trim(key: "nian2zhong1", value: "年終") compositor.update(updateExisting: true) - let newResult2 = compositor.walk().0.values + let newResult2 = compositor.walk().values print(newResult2) XCTAssertEqual(newResult2, ["年", "中"]) } @@ -555,8 +555,8 @@ final class MegrezTests: XCTestCase { compositorA.insertKey(key.description) } var compositorB = compositorA.hardCopy - let resultA = compositorA.walk().walkedNodes - let resultB = compositorB.walk().walkedNodes + let resultA = compositorA.walk() + let resultB = compositorB.walk() XCTAssertEqual(resultA, resultB) }