Megrez // v2.7.0 update, removing Foundation dependency.
This commit is contained in:
parent
6ab57f5165
commit
7ef7f33993
|
@ -3,8 +3,6 @@
|
|||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension Megrez {
|
||||
/// 一個組字器用來在給定一系列的索引鍵的情況下(藉由一系列的觀測行為)返回一套資料值。
|
||||
///
|
||||
|
@ -186,7 +184,7 @@ public extension Megrez {
|
|||
/// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。
|
||||
public var dumpDOT: String {
|
||||
// C# StringBuilder 與 Swift NSMutableString 能提供爆發性的效能。
|
||||
let strOutput: NSMutableString = .init(string: "digraph {\ngraph [ rankdir=LR ];\nBOS;\n")
|
||||
var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n"
|
||||
spans.enumerated().forEach { p, span in
|
||||
(0 ... span.maxLength).forEach { ni in
|
||||
guard let np = span[ni] else { return }
|
||||
|
|
|
@ -12,73 +12,118 @@ public extension Megrez.Compositor {
|
|||
/// 的路徑(所以要找最大的權重),因為在對數概率下,較大的數值意味著較大的概率。
|
||||
/// 對於 `G = (V, E)`,該算法的運行次數為 `O(|V|+|E|)`,其中 `G` 是一個有向無環圖。
|
||||
/// 這意味著,即使軌格很大,也只需很少的算力就可以爬軌。
|
||||
///
|
||||
/// - Remark: 利用該數學方法進行輸入法智能組句的(已知可考的)最開始的案例是
|
||||
/// 郭家寶(ByVoid)的《[基於統計語言模型的拼音輸入法](https://byvoid.com/zht/blog/slm_based_pinyin_ime/) 》;
|
||||
/// 再後來則是 2022 年中時期劉燈的 Gramambular 2 組字引擎。
|
||||
/// - Returns: 爬軌結果+該過程是否順利執行。
|
||||
@discardableResult mutating func walk() -> (walkedNodes: [Megrez.Node], succeeded: Bool) {
|
||||
var result = [Megrez.Node]()
|
||||
defer { walkedNodes = result }
|
||||
guard !spans.isEmpty else { return (result, true) }
|
||||
@discardableResult mutating func walk() -> [Megrez.Node] {
|
||||
defer { Self.reinitVertexNetwork() }
|
||||
sortAndRelax()
|
||||
guard !spans.isEmpty else { return [] }
|
||||
var iterated: Megrez.Node? = Megrez.Node.leadingNode
|
||||
walkedNodes.removeAll()
|
||||
while let itPrev = iterated?.prev {
|
||||
// 此處必須得是 Copy,讓組字器外部對此的操作影響不到組字器內部的節點。
|
||||
walkedNodes.insert(itPrev.copy, at: 0)
|
||||
iterated = itPrev
|
||||
}
|
||||
iterated?.destroyVertex()
|
||||
iterated = nil
|
||||
walkedNodes.removeFirst()
|
||||
return walkedNodes
|
||||
}
|
||||
|
||||
var vertexSpans: [[Int: Vertex]] = spans.map(\.asVertexSpan)
|
||||
|
||||
let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"]))
|
||||
var root = Vertex(node: .init(keyArray: ["_ROOT_"]))
|
||||
root.distance = 0
|
||||
|
||||
vertexSpans.enumerated().forEach { location, vertexSpan in
|
||||
vertexSpan.values.forEach { vertex in
|
||||
let nextVertexPosition = location + vertex.node.spanLength
|
||||
if nextVertexPosition == vertexSpans.count {
|
||||
vertex.edges.append(terminal)
|
||||
/// 先進行位相幾何排序、再卸勁。
|
||||
internal func sortAndRelax() {
|
||||
Self.reinitVertexNetwork()
|
||||
guard !spans.isEmpty else { return }
|
||||
Megrez.Node.trailingNode.distance = 0
|
||||
spans.enumerated().forEach { location, theSpan in
|
||||
theSpan.values.forEach { theNode in
|
||||
let nextVertexPosition = location + theNode.spanLength
|
||||
if nextVertexPosition == spans.count {
|
||||
theNode.edges.append(.leadingNode)
|
||||
return
|
||||
}
|
||||
vertexSpans[nextVertexPosition].values.forEach { vertex.edges.append($0) }
|
||||
spans[nextVertexPosition].values.forEach { theNode.edges.append($0) }
|
||||
}
|
||||
}
|
||||
|
||||
root.edges.append(contentsOf: vertexSpans[0].values)
|
||||
|
||||
topologicalSort(root: &root).reversed().forEach { neta in
|
||||
neta.edges.indices.forEach { neta.relax(target: &neta.edges[$0]) }
|
||||
Megrez.Node.trailingNode.edges.append(contentsOf: spans[0].values)
|
||||
Self.topologicalSort().reversed().forEach { neta in
|
||||
neta.edges.indices.forEach { Self.relax(u: neta, v: &neta.edges[$0]) }
|
||||
}
|
||||
|
||||
var iterated = terminal
|
||||
var walked = [Megrez.Node]()
|
||||
var totalLengthOfKeys = 0
|
||||
|
||||
while let itPrev = iterated.prev {
|
||||
walked.append(itPrev.node)
|
||||
iterated = itPrev
|
||||
totalLengthOfKeys += iterated.node.spanLength
|
||||
}
|
||||
|
||||
// 清理內容,否則會有記憶體洩漏。
|
||||
vertexSpans.removeAll()
|
||||
iterated.destroy()
|
||||
root.destroy()
|
||||
terminal.destroy()
|
||||
|
||||
guard totalLengthOfKeys == keys.count else {
|
||||
print("!!! ERROR A")
|
||||
return (result, false)
|
||||
}
|
||||
guard walked.count >= 2 else {
|
||||
print("!!! ERROR B")
|
||||
return (result, false)
|
||||
}
|
||||
walked = walked.reversed()
|
||||
walked.removeFirst()
|
||||
result = walked
|
||||
return (result, true)
|
||||
}
|
||||
}
|
||||
|
||||
extension Megrez.SpanUnit {
|
||||
/// 將當前幅位單元由節點辭典轉為頂點辭典。
|
||||
var asVertexSpan: [Int: Megrez.Compositor.Vertex] {
|
||||
var result = [Int: Megrez.Compositor.Vertex]()
|
||||
forEach { theKey, theValue in
|
||||
result[theKey] = .init(node: theValue)
|
||||
/// Clears the vertex bookkeeping reachable from the two shared virtual nodes.
///
/// `destroyVertex()` is invoked on `trailingNode` first and on `leadingNode`
/// second; the order is kept as-is because `destroyVertex()` recurses through
/// `prev` and `edges`.
internal static func reinitVertexNetwork() {
  [Megrez.Node.trailingNode, Megrez.Node.leadingNode].forEach { $0.destroyVertex() }
}
|
||||
|
||||
/// Topologically sorts the graph whose single root is `Megrez.Node.trailingNode`
/// and returns the visited nodes as an array.
///
/// The traversal is depth-first but non-recursive: an explicit stack of frames
/// is kept so that the depth of the graph is not bounded by the thread's call
/// stack. The original comment gives this as the equivalent recursive form:
/// ```
/// func topologicalSort(node: Node) {
///   node.edges.forEach { nodeNode in
///     if !nodeNode.topologicallySorted {
///       dfs(nodeNode, result)
///       nodeNode.topologicallySorted = true
///     }
///     result.append(nodeNode)
///   }
/// }
/// ```
/// (The original comment attributes the recursive variant to Cormen et al.,
/// "Introduction to Algorithms", 2001.)
///
/// - Note: A node is finished (marked and appended) either when all of its
///   edges have been examined, or as soon as the edge just examined turns out
///   to be already sorted.
/// - Returns: The nodes in the order in which they were finished.
private static func topologicalSort() -> [Megrez.Node] {
  /// One frame of the explicit DFS stack: a node plus the index of the next
  /// edge of that node to examine.
  class Frame {
    let node: Megrez.Node
    var iterIndex: Int
    init(node: Megrez.Node, iterIndex: Int = 0) {
      self.node = node
      self.iterIndex = iterIndex
    }
  }

  var finished: [Megrez.Node] = []
  var pending: [Frame] = [Frame(node: .trailingNode)]

  while let frame = pending.last {
    let current = frame.node
    if frame.iterIndex < current.edges.count {
      let successor = current.edges[frame.iterIndex]
      frame.iterIndex += 1
      if !successor.topologicallySorted {
        // Descend into the unsorted successor before finishing `current`.
        pending.append(Frame(node: successor))
        continue
      }
    }
    current.topologicallySorted = true
    finished.append(current)
    pending.removeLast()
  }
  return finished
}
|
||||
|
||||
/// Relaxation step for the maximum-weight path computation.
///
/// (The original comment cites page 585 of Cormen et al., "Introduction to
/// Algorithms", 2001, for the term "relax".)
/// - Parameters:
///   - u: The reference vertex; becomes `v.prev` when the update happens.
///   - v: The vertex whose `distance` and `prev` may be updated.
private static func relax(u: Megrez.Node, v: inout Megrez.Node) {
  // The weight of the edge u -> v is the score of v itself.
  let weight: Double = v.score
  let candidate = u.distance + weight
  // Maximum-weight search: only a strictly larger distance replaces the current one.
  if v.distance < candidate {
    v.distance = candidate
    v.prev = u
  }
}
|
||||
}
|
||||
|
|
|
@ -3,54 +3,64 @@
|
|||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension Megrez {
|
||||
/// 鍵值配對,乃索引鍵陣列與讀音的配對單元。
|
||||
struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible {
|
||||
class KeyValuePaired: Unigram, Comparable {
|
||||
/// 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
public var keyArray: [String]
|
||||
/// 資料值。
|
||||
public var value: String
|
||||
public var keyArray: [String] = []
|
||||
/// 將當前鍵值列印成一個字串。
|
||||
public var description: String { "(" + keyArray.description + "," + value + ")" }
|
||||
override public var description: String { "(\(keyArray.description),\(value),\(score))" }
|
||||
/// 判斷當前鍵值配對是否合規。如果鍵與值有任一為空,則結果為 false。
|
||||
public var isValid: Bool { !keyArray.joined().isEmpty && !value.isEmpty }
|
||||
/// 將當前鍵值列印成一個字串,但如果該鍵值配對為空的話則僅列印「()」。
|
||||
public var toNGramKey: String { !isValid ? "()" : "(" + joinedKey() + "," + value + ")" }
|
||||
public var toNGramKey: String { !isValid ? "()" : "(\(joinedKey()),\(value))" }
|
||||
/// 通用陣列表達形式。
|
||||
public var tupletExpression: (keyArray: [String], value: String) { (keyArray, value) }
|
||||
public var keyValueTuplet: (keyArray: [String], value: String) { (keyArray, value) }
|
||||
/// 通用陣列表達形式。
|
||||
public var triplet: (keyArray: [String], value: String, score: Double) { (keyArray, value, score) }
|
||||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameters:
|
||||
/// - keyArray: 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - value: 資料值。
|
||||
public init(keyArray: [String], value: String = "N/A") {
|
||||
/// - score: 權重(雙精度小數)。
|
||||
public init(keyArray: [String], value: String = "N/A", score: Double = 0) {
|
||||
super.init(value: value.isEmpty ? "N/A" : value, score: score)
|
||||
self.keyArray = keyArray.isEmpty ? ["N/A"] : keyArray
|
||||
self.value = value.isEmpty ? "N/A" : value
|
||||
}
|
||||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameter tupletExpression: 傳入的通用陣列表達形式。
|
||||
/// - Parameter tripletExpression: 傳入的通用陣列表達形式。
|
||||
public init(_ tripletExpression: (keyArray: [String], value: String, score: Double)) {
|
||||
let theValue = tripletExpression.value.isEmpty ? "N/A" : tripletExpression.value
|
||||
super.init(value: theValue, score: tripletExpression.score)
|
||||
keyArray = tripletExpression.keyArray.isEmpty ? ["N/A"] : tripletExpression.keyArray
|
||||
}
|
||||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameter tupletExpression: 傳入的通用陣列表達形式。
|
||||
public init(_ tupletExpression: (keyArray: [String], value: String)) {
|
||||
let theValue = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value
|
||||
super.init(value: theValue, score: 0)
|
||||
keyArray = tupletExpression.keyArray.isEmpty ? ["N/A"] : tupletExpression.keyArray
|
||||
value = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value
|
||||
}
|
||||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameters:
|
||||
/// - key: 索引鍵。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - value: 資料值。
|
||||
public init(key: String = "N/A", value: String = "N/A") {
|
||||
keyArray = key.isEmpty ? ["N/A"] : key.components(separatedBy: Megrez.Compositor.theSeparator)
|
||||
self.value = value.isEmpty ? "N/A" : value
|
||||
/// - score: 權重(雙精度小數)。
|
||||
public init(key: String = "N/A", value: String = "N/A", score: Double = 0) {
|
||||
super.init(value: value.isEmpty ? "N/A" : value, score: score)
|
||||
keyArray = key.isEmpty ? ["N/A"] : key.sliced(by: Megrez.Compositor.theSeparator)
|
||||
}
|
||||
|
||||
/// 做為預設雜湊函式。
|
||||
/// - Parameter hasher: 目前物件的雜湊碼。
|
||||
public func hash(into hasher: inout Hasher) {
|
||||
override public func hash(into hasher: inout Hasher) {
|
||||
hasher.combine(keyArray)
|
||||
hasher.combine(value)
|
||||
hasher.combine(score)
|
||||
}
|
||||
|
||||
public func joinedKey(by separator: String = Megrez.Compositor.theSeparator) -> String {
|
||||
|
@ -58,7 +68,7 @@ public extension Megrez {
|
|||
}
|
||||
|
||||
public static func == (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
lhs.keyArray == rhs.keyArray && lhs.value == rhs.value
|
||||
lhs.score == rhs.score && lhs.keyArray == rhs.keyArray && lhs.value == rhs.value
|
||||
}
|
||||
|
||||
public static func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
|
@ -193,9 +203,9 @@ public extension Megrez.Compositor {
|
|||
arrOverlappedNodes = fetchOverlappingNodes(at: i)
|
||||
arrOverlappedNodes.forEach { anchor in
|
||||
if anchor.node == overridden.node { return }
|
||||
if !overridden.node.joinedKey(by: "\t").contains(anchor.node.joinedKey(by: "\t"))
|
||||
|| !overridden.node.value.contains(anchor.node.value)
|
||||
{
|
||||
let anchorNodeKeyJoined = anchor.node.joinedKey(by: "\t")
|
||||
let overriddenNodeKeyJoined = overridden.node.joinedKey(by: "\t")
|
||||
if !overriddenNodeKeyJoined.has(string: anchorNodeKeyJoined) || !overridden.node.value.has(string: anchor.node.value) {
|
||||
anchor.node.reset()
|
||||
return
|
||||
}
|
||||
|
|
|
@ -3,8 +3,6 @@
|
|||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension Megrez {
|
||||
/// 字詞節點。
|
||||
///
|
||||
|
@ -173,6 +171,39 @@ public extension Megrez {
|
|||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// MARK: - Vertex Extensions.
|
||||
|
||||
// 注意:這一段的任何參數都不參與 Hash。
|
||||
|
||||
/// 組字器「文字輸入方向上的」最後方的虛擬節點。
|
||||
internal static let trailingNode = Megrez.Node(keyArray: ["$TRAILING"])
|
||||
/// 組字器「文字輸入方向上的」最前方的虛擬節點,也是爬軌時的終點頂點(根頂點是 trailingNode)。
|
||||
internal static let leadingNode = Megrez.Node(keyArray: ["$LEADING"])
|
||||
|
||||
/// 前述頂點。
|
||||
internal var prev: Node?
|
||||
/// 自身屬下的頂點陣列。
|
||||
internal var edges = [Node]()
|
||||
/// 該變數用於最短路徑的計算。
|
||||
///
|
||||
/// 我們實際上是在計算具有最大權重的路徑,因此距離的初始值是負無窮的。
|
||||
/// 如果我們要計算最短的權重/距離,我們會將其初期值設為正無窮。
|
||||
internal var distance = -(Double.infinity)
|
||||
/// 在進行位相幾何排序時會用到的狀態標記。
|
||||
internal var topologicallySorted = false
|
||||
|
||||
/// Resets the vertex-related state (`prev`, `edges`, `distance`,
/// `topologicallySorted`) of this node.
///
/// Before resetting itself, the node calls `destroyVertex()` on its `prev`
/// for as long as that `prev` still has a `prev` of its own, and then on
/// every node in `edges`. The original comment says this must be run once
/// after a vertex network has been fully used, in order to avoid memory leaks.
internal func destroyVertex() {
  while let upstream = prev, upstream.prev != nil {
    upstream.destroyVertex()
  }
  prev = nil
  for successor in edges {
    successor.destroyVertex()
  }
  edges.removeAll()
  // Back to the initial values used by the maximum-weight search.
  distance = -Double.infinity
  topologicallySorted = false
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -256,14 +287,9 @@ public extension Array where Element == Megrez.Node {
|
|||
/// 提供一組逐字的字音配對陣列(不使用 Megrez 的 KeyValuePaired 類型),但字音不匹配的節點除外。
|
||||
var smashedPairs: [(key: String, value: String)] {
|
||||
var arrData = [(key: String, value: String)]()
|
||||
let separator = Megrez.Compositor.theSeparator
|
||||
forEach { node in
|
||||
if node.isReadingMismatched {
|
||||
var newKey = node.joinedKey()
|
||||
if !separator.isEmpty, newKey != separator, newKey.contains(separator) {
|
||||
newKey = newKey.replacingOccurrences(of: separator, with: "\t")
|
||||
}
|
||||
arrData.append((key: newKey, value: node.value))
|
||||
if node.isReadingMismatched, !node.keyArray.joined().isEmpty {
|
||||
arrData.append((key: node.keyArray.joined(separator: "\t"), value: node.value))
|
||||
return
|
||||
}
|
||||
let arrValueChars = node.value.map(\.description)
|
|
@ -1,109 +0,0 @@
|
|||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
extension Megrez.Compositor {
  /// A vertex of a directed acyclic graph.
  ///
  /// This is a mutable data structure used while building the graph and while
  /// computing the single-source path over it.
  class Vertex {
    /// The predecessor vertex.
    public var prev: Vertex?
    /// The vertices this vertex points to.
    public var edges: [Vertex] = []
    /// Distance used by the path computation. The code searches for the
    /// maximum-weight path, so the initial value is negative infinity.
    public var distance = -Double.infinity
    /// State flag used during topological sorting.
    public var topologicallySorted = false
    /// The word node carried by this vertex.
    public var node: Megrez.Node

    /// Creates a vertex wrapping the given node.
    /// - Parameter node: The word node.
    public init(node: Megrez.Node) {
      self.node = node
    }

    /// Tears down this vertex: calls `destroy()` on `prev` while that `prev`
    /// still has its own `prev`, then on every vertex in `edges`, and finally
    /// clears `prev` and `edges` and replaces `node` with a fresh instance.
    /// The original comment says this must be run once after a set of
    /// vertices has been fully used, in order to avoid memory leaks.
    public func destroy() {
      while let upstream = prev, upstream.prev != nil {
        upstream.destroy()
      }
      prev = nil
      for successor in edges {
        successor.destroy()
      }
      edges.removeAll()
      node = .init()
    }

    /// Relaxation step; the receiver is the reference vertex (u) and becomes
    /// the predecessor of `target` (v) when the update happens.
    ///
    /// (The original comment cites page 585 of Cormen et al., "Introduction
    /// to Algorithms", 2001, for the term "relax".)
    /// - Parameter target: The vertex that may be updated.
    public func relax(target: inout Vertex) {
      // The weight of the edge u -> v is the score of v's node.
      let weight: Double = target.node.score
      let candidate = distance + weight
      // Maximum-weight search: keep the current value unless the candidate is larger.
      if target.distance >= candidate { return }
      target.distance = candidate
      target.prev = self
    }
  }

  /// Topologically sorts a directed acyclic graph with a single root vertex
  /// and returns the visited vertices as an array.
  ///
  /// The traversal is depth-first but non-recursive: an explicit stack of
  /// frames is kept so that the depth of the graph is not bounded by the
  /// thread's call stack. The original comment gives this as the equivalent
  /// recursive form:
  /// ```
  /// func topologicalSort(vertex: Vertex) {
  ///   vertex.edges.forEach { vertexNode in
  ///     if !vertexNode.topologicallySorted {
  ///       dfs(vertexNode, result)
  ///       vertexNode.topologicallySorted = true
  ///     }
  ///     result.append(vertexNode)
  ///   }
  /// }
  /// ```
  /// - Note: A vertex is finished (marked and appended) either when all of
  ///   its edges have been examined, or as soon as the edge just examined
  ///   turns out to be already sorted.
  /// - Parameter root: The root vertex.
  /// - Returns: The vertices in the order in which they were finished.
  func topologicalSort(root: inout Vertex) -> [Vertex] {
    /// One frame of the explicit DFS stack: a vertex plus the index of the
    /// next edge of that vertex to examine.
    class Frame {
      let vertex: Vertex
      var iterIndex: Int
      init(vertex: Vertex, iterIndex: Int = 0) {
        self.vertex = vertex
        self.iterIndex = iterIndex
      }
    }

    var finished: [Vertex] = []
    var pending: [Frame] = [Frame(vertex: root)]

    while let frame = pending.last {
      let current = frame.vertex
      if frame.iterIndex < current.edges.count {
        let successor = current.edges[frame.iterIndex]
        frame.iterIndex += 1
        if !successor.topologicallySorted {
          // Descend into the unsorted successor before finishing `current`.
          pending.append(Frame(vertex: successor))
          continue
        }
      }
      current.topologicallySorted = true
      finished.append(current)
      pending.removeLast()
    }
    return finished
  }
}
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
public extension Megrez {
|
||||
/// 單元圖。
|
||||
@frozen struct Unigram: Equatable, CustomStringConvertible, Hashable {
|
||||
class Unigram: Equatable, CustomStringConvertible, Hashable {
|
||||
/// 資料值,通常是詞語或單個字。
|
||||
public var value: String
|
||||
/// 權重。
|
|
@ -0,0 +1,74 @@
|
|||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
// This package is trying to deprecate its dependency of Foundation, hence this file.
|
||||
|
||||
extension StringProtocol {
  /// Tells whether `target` occurs somewhere in the receiver.
  ///
  /// The comparison is done on Unicode scalars, not on grapheme clusters.
  /// - Parameter target: The string to look for.
  /// - Returns: `true` if the scalars of `target` appear contiguously in the
  ///   receiver. An empty `target` is only considered contained in an empty
  ///   receiver.
  func has(string target: any StringProtocol) -> Bool {
    let haystack = Array(unicodeScalars)
    let needle = Array(target.description.unicodeScalars)
    guard !needle.isEmpty else { return haystack.isEmpty }
    // Length check on scalars, the same unit the comparison below uses.
    guard haystack.count >= needle.count else { return false }
    for start in 0 ... (haystack.count - needle.count) {
      if haystack[start ..< start + needle.count].elementsEqual(needle) { return true }
    }
    return false
  }

  /// Splits the receiver at every non-overlapping occurrence of `separator`.
  ///
  /// Matching is done on Unicode scalars, scanning left to right. Leading,
  /// trailing and adjacent separators produce empty strings in the result.
  /// - Parameter separator: The separator. When it is empty, nothing can be
  ///   split and the whole receiver is returned as the only element.
  /// - Returns: The pieces between separators, in order.
  func sliced(by separator: any StringProtocol = "") -> [String] {
    let scalars = Array(unicodeScalars)
    let separatorScalars = Array(separator.description.unicodeScalars)
    // An empty separator never matches; keep the content instead of dropping it.
    guard !separatorScalars.isEmpty else { return [String(self)] }
    var pieces: [String] = []
    var current = ""
    var index = 0
    while index < scalars.count {
      let end = index + separatorScalars.count
      if end <= scalars.count, scalars[index ..< end].elementsEqual(separatorScalars) {
        pieces.append(current)
        current.removeAll()
        // Jump past the whole separator so that its scalars cannot start another match.
        index = end
      } else {
        current.unicodeScalars.append(scalars[index])
        index += 1
      }
    }
    pieces.append(current)
    return pieces
  }

  /// Returns a copy of the receiver in which every non-overlapping occurrence
  /// of `target` is replaced by `newString`.
  ///
  /// Matching is done on Unicode scalars, scanning left to right.
  /// - Parameters:
  ///   - target: The text to replace. When it is empty, the receiver is
  ///     returned unchanged.
  ///   - newString: The replacement text.
  /// - Returns: The resulting string.
  func swapping(_ target: String, with newString: String) -> String {
    let scalars = Array(unicodeScalars)
    let targetScalars = Array(target.unicodeScalars)
    // An empty target never matches; keep the content instead of dropping it.
    guard !targetScalars.isEmpty else { return String(self) }
    var output = ""
    var index = 0
    while index < scalars.count {
      let end = index + targetScalars.count
      if end <= scalars.count, scalars[index ..< end].elementsEqual(targetScalars) {
        output.append(newString)
        // Jump past the whole match so that its scalars cannot start another match.
        index = end
      } else {
        output.unicodeScalars.append(scalars[index])
        index += 1
      }
    }
    return output
  }
}
|
|
@ -19,7 +19,7 @@ class SimpleLM: LangModelProtocol {
|
|||
let col0 = String(linestream[0])
|
||||
let col1 = String(linestream[1])
|
||||
let col2 = Double(linestream[2]) ?? 0.0
|
||||
var u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0)
|
||||
let u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0)
|
||||
u.score = col2
|
||||
mutDatabase[swapKeyValue ? col1 : col0, default: []].append(u)
|
||||
}
|
||||
|
|
|
@ -351,7 +351,7 @@ final class MegrezTests: XCTestCase {
|
|||
"高科技公司的年終獎金".forEach { i in
|
||||
compositor.insertKey(i.description)
|
||||
}
|
||||
let result = compositor.walk().0
|
||||
let result = compositor.walk()
|
||||
XCTAssertEqual(result.joinedKeys(by: ""), ["高科技", "公司", "的", "年終", "獎金"])
|
||||
}
|
||||
|
||||
|
@ -384,7 +384,7 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("jiang3")
|
||||
compositor.walk()
|
||||
compositor.insertKey("jin1")
|
||||
var result = compositor.walk().0
|
||||
var result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技", "公司", "的", "年中", "獎金"])
|
||||
XCTAssertEqual(compositor.length, 10)
|
||||
compositor.cursor = 7
|
||||
|
@ -394,7 +394,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertTrue(candidates.contains("中"))
|
||||
XCTAssertTrue(candidates.contains("鍾"))
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 7))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技", "公司", "的", "年終", "獎金"])
|
||||
let candidatesBeginAt = compositor.fetchCandidates(at: 3, filter: .beginAt).map(\.value)
|
||||
let candidatesEndAt = compositor.fetchCandidates(at: 3, filter: .endAt).map(\.value)
|
||||
|
@ -436,11 +436,11 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("gao1")
|
||||
compositor.insertKey("ke1")
|
||||
compositor.insertKey("ji4")
|
||||
var result = compositor.walk().0
|
||||
var result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技"])
|
||||
compositor.insertKey("gong1")
|
||||
compositor.insertKey("si1")
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技", "公司"])
|
||||
}
|
||||
|
||||
|
@ -450,29 +450,29 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("gao1")
|
||||
compositor.insertKey("ke1")
|
||||
compositor.insertKey("ji4")
|
||||
var result = compositor.walk().0
|
||||
var result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技"])
|
||||
compositor.cursor = 0
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("膏", at: compositor.cursor))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["膏", "科技"])
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 1))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技"])
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("膏", at: 0))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["膏", "科技"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("柯", at: 1))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["膏", "柯", "際"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("暨", at: 2))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["膏", "柯", "暨"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 3))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高科技"])
|
||||
}
|
||||
|
||||
|
@ -484,19 +484,19 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("zhong1")
|
||||
compositor.insertKey("jiang3")
|
||||
compositor.insertKey("jin1")
|
||||
var result = compositor.walk().0
|
||||
var result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["年中", "獎金"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("終講", at: 1))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["年", "終講", "金"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("槳襟", at: 2))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["年中", "槳襟"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 0))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["年終", "槳襟"])
|
||||
}
|
||||
|
||||
|
@ -509,16 +509,16 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("yan4")
|
||||
compositor.insertKey("wei2")
|
||||
compositor.insertKey("xian3")
|
||||
var result = compositor.walk().0
|
||||
var result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高熱", "火焰", "危險"])
|
||||
let location = 2
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3"], value: "🔥"), at: location))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高熱", "🔥", "焰", "危險"])
|
||||
|
||||
XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3", "yan4"], value: "🔥"), at: location))
|
||||
result = compositor.walk().0
|
||||
result = compositor.walk()
|
||||
XCTAssertEqual(result.values, ["高熱", "🔥", "危險"])
|
||||
}
|
||||
|
||||
|
@ -530,11 +530,11 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.insertKey("zhong1")
|
||||
compositor.insertKey("jiang3")
|
||||
compositor.insertKey("jin1")
|
||||
let oldResult = compositor.walk().0.values.joined()
|
||||
let oldResult = compositor.walk().values.joined()
|
||||
print(oldResult)
|
||||
theLM.trim(key: "nian2zhong1", value: "年中")
|
||||
compositor.update(updateExisting: true)
|
||||
let newResult = compositor.walk().0.values.joined()
|
||||
let newResult = compositor.walk().values.joined()
|
||||
print(newResult)
|
||||
XCTAssertEqual([oldResult, newResult], ["年中獎金", "年終獎金"])
|
||||
compositor.cursor = 4
|
||||
|
@ -542,7 +542,7 @@ final class MegrezTests: XCTestCase {
|
|||
compositor.dropKey(direction: .rear)
|
||||
theLM.trim(key: "nian2zhong1", value: "年終")
|
||||
compositor.update(updateExisting: true)
|
||||
let newResult2 = compositor.walk().0.values
|
||||
let newResult2 = compositor.walk().values
|
||||
print(newResult2)
|
||||
XCTAssertEqual(newResult2, ["年", "中"])
|
||||
}
|
||||
|
@ -555,8 +555,8 @@ final class MegrezTests: XCTestCase {
|
|||
compositorA.insertKey(key.description)
|
||||
}
|
||||
var compositorB = compositorA.hardCopy
|
||||
let resultA = compositorA.walk().walkedNodes
|
||||
let resultB = compositorB.walk().walkedNodes
|
||||
let resultA = compositorA.walk()
|
||||
let resultB = compositorB.walk()
|
||||
XCTAssertEqual(resultA, resultB)
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue