Megrez // 2.5.0 update, syncing changes from MegrezNT.
This commit is contained in:
parent
5bca4abef5
commit
258d2f7362
|
@ -1,5 +1,5 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
import Foundation
|
||||
|
||||
extension Megrez {
|
||||
/// 一個組字器用來在給定一系列的索引鍵的情況下(藉由一系列的觀測行為)返回一套資料值。
|
||||
///
|
||||
|
@ -20,9 +22,9 @@ extension Megrez {
|
|||
public enum ResizeBehavior { case expand, shrink }
|
||||
/// 該軌格內可以允許的最大幅位長度。
|
||||
public static var maxSpanLength: Int = 10 { didSet { maxSpanLength = max(6, maxSpanLength) } }
|
||||
/// 公開:多字讀音鍵當中用以分割漢字讀音的記號的預設值,是「-」。
|
||||
/// 多字讀音鍵當中用以分割漢字讀音的記號的預設值,是「-」。
|
||||
public static var theSeparator: String = "-"
|
||||
/// 該組字器的游標位置。
|
||||
/// 該組字器的敲字游標位置。
|
||||
public var cursor: Int = 0 {
|
||||
didSet {
|
||||
cursor = max(0, min(cursor, length))
|
||||
|
@ -30,30 +32,33 @@ extension Megrez {
|
|||
}
|
||||
}
|
||||
|
||||
/// 該組字器的標記器位置。
|
||||
/// 該組字器的標記器(副游標)位置。
|
||||
public var marker: Int = 0 { didSet { marker = max(0, min(marker, length)) } }
|
||||
/// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為「-」。
|
||||
/// 多字讀音鍵當中用以分割漢字讀音的記號,預設為「-」。
|
||||
public var separator = theSeparator {
|
||||
didSet {
|
||||
Self.theSeparator = separator
|
||||
}
|
||||
}
|
||||
|
||||
/// 公開:組字器內已經插入的單筆索引鍵的數量。
|
||||
public var width: Int { keys.count }
|
||||
/// 公開:最近一次爬軌結果。
|
||||
/// 最近一次爬軌結果。
|
||||
public var walkedNodes: [Node] = []
|
||||
/// 公開:該組字器的長度,也就是內建漢字讀音的數量(唯讀)。
|
||||
/// 該組字器的長度,組字器內已經插入的單筆索引鍵的數量,也就是內建漢字讀音的數量(唯讀)。
|
||||
/// - Remark: 理論上而言,spans.count 也是這個數。
|
||||
/// 但是,為了防止萬一,就用了目前的方法來計算。
|
||||
public var length: Int { keys.count }
|
||||
/// 公開:組字器是否為空。
|
||||
/// 組字器是否為空。
|
||||
public var isEmpty: Bool { spans.isEmpty && keys.isEmpty }
|
||||
|
||||
/// 該組字器的索引鍵陣列。
|
||||
/// 該組字器已經插入的索引鍵,以陣列的形式存放。
|
||||
public private(set) var keys = [String]()
|
||||
/// 該組字器的幅位陣列。
|
||||
public private(set) var spans = [Span]()
|
||||
/// 該組字器的幅位單元陣列。
|
||||
public private(set) var spans = [SpanUnit]()
|
||||
/// 該組字器所使用的語言模型(被 LangModelRanked 所封裝)。
|
||||
public var langModel: LangModelRanked
|
||||
public var langModel: LangModelRanked {
|
||||
didSet { clear() }
|
||||
}
|
||||
|
||||
/// 允許查詢當前游標位置屬於第幾個幅位座標(從 0 開始算)。
|
||||
public private(set) var cursorRegionMap: [Int: Int] = .init()
|
||||
|
||||
|
@ -64,8 +69,13 @@ extension Megrez {
|
|||
self.separator = separator
|
||||
}
|
||||
|
||||
/// 重置包括游標在內的各項參數,且清空各種由組字器生成的內部資料。
|
||||
///
|
||||
/// 將已經被插入的索引鍵陣列與幅位單元陣列(包括其內的節點)全部清空。
|
||||
/// 最近一次的爬軌結果陣列也會被清空。游標跳轉換算表也會被清空。
|
||||
public mutating func clear() {
|
||||
cursor = 0
|
||||
marker = 0
|
||||
keys.removeAll()
|
||||
spans.removeAll()
|
||||
walkedNodes.removeAll()
|
||||
|
@ -107,9 +117,16 @@ extension Megrez {
|
|||
}
|
||||
|
||||
/// 按幅位來前後移動游標。
|
||||
///
|
||||
/// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear),反之則為向前(Front)。
|
||||
/// - Parameters:
|
||||
/// - direction: 移動方向。
|
||||
/// - isMarker: 要移動的是否為選擇標記(而非游標)。
|
||||
/// - direction: 指定移動方向(相對於文字輸入方向而言)。
|
||||
/// - isMarker: 要移動的是否為作為選擇標記的副游標(而非打字用的主游標)。
|
||||
/// 具體用法可以是這樣:你在標記模式下,
|
||||
/// 如果出現了「副游標切了某個字音數量不相等的節點」的情況的話,
|
||||
/// 則直接用這個函式將副游標往前推到接下來的正常的位置上。
|
||||
/// // 該特性不適用於小麥注音,除非小麥注音重新設計 InputState 且修改 KeyHandler、
|
||||
/// 將標記游標交給敝引擎來管理。屆時,NSStringUtils 將徹底卸任。
|
||||
/// - Returns: 該操作是否順利完成。
|
||||
@discardableResult public mutating func jumpCursorBySpan(to direction: TypingDirection, isMarker: Bool = false)
|
||||
-> Bool
|
||||
|
@ -117,7 +134,7 @@ extension Megrez {
|
|||
var target = isMarker ? marker : cursor
|
||||
switch direction {
|
||||
case .front:
|
||||
if target == width { return false }
|
||||
if target == length { return false }
|
||||
case .rear:
|
||||
if target == 0 { return false }
|
||||
}
|
||||
|
@ -152,27 +169,28 @@ extension Megrez {
|
|||
|
||||
/// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。
|
||||
public var dumpDOT: String {
|
||||
var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n"
|
||||
// C# StringBuilder 與 Swift NSMutableString 能提供爆發性的效能。
|
||||
let strOutput: NSMutableString = .init(string: "digraph {\ngraph [ rankdir=LR ];\nBOS;\n")
|
||||
for (p, span) in spans.enumerated() {
|
||||
for ni in 0...(span.maxLength) {
|
||||
guard let np = span.nodeOf(length: ni) else { continue }
|
||||
if p == 0 {
|
||||
strOutput += "BOS -> \(np.value);\n"
|
||||
strOutput.append("BOS -> \(np.value);\n")
|
||||
}
|
||||
strOutput += "\(np.value);\n"
|
||||
strOutput.append("\(np.value);\n")
|
||||
if (p + ni) < spans.count {
|
||||
let destinationSpan = spans[p + ni]
|
||||
for q in 0...(destinationSpan.maxLength) {
|
||||
guard let dn = destinationSpan.nodeOf(length: q) else { continue }
|
||||
strOutput += np.value + " -> " + dn.value + ";\n"
|
||||
strOutput.append(np.value + " -> " + dn.value + ";\n")
|
||||
}
|
||||
}
|
||||
guard (p + ni) == spans.count else { continue }
|
||||
strOutput += np.value + " -> EOS;\n"
|
||||
strOutput.append(np.value + " -> EOS;\n")
|
||||
}
|
||||
}
|
||||
strOutput += "EOS;\n}\n"
|
||||
return strOutput
|
||||
strOutput.append("EOS;\n}\n")
|
||||
return strOutput.description
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -180,9 +198,7 @@ extension Megrez {
|
|||
// MARK: - Internal Methods (Maybe Public)
|
||||
|
||||
extension Megrez.Compositor {
|
||||
// MARK: Internal methods for maintaining the grid.
|
||||
|
||||
/// 在該軌格的指定幅位座標擴增或減少一個幅位。
|
||||
/// 在該軌格的指定幅位座標擴增或減少一個幅位單元。
|
||||
/// - Parameters:
|
||||
/// - location: 給定的幅位座標。
|
||||
/// - action: 指定是擴張還是縮減一個幅位。
|
||||
|
@ -190,7 +206,7 @@ extension Megrez.Compositor {
|
|||
let location = max(min(location, spans.count), 0) // 防呆
|
||||
switch action {
|
||||
case .expand:
|
||||
spans.insert(Span(), at: location)
|
||||
spans.insert(SpanUnit(), at: location)
|
||||
if [0, spans.count].contains(location) { return }
|
||||
case .shrink:
|
||||
if spans.count == location { return }
|
||||
|
@ -241,26 +257,31 @@ extension Megrez.Compositor {
|
|||
}
|
||||
}
|
||||
|
||||
@discardableResult mutating func insertNode(_ node: Node, at location: Int) -> Bool {
|
||||
let location = max(min(location, spans.count - 1), 0) // 防呆
|
||||
spans[location].append(node: node)
|
||||
return true
|
||||
}
|
||||
|
||||
/// 自索引鍵陣列獲取指定範圍的資料。
|
||||
/// - Parameter range: 指定範圍。
|
||||
/// - Returns: 拿到的資料。
|
||||
func getJoinedKeyArray(range: Range<Int>) -> [String] {
|
||||
// 下面這句不能用 contains,不然會要求至少 macOS 13 Ventura。
|
||||
guard range.upperBound <= keys.count, range.lowerBound >= 0 else { return [] }
|
||||
return keys[range].map { String($0) }
|
||||
}
|
||||
|
||||
/// 在指定位置(以指定索引鍵陣列和指定幅位長度)拿取節點。
|
||||
/// - Parameters:
|
||||
/// - location: 指定游標位置。
|
||||
/// - length: 指定幅位長度。
|
||||
/// - keyArray: 指定索引鍵陣列。
|
||||
/// - Returns: 拿取的節點。拿不到的話就會是 nil。
|
||||
func getNode(at location: Int, length: Int, keyArray: [String]) -> Node? {
|
||||
let location = max(min(location, spans.count), 0) // 防呆
|
||||
let location = max(min(location, spans.count - 1), 0) // 防呆
|
||||
guard let node = spans[location].nodeOf(length: length) else { return nil }
|
||||
return keyArray == node.keyArray ? node : nil
|
||||
}
|
||||
|
||||
/// 根據當前狀況更新整個組字器的節點文脈。
|
||||
/// - Returns: 新增了多少節點。如果返回「0」則表示可能發生了錯誤。
|
||||
/// - Parameter updateExisting: 是否根據目前的語言模型的資料狀態來對既有節點更新其內部的單元圖陣列資料。
|
||||
/// 該特性可以用於「在選字窗內屏蔽了某個詞之後,立刻生效」這樣的軟體功能需求的實現。
|
||||
/// - Returns: 新增或影響了多少個節點。如果返回「0」則表示可能發生了錯誤。
|
||||
@discardableResult public mutating func update(updateExisting: Bool = false) -> Int {
|
||||
let maxSpanLength = Megrez.Compositor.maxSpanLength
|
||||
let range = max(0, cursor - maxSpanLength)..<min(cursor + maxSpanLength, keys.count)
|
||||
|
@ -276,16 +297,15 @@ extension Megrez.Compositor {
|
|||
if theNode.keyArray.count == 1 { continue }
|
||||
spans[position].nodes.removeAll { $0 == theNode }
|
||||
} else {
|
||||
theNode.resetUnigrams(using: unigrams)
|
||||
theNode.syncingUnigrams(from: unigrams)
|
||||
}
|
||||
nodesChanged += 1
|
||||
continue
|
||||
}
|
||||
let unigrams = langModel.unigramsFor(keyArray: joinedKeyArray)
|
||||
guard !unigrams.isEmpty else { continue }
|
||||
insertNode(
|
||||
.init(keyArray: joinedKeyArray, spanLength: theLength, unigrams: unigrams),
|
||||
at: position
|
||||
spans[position].append(
|
||||
node: .init(keyArray: joinedKeyArray, spanLength: theLength, unigrams: unigrams)
|
||||
)
|
||||
nodesChanged += 1
|
||||
}
|
||||
|
@ -293,12 +313,13 @@ extension Megrez.Compositor {
|
|||
return nodesChanged
|
||||
}
|
||||
|
||||
mutating func updateCursorJumpingTables(_ walkedNodes: [Node]) {
|
||||
/// 更新游標跳轉換算表。
|
||||
mutating func updateCursorJumpingTables() {
|
||||
var cursorRegionMapDict = [Int: Int]()
|
||||
cursorRegionMapDict[-1] = 0 // 防呆
|
||||
var counter = 0
|
||||
for (i, anchor) in walkedNodes.enumerated() {
|
||||
for _ in 0..<anchor.spanLength {
|
||||
for (i, theNode) in walkedNodes.enumerated() {
|
||||
for _ in 0..<theNode.spanLength {
|
||||
cursorRegionMapDict[counter] = i
|
||||
counter += 1
|
||||
}
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
extension Megrez.Compositor {
|
||||
/// 爬軌函式,會更新當前組字器的 walkedNodes。
|
||||
///
|
||||
/// 找到軌格陣圖內權重最大的路徑。該路徑代表了可被觀測到的最可能的隱藏事件鏈。
|
||||
/// 這裡使用 Cormen 在 2001 年出版的教材當中提出的「有向無環圖的最短路徑」的
|
||||
/// 算法來計算這種路徑。不過,這裡不是要計算距離最短的路徑,而是計算距離最長
|
||||
|
@ -11,23 +13,23 @@ extension Megrez.Compositor {
|
|||
/// 對於 `G = (V, E)`,該算法的運行次數為 `O(|V|+|E|)`,其中 `G` 是一個有向無環圖。
|
||||
/// 這意味著,即使軌格很大,也可以用很少的算力就可以爬軌。
|
||||
/// - Returns: 爬軌結果+該過程是否順利執行。
|
||||
@discardableResult public mutating func walk() -> ([Node], Bool) {
|
||||
@discardableResult public mutating func walk() -> (walkedNode: [Node], succeeded: Bool) {
|
||||
var result = [Node]()
|
||||
defer {
|
||||
walkedNodes = result
|
||||
updateCursorJumpingTables(walkedNodes)
|
||||
updateCursorJumpingTables()
|
||||
}
|
||||
guard !spans.isEmpty else { return (result, true) }
|
||||
|
||||
var vertexSpans = [VertexSpan]()
|
||||
var vertexSpans = [[Vertex]]()
|
||||
for _ in spans {
|
||||
vertexSpans.append(.init())
|
||||
}
|
||||
|
||||
for (i, span) in spans.enumerated() {
|
||||
for j in 1...span.maxLength {
|
||||
if let p = span.nodeOf(length: j) {
|
||||
vertexSpans[i].append(.init(node: p))
|
||||
if let theNode = span.nodeOf(length: j) {
|
||||
vertexSpans[i].append(.init(node: theNode))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -60,15 +62,15 @@ extension Megrez.Compositor {
|
|||
}
|
||||
|
||||
var walked = [Node]()
|
||||
var totalKeyLength = 0
|
||||
var it = terminal
|
||||
while let itPrev = it.prev {
|
||||
var totalLengthOfKeys = 0
|
||||
var iterated = terminal
|
||||
while let itPrev = iterated.prev {
|
||||
walked.append(itPrev.node)
|
||||
it = itPrev
|
||||
totalKeyLength += it.node.spanLength
|
||||
iterated = itPrev
|
||||
totalLengthOfKeys += iterated.node.spanLength
|
||||
}
|
||||
|
||||
guard totalKeyLength == keys.count else {
|
||||
guard totalLengthOfKeys == keys.count else {
|
||||
print("!!! ERROR A")
|
||||
return (result, false)
|
||||
}
|
||||
|
@ -82,26 +84,3 @@ extension Megrez.Compositor {
|
|||
return (result, true)
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Stable Sort Extension
|
||||
|
||||
// Reference: https://stackoverflow.com/a/50545761/4162914
|
||||
|
||||
extension Sequence {
|
||||
/// Return a stable-sorted collection.
|
||||
///
|
||||
/// - Parameter areInIncreasingOrder: Return nil when two element are equal.
|
||||
/// - Returns: The sorted collection.
|
||||
fileprivate func stableSorted(
|
||||
by areInIncreasingOrder: (Element, Element) throws -> Bool
|
||||
)
|
||||
rethrows -> [Element]
|
||||
{
|
||||
try enumerated()
|
||||
.sorted { a, b -> Bool in
|
||||
try areInIncreasingOrder(a.element, b.element)
|
||||
|| (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element))
|
||||
}
|
||||
.map(\.element)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
import Foundation
|
||||
|
||||
extension Megrez.Compositor {
|
||||
/// 鍵值配對,乃索引鍵陣列與讀音的配對單元。
|
||||
public struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible {
|
||||
/// 鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
public var keyArray: [String]
|
||||
/// 資料值。
|
||||
public var value: String
|
||||
|
@ -20,7 +21,7 @@ extension Megrez.Compositor {
|
|||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameters:
|
||||
/// - key: 鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - keyArray: 索引鍵陣列。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - value: 資料值。
|
||||
public init(keyArray: [String], value: String = "N/A") {
|
||||
self.keyArray = keyArray.isEmpty ? ["N/A"] : keyArray
|
||||
|
@ -29,13 +30,15 @@ extension Megrez.Compositor {
|
|||
|
||||
/// 初期化一組鍵值配對。
|
||||
/// - Parameters:
|
||||
/// - key: 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - key: 索引鍵。一般情況下用來放置讀音等可以用來作為索引的內容。
|
||||
/// - value: 資料值。
|
||||
public init(key: String = "N/A", value: String = "N/A") {
|
||||
keyArray = key.isEmpty ? ["N/A"] : key.components(separatedBy: Megrez.Compositor.theSeparator)
|
||||
self.value = value.isEmpty ? "N/A" : value
|
||||
}
|
||||
|
||||
/// 做為預設雜湊函式。
|
||||
/// - Parameter hasher: 目前物件的雜湊碼。
|
||||
public func hash(into hasher: inout Hasher) {
|
||||
hasher.combine(keyArray)
|
||||
hasher.combine(value)
|
||||
|
@ -50,26 +53,30 @@ extension Megrez.Compositor {
|
|||
}
|
||||
|
||||
public static func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
(lhs.keyArray.joined().count < rhs.keyArray.joined().count)
|
||||
|| (lhs.keyArray.joined().count == rhs.keyArray.joined().count && lhs.value < rhs.value)
|
||||
(lhs.keyArray.count < rhs.keyArray.count)
|
||||
|| (lhs.keyArray.count == rhs.keyArray.count && lhs.value < rhs.value)
|
||||
}
|
||||
|
||||
public static func > (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
(lhs.keyArray.joined().count > rhs.keyArray.joined().count)
|
||||
|| (lhs.keyArray.joined().count == rhs.keyArray.joined().count && lhs.value > rhs.value)
|
||||
(lhs.keyArray.count > rhs.keyArray.count)
|
||||
|| (lhs.keyArray.count == rhs.keyArray.count && lhs.value > rhs.value)
|
||||
}
|
||||
|
||||
public static func <= (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
(lhs.keyArray.joined().count <= rhs.keyArray.joined().count)
|
||||
|| (lhs.keyArray.joined().count == rhs.keyArray.joined().count && lhs.value <= rhs.value)
|
||||
(lhs.keyArray.count <= rhs.keyArray.count)
|
||||
|| (lhs.keyArray.count == rhs.keyArray.count && lhs.value <= rhs.value)
|
||||
}
|
||||
|
||||
public static func >= (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
|
||||
(lhs.keyArray.joined().count >= rhs.keyArray.joined().count)
|
||||
|| (lhs.keyArray.joined().count == rhs.keyArray.joined().count && lhs.value >= rhs.value)
|
||||
(lhs.keyArray.count >= rhs.keyArray.count)
|
||||
|| (lhs.keyArray.count == rhs.keyArray.count && lhs.value >= rhs.value)
|
||||
}
|
||||
}
|
||||
|
||||
/// 規定候選字陣列內容的獲取範圍類型:
|
||||
/// - all: 不只包含其它兩類結果,還允許游標穿插候選字。
|
||||
/// - beginAt: 僅獲取從當前游標位置開始的節點內的候選字。
|
||||
/// - endAt: 僅獲取在當前游標位置結束的節點內的候選字。
|
||||
public enum CandidateFetchFilter { case all, beginAt, endAt }
|
||||
|
||||
/// 返回在當前位置的所有候選字詞(以詞音配對的形式)。如果組字器內有幅位、且游標
|
||||
|
@ -82,12 +89,12 @@ extension Megrez.Compositor {
|
|||
guard !keys.isEmpty else { return result }
|
||||
let location = max(min(location, keys.count - 1), 0) // 防呆
|
||||
let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted {
|
||||
// 按照讀音的長度來給節點排序。
|
||||
// 按照讀音的長度(幅位長度)來給節點排序。
|
||||
$0.spanLength > $1.spanLength
|
||||
}
|
||||
let keyAtCursor = keys[location]
|
||||
for theNode in anchors.map(\.node) {
|
||||
if theNode.keyArray.joined(separator: separator).isEmpty { continue }
|
||||
if theNode.keyArray.isEmpty { continue }
|
||||
for gram in theNode.unigrams {
|
||||
switch filter {
|
||||
case .all:
|
||||
|
@ -106,9 +113,9 @@ extension Megrez.Compositor {
|
|||
|
||||
/// 使用給定的候選字(詞音配對),將給定位置的節點的候選字詞改為與之一致的候選字詞。
|
||||
///
|
||||
/// 該函式可以僅用作過程函式。
|
||||
/// 該函式僅用作過程函式。
|
||||
/// - Parameters:
|
||||
/// - candidate: 指定用來覆寫為的候選字(詞音配對)。
|
||||
/// - candidate: 指定用來覆寫為的候選字(詞音鍵值配對)。
|
||||
/// - location: 游標位置。
|
||||
/// - overrideType: 指定覆寫行為。
|
||||
/// - Returns: 該操作是否成功執行。
|
||||
|
@ -139,7 +146,7 @@ extension Megrez.Compositor {
|
|||
|
||||
/// 使用給定的候選字(詞音配對)、或給定的候選字詞字串,將給定位置的節點的候選字詞改為與之一致的候選字詞。
|
||||
/// - Parameters:
|
||||
/// - key: 索引鍵,也就是詞音配對當中的讀音。
|
||||
/// - keyArray: 索引鍵陣列,也就是詞音配對當中的讀音。
|
||||
/// - location: 游標位置。
|
||||
/// - value: 資料值。
|
||||
/// - type: 指定覆寫行為。
|
||||
|
@ -151,16 +158,11 @@ extension Megrez.Compositor {
|
|||
var arrOverlappedNodes: [NodeAnchor] = fetchOverlappingNodes(at: min(keys.count - 1, location))
|
||||
var overridden: NodeAnchor?
|
||||
for anchor in arrOverlappedNodes {
|
||||
if let keyArray = keyArray,
|
||||
anchor.node.keyArray.joined(separator: separator) != keyArray.joined(separator: separator)
|
||||
{
|
||||
continue
|
||||
}
|
||||
if anchor.node.selectOverrideUnigram(value: value, type: type) {
|
||||
if keyArray != nil, anchor.node.keyArray != keyArray { continue }
|
||||
if !anchor.node.selectOverrideUnigram(value: value, type: type) { continue }
|
||||
overridden = anchor
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
guard let overridden = overridden else { return false } // 啥也不覆寫。
|
||||
|
||||
|
@ -171,8 +173,8 @@ extension Megrez.Compositor {
|
|||
arrOverlappedNodes = fetchOverlappingNodes(at: i)
|
||||
for anchor in arrOverlappedNodes {
|
||||
if anchor.node == overridden.node { continue }
|
||||
if !overridden.node.keyArray.joined(separator: separator).contains(
|
||||
anchor.node.keyArray.joined(separator: separator)) || !overridden.node.value.contains(anchor.node.value)
|
||||
if !overridden.node.joinedKey(by: "\t").contains(anchor.node.joinedKey(by: "\t"))
|
||||
|| !overridden.node.value.contains(anchor.node.value)
|
||||
{
|
||||
anchor.node.reset()
|
||||
continue
|
||||
|
|
|
@ -1,18 +1,28 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
extension Megrez.Compositor {
|
||||
/// 幅位乃指一組共享起點的節點。
|
||||
public class Span {
|
||||
/// 幅位單元乃指一組共享起點的節點。
|
||||
public class SpanUnit {
|
||||
/// 節點陣列。每個位置上的節點可能是 nil。
|
||||
public var nodes: [Node?] = []
|
||||
/// 該幅位單元內的所有節點當中持有最長幅位的節點長度。
|
||||
/// 該變數受該幅位的自身操作函式而被動更新。
|
||||
public private(set) var maxLength = 0
|
||||
|
||||
/// (該變數為捷徑,代傳 Megrez.Compositor.maxSpanLength。)
|
||||
private var maxSpanLength: Int { Megrez.Compositor.maxSpanLength }
|
||||
/// 該幅位單元內的節點的幅位長度上限。
|
||||
private var allowedLengths: ClosedRange<Int> { 1...maxSpanLength }
|
||||
|
||||
/// 幅位乃指一組共享起點的節點。
|
||||
public init() {
|
||||
clear()
|
||||
}
|
||||
|
||||
/// 清除該幅位單元的全部的節點,且重設最長節點長度為 0,然後再在節點陣列內預留空位。
|
||||
public func clear() {
|
||||
nodes.removeAll()
|
||||
for _ in 0..<maxSpanLength {
|
||||
|
@ -25,7 +35,7 @@ extension Megrez.Compositor {
|
|||
/// - Parameter node: 要塞入的節點。
|
||||
/// - Returns: 該操作是否成功執行。
|
||||
@discardableResult public func append(node: Node) -> Bool {
|
||||
guard (1...maxSpanLength).contains(node.spanLength) else {
|
||||
guard allowedLengths.contains(node.spanLength) else {
|
||||
return false
|
||||
}
|
||||
nodes[node.spanLength - 1] = node
|
||||
|
@ -37,7 +47,7 @@ extension Megrez.Compositor {
|
|||
/// - Parameter length: 給定的幅位長度。
|
||||
/// - Returns: 該操作是否成功執行。
|
||||
@discardableResult public func dropNodesOfOrBeyond(length: Int) -> Bool {
|
||||
guard (1...maxSpanLength).contains(length) else {
|
||||
guard allowedLengths.contains(length) else {
|
||||
return false
|
||||
}
|
||||
for i in length...maxSpanLength {
|
||||
|
@ -47,16 +57,18 @@ extension Megrez.Compositor {
|
|||
guard length > 1 else { return false }
|
||||
let maxR = length - 2
|
||||
for i in 0...maxR {
|
||||
if nodes[maxR - i] != nil {
|
||||
if nodes[maxR - i] == nil { continue }
|
||||
maxLength = maxR - i + 1
|
||||
break
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
/// 以給定的幅位長度,在當前幅位單元內找出對應的節點。
|
||||
/// - Parameter length: 給定的幅位長度。
|
||||
/// - Returns: 查詢結果。
|
||||
public func nodeOf(length: Int) -> Node? {
|
||||
guard (1...maxSpanLength).contains(length) else { return nil }
|
||||
guard allowedLengths.contains(length) else { return nil }
|
||||
return nodes[length - 1]
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
|
@ -19,7 +19,13 @@ extension Megrez.Compositor {
|
|||
public var distance = -(Double.infinity)
|
||||
/// 在進行位相幾何排序時會用到的狀態標記。
|
||||
public var topologicallySorted = false
|
||||
/// 字詞節點。
|
||||
public var node: Node
|
||||
|
||||
/// 初期化一個「有向無環圖」的頂點單位。
|
||||
///
|
||||
/// 這是一個可變的數據結構,用於有向無環圖的構建和單源最短路徑的計算。
|
||||
/// - Parameter node: 字詞節點。
|
||||
public init(node: Node) {
|
||||
self.node = node
|
||||
}
|
||||
|
@ -32,18 +38,15 @@ extension Megrez.Compositor {
|
|||
/// - u: 參照頂點,會在必要時成為 v 的前述頂點。
|
||||
/// - v: 要影響的頂點。
|
||||
func relax(u: Vertex, v: inout Vertex) {
|
||||
/// 從 u 到 w 的距離,也就是 v 的權重。
|
||||
// 從 u 到 w 的距離,也就是 v 的權重。
|
||||
let w: Double = v.node.score
|
||||
/// 這裡計算最大權重:
|
||||
/// 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」,
|
||||
/// 我們就更新 v 的距離及其前述頂點。
|
||||
if v.distance < u.distance + w {
|
||||
// 這裡計算最大權重:
|
||||
// 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」,
|
||||
// 我們就更新 v 的距離及其前述頂點。
|
||||
if v.distance >= u.distance + w { return }
|
||||
v.distance = u.distance + w
|
||||
v.prev = u
|
||||
}
|
||||
}
|
||||
|
||||
typealias VertexSpan = [Vertex]
|
||||
|
||||
/// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological
|
||||
/// sort)、且將排序結果以頂點陣列的形式給出。
|
||||
|
@ -61,13 +64,13 @@ extension Megrez.Compositor {
|
|||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
/// 至於遞迴版本則類似於 Cormen 在 2001 年的著作「Introduction to Algorithms」當中的樣子。
|
||||
/// 至於其遞迴版本,則類似於 Cormen 在 2001 年的著作「Introduction to Algorithms」當中的樣子。
|
||||
/// - Parameter root: 根頂點。
|
||||
/// - Returns: 排序結果(頂點陣列)。
|
||||
func topologicalSort(root: Vertex) -> [Vertex] {
|
||||
class State {
|
||||
var iterIndex: Int
|
||||
var vertex: Vertex
|
||||
let vertex: Vertex
|
||||
init(vertex: Vertex, iterIndex: Int = 0) {
|
||||
self.vertex = vertex
|
||||
self.iterIndex = iterIndex
|
||||
|
|
|
@ -1,14 +1,16 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
extension Megrez.Compositor {
|
||||
/// 字詞節點。
|
||||
///
|
||||
/// 一個節點由這些內容組成:幅位長度、索引鍵、以及一組單元圖。幅位長度就是指這個
|
||||
/// 節點在組字器內橫跨了多少個字長。組字器負責構築自身的節點。對於由多個漢字組成
|
||||
/// 的詞,組字器會將多個讀音索引鍵合併為一個讀音索引鍵、據此向語言模組請求對應的
|
||||
/// 單元圖結果陣列。舉例說,如果一個詞有兩個漢字組成的話,那麼讀音也是有兩個、其
|
||||
/// 索引鍵值也是由兩個讀音組成的,那麼這個節點的幅位長度就是 2。
|
||||
/// 索引鍵也是由兩個讀音組成的,那麼這個節點的幅位長度就是 2。
|
||||
public class Node: Equatable, Hashable {
|
||||
/// 三種不同的針對一個節點的覆寫行為。
|
||||
/// - withNoOverrides: 無覆寫行為。
|
||||
|
@ -17,7 +19,7 @@ extension Megrez.Compositor {
|
|||
/// [("a", -114), ("b", -514), ("c", -1919)] 的話,指定該覆寫行為則會導致該節
|
||||
/// 點返回的結果為 ("c", -114)。該覆寫行為多用於諸如使用者半衰記憶模組的建議
|
||||
/// 行為。被覆寫的這個節點的狀態可能不會再被爬軌行為擅自改回。該覆寫行為無法
|
||||
/// 防止其它節點被爬軌函式所支配。這種情況下就需要用到 overridingScore
|
||||
/// 防止其它節點被爬軌函式所支配。這種情況下就需要用到 overridingScore。
|
||||
/// - withHighScore: 將該節點權重覆寫為 overridingScore,使其被爬軌函式所青睞。
|
||||
public enum OverrideType: Int {
|
||||
case withNoOverrides = 0
|
||||
|
@ -36,84 +38,109 @@ extension Megrez.Compositor {
|
|||
|
||||
// public var key: String { keyArray.joined(separator: Megrez.Compositor.theSeparator) }
|
||||
|
||||
/// 索引鍵陣列。
|
||||
public private(set) var keyArray: [String]
|
||||
/// 幅位長度。
|
||||
public private(set) var spanLength: Int
|
||||
/// 單元圖陣列。
|
||||
public private(set) var unigrams: [Megrez.Unigram]
|
||||
/// 該節點目前的覆寫狀態種類。
|
||||
public private(set) var currentOverrideType: Node.OverrideType
|
||||
/// 當前該節點所指向的(單元圖陣列內的)單元圖索引位置。
|
||||
public private(set) var currentUnigramIndex: Int = 0 {
|
||||
didSet { currentUnigramIndex = max(min(unigrams.count - 1, currentUnigramIndex), 0) }
|
||||
}
|
||||
|
||||
/// 該節點當前狀態所展示的鍵值配對。
|
||||
public var currentPair: Megrez.Compositor.KeyValuePaired { .init(keyArray: keyArray, value: value) }
|
||||
|
||||
/// 做為預設雜湊函式。
|
||||
/// - Parameter hasher: 目前物件的雜湊碼。
|
||||
public func hash(into hasher: inout Hasher) {
|
||||
hasher.combine(keyArray)
|
||||
hasher.combine(spanLength)
|
||||
hasher.combine(unigrams)
|
||||
hasher.combine(currentUnigramIndex)
|
||||
hasher.combine(spanLength)
|
||||
hasher.combine(overrideType)
|
||||
hasher.combine(currentOverrideType)
|
||||
}
|
||||
|
||||
/// 置換掉該節點內的單元圖陣列資料。
|
||||
/// 如果此時影響到了 currentUnigramIndex 所指的內容的話,則將其重設為 0。
|
||||
/// - Parameter source: 新的單元圖陣列資料,必須不能為空(否則必定崩潰)。
|
||||
public func resetUnigrams(using source: [Megrez.Unigram]) {
|
||||
let oldCurrentValue = unigrams[currentUnigramIndex].value
|
||||
unigrams = source
|
||||
// if unigrams.isEmpty { unigrams.append(.init(value: key, score: -114.514)) } // 保險,請按需啟用。
|
||||
currentUnigramIndex = max(min(unigrams.count - 1, currentUnigramIndex), 0)
|
||||
let newCurrentValue = unigrams[currentUnigramIndex].value
|
||||
if oldCurrentValue != newCurrentValue { currentUnigramIndex = 0 }
|
||||
}
|
||||
|
||||
public private(set) var overrideType: Node.OverrideType
|
||||
|
||||
public static func == (lhs: Node, rhs: Node) -> Bool {
|
||||
lhs.keyArray == rhs.keyArray && lhs.spanLength == rhs.spanLength
|
||||
&& lhs.unigrams == rhs.unigrams && lhs.overrideType == rhs.overrideType
|
||||
&& lhs.unigrams == rhs.unigrams && lhs.currentOverrideType == rhs.currentOverrideType
|
||||
}
|
||||
|
||||
/// 生成一個字詞節點。
|
||||
///
|
||||
/// 一個節點由這些內容組成:幅位長度、索引鍵、以及一組單元圖。幅位長度就是指這個
|
||||
/// 節點在組字器內橫跨了多少個字長。組字器負責構築自身的節點。對於由多個漢字組成
|
||||
/// 的詞,組字器會將多個讀音索引鍵合併為一個讀音索引鍵、據此向語言模組請求對應的
|
||||
/// 單元圖結果陣列。舉例說,如果一個詞有兩個漢字組成的話,那麼讀音也是有兩個、其
|
||||
/// 索引鍵也是由兩個讀音組成的,那麼這個節點的幅位長度就是 2。
|
||||
/// - Parameters:
|
||||
/// - keyArray: 給定索引鍵陣列,不得為空。
|
||||
/// - spanLength: 給定幅位長度,一般情況下與給定索引鍵陣列內的索引鍵數量一致。
|
||||
/// - unigrams: 給定單元圖陣列,不得為空。
|
||||
public init(keyArray: [String] = [], spanLength: Int = 0, unigrams: [Megrez.Unigram] = []) {
|
||||
self.keyArray = keyArray
|
||||
self.spanLength = spanLength
|
||||
self.spanLength = max(spanLength, 0)
|
||||
self.unigrams = unigrams
|
||||
overrideType = .withNoOverrides
|
||||
currentOverrideType = .withNoOverrides
|
||||
}
|
||||
|
||||
/// 檢查當前節點是否「讀音字長與候選字字長不一致」。
|
||||
public var isReadingMismatched: Bool {
|
||||
keyArray.count != value.count
|
||||
}
|
||||
public var isReadingMismatched: Bool { keyArray.count != value.count }
|
||||
/// 該節點是否處於被覆寫的狀態。
|
||||
public var isOverridden: Bool { currentOverrideType != .withNoOverrides }
|
||||
|
||||
/// 給出目前的最高權重單元圖。該結果可能會受節點覆寫狀態所影響。
|
||||
/// 給出該節點內部單元圖陣列內目前被索引位置所指向的單元圖。
|
||||
public var currentUnigram: Megrez.Unigram {
|
||||
unigrams.isEmpty ? .init() : unigrams[currentUnigramIndex]
|
||||
}
|
||||
|
||||
/// 給出該節點內部單元圖陣列內目前被索引位置所指向的單元圖的資料值。
|
||||
public var value: String { currentUnigram.value }
|
||||
|
||||
/// 給出目前的最高權重單元圖當中的權重值。該結果可能會受節點覆寫狀態所影響。
|
||||
public var score: Double {
|
||||
guard !unigrams.isEmpty else { return 0 }
|
||||
switch overrideType {
|
||||
switch currentOverrideType {
|
||||
case .withHighScore: return overridingScore
|
||||
case .withTopUnigramScore: return unigrams[0].score
|
||||
default: return currentUnigram.score
|
||||
}
|
||||
}
|
||||
|
||||
public var isOverriden: Bool {
|
||||
overrideType != .withNoOverrides
|
||||
}
|
||||
|
||||
/// 重設該節點的覆寫狀態、及其內部的單元圖索引位置指向。
|
||||
public func reset() {
|
||||
currentUnigramIndex = 0
|
||||
overrideType = .withNoOverrides
|
||||
currentOverrideType = .withNoOverrides
|
||||
}
|
||||
|
||||
/// 將索引鍵按照給定的分隔符銜接成一個字串。
|
||||
/// - Parameter separator: 給定的分隔符,預設值為 Compositor.theSeparator。
|
||||
/// - Returns: 已經銜接完畢的字串。
|
||||
public func joinedKey(by separator: String = Megrez.Compositor.theSeparator) -> String {
|
||||
keyArray.joined(separator: separator)
|
||||
}
|
||||
|
||||
/// 置換掉該節點內的單元圖陣列資料。
|
||||
/// 如果此時影響到了 currentUnigramIndex 所指的內容的話,則將其重設為 0。
|
||||
/// - Parameter source: 新的單元圖陣列資料,必須不能為空(否則必定崩潰)。
|
||||
public func syncingUnigrams(from source: [Megrez.Unigram]) {
|
||||
let oldCurrentValue = unigrams[currentUnigramIndex].value
|
||||
unigrams = source
|
||||
// if unigrams.isEmpty { unigrams.append(.init(value: key, score: -114.514)) } // 保險,請按需啟用。
|
||||
currentUnigramIndex = max(min(unigrams.count - 1, currentUnigramIndex), 0)
|
||||
let newCurrentValue = unigrams[currentUnigramIndex].value
|
||||
if oldCurrentValue != newCurrentValue { reset() }
|
||||
}
|
||||
|
||||
/// 指定要覆寫的單元圖資料值、以及覆寫行為種類。
|
||||
/// - Parameters:
|
||||
/// - value: 給定的單元圖資料值。
|
||||
/// - type: 覆寫行為種類。
|
||||
/// - Returns: 操作是否順利完成。
|
||||
public func selectOverrideUnigram(value: String, type: Node.OverrideType) -> Bool {
|
||||
guard type != .withNoOverrides else {
|
||||
return false
|
||||
|
@ -121,7 +148,7 @@ extension Megrez.Compositor {
|
|||
for (i, gram) in unigrams.enumerated() {
|
||||
if value != gram.value { continue }
|
||||
currentUnigramIndex = i
|
||||
overrideType = type
|
||||
currentOverrideType = type
|
||||
return true
|
||||
}
|
||||
return false
|
||||
|
@ -130,18 +157,23 @@ extension Megrez.Compositor {
|
|||
}
|
||||
|
||||
extension Megrez.Compositor {
|
||||
/// 節錨。
|
||||
///
|
||||
/// 在 Gramambular 當中又被稱為「NodeInSpan」。
|
||||
/// 節錨。在 Gramambular 2 當中又被稱為「NodeInSpan」。
|
||||
public struct NodeAnchor: Hashable {
|
||||
/// 節點。
|
||||
let node: Megrez.Compositor.Node
|
||||
let spanIndex: Int // 幅位座標
|
||||
/// 幅位座標。
|
||||
let spanIndex: Int
|
||||
/// 幅位長度。
|
||||
var spanLength: Int { node.spanLength }
|
||||
/// 單元圖陣列。
|
||||
var unigrams: [Megrez.Unigram] { node.unigrams }
|
||||
/// 索引鍵陣列。
|
||||
var keyArray: [String] { node.keyArray }
|
||||
/// 給出該節點內部單元圖陣列內目前被索引位置所指向的單元圖的資料值。
|
||||
var value: String { node.value }
|
||||
|
||||
/// 將該節錨雜湊化。
|
||||
/// 做為預設雜湊函式。
|
||||
/// - Parameter hasher: 目前物件的雜湊碼。
|
||||
public func hash(into hasher: inout Hasher) {
|
||||
hasher.combine(node)
|
||||
hasher.combine(spanIndex)
|
||||
|
@ -152,7 +184,7 @@ extension Megrez.Compositor {
|
|||
// MARK: - Array Extensions.
|
||||
|
||||
extension Array where Element == Megrez.Compositor.Node {
|
||||
/// 從一個節點陣列當中取出目前的自動選字字串陣列。
|
||||
/// 從一個節點陣列當中取出目前的選字字串陣列。
|
||||
public var values: [String] { map(\.value) }
|
||||
|
||||
/// 從一個節點陣列當中取出目前的索引鍵陣列。
|
||||
|
@ -163,7 +195,7 @@ extension Array where Element == Megrez.Compositor.Node {
|
|||
/// 從一個節點陣列當中取出目前的索引鍵陣列。
|
||||
public var keyArrays: [[String]] { map(\.keyArray) }
|
||||
|
||||
/// 返回一連串的節點起點。結果為 (Result A, Result B) 辭典陣列
|
||||
/// 返回一連串的節點起點。結果為 (Result A, Result B) 辭典陣列。
|
||||
/// Result A 以索引查座標,Result B 以座標查索引。
|
||||
public var nodeBorderPointDictPair: ([Int: Int], [Int: Int]) {
|
||||
// Result A 以索引查座標,Result B 以座標查索引。
|
||||
|
@ -182,7 +214,7 @@ extension Array where Element == Megrez.Compositor.Node {
|
|||
return (resultA, resultB)
|
||||
}
|
||||
|
||||
/// 總讀音單元數量,也就是總幅位長度。
|
||||
/// 總讀音單元數量。在絕大多數情況下,可視為總幅位長度。
|
||||
public var totalKeyCount: Int { map(\.keyArray.count).reduce(0, +) }
|
||||
|
||||
/// 根據給定的游標,返回其前後最近的邊界點。
|
||||
|
@ -194,9 +226,10 @@ extension Array where Element == Megrez.Compositor.Node {
|
|||
if cursor >= totalKeyCount { return nilReturn } // 防呆
|
||||
let cursor = Swift.max(0, cursor) // 防呆
|
||||
nilReturn = cursor..<cursor
|
||||
guard let rearNodeID = nodeBorderPointDictPair.1[cursor] else { return nilReturn } // 應該不會出現 nilReturn
|
||||
guard let rearIndex = nodeBorderPointDictPair.0[rearNodeID] else { return nilReturn } // 應該不會出現 nilReturn
|
||||
guard let frontIndex = nodeBorderPointDictPair.0[rearNodeID + 1] else { return nilReturn } // 應該不會出現 nilReturn
|
||||
// 下文按道理來講不應該會出現 nilReturn。
|
||||
guard let rearNodeID = nodeBorderPointDictPair.1[cursor] else { return nilReturn }
|
||||
guard let rearIndex = nodeBorderPointDictPair.0[rearNodeID] else { return nilReturn }
|
||||
guard let frontIndex = nodeBorderPointDictPair.0[rearNodeID + 1] else { return nilReturn }
|
||||
return rearIndex..<frontIndex
|
||||
}
|
||||
|
||||
|
@ -207,7 +240,7 @@ extension Array where Element == Megrez.Compositor.Node {
|
|||
/// - Returns: 查找結果。
|
||||
public func findNode(at cursor: Int, target outCursorPastNode: inout Int) -> Megrez.Compositor.Node? {
|
||||
guard !isEmpty else { return nil }
|
||||
let cursor = Swift.min(Swift.max(0, cursor), totalKeyCount - 1) // 防呆
|
||||
let cursor = Swift.max(0, Swift.min(cursor, totalKeyCount - 1)) // 防呆
|
||||
let range = contextRange(ofGivenCursor: cursor)
|
||||
outCursorPastNode = range.upperBound
|
||||
guard let rearNodeID = nodeBorderPointDictPair.1[cursor] else { return nil }
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
/// Language model protocol.
|
||||
public protocol LangModelProtocol {
|
||||
/// Given a key array, asks the language model for an array of unigrams.
|
||||
/// Given an index-key array, asks the language model for an array of unigrams.
|
||||
func unigramsFor(keyArray: [String]) -> [Megrez.Unigram]
|
||||
/// Given a key array, checks whether any unigram records are on file.
|
||||
/// Checks, based on the given index keys, whether corresponding data exists in the underlying data arrays.
|
||||
func hasUnigramsFor(keyArray: [String]) -> Bool
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
extension Megrez {
|
||||
/// 單元圖。
|
||||
@frozen public struct Unigram: Equatable, CustomStringConvertible, Hashable {
|
||||
/// 鍵值。
|
||||
/// 資料值,通常是詞語或單個字。
|
||||
public var value: String
|
||||
/// 權重。
|
||||
public var score: Double
|
||||
|
@ -15,15 +15,17 @@ extension Megrez {
|
|||
"(" + value.description + "," + String(score) + ")"
|
||||
}
|
||||
|
||||
/// 初期化一筆「單元圖」。一筆單元圖由一組鍵值配對與一筆權重數值組成。
|
||||
/// 初期化一筆「單元圖」。一筆單元圖由一筆資料值與一筆權重數值組成。
|
||||
/// - Parameters:
|
||||
/// - value: 鍵值。
|
||||
/// - value: 資料值。
|
||||
/// - score: 權重(雙精度小數)。
|
||||
public init(value: String = "", score: Double = 0) {
|
||||
self.value = value
|
||||
self.score = score
|
||||
}
|
||||
|
||||
/// 做為預設雜湊函式。
|
||||
/// - Parameter hasher: 目前物件的雜湊碼。
|
||||
public func hash(into hasher: inout Hasher) {
|
||||
hasher.combine(value)
|
||||
hasher.combine(score)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
|
||||
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
|
||||
// ====================
|
||||
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||
|
||||
|
@ -9,9 +9,9 @@ import XCTest
|
|||
@testable import Megrez
|
||||
|
||||
final class MegrezTests: XCTestCase {
|
||||
func testSpan() throws {
|
||||
func test01_Span() throws {
|
||||
let langModel = SimpleLM(input: strSampleData)
|
||||
let span = Megrez.Compositor.Span()
|
||||
let span = Megrez.Compositor.SpanUnit()
|
||||
let n1 = Megrez.Compositor.Node(
|
||||
keyArray: ["gao1"], spanLength: 1, unigrams: langModel.unigramsFor(keyArray: ["gao1"])
|
||||
)
|
||||
|
@ -50,11 +50,11 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertNil(span.nodeOf(length: Megrez.Compositor.maxSpanLength + 1))
|
||||
}
|
||||
|
||||
func testRankedLangModel() throws {
|
||||
func test02_RankedLangModel() throws {
|
||||
class TestLM: LangModelProtocol {
|
||||
func hasUnigramsFor(keyArray: [String]) -> Bool { keyArray == ["foo"] }
|
||||
func hasUnigramsFor(keyArray: [String]) -> Bool { keyArray.joined() == "foo" }
|
||||
func unigramsFor(keyArray: [String]) -> [Megrez.Unigram] {
|
||||
keyArray == ["foo"]
|
||||
keyArray.joined() == "foo"
|
||||
? [.init(value: "middle", score: -5), .init(value: "highest", score: -2), .init(value: "lowest", score: -10)]
|
||||
: .init()
|
||||
}
|
||||
|
@ -74,7 +74,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(unigrams[2].score, -10)
|
||||
}
|
||||
|
||||
func testCompositor_BasicTests() throws {
|
||||
func test03_Compositor_BasicTests() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
XCTAssertEqual(compositor.separator, Megrez.Compositor.theSeparator)
|
||||
XCTAssertEqual(compositor.cursor, 0)
|
||||
|
@ -93,11 +93,11 @@ final class MegrezTests: XCTestCase {
|
|||
|
||||
compositor.dropKey(direction: .rear)
|
||||
XCTAssertEqual(compositor.cursor, 0)
|
||||
XCTAssertEqual(compositor.cursor, 0)
|
||||
XCTAssertEqual(compositor.length, 0)
|
||||
XCTAssertEqual(compositor.spans.count, 0)
|
||||
}
|
||||
|
||||
func testCompositor_InvalidOperations() throws {
|
||||
func test04_Compositor_InvalidOperations() throws {
|
||||
class TestLM: LangModelProtocol {
|
||||
func hasUnigramsFor(keyArray: [String]) -> Bool { keyArray == ["foo"] }
|
||||
func unigramsFor(keyArray: [String]) -> [Megrez.Unigram] {
|
||||
|
@ -122,7 +122,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.length, 0)
|
||||
}
|
||||
|
||||
func testCompositor_DeleteToTheFrontOfCursor() throws {
|
||||
func test05_Compositor_DeleteToTheFrontOfCursor() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.insertKey("a")
|
||||
compositor.cursor = 0
|
||||
|
@ -132,13 +132,14 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertFalse(compositor.dropKey(direction: .rear))
|
||||
XCTAssertEqual(compositor.cursor, 0)
|
||||
XCTAssertEqual(compositor.length, 1)
|
||||
XCTAssertEqual(compositor.spans.count, 1)
|
||||
XCTAssertTrue(compositor.dropKey(direction: .front))
|
||||
XCTAssertEqual(compositor.cursor, 0)
|
||||
XCTAssertEqual(compositor.length, 0)
|
||||
XCTAssertEqual(compositor.spans.count, 0)
|
||||
}
|
||||
|
||||
func testCompositor_MultipleSpans() throws {
|
||||
func test06_Compositor_MultipleSpans() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ";"
|
||||
compositor.insertKey("a")
|
||||
|
@ -158,7 +159,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[2].nodeOf(length: 1)?.keyArray.joined(separator: compositor.separator), "c")
|
||||
}
|
||||
|
||||
func testCompositor_SpanDeletionFromFront() throws {
|
||||
func test07_Compositor_SpanDeletionFromFront() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ";"
|
||||
compositor.insertKey("a")
|
||||
|
@ -176,7 +177,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[1].nodeOf(length: 1)?.keyArray.joined(separator: compositor.separator), "b")
|
||||
}
|
||||
|
||||
func testCompositor_SpanDeletionFromMiddle() throws {
|
||||
func test08_Compositor_SpanDeletionFromMiddle() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ";"
|
||||
compositor.insertKey("a")
|
||||
|
@ -211,7 +212,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[1].nodeOf(length: 1)?.keyArray.joined(separator: compositor.separator), "c")
|
||||
}
|
||||
|
||||
func testCompositor_SpanDeletionFromRear() throws {
|
||||
func test09_Compositor_SpanDeletionFromRear() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ";"
|
||||
compositor.insertKey("a")
|
||||
|
@ -231,7 +232,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[1].nodeOf(length: 1)?.keyArray.joined(separator: compositor.separator), "c")
|
||||
}
|
||||
|
||||
func testCompositor_SpanInsertion() throws {
|
||||
func test10_Compositor_SpanInsertion() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ";"
|
||||
compositor.insertKey("a")
|
||||
|
@ -259,7 +260,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[3].nodeOf(length: 1)?.keyArray.joined(separator: compositor.separator), "c")
|
||||
}
|
||||
|
||||
func testCompositor_LongGridDeletion() throws {
|
||||
func test11_Compositor_LongGridDeletion() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("a")
|
||||
|
@ -294,7 +295,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[8].nodeOf(length: 5)?.keyArray.joined(separator: compositor.separator), "jklmn")
|
||||
}
|
||||
|
||||
func testCompositor_LongGridInsertion() throws {
|
||||
func test12_Compositor_LongGridInsertion() throws {
|
||||
var compositor = Megrez.Compositor(with: MockLM())
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("a")
|
||||
|
@ -331,7 +332,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.spans[8].nodeOf(length: 6)?.keyArray.joined(separator: compositor.separator), "hijklm")
|
||||
}
|
||||
|
||||
func testCompositor_StressBench() throws {
|
||||
func test13_Compositor_StressBench() throws {
|
||||
NSLog("// Stress test preparation begins.")
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strStressData))
|
||||
for _ in 0..<1919 {
|
||||
|
@ -344,7 +345,7 @@ final class MegrezTests: XCTestCase {
|
|||
NSLog("// Stress test elapsed: \(timeElapsed)s.")
|
||||
}
|
||||
|
||||
func testCompositor_WordSegmentation() throws {
|
||||
func test14_Compositor_WordSegmentation() throws {
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strSampleData, swapKeyValue: true))
|
||||
compositor.separator = ""
|
||||
for i in "高科技公司的年終獎金" {
|
||||
|
@ -354,7 +355,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(result.joinedKeys(by: ""), ["高科技", "公司", "的", "年終", "獎金"])
|
||||
}
|
||||
|
||||
func testCompositor_InputTestAndCursorJump() throws {
|
||||
func test15_Compositor_InputTestAndCursorJump() throws {
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strSampleData))
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("gao1")
|
||||
|
@ -429,7 +430,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(compositor.dumpDOT, expectedDumpDOT)
|
||||
}
|
||||
|
||||
func testCompositor_InputTest2() throws {
|
||||
func test16_Compositor_InputTest2() throws {
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strSampleData))
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("gao1")
|
||||
|
@ -443,7 +444,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(result.values, ["高科技", "公司"])
|
||||
}
|
||||
|
||||
func testCompositor_OverrideOverlappingNodes() throws {
|
||||
func test17_Compositor_OverrideOverlappingNodes() throws {
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strSampleData))
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("gao1")
|
||||
|
@ -475,7 +476,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(result.values, ["高科技"])
|
||||
}
|
||||
|
||||
func testCompositor_OverrideReset() throws {
|
||||
func test18_Compositor_OverrideReset() throws {
|
||||
var compositor = Megrez.Compositor(
|
||||
with: SimpleLM(input: strSampleData + "zhong1jiang3 終講 -11.0\n" + "jiang3jin1 槳襟 -11.0\n"))
|
||||
compositor.separator = ""
|
||||
|
@ -499,7 +500,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(result.values, ["年終", "槳襟"])
|
||||
}
|
||||
|
||||
func testCompositor_CandidateDisambiguation() throws {
|
||||
func test19_Compositor_CandidateDisambiguation() throws {
|
||||
var compositor = Megrez.Compositor(with: SimpleLM(input: strEmojiSampleData))
|
||||
compositor.separator = ""
|
||||
compositor.insertKey("gao1")
|
||||
|
@ -521,7 +522,7 @@ final class MegrezTests: XCTestCase {
|
|||
XCTAssertEqual(result.values, ["高熱", "🔥", "危險"])
|
||||
}
|
||||
|
||||
func testCompositor_updateUnigramData() throws {
|
||||
func test20_Compositor_updateUnigramData() throws {
|
||||
let theLM = SimpleLM(input: strSampleData)
|
||||
var compositor = Megrez.Compositor(with: theLM)
|
||||
compositor.separator = ""
|
||||
|
|
Loading…
Reference in New Issue