From 5fc0351a9c5f6d66c01cec885495ca03da3cdd66 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Sun, 7 Aug 2022 16:18:16 +0800 Subject: [PATCH] Repo // Importing Megrez v2.0.0 update. --- .../LanguageParsers/Megrez/0_Megrez.swift | 24 +- .../LanguageParsers/Megrez/1_Compositor.swift | 468 ++++++++---------- .../LanguageParsers/Megrez/2_Grid.swift | 250 ---------- .../LanguageParsers/Megrez/2_Walker.swift | 107 ++++ .../LanguageParsers/Megrez/3_Candidate.swift | 181 +++++++ .../LanguageParsers/Megrez/3_NodeAnchor.swift | 78 --- .../LanguageParsers/Megrez/3_Span.swift | 63 --- .../LanguageParsers/Megrez/4_Node.swift | 172 ------- .../LanguageParsers/Megrez/4_Span.swift | 96 ++++ .../Megrez/5_LanguageModel.swift | 43 -- .../LanguageParsers/Megrez/5_Vertex.swift | 96 ++++ .../LanguageParsers/Megrez/6_Bigram.swift | 64 --- .../LanguageParsers/Megrez/6_Node.swift | 142 ++++++ .../LanguageParsers/Megrez/6_Unigram.swift | 57 --- .../Megrez/7_KeyValuePaired.swift | 58 --- .../LanguageParsers/Megrez/7_LangModel.swift | 61 +++ .../LanguageParsers/Megrez/8_Unigram.swift | 40 ++ vChewing.xcodeproj/project.pbxproj | 76 ++- 18 files changed, 990 insertions(+), 1086 deletions(-) delete mode 100644 Source/Modules/LanguageParsers/Megrez/2_Grid.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/2_Walker.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/3_Candidate.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/3_Span.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/4_Node.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/4_Span.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/5_LanguageModel.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/5_Vertex.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/6_Bigram.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/6_Node.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/6_Unigram.swift delete mode 100644 Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/7_LangModel.swift create mode 100644 Source/Modules/LanguageParsers/Megrez/8_Unigram.swift diff --git a/Source/Modules/LanguageParsers/Megrez/0_Megrez.swift b/Source/Modules/LanguageParsers/Megrez/0_Megrez.swift index ee2ecea9..3bdc535c 100644 --- a/Source/Modules/LanguageParsers/Megrez/0_Megrez.swift +++ b/Source/Modules/LanguageParsers/Megrez/0_Megrez.swift @@ -1,11 +1,21 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). // ==================== // This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. /// The namespace for this package. -public enum Megrez {} +public enum Megrez { + public typealias KeyValuePaired = Compositor.Candidate // 相容性措施。 +} + +// 著作權聲明: +// 除了 Megrez 專有的修改與實作以外,該套件所有程式邏輯來自於 Gramambular、算法歸 Lukhnos Liu 所有。 +// 天權星引擎(Megrez Compositor)僅僅是將 Gramambular 用 Swift 重寫之後繼續開發的結果而已。 + +// 術語: + +// Grid: 節軌 +// Walk: 爬軌 +// Node: 節點 +// SpanLength: 節幅 +// Span: 幅位 diff --git a/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift b/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift index fa217383..9d0be506 100644 --- a/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift +++ b/Source/Modules/LanguageParsers/Megrez/1_Compositor.swift @@ -1,41 +1,91 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). // ==================== // This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. extension Megrez { - /// 組字器。 - public class Compositor: Grid { + /// 一個組字器用來在給定一系列的索引鍵的情況下(藉由一系列的觀測行為)返回一套資料值。 + /// + /// 用於輸入法的話,給定的索引鍵可以是注音、且返回的資料值都是漢語字詞組合。該組字器 + /// 還可以用來對文章做分節處理:此時的索引鍵為漢字,返回的資料值則是漢語字詞分節組合。 + /// + /// - Remark: 雖然這裡用了隱性 Markov 模型(HMM)的術語,但實際上在爬軌時用到的則是更 + /// 簡單的貝氏推論:因為底層的語言模組只會提供單元圖資料。一旦將所有可以組字的單元圖 + /// 作為節點塞到組字器內,就可以用一個簡單的有向無環圖爬軌過程、來利用這些隱性資料值 + /// 算出最大相似估算結果。 + public class Compositor { /// 就文字輸入方向而言的方向。 public enum TypingDirection { case front, rear } - /// 給被丟掉的節點路徑施加的負權重。 - private let kDroppedPathScore: Double = -999 + /// 軌格增減行為。 + public enum ResizeBehavior { case expand, shrink } + /// 該軌格內可以允許的最大幅位長度。 + public static var maxSpanLength: Int = 10 { didSet { maxSpanLength = max(6, maxSpanLength) } } + /// 公開:多字讀音鍵當中用以分割漢字讀音的記號的預設值,是「-」。 + public static let kDefaultSeparator: String = "-" /// 該組字器的游標位置。 - public var cursor: Int = 0 { didSet { cursor = max(0, min(cursor, readings.count)) } } - /// 該組字器的讀音陣列。 - private(set) var readings: [String] = [] - /// 該組字器所使用的語言模型。 - private var langModel: LangModelProtocol + public var cursor: Int = 0 { didSet { cursor = max(0, min(cursor, length)) } } + /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為「-」。 + public var separator = kDefaultSeparator + /// 公開:組字器內已經插入的單筆索引鍵的數量。 + public var width: Int { keys.count } + /// 公開:最近一次爬軌結果。 + public var walkedNodes: [Node] = [] + /// 公開:該組字器的長度,也就是內建漢字讀音的數量(唯讀)。 + public var length: Int { keys.count } + /// 公開:組字器是否為空。 + public var isEmpty: Bool { spans.isEmpty && keys.isEmpty } + + /// 該組字器的索引鍵陣列。 + private(set) var keys = [String]() + /// 該組字器的幅位陣列。 + private(set) var spans = [Span]() + /// 該組字器所使用的語言模型(被 LangModelRanked 所封裝)。 + private(set) var langModel: LangModelRanked /// 允許查詢當前游標位置屬於第幾個幅位座標(從 0 開始算)。 private(set) var cursorRegionMap: [Int: Int] = .init() - /// 用以記錄爬過的節錨的陣列。 - private(set) var walkedAnchors: [NodeAnchor] = [] - /// 該函式用以更新爬過的節錨的陣列。 - /// - Parameter nodes: 傳入的節點陣列。 - public func updateWalkedAnchors(with nodes: [Node]) { - walkedAnchors = nodes.map { Megrez.NodeAnchor(node: $0) } + /// 初期化一個組字器。 + /// - Parameter langModel: 要對接的語言模組。 + public init(with langModel: LangModelProtocol, separator: String = "-") { + self.langModel = .init(withLM: langModel) + self.separator = separator } - /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 - public var joinSeparator: String = "-" + public func clear() { + cursor = 0 + keys.removeAll() + spans.removeAll() + walkedNodes.removeAll() + cursorRegionMap.removeAll() + } - /// 公開:該組字器的長度,也就是內建漢字讀音的數量(唯讀)。 - public var length: Int { readings.count } + /// 在游標位置插入給定的索引鍵。 + /// - Parameter key: 要插入的索引鍵。 + /// - Returns: 該操作是否成功執行。 + @discardableResult public func insertKey(_ key: String) -> Bool { + guard !key.isEmpty, key != separator, langModel.hasUnigramsFor(key: key) else { return false } + keys.insert(key, at: cursor) + resizeGrid(at: cursor, do: .expand) + update() + cursor += 1 // 游標必須得在執行 update() 之後才可以變動。 + return true + } + + /// 朝著指定方向砍掉一個與游標相鄰的讀音。 + /// + /// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear),反之則為向前(Front)。 + /// 如果是朝著與文字輸入方向相反的方向砍的話,游標位置會自動遞減。 + /// - Parameter direction: 指定方向(相對於文字輸入方向而言)。 + /// - Returns: 該操作是否成功執行。 + @discardableResult public func dropKey(direction: TypingDirection) -> Bool { + let isBackSpace: Bool = direction == .rear ? true : false + guard cursor != (isBackSpace ? 0 : keys.count) else { return false } + keys.remove(at: cursor - (isBackSpace ? 1 : 0)) + cursor -= isBackSpace ? 1 : 0 // 在縮節之前。 + resizeGrid(at: cursor, do: .shrink) + update() + return true + } /// 按幅位來前後移動游標。 /// - Parameter direction: 移動方向。 @@ -50,21 +100,21 @@ extension Megrez { guard let currentRegion = cursorRegionMap[cursor] else { return false } let aRegionForward = max(currentRegion - 1, 0) - let currentRegionBorderRear: Int = walkedAnchors[0.. walkedAnchors.count) - ? readings.count : walkedAnchors[0...currentRegion].map(\.spanLength).reduce(0, +) + (currentRegion > walkedNodes.count) + ? keys.count : walkedNodes[0...currentRegion].map(\.spanLength).reduce(0, +) case .rear: - cursor = walkedAnchors[0.. Bool { - guard !reading.isEmpty, langModel.hasUnigramsFor(key: reading) else { return false } - readings.insert(reading, at: cursor) - resizeGridByOneAt(location: cursor, to: .expand) - build() - cursor += 1 - return true - } - - /// 朝著指定方向砍掉一個與游標相鄰的讀音。 - /// - /// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear),反之則為向前(Front)。 - /// - Parameter direction: 指定方向。 - /// - Returns: 該操作是否順利完成。 - @discardableResult public func dropReading(direction: TypingDirection) -> Bool { - let isBackSpace = direction == .rear - if cursor == (isBackSpace ? 0 : readings.count) { - return false - } - readings.remove(at: cursor - (isBackSpace ? 1 : 0)) - cursor -= (isBackSpace ? 1 : 0) - resizeGridByOneAt(location: cursor, to: .shrink) - build() - return true - } - - /// 移除該組字器最先被輸入的第 X 個讀音單元。 - /// - /// 用於輸入法組字區長度上限處理: - /// 將該位置要溢出的敲字內容遞交之後、再執行這個函式。 - @discardableResult public func removeHeadReadings(count: Int) -> Bool { - let count = abs(count) // 防呆 - if count > length { return false } - for _ in 0.. [NodeAnchor] { - let newLocation = width - // 這裡把所有空節點都過濾掉。 - walkedAnchors = Array( - reverseWalk(at: newLocation).reversed() - ).lazy.filter { !$0.isEmpty } - updateCursorJumpingTables(walkedAnchors) - return walkedAnchors - } - - // MARK: - Private functions - - /// 內部專用反芻函式,對已給定的軌格按照給定的位置與條件進行反向爬軌。 - /// - Parameters: - /// - location: 開始爬軌的位置。 - /// - mass: 給定累計權重,非必填參數。預設值為 0。 - /// - joinedPhrase: 用以統計累計長詞的內部參數,請勿主動使用。 - /// - longPhrases: 用以統計累計長詞的內部參數,請勿主動使用。 - /// - Returns: 一個包含結果的節錨陣列。 - private func reverseWalk( - at location: Int, - mass: Double = 0.0, - joinedPhrase: String = "", - longPhrases: [String] = .init() - ) -> [NodeAnchor] { - let location = abs(location) // 防呆 - if location == 0 || location > width { - return .init() - } - - var paths = [[NodeAnchor]]() - let nodes = nodesEndingAt(location: location).stableSorted { - $0.node.score > $1.node.score - } - - guard !nodes.isEmpty else { return .init() } // 防止下文出現範圍外索引的錯誤 - - if nodes[0].node.score >= Node.kSelectedCandidateScore { - // 在使用者有選過候選字詞的情況下,摒棄非依此據而成的節點路徑。 - var theAnchor = nodes[0] - theAnchor.mass = mass + nodes[0].node.score - var path: [NodeAnchor] = reverseWalk( - at: location - theAnchor.spanLength, mass: theAnchor.mass - ) - path.insert(theAnchor, at: 0) - paths.append(path) - } else if !longPhrases.isEmpty { - var path = [NodeAnchor]() - for theAnchor in nodes { - var theAnchor = theAnchor - let joinedValue = theAnchor.node.currentPair.value + joinedPhrase - // 如果只是一堆單漢字的節點組成了同樣的長詞的話,直接棄用這個節點路徑。 - // 打比方說「八/月/中/秋/山/林/涼」與「八月/中秋/山林/涼」在使用者來看 - // 是「結果等價」的,那就扔掉前者。 - if longPhrases.contains(joinedValue) { - theAnchor.mass = kDroppedPathScore - path.insert(theAnchor, at: 0) - paths.append(path) - continue + /// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。 + public var dumpDOT: String { + var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n" + for (p, span) in spans.enumerated() { + for ni in 0...(span.maxLength) { + guard let np = span.nodeOf(length: ni) else { continue } + if p == 0 { + strOutput += "BOS -> \(np.value);\n" } - theAnchor.mass = mass + theAnchor.node.score - path = reverseWalk( - at: location - theAnchor.spanLength, - mass: theAnchor.mass, - joinedPhrase: (joinedValue.count >= longPhrases[0].count) ? "" : joinedValue, - longPhrases: .init() - ) - path.insert(theAnchor, at: 0) - paths.append(path) - } - } else { - // 看看當前格位有沒有更長的候選字詞。 - var longPhrases = [String]() - for theAnchor in nodes.lazy.filter({ $0.spanLength > 1 }) { - longPhrases.append(theAnchor.node.currentPair.value) - } - - longPhrases = longPhrases.stableSorted { - $0.count > $1.count - } - for theAnchor in nodes { - var theAnchor = theAnchor - theAnchor.mass = mass + theAnchor.node.score - var path = [NodeAnchor]() - path = reverseWalk( - at: location - theAnchor.spanLength, mass: theAnchor.mass, - joinedPhrase: (theAnchor.spanLength > 1) ? "" : theAnchor.node.currentPair.value, - longPhrases: .init() - ) - path.insert(theAnchor, at: 0) - paths.append(path) + strOutput += "\(np.value);\n" + if (p + ni) < spans.count { + let destinationSpan = spans[p + ni] + for q in 0...(destinationSpan.maxLength) { + guard let dn = destinationSpan.nodeOf(length: q) else { continue } + strOutput += np.value + " -> " + dn.value + ";\n" + } + } + guard (p + ni) == spans.count else { continue } + strOutput += np.value + " -> EOS;\n" } } - - guard !paths.isEmpty else { - return .init() - } - - var result: [NodeAnchor] = paths[0] - for neta in paths.lazy.filter({ - $0.last!.mass > result.last!.mass - }) { - result = neta - } - - return result // 空節點過濾的步驟交給 walk() 這個對外函式,以避免重複執行清理步驟。 - } - - private func build() { - let itrBegin: Int = - (cursor < maxBuildSpanLength) ? 0 : cursor - maxBuildSpanLength - let itrEnd: Int = min(cursor + maxBuildSpanLength, readings.count) - - for p in itrBegin.. itrEnd { break } - let arrSlice = readings[p..<(p + q)] - let combinedReading: String = join(slice: arrSlice, separator: joinSeparator) - if hasMatchedNode(location: p, spanLength: q, key: combinedReading) { continue } - let unigrams: [Unigram] = langModel.unigramsFor(key: combinedReading) - if unigrams.isEmpty { continue } - let n: Node = .init(key: combinedReading, spanLength: q, unigrams: unigrams) - insertNode(node: n, location: p, spanLength: q) - } - } - } - - private func join(slice arrSlice: ArraySlice, separator: String) -> String { - arrSlice.joined(separator: separator) - } - - internal func updateCursorJumpingTables(_ anchors: [NodeAnchor]) { - var cursorRegionMapDict = [Int: Int]() - cursorRegionMapDict[-1] = 0 // 防呆 - var counter = 0 - for (i, anchor) in anchors.enumerated() { - for _ in 0.. Bool - ) - rethrows -> [Element] - { - try enumerated() - .sorted { a, b -> Bool in - try areInIncreasingOrder(a.element, b.element) - || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) + /// 拿新增幅位來打比方的話,在擴增幅位之前: + /// ``` + /// Span Index 0 1 2 3 + /// (---) + /// (-------) + /// (-----------) + /// ``` + /// 在幅位座標 2 (SpanIndex = 2) 的位置擴增一個幅位之後: + /// ``` + /// Span Index 0 1 2 3 4 + /// (---) + /// (XXX? ?XXX) <-被扯爛的節點 + /// (XXXXXXX? ?XXX) <-被扯爛的節點 + /// ``` + /// 拿縮減幅位來打比方的話,在縮減幅位之前: + /// ``` + /// Span Index 0 1 2 3 + /// (---) + /// (-------) + /// (-----------) + /// ``` + /// 在幅位座標 2 的位置就地砍掉一個幅位之後: + /// ``` + /// Span Index 0 1 2 3 4 + /// (---) + /// (XXX? <-被砍爛的節點 + /// (XXXXXXX? <-被砍爛的節點 + /// ``` + /// - Parameter location: 給定的幅位座標。 + func dropWreckedNodes(at location: Int) { + let location = max(min(location, spans.count), 0) // 防呆 + guard !spans.isEmpty else { return } + let affectedLength = Megrez.Compositor.maxSpanLength - 1 + let begin = max(0, location - affectedLength) + guard location >= begin else { return } + for i in begin.. Bool { + let location = max(min(location, spans.count - 1), 0) // 防呆 + spans[location].append(node: node) + return true + } + + func getJointKey(range: Range) -> String { + // 下面這句不能用 contains,不然會要求至少 macOS 13 Ventura。 + guard range.upperBound <= keys.count, range.lowerBound >= 0 else { return "" } + return keys[range].joined(separator: separator) + } + + func getJointKeyArray(range: Range) -> [String] { + // 下面這句不能用 contains,不然會要求至少 macOS 13 Ventura。 + guard range.upperBound <= keys.count, range.lowerBound >= 0 else { return [] } + return keys[range].map { String($0) } + } + + func hasNode(at location: Int, length: Int, key: String) -> Bool { + let location = max(min(location, spans.count), 0) // 防呆 + guard let node = spans[location].nodeOf(length: length) else { return false } + return key == node.key + } + + func update() { + let maxSpanLength = Megrez.Compositor.maxSpanLength + let range = max(0, cursor - maxSpanLength)..= spans.count { - let diff = location - spans.count + 1 - for _ in 0.. Bool { - let location = abs(location) // 防呆 - let spanLength = abs(spanLength) // 防呆 - if location > spans.count { - return false - } - - let n = spans[location].nodeOf(length: spanLength) - return n != nil && key == n?.key - } - - /// 在該軌格的指定位置擴增或減少一個幅位。 - /// - Parameters: - /// - location: 位置。 - public func resizeGridByOneAt(location: Int, to behavior: ResizeBehavior) { - let location = max(0, min(width, location)) // 防呆 - switch behavior { - case .expand: - spans.insert(SpanUnit(), at: location) - if [spans.count, 0].contains(location) { return } - case .shrink: - if location >= spans.count { return } - spans.remove(at: location) - } - for i in 0.. [NodeAnchor] { - let location = abs(location) // 防呆 - var results = [NodeAnchor]() - if location >= spans.count { return results } - // 此時 spans 必然不為空,因為 location 不可能小於 0。 - let span = spans[location] - for i in 1...maxBuildSpanLength { - if let np = span.nodeOf(length: i) { - results.append(.init(node: np)) - } - } - return results // 已證實不會有空節點產生。 - } - - /// 給定位置,枚舉出所有在這個位置結尾的節點。 - /// - Parameters: - /// - location: 位置。 - public func nodesEndingAt(location: Int) -> [NodeAnchor] { - let location = abs(location) // 防呆 - var results = [NodeAnchor]() - if spans.isEmpty || location > spans.count { return results } - for i in 0.. [NodeAnchor] { - let location = abs(location) // 防呆 - var results = [NodeAnchor]() - if spans.isEmpty || location > spans.count { return results } - for i in 0.. [NodeAnchor] { - Array(Set(nodesBeginningAt(location: location) + nodesCrossingOrEndingAt(location: location))) - } - - /// 使用給定的候選字字串,將給定位置的節點的候選字詞改為與給定的字串一致的候選字詞。 - /// - /// 該函式可以僅用作過程函式,但準確度不如用於處理候選字鍵值配對的 fixNodeWithCandidate()。 - /// - Parameters: - /// - location: 位置。 - /// - value: 給定字串。 - @discardableResult public func fixNodeWithCandidateLiteral(_ value: String, at location: Int) -> NodeAnchor { - let location = abs(location) // 防呆 - var node = NodeAnchor() - for theAnchor in nodesCrossingOrEndingAt(location: location) { - let candidates = theAnchor.node.candidates - // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 - theAnchor.node.resetCandidate() - for (i, candidate) in candidates.enumerated() { - if candidate.value == value { - theAnchor.node.selectCandidateAt(index: i) - node = theAnchor - break - } - } - } - return node - } - - /// 使用給定的候選字鍵值配對,將給定位置的節點的候選字詞改為與給定的字串一致的候選字詞。 - /// - /// 該函式可以僅用作過程函式。 - /// - Parameters: - /// - location: 位置。 - /// - value: 給定候選字鍵值配對。 - @discardableResult public func fixNodeWithCandidate(_ pair: KeyValuePaired, at location: Int) -> NodeAnchor { - let location = abs(location) // 防呆 - var node = NodeAnchor() - for theAnchor in nodesCrossingOrEndingAt(location: location) { - let candidates = theAnchor.node.candidates - // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 - theAnchor.node.resetCandidate() - for (i, candidate) in candidates.enumerated() { - if candidate == pair { - theAnchor.node.selectCandidateAt(index: i) - node = theAnchor - break - } - } - } - return node - } - - /// 將給定位置的節點的與給定的字串一致的候選字詞的權重複寫為給定權重數值。 - /// - Parameters: - /// - location: 位置。 - /// - value: 給定字串。 - /// - overridingScore: 給定權重數值。 - public func overrideNodeScoreForSelectedCandidate(location: Int, value: String, overridingScore: Double) { - let location = abs(location) // 防呆 - for theAnchor in nodesOverlappedAt(location: location) { - let candidates = theAnchor.node.candidates - // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 - theAnchor.node.resetCandidate() - for (i, candidate) in candidates.enumerated() { - if candidate.value == value { - theAnchor.node.selectFloatingCandidateAt(index: i, score: overridingScore) - break - } - } - } - } - } -} - -// MARK: - DumpDOT-related functions. - -extension Megrez.Grid { - /// 生成用以交給 GraphViz 診斷的資料檔案內容,純文字。 - public var dumpDOT: String { - var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n" - for (p, span) in spans.enumerated() { - for ni in 0...(span.maxLength) { - guard let np = span.nodeOf(length: ni) else { continue } - if p == 0 { - strOutput += "BOS -> \(np.currentPair.value);\n" - } - strOutput += "\(np.currentPair.value);\n" - if (p + ni) < spans.count { - let destinationSpan = spans[p + ni] - for q in 0...(destinationSpan.maxLength) { - guard let dn = destinationSpan.nodeOf(length: q) else { continue } - strOutput += np.currentPair.value + " -> " + dn.currentPair.value + ";\n" - } - } - guard (p + ni) == spans.count else { continue } - strOutput += np.currentPair.value + " -> EOS;\n" - } - } - strOutput += "EOS;\n}\n" - return strOutput - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/2_Walker.swift b/Source/Modules/LanguageParsers/Megrez/2_Walker.swift new file mode 100644 index 00000000..b7f4b091 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/2_Walker.swift @@ -0,0 +1,107 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +extension Megrez.Compositor { + /// 找到軌格陣圖內權重最大的路徑。該路徑代表了可被觀測到的最可能的隱藏事件鏈。 + /// 這裡使用 Cormen 在 2001 年出版的教材當中提出的「有向無環圖的最短路徑」的 + /// 算法來計算這種路徑。不過,這裡不是要計算距離最短的路徑,而是計算距離最長 + /// 的路徑(所以要找最大的權重),因為在對數概率下,較大的數值意味著較大的概率。 + /// 對於 `G = (V, E)`,該算法的運行次數為 `O(|V|+|E|)`,其中 `G` 是一個有向無環圖。 + /// 這意味著,即使軌格很大,也可以用很少的算力就可以爬軌。 + /// - Returns: 爬軌結果+該過程是否順利執行。 + @discardableResult public func walk() -> ([Node], Bool) { + var result = [Node]() + defer { + walkedNodes = result + updateCursorJumpingTables(walkedNodes) + } + guard !spans.isEmpty else { return (result, true) } + + var vertexSpans = [VertexSpan]() + for _ in spans { + vertexSpans.append(.init()) + } + + for (i, span) in spans.enumerated() { + for j in 1...span.maxLength { + if let p = span.nodeOf(length: j) { + vertexSpans[i].append(.init(node: p)) + } + } + } + + let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"], keySeparator: separator)) + + for (i, vertexSpan) in vertexSpans.enumerated() { + for vertex in vertexSpan { + let nextVertexPosition = i + vertex.node.spanLength + if nextVertexPosition == vertexSpans.count { + vertex.edges.append(terminal) + continue + } + for nextVertex in vertexSpans[nextVertexPosition] { + vertex.edges.append(nextVertex) + } + } + } + + let root = Vertex(node: .init(keyArray: ["_ROOT_"], keySeparator: separator)) + root.distance = 0 + root.edges.append(contentsOf: vertexSpans[0]) + + var ordered: [Vertex] = topologicalSort(root: root) + for (j, neta) in ordered.reversed().enumerated() { + for (k, _) in neta.edges.enumerated() { + relax(u: neta, v: &neta.edges[k]) + } + ordered[j] = neta + } + + var walked = [Node]() + var totalKeyLength = 0 + var it = terminal + while let itPrev = it.prev { + walked.append(itPrev.node) + it = itPrev + totalKeyLength += it.node.spanLength + } + + guard totalKeyLength == keys.count else { + print("!!! ERROR A") + return (result, false) + } + guard walked.count >= 2 else { + print("!!! ERROR B") + return (result, false) + } + walked = walked.reversed() + walked.removeFirst() + result = walked + return (result, true) + } +} + +// MARK: - Stable Sort Extension + +// Reference: https://stackoverflow.com/a/50545761/4162914 + +extension Sequence { + /// Return a stable-sorted collection. + /// + /// - Parameter areInIncreasingOrder: Return nil when two element are equal. + /// - Returns: The sorted collection. + fileprivate func stableSorted( + by areInIncreasingOrder: (Element, Element) throws -> Bool + ) + rethrows -> [Element] + { + try enumerated() + .sorted { a, b -> Bool in + try areInIncreasingOrder(a.element, b.element) + || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) + } + .map(\.element) + } +} diff --git a/Source/Modules/LanguageParsers/Megrez/3_Candidate.swift b/Source/Modules/LanguageParsers/Megrez/3_Candidate.swift new file mode 100644 index 00000000..ddca9506 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/3_Candidate.swift @@ -0,0 +1,181 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +import Foundation + +extension Megrez.Compositor { + public struct Candidate: Equatable, Hashable, Comparable, CustomStringConvertible { + /// 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 + public var key: String + /// 資料值。 + public var value: String + /// 將當前鍵值列印成一個字串。 + public var description: String { "(" + key + "," + value + ")" } + /// 判斷當前鍵值配對是否合規。如果鍵與值有任一為空,則結果為 false。 + public var isValid: Bool { !key.isEmpty && !value.isEmpty } + /// 將當前鍵值列印成一個字串,但如果該鍵值配對為空的話則僅列印「()」。 + public var toNGramKey: String { !isValid ? "()" : "(" + key + "," + value + ")" } + + /// 初期化一組鍵值配對。 + /// - Parameters: + /// - key: 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 + /// - value: 資料值。 + public init(key: String = "", value: String = "") { + self.key = key + self.value = value + } + + public func hash(into hasher: inout Hasher) { + hasher.combine(key) + hasher.combine(value) + } + + public static func == (lhs: Candidate, rhs: Candidate) -> Bool { + lhs.key == rhs.key && lhs.value == rhs.value + } + + public static func < (lhs: Candidate, rhs: Candidate) -> Bool { + (lhs.key.count < rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value < rhs.value) + } + + public static func > (lhs: Candidate, rhs: Candidate) -> Bool { + (lhs.key.count > rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value > rhs.value) + } + + public static func <= (lhs: Candidate, rhs: Candidate) -> Bool { + (lhs.key.count <= rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value <= rhs.value) + } + + public static func >= (lhs: Candidate, rhs: Candidate) -> Bool { + (lhs.key.count >= rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value >= rhs.value) + } + } + + public enum CandidateFetchFilter { case all, beginAt, endAt } + + /// 返回在當前位置的所有候選字詞(以詞音配對的形式)。如果組字器內有幅位、且游標 + /// 位於組字器的(文字輸入順序的)最前方(也就是游標位置的數值是最大合規數值)的 + /// 話,那麼這裡會用到 location - 1、以免去在呼叫該函數後再處理的麻煩。 + /// - Parameter location: 游標位置。 + /// - Returns: 候選字音配對陣列。 + public func fetchCandidates(at location: Int, filter: CandidateFetchFilter = .all) -> [Candidate] { + var result = [Candidate]() + guard !keys.isEmpty else { return result } + let location = max(min(location, keys.count - 1), 0) // 防呆 + let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted { + // 按照讀音的長度來給節點排序。 + $0.spanLength > $1.spanLength + } + let keyAtCursor = keys[location] + for theNode in anchors.map(\.node) { + if theNode.key.isEmpty { continue } + for gram in theNode.unigrams { + switch filter { + case .all: + // 得加上這道篩選,所以會出現很多無效結果。 + if !theNode.keyArray.contains(keyAtCursor) { continue } + case .beginAt: + if theNode.keyArray[0] != keyAtCursor { continue } + case .endAt: + if theNode.keyArray.reversed()[0] != keyAtCursor { continue } + } + result.append(.init(key: theNode.key, value: gram.value)) + } + } + return result + } + + /// 使用給定的候選字(詞音配對),將給定位置的節點的候選字詞改為與之一致的候選字詞。 + /// + /// 該函式可以僅用作過程函式。 + /// - Parameters: + /// - candidate: 指定用來覆寫為的候選字(詞音配對)。 + /// - location: 游標位置。 + /// - overrideType: 指定覆寫行為。 + /// - Returns: 該操作是否成功執行。 + @discardableResult public func overrideCandidate( + _ candidate: Candidate, at location: Int, overrideType: Node.OverrideType = .withHighScore + ) + -> Bool + { + overrideCandidateAgainst(key: candidate.key, at: location, value: candidate.value, type: overrideType) + } + + /// 使用給定的候選字詞字串,將給定位置的節點的候選字詞改為與之一致的候選字詞。 + /// + /// 注意:如果有多個「單元圖資料值雷同、卻讀音不同」的節點的話,該函數的行為結果不可控。 + /// - Parameters: + /// - candidate: 指定用來覆寫為的候選字(字串)。 + /// - location: 游標位置。 + /// - overrideType: 指定覆寫行為。 + /// - Returns: 該操作是否成功執行。 + @discardableResult public func overrideCandidateLiteral( + _ candidate: String, + at location: Int, overrideType: Node.OverrideType = .withHighScore + ) -> Bool { + overrideCandidateAgainst(key: nil, at: location, value: candidate, type: overrideType) + } + + // MARK: Internal implementations. + + /// 使用給定的候選字(詞音配對)、或給定的候選字詞字串,將給定位置的節點的候選字詞改為與之一致的候選字詞。 + /// - Parameters: + /// - key: 索引鍵,也就是詞音配對當中的讀音。 + /// - location: 游標位置。 + /// - value: 資料值。 + /// - type: 指定覆寫行為。 + /// - Returns: 該操作是否成功執行。 + internal func overrideCandidateAgainst(key: String?, at location: Int, value: String, type: Node.OverrideType) + -> Bool + { + let location = max(min(location, keys.count), 0) // 防呆 + var arrOverlappedNodes: [NodeAnchor] = fetchOverlappingNodes(at: min(keys.count - 1, location)) + var overridden: NodeAnchor? + for anchor in arrOverlappedNodes { + if let key = key, anchor.node.key != key { continue } + if anchor.node.selectOverrideUnigram(value: value, type: type) { + overridden = anchor + break + } + } + + guard let overridden = overridden else { return false } // 啥也不覆寫。 + + for i in overridden.spanIndex.. Bool + ) + rethrows -> [Element] + { + try enumerated() + .sorted { a, b -> Bool in + try areInIncreasingOrder(a.element, b.element) + || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) + } + .map(\.element) + } +} diff --git a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift b/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift deleted file mode 100644 index 72a0ff08..00000000 --- a/Source/Modules/LanguageParsers/Megrez/3_NodeAnchor.swift +++ /dev/null @@ -1,78 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 節锚。 - @frozen public struct NodeAnchor: Hashable { - /// 用來判斷該節錨是否為空。 - public var isEmpty: Bool { node.key.isEmpty } - /// 節點。一個節锚內不一定有節點。 - public var node: Node = .init() - /// 指定的幅位長度。 - public var spanLength: Int { node.spanLength } - /// 獲取用來比較的權重。 - public var scoreForSort: Double { node.score } - /// 累計權重。 - public var mass: Double = 0.0 - /// 單元圖陣列。 - public var unigrams: [Unigram] { node.unigrams } - /// 雙元圖陣列。 - public var bigrams: [Bigram] { node.bigrams } - /// 鍵。 - public var key: String { node.key } - - /// 初期化一個節錨。 - public init(node: Node = .init(), mass: Double? = nil) { - self.node = node - self.mass = mass ?? self.node.score - } - - /// 將該節錨雜湊化。 - public func hash(into hasher: inout Hasher) { - hasher.combine(node) - hasher.combine(mass) - } - - /// 將當前節锚列印成一個字串。 - public var description: String { - var stream = "" - stream += "{@(" + String(spanLength) + ")," - if node.key.isEmpty { - stream += node.description - } else { - stream += "null" - } - stream += "}" - return stream - } - } -} - -// MARK: - Array Extensions. - -extension Array where Element == Megrez.NodeAnchor { - /// 將節锚陣列列印成一個字串。 - public var description: String { - var arrOutputContent = [""] - for anchor in self { - arrOutputContent.append(anchor.description) - } - return arrOutputContent.joined(separator: "<-") - } - - /// 從一個節錨陣列當中取出目前的自動選字字串陣列。 - public var values: [String] { - map(\.node.currentPair.value) - } - - /// 從一個節錨陣列當中取出目前的索引鍵陣列。 - public var keys: [String] { - map(\.node.currentPair.key) - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/3_Span.swift b/Source/Modules/LanguageParsers/Megrez/3_Span.swift deleted file mode 100644 index 0e4c00eb..00000000 --- a/Source/Modules/LanguageParsers/Megrez/3_Span.swift +++ /dev/null @@ -1,63 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 幅位。 - @frozen public struct SpanUnit { - /// 辭典:以節點長度為索引,以節點為資料值。 - private var lengthNodeMap: [Int: Megrez.Node] = [:] - /// 最長幅距。 - private(set) var maxLength: Int = 0 - - /// 自我清空,各項參數歸零。 - mutating func clear() { - lengthNodeMap.removeAll() - maxLength = 0 - } - - /// 往自身插入一個節點、及給定的節點長度。 - /// - Parameters: - /// - node: 節點。 - /// - length: 給定的節點長度。 - mutating func insert(node: Node, length: Int) { - let length = abs(length) // 防呆 - lengthNodeMap[length] = node - maxLength = max(maxLength, length) - } - - /// 移除任何比給定的長度更長的節點。 - /// - Parameters: - /// - length: 給定的節點長度。 - mutating func dropNodesBeyond(length: Int) { - let length = abs(length) // 防呆 - if length > maxLength { return } - var lenMax = 0 - var removalList: [Int: Megrez.Node] = [:] - for key in lengthNodeMap.keys { - if key > length { - removalList[key] = lengthNodeMap[key] - } else { - lenMax = max(lenMax, key) - } - } - for key in removalList.keys { - lengthNodeMap.removeValue(forKey: key) - } - maxLength = lenMax - } - - /// 給定節點長度,獲取節點。 - /// - Parameters: - /// - length: 給定的節點長度。 - public func nodeOf(length: Int) -> Node? { - // 防呆 Abs() - lengthNodeMap.keys.contains(abs(length)) ? lengthNodeMap[abs(length)] : nil - } - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/4_Node.swift b/Source/Modules/LanguageParsers/Megrez/4_Node.swift deleted file mode 100644 index fe05ca8c..00000000 --- a/Source/Modules/LanguageParsers/Megrez/4_Node.swift +++ /dev/null @@ -1,172 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 節點。 - public class Node: Equatable, Hashable { - public static func == (lhs: Megrez.Node, rhs: Megrez.Node) -> Bool { - lhs.key == rhs.key && lhs.score == rhs.score && lhs.unigrams == rhs.unigrams && lhs.bigrams == rhs.bigrams - && lhs.candidates == rhs.candidates && lhs.valueUnigramIndexMap == rhs.valueUnigramIndexMap - && lhs.precedingBigramMap == rhs.precedingBigramMap && lhs.isCandidateFixed == rhs.isCandidateFixed - && lhs.selectedUnigramIndex == rhs.selectedUnigramIndex && lhs.spanLength == rhs.spanLength - } - - public func hash(into hasher: inout Hasher) { - hasher.combine(key) - hasher.combine(score) - hasher.combine(unigrams) - hasher.combine(bigrams) - hasher.combine(spanLength) - hasher.combine(candidates) - hasher.combine(valueUnigramIndexMap) - hasher.combine(precedingBigramMap) - hasher.combine(isCandidateFixed) - hasher.combine(selectedUnigramIndex) - } - - /// 鍵。 - private(set) var key: String = "" - /// 當前節點的當前被選中的候選字詞「在該節點內的」目前的權重。 - private(set) var score: Double = 0 - /// 單元圖陣列。 - private(set) var unigrams: [Unigram] - /// 雙元圖陣列。 - private(set) var bigrams: [Bigram] - /// 指定的幅位長度。 - public var spanLength: Int = 0 - /// 候選字詞陣列,以鍵值陣列的形式存在。 - private(set) var candidates: [KeyValuePaired] = [] - /// 專門「用單元圖資料值來調查索引值」的辭典。 - private var valueUnigramIndexMap: [String: Int] = [:] - /// 專門「用給定鍵值來取對應的雙元圖陣列」的辭典。 - private var precedingBigramMap: [KeyValuePaired: [Megrez.Bigram]] = [:] - /// 狀態標記變數,用來記載當前節點是否處於候選字詞鎖定狀態。 - private(set) var isCandidateFixed: Bool = false - /// 用來登記「當前選中的單元圖」的索引值的變數。 - private var selectedUnigramIndex: Int = 0 - /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。 - public static let kSelectedCandidateScore: Double = 99 - /// 將當前節點列印成一個字串。 - public var description: String { - "(node,key:\(key),fixed:\(isCandidateFixed ? "true" : "false"),selected:\(selectedUnigramIndex),\(unigrams))" - } - - /// 公開:當前被選中的候選字詞的鍵值配對。 - public var currentPair: KeyValuePaired { - selectedUnigramIndex >= unigrams.count ? KeyValuePaired() : candidates[selectedUnigramIndex] - } - - /// 公開:給出當前單元圖陣列內最高的權重數值。 - public var highestUnigramScore: Double { unigrams.isEmpty ? 0.0 : unigrams[0].score } - - /// 初期化一個節點。 - /// - Parameters: - /// - key: 索引鍵。 - /// - unigrams: 單元圖陣列。 - /// - bigrams: 雙元圖陣列(非必填)。 - public init(key: String = "", spanLength: Int = 0, unigrams: [Megrez.Unigram] = [], bigrams: [Megrez.Bigram] = []) { - self.key = key - self.unigrams = unigrams - self.bigrams = bigrams - self.spanLength = spanLength - - self.unigrams.sort { - $0.score > $1.score - } - - if !self.unigrams.isEmpty { - score = unigrams[0].score - } - - for (i, gram) in self.unigrams.enumerated() { - valueUnigramIndexMap[gram.keyValue.value] = i - candidates.append(gram.keyValue) - } - - for gram in bigrams.lazy.filter({ [self] in - precedingBigramMap.keys.contains($0.precedingKeyValue) - }) { - precedingBigramMap[gram.precedingKeyValue]?.append(gram) - } - } - - /// 對擁有「給定的前述鍵值陣列」的節點提權。 - /// - Parameters: - /// - precedingKeyValues: 前述鍵值陣列。 - public func primeNodeWith(precedingKeyValues: [KeyValuePaired]) { - var newIndex = selectedUnigramIndex - var max = score - - if !isCandidateFixed { - for neta in precedingKeyValues { - let bigrams = precedingBigramMap[neta] ?? [] - for bigram in bigrams.lazy.filter({ [self] in - $0.score > max && valueUnigramIndexMap.keys.contains($0.keyValue.value) - }) { - newIndex = valueUnigramIndexMap[bigram.keyValue.value] ?? newIndex - max = bigram.score - } - } - } - score = max - selectedUnigramIndex = newIndex - } - - /// 選中位於給定索引位置的候選字詞。 - /// - Parameters: - /// - index: 索引位置。 - /// - fix: 是否將當前解點標記為「候選詞已鎖定」的狀態。 - public func selectCandidateAt(index: Int = 0, fix: Bool = false) { - let index = abs(index) - selectedUnigramIndex = index >= unigrams.count ? 0 : index - isCandidateFixed = fix - score = Megrez.Node.kSelectedCandidateScore - } - - /// 重設該節點的候選字詞狀態。 - public func resetCandidate() { - selectedUnigramIndex = 0 - isCandidateFixed = false - if !unigrams.isEmpty { - score = unigrams[0].score - } - } - - /// 選中位於給定索引位置的候選字詞、且施加給定的權重。 - /// - Parameters: - /// - index: 索引位置。 - /// - score: 給定權重條件。 - public func selectFloatingCandidateAt(index: Int, score: Double) { - let index = abs(index) // 防呆 - selectedUnigramIndex = index >= unigrams.count ? 0 : index - isCandidateFixed = false - self.score = score - } - - /// 藉由給定的候選字詞字串,找出在庫的單元圖權重數值。沒有的話就找零。 - /// - Parameters: - /// - candidate: 給定的候選字詞字串。 - public func scoreFor(candidate: String) -> Double { - for unigram in unigrams.lazy.filter({ $0.keyValue.value == candidate }) { - return unigram.score - } - return 0.0 - } - - /// 藉由給定的候選字詞鍵值配對,找出在庫的單元圖權重數值。沒有的話就找零。 - /// - Parameters: - /// - candidate: 給定的候選字詞字串。 - public func scoreForPaired(candidate: KeyValuePaired) -> Double { - for unigram in unigrams.lazy.filter({ $0.keyValue == candidate }) { - return unigram.score - } - return 0.0 - } - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/4_Span.swift b/Source/Modules/LanguageParsers/Megrez/4_Span.swift new file mode 100644 index 00000000..9d7efb30 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/4_Span.swift @@ -0,0 +1,96 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +extension Megrez.Compositor { + /// 幅位乃指一組共享起點的節點。 + public class Span { + private var nodes: [Node?] = [] + private(set) var maxLength = 0 + private var maxSpanLength: Int { Megrez.Compositor.maxSpanLength } + public init() { + clear() + } + + public func clear() { + nodes.removeAll() + for _ in 0.. Bool { + guard (1...maxSpanLength).contains(node.spanLength) else { + return false + } + nodes[node.spanLength - 1] = node + maxLength = max(maxLength, node.spanLength) + return true + } + + /// 丟掉任何不小於給定幅位長度的節點。 + /// - Parameter length: 給定的幅位長度。 + /// - Returns: 該操作是否成功執行。 + @discardableResult public func dropNodesOfOrBeyond(length: Int) -> Bool { + guard (1...maxSpanLength).contains(length) else { + return false + } + for i in length...maxSpanLength { + nodes[i - 1] = nil + } + maxLength = 0 + guard length > 1 else { return false } + let maxR = length - 2 + for i in 0...maxR { + if nodes[maxR - i] != nil { + maxLength = maxR - i + 1 + break + } + } + return true + } + + public func nodeOf(length: Int) -> Node? { + guard (1...maxSpanLength).contains(length) else { return nil } + return nodes[length - 1] ?? nil + } + } + + // MARK: Internal implementations. + + /// 找出所有與該位置重疊的節點。其返回值為一個節錨陣列(包含節點、以及其起始位置)。 + /// - Parameter location: 游標位置。 + /// - Returns: 一個包含所有與該位置重疊的節點的陣列。 + func fetchOverlappingNodes(at location: Int) -> [NodeAnchor] { + var results = [NodeAnchor]() + guard !spans.isEmpty, location < spans.count else { return results } + + // 先獲取該位置的所有單字節點。 + for theLocation in 1...spans[location].maxLength { + guard let node = spans[location].nodeOf(length: theLocation) else { continue } + results.append(.init(node: node, spanIndex: location)) + } + + // 再獲取以當前位置結尾或開頭的節點。 + let begin: Int = location - min(location, Megrez.Compositor.maxSpanLength - 1) + for theLocation in begin.. [Megrez.Unigram] - - /// 給定當前鍵與前述鍵,讓語言模型找給一組雙元圖陣列。 - func bigramsFor(precedingKey: String, key: String) -> [Megrez.Bigram] - - /// 給定鍵,確認是否有單元圖記錄在庫。 - func hasUnigramsFor(key: String) -> Bool -} - -extension Megrez { - /// 語言模型框架,回頭實際使用時需要派生一個型別、且重寫相關函式。 - open class LangModel: LangModelProtocol { - public init() {} - - // 這裡寫了一點假內容,不然有些 Swift 格式化工具會破壞掉函式的參數設計。 - - /// 給定鍵,讓語言模型找給一組單元圖陣列。 - open func unigramsFor(key: String) -> [Megrez.Unigram] { - key.isEmpty ? [Megrez.Unigram]() : [Megrez.Unigram]() - } - - /// 給定當前鍵與前述鍵,讓語言模型找給一組雙元圖陣列。 - open func bigramsFor(precedingKey: String, key: String) -> [Megrez.Bigram] { - precedingKey == key ? [Megrez.Bigram]() : [Megrez.Bigram]() - } - - /// 給定鍵,確認是否有單元圖記錄在庫。 - open func hasUnigramsFor(key: String) -> Bool { - key.count != 0 - } - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/5_Vertex.swift b/Source/Modules/LanguageParsers/Megrez/5_Vertex.swift new file mode 100644 index 00000000..ea4c44df --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/5_Vertex.swift @@ -0,0 +1,96 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +extension Megrez.Compositor { + /// 一個「有向無環圖的」的頂點單位。 + /// + /// 這是一個可變的數據結構,用於有向無環圖的構建和單源最短路徑的計算。 + class Vertex { + /// 前述頂點。 + public var prev: Vertex? + /// 自身屬下的頂點陣列。 + public var edges = [Vertex]() + /// 該變數用於最短路徑的計算。 + /// + /// 我們實際上是在計算具有最大權重的路徑,因此距離的初始值是負無窮的。 + /// 如果我們要計算最短的權重/距離,我們會將其初期值設為正無窮。 + public var distance = -(Double.infinity) + /// 在進行進行位相幾何排序時會用到的狀態標記。 + public var topologicallySorted = false + public var node: Node + public init(node: Node) { + self.node = node + } + } + + /// 卸勁函式。 + /// + /// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。 + /// - Parameters: + /// - u: 參照頂點,會在必要時成為 v 的前述頂點。 + /// - v: 要影響的頂點。 + func relax(u: Vertex, v: inout Vertex) { + /// 從 u 到 w 的距離,也就是 v 的權重。 + let w: Double = v.node.score + /// 這裡計算最大權重: + /// 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」, + /// 我們就更新 v 的距離及其前述頂點。 + if v.distance < u.distance + w { + v.distance = u.distance + w + v.prev = u + } + } + + typealias VertexSpan = [Vertex] + + /// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological + /// sort)、且將排序結果以頂點陣列的形式給出。 + /// + /// 這裡使用我們自己的堆棧和狀態定義實現了一個非遞迴版本, + /// 這樣我們就不會受到當前線程的堆棧大小的限制。以下是等價的原始算法。 + /// ``` + /// func topologicalSort(vertex: Vertex) { + /// for vertexNode in vertex.edges { + /// if !vertexNode.topologicallySorted { + /// dfs(vertexNode, result) + /// vertexNode.topologicallySorted = true + /// } + /// result.append(vertexNode) + /// } + /// } + /// ``` + /// 至於遞迴版本則類似於 Cormen 在 2001 年的著作「Introduction to Algorithms」當中的樣子。 + /// - Parameter root: 根頂點。 + /// - Returns: 排序結果(頂點陣列)。 + func topologicalSort(root: Vertex) -> [Vertex] { + class State { + var iterIndex: Int + var vertex: Vertex + init(vertex: Vertex, iterIndex: Int = 0) { + self.vertex = vertex + self.iterIndex = iterIndex + } + } + var result = [Vertex]() + var stack = [State]() + stack.append(.init(vertex: root)) + while !stack.isEmpty { + let state = stack[stack.count - 1] + let theVertex = state.vertex + if state.iterIndex < state.vertex.edges.count { + let newVertex = state.vertex.edges[state.iterIndex] + state.iterIndex += 1 + if !newVertex.topologicallySorted { + stack.append(.init(vertex: newVertex)) + continue + } + } + theVertex.topologicallySorted = true + result.append(theVertex) + stack.removeLast() + } + return result + } +} diff --git a/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift b/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift deleted file mode 100644 index d355a016..00000000 --- a/Source/Modules/LanguageParsers/Megrez/6_Bigram.swift +++ /dev/null @@ -1,64 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 雙元圖。 - @frozen public struct Bigram: Equatable, CustomStringConvertible, Hashable { - /// 當前鍵值。 - public var keyValue: KeyValuePaired - /// 前述鍵值。 - public var precedingKeyValue: KeyValuePaired - /// 權重。 - public var score: Double - /// 將當前雙元圖列印成一個字串。 - public var description: String { - "(" + keyValue.description + "|" + precedingKeyValue.description + "," + String(score) + ")" - } - - /// 初期化一筆「雙元圖」。一筆雙元圖由一組前述鍵值配對、一組當前鍵值配對、與一筆權重數值組成。 - /// - Parameters: - /// - precedingKeyValue: 前述鍵值。 - /// - keyValue: 當前鍵值。 - /// - score: 權重(雙精度小數)。 - public init(precedingKeyValue: KeyValuePaired, keyValue: KeyValuePaired, score: Double) { - self.keyValue = keyValue - self.precedingKeyValue = precedingKeyValue - self.score = score - } - - public func hash(into hasher: inout Hasher) { - hasher.combine(keyValue) - hasher.combine(precedingKeyValue) - hasher.combine(score) - // hasher.combine(paired) - } - - public static func == (lhs: Bigram, rhs: Bigram) -> Bool { - lhs.precedingKeyValue == rhs.precedingKeyValue && lhs.keyValue == rhs.keyValue && lhs.score == rhs.score - } - - public static func < (lhs: Bigram, rhs: Bigram) -> Bool { - lhs.precedingKeyValue < rhs.precedingKeyValue - || (lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.score < rhs.score)) - } - } -} - -// MARK: - DumpDOT-related functions. - -extension Array where Element == Megrez.Bigram { - /// 將雙元圖陣列列印成一個字串。 - public var description: String { - var arrOutputContent = [""] - for (index, gram) in enumerated() { - arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.description]) - } - return "[" + String(count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/6_Node.swift b/Source/Modules/LanguageParsers/Megrez/6_Node.swift new file mode 100644 index 00000000..a42f1788 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/6_Node.swift @@ -0,0 +1,142 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +extension Megrez.Compositor { + /// 一個節點由這些內容組成:幅位長度、索引鍵、以及一組單元圖。幅位長度就是指這個 + /// 節點在組字器內橫跨了多少個字長。組字器負責構築自身的節點。對於由多個漢字組成 + /// 的詞,組字器會將多個讀音索引鍵合併為一個讀音索引鍵、據此向語言模組請求對應的 + /// 單元圖結果陣列。舉例說,如果一個詞有兩個漢字組成的話,那麼讀音也是有兩個、其 + /// 索引鍵值也是由兩個讀音組成的,那麼這個節點的幅位長度就是 2。 + public class Node: Equatable, Hashable { + /// 三種不同的針對一個節點的覆寫行為。 + /// - withNoOverrides: 無覆寫行為。 + /// - withTopUnigramScore: 使用指定的單元圖資料值來覆寫該節點,但卻使用 + /// 當前狀態下權重最高的單元圖的權重數值。打比方說,如果該節點內的單元圖陣列是 + /// [("a", -114), ("b", -514), ("c", -1919)] 的話,指定該覆寫行為則會導致該節 + /// 點返回的結果為 ("c", -114)。該覆寫行為多用於諸如使用者半衰記憶模組的建議 + /// 行為。被覆寫的這個節點的狀態可能不會再被爬軌行為擅自改回。該覆寫行為無法 + /// 防止其它節點被爬軌函數所支配。這種情況下就需要用到 kOverridingScore + /// - withHighScore: 將該節點權重覆寫為 kOverridingScore,使其被爬軌函數所青睞。 + public enum OverrideType: Int { + case withNoOverrides = 0 + case withTopUnigramScore = 1 + case withHighScore = 2 + } + + /// 一個用以覆寫權重的數值。該數值之高足以改變爬軌函數對該節點的讀取結果。這裡用 + /// 「0」可能看似足夠了,但仍會使得該節點的覆寫狀態有被爬軌函數忽視的可能。比方說 + /// 要針對索引鍵「a b c」複寫的資料值為「A B C」,使用大寫資料值來覆寫節點。這時, + /// 如果這個獨立的 c 有一個可以拮抗權重的詞「bc」的話,可能就會導致爬軌函數的算法 + /// 找出「A->bc」的爬軌途徑(尤其是當 A 和 B 使用「0」作為複寫數值的情況下)。這樣 + /// 一來,「A-B」就不一定始終會是爬軌函數的青睞結果了。所以,這裡一定要用大於 0 的 + /// 數(比如野獸常數),以讓「c」更容易單獨被選中。 + public static let kOverridingScore: Double = 114_514 + + private(set) var key: String + private(set) var keyArray: [String] + private(set) var spanLength: Int + private(set) var unigrams: [Megrez.Unigram] + private(set) var currentUnigramIndex: Int = 0 { + didSet { currentUnigramIndex = min(max(0, currentUnigramIndex), unigrams.count - 1) } + } + + public var currentPair: Megrez.Compositor.Candidate { .init(key: key, value: value) } + + public func hash(into hasher: inout Hasher) { + hasher.combine(key) + hasher.combine(spanLength) + hasher.combine(unigrams) + hasher.combine(currentUnigramIndex) + hasher.combine(spanLength) + hasher.combine(overrideType) + } + + private(set) var overrideType: Node.OverrideType + + public static func == (lhs: Node, rhs: Node) -> Bool { + lhs.key == rhs.key && lhs.spanLength == rhs.spanLength + && lhs.unigrams == rhs.unigrams && lhs.overrideType == rhs.overrideType + } + + public init( + keyArray: [String] = [], spanLength: Int = 0, unigrams: [Megrez.Unigram] = [], keySeparator: String = "" + ) { + key = keyArray.joined(separator: keySeparator) + self.keyArray = keyArray + self.spanLength = spanLength + self.unigrams = unigrams + overrideType = .withNoOverrides + } + + /// 給出目前的最高權重單元圖。該結果可能會受節點覆寫狀態所影響。 + /// - Returns: 目前的最高權重單元圖。該結果可能會受節點覆寫狀態所影響。 + public var currentUnigram: Megrez.Unigram { + unigrams.isEmpty ? .init() : unigrams[currentUnigramIndex] + } + + public var value: String { currentUnigram.value } + + public var score: Double { + guard !unigrams.isEmpty else { return 0 } + switch overrideType { + case .withHighScore: return Megrez.Compositor.Node.kOverridingScore + case .withTopUnigramScore: return unigrams[0].score + default: return currentUnigram.score + } + } + + public var isOverriden: Bool { + overrideType != .withNoOverrides + } + + public func reset() { + currentUnigramIndex = 0 + overrideType = .withNoOverrides + } + + public func selectOverrideUnigram(value: String, type: Node.OverrideType) -> Bool { + guard type != .withNoOverrides else { + return false + } + for (i, gram) in unigrams.enumerated() { + if value != gram.value { continue } + currentUnigramIndex = i + overrideType = type + return true + } + return false + } + } +} + +extension Megrez.Compositor { + /// 節錨。 + /// + /// 在 Gramambular 當中又被稱為「NodeInSpan」。 + public struct NodeAnchor: Hashable { + let node: Megrez.Compositor.Node + let spanIndex: Int // 幅位座標 + var spanLength: Int { node.spanLength } + var unigrams: [Megrez.Unigram] { node.unigrams } + var key: String { node.key } + var value: String { node.value } + + /// 將該節錨雜湊化。 + public func hash(into hasher: inout Hasher) { + hasher.combine(node) + hasher.combine(spanIndex) + } + } +} + +// MARK: - Array Extensions. + +extension Array where Element == Megrez.Compositor.Node { + /// 從一個節點陣列當中取出目前的自動選字字串陣列。 + public var values: [String] { map(\.value) } + + /// 從一個節點陣列當中取出目前的索引鍵陣列。 + public var keys: [String] { map(\.key) } +} diff --git a/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift b/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift deleted file mode 100644 index d6e78ac8..00000000 --- a/Source/Modules/LanguageParsers/Megrez/6_Unigram.swift +++ /dev/null @@ -1,57 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 單元圖。 - @frozen public struct Unigram: Equatable, CustomStringConvertible, Hashable { - /// 鍵值。 - public var keyValue: KeyValuePaired - /// 權重。 - public var score: Double - /// 將當前單元圖列印成一個字串。 - public var description: String { - "(" + keyValue.description + "," + String(score) + ")" - } - - /// 初期化一筆「單元圖」。一筆單元圖由一組鍵值配對與一筆權重數值組成。 - /// - Parameters: - /// - keyValue: 鍵值。 - /// - score: 權重(雙精度小數)。 - public init(keyValue: KeyValuePaired, score: Double) { - self.keyValue = keyValue - self.score = score - } - - public func hash(into hasher: inout Hasher) { - hasher.combine(keyValue) - hasher.combine(score) - } - - public static func == (lhs: Unigram, rhs: Unigram) -> Bool { - lhs.keyValue == rhs.keyValue && lhs.score == rhs.score - } - - public static func < (lhs: Unigram, rhs: Unigram) -> Bool { - lhs.keyValue < rhs.keyValue || (lhs.keyValue == rhs.keyValue && lhs.score < rhs.score) - } - } -} - -// MARK: - DumpDOT-related functions. - -extension Array where Element == Megrez.Unigram { - /// 將單元圖陣列列印成一個字串。 - public var description: String { - var arrOutputContent = [""] - for (index, gram) in enumerated() { - arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.description]) - } - return "[" + String(count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift b/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift deleted file mode 100644 index 5678e615..00000000 --- a/Source/Modules/LanguageParsers/Megrez/7_KeyValuePaired.swift +++ /dev/null @@ -1,58 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -// ==================== -// This code is released under the MIT license (SPDX-License-Identifier: MIT) -// ... with NTL restriction stating that: -// No trademark license is granted to use the trade names, trademarks, service -// marks, or product names of Contributor, except as required to fulfill notice -// requirements defined in MIT License. - -extension Megrez { - /// 鍵值配對。 - @frozen public struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible { - /// 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 - public var key: String - /// 資料值。 - public var value: String - /// 將當前鍵值列印成一個字串。 - public var description: String { "(" + key + "," + value + ")" } - /// 判斷當前鍵值配對是否合規。如果鍵與值有任一為空,則結果為 false。 - public var isValid: Bool { !key.isEmpty && !value.isEmpty } - /// 將當前鍵值列印成一個字串,但如果該鍵值配對為空的話則僅列印「()」。 - public var toNGramKey: String { !isValid ? "()" : "(" + key + "," + value + ")" } - - /// 初期化一組鍵值配對。 - /// - Parameters: - /// - key: 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 - /// - value: 資料值。 - public init(key: String = "", value: String = "") { - self.key = key - self.value = value - } - - public func hash(into hasher: inout Hasher) { - hasher.combine(key) - hasher.combine(value) - } - - public static func == (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - lhs.key == rhs.key && lhs.value == rhs.value - } - - public static func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - (lhs.key.count < rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value < rhs.value) - } - - public static func > (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - (lhs.key.count > rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value > rhs.value) - } - - public static func <= (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - (lhs.key.count <= rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value <= rhs.value) - } - - public static func >= (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool { - (lhs.key.count >= rhs.key.count) || (lhs.key.count == rhs.key.count && lhs.value >= rhs.value) - } - } -} diff --git a/Source/Modules/LanguageParsers/Megrez/7_LangModel.swift b/Source/Modules/LanguageParsers/Megrez/7_LangModel.swift new file mode 100644 index 00000000..b08a6ed0 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/7_LangModel.swift @@ -0,0 +1,61 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +/// 語言模組協定。 +public protocol LangModelProtocol { + /// 給定鍵,讓語言模型找給一組單元圖陣列。 + func unigramsFor(key: String) -> [Megrez.Unigram] + /// 給定鍵,確認是否有單元圖記錄在庫。 + func hasUnigramsFor(key: String) -> Bool +} + +extension Megrez.Compositor { + /// 一個套殼語言模型,用來始終返回經過排序的單元圖。 + public class LangModelRanked: LangModelProtocol { + private let langModel: LangModelProtocol + /// 一個套殼語言模型,用來始終返回經過排序的單元圖。 + /// - Parameter withLM: 用來對接的語言模型。 + public init(withLM: LangModelProtocol) { + langModel = withLM + } + + /// 給定索引鍵,讓語言模型找給一組經過穩定排序的單元圖陣列。 + /// - Parameter key: 給定的索引鍵字串。 + /// - Returns: 對應的經過穩定排序的單元圖陣列。 + public func unigramsFor(key: String) -> [Megrez.Unigram] { + langModel.unigramsFor(key: key).stableSorted { $0.score > $1.score } + } + + /// 根據給定的索引鍵來確認各個資料庫陣列內是否存在對應的資料。 + /// - Parameter key: 索引鍵。 + /// - Returns: 是否在庫。 + public func hasUnigramsFor(key: String) -> Bool { + langModel.hasUnigramsFor(key: key) + } + } +} + +// MARK: - Stable Sort Extension + +// Reference: https://stackoverflow.com/a/50545761/4162914 + +extension Sequence { + /// Return a stable-sorted collection. + /// + /// - Parameter areInIncreasingOrder: Return nil when two element are equal. + /// - Returns: The sorted collection. + fileprivate func stableSorted( + by areInIncreasingOrder: (Element, Element) throws -> Bool + ) + rethrows -> [Element] + { + try enumerated() + .sorted { a, b -> Bool in + try areInIncreasingOrder(a.element, b.element) + || (a.offset < b.offset && !areInIncreasingOrder(b.element, a.element)) + } + .map(\.element) + } +} diff --git a/Source/Modules/LanguageParsers/Megrez/8_Unigram.swift b/Source/Modules/LanguageParsers/Megrez/8_Unigram.swift new file mode 100644 index 00000000..b8aa9cb2 --- /dev/null +++ b/Source/Modules/LanguageParsers/Megrez/8_Unigram.swift @@ -0,0 +1,40 @@ +// Swiftified by (c) 2022 and onwards The vChewing Project (MIT License). +// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License). +// ==================== +// This code is released under the MIT license (SPDX-License-Identifier: MIT) + +extension Megrez { + /// 單元圖。 + @frozen public struct Unigram: Equatable, CustomStringConvertible, Hashable { + /// 鍵值。 + public var value: String + /// 權重。 + public var score: Double + /// 將當前單元圖列印成一個字串。 + public var description: String { + "(" + value.description + "," + String(score) + ")" + } + + /// 初期化一筆「單元圖」。一筆單元圖由一組鍵值配對與一筆權重數值組成。 + /// - Parameters: + /// - value: 鍵值。 + /// - score: 權重(雙精度小數)。 + public init(value: String = "", score: Double = 0) { + self.value = value + self.score = score + } + + public func hash(into hasher: inout Hasher) { + hasher.combine(value) + hasher.combine(score) + } + + public static func == (lhs: Unigram, rhs: Unigram) -> Bool { + lhs.value == rhs.value && lhs.score == rhs.score + } + + public static func < (lhs: Unigram, rhs: Unigram) -> Bool { + lhs.value < rhs.value || (lhs.value == rhs.value && lhs.score < rhs.score) + } + } +} diff --git a/vChewing.xcodeproj/project.pbxproj b/vChewing.xcodeproj/project.pbxproj index 7840e6e5..08987ac1 100644 --- a/vChewing.xcodeproj/project.pbxproj +++ b/vChewing.xcodeproj/project.pbxproj @@ -12,18 +12,17 @@ 5B21176C287539BB000443A9 /* ctlInputMethod_HandleStates.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B21176B287539BB000443A9 /* ctlInputMethod_HandleStates.swift */; }; 5B21176E28753B35000443A9 /* ctlInputMethod_HandleDisplay.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B21176D28753B35000443A9 /* ctlInputMethod_HandleDisplay.swift */; }; 5B21177028753B9D000443A9 /* ctlInputMethod_Delegates.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B21176F28753B9D000443A9 /* ctlInputMethod_Delegates.swift */; }; + 5B2170E0289FACAD00BE7304 /* 7_LangModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170D7289FACAC00BE7304 /* 7_LangModel.swift */; }; + 5B2170E1289FACAD00BE7304 /* 0_Megrez.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170D8289FACAC00BE7304 /* 0_Megrez.swift */; }; + 5B2170E2289FACAD00BE7304 /* 8_Unigram.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170D9289FACAC00BE7304 /* 8_Unigram.swift */; }; + 5B2170E3289FACAD00BE7304 /* 3_Candidate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DA289FACAC00BE7304 /* 3_Candidate.swift */; }; + 5B2170E4289FACAD00BE7304 /* 2_Walker.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DB289FACAC00BE7304 /* 2_Walker.swift */; }; + 5B2170E5289FACAD00BE7304 /* 6_Node.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DC289FACAC00BE7304 /* 6_Node.swift */; }; + 5B2170E6289FACAD00BE7304 /* 4_Span.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DD289FACAC00BE7304 /* 4_Span.swift */; }; + 5B2170E7289FACAD00BE7304 /* 1_Compositor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DE289FACAC00BE7304 /* 1_Compositor.swift */; }; + 5B2170E8289FACAD00BE7304 /* 5_Vertex.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B2170DF289FACAC00BE7304 /* 5_Vertex.swift */; }; 5B242403284B0D6500520FE4 /* ctlCandidateUniversal.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B242402284B0D6500520FE4 /* ctlCandidateUniversal.swift */; }; 5B3133BF280B229700A4A505 /* KeyHandler_States.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B3133BE280B229700A4A505 /* KeyHandler_States.swift */; }; - 5B38F59A281E2E49007D5F5D /* 6_Unigram.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1D15FC0EB100ABF4B3 /* 6_Unigram.swift */; }; - 5B38F59B281E2E49007D5F5D /* 7_KeyValuePaired.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1815FC0EB100ABF4B3 /* 7_KeyValuePaired.swift */; }; - 5B38F59C281E2E49007D5F5D /* 2_Grid.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1715FC0EB100ABF4B3 /* 2_Grid.swift */; }; - 5B38F59D281E2E49007D5F5D /* 4_Node.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1A15FC0EB100ABF4B3 /* 4_Node.swift */; }; - 5B38F59E281E2E49007D5F5D /* 6_Bigram.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1415FC0EB100ABF4B3 /* 6_Bigram.swift */; }; - 5B38F59F281E2E49007D5F5D /* 3_NodeAnchor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1B15FC0EB100ABF4B3 /* 3_NodeAnchor.swift */; }; - 5B38F5A1281E2E49007D5F5D /* 1_Compositor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1515FC0EB100ABF4B3 /* 1_Compositor.swift */; }; - 5B38F5A2281E2E49007D5F5D /* 0_Megrez.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1615FC0EB100ABF4B3 /* 0_Megrez.swift */; }; - 5B38F5A3281E2E49007D5F5D /* 3_Span.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1C15FC0EB100ABF4B3 /* 3_Span.swift */; }; - 5B38F5A4281E2E49007D5F5D /* 5_LanguageModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A0D4F1915FC0EB100ABF4B3 /* 5_LanguageModel.swift */; }; 5B3A87BC28597CDB0090E163 /* LMSymbolNode.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B3A87BB28597CDB0090E163 /* LMSymbolNode.swift */; }; 5B40730C281672610023DFFF /* lmAssociates.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B407309281672610023DFFF /* lmAssociates.swift */; }; 5B40730D281672610023DFFF /* lmReplacements.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B40730A281672610023DFFF /* lmReplacements.swift */; }; @@ -213,6 +212,15 @@ 5B21176B287539BB000443A9 /* ctlInputMethod_HandleStates.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ctlInputMethod_HandleStates.swift; sourceTree = ""; }; 5B21176D28753B35000443A9 /* ctlInputMethod_HandleDisplay.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ctlInputMethod_HandleDisplay.swift; sourceTree = ""; }; 5B21176F28753B9D000443A9 /* ctlInputMethod_Delegates.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ctlInputMethod_Delegates.swift; sourceTree = ""; }; + 5B2170D7289FACAC00BE7304 /* 7_LangModel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 7_LangModel.swift; sourceTree = ""; }; + 5B2170D8289FACAC00BE7304 /* 0_Megrez.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 0_Megrez.swift; sourceTree = ""; }; + 5B2170D9289FACAC00BE7304 /* 8_Unigram.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 8_Unigram.swift; sourceTree = ""; }; + 5B2170DA289FACAC00BE7304 /* 3_Candidate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 3_Candidate.swift; sourceTree = ""; }; + 5B2170DB289FACAC00BE7304 /* 2_Walker.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 2_Walker.swift; sourceTree = ""; }; + 5B2170DC289FACAC00BE7304 /* 6_Node.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 6_Node.swift; sourceTree = ""; }; + 5B2170DD289FACAC00BE7304 /* 4_Span.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 4_Span.swift; sourceTree = ""; }; + 5B2170DE289FACAC00BE7304 /* 1_Compositor.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 1_Compositor.swift; sourceTree = ""; }; + 5B2170DF289FACAC00BE7304 /* 5_Vertex.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 5_Vertex.swift; sourceTree = ""; }; 5B242402284B0D6500520FE4 /* ctlCandidateUniversal.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ctlCandidateUniversal.swift; sourceTree = ""; }; 5B2DB17127AF8771006D874E /* Makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; name = Makefile; path = Data/Makefile; sourceTree = ""; }; 5B2F2BB3286216A500B8557B /* vChewingTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = vChewingTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -329,16 +337,6 @@ 5BFDF48C27B51867009523B6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Main.strings"; sourceTree = ""; }; 6A0D4EA215FC0D2D00ABF4B3 /* vChewing.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = vChewing.app; sourceTree = BUILT_PRODUCTS_DIR; }; 6A0D4EF515FC0DA600ABF4B3 /* IME-Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "IME-Info.plist"; sourceTree = ""; }; - 6A0D4F1415FC0EB100ABF4B3 /* 6_Bigram.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 6_Bigram.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1515FC0EB100ABF4B3 /* 1_Compositor.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 1_Compositor.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1615FC0EB100ABF4B3 /* 0_Megrez.swift */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = 0_Megrez.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1715FC0EB100ABF4B3 /* 2_Grid.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 2_Grid.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1815FC0EB100ABF4B3 /* 7_KeyValuePaired.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 7_KeyValuePaired.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1915FC0EB100ABF4B3 /* 5_LanguageModel.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 5_LanguageModel.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1A15FC0EB100ABF4B3 /* 4_Node.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 4_Node.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1B15FC0EB100ABF4B3 /* 3_NodeAnchor.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 3_NodeAnchor.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1C15FC0EB100ABF4B3 /* 3_Span.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 3_Span.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; - 6A0D4F1D15FC0EB100ABF4B3 /* 6_Unigram.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = 6_Unigram.swift; sourceTree = ""; tabWidth = 2; usesTabs = 0; }; 6A15B32421A51F2300B92CD3 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/MainMenu.xib; sourceTree = ""; }; 6A15B32521A51F2300B92CD3 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/MainMenu.xib; sourceTree = ""; }; 6A225A1E23679F2600F685C6 /* NotarizedArchives */ = {isa = PBXFileReference; lastKnownFileType = folder; path = NotarizedArchives; sourceTree = ""; }; @@ -821,16 +819,15 @@ 6A0D4F1315FC0EB100ABF4B3 /* Megrez */ = { isa = PBXGroup; children = ( - 6A0D4F1615FC0EB100ABF4B3 /* 0_Megrez.swift */, - 6A0D4F1515FC0EB100ABF4B3 /* 1_Compositor.swift */, - 6A0D4F1715FC0EB100ABF4B3 /* 2_Grid.swift */, - 6A0D4F1B15FC0EB100ABF4B3 /* 3_NodeAnchor.swift */, - 6A0D4F1C15FC0EB100ABF4B3 /* 3_Span.swift */, - 6A0D4F1A15FC0EB100ABF4B3 /* 4_Node.swift */, - 6A0D4F1915FC0EB100ABF4B3 /* 5_LanguageModel.swift */, - 6A0D4F1415FC0EB100ABF4B3 /* 6_Bigram.swift */, - 6A0D4F1D15FC0EB100ABF4B3 /* 6_Unigram.swift */, - 6A0D4F1815FC0EB100ABF4B3 /* 7_KeyValuePaired.swift */, + 5B2170D8289FACAC00BE7304 /* 0_Megrez.swift */, + 5B2170DE289FACAC00BE7304 /* 1_Compositor.swift */, + 5B2170DB289FACAC00BE7304 /* 2_Walker.swift */, + 5B2170DA289FACAC00BE7304 /* 3_Candidate.swift */, + 5B2170DD289FACAC00BE7304 /* 4_Span.swift */, + 5B2170DF289FACAC00BE7304 /* 5_Vertex.swift */, + 5B2170DC289FACAC00BE7304 /* 6_Node.swift */, + 5B2170D7289FACAC00BE7304 /* 7_LangModel.swift */, + 5B2170D9289FACAC00BE7304 /* 8_Unigram.swift */, ); path = Megrez; sourceTree = ""; @@ -1155,8 +1152,6 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - 5B38F59D281E2E49007D5F5D /* 4_Node.swift in Sources */, - 5B38F5A3281E2E49007D5F5D /* 3_Span.swift in Sources */, 5B40730C281672610023DFFF /* lmAssociates.swift in Sources */, D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */, 5BA9FD4527FEF3C9002DE248 /* ToolbarItemStyleViewController.swift in Sources */, @@ -1169,22 +1164,25 @@ D4A13D5A27A59F0B003BE359 /* ctlInputMethod_Core.swift in Sources */, 5BA9FD4827FEF3C9002DE248 /* PreferencesWindowController.swift in Sources */, 5BD0113B28180D6100609769 /* LMInstantiator.swift in Sources */, + 5B2170E7289FACAD00BE7304 /* 1_Compositor.swift in Sources */, 5B21177028753B9D000443A9 /* ctlInputMethod_Delegates.swift in Sources */, 5B21176E28753B35000443A9 /* ctlInputMethod_HandleDisplay.swift in Sources */, 5B84579F2871AD2200C93B01 /* HotenkaChineseConverter.swift in Sources */, 5B887F302826AEA400B6651E /* lmCoreEX.swift in Sources */, 5BA9FD4627FEF3C9002DE248 /* Container.swift in Sources */, D47F7DD0278C0897002F9DD7 /* ctlNonModalAlertWindow.swift in Sources */, - 5B38F5A2281E2E49007D5F5D /* 0_Megrez.swift in Sources */, + 5B2170E5289FACAD00BE7304 /* 6_Node.swift in Sources */, 5B949BD92816DC5400D87B5D /* LineReader.swift in Sources */, D456576E279E4F7B00DF6BC9 /* InputSignal.swift in Sources */, 5BA9FD1027FEDB6B002DE248 /* suiPrefPaneKeyboard.swift in Sources */, 5B3133BF280B229700A4A505 /* KeyHandler_States.swift in Sources */, + 5B2170E1289FACAD00BE7304 /* 0_Megrez.swift in Sources */, 5B3A87BC28597CDB0090E163 /* LMSymbolNode.swift in Sources */, 5BA9FD4327FEF3C8002DE248 /* Preferences.swift in Sources */, 5BA9FD4427FEF3C8002DE248 /* SegmentedControlStyleViewController.swift in Sources */, D47F7DCE278BFB57002F9DD7 /* ctlPrefWindow.swift in Sources */, 5BD0113D2818543900609769 /* KeyHandler_Core.swift in Sources */, + 5B2170E4289FACAD00BE7304 /* 2_Walker.swift in Sources */, 5BA9FD4227FEF3C8002DE248 /* PreferencePane.swift in Sources */, 5BA0DF312817857D009E73BB /* lmUserOverride.swift in Sources */, 5BA9FD8B28006B41002DE248 /* VDKComboBox.swift in Sources */, @@ -1194,39 +1192,37 @@ 5B11328927B94CFB00E58451 /* AppleKeyboardConverter.swift in Sources */, 5B54E743283A7D89001ECBDC /* lmCoreNS.swift in Sources */, 5B62A32927AE77D100A19448 /* FSEventStreamHelper.swift in Sources */, + 5B2170E2289FACAD00BE7304 /* 8_Unigram.swift in Sources */, 5B21176C287539BB000443A9 /* ctlInputMethod_HandleStates.swift in Sources */, - 5B38F59B281E2E49007D5F5D /* 7_KeyValuePaired.swift in Sources */, 5B62A33627AE795800A19448 /* mgrPrefs.swift in Sources */, - 5B38F5A4281E2E49007D5F5D /* 5_LanguageModel.swift in Sources */, 5BAEFAD028012565001F42C9 /* mgrLangModel.swift in Sources */, 5B782EC4280C243C007276DE /* KeyHandler_HandleCandidate.swift in Sources */, 5B62A33827AE79CD00A19448 /* StringUtils.swift in Sources */, + 5B2170E3289FACAD00BE7304 /* 3_Candidate.swift in Sources */, 5BA9FD0F27FEDB6B002DE248 /* suiPrefPaneGeneral.swift in Sources */, + 5B2170E6289FACAD00BE7304 /* 4_Span.swift in Sources */, 5BA9FD4927FEF3C9002DE248 /* Section.swift in Sources */, 5BA9FD3E27FEF3C8002DE248 /* Utilities.swift in Sources */, 5B242403284B0D6500520FE4 /* ctlCandidateUniversal.swift in Sources */, 5BA9FD1127FEDB6B002DE248 /* ctlPrefUI.swift in Sources */, 5B8457A12871ADBE00C93B01 /* HotenkaCCBridge.swift in Sources */, - 5B38F59C281E2E49007D5F5D /* 2_Grid.swift in Sources */, 5B40730D281672610023DFFF /* lmReplacements.swift in Sources */, - 5B38F59E281E2E49007D5F5D /* 6_Bigram.swift in Sources */, 5B62A33227AE792F00A19448 /* InputSourceHelper.swift in Sources */, 5B5E535227EF261400C6AA1E /* IME.swift in Sources */, + 5B2170E0289FACAD00BE7304 /* 7_LangModel.swift in Sources */, 5B62A34927AE7CD900A19448 /* TooltipController.swift in Sources */, - 5B38F59A281E2E49007D5F5D /* 6_Unigram.swift in Sources */, 5BA9FD4027FEF3C8002DE248 /* Localization.swift in Sources */, 5BAA8FBE282CAF380066C406 /* SyllableComposer.swift in Sources */, 5BA9FD1327FEDB6B002DE248 /* suiPrefPaneDictionary.swift in Sources */, + 5B2170E8289FACAD00BE7304 /* 5_Vertex.swift in Sources */, 5BBBB77A27AEDC690023B93A /* clsSFX.swift in Sources */, 5BA9FD4727FEF3C9002DE248 /* PreferencesStyleController.swift in Sources */, 5B949BDB2816DDBC00D87B5D /* LMConsolidator.swift in Sources */, - 5B38F59F281E2E49007D5F5D /* 3_NodeAnchor.swift in Sources */, 5BFDF011289635C100417BBC /* ctlCandidateIMK.swift in Sources */, 5B62A34727AE7CD900A19448 /* ctlCandidate.swift in Sources */, 5BA58646289BCFAC0077D02F /* ShiftKeyUpChecker.swift in Sources */, 5BA9FD3F27FEF3C8002DE248 /* Pane.swift in Sources */, 5BB802DA27FABA8300CF1C19 /* ctlInputMethod_Menu.swift in Sources */, - 5B38F5A1281E2E49007D5F5D /* 1_Compositor.swift in Sources */, 5BE377A0288FED8D0037365B /* KeyHandler_HandleComposition.swift in Sources */, 5BDC1CFA27FDF1310052C2B9 /* apiUpdate.swift in Sources */, );