Megrez // Allow resynchronizing unigram data in update().
This commit is contained in:
parent
d870d5ad2a
commit
9028c6a5dd
|
@ -172,7 +172,7 @@ extension Megrez {
|
|||
}
|
||||
}
|
||||
|
||||
// MARK: - Internal Methods
|
||||
// MARK: - Internal Methods (Maybe Public)
|
||||
|
||||
extension Megrez.Compositor {
|
||||
// MARK: Internal methods for maintaining the grid.
|
||||
|
@ -242,45 +242,51 @@ extension Megrez.Compositor {
|
|||
return true
|
||||
}
|
||||
|
||||
/// Joins the keys covered by `range` into a single string, using the compositor's separator.
/// - Parameter range: Half-open index range into `keys`.
/// - Returns: The joined key string, or an empty string when `range` reaches outside `keys`.
func getJointKey(range: Range<Int>) -> String {
  // The bounds are compared by hand on purpose: using `contains` here
  // would require at least macOS 13 Ventura.
  let startsInBounds = range.lowerBound >= 0
  let endsInBounds = range.upperBound <= keys.count
  if !(startsInBounds && endsInBounds) { return "" }
  let selectedKeys = keys[range]
  return selectedKeys.joined(separator: separator)
}
|
||||
|
||||
/// Copies the keys covered by `range` into a standalone string array.
/// - Parameter range: Half-open index range into `keys`.
/// - Returns: The selected keys, or an empty array when `range` reaches outside `keys`.
func getJointKeyArray(range: Range<Int>) -> [String] {
  // The bounds are compared by hand on purpose: using `contains` here
  // would require at least macOS 13 Ventura.
  let startsInBounds = range.lowerBound >= 0
  let endsInBounds = range.upperBound <= keys.count
  if !(startsInBounds && endsInBounds) { return [] }
  var collected = [String]()
  for element in keys[range] {
    collected.append(String(element))
  }
  return collected
}
|
||||
|
||||
/// Checks whether a node of the given span length exists at `location` and carries `key`.
///
/// Kept alongside `getNode(at:length:keyArray:)` so existing callers that match by the
/// joined key string keep working.
/// - Parameters:
///   - location: Index into `spans`; out-of-range values are clamped to the nearest valid index.
///   - length: Span length of the node to look up.
///   - key: Joined key string the node must carry.
/// - Returns: `true` when such a node exists and its `key` equals `key`.
func hasNode(at location: Int, length: Int, key: String) -> Bool {
  // Nothing to look up in an empty grid; subscripting `spans` would trap.
  guard !spans.isEmpty else { return false }
  // Foolproofing: clamp to the last valid index (`count - 1`, not `count`).
  let location = max(min(location, spans.count - 1), 0)
  guard let node = spans[location].nodeOf(length: length) else { return false }
  return key == node.key
}

/// Fetches the node of the given span length at `location` if its key array matches.
/// - Parameters:
///   - location: Index into `spans`; out-of-range values are clamped to the nearest valid index.
///   - length: Span length of the node to look up.
///   - keyArray: Key array the node must carry.
/// - Returns: The matching node, or `nil` when there is no node of that length at the
///   location or its key array differs.
func getNode(at location: Int, length: Int, keyArray: [String]) -> Node? {
  // Nothing to look up in an empty grid; subscripting `spans` would trap.
  guard !spans.isEmpty else { return nil }
  // Foolproofing: clamp to the last valid index (`count - 1`, not `count`).
  let location = max(min(location, spans.count - 1), 0)
  guard let node = spans[location].nodeOf(length: length) else { return nil }
  return keyArray == node.keyArray ? node : nil
}
|
||||
|
||||
/// Updates the compositor's node context according to the current state.
///
/// Only positions within `maxSpanLength` of the cursor are visited. For every key range
/// starting at such a position, a node is inserted when none exists yet and the language
/// model has unigrams for the joined key.
/// - Parameter updateExisting: When `true`, nodes that already exist are re-synchronized
///   with the language model: their unigrams are replaced, or the node is discarded when
///   the model no longer offers any unigrams for it. When `false` (the default), existing
///   nodes are left untouched.
/// - Returns: How many nodes were inserted, refreshed, or discarded. A result of `0`
///   may indicate that something went wrong.
@discardableResult public mutating func update(updateExisting: Bool = false) -> Int {
  let maxSpanLength = Megrez.Compositor.maxSpanLength
  let range = max(0, cursor - maxSpanLength)..<min(cursor + maxSpanLength, keys.count)
  var nodesChanged = 0
  for position in range {
    for theLength in 1...min(maxSpanLength, range.upperBound - position) {
      let jointKeyArray = getJointKeyArray(range: position..<(position + theLength))
      let jointKey = jointKeyArray.joined(separator: separator)
      if let theNode = getNode(at: position, length: theLength, keyArray: jointKeyArray) {
        // Skip the language-model query entirely unless a resync was requested.
        if !updateExisting { continue }
        let unigrams = langModel.unigramsFor(key: jointKey)
        if unigrams.isEmpty {
          // Automatically discard nodes that became invalid; single-key nodes are kept.
          if theNode.keyArray.count == 1 { continue }
          // Clear the slot in place instead of removing the element: `Span.clear()`
          // pre-fills `nodes` with one slot per span length, so removing an element
          // would shift every later node into the wrong slot.
          // NOTE(review): `Span.maxLength` is not refreshed here — confirm whether it should be.
          spans[position].nodes = spans[position].nodes.map { $0 == theNode ? nil : $0 }
        } else {
          theNode.resetUnigrams(using: unigrams)
        }
        nodesChanged += 1
        continue
      }
      let unigrams = langModel.unigramsFor(key: jointKey)
      guard !unigrams.isEmpty else { continue }
      insertNode(
        .init(keyArray: jointKeyArray, spanLength: theLength, unigrams: unigrams, keySeparator: separator),
        at: position
      )
      nodesChanged += 1
    }
  }
  return nodesChanged
}
|
||||
|
||||
mutating func updateCursorJumpingTables(_ walkedNodes: [Node]) {
|
||||
|
|
|
@ -5,15 +5,15 @@
|
|||
|
||||
extension Megrez.Compositor {
|
||||
/// 幅位乃指一組共享起點的節點。
|
||||
public struct Span {
|
||||
private var nodes: [Node?] = []
|
||||
public class Span {
|
||||
public var nodes: [Node?] = []
|
||||
public private(set) var maxLength = 0
|
||||
private var maxSpanLength: Int { Megrez.Compositor.maxSpanLength }
|
||||
public init() {
|
||||
clear()
|
||||
}
|
||||
|
||||
public mutating func clear() {
|
||||
public func clear() {
|
||||
nodes.removeAll()
|
||||
for _ in 0..<maxSpanLength {
|
||||
nodes.append(nil)
|
||||
|
@ -24,7 +24,7 @@ extension Megrez.Compositor {
|
|||
/// 往該幅位塞入一個節點。
|
||||
/// - Parameter node: 要塞入的節點。
|
||||
/// - Returns: 該操作是否成功執行。
|
||||
@discardableResult public mutating func append(node: Node) -> Bool {
|
||||
@discardableResult public func append(node: Node) -> Bool {
|
||||
guard (1...maxSpanLength).contains(node.spanLength) else {
|
||||
return false
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ extension Megrez.Compositor {
|
|||
/// 丟掉任何不小於給定幅位長度的節點。
|
||||
/// - Parameter length: 給定的幅位長度。
|
||||
/// - Returns: 該操作是否成功執行。
|
||||
@discardableResult public mutating func dropNodesOfOrBeyond(length: Int) -> Bool {
|
||||
@discardableResult public func dropNodesOfOrBeyond(length: Int) -> Bool {
|
||||
guard (1...maxSpanLength).contains(length) else {
|
||||
return false
|
||||
}
|
||||
|
@ -66,7 +66,7 @@ extension Megrez.Compositor {
|
|||
/// 找出所有與該位置重疊的節點。其返回值為一個節錨陣列(包含節點、以及其起始位置)。
|
||||
/// - Parameter location: 游標位置。
|
||||
/// - Returns: 一個包含所有與該位置重疊的節點的陣列。
|
||||
func fetchOverlappingNodes(at location: Int) -> [NodeAnchor] {
|
||||
internal func fetchOverlappingNodes(at location: Int) -> [NodeAnchor] {
|
||||
var results = [NodeAnchor]()
|
||||
guard !spans.isEmpty, location < spans.count else { return results }
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ extension Megrez.Compositor {
|
|||
public private(set) var spanLength: Int
|
||||
public private(set) var unigrams: [Megrez.Unigram]
|
||||
/// Index into `unigrams` of the unigram this node currently uses.
public private(set) var currentUnigramIndex: Int = 0 {
  // Clamp into `0...(unigrams.count - 1)`. The lower bound is applied last so that an
  // empty `unigrams` array yields 0 rather than -1.
  didSet { currentUnigramIndex = max(min(unigrams.count - 1, currentUnigramIndex), 0) }
}
|
||||
|
||||
public var currentPair: Megrez.Compositor.KeyValuePaired { .init(key: key, value: value) }
|
||||
|
@ -53,6 +53,18 @@ extension Megrez.Compositor {
|
|||
hasher.combine(overrideType)
|
||||
}
|
||||
|
||||
/// Replaces the unigram array stored in this node.
///
/// If the replacement changes the value that `currentUnigramIndex` points at, the index
/// is reset to 0; otherwise the current selection is kept.
/// - Parameter source: The new unigram array. An empty array is ignored and the existing
///   unigrams are kept, because the node's unigrams are read by index here.
public func resetUnigrams(using source: [Megrez.Unigram]) {
  // Refuse an empty replacement instead of trapping on the subscript below.
  guard !source.isEmpty else { return }
  // The node may currently hold no unigrams at all; treat that as "no previous value".
  let oldCurrentValue: String? = unigrams.indices.contains(currentUnigramIndex)
    ? unigrams[currentUnigramIndex].value
    : nil
  unigrams = source
  // Keep the old index only if it still fits in the new array...
  let clampedIndex = max(min(source.count - 1, currentUnigramIndex), 0)
  // ...and still refers to the same value; otherwise fall back to the first unigram.
  currentUnigramIndex = (source[clampedIndex].value == oldCurrentValue) ? clampedIndex : 0
}
|
||||
|
||||
public private(set) var overrideType: Node.OverrideType
|
||||
|
||||
public static func == (lhs: Node, rhs: Node) -> Bool {
|
||||
|
|
|
@ -36,6 +36,13 @@ class SimpleLM: LangModelProtocol {
|
|||
func hasUnigramsFor(key: String) -> Bool {
|
||||
mutDatabase.keys.contains(key)
|
||||
}
|
||||
|
||||
/// Removes every unigram whose value equals `value` from the entry stored under `key`.
///
/// When nothing remains afterwards, the key itself is removed, so `hasUnigramsFor(key:)`
/// no longer reports it. Unknown keys are ignored.
/// - Parameters:
///   - key: The key whose unigram list is trimmed.
///   - value: The unigram value to remove.
func trim(key: String, value: String) {
  guard let existing = mutDatabase[key] else { return }
  let remaining = existing.filter { $0.value != value }
  if remaining.isEmpty {
    // Returning early here would silently leave the trimmed value in place.
    mutDatabase[key] = nil
  } else {
    mutDatabase[key] = remaining
  }
}
|
||||
}
|
||||
|
||||
class MockLM: LangModelProtocol {
|
||||
|
|
|
@ -11,7 +11,7 @@ import XCTest
|
|||
final class MegrezTests: XCTestCase {
|
||||
func testSpan() throws {
|
||||
let langModel = SimpleLM(input: strSampleData)
|
||||
var span = Megrez.Compositor.Span()
|
||||
let span = Megrez.Compositor.Span()
|
||||
let n1 = Megrez.Compositor.Node(keyArray: ["gao1"], spanLength: 1, unigrams: langModel.unigramsFor(key: "gao1"))
|
||||
let n3 = Megrez.Compositor.Node(
|
||||
keyArray: ["gao1ke1ji4"], spanLength: 3, unigrams: langModel.unigramsFor(key: "gao1ke1ji4")
|
||||
|
@ -518,4 +518,21 @@ final class MegrezTests: XCTestCase {
|
|||
result = compositor.walk().0
|
||||
XCTAssertEqual(result.values, ["高熱", "🔥", "危險"])
|
||||
}
|
||||
|
||||
/// Checks that `update(updateExisting: true)` re-synchronizes existing nodes after the
/// language model data has been trimmed.
func testCompositor_updateUnigramData() throws {
  let theLM = SimpleLM(input: strSampleData)
  var compositor = Megrez.Compositor(with: theLM)
  compositor.separator = ""
  for reading in ["nian2", "zhong1", "jiang3", "jin1"] {
    compositor.insertKey(reading)
  }
  // Walk once before the language model changes.
  let sentenceBeforeTrim = compositor.walk().0.values.joined()
  print(sentenceBeforeTrim)
  // Trim one candidate from the model, then ask the compositor to resync existing nodes.
  theLM.trim(key: "nian2zhong1", value: "年中")
  compositor.update(updateExisting: true)
  let sentenceAfterTrim = compositor.walk().0.values.joined()
  print(sentenceAfterTrim)
  XCTAssertEqual([sentenceBeforeTrim, sentenceAfterTrim], ["年中獎金", "年終獎金"])
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue