Megrez // v2.7.0 update, removing Foundation dependency.

This commit is contained in:
ShikiSuen 2023-05-27 21:18:21 +08:00
parent 6ab57f5165
commit 7ef7f33993
10 changed files with 269 additions and 225 deletions

View File

@ -3,8 +3,6 @@
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
import Foundation
public extension Megrez {
///
///
@ -186,7 +184,7 @@ public extension Megrez {
/// GraphViz
public var dumpDOT: String {
// C# StringBuilder Swift NSMutableString
let strOutput: NSMutableString = .init(string: "digraph {\ngraph [ rankdir=LR ];\nBOS;\n")
var strOutput = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n"
spans.enumerated().forEach { p, span in
(0 ... span.maxLength).forEach { ni in
guard let np = span[ni] else { return }

View File

@ -12,73 +12,118 @@ public extension Megrez.Compositor {
///
/// `G = (V, E)` `O(|V|+|E|)` `G`
/// 使
///
/// - Remark:
/// ByVoid[](https://byvoid.com/zht/blog/slm_based_pinyin_ime/)
/// 2022 Gramambular 2
/// - Returns:
@discardableResult mutating func walk() -> (walkedNodes: [Megrez.Node], succeeded: Bool) {
var result = [Megrez.Node]()
defer { walkedNodes = result }
guard !spans.isEmpty else { return (result, true) }
/// Walks the lattice and returns the highest-scoring sequence of nodes.
///
/// `sortAndRelax()` first links the nodes into a vertex network and relaxes every edge.
/// The result is then read off by following the `prev` links backwards from
/// `Megrez.Node.leadingNode`; each visited node is stored as a copy. The vertex network
/// is torn down again on exit.
/// - Returns: The walked nodes, which are also stored in `walkedNodes`. The result is
///   empty when there are no spans.
@discardableResult mutating func walk() -> [Megrez.Node] {
  defer { Self.reinitVertexNetwork() }
  // Clear the previous result up front, so that the early exit below can never leave
  // stale nodes behind in `walkedNodes`.
  walkedNodes.removeAll()
  sortAndRelax()
  guard !spans.isEmpty else { return [] }
  var iterated: Megrez.Node? = Megrez.Node.leadingNode
  // Collected back-to-front; appending and reversing once avoids the quadratic cost
  // of inserting at index 0 on every step.
  var backwardPath = [Megrez.Node]()
  while let itPrev = iterated?.prev {
    // Store a copy, not the live node that still belongs to the vertex network.
    backwardPath.append(itPrev.copy)
    iterated = itPrev
  }
  iterated?.destroyVertex()
  iterated = nil
  // The node visited last is where the `prev` chain ended; it is not part of the result.
  // `dropLast()` is a no-op on an empty path, so an unreachable leading node no longer traps.
  walkedNodes = Array(backwardPath.dropLast().reversed())
  return walkedNodes
}
var vertexSpans: [[Int: Vertex]] = spans.map(\.asVertexSpan)
let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"]))
var root = Vertex(node: .init(keyArray: ["_ROOT_"]))
root.distance = 0
vertexSpans.enumerated().forEach { location, vertexSpan in
vertexSpan.values.forEach { vertex in
let nextVertexPosition = location + vertex.node.spanLength
if nextVertexPosition == vertexSpans.count {
vertex.edges.append(terminal)
///
internal func sortAndRelax() {
Self.reinitVertexNetwork()
guard !spans.isEmpty else { return }
Megrez.Node.trailingNode.distance = 0
spans.enumerated().forEach { location, theSpan in
theSpan.values.forEach { theNode in
let nextVertexPosition = location + theNode.spanLength
if nextVertexPosition == spans.count {
theNode.edges.append(.leadingNode)
return
}
vertexSpans[nextVertexPosition].values.forEach { vertex.edges.append($0) }
spans[nextVertexPosition].values.forEach { theNode.edges.append($0) }
}
}
root.edges.append(contentsOf: vertexSpans[0].values)
topologicalSort(root: &root).reversed().forEach { neta in
neta.edges.indices.forEach { neta.relax(target: &neta.edges[$0]) }
Megrez.Node.trailingNode.edges.append(contentsOf: spans[0].values)
Self.topologicalSort().reversed().forEach { neta in
neta.edges.indices.forEach { Self.relax(u: neta, v: &neta.edges[$0]) }
}
var iterated = terminal
var walked = [Megrez.Node]()
var totalLengthOfKeys = 0
while let itPrev = iterated.prev {
walked.append(itPrev.node)
iterated = itPrev
totalLengthOfKeys += iterated.node.spanLength
}
//
vertexSpans.removeAll()
iterated.destroy()
root.destroy()
terminal.destroy()
guard totalLengthOfKeys == keys.count else {
print("!!! ERROR A")
return (result, false)
}
guard walked.count >= 2 else {
print("!!! ERROR B")
return (result, false)
}
walked = walked.reversed()
walked.removeFirst()
result = walked
return (result, true)
}
}
extension Megrez.SpanUnit {
///
var asVertexSpan: [Int: Megrez.Compositor.Vertex] {
var result = [Int: Megrez.Compositor.Vertex]()
forEach { theKey, theValue in
result[theKey] = .init(node: theValue)
/// Resets the vertex state reachable from both shared sentinel nodes,
/// the trailing one first and the leading one second.
internal static func reinitVertexNetwork() {
  for sentinel in [Megrez.Node.trailingNode, Megrez.Node.leadingNode] {
    sentinel.destroyVertex()
  }
}
/// Orders the nodes reachable from `Megrez.Node.trailingNode` with a depth-first traversal.
///
/// The traversal keeps its own stack of frames instead of recursing. A node is appended
/// to the result at the moment it is marked `topologicallySorted`, so the returned array
/// is in finishing order and callers reverse it before use.
///
/// Reference: Cormen et al. 2001, "Introduction to Algorithms" (topological sort).
/// - Returns: The visited nodes in finishing order.
private static func topologicalSort() -> [Megrez.Node] {
  /// One traversal frame: a node plus the index of the next edge to look at.
  class Frame {
    let node: Megrez.Node
    var nextEdge: Int
    init(node: Megrez.Node, nextEdge: Int = 0) {
      self.node = node
      self.nextEdge = nextEdge
    }
  }
  var finished = [Megrez.Node]()
  var frames = [Frame(node: .trailingNode)]
  while let top = frames.last {
    if top.nextEdge < top.node.edges.count {
      let candidate = top.node.edges[top.nextEdge]
      top.nextEdge += 1
      if !candidate.topologicallySorted {
        // Descend into the unsorted neighbour; this frame resumes afterwards.
        frames.append(Frame(node: candidate))
        continue
      }
      // NOTE(review): an already-sorted neighbour falls through to the finalisation
      // below without examining the remaining edges of this node.
    }
    top.node.topologicallySorted = true
    finished.append(top.node)
    frames.removeLast()
  }
  return finished
}
/// Relaxes the edge that leads from `u` to `v`.
///
/// Reference: Cormen et al. 2001, "Introduction to Algorithms", p. 585 (relax).
/// - Remark: Larger totals win here: `v` is updated only when arriving through `u`
///   yields a strictly greater distance than the one `v` already holds.
/// - Parameters:
///   - u: The node the edge starts from.
///   - v: The node the edge points to; its `distance` and `prev` may be updated.
private static func relax(u: Megrez.Node, v: inout Megrez.Node) {
  // The weight of the edge u → v is the score of the node being entered.
  let distanceViaU = u.distance + v.score
  if v.distance < distanceViaU {
    v.distance = distanceViaU
    v.prev = u
  }
}
}

View File

@ -3,54 +3,64 @@
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
import Foundation
public extension Megrez {
///
struct KeyValuePaired: Equatable, Hashable, Comparable, CustomStringConvertible {
class KeyValuePaired: Unigram, Comparable {
///
public var keyArray: [String]
///
public var value: String
public var keyArray: [String] = []
///
public var description: String { "(" + keyArray.description + "," + value + ")" }
override public var description: String { "(\(keyArray.description),\(value),\(score))" }
/// false
public var isValid: Bool { !keyArray.joined().isEmpty && !value.isEmpty }
/// ()
public var toNGramKey: String { !isValid ? "()" : "(" + joinedKey() + "," + value + ")" }
public var toNGramKey: String { !isValid ? "()" : "(\(joinedKey()),\(value))" }
///
public var tupletExpression: (keyArray: [String], value: String) { (keyArray, value) }
public var keyValueTuplet: (keyArray: [String], value: String) { (keyArray, value) }
///
public var triplet: (keyArray: [String], value: String, score: Double) { (keyArray, value, score) }
///
/// - Parameters:
/// - keyArray:
/// - value:
public init(keyArray: [String], value: String = "N/A") {
/// - score:
public init(keyArray: [String], value: String = "N/A", score: Double = 0) {
super.init(value: value.isEmpty ? "N/A" : value, score: score)
self.keyArray = keyArray.isEmpty ? ["N/A"] : keyArray
self.value = value.isEmpty ? "N/A" : value
}
///
/// - Parameter tupletExpression:
/// - Parameter tripletExpression:
public init(_ tripletExpression: (keyArray: [String], value: String, score: Double)) {
let theValue = tripletExpression.value.isEmpty ? "N/A" : tripletExpression.value
super.init(value: theValue, score: tripletExpression.score)
keyArray = tripletExpression.keyArray.isEmpty ? ["N/A"] : tripletExpression.keyArray
}
///
/// - Parameter tuplet:
public init(_ tupletExpression: (keyArray: [String], value: String)) {
let theValue = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value
super.init(value: theValue, score: 0)
keyArray = tupletExpression.keyArray.isEmpty ? ["N/A"] : tupletExpression.keyArray
value = tupletExpression.value.isEmpty ? "N/A" : tupletExpression.value
}
///
/// - Parameters:
/// - key:
/// - value:
public init(key: String = "N/A", value: String = "N/A") {
keyArray = key.isEmpty ? ["N/A"] : key.components(separatedBy: Megrez.Compositor.theSeparator)
self.value = value.isEmpty ? "N/A" : value
/// - score:
public init(key: String = "N/A", value: String = "N/A", score: Double = 0) {
super.init(value: value.isEmpty ? "N/A" : value, score: score)
keyArray = key.isEmpty ? ["N/A"] : key.sliced(by: Megrez.Compositor.theSeparator)
}
///
/// - Parameter hasher:
public func hash(into hasher: inout Hasher) {
override public func hash(into hasher: inout Hasher) {
hasher.combine(keyArray)
hasher.combine(value)
hasher.combine(score)
}
public func joinedKey(by separator: String = Megrez.Compositor.theSeparator) -> String {
@ -58,7 +68,7 @@ public extension Megrez {
}
public static func == (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
lhs.keyArray == rhs.keyArray && lhs.value == rhs.value
lhs.score == rhs.score && lhs.keyArray == rhs.keyArray && lhs.value == rhs.value
}
public static func < (lhs: KeyValuePaired, rhs: KeyValuePaired) -> Bool {
@ -193,9 +203,9 @@ public extension Megrez.Compositor {
arrOverlappedNodes = fetchOverlappingNodes(at: i)
arrOverlappedNodes.forEach { anchor in
if anchor.node == overridden.node { return }
if !overridden.node.joinedKey(by: "\t").contains(anchor.node.joinedKey(by: "\t"))
|| !overridden.node.value.contains(anchor.node.value)
{
let anchorNodeKeyJoined = anchor.node.joinedKey(by: "\t")
let overriddenNodeKeyJoined = overridden.node.joinedKey(by: "\t")
if !overriddenNodeKeyJoined.has(string: anchorNodeKeyJoined) || !overridden.node.value.has(string: anchor.node.value) {
anchor.node.reset()
return
}

View File

@ -3,8 +3,6 @@
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
import Foundation
public extension Megrez {
///
///
@ -173,6 +171,39 @@ public extension Megrez {
}
return false
}
// MARK: - Vertex Extensions.
// NOTE(review): presumably the vertex-only members below are meant to stay out of the
// node's hashing and equality — TODO confirm against the Hashable implementation.
/// Shared sentinel node keyed `["$TRAILING"]`. `sortAndRelax()` sets its `distance` to 0 and
/// gives it the nodes of the first span as edges; `topologicalSort()` starts from it.
internal static let trailingNode = Megrez.Node(keyArray: ["$TRAILING"])
/// Shared sentinel node keyed `["$LEADING"]`. `sortAndRelax()` appends it as an edge of every
/// node whose span reaches the end of the spans; `walk()` follows `prev` links starting from it.
internal static let leadingNode = Megrez.Node(keyArray: ["$LEADING"])
/// The predecessor recorded by the most recent successful relaxation, if any.
internal var prev: Node?
/// The nodes this node has outgoing edges to.
internal var edges = [Node]()
/// The best accumulated distance found so far for this node.
///
/// It starts at negative infinity, so a relaxation coming from a node with a
/// finite distance always replaces it.
internal var distance = -(Double.infinity)
/// Whether the topological sort has already finalised this node.
internal var topologicallySorted = false
/// Resets the vertex state of this node and cascades the reset to the nodes linked to it.
///
/// The `prev` node is reset first whenever it has a predecessor of its own. After that the
/// `prev` link is dropped, every node in `edges` is reset recursively, and `edges`,
/// `distance` and `topologicallySorted` return to their initial values.
internal func destroyVertex() {
// Resetting `prev` clears its own `prev` link, which is what ends this loop.
while prev?.prev != nil { prev?.destroyVertex() }
prev = nil
// Recurse before clearing the list, so that every node reachable through an edge is reset too.
edges.forEach { $0.destroyVertex() }
edges.removeAll()
distance = -(Double.infinity)
topologicallySorted = false
}
}
}
@ -256,14 +287,9 @@ public extension Array where Element == Megrez.Node {
/// 使 Megrez KeyValuePaired
var smashedPairs: [(key: String, value: String)] {
var arrData = [(key: String, value: String)]()
let separator = Megrez.Compositor.theSeparator
forEach { node in
if node.isReadingMismatched {
var newKey = node.joinedKey()
if !separator.isEmpty, newKey != separator, newKey.contains(separator) {
newKey = newKey.replacingOccurrences(of: separator, with: "\t")
}
arrData.append((key: newKey, value: node.value))
if node.isReadingMismatched, !node.keyArray.joined().isEmpty {
arrData.append((key: node.keyArray.joined(separator: "\t"), value: node.value))
return
}
let arrValueChars = node.value.map(\.description)

View File

@ -1,109 +0,0 @@
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
extension Megrez.Compositor {
/// A graph vertex that wraps one `Megrez.Node` while the compositor walks its spans.
///
/// Besides the node it carries the bookkeeping used by the walk: the chosen predecessor,
/// the outgoing edges, the accumulated distance and the topological-sort flag.
class Vertex {
/// The predecessor recorded by the most recent successful relaxation, if any.
public var prev: Vertex?
/// The vertices this vertex has outgoing edges to.
public var edges = [Vertex]()
/// The best accumulated distance found so far for this vertex.
///
/// It starts at negative infinity, so a relaxation coming from a vertex with a
/// finite distance always replaces it.
public var distance = -(Double.infinity)
/// Whether the topological sort has already finalised this vertex.
public var topologicallySorted = false
/// The node wrapped by this vertex.
public var node: Megrez.Node
/// Creates a vertex around the given node.
///
/// All other members keep their declared initial values.
/// - Parameter node: The node to wrap.
public init(node: Megrez.Node) {
self.node = node
}
/// Tears down this vertex and cascades the teardown to the vertices linked to it.
///
/// The `prev` vertex is destroyed first whenever it has a predecessor of its own. After
/// that the `prev` link is dropped, every vertex in `edges` is destroyed recursively,
/// `edges` is emptied, and `node` is replaced with a freshly initialised one.
public func destroy() {
// Destroying `prev` clears its own `prev` link, which is what ends this loop.
while prev?.prev != nil { prev?.destroy() }
prev = nil
// Recurse before clearing the list, so that every vertex reachable through an edge is destroyed too.
edges.forEach { $0.destroy() }
edges.removeAll()
node = .init()
}
/// Relaxes the edge that leads from this vertex to `target`.
///
/// Reference: Cormen et al. 2001, "Introduction to Algorithms", p. 585 (relax).
/// - Remark: This vertex plays the role of `u` and `target` the role of `v`.
/// - Parameters:
///   - target: The vertex the edge points to; its `distance` and `prev` may be updated.
public func relax(target: inout Vertex) {
// The weight w of the edge u → v is the score of the node being entered.
let w: Double = target.node.score
// Larger totals win here: v is left alone unless arriving through u gives it a
// strictly greater distance than the one it already holds. Otherwise u becomes
// the predecessor of v.
if target.distance >= distance + w { return }
target.distance = distance + w
target.prev = self
}
}
/// Orders the vertices reachable from `root` with a depth-first traversal.
///
/// The traversal keeps its own stack of `State` objects instead of recursing. A vertex is
/// appended to the result at the moment it is marked `topologicallySorted`, so the returned
/// array is in finishing order.
///
/// NOTE(review): when the edge just examined leads to an already-sorted vertex, control falls
/// through and the current vertex is finalised without looking at its remaining edges —
/// confirm that this is intended.
///
/// Reference: Cormen et al. 2001, "Introduction to Algorithms" (topological sort).
/// - Parameter root: The vertex the traversal starts from.
/// - Returns: The visited vertices in finishing order.
func topologicalSort(root: inout Vertex) -> [Vertex] {
// One traversal frame: a vertex plus the index of the next edge to look at.
// It is a class so that updating `iterIndex` through `state` also updates the stack entry.
class State {
var iterIndex: Int
let vertex: Vertex
init(vertex: Vertex, iterIndex: Int = 0) {
self.vertex = vertex
self.iterIndex = iterIndex
}
}
var result = [Vertex]()
var stack = [State]()
stack.append(.init(vertex: root))
while !stack.isEmpty {
let state = stack[stack.count - 1]
let theVertex = state.vertex
if state.iterIndex < state.vertex.edges.count {
let newVertex = state.vertex.edges[state.iterIndex]
state.iterIndex += 1
if !newVertex.topologicallySorted {
// Descend into the unsorted neighbour; the current frame resumes afterwards.
stack.append(.init(vertex: newVertex))
continue
}
}
theVertex.topologicallySorted = true
result.append(theVertex)
stack.removeLast()
}
return result
}
}

View File

@ -5,7 +5,7 @@
public extension Megrez {
///
@frozen struct Unigram: Equatable, CustomStringConvertible, Hashable {
class Unigram: Equatable, CustomStringConvertible, Hashable {
///
public var value: String
///

View File

@ -0,0 +1,74 @@
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// This package is dropping its dependency on Foundation; this file supplies the string helpers that used to come from it.
extension StringProtocol {
  /// Tells whether `target` occurs anywhere in the receiver.
  ///
  /// Both strings are compared as sequences of Unicode scalars, and the length check
  /// uses scalar counts as well, so it always agrees with the matching itself.
  /// - Parameter target: The text to look for.
  /// - Returns: `true` when `target` is found. An empty `target` is reported as found
  ///   only when the receiver is empty too.
  func has(string target: any StringProtocol) -> Bool {
    let haystack = Array(unicodeScalars)
    let needle = Array(target.description.unicodeScalars)
    guard !needle.isEmpty else { return haystack.isEmpty }
    guard haystack.count >= needle.count else { return false }
    // Only start positions that leave room for the whole needle can match.
    for start in 0 ... (haystack.count - needle.count) {
      if haystack[start ..< start + needle.count].elementsEqual(needle) { return true }
    }
    return false
  }

  /// Splits the receiver at every non-overlapping occurrence of `separator`.
  ///
  /// Occurrences are searched from left to right on Unicode scalars. Empty pieces are
  /// kept, so a leading, trailing or doubled separator produces empty strings.
  /// - Parameter separator: The text to split at. With an empty separator nothing can
  ///   be split, and the whole receiver is returned as the only element.
  /// - Returns: The pieces between the separators, in order; never an empty array.
  func sliced(by separator: any StringProtocol = "") -> [String] {
    scalarSegments(separatedBy: Array(separator.description.unicodeScalars))
  }

  /// Returns a copy of the receiver with every non-overlapping occurrence of `target`
  /// replaced by `newString`.
  /// - Parameters:
  ///   - target: The text to replace. An empty target leaves the receiver unchanged.
  ///   - newString: The replacement text.
  /// - Returns: The resulting string.
  func swapping(_ target: String, with newString: String) -> String {
    scalarSegments(separatedBy: Array(target.unicodeScalars)).joined(separator: newString)
  }

  /// Cuts the receiver into the pieces that lie between left-to-right, non-overlapping
  /// occurrences of `separator`, working on Unicode scalars.
  private func scalarSegments(separatedBy separator: [Unicode.Scalar]) -> [String] {
    let scalars = Array(unicodeScalars)
    var segments: [String] = []
    var pending = String.UnicodeScalarView()
    var index = 0
    while index < scalars.count {
      let matchEnd = index + separator.count
      if !separator.isEmpty, matchEnd <= scalars.count,
         scalars[index ..< matchEnd].elementsEqual(separator)
      {
        segments.append(String(pending))
        pending.removeAll()
        // Jump past the whole match so that its scalars cannot start another match.
        index = matchEnd
      } else {
        pending.append(scalars[index])
        index += 1
      }
    }
    segments.append(String(pending))
    return segments
  }
}

View File

@ -19,7 +19,7 @@ class SimpleLM: LangModelProtocol {
let col0 = String(linestream[0])
let col1 = String(linestream[1])
let col2 = Double(linestream[2]) ?? 0.0
var u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0)
let u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: 0)
u.score = col2
mutDatabase[swapKeyValue ? col1 : col0, default: []].append(u)
}

View File

@ -351,7 +351,7 @@ final class MegrezTests: XCTestCase {
"高科技公司的年終獎金".forEach { i in
compositor.insertKey(i.description)
}
let result = compositor.walk().0
let result = compositor.walk()
XCTAssertEqual(result.joinedKeys(by: ""), ["高科技", "公司", "", "年終", "獎金"])
}
@ -384,7 +384,7 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("jiang3")
compositor.walk()
compositor.insertKey("jin1")
var result = compositor.walk().0
var result = compositor.walk()
XCTAssertEqual(result.values, ["高科技", "公司", "", "年中", "獎金"])
XCTAssertEqual(compositor.length, 10)
compositor.cursor = 7
@ -394,7 +394,7 @@ final class MegrezTests: XCTestCase {
XCTAssertTrue(candidates.contains(""))
XCTAssertTrue(candidates.contains(""))
XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 7))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高科技", "公司", "", "年終", "獎金"])
let candidatesBeginAt = compositor.fetchCandidates(at: 3, filter: .beginAt).map(\.value)
let candidatesEndAt = compositor.fetchCandidates(at: 3, filter: .endAt).map(\.value)
@ -436,11 +436,11 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("gao1")
compositor.insertKey("ke1")
compositor.insertKey("ji4")
var result = compositor.walk().0
var result = compositor.walk()
XCTAssertEqual(result.values, ["高科技"])
compositor.insertKey("gong1")
compositor.insertKey("si1")
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高科技", "公司"])
}
@ -450,29 +450,29 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("gao1")
compositor.insertKey("ke1")
compositor.insertKey("ji4")
var result = compositor.walk().0
var result = compositor.walk()
XCTAssertEqual(result.values, ["高科技"])
compositor.cursor = 0
XCTAssertTrue(compositor.overrideCandidateLiteral("", at: compositor.cursor))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["", "科技"])
XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 1))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高科技"])
XCTAssertTrue(compositor.overrideCandidateLiteral("", at: 0))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["", "科技"])
XCTAssertTrue(compositor.overrideCandidateLiteral("", at: 1))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["", "", ""])
XCTAssertTrue(compositor.overrideCandidateLiteral("", at: 2))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["", "", ""])
XCTAssertTrue(compositor.overrideCandidateLiteral("高科技", at: 3))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高科技"])
}
@ -484,19 +484,19 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("zhong1")
compositor.insertKey("jiang3")
compositor.insertKey("jin1")
var result = compositor.walk().0
var result = compositor.walk()
XCTAssertEqual(result.values, ["年中", "獎金"])
XCTAssertTrue(compositor.overrideCandidateLiteral("終講", at: 1))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["", "終講", ""])
XCTAssertTrue(compositor.overrideCandidateLiteral("槳襟", at: 2))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["年中", "槳襟"])
XCTAssertTrue(compositor.overrideCandidateLiteral("年終", at: 0))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["年終", "槳襟"])
}
@ -509,16 +509,16 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("yan4")
compositor.insertKey("wei2")
compositor.insertKey("xian3")
var result = compositor.walk().0
var result = compositor.walk()
XCTAssertEqual(result.values, ["高熱", "火焰", "危險"])
let location = 2
XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3"], value: "🔥"), at: location))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高熱", "🔥", "", "危險"])
XCTAssertTrue(compositor.overrideCandidate(.init(keyArray: ["huo3", "yan4"], value: "🔥"), at: location))
result = compositor.walk().0
result = compositor.walk()
XCTAssertEqual(result.values, ["高熱", "🔥", "危險"])
}
@ -530,11 +530,11 @@ final class MegrezTests: XCTestCase {
compositor.insertKey("zhong1")
compositor.insertKey("jiang3")
compositor.insertKey("jin1")
let oldResult = compositor.walk().0.values.joined()
let oldResult = compositor.walk().values.joined()
print(oldResult)
theLM.trim(key: "nian2zhong1", value: "年中")
compositor.update(updateExisting: true)
let newResult = compositor.walk().0.values.joined()
let newResult = compositor.walk().values.joined()
print(newResult)
XCTAssertEqual([oldResult, newResult], ["年中獎金", "年終獎金"])
compositor.cursor = 4
@ -542,7 +542,7 @@ final class MegrezTests: XCTestCase {
compositor.dropKey(direction: .rear)
theLM.trim(key: "nian2zhong1", value: "年終")
compositor.update(updateExisting: true)
let newResult2 = compositor.walk().0.values
let newResult2 = compositor.walk().values
print(newResult2)
XCTAssertEqual(newResult2, ["", ""])
}
@ -555,8 +555,8 @@ final class MegrezTests: XCTestCase {
compositorA.insertKey(key.description)
}
var compositorB = compositorA.hardCopy
let resultA = compositorA.walk().walkedNodes
let resultB = compositorB.walk().walkedNodes
let resultA = compositorA.walk()
let resultB = compositorB.walk()
XCTAssertEqual(resultA, resultB)
}