LMAssembly // Introducing LMCassette.

- Powered by Tohno Engine.
This commit is contained in:
ShikiSuen 2022-10-14 21:55:30 +08:00
parent 57d9c3f5e1
commit f74d3c174d
6 changed files with 36409 additions and 1 deletions

View File

@ -27,6 +27,10 @@ let package = Package(
.product(name: "Shared", package: "vChewing_Shared"),
.product(name: "PinyinPhonaConverter", package: "vChewing_PinyinPhonaConverter"),
]
)
),
.testTarget(
name: "LangModelAssemblyTests",
dependencies: ["LangModelAssembly"]
),
]
)

View File

@ -2,6 +2,20 @@
威注音輸入法的語言模組總成套裝。
- vChewingLM總命名空間也承載一些在套裝內共用的工具函式。
- LMConsolidator自動格式整理模組。
- LMInstantiator:語言模組副本化模組。另有其日期時間擴充模組可用(對 CIN 磁帶模式無效)。
以下是子模組:
- lmCassette專門用來處理 CIN 磁帶檔案的模組,命名為「遠野」引擎。
- LMAssociates聯想詞模組。
- LMCoreEX可以直接讀取 TXT 格式的帶有權重資料的語彙檔案的模組。
- LMCoreNS專門用來讀取原廠 plist 檔案的模組。
- lmPlainBopomofo專門用來讀取使用者自訂ㄅ半候選字順序覆蓋定義檔案plist的模組。
- lmReplacements專門用來讀取使用者語彙置換模式的辭典資料的模組。
- lmUserOverride半衰記憶模組。
```
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================

View File

@ -0,0 +1,143 @@
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import LineReader
import Megrez
import Shared
extension vChewingLM {
  /// A language-model component that reads a CIN table file into memory and
  /// answers key lookups from the loaded data.
  @frozen public struct LMCassette {
    /// English name of the table, taken from the `%ename` line.
    public private(set) var nameENG: String = ""
    /// CJK name of the table, taken from the `%cname` line.
    public private(set) var nameCJK: String = ""
    /// Character count of the longest key seen in the `%chardef` section (never below 1).
    public private(set) var maxKeyLength: Int = 1
    /// The characters of the `%selkey` value, one string per character, deduplicated.
    public private(set) var selectionKeys: [String] = []
    /// The characters of the `%endkey` value, one string per character, deduplicated.
    public private(set) var endKeys: [String] = []
    /// Entries of the `%keyname` section: first cell of each line mapped to its second cell.
    public private(set) var keyNameMap: [String: String] = [:]
    /// Entries of the `%chardef` section: each key mapped to its values in file order.
    public private(set) var charDefMap: [String: [String]] = [:]

    /// Number of distinct keys in `charDefMap`.
    public var count: Int { charDefMap.count }

    /// Whether any character definition is currently held.
    public var isLoaded: Bool { !charDefMap.isEmpty }

    /// The keys of `keyNameMap`, in no particular order.
    public var allowedKeys: [String] { Array(keyNameMap.keys) }

    /// Looks up the display name registered for a key.
    /// - Parameter char: The key to translate.
    /// - Returns: The `keyNameMap` entry for `char`, or `char` itself when there is none.
    public func convertKeyToDisplay(char: String) -> String {
      keyNameMap[char] ?? char
    }

    /// Reads a CIN file into this instance.
    /// - Note:
    ///   - Everything before the first line containing `%gen_inp` or `%ename ` is ignored.
    ///   - Only lines made of exactly two space-separated cells are interpreted.
    ///   - `%ename`, `%cname`, `%selkey` and `%endkey` are read outside of sections;
    ///     for each of them only the first occurrence is kept.
    ///   - Lines between `%keyname begin` and `%keyname end` fill `keyNameMap`.
    ///   - Lines between `%chardef begin` and `%chardef end` fill `charDefMap`.
    ///   - The `... begin` lines themselves are section markers and are never stored as data.
    /// - Parameter path: File system path of the CIN file.
    /// - Returns: `true` once the file has been read through; `false` when data is
    ///   already loaded, the file does not exist, or it cannot be opened for reading.
    @discardableResult public mutating func open(_ path: String) -> Bool {
      if isLoaded { return false }
      guard FileManager.default.fileExists(atPath: path) else {
        vCLog("CIN Loading Failed: File Missing.")
        return false
      }
      do {
        guard let fileHandle = FileHandle(forReadingAtPath: path) else {
          throw FileErrors.fileHandleError("")
        }
        let lineReader = try LineReader(file: fileHandle)
        var theMaxKeyLength = 1
        // Set when the header was opened by a `%gen_inp` line; it selects how `%ename` is parsed.
        var isOV = false
        var shouldStartReading = false
        var loadingKeys = false
        var loadingCharDefinitions = false
        for strLine in lineReader {
          if !shouldStartReading, strLine.contains("%gen_inp") || strLine.contains("%ename ") {
            isOV = strLine.contains("%gen_inp")
            shouldStartReading = true
          }
          guard shouldStartReading else { continue }

          let opensKeyNames = strLine.contains("%keyname begin")
          let opensCharDefs = strLine.contains("%chardef begin")
          if opensKeyNames { loadingKeys = true }
          if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
          if opensCharDefs { loadingCharDefinitions = true }
          if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
          // A section-begin line splits into two cells ("%keyname" / "begin"), so without
          // this skip it would be recorded as if it were a real key or character definition.
          if opensKeyNames || opensCharDefs { continue }

          let cells = strLine.split(separator: " ")
          guard cells.count == 2 else { continue }
          let head = String(cells[0])
          let tail = String(cells[1])

          if loadingKeys {
            keyNameMap[head] = tail
            continue
          }
          if loadingCharDefinitions {
            if !strLine.contains("%chardef") { theMaxKeyLength = max(theMaxKeyLength, head.count) }
            charDefMap[head, default: []].append(tail)
            continue
          }

          // Header directives; each one is honoured only the first time it appears.
          if nameENG.isEmpty, strLine.contains("%ename ") {
            nameENG = Self.englishName(from: tail, isOV: isOV)
          }
          if nameCJK.isEmpty, strLine.contains("%cname ") { nameCJK = tail }
          if selectionKeys.isEmpty, strLine.contains("%selkey ") {
            selectionKeys = tail.map { String($0) }.deduplicated
          }
          if endKeys.isEmpty, strLine.contains("%endkey ") {
            endKeys = tail.map { String($0) }.deduplicated
          }
        }
        maxKeyLength = theMaxKeyLength
        return true
      } catch {
        vCLog("CIN Loading Failed: File Access Error.")
        return false
      }
    }

    /// Extracts the English table name from the value cell of an `%ename` line.
    ///
    /// When `isOV` is set the value is used verbatim. Otherwise the value is treated as a
    /// `;`-separated list of `name:tag` pairs and the name of the first pair whose tag
    /// contains `en` is used; if that yields nothing, the whole value is used.
    private static func englishName(from rawValue: String, isOV: Bool) -> String {
      guard !isOV else { return rawValue }
      var picked = ""
      for neta in rawValue.components(separatedBy: ";") {
        let subNetaGroup = neta.components(separatedBy: ":")
        if subNetaGroup.count == 2, subNetaGroup[1].contains("en") {
          picked = subNetaGroup[0]
          break
        }
      }
      return picked.isEmpty ? rawValue : picked
    }

    /// Discards everything loaded so far and restores the initial property values.
    public mutating func close() {
      keyNameMap.removeAll()
      charDefMap.removeAll()
      nameENG.removeAll()
      nameCJK.removeAll()
      selectionKeys.removeAll()
      endKeys.removeAll()
      maxKeyLength = 1
    }

    /// Builds the unigrams registered for a key.
    /// - parameters:
    ///   - key: The key to look up in `charDefMap`.
    /// - Returns: One unigram per deduplicated value, in stored order. The unigram at
    ///   position `i` carries the score `Double(i) * -0.001`. Empty when the key is unknown.
    public func unigramsFor(key: String) -> [Megrez.Unigram] {
      guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
      return arrRaw.enumerated().map { i, neta in
        Megrez.Unigram(value: neta, score: Double(i) * -0.001)
      }
    }

    /// Tells whether `charDefMap` has an entry for a key.
    /// - parameters:
    ///   - key: The key to look up.
    public func hasUnigramsFor(key: String) -> Bool {
      charDefMap[key] != nil
    }
  }
}

View File

@ -0,0 +1,51 @@
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import XCTest
@testable import LangModelAssembly
/// Path of the package root, derived from this source file's location by keeping
/// every path component that precedes the first component named "Tests".
private let packageRootPath: Substring = {
  let sourceComponents = URL(fileURLWithPath: #file).pathComponents
  let rootComponents = sourceComponents.prefix { $0 != "Tests" }
  // For an absolute path the first component is expected to be "/" itself, so the
  // joined string starts with a doubled slash; dropping one character removes it.
  let joinedPath = rootComponents.joined(separator: "/")
  return joinedPath.dropFirst()
}()

/// Directory holding the CIN fixture files used by the tests below.
private let testDataPath: String = String(packageRootPath) + "/Tests/TestCINData/"
/// Loads the bundled Wubi CIN fixtures through `LMCassette` and checks the parsed metadata.
final class LMCassetteTests: XCTestCase {
  /// Opens a fixture from `testDataPath` and records a test failure if loading did not succeed.
  ///
  /// Checking the result of `open(_:)` here makes a missing or unreadable fixture show up
  /// as one explicit failure instead of a series of unrelated-looking count mismatches.
  /// - Parameter fileName: File name of the fixture inside `testDataPath`.
  /// - Returns: The cassette after the load attempt.
  private func loadCassette(fileName: String) -> vChewingLM.LMCassette {
    var lmCassette = vChewingLM.LMCassette()
    NSLog("LMCassette: Start loading CIN.")
    let didLoad = lmCassette.open(testDataPath + fileName)
    XCTAssertTrue(didLoad, "Unable to load CIN fixture: \(fileName)")
    NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
    return lmCassette
  }

  func testCassetteLoadWubi98() throws {
    let lmCassette = loadCassette(fileName: "wubi98.cin")
    XCTAssertEqual(lmCassette.charDefMap.count, 21492)
    XCTAssertEqual(lmCassette.keyNameMap.count, 26)
    XCTAssertEqual(lmCassette.nameENG, "Wubi98")
    XCTAssertEqual(lmCassette.nameCJK, "五笔98")
    XCTAssertEqual(lmCassette.maxKeyLength, 4)
    XCTAssertEqual(lmCassette.endKeys.count, 0)
    XCTAssertEqual(lmCassette.selectionKeys.count, 10)
  }

  func testCassetteLoadWubi86() throws {
    let lmCassette = loadCassette(fileName: "wubi86.cin")
    XCTAssertEqual(lmCassette.charDefMap.count, 10691)
    XCTAssertEqual(lmCassette.keyNameMap.count, 26)
    XCTAssertEqual(lmCassette.nameENG, "Wubi86")
    XCTAssertEqual(lmCassette.nameCJK, "五笔86")
    XCTAssertEqual(lmCassette.maxKeyLength, 4)
    XCTAssertEqual(lmCassette.endKeys.count, 0)
    XCTAssertEqual(lmCassette.selectionKeys.count, 10)
  }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff