LMAssembly // Introducing LMCassette.
- Powered by Tohno Engine.
This commit is contained in:
parent
57d9c3f5e1
commit
f74d3c174d
|
@ -27,6 +27,10 @@ let package = Package(
|
||||||
.product(name: "Shared", package: "vChewing_Shared"),
|
.product(name: "Shared", package: "vChewing_Shared"),
|
||||||
.product(name: "PinyinPhonaConverter", package: "vChewing_PinyinPhonaConverter"),
|
.product(name: "PinyinPhonaConverter", package: "vChewing_PinyinPhonaConverter"),
|
||||||
]
|
]
|
||||||
)
|
),
|
||||||
|
.testTarget(
|
||||||
|
name: "LangModelAssemblyTests",
|
||||||
|
dependencies: ["LangModelAssembly"]
|
||||||
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,6 +2,20 @@
|
||||||
|
|
||||||
威注音輸入法的語言模組總成套裝。
|
威注音輸入法的語言模組總成套裝。
|
||||||
|
|
||||||
|
- vChewingLM:總命名空間,也承載一些在套裝內共用的工具函式。
|
||||||
|
- LMConsolidator:自動格式整理模組。
|
||||||
|
- LMInstantiator:語言模組副本化模組。另有其日期時間擴充模組可用(對 CIN 磁帶模式無效)。
|
||||||
|
|
||||||
|
以下是子模組:
|
||||||
|
|
||||||
|
- lmCassette:專門用來處理 CIN 磁帶檔案的模組,命名為「遠野」引擎。
|
||||||
|
- LMAssociates:聯想詞模組。
|
||||||
|
- LMCoreEX:可以直接讀取 TXT 格式的帶有權重資料的語彙檔案的模組。
|
||||||
|
- LMCoreNS:專門用來讀取原廠 plist 檔案的模組。
|
||||||
|
- lmPlainBopomofo:專門用來讀取使用者自訂ㄅ半候選字順序覆蓋定義檔案(plist)的模組。
|
||||||
|
- lmReplacements:專門用來讀取使用者語彙置換模式的辭典資料的模組。
|
||||||
|
- lmUserOverride:半衰記憶模組。
|
||||||
|
|
||||||
```
|
```
|
||||||
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||||
// ====================
|
// ====================
|
||||||
|
|
|
@ -0,0 +1,143 @@
|
||||||
|
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||||
|
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
|
||||||
|
// ====================
|
||||||
|
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||||
|
// ... with NTL restriction stating that:
|
||||||
|
// No trademark license is granted to use the trade names, trademarks, service
|
||||||
|
// marks, or product names of Contributor, except as required to fulfill notice
|
||||||
|
// requirements defined in MIT License.
|
||||||
|
|
||||||
|
import Foundation
|
||||||
|
import LineReader
|
||||||
|
import Megrez
|
||||||
|
import Shared
|
||||||
|
|
||||||
|
extension vChewingLM {
  /// Cassette module that lets users plug in their own radical-based input
  /// tables supplied as CIN files.
  @frozen public struct LMCassette {
    /// English name of the cassette, read from the `%ename` directive.
    public private(set) var nameENG: String = ""
    /// CJK name of the cassette, read from the `%cname` directive.
    public private(set) var nameCJK: String = ""
    /// The maximum number of key strokes a single entry may need.
    public private(set) var maxKeyLength: Int = 1
    /// The characters listed by the `%selkey` directive (passed through `deduplicated`).
    public private(set) var selectionKeys: [String] = []
    /// The characters listed by the `%endkey` directive (passed through `deduplicated`).
    public private(set) var endKeys: [String] = []
    /// Key-to-display-name table read from the `%keyname` section.
    public private(set) var keyNameMap: [String: String] = [:]
    /// Key-sequence-to-candidates table read from the `%chardef` section.
    /// Candidates keep the order in which they appear in the file.
    public private(set) var charDefMap: [String: [String]] = [:]

    /// The number of distinct key sequences stored in `charDefMap`.
    public var count: Int { charDefMap.count }
    /// Whether any character definition has been loaded.
    public var isLoaded: Bool { !charDefMap.isEmpty }
    /// The keys allowed for typing, i.e. every key of `keyNameMap` (in no particular order).
    public var allowedKeys: [String] { Array(keyNameMap.keys) }

    /// Converts the given key letter into the form meant for display.
    /// - Parameter char: The key letter to look up in `keyNameMap`.
    /// - Returns: The mapped display name, or `char` itself when there is no mapping.
    public func convertKeyToDisplay(char: String) -> String {
      keyNameMap[char] ?? char
    }

    /// Loads the contents of the given CIN file.
    /// - Note:
    ///   - Lines are ignored until one containing `%gen_inp` or `%ename ` has been read;
    ///     that line marks the file as a CIN file.
    ///   - `%ename` sets the English name of the cassette, `%cname` sets its CJK name.
    ///   - `%encoding` is not handled.
    ///   - `%selkey` is stored in `selectionKeys`, one element per character.
    ///   - `%endkey` lists the keys that trigger a composition event; they are stored in `endKeys`.
    ///   - Lines between `%keyname begin` and `%keyname end` form the key name table.
    ///   - Lines between `%chardef begin` and `%chardef end` form the character definition data.
    ///   - Only lines that split into exactly two space-separated cells are used.
    /// - Parameter path: The file path.
    /// - Returns: `true` when the file has been read through; `false` when data is already
    ///   loaded, the file is missing, or it cannot be opened.
    @discardableResult public mutating func open(_ path: String) -> Bool {
      if isLoaded { return false }
      guard FileManager.default.fileExists(atPath: path) else {
        vCLog("CIN Loading Failed: File Missing.")
        return false
      }
      do {
        guard let fileHandle = FileHandle(forReadingAtPath: path) else {
          throw FileErrors.fileHandleError("")
        }
        let lineReader = try LineReader(file: fileHandle)
        var theMaxKeyLength = 1
        // True when the first recognised line contains `%gen_inp`; it selects how `%ename` is parsed.
        var isOV = false
        var shouldStartReading = false
        var loadingKeys = false
        var loadingCharDefinitions = false
        for strLine in lineReader {
          if !shouldStartReading, strLine.contains("%gen_inp") || strLine.contains("%ename ") {
            isOV = strLine.contains("%gen_inp")
            shouldStartReading = true
          }
          guard shouldStartReading else { continue }
          // The section flags are updated BEFORE the line is consumed, so a `begin` line is
          // seen with its flag already set and an `end` line with its flag already cleared.
          if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
          if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
          if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
          if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
          let cells = strLine.split(separator: " ")
          guard cells.count == 2 else { continue }
          if loadingKeys {
            // Skip the `%keyname begin` marker itself: it also has two cells and would
            // otherwise be stored as a bogus key named "%keyname".
            if cells[0] != "%keyname" {
              keyNameMap[String(cells[0])] = String(cells[1])
            }
          } else if loadingCharDefinitions {
            // Likewise, the `%chardef begin` marker must not become a dictionary entry.
            if cells[0] != "%chardef" {
              theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
              charDefMap[String(cells[0]), default: []].append(String(cells[1]))
            }
          }
          // Header directives are only honoured outside of the two data sections.
          guard !loadingKeys, !loadingCharDefinitions else { continue }
          if nameENG.isEmpty, strLine.contains("%ename ") {
            nameENG = Self.resolveEnglishName(from: String(cells[1]), isOV: isOV)
          }
          if nameCJK.isEmpty, strLine.contains("%cname ") { nameCJK = String(cells[1]) }
          if selectionKeys.isEmpty, strLine.contains("%selkey ") {
            selectionKeys = cells[1].map { String($0) }.deduplicated
          }
          if endKeys.isEmpty, strLine.contains("%endkey ") {
            endKeys = cells[1].map { String($0) }.deduplicated
          }
        }
        maxKeyLength = theMaxKeyLength
        return true
      } catch {
        vCLog("CIN Loading Failed: File Access Error.")
        return false
      }
    }

    /// Extracts the English cassette name from the value of an `%ename` directive.
    /// - Parameters:
    ///   - rawValue: The second cell of the `%ename` line.
    ///   - isOV: Whether the file started with `%gen_inp`; such files use the value verbatim.
    /// - Returns: For non-`%gen_inp` files, the name part of the first `name:locale` item
    ///   (items are separated by `;`) whose locale contains `en`. `rawValue` is returned
    ///   when there is no such item or its name part is empty.
    private static func resolveEnglishName(from rawValue: String, isOV: Bool) -> String {
      if !isOV {
        for neta in rawValue.components(separatedBy: ";") {
          let subNetaGroup = neta.components(separatedBy: ":")
          if subNetaGroup.count == 2, subNetaGroup[1].contains("en") {
            if !subNetaGroup[0].isEmpty { return subNetaGroup[0] }
            break
          }
        }
      }
      return rawValue
    }

    /// Discards all loaded data and resets every property to its initial value.
    public mutating func close() {
      keyNameMap.removeAll()
      charDefMap.removeAll()
      nameENG.removeAll()
      nameCJK.removeAll()
      selectionKeys.removeAll()
      endKeys.removeAll()
      maxKeyLength = 1
    }

    /// Fetches the unigrams stored for the given key sequence.
    /// - Parameter key: The key sequence used as the index.
    /// - Returns: One unigram per candidate after `deduplicated` has been applied. The score
    ///   is `index * -0.001`, so earlier candidates score higher. Empty when the key is unknown.
    public func unigramsFor(key: String) -> [Megrez.Unigram] {
      guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
      return arrRaw.enumerated().map { index, neta in
        Megrez.Unigram(value: neta, score: Double(index) * -0.001)
      }
    }

    /// Checks whether any data is stored for the given key sequence.
    /// - Parameter key: The key sequence used as the index.
    /// - Returns: `true` when `charDefMap` has an entry for `key`.
    public func hasUnigramsFor(key: String) -> Bool {
      charDefMap[key] != nil
    }
  }
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||||
|
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
|
||||||
|
// ====================
|
||||||
|
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
|
||||||
|
// ... with NTL restriction stating that:
|
||||||
|
// No trademark license is granted to use the trade names, trademarks, service
|
||||||
|
// marks, or product names of Contributor, except as required to fulfill notice
|
||||||
|
// requirements defined in MIT License.
|
||||||
|
|
||||||
|
import Foundation
|
||||||
|
import XCTest
|
||||||
|
|
||||||
|
@testable import LangModelAssembly
|
||||||
|
|
||||||
|
/// Path of the package root: every path component of this source file that
/// precedes the `Tests` directory, joined with `/`.
private let packageRootPath: Substring = {
  let allComponents = URL(fileURLWithPath: #file).pathComponents
  let rootComponents = allComponents.prefix { $0 != "Tests" }
  // For an absolute path the first component is "/", so the joined string starts
  // with a doubled slash; drop one of them.
  return rootComponents.joined(separator: "/").dropFirst()
}()

/// Directory (with trailing slash) holding the CIN fixtures used by the tests.
private let testDataPath: String = String(packageRootPath) + "/Tests/TestCINData/"
|
||||||
|
|
||||||
|
/// Loads the bundled CIN fixtures through `LMCassette` and checks the parsed metadata
/// and table sizes.
final class LMCassetteTests: XCTestCase {
  func testCassetteLoadWubi98() throws {
    verifyCassette(fileName: "wubi98.cin", charDefCount: 21492, nameENG: "Wubi98", nameCJK: "五笔98")
  }

  func testCassetteLoadWubi86() throws {
    verifyCassette(fileName: "wubi86.cin", charDefCount: 10691, nameENG: "Wubi86", nameCJK: "五笔86")
  }

  /// Loads one fixture and compares the parsed result against the expected figures.
  /// - Parameters:
  ///   - fileName: File name of the fixture inside `testDataPath`.
  ///   - charDefCount: Expected number of entries in `charDefMap`.
  ///   - nameENG: Expected English cassette name.
  ///   - nameCJK: Expected CJK cassette name.
  ///   - file: Source file reported on failure (defaults to the caller).
  ///   - line: Source line reported on failure (defaults to the caller).
  private func verifyCassette(
    fileName: String, charDefCount: Int, nameENG: String, nameCJK: String,
    file: StaticString = #file, line: UInt = #line
  ) {
    let pathCINFile = testDataPath + fileName
    var lmCassette = vChewingLM.LMCassette()
    NSLog("LMCassette: Start loading CIN.")
    let didLoad = lmCassette.open(pathCINFile)
    NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
    // Fail loudly when the fixture is missing or unreadable, instead of only
    // reporting a series of confusing count mismatches below.
    XCTAssertTrue(didLoad, "Failed to load CIN file at \(pathCINFile)", file: file, line: line)
    XCTAssertEqual(lmCassette.charDefMap.count, charDefCount, file: file, line: line)
    XCTAssertEqual(lmCassette.keyNameMap.count, 26, file: file, line: line)
    XCTAssertEqual(lmCassette.nameENG, nameENG, file: file, line: line)
    XCTAssertEqual(lmCassette.nameCJK, nameCJK, file: file, line: line)
    XCTAssertEqual(lmCassette.maxKeyLength, 4, file: file, line: line)
    XCTAssertEqual(lmCassette.endKeys.count, 0, file: file, line: line)
    XCTAssertEqual(lmCassette.selectionKeys.count, 10, file: file, line: line)
  }
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue