From 58815d7c54ff599978b8664b4f2e71dabe3f756d Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Fri, 16 Feb 2024 02:36:05 +0800 Subject: [PATCH] LMAssembly // Implement CNS pronunciation filter. --- .../LangModelAssembly/LMInstantiator.swift | 13 +++++++- .../LMInstantiator_SQLExtension.swift | 31 +++++++++++++++++++ .../LangModelAssembly/TestCoreLMSQLData.swift | 2 ++ .../LMInstantiatorSQLTests.swift | 17 ++++++++++ 4 files changed, 62 insertions(+), 1 deletion(-) diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift index 486accc3..deb70455 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift @@ -39,6 +39,7 @@ public extension vChewingLM { public var isCNSEnabled = false public var isSymbolEnabled = false public var isSCPCEnabled = false + public var filterNonCNSReadings = false public var deltaOfCalendarYears: Int = -2000 } @@ -345,7 +346,17 @@ public extension vChewingLM { rawAllUnigrams += supplyNumPadUnigrams(key: keyChain) // LMMisc 與 LMCore 的 score 在 (-10.0, 0.0) 這個區間內。 rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW) - rawAllUnigrams += factoryCoreUnigramsFor(key: keyChain) + // 原廠核心辭典內容。 + var coreUnigramsResult: [Megrez.Unigram] = factoryCoreUnigramsFor(key: keyChain) + // 如果是繁體中文、且有開啟 CNS11643 全字庫讀音過濾開關的話,對原廠核心辭典內容追加過濾處理: + if config.filterNonCNSReadings, !isCHS { + coreUnigramsResult.removeAll { thisUnigram in + !checkCNSConformation(for: thisUnigram, keyArray: keyArray) + } + } + // 正式追加原廠核心辭典檢索結果。 + rawAllUnigrams += coreUnigramsResult + if config.isCNSEnabled { rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS) } diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator_SQLExtension.swift 
b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator_SQLExtension.swift index 6608b564..41eb2e33 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator_SQLExtension.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator_SQLExtension.swift @@ -165,6 +165,24 @@ extension vChewingLM.LMInstantiator { return grams } + /// Fetches, for the given reading key, the matching UTF-8 data from the CNS column of the factory dictionary database. + /// Only intended for quickly screening CNS lookup results. + /// - parameters: + /// - key: The reading key. + /// - returns: `nil` only when `key` is `_punctuation_list`; otherwise every string passed to the query callback, joined with tabs (an empty string when there were none). + private func factoryCNSFilterThreadFor(key: String) -> String? { + let column = CoreColumn.theDataCNS + if key == "_punctuation_list" { return nil } + var results: [String] = [] + // ASCII single quotes must be doubled here; otherwise the SQLite query statement breaks. + let encryptedKey = Self.cnvPhonabetToASCII(key.replacingOccurrences(of: "'", with: "''")) + let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)';" + Self.querySQL(strStmt: sqlQuery, coreColumn: column) { currentResult in + results.append(currentResult) + } + return results.joined(separator: "\t") + } + /// 根據給定的讀音索引鍵,來獲取原廠資料庫辭典內的對應資料陣列的 UTF8 資料、就地分析、生成單元圖陣列。 /// - remark: 該函式暫時用不到,但先不用刪除。沒準今後會有用場。 /// - parameters: @@ -177,6 +195,19 @@ extension vChewingLM.LMInstantiator { let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL" return Self.hasSQLResult(strStmt: sqlQuery) } + + /// Checks whether the given unigram fully conforms to the readings prescribed by CNS11643 (Taiwan, Penghu, Kinmen, Matsu): returns `false` as soon as the CNS lookup text for a key does not contain the character at the same position, and `true` otherwise (including when the character count of the value differs from `keyArray.count`; keys starting with `_` and keys whose lookup returns `nil` are skipped). + /// This function is not suitable for Simplified Chinese mode. + func checkCNSConformation(for unigram: Megrez.Unigram, keyArray: [String]) -> Bool { + guard unigram.value.count == keyArray.count else { return true } + let chars = unigram.value.map(\.description) + for (i, key) in keyArray.enumerated() { + guard !key.hasPrefix("_") else { continue } + guard let matchedCNSResult = factoryCNSFilterThreadFor(key: key) else { continue } + guard matchedCNSResult.contains(chars[i]) else { return false } + } + return true + } } private extension 
vChewingLM.LMInstantiator { diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/TestCoreLMSQLData.swift b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/TestCoreLMSQLData.swift index 4fe9e8cd..674ca7b2 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/TestCoreLMSQLData.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/TestCoreLMSQLData.swift @@ -29,6 +29,8 @@ INSERT INTO DATA_MAIN VALUES('de5','-3.516024 的\t-7.427179 得','-3.516024 的 INSERT INTO DATA_MAIN VALUES('di2','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('di4','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('duP3','-9.544 㨃','-9.544 㨃','㨃\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','',''); +INSERT INTO DATA_MAIN VALUES('uP','-6.0 危','-6.0 危',NULL,NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */ +INSERT INTO DATA_MAIN VALUES('uP2','-6.0 危','-6.0 危','-6.0 危',NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */ INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('gM','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 高科技','-9.842421 高科技',NULL,NULL,NULL,NULL); diff --git a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMInstantiatorSQLTests.swift b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMInstantiatorSQLTests.swift index ca8296f0..3364810e 100644 --- a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMInstantiatorSQLTests.swift +++ b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMInstantiatorSQLTests.swift @@ -44,4 +44,25 @@ final class LMInstantiatorSQLTests: XCTestCase { XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: "和"), expectedReverseLookupResults) vChewingLM.LMInstantiator.disconnectSQLDB() } + + /// Verifies the CNS reading filter on a non-CHS instance backed by the test SQL database. + /// With `filterNonCNSReadings` off, both ㄨㄟ and ㄨㄟˊ yield 危; with it on, ㄨㄟ yields nothing while ㄨㄟˊ still yields 危. + func 
testCNSMask() throws { + let instance = vChewingLM.LMInstantiator(isCHS: false) + XCTAssertTrue(vChewingLM.LMInstantiator.connectToTestSQLDB()) + instance.setOptions { config in + config.isCNSEnabled = false + config.isSymbolEnabled = false + config.filterNonCNSReadings = false + } + XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[(危,-6.0)]") + XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]") + instance.setOptions { config in + config.filterNonCNSReadings = true + } + XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[]") + XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]") + // Close the test database again, as the preceding test in this class does. + vChewingLM.LMInstantiator.disconnectSQLDB() + } }