LMAssembly // Implement CNS pronunciation filter.
This commit is contained in:
parent
b479acf779
commit
58815d7c54
|
@ -39,6 +39,7 @@ public extension vChewingLM {
|
|||
public var isCNSEnabled = false
|
||||
public var isSymbolEnabled = false
|
||||
public var isSCPCEnabled = false
|
||||
public var filterNonCNSReadings = false
|
||||
public var deltaOfCalendarYears: Int = -2000
|
||||
}
|
||||
|
||||
|
@ -345,7 +346,17 @@ public extension vChewingLM {
|
|||
rawAllUnigrams += supplyNumPadUnigrams(key: keyChain)
|
||||
// LMMisc 與 LMCore 的 score 在 (-10.0, 0.0) 這個區間內。
|
||||
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW)
|
||||
rawAllUnigrams += factoryCoreUnigramsFor(key: keyChain)
|
||||
// 原廠核心辭典內容。
|
||||
var coreUnigramsResult: [Megrez.Unigram] = factoryCoreUnigramsFor(key: keyChain)
|
||||
// 如果是繁體中文、且有開啟 CNS11643 全字庫讀音過濾開關的話,對原廠核心辭典內容追加過濾處理:
|
||||
if config.filterNonCNSReadings, !isCHS {
|
||||
coreUnigramsResult.removeAll { thisUnigram in
|
||||
!checkCNSConformation(for: thisUnigram, keyArray: keyArray)
|
||||
}
|
||||
}
|
||||
// 正式追加原廠核心辭典檢索結果。
|
||||
rawAllUnigrams += coreUnigramsResult
|
||||
|
||||
if config.isCNSEnabled {
|
||||
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS)
|
||||
}
|
||||
|
|
|
@ -165,6 +165,24 @@ extension vChewingLM.LMInstantiator {
|
|||
return grams
|
||||
}
|
||||
|
||||
/// Fetches the CNS column content of the factory database for the given reading key.
///
/// This function exists only as a quick lookup source for screening results against the CNS data.
/// - Parameter key: The reading key. It is passed through `cnvPhonabetToASCII` before being queried.
/// - Returns: `nil` when the key is `_punctuation_list`; otherwise all results handed back by the
///   query, joined with tab characters (an empty string when there are none).
private func factoryCNSFilterThreadFor(key: String) -> String? {
  guard key != "_punctuation_list" else { return nil }
  // Every ASCII single quote must be doubled here, otherwise the SQLite statement breaks.
  let quoteEscapedKey = key.replacingOccurrences(of: "'", with: "''")
  let encryptedKey = Self.cnvPhonabetToASCII(quoteEscapedKey)
  var fetchedThreads: [String] = []
  Self.querySQL(
    strStmt: "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)';",
    coreColumn: CoreColumn.theDataCNS
  ) { fetchedThread in
    fetchedThreads.append(fetchedThread)
  }
  return fetchedThreads.joined(separator: "\t")
}
|
||||
|
||||
/// 根據給定的讀音索引鍵,來獲取原廠資料庫辭典內的對應資料陣列的 UTF8 資料、就地分析、生成單元圖陣列。
|
||||
/// - remark: 該函式暫時用不到,但先不用刪除。沒準今後會有用場。
|
||||
/// - parameters:
|
||||
|
@ -177,6 +195,19 @@ extension vChewingLM.LMInstantiator {
|
|||
let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL"
|
||||
return Self.hasSQLResult(strStmt: sqlQuery)
|
||||
}
|
||||
|
||||
/// Checks whether every character of the given unigram is listed in the CNS data for the
/// reading at the same position (CNS11643 readings as used in Taiwan, Penghu, Kinmen and Matsu).
///
/// Not suitable for use in Simplified Chinese mode.
/// - Parameters:
///   - unigram: The unigram whose value is to be checked, one character per reading.
///   - keyArray: The reading keys, expected to align one-to-one with the characters of the value.
/// - Returns: `false` as soon as one character is absent from the CNS lookup result of its reading;
///   `true` otherwise, including when the character count and the reading count differ
///   (in which case no check is performed at all).
func checkCNSConformation(for unigram: Megrez.Unigram, keyArray: [String]) -> Bool {
  let glyphs = unigram.value.map(\.description)
  // Characters and readings cannot be paired up, so the unigram is let through unchecked.
  guard glyphs.count == keyArray.count else { return true }
  // Readings starting with an underscore are skipped rather than checked.
  for (reading, glyph) in zip(keyArray, glyphs) where !reading.hasPrefix("_") {
    // A nil lookup result means this reading is exempt from the check.
    guard let cnsThread = factoryCNSFilterThreadFor(key: reading) else { continue }
    if !cnsThread.contains(glyph) { return false }
  }
  return true
}
|
||||
}
|
||||
|
||||
private extension vChewingLM.LMInstantiator {
|
||||
|
|
|
@ -29,6 +29,8 @@ INSERT INTO DATA_MAIN VALUES('de5','-3.516024 的\t-7.427179 得','-3.516024 的
|
|||
INSERT INTO DATA_MAIN VALUES('di2','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
||||
INSERT INTO DATA_MAIN VALUES('di4','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
||||
INSERT INTO DATA_MAIN VALUES('duP3','-9.544 㨃','-9.544 㨃','㨃\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','','');
|
||||
INSERT INTO DATA_MAIN VALUES('uP','-6.0 危','-6.0 危',NULL,NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */
|
||||
INSERT INTO DATA_MAIN VALUES('uP2','-6.0 危','-6.0 危','-6.0 危',NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */
|
||||
INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL);
|
||||
INSERT INTO DATA_MAIN VALUES('gM','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕',NULL,NULL,NULL,NULL);
|
||||
INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 高科技','-9.842421 高科技',NULL,NULL,NULL,NULL);
|
||||
|
|
|
@ -44,4 +44,21 @@ final class LMInstantiatorSQLTests: XCTestCase {
|
|||
XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: "和"), expectedReverseLookupResults)
|
||||
vChewingLM.LMInstantiator.disconnectSQLDB()
|
||||
}
|
||||
|
||||
/// Verifies the `filterNonCNSReadings` option against the test SQL database.
///
/// With the filter off, both "ㄨㄟ" and "ㄨㄟˊ" are expected to yield 危. With the filter on,
/// "ㄨㄟ" is expected to yield nothing while "ㄨㄟˊ" still yields 危.
func testCNSMask() throws {
  let instance = vChewingLM.LMInstantiator(isCHS: false)
  XCTAssertTrue(vChewingLM.LMInstantiator.connectToTestSQLDB())
  // Disconnect on every exit path, matching the other SQL test in this class, so the
  // connection opened above does not outlive this test.
  defer { vChewingLM.LMInstantiator.disconnectSQLDB() }

  // Baseline: every optional data source and the CNS reading filter are switched off.
  instance.setOptions { config in
    config.isCNSEnabled = false
    config.isSymbolEnabled = false
    config.filterNonCNSReadings = false
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[(危,-6.0)]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")

  // With the filter enabled, only the reading that passes the CNS check keeps its result.
  instance.setOptions { config in
    config.filterNonCNSReadings = true
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue