LMAssembly // Implement CNS pronunciation filter.
This commit is contained in:
parent
b479acf779
commit
58815d7c54
|
@ -39,6 +39,7 @@ public extension vChewingLM {
|
||||||
public var isCNSEnabled = false
|
public var isCNSEnabled = false
|
||||||
public var isSymbolEnabled = false
|
public var isSymbolEnabled = false
|
||||||
public var isSCPCEnabled = false
|
public var isSCPCEnabled = false
|
||||||
|
public var filterNonCNSReadings = false
|
||||||
public var deltaOfCalendarYears: Int = -2000
|
public var deltaOfCalendarYears: Int = -2000
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -345,7 +346,17 @@ public extension vChewingLM {
|
||||||
rawAllUnigrams += supplyNumPadUnigrams(key: keyChain)
|
rawAllUnigrams += supplyNumPadUnigrams(key: keyChain)
|
||||||
// LMMisc 與 LMCore 的 score 在 (-10.0, 0.0) 這個區間內。
|
// LMMisc 與 LMCore 的 score 在 (-10.0, 0.0) 這個區間內。
|
||||||
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW)
|
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW)
|
||||||
rawAllUnigrams += factoryCoreUnigramsFor(key: keyChain)
|
// 原廠核心辭典內容。
|
||||||
|
var coreUnigramsResult: [Megrez.Unigram] = factoryCoreUnigramsFor(key: keyChain)
|
||||||
|
// 如果是繁體中文、且有開啟 CNS11643 全字庫讀音過濾開關的話,對原廠核心辭典內容追加過濾處理:
|
||||||
|
if config.filterNonCNSReadings, !isCHS {
|
||||||
|
coreUnigramsResult.removeAll { thisUnigram in
|
||||||
|
!checkCNSConformation(for: thisUnigram, keyArray: keyArray)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 正式追加原廠核心辭典檢索結果。
|
||||||
|
rawAllUnigrams += coreUnigramsResult
|
||||||
|
|
||||||
if config.isCNSEnabled {
|
if config.isCNSEnabled {
|
||||||
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS)
|
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS)
|
||||||
}
|
}
|
||||||
|
|
|
@ -165,6 +165,24 @@ extension vChewingLM.LMInstantiator {
|
||||||
return grams
|
return grams
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Looks up the factory dictionary's CNS column for the given reading key and
/// returns everything `querySQL` hands back, joined with tab characters.
///
/// This function exists only to quickly screen candidates against CNS lookup results.
/// - Parameter key: The reading key to look up.
/// - Returns: `nil` when `key` is `"_punctuation_list"`. Otherwise the strings
///   delivered by the query callback joined by `"\t"`; this is an empty string
///   (not `nil`) when the callback is never invoked.
private func factoryCNSFilterThreadFor(key: String) -> String? {
  guard key != "_punctuation_list" else { return nil }
  // An ASCII single quote must be doubled first, otherwise the SQLite statement breaks.
  let quoteSafeKey = key.replacingOccurrences(of: "'", with: "''")
  let asciiKey = Self.cnvPhonabetToASCII(quoteSafeKey)
  let statement = "SELECT * FROM DATA_MAIN WHERE theKey='\(asciiKey)';"
  var fetchedCells: [String] = []
  Self.querySQL(strStmt: statement, coreColumn: CoreColumn.theDataCNS) { cell in
    fetchedCells.append(cell)
  }
  return fetchedCells.joined(separator: "\t")
}
|
||||||
|
|
||||||
/// 根據給定的讀音索引鍵,來獲取原廠資料庫辭典內的對應資料陣列的 UTF8 資料、就地分析、生成單元圖陣列。
|
/// 根據給定的讀音索引鍵,來獲取原廠資料庫辭典內的對應資料陣列的 UTF8 資料、就地分析、生成單元圖陣列。
|
||||||
/// - remark: 該函式暫時用不到,但先不用刪除。沒準今後會有用場。
|
/// - remark: 該函式暫時用不到,但先不用刪除。沒準今後會有用場。
|
||||||
/// - parameters:
|
/// - parameters:
|
||||||
|
@ -177,6 +195,19 @@ extension vChewingLM.LMInstantiator {
|
||||||
let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL"
|
let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL"
|
||||||
return Self.hasSQLResult(strStmt: sqlQuery)
|
return Self.hasSQLResult(strStmt: sqlQuery)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks whether a unigram fully conforms to the readings prescribed by CNS11643
/// (the standard used in Taiwan, Penghu, Kinmen and Matsu).
///
/// Not suitable for use in Simplified Chinese mode.
/// - Parameters:
///   - unigram: The unigram whose value is inspected character by character.
///   - keyArray: The reading keys, matched positionally against the characters.
/// - Returns: `true` when the character count differs from the number of keys
///   (no per-character check is attempted), or when every checked character is
///   found in the CNS lookup text for its key; `false` as soon as one is not.
func checkCNSConformation(for unigram: Megrez.Unigram, keyArray: [String]) -> Bool {
  let glyphs = unigram.value.map(\.description)
  // The per-position comparison only makes sense with one character per reading.
  if glyphs.count != keyArray.count { return true }
  // Keys starting with an underscore are skipped without a lookup.
  for (reading, glyph) in zip(keyArray, glyphs) where !reading.hasPrefix("_") {
    // A nil lookup result means "nothing to check against" for this key.
    guard let cnsThread = factoryCNSFilterThreadFor(key: reading) else { continue }
    // Plain substring containment against the whole lookup text.
    if !cnsThread.contains(glyph) { return false }
  }
  return true
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private extension vChewingLM.LMInstantiator {
|
private extension vChewingLM.LMInstantiator {
|
||||||
|
|
|
@ -29,6 +29,8 @@ INSERT INTO DATA_MAIN VALUES('de5','-3.516024 的\t-7.427179 得','-3.516024 的
|
||||||
INSERT INTO DATA_MAIN VALUES('di2','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
INSERT INTO DATA_MAIN VALUES('di2','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
||||||
INSERT INTO DATA_MAIN VALUES('di4','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
INSERT INTO DATA_MAIN VALUES('di4','-3.516024 的','-3.516024 的',NULL,NULL,NULL,NULL);
|
||||||
INSERT INTO DATA_MAIN VALUES('duP3','-9.544 㨃','-9.544 㨃','㨃\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','','');
|
INSERT INTO DATA_MAIN VALUES('duP3','-9.544 㨃','-9.544 㨃','㨃\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','','');
|
||||||
|
INSERT INTO DATA_MAIN VALUES('uP','-6.0 危','-6.0 危',NULL,NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */
|
||||||
|
INSERT INTO DATA_MAIN VALUES('uP2','-6.0 危','-6.0 危','-6.0 危',NULL,NULL,NULL); /* 用來測試 CNS 過濾器的。 */
|
||||||
INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL);
|
INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL);
|
||||||
INSERT INTO DATA_MAIN VALUES('gM','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕',NULL,NULL,NULL,NULL);
|
INSERT INTO DATA_MAIN VALUES('gM','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕','-7.171551 高\t-11.92872 膏\t-13.624335 篙\t-12.390804 糕',NULL,NULL,NULL,NULL);
|
||||||
INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 高科技','-9.842421 高科技',NULL,NULL,NULL,NULL);
|
INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 高科技','-9.842421 高科技',NULL,NULL,NULL,NULL);
|
||||||
|
|
|
@ -44,4 +44,21 @@ final class LMInstantiatorSQLTests: XCTestCase {
|
||||||
XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: "和"), expectedReverseLookupResults)
|
XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: "和"), expectedReverseLookupResults)
|
||||||
vChewingLM.LMInstantiator.disconnectSQLDB()
|
vChewingLM.LMInstantiator.disconnectSQLDB()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Exercises the `filterNonCNSReadings` option of a Traditional-Chinese instantiator.
///
/// With the filter off, both readings return 危. With the filter on, only the
/// ㄨㄟˊ reading still returns it and the ㄨㄟ reading returns nothing.
/// NOTE(review): presumably these readings map to the `uP` / `uP2` fixture rows,
/// of which only `uP2` has a CNS column value — confirm against the key encoder.
func testCNSMask() throws {
  let instance = vChewingLM.LMInstantiator(isCHS: false)
  XCTAssertTrue(vChewingLM.LMInstantiator.connectToTestSQLDB())
  // Disconnect on every exit path so the connection does not outlive this test.
  defer { vChewingLM.LMInstantiator.disconnectSQLDB() }

  // Baseline: filter disabled, both readings yield the character.
  instance.setOptions { config in
    config.isCNSEnabled = false
    config.isSymbolEnabled = false
    config.filterNonCNSReadings = false
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[(危,-6.0)]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")

  // Filter enabled: the ㄨㄟ reading is removed, the ㄨㄟˊ reading survives.
  instance.setOptions { config in
    config.filterNonCNSReadings = true
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue