LMAssembly // Implement CNS pronunciation filter.

This commit is contained in:
ShikiSuen 2024-02-16 02:36:05 +08:00
parent b479acf779
commit 58815d7c54
4 changed files with 62 additions and 1 deletions

View File

@ -39,6 +39,7 @@ public extension vChewingLM {
public var isCNSEnabled = false public var isCNSEnabled = false
public var isSymbolEnabled = false public var isSymbolEnabled = false
public var isSCPCEnabled = false public var isSCPCEnabled = false
public var filterNonCNSReadings = false
public var deltaOfCalendarYears: Int = -2000 public var deltaOfCalendarYears: Int = -2000
} }
@ -345,7 +346,17 @@ public extension vChewingLM {
rawAllUnigrams += supplyNumPadUnigrams(key: keyChain) rawAllUnigrams += supplyNumPadUnigrams(key: keyChain)
// LMMisc LMCore score (-10.0, 0.0) // LMMisc LMCore score (-10.0, 0.0)
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW) rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCHEW)
rawAllUnigrams += factoryCoreUnigramsFor(key: keyChain) //
var coreUnigramsResult: [Megrez.Unigram] = factoryCoreUnigramsFor(key: keyChain)
// When enabled (and not in CHS mode), drop core unigrams whose readings fail the CNS11643 conformance check.
if config.filterNonCNSReadings, !isCHS {
coreUnigramsResult.removeAll { thisUnigram in
!checkCNSConformation(for: thisUnigram, keyArray: keyArray)
}
}
//
rawAllUnigrams += coreUnigramsResult
if config.isCNSEnabled { if config.isCNSEnabled {
rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS) rawAllUnigrams += factoryUnigramsFor(key: keyChain, column: .theDataCNS)
} }

View File

@ -165,6 +165,24 @@ extension vChewingLM.LMInstantiator {
return grams return grams
} }
/// Collects the CNS column content stored in the factory database for a single reading key.
///
/// Every value reported by the query is gathered and the pieces are joined with tab characters,
/// so the caller receives one string it can search through.
/// - Parameter key: The reading key to look up. The special key `_punctuation_list` is never queried.
/// - Returns: `nil` when `key` is `_punctuation_list`; otherwise the tab-joined query results,
///   which is an empty string when the query reports nothing.
private func factoryCNSFilterThreadFor(key: String) -> String? {
  guard key != "_punctuation_list" else { return nil }
  // Double any single quote first so the key cannot terminate the SQL string literal,
  // then convert it to the ASCII form (presumably the form stored in `theKey` — confirm against the database builder).
  let escapedKey = key.replacingOccurrences(of: "'", with: "''")
  let asciiKey = Self.cnvPhonabetToASCII(escapedKey)
  let statement = "SELECT * FROM DATA_MAIN WHERE theKey='\(asciiKey)';"
  var matchedRows: [String] = []
  Self.querySQL(strStmt: statement, coreColumn: CoreColumn.theDataCNS) { rowValue in
    matchedRows.append(rowValue)
  }
  return matchedRows.joined(separator: "\t")
}
/// UTF8 /// UTF8
/// - remark: /// - remark:
/// - parameters: /// - parameters:
@ -177,6 +195,19 @@ extension vChewingLM.LMInstantiator {
let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL" let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)' AND \(column.name) IS NOT NULL"
return Self.hasSQLResult(strStmt: sqlQuery) return Self.hasSQLResult(strStmt: sqlQuery)
} }
/// Checks whether every character of a unigram is listed in the CNS data of its matching reading.
///
/// Characters and readings are compared pairwise by position.
/// - Parameters:
///   - unigram: The unigram whose value is inspected character by character.
///   - keyArray: The readings, one per character of the unigram value.
/// - Returns: `false` as soon as one character is missing from the CNS data of its reading.
///   `true` otherwise, including when the number of characters differs from the number of
///   readings (no pairwise check is possible in that case).
func checkCNSConformation(for unigram: Megrez.Unigram, keyArray: [String]) -> Bool {
  let characters = unigram.value.map(\.description)
  if characters.count != keyArray.count { return true }
  return zip(keyArray, characters).allSatisfy { reading, character in
    // Readings starting with an underscore are exempt from the check.
    if reading.hasPrefix("_") { return true }
    // A `nil` lookup result means the reading is not subject to filtering.
    guard let cnsRecord = factoryCNSFilterThreadFor(key: reading) else { return true }
    return cnsRecord.contains(character)
  }
}
} }
private extension vChewingLM.LMInstantiator { private extension vChewingLM.LMInstantiator {

View File

@ -29,6 +29,8 @@ INSERT INTO DATA_MAIN VALUES('de5','-3.516024 的\t-7.427179 得','-3.516024 的
INSERT INTO DATA_MAIN VALUES('di2','-3.516024 ','-3.516024 ',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('di2','-3.516024 ','-3.516024 ',NULL,NULL,NULL,NULL);
INSERT INTO DATA_MAIN VALUES('di4','-3.516024 ','-3.516024 ',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('di4','-3.516024 ','-3.516024 ',NULL,NULL,NULL,NULL);
INSERT INTO DATA_MAIN VALUES('duP3','-9.544 ','-9.544 ','\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','',''); INSERT INTO DATA_MAIN VALUES('duP3','-9.544 ','-9.544 ','\t䇏\t𦞙\t謉\t𠡒\t𡑈\t𥫉\t𦞱\t𧫏\t𩛔','','','');
INSERT INTO DATA_MAIN VALUES('uP','-6.0 ','-6.0 ',NULL,NULL,NULL,NULL); /* CNS */
INSERT INTO DATA_MAIN VALUES('uP2','-6.0 ','-6.0 ','-6.0 ',NULL,NULL,NULL); /* CNS */
INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('fL','-11.0 🐝','-11.0 🐝',NULL,NULL,NULL,NULL);
INSERT INTO DATA_MAIN VALUES('gM','-7.171551 \t-11.92872 \t-13.624335 \t-12.390804 ','-7.171551 \t-11.92872 \t-13.624335 \t-12.390804 ',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('gM','-7.171551 \t-11.92872 \t-13.624335 \t-12.390804 ','-7.171551 \t-11.92872 \t-13.624335 \t-12.390804 ',NULL,NULL,NULL,NULL);
INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 ','-9.842421 ',NULL,NULL,NULL,NULL); INSERT INTO DATA_MAIN VALUES('gM-ke-ji4','-9.842421 ','-9.842421 ',NULL,NULL,NULL,NULL);

View File

@ -44,4 +44,21 @@ final class LMInstantiatorSQLTests: XCTestCase {
XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: ""), expectedReverseLookupResults) XCTAssertEqual(vChewingLM.LMInstantiator.getFactoryReverseLookupData(with: ""), expectedReverseLookupResults)
vChewingLM.LMInstantiator.disconnectSQLDB() vChewingLM.LMInstantiator.disconnectSQLDB()
} }
/// Verifies the `filterNonCNSReadings` option against the test SQLite database.
///
/// With the filter disabled, both readings "ㄨㄟ" and "ㄨㄟˊ" yield the same single unigram.
/// With the filter enabled, the result for "ㄨㄟ" becomes empty while "ㄨㄟˊ" is unaffected.
func testCNSMask() throws {
  let instance = vChewingLM.LMInstantiator(isCHS: false)
  XCTAssertTrue(vChewingLM.LMInstantiator.connectToTestSQLDB())
  // Release the shared database connection on every exit path so this test does not
  // leak its connection into other tests, as the sibling SQL test does at its end.
  defer { vChewingLM.LMInstantiator.disconnectSQLDB() }
  // Baseline: every optional data source and the CNS reading filter are switched off.
  instance.setOptions { config in
    config.isCNSEnabled = false
    config.isSymbolEnabled = false
    config.filterNonCNSReadings = false
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[(危,-6.0)]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")
  // Enable only the CNS reading filter; the other options keep their previous values.
  instance.setOptions { config in
    config.filterNonCNSReadings = true
  }
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟ"]).description, "[]")
  XCTAssertEqual(instance.unigramsFor(keyArray: ["ㄨㄟˊ"]).description, "[(危,-6.0)]")
}
} }