// File: vChewing-macOS/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator_SQLExtension... (path truncated by the repository viewer)
// Viewer metadata: 296 lines, 13 KiB, Swift.
// Viewer notice: this file contains Unicode characters that might be confused with other characters.
// If that is intentional, the notice can be safely ignored.

// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import Megrez
import SQLite3
/* ==============
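Factory data is read from an SQLite database. (The original note here mentioned
Apple and 8GB; its non-ASCII text was lost.) The schema of that database is: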
CREATE TABLE IF NOT EXISTS DATA_MAIN (
theKey TEXT NOT NULL,
theDataCHS TEXT,
theDataCHT TEXT,
theDataCNS TEXT,
theDataMISC TEXT,
theDataSYMB TEXT,
theDataCHEW TEXT,
PRIMARY KEY (theKey)
) WITHOUT ROWID;
CREATE TABLE IF NOT EXISTS DATA_REV (
theChar TEXT NOT NULL,
theReadings TEXT NOT NULL,
PRIMARY KEY (theChar)
) WITHOUT ROWID;
*/
extension LMAssembly.LMInstantiator {
  /// Data columns of the `DATA_MAIN` table.
  ///
  /// Each raw value is the zero-based index of that column in a
  /// `SELECT * FROM DATA_MAIN` result row (index 0 holds `theKey`), so it can
  /// be passed directly to `sqlite3_column_text`.
  enum CoreColumn: Int32 {
    /// Selected by the lookup functions when `isCHS` is true.
    case theDataCHS = 1
    /// Selected by the lookup functions when `isCHS` is false.
    case theDataCHT = 2
    /// Read by the CNS conformation filter.
    case theDataCNS = 3
    case theDataMISC = 4
    case theDataSYMB = 5
    case theDataCHEW = 6

    /// The case name, which is also the column name in the SQL schema.
    var name: String { String(describing: self) }

    /// Column index within a `SELECT *` row; same as `rawValue`.
    var id: Int32 { rawValue }

    /// Score assigned to a record that carries no parsable score of its own.
    var defaultScore: Double {
      switch self {
      case .theDataCHS, .theDataCHT: return -9.9
      case .theDataCNS: return -11
      case .theDataMISC: return -10
      case .theDataSYMB: return -13
      case .theDataCHEW: return -1
      }
    }
  }
}
extension LMAssembly.LMInstantiator {
/// Opens the SQLite database at `dbPath` and stores the handle in the shared `ptrSQL`.
///
/// - Parameters:
///   - dbPath: Path passed to `sqlite3_open`.
///   - dropPreviousConnection: When `true` (the default), `disconnectSQLDB()`
///     is called first. When `false`, `sqlite3_open` still overwrites `ptrSQL`.
/// - Returns: `true` when the database was opened and the journal-mode pragma
///   succeeded. On `false`, the handle has been released again and
///   `isSQLDBConnected` is `false`.
@discardableResult public static func connectSQLDB(dbPath: String, dropPreviousConnection: Bool = true) -> Bool {
  if dropPreviousConnection { disconnectSQLDB() }
  vCLMLog("Establishing SQLite connection to: \(dbPath)")
  guard sqlite3_open(dbPath, &Self.ptrSQL) == SQLITE_OK else {
    // SQLite usually returns a handle even when opening fails, and that handle
    // must still be closed; otherwise it leaks and later queries would run
    // against a broken connection.
    disconnectSQLDB()
    return false
  }
  guard "PRAGMA journal_mode = OFF;".runAsSQLExec(dbPointer: &ptrSQL) else {
    // Do not leave a half-configured connection behind when reporting failure.
    disconnectSQLDB()
    return false
  }
  isSQLDBConnected = true
  return true
}
/// Closes the shared SQLite connection, if any, and clears the connected flag.
///
/// Safe to call when no connection exists: only the flag is reset.
public static func disconnectSQLDB() {
  defer { isSQLDBConnected = false }
  guard let liveHandle = Self.ptrSQL else { return }
  sqlite3_close_v2(liveHandle)
  Self.ptrSQL = nil
}
/// Runs `sqlQuery` and passes the text of one column of every result row to `handler`.
///
/// - Parameters:
///   - sqlQuery: The SQL statement to run.
///   - column: Column whose `id` is used as the index into each result row.
///   - handler: Called once per row whose column value is not NULL.
///
/// Does nothing when there is no connection or the statement cannot be prepared.
fileprivate static func querySQL(strStmt sqlQuery: String, coreColumn column: CoreColumn, handler: (String) -> Void) {
  guard Self.ptrSQL != nil else { return }
  performStatementSansResult { ptrStatement in
    // Bail out on a failed prepare instead of stepping a NULL statement.
    guard sqlite3_prepare_v2(Self.ptrSQL, sqlQuery, -1, &ptrStatement, nil) == SQLITE_OK else { return }
    while sqlite3_step(ptrStatement) == SQLITE_ROW {
      guard let rawValue = sqlite3_column_text(ptrStatement, column.id) else { continue }
      handler(String(cString: rawValue))
    }
  }
}
/// Reports whether `sqlQuery` yields at least one row.
///
/// The query is wrapped as `SELECT EXISTS(<query>);`.
///
/// - Parameter sqlQuery: A SELECT statement; one trailing `;` is tolerated.
/// - Returns: `true` when the EXISTS query returns 1. `false` when there is no
///   connection, the query is empty, the statement cannot be prepared, or no
///   row matches.
fileprivate static func hasSQLResult(strStmt sqlQuery: String) -> Bool {
  guard Self.ptrSQL != nil else { return false }
  // A trailing semicolon would end up inside the EXISTS(...) parentheses.
  let innerQuery = sqlQuery.last == ";" ? String(sqlQuery.dropLast()) : sqlQuery
  guard !innerQuery.isEmpty else { return false }
  let wrappedQuery = "SELECT EXISTS(\(innerQuery));"
  return performStatement { ptrStatement in
    // Bail out on a failed prepare instead of stepping a NULL statement.
    guard sqlite3_prepare_v2(Self.ptrSQL, wrappedQuery, -1, &ptrStatement, nil) == SQLITE_OK else { return false }
    // Only the first row is inspected.
    guard sqlite3_step(ptrStatement) == SQLITE_ROW else { return false }
    return sqlite3_column_int(ptrStatement, 0) == 1
  }
}
/// Looks up the readings stored for `kanji` in the `DATA_REV` table.
///
/// - Parameter kanji: Value matched against the `theChar` column.
/// - Returns: The tab-separated readings from column 1 (`theReadings` per the
///   schema), each passed through `restorePhonabetFromASCII`. `nil` when there
///   is no connection or nothing was found.
public static func getFactoryReverseLookupData(with kanji: String) -> [String]? {
  guard Self.ptrSQL != nil else { return nil }
  // Double any single quote so the value cannot terminate the SQL string
  // literal, as the other lookups in this file already do.
  let escapedKanji = kanji.replacingOccurrences(of: "'", with: "''")
  let sqlQuery = "SELECT * FROM DATA_REV WHERE theChar='\(escapedKanji)';"
  var results: [String] = []
  performStatementSansResult { ptrStatement in
    // Bail out on a failed prepare instead of stepping a NULL statement.
    guard sqlite3_prepare_v2(Self.ptrSQL, sqlQuery, -1, &ptrStatement, nil) == SQLITE_OK else { return }
    while sqlite3_step(ptrStatement) == SQLITE_ROW {
      guard let rawValue = sqlite3_column_text(ptrStatement, 1) else { continue }
      let readings = String(cString: rawValue).split(separator: "\t")
      results.append(contentsOf: readings.map { Self.restorePhonabetFromASCII(String($0)) })
    }
  }
  return results.isEmpty ? nil : results
}
/// Builds unigrams from the `_punctuation_list` row of `DATA_MAIN`.
///
/// The column read is `theDataCHS` when `isCHS` is true, else `theDataCHT`.
/// The cell is a tab-separated list of records. Within a record the last
/// space-separated field is the value and the field before it, if it parses
/// as a number, is the score.
///
/// - Returns: One unigram per non-blank record, in stored order.
func getHaninSymbolMenuUnigrams() -> [Megrez.Unigram] {
  let column: CoreColumn = isCHS ? .theDataCHS : .theDataCHT
  var grams: [Megrez.Unigram] = []
  let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='_punctuation_list';"
  Self.querySQL(strStmt: sqlQuery, coreColumn: column) { currentResult in
    for strNetaSet in currentResult.split(separator: "\t") {
      // Reversed so that index 0 is the value and index 1 the score.
      let neta = Array(strNetaSet.trimmingCharacters(in: .newlines).split(separator: " ").reversed())
      // A record of only spaces or newlines has no fields; skip it rather
      // than trapping on an out-of-range subscript.
      guard let rawValue = neta.first else { continue }
      let theValue = String(rawValue)
      var theScore = column.defaultScore
      if neta.count >= 2, let thisScore = Double(String(neta[1])) {
        theScore = thisScore
      }
      // Positive scores are flipped to negative.
      if theScore > 0 {
        theScore *= -1
      }
      grams.append(Megrez.Unigram(value: theValue, score: theScore))
    }
  }
  return grams
}
/// Returns the factory unigrams stored under `key` in the core column.
///
/// The core column is `theDataCHS` when `isCHS` is true, else `theDataCHT`.
/// - Parameter key: The lookup key, passed unchanged to `factoryUnigramsFor`.
/// - Returns: Whatever `factoryUnigramsFor(key:column:)` yields for that column.
public func factoryCoreUnigramsFor(key: String) -> [Megrez.Unigram] {
  let coreColumn: CoreColumn = isCHS ? .theDataCHS : .theDataCHT
  return factoryUnigramsFor(key: key, column: coreColumn)
}
/// Returns the unigrams stored under `key` in the given `DATA_MAIN` column.
///
/// The cell is a tab-separated list of records. Within a record the last
/// space-separated field is the value and the field before it, if it parses
/// as a number, is the score.
///
/// - Parameters:
///   - key: The lookup key. It is quote-escaped and converted with
///     `cnvPhonabetToASCII` before querying. `_punctuation_list` always yields
///     an empty array here.
///   - column: The data column to read.
/// - Returns: The parsed unigrams in stored order. For keys containing
///   `_punctuation`, half-width variants that differ from the original value
///   are appended after them.
func factoryUnigramsFor(
  key: String, column: LMAssembly.LMInstantiator.CoreColumn
) -> [Megrez.Unigram] {
  if key == "_punctuation_list" { return [] }
  var grams: [Megrez.Unigram] = []
  var gramsHW: [Megrez.Unigram] = []
  // Loop-invariant: only punctuation keys get half-width variants.
  let wantsHalfWidthVariants = key.contains("_punctuation")
  // The database stores keys in their ASCII form.
  let encryptedKey = Self.cnvPhonabetToASCII(key.replacingOccurrences(of: "'", with: "''"))
  let sqlQuery = "SELECT * FROM DATA_MAIN WHERE theKey='\(encryptedKey)';"
  Self.querySQL(strStmt: sqlQuery, coreColumn: column) { currentResult in
    var i: Double = 0
    var previousScore: Double?
    for strNetaSet in currentResult.split(separator: "\t") {
      // Reversed so that index 0 is the value and index 1 the score.
      let neta = Array(strNetaSet.trimmingCharacters(in: .newlines).split(separator: " ").reversed())
      // A record of only spaces or newlines has no fields; skip it rather
      // than trapping on an out-of-range subscript.
      guard let rawValue = neta.first else { continue }
      let theValue = String(rawValue)
      var theScore = column.defaultScore
      if neta.count >= 2, let thisScore = Double(String(neta[1])) {
        theScore = thisScore
      }
      // Positive scores are flipped to negative.
      if theScore > 0 {
        theScore *= -1
      }
      // Records repeating the score that opened the current run are nudged
      // down by i * 0.000_001, presumably to keep the stored order under later sorting.
      // NOTE(review): the first repeat is offset by 0, so it still ties with
      // the record that opened the run. Confirm whether that is intended.
      if previousScore == theScore {
        theScore -= i * 0.000_001
        i += 1
      } else {
        previousScore = theScore
        i = 0
      }
      grams.append(Megrez.Unigram(value: theValue, score: theScore))
      guard wantsHalfWidthVariants else { continue }
      let halfValue = theValue.applyingTransformFW2HW(reverse: false)
      if halfValue != theValue {
        gramsHW.append(Megrez.Unigram(value: halfValue, score: theScore))
      }
    }
  }
  grams.append(contentsOf: gramsHW)
  return grams
}
/// Returns the raw `theDataCNS` cell content stored under `key`.
///
/// - Parameter key: The lookup key. It is quote-escaped and converted with
///   `cnvPhonabetToASCII` before querying.
/// - Returns: `nil` for the `_punctuation_list` key. Otherwise the matching
///   cell contents joined by tabs, which is an empty string when nothing matched.
private func factoryCNSFilterThreadFor(key: String) -> String? {
  guard key != "_punctuation_list" else { return nil }
  // The database stores keys in their ASCII form.
  let asciiKey = Self.cnvPhonabetToASCII(key.replacingOccurrences(of: "'", with: "''"))
  let statement = "SELECT * FROM DATA_MAIN WHERE theKey='\(asciiKey)';"
  var fragments: [String] = []
  Self.querySQL(strStmt: statement, coreColumn: .theDataCNS) { fragment in
    fragments.append(fragment)
  }
  return fragments.joined(separator: "\t")
}
/// Reports whether the core column holds data for the given key array.
///
/// The core column is `theDataCHS` when `isCHS` is true, else `theDataCHT`.
/// - Parameter keyArray: Key segments. They are joined with `-`, quote-escaped,
///   and converted with `cnvPhonabetToASCII` before querying.
/// - Returns: `true` when a row with that key exists and its core column is not NULL.
func hasFactoryCoreUnigramsFor(keyArray: [String]) -> Bool {
  let joinedKey = keyArray.joined(separator: "-")
  // The database stores keys in their ASCII form.
  let asciiKey = Self.cnvPhonabetToASCII(joinedKey.replacingOccurrences(of: "'", with: "''"))
  let columnName = (isCHS ? CoreColumn.theDataCHS : CoreColumn.theDataCHT).name
  // No trailing semicolon: hasSQLResult wraps this in SELECT EXISTS(...).
  let statement = "SELECT * FROM DATA_MAIN WHERE theKey='\(asciiKey)' AND \(columnName) IS NOT NULL"
  return Self.hasSQLResult(strStmt: statement)
}
/// Checks each character of a unigram against the CNS data stored for its key.
///
/// - Parameters:
///   - unigram: The unigram whose `value` is checked character by character.
///   - keyArray: One key per character of the value.
/// - Returns: `false` as soon as a character is not contained in the CNS data
///   of its key. `true` otherwise, including when the character count and key
///   count differ (no check is made). Keys starting with `_` are skipped.
func checkCNSConformation(for unigram: Megrez.Unigram, keyArray: [String]) -> Bool {
  guard unigram.value.count == keyArray.count else { return true }
  for (key, character) in zip(keyArray, unigram.value) where !key.hasPrefix("_") {
    guard let cnsThread = factoryCNSFilterThreadFor(key: key) else { continue }
    if !cnsThread.contains(String(character)) { return false }
  }
  return true
}
}
private extension LMAssembly.LMInstantiator {
/// Rewrites a key into its ASCII form using `dicPhonabet2ASCII`.
///
/// - Parameter incoming: The key to convert.
/// - Returns: `incoming` unchanged when it contains an underscore. Otherwise
///   `incoming` with every `dicPhonabet2ASCII` key replaced by its value.
static func cnvPhonabetToASCII(_ incoming: String) -> String {
  guard !incoming.contains("_") else { return incoming }
  return Self.dicPhonabet2ASCII.reduce(incoming) { partial, pair in
    partial.replacingOccurrences(of: pair.key, with: pair.value)
  }
}
/// Replacement table used by `cnvPhonabetToASCII`: one phonetic symbol or tone
/// mark to one ASCII character.
///
/// NOTE(review): the Bopomofo keys had been stripped to empty strings, and
/// duplicate keys in a dictionary literal trap at runtime. They are restored
/// here in standard Bopomofo order, matching the order of the surviving ASCII
/// values. Verify against the canonical table.
static let dicPhonabet2ASCII: [String: String] = [
  "ㄅ": "b", "ㄆ": "p", "ㄇ": "m", "ㄈ": "f", "ㄉ": "d", "ㄊ": "t", "ㄋ": "n", "ㄌ": "l", "ㄍ": "g", "ㄎ": "k", "ㄏ": "h",
  "ㄐ": "j", "ㄑ": "q", "ㄒ": "x", "ㄓ": "Z", "ㄔ": "C", "ㄕ": "S", "ㄖ": "r", "ㄗ": "z", "ㄘ": "c", "ㄙ": "s", "ㄧ": "i",
  "ㄨ": "u", "ㄩ": "v", "ㄚ": "a", "ㄛ": "o", "ㄜ": "e", "ㄝ": "E", "ㄞ": "B", "ㄟ": "P", "ㄠ": "M", "ㄡ": "F", "ㄢ": "D",
  "ㄣ": "T", "ㄤ": "N", "ㄥ": "L", "ㄦ": "R", "ˊ": "2", "ˇ": "3", "ˋ": "4", "˙": "5",
]
/// Rewrites an ASCII-form reading back using `dicPhonabet4ASCII`.
///
/// - Parameter incoming: The ASCII-form reading to restore.
/// - Returns: `incoming` unchanged when it contains an underscore. Otherwise
///   `incoming` with every `dicPhonabet4ASCII` key replaced by its value.
static func restorePhonabetFromASCII(_ incoming: String) -> String {
  guard !incoming.contains("_") else { return incoming }
  return Self.dicPhonabet4ASCII.reduce(incoming) { partial, pair in
    partial.replacingOccurrences(of: pair.key, with: pair.value)
  }
}
/// Replacement table used by `restorePhonabetFromASCII`: one ASCII character
/// to one phonetic symbol or tone mark. It is the inverse of `dicPhonabet2ASCII`.
///
/// NOTE(review): the Bopomofo values had been stripped to empty strings, which
/// would make restoration delete every mapped letter. They are restored here
/// in standard Bopomofo order, matching the order of the surviving ASCII keys.
/// Verify against the canonical table.
static let dicPhonabet4ASCII: [String: String] = [
  "b": "ㄅ", "p": "ㄆ", "m": "ㄇ", "f": "ㄈ", "d": "ㄉ", "t": "ㄊ", "n": "ㄋ", "l": "ㄌ", "g": "ㄍ", "k": "ㄎ", "h": "ㄏ",
  "j": "ㄐ", "q": "ㄑ", "x": "ㄒ", "Z": "ㄓ", "C": "ㄔ", "S": "ㄕ", "r": "ㄖ", "z": "ㄗ", "c": "ㄘ", "s": "ㄙ", "i": "ㄧ",
  "u": "ㄨ", "v": "ㄩ", "a": "ㄚ", "o": "ㄛ", "e": "ㄜ", "E": "ㄝ", "B": "ㄞ", "P": "ㄟ", "M": "ㄠ", "F": "ㄡ", "D": "ㄢ",
  "T": "ㄣ", "N": "ㄤ", "L": "ㄥ", "R": "ㄦ", "2": "ˊ", "3": "ˇ", "4": "ˋ", "5": "˙",
]
}
public extension LMAssembly.LMInstantiator {
  /// Connects to an in-memory SQLite database and loads `sqlTestCoreLMData` into it.
  ///
  /// - Returns: `true` only when both the connection and the data load succeed.
  ///   The data load is not attempted when the connection fails.
  @discardableResult static func connectToTestSQLDB() -> Bool {
    guard Self.connectSQLDB(dbPath: #":memory:"#) else { return false }
    return sqlTestCoreLMData.runAsSQLExec(dbPointer: &ptrSQL)
  }
}