vChewing-macOS/DataCompiler/dataCompiler.swift

1164 lines
45 KiB
Swift
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env swift
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.
import Foundation
import SQLite3
// MARK: -
fileprivate extension String {
  /// Replaces every match of `pattern` in the receiver with `replaceWith`, in place.
  ///
  /// Matching is case-insensitive and `^` / `$` anchor at line boundaries.
  /// If `pattern` cannot be compiled the receiver is left untouched.
  /// - Parameters:
  ///   - pattern: An `NSRegularExpression` pattern.
  ///   - replaceWith: The replacement template (defaults to an empty string).
  // Ref: https://stackoverflow.com/a/40993403/4162914 && https://stackoverflow.com/a/71291137/4162914
  mutating func regReplace(pattern: String, replaceWith: String = "") {
    let compiled: NSRegularExpression
    do {
      compiled = try NSRegularExpression(
        pattern: pattern, options: [.caseInsensitive, .anchorsMatchLines]
      )
    } catch {
      return
    }
    let wholeString = NSRange(startIndex..., in: self)
    let replaced = compiled.stringByReplacingMatches(
      in: self, options: [], range: wholeString, withTemplate: replaceWith
    )
    self = replaced
  }
}
// MARK: - String as SQL Command
fileprivate extension String {
  /// Executes the receiver as one or more SQL statements via `sqlite3_exec`.
  /// - Parameter ptrDB: The SQLite connection handle to run against.
  /// - Returns: `true` only when the handle is non-nil and SQLite reports `SQLITE_OK`.
  @discardableResult func runAsSQLExec(dbPointer ptrDB: inout OpaquePointer?) -> Bool {
    // Without a connection there is nothing to execute against.
    guard ptrDB != nil else { return false }
    let resultCode = sqlite3_exec(ptrDB, self, nil, nil, nil)
    return resultCode == SQLITE_OK
  }
}
// MARK: - StringView Ranges Extension (by Isaac Xen)
fileprivate extension String {
  /// Returns the ranges of the non-empty pieces obtained by splitting the receiver at `separator`.
  ///
  /// Each piece is located by searching forward from the end of the previously found piece,
  /// so the returned ranges are in ascending order.
  /// - Parameter separator: The character to split at.
  /// - Returns: One range per non-empty piece.
  func ranges(splitBy separator: Element) -> [Range<String.Index>] {
    var located: [Range<String.Index>] = []
    var searchStart = startIndex
    for piece in split(separator: separator) {
      guard let hit = range(of: piece, range: searchStart ..< endIndex) else { continue }
      located.append(hit)
      searchStart = hit.upperBound
    }
    return located
  }
}
// MARK: -
// Ref: https://stackoverflow.com/a/32581409/4162914
fileprivate extension Double {
  /// Rounds the receiver to the given number of decimal places.
  /// - Parameter places: How many digits to keep after the decimal point.
  /// - Returns: The value scaled by 10^places, rounded to nearest (ties away from zero), and scaled back.
  func rounded(toPlaces places: Int) -> Double {
    let scale = pow(10.0, Double(places))
    let scaled = self * scale
    return scaled.rounded(.toNearestOrAwayFromZero) / scale
  }
}
// MARK: -
// Ref: https://stackoverflow.com/a/41581695/4162914
/// Precedence for the exponentiation operator: binds tighter than multiplication
/// and groups from right to left (`a ** b ** c` is `a ** (b ** c)`).
precedencegroup ExponentiationPrecedence {
  associativity: right
  higherThan: MultiplicationPrecedence
}

infix operator **: ExponentiationPrecedence

/// Raises `base` to the power `exp`.
/// - Returns: The result of `pow(base, exp)`.
func ** (_ base: Double, _ exp: Double) -> Double {
  let power = pow(base, exp)
  return power
}
// MARK: -
/// One dictionary entry: a reading (`key`), the text it produces (`value`),
/// its raw occurrence `count`, the computed `score`, and the source `category`.
struct Unigram: CustomStringConvertible {
  /// Which corpus an entry came from. The raw value is a four-letter tag.
  enum UnigramCategory: String {
    case macv = "MACV"
    case tabe = "TABE"
    case moe = "MOED"
    case custom = "CUST"
    case misc = "MISC"
    /// The four-letter tag of the category.
    // NOTE(review): the enum does not declare `CustomStringConvertible`, so string
    // interpolation of a category presumably does not go through this property — confirm
    // whether the conformance was intended before adding it, since it changes printed output.
    var description: String { rawValue }
  }

  init(key: String, value: String, score: Double, count: Int, category: Unigram.UnigramCategory) {
    self.key = key
    self.value = value
    self.score = score
    self.count = count
    self.category = category
  }

  var key: String = ""
  var value: String = ""
  var score: Double = -1.0
  var count: Int = 0
  var category: UnigramCategory

  /// A parenthesised, comma-separated summary of key, value, score and category.
  var description: String {
    // The closing parenthesis was previously missing, leaving the output unbalanced.
    "(\(key), \(value), \(score), \(category))"
  }
}
// MARK: - JSON
/// Converts a Bopomofo (Zhuyin) reading into its one-ASCII-character-per-symbol form.
///
/// Strings containing an underscore (e.g. special keys such as `_punctuation_list`)
/// are returned unchanged.
/// - Parameter incoming: The reading to convert.
/// - Returns: The converted reading, or `incoming` itself when it contains `_`.
func cnvPhonabetToASCII(_ incoming: String) -> String {
  // Every key must be a distinct phonetic symbol: a dictionary literal with
  // duplicate (e.g. empty) keys traps at runtime.
  let dicPhonabet2ASCII: [String: String] = [
    "ㄅ": "b", "ㄆ": "p", "ㄇ": "m", "ㄈ": "f", "ㄉ": "d", "ㄊ": "t", "ㄋ": "n", "ㄌ": "l", "ㄍ": "g", "ㄎ": "k", "ㄏ": "h",
    "ㄐ": "j", "ㄑ": "q", "ㄒ": "x", "ㄓ": "Z", "ㄔ": "C", "ㄕ": "S", "ㄖ": "r", "ㄗ": "z", "ㄘ": "c", "ㄙ": "s", "ㄧ": "i",
    "ㄨ": "u", "ㄩ": "v", "ㄚ": "a", "ㄛ": "o", "ㄜ": "e", "ㄝ": "E", "ㄞ": "B", "ㄟ": "P", "ㄠ": "M", "ㄡ": "F", "ㄢ": "D",
    "ㄣ": "T", "ㄤ": "N", "ㄥ": "L", "ㄦ": "R", "ˊ": "2", "ˇ": "3", "ˋ": "4", "˙": "5",
  ]
  guard !incoming.contains("_") else { return incoming }
  var strOutput = incoming
  // Replacement order does not matter: all keys are non-ASCII and all values are ASCII,
  // so one substitution can never create input for another.
  for (symbol, ascii) in dicPhonabet2ASCII {
    strOutput = strOutput.replacingOccurrences(of: symbol, with: ascii)
  }
  return strOutput
}
// MARK: -
// All locations below are resolved against the current working directory of the process.
// Despite the `url` prefix, every constant except `urlCurrentFolder` is a plain path String.
private let urlCurrentFolder = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
// Input folders holding the per-locale source files.
private let urlCHSRoot: String = "\(urlCurrentFolder.path)/components/chs/"
private let urlCHTRoot: String = "\(urlCurrentFolder.path)/components/cht/"
// Input files shared by both locales.
private let urlKanjiCore: String = "\(urlCurrentFolder.path)/components/common/char-kanji-core.txt"
private let urlMiscBPMF: String = "\(urlCurrentFolder.path)/components/common/char-misc-bpmf.txt"
private let urlMiscNonKanji: String = "\(urlCurrentFolder.path)/components/common/char-misc-nonkanji.txt"
private let urlPunctuation: String = "\(urlCurrentFolder.path)/components/common/data-punctuations.txt"
private let urlSymbols: String = "\(urlCurrentFolder.path)/components/common/data-symbols.txt"
private let urlZhuyinwen: String = "\(urlCurrentFolder.path)/components/common/data-zhuyinwen.txt"
private let urlCNS: String = "\(urlCurrentFolder.path)/components/common/char-kanji-cns.txt"
// Plain-text outputs.
private let urlOutputCHS: String = "\(urlCurrentFolder.path)/data-chs.txt"
private let urlOutputCHT: String = "\(urlCurrentFolder.path)/data-cht.txt"
// JSON outputs.
private let urlJSONSymbols: String = "\(urlCurrentFolder.path)/data-symbols.json"
private let urlJSONZhuyinwen: String = "\(urlCurrentFolder.path)/data-zhuyinwen.json"
private let urlJSONCNS: String = "\(urlCurrentFolder.path)/data-cns.json"
private let urlJSONCHS: String = "\(urlCurrentFolder.path)/data-chs.json"
private let urlJSONCHT: String = "\(urlCurrentFolder.path)/data-cht.json"
private let urlJSONBPMFReverseLookup: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup.json"
// The CNS reverse-lookup data is written in six separate chunk files.
private let urlJSONBPMFReverseLookupCNS1: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS1.json"
private let urlJSONBPMFReverseLookupCNS2: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS2.json"
private let urlJSONBPMFReverseLookupCNS3: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS3.json"
private let urlJSONBPMFReverseLookupCNS4: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS4.json"
private let urlJSONBPMFReverseLookupCNS5: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS5.json"
private let urlJSONBPMFReverseLookupCNS6: String = "\(urlCurrentFolder.path)/data-bpmf-reverse-lookup-CNS6.json"
// Set once the core reverse-lookup dictionary has been built, so it is only built once.
private var isReverseLookupDictionaryProcessed: Bool = false
// Destination file for the dumped SQLite database.
private let urlSQLite: String = "\(urlCurrentFolder.path)/Build/Release/vChewingFactoryDatabase.sqlite"
// Character -> readings map (unconverted readings), consulted by the health check.
private var mapReverseLookupForCheck: [String: [String]] = [:]
// Non-kanji entries that the per-character pronunciation check skips.
private var exceptedChars: Set<String> = .init()
// Handle of the in-memory SQLite database that the compiled data is written into.
private var ptrSQL: OpaquePointer?
// Reading -> values maps collected while compiling, one per data set.
var rangeMapJSONCHS: [String: [String]] = [:]
var rangeMapJSONCHT: [String: [String]] = [:]
var rangeMapSymbols: [String: [String]] = [:]
var rangeMapZhuyinwen: [String: [String]] = [:]
var rangeMapCNS: [String: [String]] = [:]
// Character -> converted readings, collected from the CNS data.
var rangeMapReverseLookup: [String: [String]] = [:]
// NOTE: reverse-lookup consumers also use `mapReverseLookupForCheck` declared above.
// MARK: -
/// Opens the in-memory SQLite database behind `ptrSQL`, (re)creates the `DATA_MAIN`
/// and `DATA_REV` tables, and starts a transaction.
/// - Returns: `true` when every step succeeded; `false` as soon as one fails.
func prepareDatabase() -> Bool {
  let schemaSQL = """
  DROP TABLE IF EXISTS DATA_REV;
  DROP TABLE IF EXISTS DATA_MAIN;
  CREATE TABLE IF NOT EXISTS DATA_MAIN (
  theKey TEXT NOT NULL,
  theDataCHS TEXT,
  theDataCHT TEXT,
  theDataCNS TEXT,
  theDataMISC TEXT,
  theDataSYMB TEXT,
  theDataCHEW TEXT,
  PRIMARY KEY (theKey)
  ) WITHOUT ROWID;
  CREATE TABLE IF NOT EXISTS DATA_REV (
  theChar TEXT NOT NULL,
  theReadings TEXT NOT NULL,
  PRIMARY KEY (theChar)
  ) WITHOUT ROWID;
  """
  if sqlite3_open(":memory:", &ptrSQL) != SQLITE_OK { return false }
  // Run in order: the two pragmas, the schema, then open the transaction.
  let setupStatements = [
    "PRAGMA synchronous = OFF;",
    "PRAGMA journal_mode = OFF;",
    schemaSQL,
    "begin;",
  ]
  for statement in setupStatements {
    if !statement.runAsSQLExec(dbPointer: &ptrSQL) { return false }
  }
  return true
}
/// Upserts every entry of `theMap` into the given column of the `DATA_MAIN` table.
///
/// Values of one key are stored as a single tab-joined text cell.
/// - Parameters:
///   - theMap: Key -> values map to store.
///   - columnName: Name of the `DATA_MAIN` column to write (interpolated verbatim into the SQL).
/// - Returns: `false` at the first statement that fails (the statement is printed), otherwise `true`.
@discardableResult func writeMainMapToSQL(_ theMap: [String: [String]], column columnName: String) -> Bool {
  // SQL string literals use ASCII single quotes, so embedded quotes are doubled.
  func sqlEscaped(_ raw: String) -> String {
    raw.replacingOccurrences(of: "'", with: "''")
  }
  for entry in theMap {
    let keyLiteral = sqlEscaped(entry.key)
    let valueLiteral = sqlEscaped(entry.value.joined(separator: "\t"))
    let statement = "INSERT INTO DATA_MAIN (theKey, \(columnName)) VALUES ('\(keyLiteral)', '\(valueLiteral)') ON CONFLICT(theKey) DO UPDATE SET \(columnName)='\(valueLiteral)';"
    if !statement.runAsSQLExec(dbPointer: &ptrSQL) {
      print("Failed: " + statement)
      return false
    }
  }
  return true
}
/// Upserts every entry of `theMap` into the `DATA_REV` table.
///
/// The map key goes into `theChar`; its values are stored tab-joined in `theReadings`.
/// - Parameter theMap: Map to store.
/// - Returns: `false` at the first statement that fails (the statement is printed), otherwise `true`.
@discardableResult func writeRevLookupMapToSQL(_ theMap: [String: [String]]) -> Bool {
  // SQL string literals use ASCII single quotes, so embedded quotes are doubled.
  func sqlEscaped(_ raw: String) -> String {
    raw.replacingOccurrences(of: "'", with: "''")
  }
  for entry in theMap {
    let charLiteral = sqlEscaped(entry.key)
    let readingsLiteral = sqlEscaped(entry.value.joined(separator: "\t"))
    let statement = "INSERT INTO DATA_REV (theChar, theReadings) VALUES ('\(charLiteral)', '\(readingsLiteral)') ON CONFLICT(theChar) DO UPDATE SET theReadings='\(readingsLiteral)';"
    if !statement.runAsSQLExec(dbPointer: &ptrSQL) {
      print("Failed: " + statement)
      return false
    }
  }
  return true
}
// MARK: - Dump SQLite3 Memory Database to File.
/// Copies the in-memory database behind `ptrSQL` into the SQLite file at `urlSQLite`
/// using the SQLite online-backup API.
/// - Returns: `true` only when the whole database was copied and the backup finished without error.
@discardableResult func dumpSQLDB() -> Bool {
  var ptrSQLTarget: OpaquePointer?
  // Close the target connection on every exit path (closing a nil handle is a no-op).
  defer { sqlite3_close_v2(ptrSQLTarget) }
  guard sqlite3_open(urlSQLite, &ptrSQLTarget) == SQLITE_OK else { return false }
  guard let ptrBackupObj = sqlite3_backup_init(ptrSQLTarget, "main", ptrSQL, "main") else {
    return false
  }
  // A negative page count copies all remaining pages in one step; a complete copy
  // is reported as SQLITE_DONE. Check both return codes instead of ignoring them.
  let stepResult = sqlite3_backup_step(ptrBackupObj, -1)
  let finishResult = sqlite3_backup_finish(ptrBackupObj)
  return stepResult == SQLITE_DONE && finishResult == SQLITE_OK
}
// MARK: -
/// Reads every `phrases-*` file of the selected locale folder and turns each line
/// into an unweighted `Unigram` (score 0).
///
/// A line is `phrase count reading-part...`; the reading parts are joined with `-`.
/// - Parameter isCHS: `true` for the Simplified Chinese folder, `false` for Traditional Chinese.
/// - Returns: The parsed entries, or an empty array when reading the folder or a file fails.
func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
  var arrUnigramRAW: [Unigram] = []
  var strRAWOrigDict: [String: String] = [:]
  let urlFolderRoot: String = isCHS ? urlCHSRoot : urlCHTRoot
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  // Load the raw text of every phrase file.
  do {
    try FileManager.default.contentsOfDirectory(atPath: urlFolderRoot).forEach { thePath in
      guard thePath.contains("phrases-") else { return }
      strRAWOrigDict[thePath] = try String(contentsOfFile: urlFolderRoot + thePath, encoding: .utf8)
    }
  } catch {
    NSLog(" - Exception happened when reading raw phrases data.")
    return []
  }
  // Process files in sorted name order so the result does not depend on dictionary ordering.
  for fileName in strRAWOrigDict.keys.sorted() {
    guard var strRAW = strRAWOrigDict[fileName] else { continue }
    // Drop the macOS-only marker.
    strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "")
    // Collapse runs of ASCII space, ideographic space (U+3000), no-break space (U+00A0)
    // and tab into one ASCII space. Explicit escapes are used so the pattern does not
    // depend on invisible literal characters in this source file.
    strRAW.regReplace(pattern: #"[ \x{3000}\x{A0}\t]+"#, replaceWith: " ")
    strRAW.regReplace(pattern: #"(^ | $)"#, replaceWith: "") // Trim each line.
    strRAW.regReplace(pattern: #"(\f+|\r+|\n+)+"#, replaceWith: "\n") // CR & form feed to LF, collapsing blank lines.
    strRAW.regReplace(pattern: #"^(#.*|.*#WIN32.*)$"#, replaceWith: "") // Drop comment lines and WIN32-only lines.
    let currentCategory: Unigram.UnigramCategory = {
      if fileName.contains("-custom-") { return .custom }
      if fileName.contains("-tabe-") { return .tabe }
      if fileName.contains("-moe-") { return .moe }
      if fileName.contains("-vchewing-") { return .macv }
      return .custom
    }()
    for lineNeta in strRAW.split(separator: "\n") {
      // Cell 1 is the phrase, cell 2 the occurrence count, the rest form the reading.
      let cells = String(lineNeta).components(separatedBy: " ")
      let phrase = cells[0]
      guard !phrase.isEmpty else { continue }
      let occurrence = cells.count > 1 ? Int(cells[1]) ?? 0 : 0
      let phone = cells.dropFirst(2).joined(separator: "-")
      arrUnigramRAW.append(
        Unigram(
          key: phone, value: phrase, score: 0.0,
          count: occurrence, category: currentCategory
        )
      )
    }
  }
  NSLog(" - \(i18n): 成功生成詞語語料辭典(權重待計算)。")
  return arrUnigramRAW
}
// MARK: -
/// Reads the core kanji file and turns each line into an unweighted `Unigram` (score 0).
///
/// For Simplified Chinese the first two cells and the last cell of a line are used; for
/// Traditional Chinese the first cell and the last two cells. The three selected cells are
/// read as character, occurrence count, reading.
/// On its first successful run this also fills `mapReverseLookupForCheck` and, when
/// `compileJSON` is set, writes the reverse-lookup JSON file.
/// - Parameter isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese.
/// - Returns: The parsed entries, or an empty array when the file cannot be read.
func rawDictForKanjis(isCHS: Bool) -> [Unigram] {
  var arrUnigramRAW: [Unigram] = []
  var strRAW = ""
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  // Load the raw text.
  do {
    strRAW += try String(contentsOfFile: urlKanjiCore, encoding: .utf8)
  } catch {
    NSLog(" - Exception happened when reading raw core kanji data.")
    return []
  }
  // Drop the macOS-only marker.
  strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "")
  // Collapse runs of ASCII space, ideographic space (U+3000), no-break space (U+00A0)
  // and tab into one ASCII space. Explicit escapes are used so the pattern does not
  // depend on invisible literal characters in this source file.
  strRAW.regReplace(pattern: #"[ \x{3000}\x{A0}\t]+"#, replaceWith: " ")
  strRAW.regReplace(pattern: #"(^ | $)"#, replaceWith: "") // Trim each line.
  strRAW.regReplace(pattern: #"(\f+|\r+|\n+)+"#, replaceWith: "\n") // CR & form feed to LF, collapsing blank lines.
  strRAW.regReplace(pattern: #"^(#.*|.*#WIN32.*)$"#, replaceWith: "") // Drop comment lines and WIN32-only lines.
  // Remove duplicate lines while keeping first-seen order.
  var seenLines = Set<String>()
  let arrData = strRAW.components(separatedBy: "\n").filter { seenLines.insert($0).inserted }
  var mapReverseLookupJSON: [String: [String]] = [:]
  var mapReverseLookupUnencrypted: [String: [String]] = [:]
  let leadingCellCount = isCHS ? 2 : 1
  let trailingCellCount = isCHS ? 1 : 2
  for lineData in arrData {
    let columns = lineData.components(separatedBy: " ")
    let cells = Array(columns.prefix(leadingCellCount)) + Array(columns.suffix(trailingCellCount))
    let phrase = cells[0]
    guard !phrase.isEmpty else { continue }
    let occurrence = cells.count > 1 ? Int(cells[1]) ?? 0 : 0
    let phone = cells.count > 2 ? cells[2] : ""
    if !isReverseLookupDictionaryProcessed {
      mapReverseLookupUnencrypted[phrase, default: []].append(phone)
      mapReverseLookupJSON[phrase, default: []].append(cnvPhonabetToASCII(phone))
    }
    arrUnigramRAW.append(
      Unigram(
        key: phone, value: phrase, score: 0.0,
        count: occurrence, category: .misc
      )
    )
  }
  if !isReverseLookupDictionaryProcessed {
    isReverseLookupDictionaryProcessed = true
    // Publish the in-memory map before attempting the optional JSON write, so a write
    // failure cannot leave `mapReverseLookupForCheck` empty for later consumers.
    mapReverseLookupForCheck = mapReverseLookupUnencrypted
    if compileJSON {
      do {
        try JSONSerialization.data(withJSONObject: mapReverseLookupJSON, options: .sortedKeys).write(
          to: URL(fileURLWithPath: urlJSONBPMFReverseLookup))
      } catch {
        NSLog(" - Core Reverse Lookup Data Generation Failed.")
      }
    }
  }
  NSLog(" - \(i18n): 成功生成單字語料辭典(權重待計算)。")
  return arrUnigramRAW
}
// MARK: -
/// Reads the misc-BPMF and misc-non-kanji files and turns each line into an
/// unweighted `Unigram` (score 0).
///
/// Only the first three cells of a line are used: entry text, occurrence count, reading.
/// Every entry text is also recorded in `exceptedChars`.
/// - Parameter isCHS: Only affects the locale label used in the log message.
/// - Returns: The parsed entries, or an empty array when either file cannot be read.
func rawDictForNonKanjis(isCHS: Bool) -> [Unigram] {
  var arrUnigramRAW: [Unigram] = []
  var strRAW = ""
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  // Load the raw text of both files.
  do {
    strRAW += try String(contentsOfFile: urlMiscBPMF, encoding: .utf8)
    strRAW += "\n"
    strRAW += try String(contentsOfFile: urlMiscNonKanji, encoding: .utf8)
  } catch {
    NSLog(" - Exception happened when reading raw non-kanji data.")
    return []
  }
  // Drop the macOS-only marker.
  strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "")
  // Collapse runs of ASCII space, ideographic space (U+3000), no-break space (U+00A0)
  // and tab into one ASCII space. Explicit escapes are used so the pattern does not
  // depend on invisible literal characters in this source file.
  strRAW.regReplace(pattern: #"[ \x{3000}\x{A0}\t]+"#, replaceWith: " ")
  strRAW.regReplace(pattern: #"(^ | $)"#, replaceWith: "") // Trim each line.
  strRAW.regReplace(pattern: #"(\f+|\r+|\n+)+"#, replaceWith: "\n") // CR & form feed to LF, collapsing blank lines.
  strRAW.regReplace(pattern: #"^(#.*|.*#WIN32.*)$"#, replaceWith: "") // Drop comment lines and WIN32-only lines.
  // Remove duplicate lines while keeping first-seen order.
  var seenLines = Set<String>()
  let arrData = strRAW.components(separatedBy: "\n").filter { seenLines.insert($0).inserted }
  for lineData in arrData {
    let cells = Array(lineData.components(separatedBy: " ").prefix(3))
    let phrase = cells[0]
    guard !phrase.isEmpty else { continue }
    let occurrence = cells.count > 1 ? Int(cells[1]) ?? 0 : 0
    let phone = cells.count > 2 ? cells[2] : ""
    exceptedChars.insert(phrase)
    arrUnigramRAW.append(
      Unigram(
        key: phone, value: phrase, score: 0.0,
        count: occurrence, category: .misc
      )
    )
  }
  NSLog(" - \(i18n): 成功生成非漢字語料辭典(權重待計算)。")
  return arrUnigramRAW
}
/// Computes a log10 weight for every entry, sorts the entries, and appends a
/// `__NORM__` record carrying the normalisation constant.
///
/// The normalisation constant sums `2.7^(length/3 - 1) * count` over all non-custom
/// entries whose value is 1 to 6 characters long. Entries with a count of -1 or -2 get
/// a fixed weight of -13; entries with a count of 0 are weighted as if their count were 0.25.
/// Weights are rounded to three decimal places.
/// - Parameters:
///   - arrStructUncalculated: Entries whose `score` is still to be computed.
///   - isCHS: Only affects the locale label used in log messages.
/// - Returns: Entries ordered by key ascending, then count descending, followed by the `__NORM__` record.
func weightAndSort(_ arrStructUncalculated: [Unigram], isCHS: Bool) -> [Unigram] {
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  let fscale = 2.7
  // Length-dependent factor shared by the normalisation sum and the weights.
  let lengthFactor: (Unigram) -> Double = { entry in
    fscale ** (Double(entry.value.count) / 3.0 - 1.0)
  }
  var norm = 0.0
  for entry in arrStructUncalculated
    where entry.category != .custom && (1 ... 6).contains(entry.value.count)
  {
    norm += lengthFactor(entry) * Double(entry.count)
  }
  NSLog(" - \(i18n): NORM 計算值為 \(norm)")
  let weighted: [Unigram] = arrStructUncalculated.map { entry in
    let weight: Double
    switch entry.count {
    case -2, -1:
      weight = -13
    case 0:
      weight = log10(lengthFactor(entry) * 0.25 / norm)
    default:
      weight = log10(lengthFactor(entry) * Double(entry.count) / norm) // Credit: MJHsieh.
    }
    return Unigram(
      key: entry.key, value: entry.value, score: weight.rounded(toPlaces: 3),
      count: entry.count, category: entry.category
    )
  }
  NSLog(" - \(i18n): 成功計算權重。")
  // Key ascending; within the same key, higher counts first.
  var ordered = weighted.sorted { lhs, rhs in
    if lhs.key != rhs.key { return lhs.key < rhs.key }
    return lhs.count > rhs.count
  }
  NSLog(" - \(i18n): 排序整理完畢,準備編譯要寫入的檔案內容。")
  ordered.append(Unigram(key: "__NORM__", value: norm.description, score: 0, count: 0, category: .misc))
  return ordered
}
/// Compiles the dictionary of one locale and writes it out as TXT (and JSON when
/// `compileJSON` is set).
///
/// The punctuation file comes first in the TXT output, followed by one line per weighted
/// unigram. The reading -> values map is also kept in `rangeMapJSONCHS` / `rangeMapJSONCHT`.
/// Health-check results and any duplicated value/reading pairs are printed.
/// - Parameter isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese.
func fileOutput(isCHS: Bool) {
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  var strPunctuation = ""
  var rangeMapJSON: [String: [String]] = [:]
  let pathOutput = URL(fileURLWithPath: isCHS ? urlOutputCHS : urlOutputCHT)
  let jsonURL = URL(fileURLWithPath: isCHS ? urlJSONCHS : urlJSONCHT)
  var strPrintLine = ""
  // Read the punctuation data once, with tabs turned into spaces.
  do {
    strPunctuation = try String(contentsOfFile: urlPunctuation, encoding: .utf8).replacingOccurrences(
      of: "\t", with: " "
    )
    if let charLast = strPunctuation.last, !"\n".contains(charLast) {
      strPunctuation += "\n"
    }
    // Use the newline-terminated text for the TXT output too; otherwise the first
    // unigram line would be glued onto the last punctuation line.
    strPrintLine += strPunctuation
  } catch {
    NSLog(" - \(i18n): Exception happened when reading raw punctuation data.")
  }
  NSLog(" - \(i18n): 成功插入標點符號與西文字母數據txt")
  // Feed the punctuation entries (key, value per line; `#` lines are comments) into the map.
  for lineRange in strPunctuation.ranges(splitBy: "\n") {
    let line = strPunctuation[lineRange]
    let neta = line.split(separator: " ")
    guard neta.count >= 2, !neta[0].isEmpty, !neta[1].isEmpty, line.first != "#" else { continue }
    rangeMapJSON[cnvPhonabetToASCII(String(neta[0])), default: []].append(String(neta[1]))
  }
  var arrStructUnified: [Unigram] = []
  arrStructUnified += rawDictForKanjis(isCHS: isCHS)
  arrStructUnified += rawDictForNonKanjis(isCHS: isCHS)
  arrStructUnified += rawDictForPhrases(isCHS: isCHS)
  // Compute weights and sort.
  arrStructUnified = weightAndSort(arrStructUnified, isCHS: isCHS)
  NSLog(" - \(i18n): 執行資料重複性檢查,會在之後再給出對應的檢查結果。")
  var setAlreadyInserted = Set<String>()
  var arrFoundedDuplications = [String]()
  NSLog(" - \(i18n): 執行資料健康狀況檢查。")
  print(healthCheck(arrStructUnified))
  for unigram in arrStructUnified {
    // Remember every value/reading pair; a pair seen again is reported as a duplicate.
    let pairID = unigram.value + "\t" + unigram.key
    if !setAlreadyInserted.insert(pairID).inserted {
      arrFoundedDuplications.append(pairID)
    }
    let theKey = unigram.key
    let theValue = (String(unigram.score) + " " + unigram.value)
    if !theKey.contains("_punctuation_list") {
      rangeMapJSON[cnvPhonabetToASCII(theKey), default: []].append(theValue)
    }
    strPrintLine += unigram.key + " " + unigram.value
    strPrintLine += " " + String(unigram.score)
    if unigram.count != 0 {
      strPrintLine += " " + String(unigram.count)
    }
    strPrintLine += "\n"
  }
  NSLog(" - \(i18n): 要寫入檔案的 txt 內容編譯完畢。")
  do {
    try strPrintLine.write(to: pathOutput, atomically: true, encoding: .utf8)
    if compileJSON {
      try JSONSerialization.data(withJSONObject: rangeMapJSON, options: .sortedKeys).write(to: jsonURL)
    }
    if isCHS {
      rangeMapJSONCHS = rangeMapJSON
    } else {
      rangeMapJSONCHT = rangeMapJSON
    }
  } catch {
    NSLog(" - \(i18n): Error on writing strings to file: \(error)")
  }
  NSLog(" - \(i18n): JSON & TXT 寫入完成。")
  if !arrFoundedDuplications.isEmpty {
    NSLog(" - \(i18n): 尋得下述重複項目,請務必手動排查:")
    print("-------------------")
    print(arrFoundedDuplications.joined(separator: "\n"))
  }
  print("===================")
}
/// Compiles the locale-neutral data sets (symbols, Zhuyinwen, CNS) and, when
/// `compileJSON` is set, writes them out as JSON.
///
/// Each source line is `value key`; lines starting with `#` are skipped. Entries are
/// collected under the converted key both locally and in the matching `rangeMap*` global.
/// CNS entries whose key has neither `_` nor `-` are also added to `rangeMapReverseLookup`
/// and to the first of six chunk maps that holds at most 16500 keys.
func commonFileOutput() {
  let i18n = "語言中性"
  var strSymbols = ""
  var strZhuyinwen = ""
  var strCNS = ""
  var mapSymbols: [String: [String]] = [:]
  var mapZhuyinwen: [String: [String]] = [:]
  var mapCNS: [String: [String]] = [:]
  // Chunk maps CNS1 ... CNS6, in order, with their output paths.
  var reverseLookupChunks = [[String: [String]]](repeating: [:], count: 6)
  let reverseLookupChunkPaths = [
    urlJSONBPMFReverseLookupCNS1, urlJSONBPMFReverseLookupCNS2, urlJSONBPMFReverseLookupCNS3,
    urlJSONBPMFReverseLookupCNS4, urlJSONBPMFReverseLookupCNS5, urlJSONBPMFReverseLookupCNS6,
  ]
  // Load the raw text, with tabs turned into spaces.
  do {
    strSymbols = try String(contentsOfFile: urlSymbols, encoding: .utf8).replacingOccurrences(of: "\t", with: " ")
    strZhuyinwen = try String(contentsOfFile: urlZhuyinwen, encoding: .utf8).replacingOccurrences(of: "\t", with: " ")
    strCNS = try String(contentsOfFile: urlCNS, encoding: .utf8).replacingOccurrences(of: "\t", with: " ")
  } catch {
    NSLog(" - \(i18n): Exception happened when reading raw punctuation data.")
  }
  NSLog(" - \(i18n): 成功取得標點符號與西文字母原始資料JSON")

  // Calls `handler` with (key, value) for every usable line of `rawText`.
  func forEachEntry(in rawText: String, _ handler: (_ key: String, _ value: String) -> Void) {
    for lineRange in rawText.ranges(splitBy: "\n") {
      let line = rawText[lineRange]
      let fields = line.split(separator: " ")
      guard fields.count >= 2 else { continue }
      guard !fields[0].isEmpty, !fields[1].isEmpty, line.first != "#" else { continue }
      handler(String(fields[1]), String(fields[0]))
    }
  }

  forEachEntry(in: strSymbols) { theKey, theValue in
    let encryptedKey = cnvPhonabetToASCII(theKey)
    mapSymbols[encryptedKey, default: []].append(theValue)
    rangeMapSymbols[encryptedKey, default: []].append(theValue)
  }
  forEachEntry(in: strZhuyinwen) { theKey, theValue in
    let encryptedKey = cnvPhonabetToASCII(theKey)
    mapZhuyinwen[encryptedKey, default: []].append(theValue)
    rangeMapZhuyinwen[encryptedKey, default: []].append(theValue)
  }
  forEachEntry(in: strCNS) { theKey, theValue in
    let encryptedKey = cnvPhonabetToASCII(theKey)
    mapCNS[encryptedKey, default: []].append(theValue)
    rangeMapCNS[encryptedKey, default: []].append(theValue)
    guard !theKey.contains("_"), !theKey.contains("-") else { return }
    rangeMapReverseLookup[theValue, default: []].append(encryptedKey)
    // The entry goes into the first chunk that still has room; if none has, it is not chunked.
    if let slot = reverseLookupChunks.firstIndex(where: { $0.keys.count <= 16500 }) {
      reverseLookupChunks[slot][theValue, default: []].append(encryptedKey)
    }
  }
  NSLog(" - \(i18n): 要寫入檔案的內容編譯完畢。")
  do {
    if compileJSON {
      var outputs: [([String: [String]], String)] = [
        (mapSymbols, urlJSONSymbols),
        (mapZhuyinwen, urlJSONZhuyinwen),
        (mapCNS, urlJSONCNS),
      ]
      outputs += Array(zip(reverseLookupChunks, reverseLookupChunkPaths))
      // Written in order; the first failure aborts the remaining writes.
      for (payload, path) in outputs {
        try JSONSerialization.data(withJSONObject: payload, options: .sortedKeys).write(
          to: URL(fileURLWithPath: path))
      }
    }
  } catch {
    NSLog(" - \(i18n): Error on writing strings to file: \(error)")
  }
  NSLog(" - \(i18n): 寫入完成。")
}
// MARK: -
func healthCheck(_ data: [Unigram]) -> String {
while mapReverseLookupForCheck.isEmpty { sleep(1) }
var result = ""
var unigramMonoChar = [String: Unigram]()
var valueToScore = [String: Double]()
let unigramMonoCharCounter = data.filter { $0.score > -14 && $0.key.split(separator: "-").count == 1 }.count
let unigramPolyCharCounter = data.filter { $0.score > -14 && $0.key.split(separator: "-").count > 1 }.count
// -10
for neta in data.filter({ $0.score > -14 }) {
valueToScore[neta.value] = max(neta.score, valueToScore[neta.value] ?? -14)
let theKeySliceArr = neta.key.split(separator: "-")
guard let theKey = theKeySliceArr.first, theKeySliceArr.count == 1 else { continue }
if unigramMonoChar.keys.contains(String(theKey)), let theRecord = unigramMonoChar[String(theKey)] {
if neta.score > theRecord.score { unigramMonoChar[String(theKey)] = neta }
} else {
unigramMonoChar[String(theKey)] = neta
}
}
var faulty = [[String]: [Unigram]]()
var indifferents: [(String, String, Double, [Unigram], Double)] = []
var insufficients: [(String, String, Double, [Unigram], Double)] = []
var competingUnigrams = [(String, Double, String, Double)]()
for neta in data.filter({ $0.key.split(separator: "-").count >= 2 && $0.score > -14 }) {
var competants = [Unigram]()
var tscore: Double = 0
var bad = false
let checkPerCharMachingStatus: Bool = neta.key.split(separator: "-").count == neta.value.count
var mispronouncedKanji: [String] = []
let arrNetaKeys = neta.key.split(separator: "-")
outerMatchCheck: for (i, x) in arrNetaKeys.enumerated() {
if !unigramMonoChar.keys.contains(String(x)) {
if neta.value.count == 1 {
mispronouncedKanji.append("\(neta.category)@\(neta.value)@\(neta.key)")
} else if neta.value.count == arrNetaKeys.count {
mispronouncedKanji.append("\(neta.category)@\(neta.value.map(\.description)[i])@\(arrNetaKeys[i])")
} else {
mispronouncedKanji.append("\(neta.category)@OTHER@\(String(x))")
}
bad = true
break outerMatchCheck
}
innerMatchCheck: if checkPerCharMachingStatus {
let char = neta.value.map(\.description)[i]
if exceptedChars.contains(char) { break innerMatchCheck }
guard let queriedPhones = mapReverseLookupForCheck[char] else {
mispronouncedKanji.append("\(neta.category)@\(char)@\(String(x))")
bad = true
break outerMatchCheck
}
for queriedPhone in queriedPhones {
if queriedPhone == x.description { break innerMatchCheck }
}
mispronouncedKanji.append("\(neta.category)@\(char)@\(String(x))")
bad = true
break outerMatchCheck
}
guard let u = unigramMonoChar[String(x)] else { continue }
tscore += u.score
competants.append(u)
}
if bad {
faulty[mispronouncedKanji, default: []].append(neta)
continue
}
if tscore >= neta.score {
let instance = (neta.key, neta.value, neta.score, competants, neta.score - tscore)
let valueJoined = String(competants.map(\.value).joined(separator: ""))
if neta.value == valueJoined {
indifferents.append(instance)
} else {
if valueToScore.keys.contains(valueJoined), neta.value != valueJoined {
if let valueJoinedScore = valueToScore[valueJoined], neta.score < valueJoinedScore {
competingUnigrams.append((neta.value, neta.score, valueJoined, valueJoinedScore))
}
}
insufficients.append(instance)
}
}
}
insufficients = insufficients.sorted(by: { lhs, rhs -> Bool in
(lhs.2) > (rhs.2)
})
competingUnigrams = competingUnigrams.sorted(by: { lhs, rhs -> Bool in
(lhs.1 - lhs.3) > (rhs.1 - rhs.3)
})
let separator: String = {
var result = ""
for _ in 0 ..< 72 { result += "-" }
return result
}()
func printl(_ input: String) {
result += input + "\n"
}
printl(separator)
printl("持單個字符的有效單元圖數量:\(unigramMonoCharCounter)")
printl("持多個字符的有效單元圖數量:\(unigramPolyCharCounter)")
printl(separator)
printl("總結一下那些容易被單個漢字的字頻干擾輸入的詞組單元圖:")
printl("因干擾組件和字詞本身完全重疊、而不需要處理的單元圖的數量:\(indifferents.count)")
printl(
"\(insufficients.count) 個複字單元圖被自身成分讀音對應的其它單字單元圖奪權,約佔全部有效單元圖的 \(insufficients.count / unigramPolyCharCounter * 100)%"
)
printl("\n其中有:")
var insufficientsMap = [Int: [(String, String, Double, [Unigram], Double)]]()
for x in 2 ... 10 {
insufficientsMap[x] = insufficients.filter { $0.0.split(separator: "-").count == x }
}
printl(" \(insufficientsMap[2]?.count ?? 0) 個有效雙字單元圖")
printl(" \(insufficientsMap[3]?.count ?? 0) 個有效三字單元圖")
printl(" \(insufficientsMap[4]?.count ?? 0) 個有效四字單元圖")
printl(" \(insufficientsMap[5]?.count ?? 0) 個有效五字單元圖")
printl(" \(insufficientsMap[6]?.count ?? 0) 個有效六字單元圖")
printl(" \(insufficientsMap[7]?.count ?? 0) 個有效七字單元圖")
printl(" \(insufficientsMap[8]?.count ?? 0) 個有效八字單元圖")
printl(" \(insufficientsMap[9]?.count ?? 0) 個有效九字單元圖")
printl(" \(insufficientsMap[10]?.count ?? 0) 個有效十字單元圖")
if let insufficientsMap2 = insufficientsMap[2], !insufficientsMap2.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效雙字單元圖")
for (i, content) in insufficientsMap2.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap3 = insufficientsMap[3], !insufficientsMap3.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效三字單元圖")
for (i, content) in insufficientsMap3.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap4 = insufficientsMap[4], !insufficientsMap4.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效四字單元圖")
for (i, content) in insufficientsMap4.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap5 = insufficientsMap[5], !insufficientsMap5.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效五字單元圖")
for (i, content) in insufficientsMap5.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap6 = insufficientsMap[6], !insufficientsMap6.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效六字單元圖")
for (i, content) in insufficientsMap6.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap7 = insufficientsMap[7], !insufficientsMap7.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效七字單元圖")
for (i, content) in insufficientsMap7.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap8 = insufficientsMap[8], !insufficientsMap8.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效八字單元圖")
for (i, content) in insufficientsMap8.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap9 = insufficientsMap[9], !insufficientsMap9.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效九字單元圖")
for (i, content) in insufficientsMap9.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if let insufficientsMap10 = insufficientsMap[10], !insufficientsMap10.isEmpty {
printl(separator)
printl("前二十五個被奪權的有效十字單元圖")
for (i, content) in insufficientsMap10.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += content.1 + ","
contentToPrint += String(content.2) + ","
contentToPrint += "[" + content.3.map(\.description).joined(separator: ",") + "]" + ","
contentToPrint += String(content.4) + "}"
printl(contentToPrint)
}
}
if !competingUnigrams.isEmpty {
printl(separator)
printl("也發現有 \(competingUnigrams.count) 個複字單元圖被某些由高頻單字組成的複字單元圖奪權的情況,")
printl("例如(前二十五例):")
for (i, content) in competingUnigrams.enumerated() {
if i == 25 { break }
var contentToPrint = "{"
contentToPrint += content.0 + ","
contentToPrint += String(content.1) + ","
contentToPrint += content.2 + ","
contentToPrint += String(content.3) + "}"
printl(contentToPrint)
}
}
if !faulty.isEmpty {
printl(separator)
printl("下述單元圖用到了漢字核心表當中尚未收錄的讀音,可能無法正常輸入:")
for content in faulty {
printl("\(content.key): \(content.value)")
}
}
result += "\n"
return result
}
// MARK: - Flags
/// Option set with one bit per compilation job started by `main()`.
///
/// `main()` starts with all three flags set and removes a flag as the
/// corresponding job finishes, so an empty set means every job is done.
struct TaskFlags: OptionSet {
  /// Backing bit mask required by `OptionSet`.
  public let rawValue: Int

  public init(rawValue: Int) { self.rawValue = rawValue }

  /// Bit 0.
  public static let common = Self(rawValue: 0b001)
  /// Bit 1.
  public static let chs = Self(rawValue: 0b010)
  /// Bit 2.
  public static let cht = Self(rawValue: 0b100)
}
// MARK: -
// Output-format switches. `main()` overwrites both on startup: when any
// command-line argument equals "--json" or "json" (compared after lowercasing)
// it sets `compileJSON = true` and `compileSQLite = false`; otherwise it sets
// `compileJSON = false` and `compileSQLite = true`.
var compileJSON = false
var compileSQLite = true
/// Entry point of the dictionary compiler.
///
/// 1. Chooses the output format from the command line: an argument equal to
///    `--json` or `json` (case-insensitive) selects JSON, anything else SQLite.
/// 2. Calls `prepareDatabase()` and exits with status `-1` if SQLite output was
///    requested but the preparation failed.
/// 3. Runs the three corpus jobs concurrently on the global queue and blocks
///    until all of them have finished.
/// 4. When the last job reports completion, logs the result and (in SQLite
///    mode) merges the reverse-lookup data, writes all tables, commits,
///    vacuums, dumps and closes the database.
func main() {
  // The transform never returns nil, so `map` is the exact equivalent of the
  // former `compactMap`.
  let arguments = CommandLine.arguments.map { $0.lowercased() }
  let jsonConditionMet = arguments.contains { $0 == "--json" || $0 == "json" }
  if jsonConditionMet {
    NSLog("// 接下來準備建置 JSON 格式的原廠辭典,同時生成用來偵錯的 TXT 副產物。")
    compileJSON = true
    compileSQLite = false
  } else {
    NSLog("// 接下來準備建置 SQLite 格式的原廠辭典,同時生成用來偵錯的 TXT 副產物。")
    compileJSON = false
    compileSQLite = true
  }
  let prepared = prepareDatabase()
  if compileSQLite, !prepared {
    NSLog("// SQLite 資料庫初期化失敗。")
    exit(-1)
  }
  // Outstanding jobs. The observer fires after every removal; only the removal
  // that empties the set triggers the finalization below.
  var taskFlags: TaskFlags = [.common, .chs, .cht] {
    didSet {
      guard taskFlags.isEmpty else { return }
      NSLog("// 全部 TXT 辭典檔案建置完畢。")
      if compileJSON {
        NSLog("// 全部 JSON 辭典檔案建置完畢。")
      }
      if compileSQLite, prepared {
        NSLog("// 開始整合反查資料。")
        mapReverseLookupForCheck.forEach { key, values in
          values.reversed().forEach { valueLiteral in
            let value = cnvPhonabetToASCII(valueLiteral)
            if !rangeMapReverseLookup[key, default: []].contains(value) {
              rangeMapReverseLookup[key, default: []].insert(value, at: 0)
            }
          }
        }
        NSLog("// 反查資料整合完畢。")
        NSLog("// 準備建置 SQL 資料庫。")
        writeMainMapToSQL(rangeMapJSONCHS, column: "theDataCHS")
        writeMainMapToSQL(rangeMapJSONCHT, column: "theDataCHT")
        writeMainMapToSQL(rangeMapSymbols, column: "theDataSYMB")
        writeMainMapToSQL(rangeMapZhuyinwen, column: "theDataCHEW")
        writeMainMapToSQL(rangeMapCNS, column: "theDataCNS")
        writeRevLookupMapToSQL(rangeMapReverseLookup)
        // NOTE(review): `assert` is only evaluated in debug builds, so a failed
        // commit or VACUUM goes unreported in an optimized build.
        let committed = "commit;".runAsSQLExec(dbPointer: &ptrSQL)
        assert(committed)
        let compressed = "VACUUM;".runAsSQLExec(dbPointer: &ptrSQL)
        assert(compressed)
        if !dumpSQLDB() {
          NSLog("// SQLite 辭典傾印失敗。")
        } else {
          NSLog("// 全部 SQLite 辭典檔案建置完畢。")
        }
        sqlite3_close_v2(ptrSQL)
      }
    }
  }
  // `taskFlags` is shared by three closures that run concurrently. Unsynchronized
  // read-modify-write on it is a data race: a lost update would leave a flag set
  // forever (finalization never runs) or let two threads both observe an empty
  // set. Every mutation therefore goes through this lock.
  let flagsLock = NSLock()
  /// Clears `flag` from `taskFlags` while holding `flagsLock`; the `didSet`
  /// observer (including the finalization) runs inside the critical section.
  func markFinished(_ flag: TaskFlags) {
    flagsLock.lock()
    defer { flagsLock.unlock() }
    taskFlags.remove(flag)
  }
  let globalQueue = DispatchQueue.global(qos: .default)
  let group = DispatchGroup()
  group.enter()
  globalQueue.async {
    NSLog("// 準備編譯符號表情ㄅ文語料檔案。")
    commonFileOutput()
    markFinished(.common)
    group.leave()
  }
  group.enter()
  globalQueue.async {
    NSLog("// 準備編譯繁體中文核心語料檔案。")
    fileOutput(isCHS: false)
    markFinished(.cht)
    group.leave()
  }
  group.enter()
  globalQueue.async {
    NSLog("// 準備編譯簡體中文核心語料檔案。")
    fileOutput(isCHS: true)
    markFinished(.chs)
    group.leave()
  }
  // Block until every job (and the finalization run by the last one) is done.
  group.wait()
}
// Script entry: run the compiler when this file is executed.
main()