DataCompiler // Also generate factory dictionaries in plist format.

This commit is contained in:
ShikiSuen 2022-05-22 21:52:35 +08:00
parent 8fe9fb1ee1
commit 703b427ddd
1 changed file with 138 additions and 7 deletions

View File

@ -43,6 +43,20 @@ extension String {
}
}
// MARK: - StringView Ranges Extension (by Isaac Xen)
extension String {
  /// Returns the index ranges of the pieces produced by splitting the string on `separator`.
  ///
  /// Pieces are the ones `split(separator:)` yields, so empty pieces (from leading,
  /// trailing, or consecutive separators) are not represented in the result.
  /// - Parameter separator: The character on which the string is split.
  /// - Returns: One range per piece, in order of appearance, valid for indexing `self`.
  fileprivate func ranges(splitBy separator: Element) -> [Range<String.Index>] {
    var found: [Range<String.Index>] = []
    // Each search resumes after the previous hit so that repeated pieces
    // resolve to their own position instead of the first occurrence.
    var searchStart = startIndex
    for piece in split(separator: separator) {
      guard let hit = range(of: piece, range: searchStart..<endIndex) else { continue }
      found.append(hit)
      searchStart = hit.upperBound
    }
    return found
  }
}
// MARK: -
// Ref: https://stackoverflow.com/a/32581409/4162914
@ -53,7 +67,7 @@ extension Float {
}
}
// MARK: -
// MARK: -
// Ref: https://stackoverflow.com/a/41581695/4162914
precedencegroup ExponentiationPrecedence {
@ -80,6 +94,24 @@ struct Entry {
var valCount: Int = 0
}
// MARK: - Phonabet-to-ASCII key conversion (used for plist output)
/// Converts the Bopomofo (Zhuyin) symbols and tone marks in a reading key to ASCII.
///
/// Every Bopomofo symbol maps to exactly one ASCII letter and every tone mark to one
/// digit; all other characters are kept as they are.
/// - Parameter incoming: The reading key to convert.
/// - Returns: The converted key, or `incoming` unchanged when it contains an underscore.
func cnvPhonabetToASCII(_ incoming: String) -> String {
  // Keys follow the standard Bopomofo order: 21 initials, 3 medials, 13 finals, then
  // the four tone marks. Each key must be unique, because a dictionary literal with
  // duplicate keys traps at runtime.
  let dicPhonabet2ASCII: [String: String] = [
    "ㄅ": "b", "ㄆ": "p", "ㄇ": "m", "ㄈ": "f", "ㄉ": "d", "ㄊ": "t", "ㄋ": "n", "ㄌ": "l", "ㄍ": "g", "ㄎ": "k", "ㄏ": "h",
    "ㄐ": "j", "ㄑ": "q", "ㄒ": "x", "ㄓ": "Z", "ㄔ": "C", "ㄕ": "S", "ㄖ": "r", "ㄗ": "z", "ㄘ": "c", "ㄙ": "s", "ㄧ": "i",
    "ㄨ": "u", "ㄩ": "v", "ㄚ": "a", "ㄛ": "o", "ㄜ": "e", "ㄝ": "E", "ㄞ": "B", "ㄟ": "P", "ㄠ": "M", "ㄡ": "F", "ㄢ": "D",
    "ㄣ": "T", "ㄤ": "N", "ㄥ": "L", "ㄦ": "R", "ˊ": "2", "ˇ": "3", "ˋ": "4", "˙": "5",
  ]
  // Keys containing "_" are passed through untouched.
  // NOTE(review): presumably these are special non-phonetic keys — confirm against the data files.
  guard !incoming.contains("_") else { return incoming }
  // No replacement value is itself a key, so the (unordered) iteration order of the
  // dictionary cannot influence the result.
  return dicPhonabet2ASCII.reduce(incoming) { partial, pair in
    partial.replacingOccurrences(of: pair.key, with: pair.value)
  }
}
// MARK: -
// File URL of the process's current working directory, captured when this constant is initialized.
private let urlCurrentFolder = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
@ -95,12 +127,22 @@ private let urlCHTforMOE: String = "./components/cht/phrases-moe-cht.txt"
// All paths below are relative ("./…") and are therefore resolved against the
// working directory of the process.

// Source data files.
private let urlCHTforVCHEW: String = "./components/cht/phrases-vchewing-cht.txt"
private let urlKanjiCore: String = "./components/common/char-kanji-core.txt"
private let urlMiscBPMF: String = "./components/common/char-misc-bpmf.txt"
private let urlMiscNonKanji: String = "./components/common/char-misc-nonkanji.txt"
// Declared exactly once: a second `urlPunctuation` is an invalid redeclaration.
private let urlPunctuation: String = "./components/common/data-punctuations.txt"
private let urlSymbols: String = "./components/common/data-symbols.txt"
private let urlZhuyinwen: String = "./components/common/data-zhuyinwen.txt"
private let urlCNS: String = "./components/common/char-kanji-cns.txt"

// Output files for the tables built from the symbols, zhuyinwen, and CNS sources.
private let urlPlistSymbols: String = "./data-symbols.plist"
private let urlPlistZhuyinwen: String = "./data-zhuyinwen.plist"
private let urlPlistCNS: String = "./data-cns.plist"

// Per-language output files: plain-text table and its property-list counterpart.
private let urlOutputCHS: String = "./data-chs.txt"
private let urlPlistCHS: String = "./data-chs.plist"
private let urlOutputCHT: String = "./data-cht.txt"
private let urlPlistCHT: String = "./data-cht.plist"
// MARK: -
@ -382,41 +424,130 @@ func weightAndSort(_ arrStructUncalculated: [Entry], isCHS: Bool) -> [Entry] {
/// Compiles the core dictionary for one language variant and writes it out twice:
/// as a plain-text table and as a binary property list.
///
/// The text output starts with the punctuation table (tabs normalized to spaces),
/// followed by one `phone phrase weight` line per weighted entry. The property list
/// maps each key (converted through `cnvPhonabetToASCII`) to an array of UTF-8
/// encoded values: the second column for punctuation rows, `weight phrase` for
/// dictionary entries.
///
/// Failures are logged and not rethrown; a failed punctuation read leaves the
/// punctuation part of both outputs empty.
/// - Parameter isCHS: `true` for the Simplified Chinese outputs, `false` for the
///   Traditional Chinese ones.
func fileOutput(isCHS: Bool) {
  let i18n: String = isCHS ? "簡體中文" : "繁體中文"
  let pathOutput = urlCurrentFolder.appendingPathComponent(
    isCHS ? urlOutputCHS : urlOutputCHT)
  let plistURL = urlCurrentFolder.appendingPathComponent(
    isCHS ? urlPlistCHS : urlPlistCHT)
  var rangeMap: [String: [Data]] = [:]

  // Read the punctuation table once; the same normalized text feeds both outputs.
  var strPunctuation = ""
  do {
    strPunctuation = try String(contentsOfFile: urlPunctuation, encoding: .utf8)
      .replacingOccurrences(of: "\t", with: " ")
    // Report success only when the read actually succeeded.
    NSLog(" - \(i18n): 成功插入標點符號與西文字母數據txt")
  } catch {
    NSLog(" - \(i18n): Exception happened when reading raw punctuation data.")
  }
  // Append the punctuation data to the text output exactly once.
  var strPrintLine = strPunctuation

  // Punctuation rows: first column is the key, second column the value.
  // Lines starting with "#" and lines with fewer than two columns are skipped.
  for line in strPunctuation.split(separator: "\n") where line.first != "#" {
    let neta = line.split(separator: " ")
    guard neta.count >= 2 else { continue }
    rangeMap[cnvPhonabetToASCII(String(neta[0])), default: []].append(Data(neta[1].utf8))
  }

  var arrStructUnified: [Entry] = []
  arrStructUnified += rawDictForKanjis(isCHS: isCHS)
  arrStructUnified += rawDictForNonKanjis(isCHS: isCHS)
  arrStructUnified += rawDictForPhrases(isCHS: isCHS)
  // Weight and sort the merged entries before emitting them.
  arrStructUnified = weightAndSort(arrStructUnified, isCHS: isCHS)

  for entry in arrStructUnified {
    // The plist value puts the weight first; the text line puts it last.
    let theValue = String(entry.valWeight) + " " + entry.valPhrase
    rangeMap[cnvPhonabetToASCII(entry.valPhone), default: []].append(Data(theValue.utf8))
    strPrintLine +=
      entry.valPhone + " " + entry.valPhrase + " " + String(entry.valWeight)
      + "\n"
  }
  NSLog(" - \(i18n): 要寫入檔案的 txt 內容編譯完畢。")

  do {
    try strPrintLine.write(to: pathOutput, atomically: false, encoding: .utf8)
    let plistData = try PropertyListSerialization.data(
      fromPropertyList: rangeMap, format: .binary, options: 0)
    try plistData.write(to: plistURL)
    // Only claim completion when both files were written.
    NSLog(" - \(i18n): 寫入完成。")
  } catch {
    NSLog(" - \(i18n): Error on writing strings to file: \(error)")
  }
}
// MARK: -
/// Builds the three language-neutral property lists (symbols, zhuyinwen, CNS) from
/// their plain-text source tables.
///
/// In each source row the second column becomes the key (converted through
/// `cnvPhonabetToASCII`) and the first column the UTF-8 encoded value. Every file is
/// read and written independently, so one failing file does not prevent the others
/// from being processed. Failures are logged and not rethrown; a source that cannot
/// be read results in an empty property list.
func commonFileOutput() {
  let i18n = "語言中性"
  var didReadAll = true

  /// Reads one source table with tabs normalized to spaces; logs and returns "" on failure.
  func readTable(at path: String) -> String {
    do {
      return try String(contentsOfFile: path, encoding: .utf8)
        .replacingOccurrences(of: "\t", with: " ")
    } catch {
      didReadAll = false
      NSLog(" - \(i18n): Exception happened when reading raw data from \(path): \(error)")
      return ""
    }
  }

  /// Turns table text into a key-to-values map, skipping "#" lines and rows with fewer than two columns.
  func makeMap(from table: String) -> [String: [Data]] {
    var map: [String: [Data]] = [:]
    for line in table.split(separator: "\n") where line.first != "#" {
      let neta = line.split(separator: " ")
      guard neta.count >= 2 else { continue }
      // Column order is value first, key second.
      map[cnvPhonabetToASCII(String(neta[1])), default: []].append(Data(neta[0].utf8))
    }
    return map
  }

  /// Serializes `map` as a binary property list at `path`; logs and returns `false` on failure.
  func writePlist(_ map: [String: [Data]], to path: String) -> Bool {
    do {
      try PropertyListSerialization.data(fromPropertyList: map, format: .binary, options: 0)
        .write(to: URL(fileURLWithPath: path))
      return true
    } catch {
      NSLog(" - \(i18n): Error on writing strings to file: \(error)")
      return false
    }
  }

  let strSymbols = readTable(at: urlSymbols)
  let strZhuyinwen = readTable(at: urlZhuyinwen)
  let strCNS = readTable(at: urlCNS)
  if didReadAll {
    NSLog(" - \(i18n): 成功取得標點符號與西文字母原始資料plist")
  }

  let mapSymbols = makeMap(from: strSymbols)
  let mapZhuyinwen = makeMap(from: strZhuyinwen)
  let mapCNS = makeMap(from: strCNS)
  NSLog(" - \(i18n): 要寫入檔案的內容編譯完畢。")

  // The array literal evaluates every write, so a failure does not skip later files.
  let writeResults = [
    writePlist(mapSymbols, to: urlPlistSymbols),
    writePlist(mapZhuyinwen, to: urlPlistZhuyinwen),
    writePlist(mapCNS, to: urlPlistCNS),
  ]
  if !writeResults.contains(false) {
    NSLog(" - \(i18n): 寫入完成。")
  }
}
// MARK: -
func main() {
NSLog("// 準備編譯符號表情ㄅ文語料檔案。")
commonFileOutput()
NSLog("// 準備編譯繁體中文核心語料檔案。")
fileOutput(isCHS: false)
NSLog("// 準備編譯簡體中文核心語料檔案。")