From dd23a5f18aa67a7555f6f0a808b2d22d950a9ce2 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Thu, 24 Feb 2022 23:49:09 +0800 Subject: [PATCH] IME // Introducing DataCompiler. - We included this file in vChewing main repo for further research purposes. - The one in the data submodule is what we are actually using. --- DataCompiler/dataCompiler.swift | 409 ++++++++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 DataCompiler/dataCompiler.swift diff --git a/DataCompiler/dataCompiler.swift b/DataCompiler/dataCompiler.swift new file mode 100644 index 00000000..7565a568 --- /dev/null +++ b/DataCompiler/dataCompiler.swift @@ -0,0 +1,409 @@ +// Copyright (c) 2021 and onwards The vChewing Project (MIT-NTL License). +/* +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +1. The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +2. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, + except as required to fulfill notice requirements above. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
// End of the license header that opens on the preceding line: */

import Foundation

// MARK: - Preliminaries

fileprivate extension String {
    /// Replaces every match of a regular expression in this string, in place.
    ///
    /// Matching is case-insensitive. When `pattern` is not a valid regular expression the
    /// string is left untouched and the problem is logged.
    ///
    /// - Parameters:
    ///   - pattern: An `NSRegularExpression` pattern.
    ///   - replaceWith: The replacement template. Defaults to the empty string.
    mutating func regReplace(pattern: String, replaceWith: String = "") {
        do {
            let regex = try NSRegularExpression(pattern: pattern, options: .caseInsensitive)
            // NSRange is measured in UTF-16 code units. `count` (grapheme clusters) is shorter
            // than that whenever the text contains CRLF pairs or non-BMP characters, which
            // used to leave the tail of the string unprocessed.
            let range = NSRange(location: 0, length: utf16.count)
            self = regex.stringByReplacingMatches(in: self, options: [], range: range, withTemplate: replaceWith)
        } catch {
            NSLog(" - Invalid regular expression \"\(pattern)\": \(error)")
        }
    }
}

/// Returns the first URL reported for the current user's documents directory.
fileprivate func getDocumentsDirectory() -> URL {
    let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
    return paths[0]
}

// MARK: - Decimal-place rounding helper
// Ref: https://stackoverflow.com/a/32581409/4162914
fileprivate extension Float {
    /// Rounds the value to the given number of decimal places.
    ///
    /// - Parameter places: Number of digits to keep after the decimal point.
    /// - Returns: The value scaled by 10^places, rounded, and scaled back.
    func rounded(toPlaces places: Int) -> Float {
        let divisor = pow(10.0, Float(places))
        return (self * divisor).rounded() / divisor
    }
}

// MARK: - Exponentiation operator
// Ref: https://stackoverflow.com/a/41581695/4162914
precedencegroup ExponentiationPrecedence {
    associativity: right
    higherThan: MultiplicationPrecedence
}

infix operator ** : ExponentiationPrecedence

/// Raises `base` to the power `exp`; right-associative and binds tighter than multiplication.
func ** (_ base: Double, _ exp: Double) -> Double {
    return pow(base, exp)
}

/// Raises `base` to the power `exp`; right-associative and binds tighter than multiplication.
func ** (_ base: Float, _ exp: Float) -> Float {
    return pow(base, exp)
}

// MARK: - Record structure

/// One dictionary record handled by the compiler.
struct Entry {
    /// Phonetic reading of the phrase.
    var valPhone: String = ""
    /// The phrase (or single character) itself.
    var valPhrase: String = ""
    /// Computed weight; `-1.0` until a weight has been assigned.
    var valWeight: Float = -1.0
    /// Occurrence count read from the raw corpus.
    var valCount: Int = 0
}

// MARK: - File-level constants

/// Working directory of the process; the output files are written relative to it.
fileprivate let urlCurrentFolder = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)

// Simplified Chinese phrase sources.
fileprivate let url_CHS_Custom: String = "./components/chs/phrases-custom-chs.txt"
fileprivate let url_CHS_MCBP: String = "./components/chs/phrases-mcbp-chs.txt"
fileprivate let url_CHS_MOE: String = "./components/chs/phrases-moe-chs.txt"
fileprivate let url_CHS_VCHEW: String = "./components/chs/phrases-vchewing-chs.txt"

// Traditional Chinese phrase sources.
fileprivate let url_CHT_Custom: String = "./components/cht/phrases-custom-cht.txt"
fileprivate let url_CHT_MCBP: String = "./components/cht/phrases-mcbp-cht.txt"
fileprivate let url_CHT_MOE: String = "./components/cht/phrases-moe-cht.txt"
fileprivate let url_CHT_VCHEW: String = "./components/cht/phrases-vchewing-cht.txt"

// Sources shared by both language variants.
fileprivate let urlKanjiCore: String = "./components/common/char-kanji-core.txt"
fileprivate let urlPunctuation: String = "./components/common/data-punctuations.txt"
fileprivate let urlMiscBPMF: String = "./components/common/char-misc-bpmf.txt"
fileprivate let urlMiscNonKanji: String = "./components/common/char-misc-nonkanji.txt"

// Output files, relative to the working directory.
fileprivate let urlOutputCHS: String = "./data-chs.txt"
fileprivate let urlOutputCHT: String = "./data-cht.txt"

// MARK: - Shared corpus parsing helpers

/// Reads the given files, joins them with line feeds, normalizes whitespace, and returns
/// the distinct lines in first-seen order.
///
/// - Parameter paths: Files to read as UTF-8, in order.
/// - Returns: The normalized, de-duplicated lines (blank and comment lines are still present).
/// - Throws: Whatever `String(contentsOfFile:encoding:)` throws for the first unreadable file.
fileprivate func loadNormalizedLines(fromFilesAt paths: [String]) throws -> [String] {
    var rawText = try paths
        .map { try String(contentsOfFile: $0, encoding: .utf8) }
        .joined(separator: "\n")

    rawText = rawText.replacingOccurrences(of: " #MACOS", with: "") // Drop the macOS marker.
    // Ideographic space (U+3000), no-break space (U+00A0) and tab all become an ASCII space.
    // The first two are written as escapes so the invisible characters cannot be lost.
    for oddSpace in ["\u{3000}", "\u{A0}", "\t"] {
        rawText = rawText.replacingOccurrences(of: oddSpace, with: " ")
    }
    // Form feed and carriage return both become a line feed.
    for lineBreak in ["\u{0C}", "\r"] {
        rawText = rawText.replacingOccurrences(of: lineBreak, with: "\n")
    }
    // Collapse runs of spaces. Foundation derives the search range itself here, so the whole
    // text is covered even when it contains characters outside the Basic Multilingual Plane.
    rawText = rawText.replacingOccurrences(of: " +", with: " ", options: .regularExpression)
    // Runs of line feeds are intentionally kept: blank lines never become entries.
    rawText = rawText.replacingOccurrences(of: " \n", with: "\n") // Trailing space of a line.
    rawText = rawText.replacingOccurrences(of: "\n ", with: "\n") // Leading space of a line.
    if rawText.first == " " { // Leading space of the whole text.
        rawText.removeFirst()
    }
    if rawText.last == " " { // Trailing space of the whole text.
        rawText.removeLast()
    }

    let lines = rawText.components(separatedBy: "\n")
    // NSOrderedSet keeps first-seen order and compares strings literally. A Swift Set would
    // additionally merge canonically equivalent strings (e.g. compatibility ideographs),
    // which would silently drop records.
    return (NSOrderedSet(array: lines).array as? [String]) ?? lines
}

/// Parses the records found in the given files into entries whose weight is still `0.0`.
///
/// Lines that start with `#`, or that contain `#WIN32` in any letter case, are skipped, as are
/// records whose phrase is empty. Each remaining line is split on single spaces and handed to
/// `pick`, which must return the columns in the order phrase, occurrence count, reading
/// syllables. Syllables are joined with `-`; a missing or non-numeric count becomes `0`.
///
/// - Parameters:
///   - paths: Files to read as UTF-8, in order.
///   - pick: Selects and orders the columns of one line.
/// - Returns: The parsed entries in file order.
/// - Throws: Any error raised while reading the files.
fileprivate func parseCorpus(filesAt paths: [String], pickingColumns pick: ([String]) -> [String]) throws -> [Entry] {
    var entries: [Entry] = []
    for line in try loadNormalizedLines(fromFilesAt: paths) {
        if line.unicodeScalars.first == "#" { continue } // Comment line.
        if line.range(of: "#WIN32", options: .caseInsensitive) != nil { continue } // Windows-only record.
        let columns = pick(line.components(separatedBy: " "))
        guard let phrase = columns.first, !phrase.isEmpty else { continue }
        let occurrence = columns.count > 1 ? (Int(columns[1]) ?? 0) : 0
        let phone = columns.dropFirst(2).joined(separator: "-")
        entries.append(Entry(valPhone: phone, valPhrase: phrase, valWeight: 0.0, valCount: occurrence))
    }
    return entries
}

// MARK: - Load the phrase files

/// Loads the four phrase corpora of the requested language variant.
///
/// Every column from the third one onwards is a reading syllable.
///
/// - Parameter isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese.
/// - Returns: The parsed entries with weight `0.0`, or an empty array when a file cannot be read.
func rawDictForPhrases(isCHS: Bool) -> [Entry] {
    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
    let paths = isCHS
        ? [url_CHS_Custom, url_CHS_MCBP, url_CHS_MOE, url_CHS_VCHEW]
        : [url_CHT_Custom, url_CHT_MCBP, url_CHT_MOE, url_CHT_VCHEW]
    do {
        let entries = try parseCorpus(filesAt: paths) { $0 }
        NSLog(" - \(i18n): 成功生成詞語語料辭典(權重待計算)。")
        return entries
    } catch {
        NSLog(" - Exception happened when reading raw phrases data: \(error)")
        return []
    }
}

// MARK: - Load the single-character file

/// Loads the core single-character corpus.
///
/// For a four-column record, Simplified Chinese uses columns 1, 2 and 4, while Traditional
/// Chinese uses columns 1, 3 and 4.
///
/// - Parameter isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese.
/// - Returns: The parsed entries with weight `0.0`, or an empty array when the file cannot be read.
func rawDictForKanjis(isCHS: Bool) -> [Entry] {
    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
    let leadingCount = isCHS ? 2 : 1
    let trailingCount = isCHS ? 1 : 2
    do {
        let entries = try parseCorpus(filesAt: [urlKanjiCore]) { cells in
            Array(cells.prefix(leadingCount)) + Array(cells.suffix(trailingCount))
        }
        NSLog(" - \(i18n): 成功生成單字語料辭典(權重待計算)。")
        return entries
    } catch {
        NSLog(" - Exception happened when reading raw core kanji data: \(error)")
        return []
    }
}

// MARK: - Load the non-kanji files

/// Loads the miscellaneous Bopomofo and non-kanji corpora; only the first three columns are used.
///
/// - Parameter isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese
///   (only affects the log message).
/// - Returns: The parsed entries with weight `0.0`, or an empty array when a file cannot be read.
func rawDictForNonKanjis(isCHS: Bool) -> [Entry] {
    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
    do {
        let entries = try parseCorpus(filesAt: [urlMiscBPMF, urlMiscNonKanji]) { Array($0.prefix(3)) }
        NSLog(" - \(i18n): 成功生成非漢字語料辭典(權重待計算)。")
        return entries
    } catch {
        NSLog(" - Exception happened when reading raw non-kanji data: \(error)")
        return []
    }
}

// MARK: - Weighting and sorting

/// Computes the weight of every entry and returns the entries sorted for output.
///
/// The weight is `log10(fscale^(length / 3 - 1) * count / norm)`, rounded to three decimal
/// places, where `norm` is the sum of the un-logged numerators over all entries and a count
/// below 1 is treated as 0.5. The formula is credited to MJHsieh (MIT License); it turns
/// raw occurrence counts into the weights stored in the output file.
///
/// - Parameters:
///   - arrStructUncalculated: Entries whose weights have not been computed yet.
///   - isCHS: `true` for Simplified Chinese, `false` for Traditional Chinese
///     (only affects the log messages).
/// - Returns: The weighted entries, ascending by reading and, within one reading, descending
///   by occurrence count.
func weightAndSort(_ arrStructUncalculated: [Entry], isCHS: Bool) -> [Entry] {
    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
    let fscale: Float = 2.7

    // Length-dependent factor shared by the normalization constant and each weight.
    func lengthFactor(of entry: Entry) -> Float {
        return pow(fscale, Float(entry.valPhrase.count) / 3.0 - 1.0)
    }

    var norm: Float = 0.0
    for entry in arrStructUncalculated {
        norm += lengthFactor(of: entry) * Float(entry.valCount)
    }

    // `norm` is now a fixed constant used to weigh every record.
    let arrStructCalculated: [Entry] = arrStructUncalculated.map { entry in
        // A count below 1 is treated as 0.5 so that the logarithm stays finite.
        let effectiveCount: Float = entry.valCount < 1 ? 0.5 : Float(entry.valCount)
        let weight: Float = log10(lengthFactor(of: entry) * effectiveCount / norm)
        // Three decimal places are kept to limit the size of the generated file.
        return Entry(
            valPhone: entry.valPhone,
            valPhrase: entry.valPhrase,
            valWeight: weight.rounded(toPlaces: 3),
            valCount: entry.valCount
        )
    }
    NSLog(" - \(i18n): 成功計算權重。")

    // Ascending by reading; ties are broken by descending occurrence count
    // (note the swapped `valCount` operands).
    let arrStructSorted: [Entry] = arrStructCalculated.sorted { lhs, rhs in
        (lhs.valPhone, rhs.valCount) < (rhs.valPhone, lhs.valCount)
    }
    NSLog(" - \(i18n): 排序整理完畢,準備編譯要寫入的檔案內容。")
    return arrStructSorted
}

// MARK: - Output

/// Compiles the dictionary of one language variant and writes it to the working directory.
///
/// The output starts with the punctuation data, followed by one `reading phrase weight` line
/// per entry. A failure to read the punctuation file is logged and the dictionary is still
/// written without it; a failure to write the file is logged.
///
/// - Parameter isCHS: `true` writes `data-chs.txt`, `false` writes `data-cht.txt`.
func fileOutput(isCHS: Bool) {
    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
    let pathOutput = urlCurrentFolder.appendingPathComponent(isCHS ? urlOutputCHS : urlOutputCHT)
    var strPrintLine = ""

    // Punctuation data goes first.
    do {
        strPrintLine += try String(contentsOfFile: urlPunctuation, encoding: .utf8)
        // Without a final line feed the first dictionary record would be glued onto the
        // last punctuation line.
        if let lastScalar = strPrintLine.unicodeScalars.last, lastScalar != "\n" {
            strPrintLine += "\n"
        }
        NSLog(" - \(i18n): 成功插入標點符號與西文字母數據。")
    } catch {
        NSLog(" - \(i18n): Exception happened when reading raw punctuation data.")
    }

    // Merge all dictionaries, then weigh and sort them.
    var arrStructUnified: [Entry] = []
    arrStructUnified += rawDictForKanjis(isCHS: isCHS)
    arrStructUnified += rawDictForNonKanjis(isCHS: isCHS)
    arrStructUnified += rawDictForPhrases(isCHS: isCHS)
    arrStructUnified = weightAndSort(arrStructUnified, isCHS: isCHS)

    for entry in arrStructUnified {
        strPrintLine += entry.valPhone + " " + entry.valPhrase + " " + String(entry.valWeight) + "\n"
    }
    NSLog(" - \(i18n): 要寫入檔案的內容編譯完畢。")

    do {
        try strPrintLine.write(to: pathOutput, atomically: false, encoding: .utf8)
        // Report completion only when the write actually succeeded.
        NSLog(" - \(i18n): 寫入完成。")
    } catch {
        NSLog(" - \(i18n): Error on writing strings to file: \(error)")
    }
}

// MARK: - Entry point

/// Compiles the Traditional Chinese data file first, then the Simplified Chinese one.
func main() {
    NSLog("// 準備編譯繁體中文核心語料檔案。")
    fileOutput(isCHS: false)
    NSLog("// 準備編譯簡體中文核心語料檔案。")
    fileOutput(isCHS: true)
}

main()