IME // Introducing DataCompiler.

- We included this file in vChewing main repo for further research purposes. - The one in the data submodule is what we are actually using.
2022-02-24 23:49:09 +08:00 · 2022-02-24 23:49:09 +08:00 · 6f5ac531ee
parent 693fc9e7d1
commit 6f5ac531ee
1 changed files with 409 additions and 0 deletions
--- a/DataCompiler/dataCompiler.swift
+++ b/DataCompiler/dataCompiler.swift
@ -0,0 +1,409 @@
+// Copyright (c) 2021 and onwards The vChewing Project (MIT-NTL License).
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+1. The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+2. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor,
+   except as required to fulfill notice requirements above.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+import Foundation
+
+// MARK: - 前導工作
+fileprivate extension String {
+    mutating func regReplace(pattern: String, replaceWith: String = "") {
+        do {
+            let regex = try NSRegularExpression(pattern: pattern, options: .caseInsensitive)
+            let range = NSRange(location: 0, length: count)
+            self = regex.stringByReplacingMatches(in: self, options: [], range: range, withTemplate: replaceWith)
+        } catch { return }
+    }
+}
+
+fileprivate func getDocumentsDirectory() -> URL {
+    let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
+    return paths[0]
+}
+
+// MARK: - 引入小數點位數控制函數
+// Ref: https://stackoverflow.com/a/32581409/4162914
+fileprivate extension Float {
+    func rounded(toPlaces places:Int) -> Float {
+        let divisor = pow(10.0, Float(places))
+        return (self * divisor).rounded() / divisor
+    }
+}
+
+// MARK: - 引入幂乘函數
+// Ref: https://stackoverflow.com/a/41581695/4162914
+precedencegroup ExponentiationPrecedence {
+    associativity: right
+    higherThan: MultiplicationPrecedence
+}
+
+infix operator ** : ExponentiationPrecedence
+
+func ** (_ base: Double, _ exp: Double) -> Double {
+    return pow(base, exp)
+}
+
+func ** (_ base: Float, _ exp: Float) -> Float {
+    return pow(base, exp)
+}
+
+// MARK: - 定義檔案結構
+
+struct Entry {
+    var valPhone: String = ""
+    var valPhrase: String = ""
+    var valWeight: Float = -1.0
+    var valCount: Int = 0
+}
+
+// MARK: - 登記全局根常數變數
+
+fileprivate let urlCurrentFolder = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
+
+fileprivate let url_CHS_Custom: String = "./components/chs/phrases-custom-chs.txt"
+fileprivate let url_CHS_MCBP: String = "./components/chs/phrases-mcbp-chs.txt"
+fileprivate let url_CHS_MOE: String = "./components/chs/phrases-moe-chs.txt"
+fileprivate let url_CHS_VCHEW: String = "./components/chs/phrases-vchewing-chs.txt"
+
+fileprivate let url_CHT_Custom: String = "./components/cht/phrases-custom-cht.txt"
+fileprivate let url_CHT_MCBP: String = "./components/cht/phrases-mcbp-cht.txt"
+fileprivate let url_CHT_MOE: String = "./components/cht/phrases-moe-cht.txt"
+fileprivate let url_CHT_VCHEW: String = "./components/cht/phrases-vchewing-cht.txt"
+
+fileprivate let urlKanjiCore: String = "./components/common/char-kanji-core.txt"
+fileprivate let urlPunctuation: String = "./components/common/data-punctuations.txt"
+fileprivate let urlMiscBPMF: String = "./components/common/char-misc-bpmf.txt"
+fileprivate let urlMiscNonKanji: String = "./components/common/char-misc-nonkanji.txt"
+
+fileprivate let urlOutputCHS: String = "./data-chs.txt"
+fileprivate let urlOutputCHT: String = "./data-cht.txt"
+
+// MARK: - 載入詞組檔案且輸出數組
+
+func rawDictForPhrases(isCHS: Bool) -> [Entry] {
+    var arrEntryRAW: [Entry] = []
+    var strRAW: String = ""
+    let urlCustom: String = isCHS ? url_CHS_Custom : url_CHT_Custom
+    let urlMCBP: String = isCHS ? url_CHS_MCBP : url_CHT_MCBP
+    let urlMOE: String = isCHS ? url_CHS_MOE : url_CHT_MOE
+    let urlVCHEW: String = isCHS ? url_CHS_VCHEW : url_CHT_VCHEW
+    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
+    // 讀取內容
+    do {
+        strRAW += try String(contentsOfFile: urlCustom, encoding: .utf8)
+        strRAW += "\n"
+        strRAW += try String(contentsOfFile: urlMCBP, encoding: .utf8)
+        strRAW += "\n"
+        strRAW += try String(contentsOfFile: urlMOE, encoding: .utf8)
+        strRAW += "\n"
+        strRAW += try String(contentsOfFile: urlVCHEW, encoding: .utf8)
+    }
+    catch {
+        NSLog(" - Exception happened when reading raw phrases data.")
+        return []
+    }
+    // 預處理格式
+    strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
+    strRAW = strRAW.replacingOccurrences(of: "　", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
+    strRAW.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
+    strRAW = strRAW.replacingOccurrences(of: "\r", with: "\n") // CR to LF
+    strRAW.regReplace(pattern: " +", replaceWith: " ") // 統整連續空格為一個 ASCII 空格
+    // strRAW.regReplace(pattern: "\\n+", replaceWith: "\n") // 統整連續 LF 為一個 LF
+    // (不需要處理純空行，因為空記錄不會被轉為 Entry)
+    strRAW = strRAW.replacingOccurrences(of: " \n", with: "\n") // 去除行尾空格
+    strRAW = strRAW.replacingOccurrences(of: "\n ", with: "\n") // 去除行首空格
+    if strRAW.prefix(1) == " " { // 去除檔案開頭空格
+        strRAW.removeFirst()
+    }
+    if strRAW.suffix(1) == " " { // 去除檔案結尾空格
+        strRAW.removeLast()
+    }
+    // 正式整理格式，現在就開始去重複：
+    let arrData = Array(NSOrderedSet(array: strRAW.components(separatedBy: "\n")).array as! [String])
+    var varLineData: String = ""
+    for lineData in arrData {
+        varLineData = lineData
+        // 先完成某兩步需要分行處理才能完成的格式整理。
+        varLineData.regReplace(pattern: "^#.*$", replaceWith: "") // 以#開頭的行都淨空
+        varLineData.regReplace(pattern: "^.*#WIN32.*$", replaceWith: "") // 去掉所有 WIN32 特有的行
+        // 第三欄開始是注音
+        let arrLineData = varLineData.components(separatedBy: " ")
+        var varLineDataProcessed: String = ""
+        var count = 0
+        for currentCell in arrLineData {
+            count += 1
+            if count < 3 {
+                varLineDataProcessed += currentCell + "\t"
+            } else if count < arrLineData.count {
+                    varLineDataProcessed += currentCell + "-"
+            } else {
+                varLineDataProcessed += currentCell
+            }
+        }
+        // 然後直接乾脆就轉成 Entry 吧。
+        let arrCells : [String] = varLineDataProcessed.components(separatedBy: "\t")
+        count = 0 // 不需要再定義，因為之前已經有定義過了。
+        var phone = ""
+        var phrase = ""
+        var occurrence = 0
+        for cell in arrCells {
+            count += 1
+            switch count {
+            case 1: phrase = cell
+            case 3: phone = cell
+            case 2: occurrence = Int(cell) ?? 0
+            default: break
+            }
+        }
+        if phrase != "" { // 廢掉空數據；之後無須再這樣處理。
+            arrEntryRAW += [Entry.init(valPhone: phone, valPhrase: phrase, valWeight: 0.0, valCount: occurrence)]
+        }
+    }
+    NSLog(" - \(i18n): 成功生成詞語語料辭典（權重待計算）。")
+    return arrEntryRAW
+}
+
+// MARK: - 載入單字檔案且輸出數組
+
+func rawDictForKanjis(isCHS: Bool) -> [Entry] {
+    var arrEntryRAW: [Entry] = []
+    var strRAW: String = ""
+    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
+    // 讀取內容
+    do {
+        strRAW += try String(contentsOfFile: urlKanjiCore, encoding: .utf8)
+    }
+    catch {
+        NSLog(" - Exception happened when reading raw core kanji data.")
+        return []
+    }
+    // 預處理格式
+    strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
+    strRAW = strRAW.replacingOccurrences(of: "　", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
+    strRAW.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
+    strRAW = strRAW.replacingOccurrences(of: "\r", with: "\n") // CR to LF
+    strRAW.regReplace(pattern: " +", replaceWith: " ") // 統整連續空格為一個 ASCII 空格
+    // strRAW.regReplace(pattern: "\\n+", replaceWith: "\n") // 統整連續 LF 為一個 LF
+    // (不需要處理純空行，因為空記錄不會被轉為 Entry)
+    strRAW = strRAW.replacingOccurrences(of: " \n", with: "\n") // 去除行尾空格
+    strRAW = strRAW.replacingOccurrences(of: "\n ", with: "\n") // 去除行首空格
+    if strRAW.prefix(1) == " " { // 去除檔案開頭空格
+        strRAW.removeFirst()
+    }
+    if strRAW.suffix(1) == " " { // 去除檔案結尾空格
+        strRAW.removeLast()
+    }
+    // 正式整理格式，現在就開始去重複：
+    let arrData = Array(NSOrderedSet(array: strRAW.components(separatedBy: "\n")).array as! [String])
+    var varLineData: String = ""
+    for lineData in arrData {
+        varLineData = lineData
+        // 先完成某兩步需要分行處理才能完成的格式整理。
+        varLineData.regReplace(pattern: "^#.*$", replaceWith: "") // 以#開頭的行都淨空
+        varLineData.regReplace(pattern: "^.*#WIN32.*$", replaceWith: "") // 去掉所有 WIN32 特有的行
+        // 簡體中文的話，提取 1,2,4；繁體中文的話，提取 1,3,4。
+        let varLineDataPre = varLineData.components(separatedBy: " ").prefix(isCHS ? 2 : 1).joined(separator: "\t")
+        let varLineDataPost = varLineData.components(separatedBy: " ").suffix(isCHS ? 1 : 2).joined(separator: "\t")
+        varLineData = varLineDataPre + "\t" + varLineDataPost
+        let arrLineData = varLineData.components(separatedBy: " ")
+        var varLineDataProcessed: String = ""
+        var count = 0
+        for currentCell in arrLineData {
+            count += 1
+            if count < 3 {
+                varLineDataProcessed += currentCell + "\t"
+            } else if count < arrLineData.count {
+                    varLineDataProcessed += currentCell + "-"
+            } else {
+                varLineDataProcessed += currentCell
+            }
+        }
+        // 然後直接乾脆就轉成 Entry 吧。
+        let arrCells : [String] = varLineDataProcessed.components(separatedBy: "\t")
+        count = 0 // 不需要再定義，因為之前已經有定義過了。
+        var phone = ""
+        var phrase = ""
+        var occurrence = 0
+        for cell in arrCells {
+            count += 1
+            switch count {
+            case 1: phrase = cell
+            case 3: phone = cell
+            case 2: occurrence = Int(cell) ?? 0
+            default: break
+            }
+        }
+        if phrase != "" { // 廢掉空數據；之後無須再這樣處理。
+            arrEntryRAW += [Entry.init(valPhone: phone, valPhrase: phrase, valWeight: 0.0, valCount: occurrence)]
+        }
+    }
+    NSLog(" - \(i18n): 成功生成單字語料辭典（權重待計算）。")
+    return arrEntryRAW
+}
+
+// MARK: - 載入非漢字檔案且輸出數組
+
+func rawDictForNonKanjis(isCHS: Bool) -> [Entry] {
+    var arrEntryRAW: [Entry] = []
+    var strRAW: String = ""
+    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
+    // 讀取內容
+    do {
+        strRAW += try String(contentsOfFile: urlMiscBPMF, encoding: .utf8)
+        strRAW += "\n"
+        strRAW += try String(contentsOfFile: urlMiscNonKanji, encoding: .utf8)
+    }
+    catch {
+        NSLog(" - Exception happened when reading raw core kanji data.")
+        return []
+    }
+    // 預處理格式
+    strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
+    strRAW = strRAW.replacingOccurrences(of: "　", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
+    strRAW = strRAW.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
+    strRAW.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
+    strRAW = strRAW.replacingOccurrences(of: "\r", with: "\n") // CR to LF
+    strRAW.regReplace(pattern: " +", replaceWith: " ") // 統整連續空格為一個 ASCII 空格
+    // strRAW.regReplace(pattern: "\\n+", replaceWith: "\n") // 統整連續 LF 為一個 LF
+    // (不需要處理純空行，因為空記錄不會被轉為 Entry)
+    strRAW = strRAW.replacingOccurrences(of: " \n", with: "\n") // 去除行尾空格
+    strRAW = strRAW.replacingOccurrences(of: "\n ", with: "\n") // 去除行首空格
+    if strRAW.prefix(1) == " " { // 去除檔案開頭空格
+        strRAW.removeFirst()
+    }
+    if strRAW.suffix(1) == " " { // 去除檔案結尾空格
+        strRAW.removeLast()
+    }
+    // 正式整理格式，現在就開始去重複：
+    let arrData = Array(NSOrderedSet(array: strRAW.components(separatedBy: "\n")).array as! [String])
+    var varLineData: String = ""
+    for lineData in arrData {
+        varLineData = lineData
+        // 先完成某兩步需要分行處理才能完成的格式整理。
+        varLineData.regReplace(pattern: "^#.*$", replaceWith: "") // 以#開頭的行都淨空
+        varLineData.regReplace(pattern: "^.*#WIN32.*$", replaceWith: "") // 去掉所有 WIN32 特有的行
+        varLineData = varLineData.components(separatedBy: " ").prefix(3).joined(separator: "\t") // 提取前三欄的內容。
+        let arrLineData = varLineData.components(separatedBy: " ")
+        var varLineDataProcessed: String = ""
+        var count = 0
+        for currentCell in arrLineData {
+            count += 1
+            if count < 3 {
+                varLineDataProcessed += currentCell + "\t"
+            } else if count < arrLineData.count {
+                    varLineDataProcessed += currentCell + "-"
+            } else {
+                varLineDataProcessed += currentCell
+            }
+        }
+        // 然後直接乾脆就轉成 Entry 吧。
+        let arrCells : [String] = varLineDataProcessed.components(separatedBy: "\t")
+        count = 0 // 不需要再定義，因為之前已經有定義過了。
+        var phone = ""
+        var phrase = ""
+        var occurrence = 0
+        for cell in arrCells {
+            count += 1
+            switch count {
+            case 1: phrase = cell
+            case 3: phone = cell
+            case 2: occurrence = Int(cell) ?? 0
+            default: break
+            }
+        }
+        if phrase != "" { // 廢掉空數據；之後無須再這樣處理。
+            arrEntryRAW += [Entry.init(valPhone: phone, valPhrase: phrase, valWeight: 0.0, valCount: occurrence)]
+        }
+    }
+    NSLog(" - \(i18n): 成功生成非漢字語料辭典（權重待計算）。")
+    return arrEntryRAW
+}
+
+func weightAndSort(_ arrStructUncalculated: [Entry], isCHS: Bool) -> [Entry] {
+    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
+    var arrStructCalculated: [Entry] = []
+    let fscale: Float = 2.7
+    var norm: Float = 0.0
+    for entry in arrStructUncalculated {
+        norm += fscale**(Float(entry.valPhrase.count) / 3.0 - 1.0) * Float(entry.valCount) // Credit: MJHsieh.
+    }
+    // norm 計算完畢，開始將 norm 作為新的固定常數來為每個詞條記錄計算權重。
+    // 將新酷音的詞語出現次數數據轉換成小麥引擎可讀的數據形式。
+    // 對出現次數小於 1 的詞條，將 0 當成 0.5 來處理、以防止除零。
+    // 統計公式著作權歸 MJHsieh 所有（MIT License）。
+    for entry in arrStructUncalculated {
+        let weight: Float = (entry.valCount < 1) ?
+            log10(fscale**(Float(entry.valPhrase.count) / 3.0 - 1.0) * 0.5 / norm) // Credit: MJHsieh.
+        :
+            log10(fscale**(Float(entry.valPhrase.count) / 3.0 - 1.0) * Float(entry.valCount) / norm) // Credit: MJHsieh.
+        let weightRounded: Float = weight.rounded(toPlaces: 3) // 為了節省生成的檔案體積，僅保留小數點後三位。
+        arrStructCalculated += [Entry.init(valPhone: entry.valPhone, valPhrase: entry.valPhrase, valWeight: weightRounded, valCount: entry.valCount)]
+    }
+    NSLog(" - \(i18n): 成功計算權重。")
+    // ==========================================
+    // 接下來是排序，先按照注音遞減排序一遍、再按照權重遞減排序一遍。
+    let arrStructSorted: [Entry] = arrStructCalculated.sorted(by: {(lhs, rhs) -> Bool in return (lhs.valPhone, rhs.valCount) < (rhs.valPhone, lhs.valCount)})
+    NSLog(" - \(i18n): 排序整理完畢，準備編譯要寫入的檔案內容。")
+    return arrStructSorted
+}
+
+func fileOutput(isCHS: Bool) {
+    let i18n: String = isCHS ? "簡體中文" : "繁體中文"
+    let pathOutput = urlCurrentFolder.appendingPathComponent(isCHS ? urlOutputCHS : urlOutputCHT)
+    var strPrintLine = ""
+    // 讀取標點內容
+    do {
+        strPrintLine += try String(contentsOfFile: urlPunctuation, encoding: .utf8)
+    }
+    catch {
+        NSLog(" - \(i18n): Exception happened when reading raw punctuation data.")
+    }
+    NSLog(" - \(i18n): 成功插入標點符號與西文字母數據。")
+    // 統合辭典內容
+    var arrStructUnified: [Entry] = []
+    arrStructUnified += rawDictForKanjis(isCHS: isCHS)
+    arrStructUnified += rawDictForNonKanjis(isCHS: isCHS)
+    arrStructUnified += rawDictForPhrases(isCHS: isCHS)
+    // 計算權重且排序
+    arrStructUnified = weightAndSort(arrStructUnified, isCHS: isCHS)
+    
+    for entry in arrStructUnified {
+        strPrintLine += entry.valPhone + " " + entry.valPhrase + " " + String(entry.valWeight) + "\n"
+    }
+    NSLog(" - \(i18n): 要寫入檔案的內容編譯完畢。")
+    do {
+        try strPrintLine.write(to: pathOutput, atomically: false, encoding: .utf8)
+    }
+    catch {
+        NSLog(" - \(i18n): Error on writing strings to file: \(error)")
+    }
+    NSLog(" - \(i18n): 寫入完成。")
+}
+
+// MARK: - 主执行绪
+func main() {
+    NSLog("// 準備編譯繁體中文核心語料檔案。")
+    fileOutput(isCHS: false)
+    NSLog("// 準備編譯簡體中文核心語料檔案。")
+    fileOutput(isCHS: true)
+}
+
+main()