MACV // Add advanced debug abilities.
This commit is contained in:
parent
c80ce1f54d
commit
7a9d3f8e89
|
@ -68,12 +68,30 @@ func ** (_ base: Double, _ exp: Double) -> Double {
|
||||||
// MARK: - 定義檔案結構
|
// MARK: - 定義檔案結構
|
||||||
|
|
||||||
/// One dictionary entry: a key, the text stored under it, and the numbers
/// (score and occurrence count) attached to that pair.
///
/// The builder functions in this file fill `key` with the phonetic reading and
/// `value` with the phrase; `healthCheck` splits `key` on "-" to compare it
/// with `value` one character at a time.
struct Unigram: CustomStringConvertible {
  /// Tag recording which data source an entry was read from, so that debug
  /// reports can point back at the file that needs fixing.
  ///
  /// The enum conforms to `CustomStringConvertible` on purpose: without the
  /// conformance, string interpolation ignores `description` and prints the
  /// case name ("macv") instead of the four-letter tag ("MACV").
  enum UnigramCategory: String, CustomStringConvertible {
    /// Entries read from the file at `urlVCHEW`.
    case macv = "MACV"
    /// Entries read from the TABE phrase file.
    case tabe = "TABE"
    /// Entries read from the MOE phrase file.
    case moe = "MOED"
    /// Entries read from the custom phrase file.
    case custom = "CUST"
    /// Everything else: single-character and non-kanji entries, plus the
    /// `__NORM__` marker appended after sorting.
    case misc = "MISC"

    /// The four-letter tag, i.e. the raw value of the case.
    var description: String { rawValue }
  }

  /// Creates an entry with every field given explicitly.
  /// - Parameters:
  ///   - key: Lookup key of the entry (the reading).
  ///   - value: Text stored under `key`.
  ///   - score: Weight of the entry; callers pass `0.0` until it is calculated.
  ///   - count: Occurrence count taken from the raw data.
  ///   - category: Source the entry was read from.
  init(key: String, value: String, score: Double, count: Int, category: Unigram.UnigramCategory) {
    self.key = key
    self.value = value
    self.score = score
    self.count = count
    self.category = category
  }

  /// Lookup key of the entry.
  var key: String = ""
  /// Text stored under `key`.
  var value: String = ""
  /// Weight of the entry.
  var score: Double = -1.0
  /// Occurrence count taken from the raw data.
  var count: Int = 0
  /// Source the entry was read from.
  var category: UnigramCategory

  /// Debug form: `(key, value, score, CATEGORY)`, with balanced parentheses.
  var description: String {
    "(\(key), \(value), \(score), \(category))"
  }
}
|
||||||
|
|
||||||
|
@ -143,7 +161,7 @@ private var exceptedChars: Set<String> = .init()
|
||||||
|
|
||||||
func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
|
func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
|
||||||
var arrUnigramRAW: [Unigram] = []
|
var arrUnigramRAW: [Unigram] = []
|
||||||
var strRAW = ""
|
var strRAWOrig: [String] = []
|
||||||
let urlCustom: String = isCHS ? urlCHSforCustom : urlCHTforCustom
|
let urlCustom: String = isCHS ? urlCHSforCustom : urlCHTforCustom
|
||||||
let urlTABE: String = isCHS ? urlCHSforTABE : urlCHTforTABE
|
let urlTABE: String = isCHS ? urlCHSforTABE : urlCHTforTABE
|
||||||
let urlMOE: String = isCHS ? urlCHSforMOE : urlCHTforMOE
|
let urlMOE: String = isCHS ? urlCHSforMOE : urlCHTforMOE
|
||||||
|
@ -151,17 +169,20 @@ func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
|
||||||
let i18n: String = isCHS ? "簡體中文" : "繁體中文"
|
let i18n: String = isCHS ? "簡體中文" : "繁體中文"
|
||||||
// 讀取內容
|
// 讀取內容
|
||||||
do {
|
do {
|
||||||
strRAW += try String(contentsOfFile: urlCustom, encoding: .utf8)
|
let str1 = try String(contentsOfFile: urlCustom, encoding: .utf8)
|
||||||
strRAW += "\n"
|
let str2 = try String(contentsOfFile: urlTABE, encoding: .utf8)
|
||||||
strRAW += try String(contentsOfFile: urlTABE, encoding: .utf8)
|
let str3 = try String(contentsOfFile: urlMOE, encoding: .utf8)
|
||||||
strRAW += "\n"
|
let str4 = try String(contentsOfFile: urlVCHEW, encoding: .utf8)
|
||||||
strRAW += try String(contentsOfFile: urlMOE, encoding: .utf8)
|
strRAWOrig.append(str1)
|
||||||
strRAW += "\n"
|
strRAWOrig.append(str2)
|
||||||
strRAW += try String(contentsOfFile: urlVCHEW, encoding: .utf8)
|
strRAWOrig.append(str3)
|
||||||
|
strRAWOrig.append(str4)
|
||||||
} catch {
|
} catch {
|
||||||
NSLog(" - Exception happened when reading raw phrases data.")
|
NSLog(" - Exception happened when reading raw phrases data.")
|
||||||
return []
|
return []
|
||||||
}
|
}
|
||||||
|
for i in 0 ..< strRAWOrig.count {
|
||||||
|
var strRAW = strRAWOrig[i]
|
||||||
// 預處理格式
|
// 預處理格式
|
||||||
strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
|
strRAW = strRAW.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
|
||||||
// CJKWhiteSpace (\x{3000}) to ASCII Space
|
// CJKWhiteSpace (\x{3000}) to ASCII Space
|
||||||
|
@ -172,10 +193,20 @@ func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
|
||||||
strRAW.regReplace(pattern: #"(^ | $)"#, replaceWith: "") // 去除行尾行首空格
|
strRAW.regReplace(pattern: #"(^ | $)"#, replaceWith: "") // 去除行尾行首空格
|
||||||
strRAW.regReplace(pattern: #"(\f+|\r+|\n+)+"#, replaceWith: "\n") // CR & Form Feed to LF, 且去除重複行
|
strRAW.regReplace(pattern: #"(\f+|\r+|\n+)+"#, replaceWith: "\n") // CR & Form Feed to LF, 且去除重複行
|
||||||
strRAW.regReplace(pattern: #"^(#.*|.*#WIN32.*)$"#, replaceWith: "") // 以#開頭的行都淨空+去掉所有 WIN32 特有的行
|
strRAW.regReplace(pattern: #"^(#.*|.*#WIN32.*)$"#, replaceWith: "") // 以#開頭的行都淨空+去掉所有 WIN32 特有的行
|
||||||
// 正式整理格式,現在就開始去重複:
|
strRAWOrig[i] = strRAW
|
||||||
let arrData = Array(
|
|
||||||
NSOrderedSet(array: strRAW.components(separatedBy: "\n")).array as! [String])
|
let currentCategory: Unigram.UnigramCategory = {
|
||||||
for lineData in arrData {
|
switch i {
|
||||||
|
case 0: return .custom
|
||||||
|
case 1: return .tabe
|
||||||
|
case 2: return .moe
|
||||||
|
case 3: return .macv
|
||||||
|
default: return .custom
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
var lineData = ""
|
||||||
|
for lineNeta in strRAW.split(separator: "\n") {
|
||||||
|
lineData = lineNeta.description
|
||||||
// 第三欄開始是注音
|
// 第三欄開始是注音
|
||||||
let arrLineData = lineData.components(separatedBy: " ")
|
let arrLineData = lineData.components(separatedBy: " ")
|
||||||
var varLineDataProcessed = ""
|
var varLineDataProcessed = ""
|
||||||
|
@ -209,11 +240,13 @@ func rawDictForPhrases(isCHS: Bool) -> [Unigram] {
|
||||||
arrUnigramRAW += [
|
arrUnigramRAW += [
|
||||||
Unigram(
|
Unigram(
|
||||||
key: phone, value: phrase, score: 0.0,
|
key: phone, value: phrase, score: 0.0,
|
||||||
count: occurrence
|
count: occurrence, category: currentCategory
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
NSLog(" - \(i18n): 成功生成詞語語料辭典(權重待計算)。")
|
NSLog(" - \(i18n): 成功生成詞語語料辭典(權重待計算)。")
|
||||||
return arrUnigramRAW
|
return arrUnigramRAW
|
||||||
}
|
}
|
||||||
|
@ -292,7 +325,7 @@ func rawDictForKanjis(isCHS: Bool) -> [Unigram] {
|
||||||
arrUnigramRAW += [
|
arrUnigramRAW += [
|
||||||
Unigram(
|
Unigram(
|
||||||
key: phone, value: phrase, score: 0.0,
|
key: phone, value: phrase, score: 0.0,
|
||||||
count: occurrence
|
count: occurrence, category: .misc
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -378,7 +411,7 @@ func rawDictForNonKanjis(isCHS: Bool) -> [Unigram] {
|
||||||
arrUnigramRAW += [
|
arrUnigramRAW += [
|
||||||
Unigram(
|
Unigram(
|
||||||
key: phone, value: phrase, score: 0.0,
|
key: phone, value: phrase, score: 0.0,
|
||||||
count: occurrence
|
count: occurrence, category: .misc
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -421,7 +454,7 @@ func weightAndSort(_ arrStructUncalculated: [Unigram], isCHS: Bool) -> [Unigram]
|
||||||
arrStructCalculated += [
|
arrStructCalculated += [
|
||||||
Unigram(
|
Unigram(
|
||||||
key: unigram.key, value: unigram.value, score: weightRounded,
|
key: unigram.key, value: unigram.value, score: weightRounded,
|
||||||
count: unigram.count
|
count: unigram.count, category: unigram.category
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -432,7 +465,7 @@ func weightAndSort(_ arrStructUncalculated: [Unigram], isCHS: Bool) -> [Unigram]
|
||||||
(lhs.key, rhs.count) < (rhs.key, lhs.count)
|
(lhs.key, rhs.count) < (rhs.key, lhs.count)
|
||||||
})
|
})
|
||||||
NSLog(" - \(i18n): 排序整理完畢,準備編譯要寫入的檔案內容。")
|
NSLog(" - \(i18n): 排序整理完畢,準備編譯要寫入的檔案內容。")
|
||||||
arrStructSorted.append(Unigram(key: "__NORM__", value: norm.description, score: 0, count: 0))
|
arrStructSorted.append(Unigram(key: "__NORM__", value: norm.description, score: 0, count: 0, category: .misc))
|
||||||
return arrStructSorted
|
return arrStructSorted
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -690,7 +723,7 @@ func healthCheck(_ data: [Unigram]) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var faulty = [Unigram]()
|
var faulty = [[String]: [Unigram]]()
|
||||||
var indifferents: [(String, String, Double, [Unigram], Double)] = []
|
var indifferents: [(String, String, Double, [Unigram], Double)] = []
|
||||||
var insufficients: [(String, String, Double, [Unigram], Double)] = []
|
var insufficients: [(String, String, Double, [Unigram], Double)] = []
|
||||||
var competingUnigrams = [(String, Double, String, Double)]()
|
var competingUnigrams = [(String, Double, String, Double)]()
|
||||||
|
@ -701,8 +734,18 @@ func healthCheck(_ data: [Unigram]) -> String {
|
||||||
var bad = false
|
var bad = false
|
||||||
let checkPerCharMachingStatus: Bool = neta.key.split(separator: "-").count == neta.value.count
|
let checkPerCharMachingStatus: Bool = neta.key.split(separator: "-").count == neta.value.count
|
||||||
|
|
||||||
outerMatchCheck: for (i, x) in neta.key.split(separator: "-").enumerated() {
|
var mispronouncedKanji: [String] = []
|
||||||
|
|
||||||
|
let arrNetaKeys = neta.key.split(separator: "-")
|
||||||
|
outerMatchCheck: for (i, x) in arrNetaKeys.enumerated() {
|
||||||
if !unigramMonoChar.keys.contains(String(x)) {
|
if !unigramMonoChar.keys.contains(String(x)) {
|
||||||
|
if neta.value.count == 1 {
|
||||||
|
mispronouncedKanji.append("\(neta.category)@\(neta.value)@\(neta.key)")
|
||||||
|
} else if neta.value.count == arrNetaKeys.count {
|
||||||
|
mispronouncedKanji.append("\(neta.category)@\(neta.value.map(\.description)[i])@\(arrNetaKeys[i])")
|
||||||
|
} else {
|
||||||
|
mispronouncedKanji.append("\(neta.category)@OTHER@\(String(x))")
|
||||||
|
}
|
||||||
bad = true
|
bad = true
|
||||||
break outerMatchCheck
|
break outerMatchCheck
|
||||||
}
|
}
|
||||||
|
@ -710,12 +753,14 @@ func healthCheck(_ data: [Unigram]) -> String {
|
||||||
let char = neta.value.map(\.description)[i]
|
let char = neta.value.map(\.description)[i]
|
||||||
if exceptedChars.contains(char) { break innerMatchCheck }
|
if exceptedChars.contains(char) { break innerMatchCheck }
|
||||||
guard let queriedPhones = mapReverseLookupForCheck[char] else {
|
guard let queriedPhones = mapReverseLookupForCheck[char] else {
|
||||||
|
mispronouncedKanji.append("\(neta.category)@\(char)@\(String(x))")
|
||||||
bad = true
|
bad = true
|
||||||
break outerMatchCheck
|
break outerMatchCheck
|
||||||
}
|
}
|
||||||
for queriedPhone in queriedPhones {
|
for queriedPhone in queriedPhones {
|
||||||
if queriedPhone == x.description { break innerMatchCheck }
|
if queriedPhone == x.description { break innerMatchCheck }
|
||||||
}
|
}
|
||||||
|
mispronouncedKanji.append("\(neta.category)@\(char)@\(String(x))")
|
||||||
bad = true
|
bad = true
|
||||||
break outerMatchCheck
|
break outerMatchCheck
|
||||||
}
|
}
|
||||||
|
@ -725,7 +770,7 @@ func healthCheck(_ data: [Unigram]) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
if bad {
|
if bad {
|
||||||
faulty.append(neta)
|
faulty[mispronouncedKanji, default: []].append(neta)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if tscore >= neta.score {
|
if tscore >= neta.score {
|
||||||
|
@ -942,7 +987,7 @@ func healthCheck(_ data: [Unigram]) -> String {
|
||||||
printl(separator)
|
printl(separator)
|
||||||
printl("下述單元圖用到了漢字核心表當中尚未收錄的讀音,可能無法正常輸入:")
|
printl("下述單元圖用到了漢字核心表當中尚未收錄的讀音,可能無法正常輸入:")
|
||||||
for content in faulty {
|
for content in faulty {
|
||||||
printl(content.description)
|
printl("\(content.key): \(content.value)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue