UPE // UTF8 Surrogate Pair Issue Fix.
- We no longer use RegEx to remove the trailing spaces of each line. Reason: Swift's built-in RegEx relies on NSRange, which has no UTF8 surrogate pair support, leading to incorrectly calculated modification positions when dealing with lines that contain surrogate-pair characters. - Also optimized the format-consolidating process.
This commit is contained in:
parent
9a8c720632
commit
f971339f3e
|
@ -28,26 +28,31 @@ extension String {
|
||||||
}
|
}
|
||||||
mutating func formatConsolidate(HYPY2BPMF: Bool) {
|
mutating func formatConsolidate(HYPY2BPMF: Bool) {
|
||||||
// Step 1: Consolidating formats per line.
|
// Step 1: Consolidating formats per line.
|
||||||
var arrData = self.components(separatedBy: "\n")
|
var strProcessed = self
|
||||||
var varLineData = ""
|
// 預處理格式
|
||||||
var strProcessed = ""
|
strProcessed = strProcessed.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
|
||||||
for lineData in arrData {
|
strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
|
||||||
varLineData = lineData
|
strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
|
||||||
varLineData.regReplace(pattern: " ", replaceWith: " ") // CJKWhiteSpace to ASCIISpace
|
strProcessed = strProcessed.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
|
||||||
varLineData.regReplace(pattern: " ", replaceWith: " ") // NonBreakWhiteSpace to ASCIISpace
|
strProcessed.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
|
||||||
varLineData.regReplace(pattern: "\\s+", replaceWith: " ") // Consolidating Consecutive Spaves
|
strProcessed = strProcessed.replacingOccurrences(of: "\r", with: "\n") // CR to LF
|
||||||
varLineData.regReplace(pattern: "^\\s", replaceWith: "") // Trim Leading Space
|
strProcessed.regReplace(pattern: " +", replaceWith: " ") // 統整連續空格為一個 ASCII 空格
|
||||||
varLineData.regReplace(pattern: "\\s$", replaceWith: "") // Trim Trailing Space
|
strProcessed.regReplace(pattern: "\\n+", replaceWith: "\n") // 統整連續 LF 為一個 LF
|
||||||
strProcessed += varLineData
|
strProcessed = strProcessed.replacingOccurrences(of: " \n", with: "\n") // 去除行尾空格
|
||||||
strProcessed += "\n"
|
strProcessed = strProcessed.replacingOccurrences(of: "\n ", with: "\n") // 去除行首空格
|
||||||
|
if strProcessed.prefix(1) == " " { // 去除檔案開頭空格
|
||||||
|
strProcessed.removeFirst()
|
||||||
}
|
}
|
||||||
|
if strProcessed.suffix(1) == " " { // 去除檔案結尾空格
|
||||||
|
strProcessed.removeLast()
|
||||||
|
}
|
||||||
|
var arrData = [""]
|
||||||
if HYPY2BPMF {
|
if HYPY2BPMF {
|
||||||
// Step 0: Convert HanyuPinyin to Bopomofo.
|
// Step 0: Convert HanyuPinyin to Bopomofo.
|
||||||
arrData = strProcessed.components(separatedBy: "\n")
|
arrData = strProcessed.components(separatedBy: "\n")
|
||||||
strProcessed = "" // Reset its value
|
strProcessed = "" // Reset its value
|
||||||
for lineData in arrData {
|
for lineData in arrData {
|
||||||
varLineData = lineData
|
var varLineData = lineData
|
||||||
// 漢語拼音轉注音,得先從最長的可能的拼音組合開始轉起,
|
// 漢語拼音轉注音,得先從最長的可能的拼音組合開始轉起,
|
||||||
// 這樣等轉換到更短的可能的漢語拼音組合時就不會出錯。
|
// 這樣等轉換到更短的可能的漢語拼音組合時就不會出錯。
|
||||||
// 依此類推,聲調放在最後來轉換。
|
// 依此類推,聲調放在最後來轉換。
|
||||||
|
|
Loading…
Reference in New Issue