UPE // UTF8 Surrogate Pair Issue Fix.

- We no longer use RegEx to remove the trailing spaces of each line. Reason: Swift's built-in RegEx relies on NSRange, which counts UTF-16 code units rather than characters, so the modification spot is calculated incorrectly for lines that contain surrogate-pair characters. - Also optimized the format-consolidating process.
2022-02-26 14:28:03 +08:00 · 2022-02-26 14:28:03 +08:00 · 0218625ab3
parent 62db6a7105
commit 0218625ab3
1 changed files with 19 additions and 14 deletions
--- a/UserPhraseEditor/StringExtension.swift
+++ b/UserPhraseEditor/StringExtension.swift
@@ -28,26 +28,31 @@ extension String {
    }
    mutating func formatConsolidate(HYPY2BPMF: Bool) {
        // Step 1: Consolidating formats per line.
-        var arrData = self.components(separatedBy: "\n")
-        var varLineData = ""
-        var strProcessed = ""
-        for lineData in arrData {
-            varLineData = lineData
-            varLineData.regReplace(pattern: "　", replaceWith: " ") // CJKWhiteSpace to ASCIISpace
-            varLineData.regReplace(pattern: " ", replaceWith: " ") // NonBreakWhiteSpace to ASCIISpace
-            varLineData.regReplace(pattern: "\\s+", replaceWith: " ") // Consolidating Consecutive Spaves
-            varLineData.regReplace(pattern: "^\\s", replaceWith: "") // Trim Leading Space
-            varLineData.regReplace(pattern: "\\s$", replaceWith: "") // Trim Trailing Space
-            strProcessed += varLineData
-            strProcessed += "\n"
+        var strProcessed = self
+        // 預處理格式
+        strProcessed = strProcessed.replacingOccurrences(of: " #MACOS", with: "") // 去掉 macOS 標記
+        strProcessed = strProcessed.replacingOccurrences(of: "　", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
+        strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
+        strProcessed = strProcessed.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
+        strProcessed.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
+        strProcessed = strProcessed.replacingOccurrences(of: "\r", with: "\n") // CR to LF
+        strProcessed.regReplace(pattern: " +", replaceWith: " ") // 統整連續空格為一個 ASCII 空格
+        strProcessed.regReplace(pattern: "\\n+", replaceWith: "\n") // 統整連續 LF 為一個 LF
+        strProcessed = strProcessed.replacingOccurrences(of: " \n", with: "\n") // 去除行尾空格
+        strProcessed = strProcessed.replacingOccurrences(of: "\n ", with: "\n") // 去除行首空格
+        if strProcessed.prefix(1) == " " { // 去除檔案開頭空格
+            strProcessed.removeFirst()
        }
-        
+        if strProcessed.suffix(1) == " " { // 去除檔案結尾空格
+            strProcessed.removeLast()
+        }
+        var arrData = [""]
        if HYPY2BPMF {
            // Step 0: Convert HanyuPinyin to Bopomofo.
            arrData = strProcessed.components(separatedBy: "\n")
            strProcessed = "" // Reset its value
            for lineData in arrData {
-                varLineData = lineData
+                var varLineData = lineData
                // 漢語拼音轉注音，得先從最長的可能的拼音組合開始轉起，
                // 這樣等轉換到更短的可能的漢語拼音組合時就不會出錯。
                // 依此類推，聲調放在最後來轉換。