UPE // UTF8 Surrogate Pair Issue Fix.

- We no longer use RegEx to remove the trailing spaces of each line. Reason: Swift's built-in RegEx relies on NSRange, which has no UTF8 surrogate pair support, leading to incorrectly calculated modification spots when dealing with lines that contain surrogate-pair characters.
- Also optimized the format-consolidating process.
This commit is contained in:
ShikiSuen 2022-02-26 14:28:03 +08:00
parent 62db6a7105
commit 0218625ab3
1 changed files with 19 additions and 14 deletions

View File

@ -28,26 +28,31 @@ extension String {
} }
mutating func formatConsolidate(HYPY2BPMF: Bool) { mutating func formatConsolidate(HYPY2BPMF: Bool) {
// Step 1: Consolidating formats per line. // Step 1: Consolidating formats per line.
var arrData = self.components(separatedBy: "\n") var strProcessed = self
var varLineData = "" //
var strProcessed = "" strProcessed = strProcessed.replacingOccurrences(of: " #MACOS", with: "") // macOS
for lineData in arrData { strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
varLineData = lineData strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
varLineData.regReplace(pattern: " ", replaceWith: " ") // CJKWhiteSpace to ASCIISpace strProcessed = strProcessed.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
varLineData.regReplace(pattern: " ", replaceWith: " ") // NonBreakWhiteSpace to ASCIISpace strProcessed.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
varLineData.regReplace(pattern: "\\s+", replaceWith: " ") // Consolidating Consecutive Spaves strProcessed = strProcessed.replacingOccurrences(of: "\r", with: "\n") // CR to LF
varLineData.regReplace(pattern: "^\\s", replaceWith: "") // Trim Leading Space strProcessed.regReplace(pattern: " +", replaceWith: " ") // ASCII
varLineData.regReplace(pattern: "\\s$", replaceWith: "") // Trim Trailing Space strProcessed.regReplace(pattern: "\\n+", replaceWith: "\n") // LF LF
strProcessed += varLineData strProcessed = strProcessed.replacingOccurrences(of: " \n", with: "\n") //
strProcessed += "\n" strProcessed = strProcessed.replacingOccurrences(of: "\n ", with: "\n") //
if strProcessed.prefix(1) == " " { //
strProcessed.removeFirst()
} }
if strProcessed.suffix(1) == " " { //
strProcessed.removeLast()
}
var arrData = [""]
if HYPY2BPMF { if HYPY2BPMF {
// Step 0: Convert HanyuPinyin to Bopomofo. // Step 0: Convert HanyuPinyin to Bopomofo.
arrData = strProcessed.components(separatedBy: "\n") arrData = strProcessed.components(separatedBy: "\n")
strProcessed = "" // Reset its value strProcessed = "" // Reset its value
for lineData in arrData { for lineData in arrData {
varLineData = lineData var varLineData = lineData
// //
// //
// 調 // 調