UPE // UTF8 Surrogate Pair Issue Fix.

- We no longer use RegEx to remove the trailing spaces of each line. Reason: Swift's built-in RegEx relies on NSRange, which has no UTF8 surrogate pair support, leading to incorrectly calculated modification positions when dealing with lines that contain surrogate-pair characters.
- Also optimized the format-consolidating process.
This commit is contained in:
ShikiSuen 2022-02-26 14:28:03 +08:00
parent 9a8c720632
commit f971339f3e
1 changed files with 19 additions and 14 deletions

View File

@ -28,26 +28,31 @@ extension String {
}
mutating func formatConsolidate(HYPY2BPMF: Bool) {
// Step 1: Consolidating formats per line.
var arrData = self.components(separatedBy: "\n")
var varLineData = ""
var strProcessed = ""
for lineData in arrData {
varLineData = lineData
varLineData.regReplace(pattern: " ", replaceWith: " ") // CJKWhiteSpace to ASCIISpace
varLineData.regReplace(pattern: " ", replaceWith: " ") // NonBreakWhiteSpace to ASCIISpace
varLineData.regReplace(pattern: "\\s+", replaceWith: " ") // Consolidating Consecutive Spaves
varLineData.regReplace(pattern: "^\\s", replaceWith: "") // Trim Leading Space
varLineData.regReplace(pattern: "\\s$", replaceWith: "") // Trim Trailing Space
strProcessed += varLineData
strProcessed += "\n"
var strProcessed = self
//
strProcessed = strProcessed.replacingOccurrences(of: " #MACOS", with: "") // macOS
strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // CJKWhiteSpace (\x{3000}) to ASCII Space
strProcessed = strProcessed.replacingOccurrences(of: " ", with: " ") // NonBreakWhiteSpace (\x{A0}) to ASCII Space
strProcessed = strProcessed.replacingOccurrences(of: "\t", with: " ") // Tab to ASCII Space
strProcessed.regReplace(pattern: "\\f", replaceWith: "\n") // Form Feed to LF
strProcessed = strProcessed.replacingOccurrences(of: "\r", with: "\n") // CR to LF
strProcessed.regReplace(pattern: " +", replaceWith: " ") // ASCII
strProcessed.regReplace(pattern: "\\n+", replaceWith: "\n") // LF LF
strProcessed = strProcessed.replacingOccurrences(of: " \n", with: "\n") //
strProcessed = strProcessed.replacingOccurrences(of: "\n ", with: "\n") //
if strProcessed.prefix(1) == " " { //
strProcessed.removeFirst()
}
if strProcessed.suffix(1) == " " { //
strProcessed.removeLast()
}
var arrData = [""]
if HYPY2BPMF {
// Step 0: Convert HanyuPinyin to Bopomofo.
arrData = strProcessed.components(separatedBy: "\n")
strProcessed = "" // Reset its value
for lineData in arrData {
varLineData = lineData
var varLineData = lineData
//
//
// 調