diff --git a/DataCompiler/dataCompiler.swift b/DataCompiler/dataCompiler.swift index 941d4f13..39ec92c8 100644 --- a/DataCompiler/dataCompiler.swift +++ b/DataCompiler/dataCompiler.swift @@ -480,7 +480,7 @@ func fileOutput(isCHS: Bool) { if neta.count >= 2 { let theKey = String(neta[0]) let theValue = String(neta[1]) - if !neta[0].isEmpty, !neta[1].isEmpty, line.first != "#", !theKey.contains("_punctuation_list") { + if !neta[0].isEmpty, !neta[1].isEmpty, line.first != "#" { rangeMap[cnvPhonabetToASCII(theKey), default: []].append(theValue.data(using: .utf8)!) } } diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift index 0d590867..57118396 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/LMInstantiator.swift @@ -346,6 +346,10 @@ public extension vChewingLM { // 新增與日期、時間、星期有關的單元圖資料 rawAllUnigrams.append(contentsOf: queryDateTimeUnigrams(with: keyChain)) + if keyChain == "_punctuation_list" { + rawAllUnigrams.append(contentsOf: lmCore.getHaninSymbolMenuUnigrams()) + } + // 提前處理語彙置換 if isPhraseReplacementEnabled { for i in 0 ..< rawAllUnigrams.count { diff --git a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCoreNS.swift b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCoreNS.swift index 45d015b0..61e63f8c 100644 --- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCoreNS.swift +++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCoreNS.swift @@ -128,10 +128,31 @@ public extension vChewingLM { vCLog(strDump) } + public func getHaninSymbolMenuUnigrams() -> [Megrez.Unigram] { + let key = "_punctuation_list" + var grams: [Megrez.Unigram] = [] + guard let arrRangeRecords: [Data] = dataMap[cnvPhonabetToASCII(key)] else { return grams } + for netaSet in arrRangeRecords { + let strNetaSet = String(decoding: netaSet, as: UTF8.self) + let neta = Array(strNetaSet.trimmingCharacters(in: .newlines).split(separator: " ").reversed()) + let theValue: String = .init(neta[0]) + var theScore = defaultScore + if neta.count >= 2, !shouldForceDefaultScore { + theScore = .init(String(neta[1])) ?? defaultScore + } + if theScore > 0 { + theScore *= -1 // 應對可能忘記寫負號的情形 + } + grams.append(Megrez.Unigram(value: theValue, score: theScore)) + } + return grams + } + /// 根據給定的讀音索引鍵,來獲取資料庫辭典內的對應資料陣列的 UTF8 資料、就地分析、生成單元圖陣列。 /// - parameters: /// - key: 讀音索引鍵。 public func unigramsFor(key: String) -> [Megrez.Unigram] { + if key == "_punctuation_list" { return [] } var grams: [Megrez.Unigram] = [] var gramsHW: [Megrez.Unigram] = [] guard let arrRangeRecords: [Data] = dataMap[cnvPhonabetToASCII(key)] else { return grams } @@ -148,10 +169,9 @@ public extension vChewingLM { } grams.append(Megrez.Unigram(value: theValue, score: theScore)) if !key.contains("_punctuation") { continue } - if let halfValue = theValue.applyingTransform(.fullwidthToHalfwidth, reverse: false) { - if halfValue != theValue { - gramsHW.append(Megrez.Unigram(value: halfValue, score: theScore)) - } + let halfValue = theValue.applyingTransformFW2HW(reverse: false) + if halfValue != theValue { + gramsHW.append(Megrez.Unigram(value: halfValue, score: theScore)) } } grams.append(contentsOf: gramsHW)