UOM // Implementing new function sets from upstream.
- Keeping previous methods for generating keys.
This commit is contained in:
parent
0cb8ffd649
commit
e053273815
|
@ -126,31 +126,26 @@ public class KeyHandler {
|
|||
return arrResult
|
||||
}
|
||||
|
||||
/// 在組字器內,以給定之候選字字串、來試圖在給定游標位置所在之處指定選字處理過程。
|
||||
/// 在組字器內,以給定之候選字(詞音配對)、來試圖在給定游標位置所在之處指定選字處理過程。
|
||||
/// 然後再將對應的節錨內的節點標記為「已經手動選字過」。
|
||||
/// - Parameters:
|
||||
/// - value: 給定之候選字字串。
|
||||
/// - value: 給定之候選字(詞音配對)。
|
||||
/// - respectCursorPushing: 若該選項為 true,則會在選字之後始終將游標推送至選字後的節錨的前方。
|
||||
func fixNode(candidate: (String, String), respectCursorPushing: Bool = true) {
|
||||
let actualCursor = actualCandidateCursor
|
||||
let theCandidate: Megrez.Compositor.Candidate = .init(key: candidate.0, value: candidate.1)
|
||||
if !compositor.overrideCandidate(theCandidate, at: actualCursor) { return }
|
||||
if !compositor.overrideCandidate(theCandidate, at: actualCursor, overrideType: .withHighScore) { return }
|
||||
let previousWalk = compositor.walkedNodes
|
||||
// 開始爬軌。
|
||||
walk()
|
||||
let currentWalk = compositor.walkedNodes
|
||||
|
||||
// 在可行的情況下更新使用者半衰記憶模組。
|
||||
var accumulatedCursor = 0
|
||||
var currentNode: Megrez.Compositor.Node?
|
||||
for node in compositor.walkedNodes {
|
||||
accumulatedCursor += node.spanLength
|
||||
if accumulatedCursor > actualCursor {
|
||||
currentNode = node
|
||||
break
|
||||
}
|
||||
}
|
||||
let currentNode = currentWalk.findNode(at: actualCandidateCursor, target: &accumulatedCursor)
|
||||
guard let currentNode = currentNode else { return }
|
||||
|
||||
if currentNode.currentUnigram.score > -12 {
|
||||
if currentNode.currentUnigram.score > -12, mgrPrefs.fetchSuggestionsFromUserOverrideModel {
|
||||
IME.prtDebugIntel("UOM: Start Observation.")
|
||||
// 這個過程可能會因為使用者半衰記憶模組內部資料錯亂、而導致輸入法在選字時崩潰。
|
||||
// 於是在這裡引入災後狀況察覺專用變數,且先開啟該開關。順利執行完觀察後會關閉。
|
||||
|
@ -158,9 +153,9 @@ public class KeyHandler {
|
|||
mgrPrefs.failureFlagForUOMObservation = true
|
||||
// 令半衰記憶模組觀測給定的三元圖。
|
||||
// 這個過程會讓半衰引擎根據當前上下文生成三元圖索引鍵。
|
||||
currentUOM.observe(
|
||||
walkedNodes: compositor.walkedNodes, cursorIndex: actualCursor, candidate: theCandidate.value,
|
||||
timestamp: NSDate().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() }
|
||||
currentUOM.performObservation(
|
||||
walkedBefore: previousWalk, walkedAfter: currentWalk, cursor: actualCandidateCursor,
|
||||
timestamp: Date().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() }
|
||||
)
|
||||
// 如果沒有出現崩框的話,那就將這個開關復位。
|
||||
mgrPrefs.failureFlagForUOMObservation = false
|
||||
|
@ -196,7 +191,7 @@ public class KeyHandler {
|
|||
return arrCandidates.map { ($0.key, $0.value) }
|
||||
}
|
||||
|
||||
let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestedCandidates()
|
||||
let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestionsFromUOM(apply: false)
|
||||
let arrSuggestedCandidates: [Megrez.Compositor.Candidate] = arrSuggestedUnigrams.map {
|
||||
Megrez.Compositor.Candidate(key: $0.0, value: $0.1.value)
|
||||
}
|
||||
|
@ -206,33 +201,41 @@ public class KeyHandler {
|
|||
return arrCandidates.map { ($0.key, $0.value) }
|
||||
}
|
||||
|
||||
/// 向半衰引擎詢問可能的選字建議。拿到的結果會是一個單元圖陣列,會自動按權重排序。
|
||||
func fetchSuggestedCandidates() -> [(String, Megrez.Unigram)] {
|
||||
currentUOM.suggest(
|
||||
walkedNodes: compositor.walkedNodes, cursorIndex: compositor.cursor,
|
||||
timestamp: NSDate().timeIntervalSince1970
|
||||
).stableSort { $0.1.score > $1.1.score }
|
||||
}
|
||||
|
||||
/// 向半衰引擎詢問可能的選字建議、且套用給組字器內的當前游標位置。
|
||||
func fetchAndApplySuggestionsFromUserOverrideModel() {
|
||||
@discardableResult func fetchSuggestionsFromUOM(apply: Bool) -> [(String, Megrez.Unigram)] {
|
||||
var arrResult = [(String, Megrez.Unigram)]()
|
||||
/// 如果逐字選字模式有啟用的話,直接放棄執行這個函式。
|
||||
if mgrPrefs.useSCPCTypingMode { return }
|
||||
if mgrPrefs.useSCPCTypingMode { return arrResult }
|
||||
/// 如果這個開關沒打開的話,直接放棄執行這個函式。
|
||||
if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return }
|
||||
/// 先就當前上下文讓半衰引擎重新生成三元圖索引鍵。
|
||||
let overrideValue = fetchSuggestedCandidates().first?.1.value ?? ""
|
||||
|
||||
/// 再拿著索引鍵去問半衰模組有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。
|
||||
if !overrideValue.isEmpty {
|
||||
if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return arrResult }
|
||||
/// 獲取來自半衰記憶模組的建議結果
|
||||
let suggestion = currentUOM.fetchSuggestion(
|
||||
currentWalk: compositor.walkedNodes, cursor: actualCandidateCursor, timestamp: Date().timeIntervalSince1970
|
||||
)
|
||||
arrResult.append(contentsOf: suggestion.candidates)
|
||||
if apply {
|
||||
/// 再看有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。
|
||||
if !suggestion.isEmpty, let newestSuggestedCandidate = suggestion.candidates.last {
|
||||
let overrideBehavior: Megrez.Compositor.Node.OverrideType =
|
||||
suggestion.forceHighScoreOverride ? .withHighScore : .withTopUnigramScore
|
||||
let suggestedPair: Megrez.Compositor.Candidate = .init(
|
||||
key: newestSuggestedCandidate.0, value: newestSuggestedCandidate.1.value
|
||||
)
|
||||
IME.prtDebugIntel(
|
||||
"UOM: Suggestion retrieved, overriding the node score of the selected candidate.")
|
||||
// TODO: 這裡回頭改成用詞音配對來覆寫的形式。
|
||||
compositor.overrideCandidateLiteral(overrideValue, at: actualCandidateCursor, overrideType: .withTopUnigramScore)
|
||||
"UOM: Suggestion retrieved, overriding the node score of the selected candidate: \(suggestedPair.toNGramKey)")
|
||||
if !compositor.overrideCandidate(suggestedPair, at: actualCandidateCursor, overrideType: overrideBehavior) {
|
||||
compositor.overrideCandidateLiteral(
|
||||
newestSuggestedCandidate.1.value, at: actualCandidateCursor, overrideType: overrideBehavior
|
||||
)
|
||||
}
|
||||
walk()
|
||||
} else {
|
||||
IME.prtDebugIntel("UOM: Blank suggestion retrieved, dismissing.")
|
||||
}
|
||||
}
|
||||
arrResult = arrResult.stableSort { $0.1.score > $1.1.score }
|
||||
return arrResult
|
||||
}
|
||||
|
||||
// MARK: - Extracted methods and functions (Tekkon).
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ extension KeyHandler {
|
|||
walk()
|
||||
|
||||
// 看看半衰記憶模組是否會對目前的狀態給出自動選字建議。
|
||||
fetchAndApplySuggestionsFromUserOverrideModel()
|
||||
fetchSuggestionsFromUOM(apply: true)
|
||||
|
||||
// 之後就是更新組字區了。先清空注拼槽的內容。
|
||||
composer.clear()
|
||||
|
|
|
@ -26,174 +26,46 @@ extension vChewing {
|
|||
mutDecayExponent = log(0.5) / decayConstant
|
||||
}
|
||||
|
||||
public func observe(
|
||||
walkedNodes: [Megrez.Compositor.Node],
|
||||
cursorIndex: Int,
|
||||
candidate: String,
|
||||
timestamp: Double,
|
||||
saveCallback: @escaping () -> Void
|
||||
public func performObservation(
|
||||
walkedBefore: [Megrez.Compositor.Node], walkedAfter: [Megrez.Compositor.Node],
|
||||
cursor: Int, timestamp: Double, saveCallback: @escaping () -> Void
|
||||
) {
|
||||
let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex)
|
||||
// 參數合規性檢查。
|
||||
guard !walkedAfter.isEmpty, !walkedBefore.isEmpty else { return }
|
||||
guard walkedBefore.totalReadingsCount == walkedAfter.totalReadingsCount else { return }
|
||||
// 先判斷用哪種覆寫方法。
|
||||
var actualCursor = 0
|
||||
guard let currentNode = walkedAfter.findNode(at: cursor, target: &actualCursor) else { return }
|
||||
// 當前節點超過三個字的話,就不記憶了。在這種情形下,使用者可以考慮新增自訂語彙。
|
||||
guard currentNode.spanLength <= 3 else { return }
|
||||
// 前一個節點得從前一次爬軌結果當中來找。
|
||||
guard actualCursor > 0 else { return } // 該情況應該不會出現。
|
||||
let currentNodeIndex = actualCursor
|
||||
actualCursor -= 1
|
||||
var prevNodeIndex = 0
|
||||
guard let prevNode = walkedBefore.findNode(at: actualCursor, target: &prevNodeIndex) else { return }
|
||||
|
||||
let forceHighScoreOverride: Bool = currentNode.spanLength > prevNode.spanLength
|
||||
let breakingUp = currentNode.spanLength == 1 && prevNode.spanLength > 1
|
||||
|
||||
let targetNodeIndex = breakingUp ? currentNodeIndex : prevNodeIndex
|
||||
let key: String = vChewing.LMUserOverride.formObservationKey(
|
||||
walkedNodes: walkedAfter, headIndex: targetNodeIndex
|
||||
)
|
||||
guard !key.isEmpty else { return }
|
||||
|
||||
guard mutLRUMap[key] != nil else {
|
||||
var observation: Observation = .init()
|
||||
observation.update(candidate: candidate, timestamp: timestamp)
|
||||
let koPair = KeyObservationPair(key: key, observation: observation)
|
||||
// 先移除 key 再設定 key 的話,就可以影響這個 key 在辭典內的順位。
|
||||
// Swift 原生的辭典是沒有數字索引排序的,但資料的插入順序卻有保存著。
|
||||
mutLRUMap.removeValue(forKey: key)
|
||||
mutLRUMap[key] = koPair
|
||||
mutLRUList.insert(koPair, at: 0)
|
||||
|
||||
if mutLRUList.count > mutCapacity {
|
||||
mutLRUMap.removeValue(forKey: mutLRUList[mutLRUList.endIndex].key)
|
||||
mutLRUList.removeLast()
|
||||
}
|
||||
IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)")
|
||||
saveCallback()
|
||||
return
|
||||
}
|
||||
// 降低磁碟寫入次數。唯有失憶的情況下才會更新觀察且記憶。
|
||||
if var theNeta = mutLRUMap[key] {
|
||||
_ = suggest(
|
||||
walkedNodes: walkedNodes, cursorIndex: cursorIndex, timestamp: timestamp,
|
||||
decayCallback: {
|
||||
theNeta.observation.update(candidate: candidate, timestamp: timestamp)
|
||||
self.mutLRUList.insert(theNeta, at: 0)
|
||||
self.mutLRUMap[key] = theNeta
|
||||
IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)")
|
||||
saveCallback()
|
||||
}
|
||||
doObservation(
|
||||
key: key, candidate: currentNode.currentUnigram.value, timestamp: timestamp,
|
||||
forceHighScoreOverride: forceHighScoreOverride, saveCallback: { saveCallback() }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
public func suggest(
|
||||
walkedNodes: [Megrez.Compositor.Node],
|
||||
cursorIndex: Int,
|
||||
timestamp: Double,
|
||||
decayCallback: @escaping () -> Void = {}
|
||||
) -> [(String, Megrez.Unigram)] {
|
||||
let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex)
|
||||
guard !key.isEmpty else {
|
||||
IME.prtDebugIntel("UOM: Blank key generated on suggestion, aborting suggestion.")
|
||||
return .init()
|
||||
}
|
||||
let currentReadingKey = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex, readingOnly: true)
|
||||
guard let koPair = mutLRUMap[key] else {
|
||||
IME.prtDebugIntel("UOM: mutLRUMap[key] is nil, throwing blank suggestion for key: \(key).")
|
||||
return .init()
|
||||
}
|
||||
|
||||
let observation = koPair.observation
|
||||
|
||||
var arrResults = [(String, Megrez.Unigram)]()
|
||||
var currentHighScore = 0.0
|
||||
for overrideNeta in Array(observation.overrides) {
|
||||
let override: Override = overrideNeta.value
|
||||
|
||||
let overrideScore: Double = getScore(
|
||||
eventCount: override.count,
|
||||
totalCount: observation.count,
|
||||
eventTimestamp: override.timestamp,
|
||||
timestamp: timestamp,
|
||||
lambda: mutDecayExponent
|
||||
)
|
||||
if (0...currentHighScore).contains(overrideScore) { continue }
|
||||
|
||||
let overrideDetectionScore: Double = getScore(
|
||||
eventCount: override.count,
|
||||
totalCount: observation.count,
|
||||
eventTimestamp: override.timestamp,
|
||||
timestamp: timestamp,
|
||||
lambda: mutDecayExponent * 2
|
||||
)
|
||||
if (0...currentHighScore).contains(overrideDetectionScore) { decayCallback() }
|
||||
|
||||
let newUnigram = Megrez.Unigram(value: overrideNeta.key, score: overrideScore)
|
||||
arrResults.insert((currentReadingKey, newUnigram), at: 0)
|
||||
currentHighScore = overrideScore
|
||||
}
|
||||
if arrResults.isEmpty {
|
||||
IME.prtDebugIntel("UOM: No usable suggestions in the result for key: \(key).")
|
||||
}
|
||||
return arrResults
|
||||
}
|
||||
|
||||
private func getScore(
|
||||
eventCount: Int,
|
||||
totalCount: Int,
|
||||
eventTimestamp: Double,
|
||||
timestamp: Double,
|
||||
lambda: Double
|
||||
) -> Double {
|
||||
let decay = exp((timestamp - eventTimestamp) * lambda)
|
||||
if decay < kDecayThreshold { return 0.0 }
|
||||
let prob = Double(eventCount) / Double(totalCount)
|
||||
return prob * decay
|
||||
}
|
||||
|
||||
func convertKeyFrom(
|
||||
walkedNodes: [Megrez.Compositor.Node], cursorIndex: Int, readingOnly: Bool = false
|
||||
) -> String {
|
||||
let whiteList = "你他妳她祢衪它牠再在"
|
||||
var arrNodes: [Megrez.Compositor.Node] = []
|
||||
var intLength = 0
|
||||
for theNodeAnchor in walkedNodes {
|
||||
arrNodes.append(theNodeAnchor)
|
||||
intLength += theNodeAnchor.spanLength
|
||||
if intLength >= cursorIndex {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if arrNodes.isEmpty { return "" }
|
||||
|
||||
arrNodes = Array(arrNodes.reversed())
|
||||
|
||||
let kvCurrent = arrNodes[0].currentPair
|
||||
guard !kvCurrent.key.contains("_") else {
|
||||
return ""
|
||||
}
|
||||
|
||||
// 字音數與字數不一致的內容會被拋棄。
|
||||
if kvCurrent.key.split(separator: "-").count != kvCurrent.value.count { return "" }
|
||||
|
||||
// 前置單元只記錄讀音,在其後的單元則同時記錄讀音與字詞
|
||||
let strCurrent = kvCurrent.key
|
||||
var kvPrevious = Megrez.KeyValuePaired()
|
||||
var kvAnterior = Megrez.KeyValuePaired()
|
||||
var readingStack = ""
|
||||
var trigramKey: String { "(\(kvAnterior.toNGramKey),\(kvPrevious.toNGramKey),\(strCurrent))" }
|
||||
var result: String {
|
||||
// 不要把單個漢字的 kvCurrent 當前鍵值領頭的單元圖記入資料庫,不然對敲字體驗破壞太大。
|
||||
if readingStack.contains("_")
|
||||
|| (!kvPrevious.isValid && kvCurrent.value.count == 1 && !whiteList.contains(kvCurrent.value))
|
||||
{
|
||||
return ""
|
||||
} else {
|
||||
return (readingOnly ? strCurrent : trigramKey)
|
||||
}
|
||||
}
|
||||
|
||||
if arrNodes.count >= 2,
|
||||
!kvPrevious.key.contains("_"),
|
||||
kvPrevious.key.split(separator: "-").count == kvPrevious.value.count
|
||||
{
|
||||
kvPrevious = arrNodes[1].currentPair
|
||||
readingStack = kvPrevious.key + readingStack
|
||||
}
|
||||
|
||||
if arrNodes.count >= 3,
|
||||
!kvAnterior.key.contains("_"),
|
||||
kvAnterior.key.split(separator: "-").count == kvAnterior.value.count
|
||||
{
|
||||
kvAnterior = arrNodes[2].currentPair
|
||||
readingStack = kvAnterior.key + readingStack
|
||||
}
|
||||
|
||||
return result
|
||||
public func fetchSuggestion(
|
||||
currentWalk: [Megrez.Compositor.Node], cursor: Int, timestamp: Double
|
||||
) -> Suggestion {
|
||||
var headIndex = 0
|
||||
guard let nodeIter = currentWalk.findNode(at: cursor, target: &headIndex) else { return .init() }
|
||||
let key = vChewing.LMUserOverride.formObservationKey(walkedNodes: currentWalk, headIndex: headIndex)
|
||||
return getSuggestion(key: key, timestamp: timestamp, headReading: nodeIter.key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -208,6 +80,7 @@ extension vChewing.LMUserOverride {
|
|||
struct Override: Hashable, Encodable, Decodable {
|
||||
var count: Int = 0
|
||||
var timestamp: Double = 0.0
|
||||
var forceHighScoreOverride = false
|
||||
static func == (lhs: Override, rhs: Override) -> Bool {
|
||||
lhs.count == rhs.count && lhs.timestamp == rhs.timestamp
|
||||
}
|
||||
|
@ -242,11 +115,12 @@ extension vChewing.LMUserOverride {
|
|||
hasher.combine(overrides)
|
||||
}
|
||||
|
||||
mutating func update(candidate: String, timestamp: Double) {
|
||||
mutating func update(candidate: String, timestamp: Double, forceHighScoreOverride: Bool = false) {
|
||||
count += 1
|
||||
if overrides.keys.contains(candidate) {
|
||||
overrides[candidate]?.timestamp = timestamp
|
||||
overrides[candidate]?.count += 1
|
||||
overrides[candidate]?.forceHighScoreOverride = forceHighScoreOverride
|
||||
} else {
|
||||
overrides[candidate] = .init(count: 1, timestamp: timestamp)
|
||||
}
|
||||
|
@ -331,4 +205,192 @@ extension vChewing.LMUserOverride {
|
|||
return
|
||||
}
|
||||
}
|
||||
|
||||
/// Result bundle handed back when the override model is asked for suggestions.
public struct Suggestion {
  /// Suggested entries, each pairing a reading string with a scored unigram.
  var candidates: [(String, Megrez.Unigram)] = []
  /// Whether the suggestion should be applied with the high-score override type.
  var forceHighScoreOverride: Bool = false
  /// `true` when no candidate has been collected.
  var isEmpty: Bool { return candidates.isEmpty }
}
|
||||
}
|
||||
|
||||
// MARK: - Array Extensions.
|
||||
|
||||
extension Array where Element == Megrez.Compositor.Node {
  /// Sum of `keyArray.count` over every node in this walk.
  public var totalReadingsCount: Int {
    reduce(0) { partialCount, node in partialCount + node.keyArray.count }
  }

  /// Locates the node covering the given cursor position.
  ///
  /// NOTE(review): `keys` is declared outside this view; presumably it is the
  /// flattened list of readings of all nodes in the walk — confirm.
  /// - Parameters:
  ///   - cursor: Requested cursor position; it is clamped into `0...keys.count`.
  ///   - outCursorPastNode: Receives the accumulated span length up to and
  ///     including the returned node. Left untouched when `nil` is returned.
  /// - Returns: The matching node, or `nil` when the array is empty or no node
  ///   covers the cursor.
  public func findNode(at cursor: Int, target outCursorPastNode: inout Int) -> Megrez.Compositor.Node? {
    guard let headNode = first, let tailNode = last else { return nil }
    let readingCount = keys.count
    let clampedCursor = Swift.max(0, Swift.min(cursor, readingCount))

    // Cursor at the very beginning: the first node is always the answer.
    if clampedCursor == 0 {
      outCursorPastNode = headNode.spanLength
      return headNode
    }

    // Covers both "cursor at the right end" and "cursor one step before the right end".
    if clampedCursor >= readingCount - 1 {
      outCursorPastNode = readingCount
      return tailNode
    }

    // Otherwise accumulate span lengths until the running total passes the cursor.
    var spanTotal = 0
    for candidateNode in self {
      spanTotal += candidateNode.spanLength
      guard spanTotal > clampedCursor else { continue }
      outCursorPastNode = spanTotal
      return candidateNode
    }

    // This point is not expected to be reached.
    return nil
  }
}
|
||||
|
||||
// MARK: - Private Methods
|
||||
|
||||
extension vChewing.LMUserOverride {
|
||||
/// Records that `candidate` was picked under the observation `key`, then persists the change.
///
/// When `key` is already present in `mutLRUMap`, its observation is updated and
/// re-inserted at the front of `mutLRUList`. Otherwise a fresh observation is
/// created, stored, and the least-recently-used entry is evicted once the list
/// grows beyond `mutCapacity`.
/// - Parameters:
///   - key: Observation key under which the candidate is recorded.
///   - candidate: The candidate value being observed.
///   - timestamp: Time of the observation, forwarded to `Observation.update`.
///   - forceHighScoreOverride: Flag forwarded to `Observation.update`.
///   - saveCallback: Invoked once after the in-memory state has been updated.
private func doObservation(
  key: String, candidate: String, timestamp: Double, forceHighScoreOverride: Bool,
  saveCallback: @escaping () -> Void
) {
  // TODO: Reduce disk writes; ideally only update and save when the memory has decayed.
  if var knownPair = mutLRUMap[key] {
    knownPair.observation.update(
      candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride
    )
    // NOTE(review): the previous list entry for this key does not appear to be
    // removed before the updated one is inserted — confirm whether duplicates
    // in `mutLRUList` are intended.
    mutLRUList.insert(knownPair, at: 0)
    mutLRUMap[key] = knownPair
    IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)")
    saveCallback()
    return
  }

  // First observation for this key.
  var observation: Observation = .init()
  observation.update(candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride)
  let koPair = KeyObservationPair(key: key, observation: observation)
  mutLRUMap[key] = koPair
  mutLRUList.insert(koPair, at: 0)

  // Evict the least-recently-used entry when over capacity. `endIndex` is one
  // past the last valid position, so subscripting with it traps; use `last`.
  if mutLRUList.count > mutCapacity, let stalestPair = mutLRUList.last {
    mutLRUMap.removeValue(forKey: stalestPair.key)
    mutLRUList.removeLast()
  }
  IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)")
  saveCallback()
}
|
||||
|
||||
/// Builds a `Suggestion` from the observation stored under `key`.
///
/// Every override of the observation is scored with `getScore`; an override is
/// appended only when its score is non-zero and strictly higher than the best
/// score seen so far, so the last appended candidate carries the highest score.
/// - Parameters:
///   - key: Observation key to look up in `mutLRUMap`.
///   - timestamp: Reference time used for decay scoring.
///   - headReading: Reading string attached to every returned candidate.
/// - Returns: An empty suggestion when `key` is empty or unknown; otherwise the
///   collected candidates plus the `forceHighScoreOverride` flag of the last
///   appended one.
private func getSuggestion(key: String, timestamp: Double, headReading: String) -> Suggestion {
  if key.isEmpty { return .init() }
  guard let storedPair = mutLRUMap[key] else { return .init() }
  let observation: Observation = storedPair.observation

  var rankedCandidates: [(String, Megrez.Unigram)] = []
  var bestScore: Double = 0
  var bestForcesHighScore = false

  for (candidateValue, overrideRecord) in observation.overrides {
    let overrideScore = getScore(
      eventCount: overrideRecord.count, totalCount: observation.count,
      eventTimestamp: overrideRecord.timestamp, timestamp: timestamp, lambda: mutDecayExponent
    )
    // Skip fully decayed entries and anything that does not beat the current best.
    guard overrideScore != 0.0, overrideScore > bestScore else { continue }
    rankedCandidates.append((headReading, .init(value: candidateValue, score: overrideScore)))
    bestForcesHighScore = overrideRecord.forceHighScoreOverride
    bestScore = overrideScore
  }

  return .init(candidates: rankedCandidates, forceHighScoreOverride: bestForcesHighScore)
}
|
||||
|
||||
/// Computes a time-decayed score for one override.
///
/// The decay factor is `exp((timestamp - eventTimestamp) * lambda)`. When it
/// falls below `kDecayThreshold` the score is `0.0`; otherwise the score is the
/// ratio `eventCount / totalCount` multiplied by the decay factor.
/// - Parameters:
///   - eventCount: How often this particular override was observed.
///   - totalCount: Total number of observations recorded for the key.
///   - eventTimestamp: When the override was last observed.
///   - timestamp: The reference time to score against.
///   - lambda: Decay exponent applied to the elapsed time.
/// - Returns: The decayed score, or `0.0` once the decay drops under the threshold.
private func getScore(
  eventCount: Int,
  totalCount: Int,
  eventTimestamp: Double,
  timestamp: Double,
  lambda: Double
) -> Double {
  let elapsed = timestamp - eventTimestamp
  let decay = exp(elapsed * lambda)
  return decay < kDecayThreshold ? 0.0 : (Double(eventCount) / Double(totalCount)) * decay
}
|
||||
|
||||
/// Reports whether the first non-empty key in `node.keyArray` starts with an underscore.
///
/// NOTE(review): presumably an underscore-prefixed key marks a punctuation
/// reading — confirm against the language model's key conventions.
/// - Parameter node: The node whose keys are inspected.
/// - Returns: `false` when the node has no non-empty key.
private static func isPunctuation(_ node: Megrez.Compositor.Node) -> Bool {
  // Only the leading character of the first non-empty key decides the outcome.
  guard let leadingChar = node.keyArray.lazy.compactMap({ $0.first }).first else { return false }
  return String(leadingChar) == "_"
}
|
||||
|
||||
/// Forms the observation key for the node at the head position of a walk.
///
/// Nodes are collected from the start of `walkedNodes` until their accumulated
/// span length reaches `cursorIndex`; the last collected node is the "current"
/// one and the two before it (when present) supply the context.
/// - Parameters:
///   - walkedNodes: The walked nodes to derive the key from.
///   - cursorIndex: Span position at which node collection stops.
///   - readingOnly: When `true`, only the current node's key is returned
///     instead of the full trigram-style key.
/// - Returns: The key, or an empty string when no usable key can be formed.
private static func formObservationKey(
  walkedNodes: [Megrez.Compositor.Node], headIndex cursorIndex: Int, readingOnly: Bool = false
) -> String {
  let whiteList = "你他妳她祢衪它牠再在"

  // Gather every node up to (and including) the one that reaches the cursor.
  var coveredSpan = 0
  var nodesNearestFirst: [Megrez.Compositor.Node] = []
  for walkedNode in walkedNodes {
    nodesNearestFirst.append(walkedNode)
    coveredSpan += walkedNode.spanLength
    if coveredSpan >= cursorIndex { break }
  }
  guard !nodesNearestFirst.isEmpty else { return "" }

  // Flip the order so index 0 is the current node, 1 the previous, 2 the anterior.
  nodesNearestFirst.reverse()

  let headPair = nodesNearestFirst[0].currentPair
  if headPair.key.contains("_") { return "" }

  // Entries whose reading count differs from their character count are discarded.
  guard headPair.key.split(separator: "-").count == headPair.value.count else { return "" }

  // Preceding units are recorded through their n-gram keys; the current unit contributes its reading only.
  var previousPair = Megrez.KeyValuePaired()
  var anteriorPair = Megrez.KeyValuePaired()
  var readingStack = ""

  // NOTE(review): the two guards below inspect the placeholder pairs BEFORE the
  // real pairs are assigned, so they presumably always pass — confirm whether
  // they were meant to validate `currentPair` of the neighbouring nodes instead.
  if nodesNearestFirst.count >= 2,
    !previousPair.key.contains("_"),
    previousPair.key.split(separator: "-").count == previousPair.value.count
  {
    previousPair = nodesNearestFirst[1].currentPair
    readingStack = previousPair.key + readingStack
  }

  if nodesNearestFirst.count >= 3,
    !anteriorPair.key.contains("_"),
    anteriorPair.key.split(separator: "-").count == anteriorPair.value.count
  {
    anteriorPair = nodesNearestFirst[2].currentPair
    readingStack = anteriorPair.key + readingStack
  }

  // Do not record a unigram led by a single-character current value (unless
  // white-listed); doing so would hurt the typing experience too much.
  if readingStack.contains("_")
    || (!previousPair.isValid && headPair.value.count == 1 && !whiteList.contains(headPair.value))
  {
    return ""
  }
  if readingOnly { return headPair.key }
  return "(\(anteriorPair.toNGramKey),\(previousPair.toNGramKey),\(headPair.key))"
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue