UOM // Implementing new function sets from upstream.
- Keeping previous methods for generating keys.
This commit is contained in:
parent
0cb8ffd649
commit
e053273815
|
@ -126,31 +126,26 @@ public class KeyHandler {
|
||||||
return arrResult
|
return arrResult
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 在組字器內,以給定之候選字字串、來試圖在給定游標位置所在之處指定選字處理過程。
|
/// 在組字器內,以給定之候選字(詞音配對)、來試圖在給定游標位置所在之處指定選字處理過程。
|
||||||
/// 然後再將對應的節錨內的節點標記為「已經手動選字過」。
|
/// 然後再將對應的節錨內的節點標記為「已經手動選字過」。
|
||||||
/// - Parameters:
|
/// - Parameters:
|
||||||
/// - value: 給定之候選字字串。
|
/// - value: 給定之候選字(詞音配對)。
|
||||||
/// - respectCursorPushing: 若該選項為 true,則會在選字之後始終將游標推送至選字後的節錨的前方。
|
/// - respectCursorPushing: 若該選項為 true,則會在選字之後始終將游標推送至選字後的節錨的前方。
|
||||||
func fixNode(candidate: (String, String), respectCursorPushing: Bool = true) {
|
func fixNode(candidate: (String, String), respectCursorPushing: Bool = true) {
|
||||||
let actualCursor = actualCandidateCursor
|
let actualCursor = actualCandidateCursor
|
||||||
let theCandidate: Megrez.Compositor.Candidate = .init(key: candidate.0, value: candidate.1)
|
let theCandidate: Megrez.Compositor.Candidate = .init(key: candidate.0, value: candidate.1)
|
||||||
if !compositor.overrideCandidate(theCandidate, at: actualCursor) { return }
|
if !compositor.overrideCandidate(theCandidate, at: actualCursor, overrideType: .withHighScore) { return }
|
||||||
|
let previousWalk = compositor.walkedNodes
|
||||||
// 開始爬軌。
|
// 開始爬軌。
|
||||||
walk()
|
walk()
|
||||||
|
let currentWalk = compositor.walkedNodes
|
||||||
|
|
||||||
// 在可行的情況下更新使用者半衰記憶模組。
|
// 在可行的情況下更新使用者半衰記憶模組。
|
||||||
var accumulatedCursor = 0
|
var accumulatedCursor = 0
|
||||||
var currentNode: Megrez.Compositor.Node?
|
let currentNode = currentWalk.findNode(at: actualCandidateCursor, target: &accumulatedCursor)
|
||||||
for node in compositor.walkedNodes {
|
|
||||||
accumulatedCursor += node.spanLength
|
|
||||||
if accumulatedCursor > actualCursor {
|
|
||||||
currentNode = node
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
guard let currentNode = currentNode else { return }
|
guard let currentNode = currentNode else { return }
|
||||||
|
|
||||||
if currentNode.currentUnigram.score > -12 {
|
if currentNode.currentUnigram.score > -12, mgrPrefs.fetchSuggestionsFromUserOverrideModel {
|
||||||
IME.prtDebugIntel("UOM: Start Observation.")
|
IME.prtDebugIntel("UOM: Start Observation.")
|
||||||
// 這個過程可能會因為使用者半衰記憶模組內部資料錯亂、而導致輸入法在選字時崩潰。
|
// 這個過程可能會因為使用者半衰記憶模組內部資料錯亂、而導致輸入法在選字時崩潰。
|
||||||
// 於是在這裡引入災後狀況察覺專用變數,且先開啟該開關。順利執行完觀察後會關閉。
|
// 於是在這裡引入災後狀況察覺專用變數,且先開啟該開關。順利執行完觀察後會關閉。
|
||||||
|
@ -158,9 +153,9 @@ public class KeyHandler {
|
||||||
mgrPrefs.failureFlagForUOMObservation = true
|
mgrPrefs.failureFlagForUOMObservation = true
|
||||||
// 令半衰記憶模組觀測給定的三元圖。
|
// 令半衰記憶模組觀測給定的三元圖。
|
||||||
// 這個過程會讓半衰引擎根據當前上下文生成三元圖索引鍵。
|
// 這個過程會讓半衰引擎根據當前上下文生成三元圖索引鍵。
|
||||||
currentUOM.observe(
|
currentUOM.performObservation(
|
||||||
walkedNodes: compositor.walkedNodes, cursorIndex: actualCursor, candidate: theCandidate.value,
|
walkedBefore: previousWalk, walkedAfter: currentWalk, cursor: actualCandidateCursor,
|
||||||
timestamp: NSDate().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() }
|
timestamp: Date().timeIntervalSince1970, saveCallback: { mgrLangModel.saveUserOverrideModelData() }
|
||||||
)
|
)
|
||||||
// 如果沒有出現崩框的話,那就將這個開關復位。
|
// 如果沒有出現崩框的話,那就將這個開關復位。
|
||||||
mgrPrefs.failureFlagForUOMObservation = false
|
mgrPrefs.failureFlagForUOMObservation = false
|
||||||
|
@ -196,7 +191,7 @@ public class KeyHandler {
|
||||||
return arrCandidates.map { ($0.key, $0.value) }
|
return arrCandidates.map { ($0.key, $0.value) }
|
||||||
}
|
}
|
||||||
|
|
||||||
let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestedCandidates()
|
let arrSuggestedUnigrams: [(String, Megrez.Unigram)] = fetchSuggestionsFromUOM(apply: false)
|
||||||
let arrSuggestedCandidates: [Megrez.Compositor.Candidate] = arrSuggestedUnigrams.map {
|
let arrSuggestedCandidates: [Megrez.Compositor.Candidate] = arrSuggestedUnigrams.map {
|
||||||
Megrez.Compositor.Candidate(key: $0.0, value: $0.1.value)
|
Megrez.Compositor.Candidate(key: $0.0, value: $0.1.value)
|
||||||
}
|
}
|
||||||
|
@ -206,32 +201,40 @@ public class KeyHandler {
|
||||||
return arrCandidates.map { ($0.key, $0.value) }
|
return arrCandidates.map { ($0.key, $0.value) }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 向半衰引擎詢問可能的選字建議。拿到的結果會是一個單元圖陣列,會自動按權重排序。
|
|
||||||
func fetchSuggestedCandidates() -> [(String, Megrez.Unigram)] {
|
|
||||||
currentUOM.suggest(
|
|
||||||
walkedNodes: compositor.walkedNodes, cursorIndex: compositor.cursor,
|
|
||||||
timestamp: NSDate().timeIntervalSince1970
|
|
||||||
).stableSort { $0.1.score > $1.1.score }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// 向半衰引擎詢問可能的選字建議、且套用給組字器內的當前游標位置。
|
/// 向半衰引擎詢問可能的選字建議、且套用給組字器內的當前游標位置。
|
||||||
func fetchAndApplySuggestionsFromUserOverrideModel() {
|
@discardableResult func fetchSuggestionsFromUOM(apply: Bool) -> [(String, Megrez.Unigram)] {
|
||||||
|
var arrResult = [(String, Megrez.Unigram)]()
|
||||||
/// 如果逐字選字模式有啟用的話,直接放棄執行這個函式。
|
/// 如果逐字選字模式有啟用的話,直接放棄執行這個函式。
|
||||||
if mgrPrefs.useSCPCTypingMode { return }
|
if mgrPrefs.useSCPCTypingMode { return arrResult }
|
||||||
/// 如果這個開關沒打開的話,直接放棄執行這個函式。
|
/// 如果這個開關沒打開的話,直接放棄執行這個函式。
|
||||||
if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return }
|
if !mgrPrefs.fetchSuggestionsFromUserOverrideModel { return arrResult }
|
||||||
/// 先就當前上下文讓半衰引擎重新生成三元圖索引鍵。
|
/// 獲取來自半衰記憶模組的建議結果
|
||||||
let overrideValue = fetchSuggestedCandidates().first?.1.value ?? ""
|
let suggestion = currentUOM.fetchSuggestion(
|
||||||
|
currentWalk: compositor.walkedNodes, cursor: actualCandidateCursor, timestamp: Date().timeIntervalSince1970
|
||||||
/// 再拿著索引鍵去問半衰模組有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。
|
)
|
||||||
if !overrideValue.isEmpty {
|
arrResult.append(contentsOf: suggestion.candidates)
|
||||||
IME.prtDebugIntel(
|
if apply {
|
||||||
"UOM: Suggestion retrieved, overriding the node score of the selected candidate.")
|
/// 再看有沒有選字建議。有的話就遵循之、讓天權星引擎對指定節錨下的節點複寫權重。
|
||||||
// TODO: 這裡回頭改成用詞音配對來覆寫的形式。
|
if !suggestion.isEmpty, let newestSuggestedCandidate = suggestion.candidates.last {
|
||||||
compositor.overrideCandidateLiteral(overrideValue, at: actualCandidateCursor, overrideType: .withTopUnigramScore)
|
let overrideBehavior: Megrez.Compositor.Node.OverrideType =
|
||||||
} else {
|
suggestion.forceHighScoreOverride ? .withHighScore : .withTopUnigramScore
|
||||||
IME.prtDebugIntel("UOM: Blank suggestion retrieved, dismissing.")
|
let suggestedPair: Megrez.Compositor.Candidate = .init(
|
||||||
|
key: newestSuggestedCandidate.0, value: newestSuggestedCandidate.1.value
|
||||||
|
)
|
||||||
|
IME.prtDebugIntel(
|
||||||
|
"UOM: Suggestion retrieved, overriding the node score of the selected candidate: \(suggestedPair.toNGramKey)")
|
||||||
|
if !compositor.overrideCandidate(suggestedPair, at: actualCandidateCursor, overrideType: overrideBehavior) {
|
||||||
|
compositor.overrideCandidateLiteral(
|
||||||
|
newestSuggestedCandidate.1.value, at: actualCandidateCursor, overrideType: overrideBehavior
|
||||||
|
)
|
||||||
|
}
|
||||||
|
walk()
|
||||||
|
} else {
|
||||||
|
IME.prtDebugIntel("UOM: Blank suggestion retrieved, dismissing.")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
arrResult = arrResult.stableSort { $0.1.score > $1.1.score }
|
||||||
|
return arrResult
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - Extracted methods and functions (Tekkon).
|
// MARK: - Extracted methods and functions (Tekkon).
|
||||||
|
|
|
@ -87,7 +87,7 @@ extension KeyHandler {
|
||||||
walk()
|
walk()
|
||||||
|
|
||||||
// 看看半衰記憶模組是否會對目前的狀態給出自動選字建議。
|
// 看看半衰記憶模組是否會對目前的狀態給出自動選字建議。
|
||||||
fetchAndApplySuggestionsFromUserOverrideModel()
|
fetchSuggestionsFromUOM(apply: true)
|
||||||
|
|
||||||
// 之後就是更新組字區了。先清空注拼槽的內容。
|
// 之後就是更新組字區了。先清空注拼槽的內容。
|
||||||
composer.clear()
|
composer.clear()
|
||||||
|
|
|
@ -26,174 +26,46 @@ extension vChewing {
|
||||||
mutDecayExponent = log(0.5) / decayConstant
|
mutDecayExponent = log(0.5) / decayConstant
|
||||||
}
|
}
|
||||||
|
|
||||||
public func observe(
|
public func performObservation(
|
||||||
walkedNodes: [Megrez.Compositor.Node],
|
walkedBefore: [Megrez.Compositor.Node], walkedAfter: [Megrez.Compositor.Node],
|
||||||
cursorIndex: Int,
|
cursor: Int, timestamp: Double, saveCallback: @escaping () -> Void
|
||||||
candidate: String,
|
|
||||||
timestamp: Double,
|
|
||||||
saveCallback: @escaping () -> Void
|
|
||||||
) {
|
) {
|
||||||
let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex)
|
// 參數合規性檢查。
|
||||||
|
guard !walkedAfter.isEmpty, !walkedBefore.isEmpty else { return }
|
||||||
|
guard walkedBefore.totalReadingsCount == walkedAfter.totalReadingsCount else { return }
|
||||||
|
// 先判斷用哪種覆寫方法。
|
||||||
|
var actualCursor = 0
|
||||||
|
guard let currentNode = walkedAfter.findNode(at: cursor, target: &actualCursor) else { return }
|
||||||
|
// 當前節點超過三個字的話,就不記憶了。在這種情形下,使用者可以考慮新增自訂語彙。
|
||||||
|
guard currentNode.spanLength <= 3 else { return }
|
||||||
|
// 前一個節點得從前一次爬軌結果當中來找。
|
||||||
|
guard actualCursor > 0 else { return } // 該情況應該不會出現。
|
||||||
|
let currentNodeIndex = actualCursor
|
||||||
|
actualCursor -= 1
|
||||||
|
var prevNodeIndex = 0
|
||||||
|
guard let prevNode = walkedBefore.findNode(at: actualCursor, target: &prevNodeIndex) else { return }
|
||||||
|
|
||||||
|
let forceHighScoreOverride: Bool = currentNode.spanLength > prevNode.spanLength
|
||||||
|
let breakingUp = currentNode.spanLength == 1 && prevNode.spanLength > 1
|
||||||
|
|
||||||
|
let targetNodeIndex = breakingUp ? currentNodeIndex : prevNodeIndex
|
||||||
|
let key: String = vChewing.LMUserOverride.formObservationKey(
|
||||||
|
walkedNodes: walkedAfter, headIndex: targetNodeIndex
|
||||||
|
)
|
||||||
guard !key.isEmpty else { return }
|
guard !key.isEmpty else { return }
|
||||||
|
doObservation(
|
||||||
guard mutLRUMap[key] != nil else {
|
key: key, candidate: currentNode.currentUnigram.value, timestamp: timestamp,
|
||||||
var observation: Observation = .init()
|
forceHighScoreOverride: forceHighScoreOverride, saveCallback: { saveCallback() }
|
||||||
observation.update(candidate: candidate, timestamp: timestamp)
|
)
|
||||||
let koPair = KeyObservationPair(key: key, observation: observation)
|
|
||||||
// 先移除 key 再設定 key 的話,就可以影響這個 key 在辭典內的順位。
|
|
||||||
// Swift 原生的辭典是沒有數字索引排序的,但資料的插入順序卻有保存著。
|
|
||||||
mutLRUMap.removeValue(forKey: key)
|
|
||||||
mutLRUMap[key] = koPair
|
|
||||||
mutLRUList.insert(koPair, at: 0)
|
|
||||||
|
|
||||||
if mutLRUList.count > mutCapacity {
|
|
||||||
mutLRUMap.removeValue(forKey: mutLRUList[mutLRUList.endIndex].key)
|
|
||||||
mutLRUList.removeLast()
|
|
||||||
}
|
|
||||||
IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)")
|
|
||||||
saveCallback()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// 降低磁碟寫入次數。唯有失憶的情況下才會更新觀察且記憶。
|
|
||||||
if var theNeta = mutLRUMap[key] {
|
|
||||||
_ = suggest(
|
|
||||||
walkedNodes: walkedNodes, cursorIndex: cursorIndex, timestamp: timestamp,
|
|
||||||
decayCallback: {
|
|
||||||
theNeta.observation.update(candidate: candidate, timestamp: timestamp)
|
|
||||||
self.mutLRUList.insert(theNeta, at: 0)
|
|
||||||
self.mutLRUMap[key] = theNeta
|
|
||||||
IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)")
|
|
||||||
saveCallback()
|
|
||||||
}
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public func suggest(
|
public func fetchSuggestion(
|
||||||
walkedNodes: [Megrez.Compositor.Node],
|
currentWalk: [Megrez.Compositor.Node], cursor: Int, timestamp: Double
|
||||||
cursorIndex: Int,
|
) -> Suggestion {
|
||||||
timestamp: Double,
|
var headIndex = 0
|
||||||
decayCallback: @escaping () -> Void = {}
|
guard let nodeIter = currentWalk.findNode(at: cursor, target: &headIndex) else { return .init() }
|
||||||
) -> [(String, Megrez.Unigram)] {
|
let key = vChewing.LMUserOverride.formObservationKey(walkedNodes: currentWalk, headIndex: headIndex)
|
||||||
let key = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex)
|
return getSuggestion(key: key, timestamp: timestamp, headReading: nodeIter.key)
|
||||||
guard !key.isEmpty else {
|
|
||||||
IME.prtDebugIntel("UOM: Blank key generated on suggestion, aborting suggestion.")
|
|
||||||
return .init()
|
|
||||||
}
|
|
||||||
let currentReadingKey = convertKeyFrom(walkedNodes: walkedNodes, cursorIndex: cursorIndex, readingOnly: true)
|
|
||||||
guard let koPair = mutLRUMap[key] else {
|
|
||||||
IME.prtDebugIntel("UOM: mutLRUMap[key] is nil, throwing blank suggestion for key: \(key).")
|
|
||||||
return .init()
|
|
||||||
}
|
|
||||||
|
|
||||||
let observation = koPair.observation
|
|
||||||
|
|
||||||
var arrResults = [(String, Megrez.Unigram)]()
|
|
||||||
var currentHighScore = 0.0
|
|
||||||
for overrideNeta in Array(observation.overrides) {
|
|
||||||
let override: Override = overrideNeta.value
|
|
||||||
|
|
||||||
let overrideScore: Double = getScore(
|
|
||||||
eventCount: override.count,
|
|
||||||
totalCount: observation.count,
|
|
||||||
eventTimestamp: override.timestamp,
|
|
||||||
timestamp: timestamp,
|
|
||||||
lambda: mutDecayExponent
|
|
||||||
)
|
|
||||||
if (0...currentHighScore).contains(overrideScore) { continue }
|
|
||||||
|
|
||||||
let overrideDetectionScore: Double = getScore(
|
|
||||||
eventCount: override.count,
|
|
||||||
totalCount: observation.count,
|
|
||||||
eventTimestamp: override.timestamp,
|
|
||||||
timestamp: timestamp,
|
|
||||||
lambda: mutDecayExponent * 2
|
|
||||||
)
|
|
||||||
if (0...currentHighScore).contains(overrideDetectionScore) { decayCallback() }
|
|
||||||
|
|
||||||
let newUnigram = Megrez.Unigram(value: overrideNeta.key, score: overrideScore)
|
|
||||||
arrResults.insert((currentReadingKey, newUnigram), at: 0)
|
|
||||||
currentHighScore = overrideScore
|
|
||||||
}
|
|
||||||
if arrResults.isEmpty {
|
|
||||||
IME.prtDebugIntel("UOM: No usable suggestions in the result for key: \(key).")
|
|
||||||
}
|
|
||||||
return arrResults
|
|
||||||
}
|
|
||||||
|
|
||||||
private func getScore(
|
|
||||||
eventCount: Int,
|
|
||||||
totalCount: Int,
|
|
||||||
eventTimestamp: Double,
|
|
||||||
timestamp: Double,
|
|
||||||
lambda: Double
|
|
||||||
) -> Double {
|
|
||||||
let decay = exp((timestamp - eventTimestamp) * lambda)
|
|
||||||
if decay < kDecayThreshold { return 0.0 }
|
|
||||||
let prob = Double(eventCount) / Double(totalCount)
|
|
||||||
return prob * decay
|
|
||||||
}
|
|
||||||
|
|
||||||
func convertKeyFrom(
|
|
||||||
walkedNodes: [Megrez.Compositor.Node], cursorIndex: Int, readingOnly: Bool = false
|
|
||||||
) -> String {
|
|
||||||
let whiteList = "你他妳她祢衪它牠再在"
|
|
||||||
var arrNodes: [Megrez.Compositor.Node] = []
|
|
||||||
var intLength = 0
|
|
||||||
for theNodeAnchor in walkedNodes {
|
|
||||||
arrNodes.append(theNodeAnchor)
|
|
||||||
intLength += theNodeAnchor.spanLength
|
|
||||||
if intLength >= cursorIndex {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if arrNodes.isEmpty { return "" }
|
|
||||||
|
|
||||||
arrNodes = Array(arrNodes.reversed())
|
|
||||||
|
|
||||||
let kvCurrent = arrNodes[0].currentPair
|
|
||||||
guard !kvCurrent.key.contains("_") else {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// 字音數與字數不一致的內容會被拋棄。
|
|
||||||
if kvCurrent.key.split(separator: "-").count != kvCurrent.value.count { return "" }
|
|
||||||
|
|
||||||
// 前置單元只記錄讀音,在其後的單元則同時記錄讀音與字詞
|
|
||||||
let strCurrent = kvCurrent.key
|
|
||||||
var kvPrevious = Megrez.KeyValuePaired()
|
|
||||||
var kvAnterior = Megrez.KeyValuePaired()
|
|
||||||
var readingStack = ""
|
|
||||||
var trigramKey: String { "(\(kvAnterior.toNGramKey),\(kvPrevious.toNGramKey),\(strCurrent))" }
|
|
||||||
var result: String {
|
|
||||||
// 不要把單個漢字的 kvCurrent 當前鍵值領頭的單元圖記入資料庫,不然對敲字體驗破壞太大。
|
|
||||||
if readingStack.contains("_")
|
|
||||||
|| (!kvPrevious.isValid && kvCurrent.value.count == 1 && !whiteList.contains(kvCurrent.value))
|
|
||||||
{
|
|
||||||
return ""
|
|
||||||
} else {
|
|
||||||
return (readingOnly ? strCurrent : trigramKey)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if arrNodes.count >= 2,
|
|
||||||
!kvPrevious.key.contains("_"),
|
|
||||||
kvPrevious.key.split(separator: "-").count == kvPrevious.value.count
|
|
||||||
{
|
|
||||||
kvPrevious = arrNodes[1].currentPair
|
|
||||||
readingStack = kvPrevious.key + readingStack
|
|
||||||
}
|
|
||||||
|
|
||||||
if arrNodes.count >= 3,
|
|
||||||
!kvAnterior.key.contains("_"),
|
|
||||||
kvAnterior.key.split(separator: "-").count == kvAnterior.value.count
|
|
||||||
{
|
|
||||||
kvAnterior = arrNodes[2].currentPair
|
|
||||||
readingStack = kvAnterior.key + readingStack
|
|
||||||
}
|
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -208,6 +80,7 @@ extension vChewing.LMUserOverride {
|
||||||
struct Override: Hashable, Encodable, Decodable {
|
struct Override: Hashable, Encodable, Decodable {
|
||||||
var count: Int = 0
|
var count: Int = 0
|
||||||
var timestamp: Double = 0.0
|
var timestamp: Double = 0.0
|
||||||
|
var forceHighScoreOverride = false
|
||||||
static func == (lhs: Override, rhs: Override) -> Bool {
|
static func == (lhs: Override, rhs: Override) -> Bool {
|
||||||
lhs.count == rhs.count && lhs.timestamp == rhs.timestamp
|
lhs.count == rhs.count && lhs.timestamp == rhs.timestamp
|
||||||
}
|
}
|
||||||
|
@ -242,11 +115,12 @@ extension vChewing.LMUserOverride {
|
||||||
hasher.combine(overrides)
|
hasher.combine(overrides)
|
||||||
}
|
}
|
||||||
|
|
||||||
mutating func update(candidate: String, timestamp: Double) {
|
mutating func update(candidate: String, timestamp: Double, forceHighScoreOverride: Bool = false) {
|
||||||
count += 1
|
count += 1
|
||||||
if overrides.keys.contains(candidate) {
|
if overrides.keys.contains(candidate) {
|
||||||
overrides[candidate]?.timestamp = timestamp
|
overrides[candidate]?.timestamp = timestamp
|
||||||
overrides[candidate]?.count += 1
|
overrides[candidate]?.count += 1
|
||||||
|
overrides[candidate]?.forceHighScoreOverride = forceHighScoreOverride
|
||||||
} else {
|
} else {
|
||||||
overrides[candidate] = .init(count: 1, timestamp: timestamp)
|
overrides[candidate] = .init(count: 1, timestamp: timestamp)
|
||||||
}
|
}
|
||||||
|
@ -331,4 +205,192 @@ extension vChewing.LMUserOverride {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The outcome of querying the user override model for one observation key.
public struct Suggestion {
  /// Suggested candidates, each paired with the head reading it was fetched for.
  var candidates: [(String, Megrez.Unigram)] = []
  /// Copied from the stored override that yielded the most recently appended candidate.
  var forceHighScoreOverride: Bool = false
  /// `true` when the query produced no candidate at all.
  var isEmpty: Bool {
    candidates.isEmpty
  }
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - Array Extensions.
|
||||||
|
|
||||||
|
/// Helpers for an array of walked nodes.
extension Array where Element == Megrez.Compositor.Node {
  /// The sum of `keyArray.count` over every node in the array.
  public var totalReadingsCount: Int {
    reduce(0) { $0 + $1.keyArray.count }
  }

  /// Finds the node that covers the given cursor position.
  /// - Parameters:
  ///   - cursor: The cursor position; it is clamped into `0...keys.count` before use.
  ///   - outCursorPastNode: On success, receives the accumulated span length up to and
  ///     including the returned node (or `keys.count` when the last node is returned
  ///     by the right-edge shortcut). Left untouched when `nil` is returned.
  /// - Returns: The matching node, or `nil` when the array is empty or no node matches.
  public func findNode(at cursor: Int, target outCursorPastNode: inout Int) -> Megrez.Compositor.Node? {
    guard let headNode = first, let tailNode = last else { return nil }
    // NOTE(review): `keys` is declared outside this view; this assumes its count equals
    // the total number of cursor positions — confirm against its definition.
    let upperBound = keys.count
    let clampedCursor = Swift.max(0, Swift.min(cursor, upperBound))

    if clampedCursor == 0 {
      outCursorPastNode = headNode.spanLength
      return headNode
    }

    // Covers both "cursor at the right edge" and "cursor one step short of the right edge".
    if clampedCursor >= upperBound - 1 {
      outCursorPastNode = upperBound
      return tailNode
    }

    var spanSum = 0
    for node in self {
      spanSum += node.spanLength
      guard spanSum > clampedCursor else { continue }
      outCursorPastNode = spanSum
      return node
    }

    // This is not expected to be reached.
    return nil
  }
}
|
||||||
|
|
||||||
|
// MARK: - Private Methods
|
||||||
|
|
||||||
|
extension vChewing.LMUserOverride {
|
||||||
|
/// Records that `candidate` was chosen under the observation key `key`.
///
/// An existing record for `key` is updated and moved to the head of the LRU list;
/// otherwise a new record is created, and the least recently used record is evicted
/// once the list grows beyond `mutCapacity`. `saveCallback` is invoked after either path.
/// - Parameters:
///   - key: The observation key.
///   - candidate: The value the user picked.
///   - timestamp: The time of the observation.
///   - forceHighScoreOverride: Stored with the override so that later suggestions can
///     request a high-score override.
///   - saveCallback: Called once the in-memory state has been updated.
private func doObservation(
  key: String, candidate: String, timestamp: Double, forceHighScoreOverride: Bool,
  saveCallback: @escaping () -> Void
) {
  // TODO: Reduce disk writes; ideally an existing record is only rewritten after it has decayed.
  if var existingPair = mutLRUMap[key] {
    existingPair.observation.update(
      candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride
    )
    // Remove the stale entry first. Otherwise the list accumulates duplicates of this key,
    // and evicting one of them from the tail later would delete the live map entry.
    mutLRUList.removeAll { $0.key == key }
    mutLRUList.insert(existingPair, at: 0)
    mutLRUMap[key] = existingPair
    IME.prtDebugIntel("UOM: Observation finished with existing observation: \(key)")
    saveCallback()
    return
  }

  var observation: Observation = .init()
  observation.update(candidate: candidate, timestamp: timestamp, forceHighScoreOverride: forceHighScoreOverride)
  let koPair = KeyObservationPair(key: key, observation: observation)
  // Recency is tracked by `mutLRUList` only; the map is used purely for lookup.
  mutLRUMap[key] = koPair
  mutLRUList.insert(koPair, at: 0)

  // Evict the least recently used record. `endIndex` is one past the final element,
  // so the tail has to be read through `last` rather than subscripted with `endIndex`.
  if mutLRUList.count > mutCapacity, let evictedPair = mutLRUList.last {
    mutLRUMap.removeValue(forKey: evictedPair.key)
    mutLRUList.removeLast()
  }
  IME.prtDebugIntel("UOM: Observation finished with new observation: \(key)")
  saveCallback()
}
|
||||||
|
|
||||||
|
/// Looks up the record stored under `key` and turns its overrides into a suggestion.
///
/// Overrides are visited in the dictionary's iteration order. An override is appended only
/// when its score is non-zero and strictly higher than every score seen so far, so the last
/// appended candidate carries the highest score.
/// - Parameters:
///   - key: The observation key; an empty key yields an empty suggestion.
///   - timestamp: The current time, used for computing the decayed score.
///   - headReading: The reading paired with every returned candidate.
/// - Returns: The collected candidates plus the `forceHighScoreOverride` flag of the last
///   appended one; an empty suggestion when nothing is stored under `key`.
private func getSuggestion(key: String, timestamp: Double, headReading: String) -> Suggestion {
  if key.isEmpty { return .init() }
  guard let storedPair = mutLRUMap[key] else { return .init() }
  let observation: Observation = storedPair.observation
  var result = Suggestion()
  var bestScore: Double = 0
  for (candidateValue, override) in observation.overrides {
    let currentScore = getScore(
      eventCount: override.count, totalCount: observation.count,
      eventTimestamp: override.timestamp, timestamp: timestamp, lambda: mutDecayExponent
    )
    guard currentScore != 0.0, currentScore > bestScore else { continue }
    result.candidates.append((headReading, Megrez.Unigram(value: candidateValue, score: currentScore)))
    result.forceHighScoreOverride = override.forceHighScoreOverride
    bestScore = currentScore
  }
  return result
}
|
||||||
|
|
||||||
|
/// Computes the decayed score of one override.
/// - Parameters:
///   - eventCount: How many times this override was observed.
///   - totalCount: How many observations the record holds in total.
///   - eventTimestamp: When this override was last observed.
///   - timestamp: The current time.
///   - lambda: The decay exponent multiplied with the elapsed time.
/// - Returns: `eventCount / totalCount` scaled by the decay factor, or `0.0` once the
///   decay factor has dropped below `kDecayThreshold`.
private func getScore(
  eventCount: Int,
  totalCount: Int,
  eventTimestamp: Double,
  timestamp: Double,
  lambda: Double
) -> Double {
  let elapsed = timestamp - eventTimestamp
  let decayFactor = exp(elapsed * lambda)
  let probability = Double(eventCount) / Double(totalCount)
  return decayFactor < kDecayThreshold ? 0.0 : probability * decayFactor
}
|
||||||
|
|
||||||
|
/// Tells whether the first non-empty key of `node` begins with an underscore.
/// - Parameter node: The node whose `keyArray` is inspected.
/// - Returns: `true` if that key starts with "_"; `false` otherwise, including when
///   every key is empty.
private static func isPunctuation(_ node: Megrez.Compositor.Node) -> Bool {
  guard let leadingKey = node.keyArray.first(where: { !$0.isEmpty }),
    let leadingCharacter = leadingKey.first
  else { return false }
  return String(leadingCharacter) == "_"
}
|
||||||
|
|
||||||
|
/// Builds the observation key for the node that ends at or beyond `headIndex`.
///
/// The key has the shape `(anterior,previous,currentReading)`: the head node contributes
/// only its reading key, while the two nodes before it contribute their `toNGramKey`.
/// A preceding node is used as context only when its own pair passes validation;
/// otherwise its slot keeps the default (empty) pair.
/// - Parameters:
///   - walkedNodes: The walked nodes to read from.
///   - cursorIndex: Nodes are collected from the front until their accumulated span
///     length reaches this value; the last collected node becomes the head node.
///   - readingOnly: When `true`, only the head node's reading key is returned.
/// - Returns: The key, or an empty string when no key should be formed.
private static func formObservationKey(
  walkedNodes: [Megrez.Compositor.Node], headIndex cursorIndex: Int, readingOnly: Bool = false
) -> String {
  let whiteList = "你他妳她祢衪它牠再在"

  // A pair is usable only if its key carries no underscore and its number of
  // "-"-separated readings equals the number of characters in its value.
  func isUsable(_ pair: Megrez.KeyValuePaired) -> Bool {
    !pair.key.contains("_") && pair.key.split(separator: "-").count == pair.value.count
  }

  var arrNodes: [Megrez.Compositor.Node] = []
  var intLength = 0
  for theNodeAnchor in walkedNodes {
    arrNodes.append(theNodeAnchor)
    intLength += theNodeAnchor.spanLength
    if intLength >= cursorIndex { break }
  }
  guard !arrNodes.isEmpty else { return "" }

  // Head node first, then the nodes before it.
  arrNodes.reverse()

  let kvCurrent = arrNodes[0].currentPair
  guard isUsable(kvCurrent) else { return "" }
  let strCurrent = kvCurrent.key

  // Validate the actual pair of each context node. Checking the placeholder instead
  // (as was done before) always passes and lets mismatched pairs into the key.
  var kvPrevious = Megrez.KeyValuePaired()
  if arrNodes.count >= 2 {
    let maybePrevious = arrNodes[1].currentPair
    if isUsable(maybePrevious) { kvPrevious = maybePrevious }
  }

  var kvAnterior = Megrez.KeyValuePaired()
  if arrNodes.count >= 3 {
    let maybeAnterior = arrNodes[2].currentPair
    if isUsable(maybeAnterior) { kvAnterior = maybeAnterior }
  }

  // Do not record a key headed by a single-character value without a valid previous
  // node (unless whitelisted); doing so would hurt the typing experience too much.
  if !kvPrevious.isValid, kvCurrent.value.count == 1, !whiteList.contains(kvCurrent.value) {
    return ""
  }

  if readingOnly { return strCurrent }
  return "(\(kvAnterior.toNGramKey),\(kvPrevious.toNGramKey),\(strCurrent))"
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue