實作簡單的用戶選字記憶模型

這個模型基本上只是根據游標前的兩個 unigram 記憶當前的用戶選字。當同一個
情境下有超過一個用戶選字時,則要給每個選字評分。評分標準是選字頻率乘上一
個衰減係數,該係數依距離最近一次選字所經過的時間、按半衰期遞減。如此一來我
們在「少用但最近選過」及「常用但最近少選」之間取得一個平衡。半衰期透過經驗
法則決定。

目前這個簡易模型並不存入磁碟,因此下一次重開機後就會洗掉重來。目前這樣選
擇純粹是因為模型有半衰期,因此長時間存放後還是會遺忘。

這個模型的好處是對既有詞庫提供詞的影響很小,對於連續單字詞的 override 有
還不錯的幫助。如此對於人名、地名、公司名等專有名詞,應該可以減少選字的頻
率。這個模型應用起來的缺點是,如果用戶修改的字詞原來是個雙字詞,例如先前
的兩個 unigram 分別是 A, BB ,而用戶想改的是 BB 的第二個字,使選完後的三
個字分別是 A, B', C,這個 C 往往是記不起來的,但如果一開始用戶逐字選取,
亦即在 BB 只出現 B 時就選取 B' 然後再打 C ,則 A, B', C 這個組合往往能被
正確記憶。實際發生原因在此不討論,但跟底層所用的組字網架的架構有關。確實
要改進的話得要從底層重新架構來下手,但至少目前這個模型給的建議偏保守,不
至干擾原有的預設選字。衡諸得失,這個模型提供一些邊際上的改善,應該還是值
得採用的。
This commit is contained in:
ovadmin 2017-09-30 11:03:04 +08:00 committed by Lukhnos Liu
parent a17438b67a
commit d672136843
3 changed files with 291 additions and 0 deletions

View File

@ -49,6 +49,7 @@
6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */ = {isa = PBXBuildFile; fileRef = 6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */; }; 6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */ = {isa = PBXBuildFile; fileRef = 6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */; };
D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; }; D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; };
D48550A325EBE689006A204C /* OpenCC in Frameworks */ = {isa = PBXBuildFile; productRef = D48550A225EBE689006A204C /* OpenCC */; }; D48550A325EBE689006A204C /* OpenCC in Frameworks */ = {isa = PBXBuildFile; productRef = D48550A225EBE689006A204C /* OpenCC */; };
6AE30A491F7F40B7008735BD /* UserOverrideModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */ /* Begin PBXContainerItemProxy section */
@ -211,6 +212,8 @@
6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = OVNonModalAlertWindowController.m; sourceTree = "<group>"; }; 6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = OVNonModalAlertWindowController.m; sourceTree = "<group>"; };
D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = "<group>"; }; D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = "<group>"; };
D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = "<group>"; }; D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = "<group>"; };
6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UserOverrideModel.cpp; sourceTree = "<group>"; };
6AE30A481F7F40B7008735BD /* UserOverrideModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UserOverrideModel.h; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
@ -286,6 +289,10 @@
6A0D4ECC15FC0D6400ABF4B3 /* PreferencesWindowController.m */, 6A0D4ECC15FC0D6400ABF4B3 /* PreferencesWindowController.m */,
D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */, D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */,
D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */, D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */,
6A0D4ECD15FC0D6400ABF4B3 /* UpdateNotificationController.h */,
6A0D4ECE15FC0D6400ABF4B3 /* UpdateNotificationController.m */,
6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */,
6AE30A481F7F40B7008735BD /* UserOverrideModel.h */,
); );
path = Source; path = Source;
sourceTree = "<group>"; sourceTree = "<group>";
@ -647,6 +654,7 @@
6A0D4F0015FC0DA600ABF4B3 /* VTHorizontalCandidateView.m in Sources */, 6A0D4F0015FC0DA600ABF4B3 /* VTHorizontalCandidateView.m in Sources */,
6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */, 6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */,
6A0D4F0115FC0DA600ABF4B3 /* VTVerticalCandidateController.m in Sources */, 6A0D4F0115FC0DA600ABF4B3 /* VTVerticalCandidateController.m in Sources */,
6AE30A491F7F40B7008735BD /* UserOverrideModel.cpp in Sources */,
6A0D4F0215FC0DA600ABF4B3 /* VTVerticalCandidateTableView.m in Sources */, 6A0D4F0215FC0DA600ABF4B3 /* VTVerticalCandidateTableView.m in Sources */,
6A0D4F0315FC0DA600ABF4B3 /* VTVerticalKeyLabelStripView.m in Sources */, 6A0D4F0315FC0DA600ABF4B3 /* VTVerticalKeyLabelStripView.m in Sources */,
D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */, D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */,

View File

@ -0,0 +1,202 @@
//
// UserOverrideModel.cpp
//
// Copyright (c) 2017 The McBopomofo Project.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#include "UserOverrideModel.h"
#include <cassert>
#include <cmath>
#include <sstream>
using namespace McBopomofo;
// Smallest decay factor that still counts. The value is 2^-20
// (1 / 1048576): Score() returns 0 for any override whose decay factor
// exp(elapsed * lambda) has dropped below it. With the exponent the model
// passes in (log(0.5) / decayConstant) that corresponds to about 20
// half-lives ("generations") since the override was last observed.
// NOTE(review): the identifier is misspelled ("Threshould"); renaming it
// requires updating its use inside Score() in the same change.
static const double DecayThreshould = 1.0 / 1048576.0;
// Scores one override: its share of all observations for the key
// (eventCount / totalCount) multiplied by exp((timestamp - eventTimestamp)
// * lambda). Yields 0.0 once the decay factor is below DecayThreshould.
static double Score(size_t eventCount,
size_t totalCount,
double eventTimestamp,
double timestamp,
double lambda);
// Builds the string key that identifies the context at the cursor from the
// walked nodes: the node reaching the cursor plus up to two nodes before it.
// Returns an empty string when walkedNodes is empty.
static string WalkedNodesToKey(const std::vector<NodeAnchor>& walkedNodes,
size_t cursorIndex);
// Creates a model that remembers at most `capacity` distinct context keys.
// `decayConstant` is the half-life of an observation, expressed in the same
// unit as the timestamps later passed to observe() and suggest(): the stored
// exponent is log(0.5) / decayConstant, so exp(elapsed * exponent) halves
// every `decayConstant` units of elapsed time.
// `capacity` must be greater than zero (checked with assert).
UserOverrideModel::UserOverrideModel(size_t capacity, double decayConstant)
    : m_capacity(capacity),
      m_decayExponent(log(0.5) / decayConstant) {
    assert(m_capacity > 0);
}
void UserOverrideModel::observe(const std::vector<NodeAnchor>& walkedNodes,
size_t cursorIndex,
const string& candidate,
double timestamp) {
string key = WalkedNodesToKey(walkedNodes, cursorIndex);
auto mapIter = m_lruMap.find(key);
if (mapIter == m_lruMap.end()) {
auto keyValuePair = KeyObservationPair(key, Observation());
Observation& observation = keyValuePair.second;
observation.update(candidate, timestamp);
m_lruList.push_front(keyValuePair);
auto listIter = m_lruList.begin();
auto lruKeyValue = std::pair<std::string,
std::list<KeyObservationPair>::iterator>(key, listIter);
m_lruMap.insert(lruKeyValue);
if (m_lruList.size() > m_capacity) {
auto lastKeyValuePair = m_lruList.end();
--lastKeyValuePair;
m_lruMap.erase(lastKeyValuePair->first);
m_lruList.pop_back();
}
} else {
auto listIter = mapIter->second;
m_lruList.splice(m_lruList.begin(), m_lruList, listIter);
auto& keyValuePair = *listIter;
Observation& observation = keyValuePair.second;
observation.update(candidate, timestamp);
}
}
// Returns the remembered candidate with the highest score for the context
// identified by `walkedNodes` and `cursorIndex`, evaluated at `timestamp`.
//
// Returns an empty string when the context has never been observed or when
// no override has a positive score (for example, all have decayed to zero).
// This lookup does not change the LRU order.
string UserOverrideModel::suggest(const std::vector<NodeAnchor>& walkedNodes,
                                  size_t cursorIndex,
                                  double timestamp) {
    const string key = WalkedNodesToKey(walkedNodes, cursorIndex);
    const auto found = m_lruMap.find(key);
    if (found == m_lruMap.end()) {
        return string();
    }

    const Observation& observation = found->second->second;

    string best;
    double bestScore = 0.0;
    for (const auto& entry : observation.overrides) {
        const Override& stats = entry.second;
        const double current = Score(stats.count,
                                     observation.count,
                                     stats.timestamp,
                                     timestamp,
                                     m_decayExponent);
        // Fully decayed overrides score exactly zero and are never chosen;
        // otherwise only a strictly better score replaces the current best.
        if (current != 0.0 && current > bestScore) {
            best = entry.first;
            bestScore = current;
        }
    }
    return best;
}
void UserOverrideModel::Observation::update(const string& candidate,
double timestamp) {
count++;
auto& o = overrides[candidate];
o.timestamp = timestamp;
o.count++;
}
// Scores a single override.
//
//   eventCount      times this candidate was chosen for the context
//   totalCount      times any candidate was chosen for the context
//   eventTimestamp  when this candidate was last chosen
//   timestamp       the moment the score is evaluated for
//   lambda          decay exponent applied to the elapsed time
//
// The result is (eventCount / totalCount) * exp(elapsed * lambda), or 0.0
// once the decay factor has fallen below DecayThreshould.
static double Score(size_t eventCount,
                    size_t totalCount,
                    double eventTimestamp,
                    double timestamp,
                    double lambda) {
    const double elapsed = timestamp - eventTimestamp;
    const double decay = exp(elapsed * lambda);
    const double share =
        static_cast<double>(eventCount) / static_cast<double>(totalCount);
    return decay < DecayThreshould ? 0.0 : share * decay;
}
// Builds the lookup key for the context at the cursor.
//
// Nodes are taken from the front of `walkedNodes` until their accumulated
// spanningLength reaches `cursorIndex`; the first node is always taken, even
// when `cursorIndex` is 0. The last node taken is the "current" node and the
// two before it are the "previous" and "anterior" nodes.
//
// The key has the form
//     ((anteriorKey,anteriorValue),(prevKey,prevValue),currentKey)
// where a missing previous/anterior node is written as "()". Only the key
// of the current node takes part, not its value.
//
// Returns an empty string when `walkedNodes` is empty.
static string WalkedNodesToKey(const std::vector<NodeAnchor>& walkedNodes,
                               size_t cursorIndex) {
    // Count the leading nodes needed to reach the cursor. The input is
    // indexed directly; no copy of the anchors is made.
    size_t taken = 0;
    size_t covered = 0;
    while (taken < walkedNodes.size()) {
        covered += walkedNodes[taken].spanningLength;
        ++taken;
        if (covered >= cursorIndex) {
            break;
        }
    }

    if (taken == 0) {
        return "";
    }

    // Renders the node `back` positions before the current one as
    // "(key,value)", or "()" when there is no such node.
    auto context = [&walkedNodes, taken](size_t back) -> std::string {
        if (back >= taken) {
            return "()";
        }
        const NodeAnchor& anchor = walkedNodes[taken - 1 - back];
        const auto& keyValue = anchor.node->currentKeyValue();
        std::stringstream part;
        part << "(" << keyValue.key << "," << keyValue.value << ")";
        return part.str();
    };

    const string current = walkedNodes[taken - 1].node->currentKeyValue().key;
    const std::string prev = context(1);
    const std::string anterior = context(2);

    std::stringstream key;
    key << "(" << anterior << "," << prev << "," << current << ")";
    return key.str();
}

View File

@ -0,0 +1,81 @@
//
// UserOverrideModel.h
//
// Copyright (c) 2017 The McBopomofo Project.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#ifndef USEROVERRIDEMODEL_H
#define USEROVERRIDEMODEL_H
#include <list>
#include <map>
#include <string>
#include "Gramambular.h"
namespace McBopomofo {
using namespace Formosa::Gramambular;
// In-memory model that remembers which candidate the user picked in a given
// context and suggests it again later.
//
// A context is a string key derived from the walked nodes around the cursor.
// For each key the model keeps how often, and how recently, each candidate
// was chosen. The number of keys is bounded: once it exceeds the capacity,
// the least recently observed key is discarded. Nothing in this class
// writes the model to storage.
class UserOverrideModel {
public:
// capacity: maximum number of context keys kept; must be greater than zero
// (asserted in the constructor).
// decayConstant: half-life of an observation, in the same unit as the
// timestamps passed to observe() and suggest().
// NOTE(review): the timestamp unit is not fixed here (presumably seconds) —
// confirm against callers.
UserOverrideModel(size_t capacity, double decayConstant);
// Records that `candidate` was chosen at `timestamp` for the context given
// by `walkedNodes` and `cursorIndex`, and marks that context as the most
// recently used one.
void observe(const std::vector<NodeAnchor>& walkedNodes,
size_t cursorIndex,
const string& candidate,
double timestamp);
// Returns the best-scoring remembered candidate for the context given by
// `walkedNodes` and `cursorIndex` as of `timestamp`. Returns an empty
// string when the context is unknown or no candidate has a positive score.
string suggest(const std::vector<NodeAnchor>& walkedNodes,
size_t cursorIndex,
double timestamp);
private:
// Statistics for one candidate within one context.
struct Override {
// Number of times this candidate was chosen.
size_t count;
// Timestamp of the most recent time it was chosen.
double timestamp;
Override() : count(0), timestamp(0.0) {}
};
// Everything remembered for one context key.
struct Observation {
// Total number of selections recorded for this context, across all
// candidates.
size_t count;
// Per-candidate statistics, keyed by the candidate string.
std::map<std::string, Override> overrides;
Observation() : count(0) {}
// Adds one selection of `candidate` made at `timestamp`.
void update(const string& candidate, double timestamp);
};
// A context key paired with its observation; the element type of the LRU
// list.
typedef std::pair<std::string, Observation> KeyObservationPair;
// Maximum number of context keys kept.
size_t m_capacity;
// log(0.5) / decayConstant; multiplied by elapsed time inside exp() to get
// the decay factor.
double m_decayExponent;
// Contexts ordered from most recently observed (front) to least (back).
std::list<KeyObservationPair> m_lruList;
// Context key -> position of that context inside m_lruList.
std::map<std::string, std::list<KeyObservationPair>::iterator> m_lruMap;
};
}; // namespace McBopomofo
#endif