From d672136843bc5f887adb6ee4bd9e41b27c1aa5a0 Mon Sep 17 00:00:00 2001 From: ovadmin Date: Sat, 30 Sep 2017 11:03:04 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=A6=E4=BD=9C=E7=B0=A1=E5=96=AE=E7=9A=84?= =?UTF-8?q?=E7=94=A8=E6=88=B6=E9=81=B8=E5=AD=97=E8=A8=98=E6=86=B6=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 這個模型基本上只是根據游標前的兩個 unigram 記憶當前的用戶選字。當有超過 一個以上的用戶選字時,則要給每個選字評分。評分標準是選字頻率乘上一個透過 半衰期遞減的最近選字經歷時間。如此一來我們在「少用但最近選過」及「常用但 最近少選」之間取得一個平衡。半衰期透過經驗法則決定。 目前這個簡易模型並不存入磁碟,因此下一次重開機後就會洗掉重來。目前這樣選 擇純粹是因為模型有半衰期,因此長時間存放後還是會遺忘。 這個模型的好處是對既有詞庫提供詞的影響很小,對於連續單字詞的 override 有 還不錯的幫助。如此對於人名、地名、公司名等專有名詞,應該可以減少選字的頻 率。這個模型應用起來的缺點是,如果用戶修改的字詞原來是個雙字詞,例如先前 的兩個 unigram 分別是 A, BB ,而用戶想改的是 BB 的第二個字,使選完後的三 個字分別是 A, B', C,這個 C 往往是記不起來的,但如果一開始用戶逐字選取, 亦即在 BB 只出現 B 時就選取 B' 然後再打 C ,則 A, B', C 這個組合往往能被 正確記憶。實際發生原因在此不討論,但跟底層所用的組字網架的架構有關。確實 要改進的話得要從底層重新架構來下手,但至少目前這個模型給的建議偏保守,不 至干擾原有的預設選字。衡諸得失,這個模型提供一些邊際上的改善,應該還是值 得採用的。 --- McBopomofo.xcodeproj/project.pbxproj | 8 ++ Source/UserOverrideModel.cpp | 202 +++++++++++++++++++++++++++ Source/UserOverrideModel.h | 81 +++++++++++ 3 files changed, 291 insertions(+) create mode 100644 Source/UserOverrideModel.cpp create mode 100644 Source/UserOverrideModel.h diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index 64f7b808..aa27af5b 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -49,6 +49,7 @@ 6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */ = {isa = PBXBuildFile; fileRef = 6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */; }; D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; }; D48550A325EBE689006A204C /* OpenCC in Frameworks */ = {isa = PBXBuildFile; productRef = D48550A225EBE689006A204C /* OpenCC */; }; + 6AE30A491F7F40B7008735BD /* UserOverrideModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -211,6 +212,8 @@ 6AFF97F1253B299E007F1C49 /* OVNonModalAlertWindowController.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = OVNonModalAlertWindowController.m; sourceTree = ""; }; D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = ""; }; D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = ""; }; + 6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UserOverrideModel.cpp; sourceTree = ""; }; + 6AE30A481F7F40B7008735BD /* UserOverrideModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UserOverrideModel.h; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -286,6 +289,10 @@ 6A0D4ECC15FC0D6400ABF4B3 /* PreferencesWindowController.m */, D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */, D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */, + 6A0D4ECD15FC0D6400ABF4B3 /* UpdateNotificationController.h */, + 6A0D4ECE15FC0D6400ABF4B3 /* UpdateNotificationController.m */, + 6AE30A471F7F40B7008735BD /* UserOverrideModel.cpp */, + 6AE30A481F7F40B7008735BD /* UserOverrideModel.h */, ); path = Source; sourceTree = ""; @@ -647,6 +654,7 @@ 6A0D4F0015FC0DA600ABF4B3 /* VTHorizontalCandidateView.m in Sources */, 6AFF97F3253B299E007F1C49 /* OVNonModalAlertWindowController.m in Sources */, 6A0D4F0115FC0DA600ABF4B3 /* VTVerticalCandidateController.m in Sources */, + 6AE30A491F7F40B7008735BD /* UserOverrideModel.cpp in Sources */, 6A0D4F0215FC0DA600ABF4B3 /* VTVerticalCandidateTableView.m in Sources */, 6A0D4F0315FC0DA600ABF4B3 /* VTVerticalKeyLabelStripView.m in Sources */, D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */, diff --git a/Source/UserOverrideModel.cpp b/Source/UserOverrideModel.cpp new file mode 100644 index 00000000..9f38cf1d --- /dev/null +++ b/Source/UserOverrideModel.cpp @@ -0,0 +1,202 @@ +// +// UserOverrideModel.cpp +// +// Copyright (c) 2017 The McBopomofo Project. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#include "UserOverrideModel.h" + +#include +#include +#include + +using namespace McBopomofo; + +// About 20 generations. +static const double DecayThreshould = 1.0 / 1048576.0; + +static double Score(size_t eventCount, + size_t totalCount, + double eventTimestamp, + double timestamp, + double lambda); +static string WalkedNodesToKey(const std::vector& walkedNodes, + size_t cursorIndex); + +UserOverrideModel::UserOverrideModel(size_t capacity, double decayConstant) + : m_capacity(capacity) { + assert(m_capacity > 0); + m_decayExponent = log(0.5) / decayConstant; +} + +void UserOverrideModel::observe(const std::vector& walkedNodes, + size_t cursorIndex, + const string& candidate, + double timestamp) { + string key = WalkedNodesToKey(walkedNodes, cursorIndex); + auto mapIter = m_lruMap.find(key); + if (mapIter == m_lruMap.end()) { + auto keyValuePair = KeyObservationPair(key, Observation()); + Observation& observation = keyValuePair.second; + observation.update(candidate, timestamp); + + m_lruList.push_front(keyValuePair); + auto listIter = m_lruList.begin(); + auto lruKeyValue = std::pair::iterator>(key, listIter); + m_lruMap.insert(lruKeyValue); + + if (m_lruList.size() > m_capacity) { + auto lastKeyValuePair = m_lruList.end(); + --lastKeyValuePair; + m_lruMap.erase(lastKeyValuePair->first); + m_lruList.pop_back(); + } + } else { + auto listIter = mapIter->second; + m_lruList.splice(m_lruList.begin(), m_lruList, listIter); + + auto& keyValuePair = *listIter; + Observation& observation = keyValuePair.second; + observation.update(candidate, timestamp); + } +} + +string UserOverrideModel::suggest(const std::vector& walkedNodes, + size_t cursorIndex, + double timestamp) { + string key = WalkedNodesToKey(walkedNodes, cursorIndex); + auto mapIter = m_lruMap.find(key); + if (mapIter == m_lruMap.end()) { + return string(); + } + + auto listIter = mapIter->second; + auto& keyValuePair = *listIter; + const Observation& observation = keyValuePair.second; + + string candidate; + double score = 0.0; + for (auto i = observation.overrides.begin(); + i != observation.overrides.end(); + ++i) { + const Override& o = i->second; + double overrideScore = Score(o.count, + observation.count, + o.timestamp, + timestamp, + m_decayExponent); + if (overrideScore == 0.0) { + continue; + } + + if (overrideScore > score) { + candidate = i->first; + score = overrideScore; + } + } + return candidate; +} + +void UserOverrideModel::Observation::update(const string& candidate, + double timestamp) { + count++; + auto& o = overrides[candidate]; + o.timestamp = timestamp; + o.count++; +} + +static double Score(size_t eventCount, + size_t totalCount, + double eventTimestamp, + double timestamp, + double lambda) { + double decay = exp((timestamp - eventTimestamp) * lambda); + if (decay < DecayThreshould) { + return 0.0; + } + + double prob = (double)eventCount / (double)totalCount; + return prob * decay; +} + +static string WalkedNodesToKey(const std::vector& walkedNodes, + size_t cursorIndex) { + std::stringstream s; + std::vector n; + size_t ll = 0; + for (std::vector::const_iterator i = walkedNodes.begin(); + i != walkedNodes.end(); + ++i) { + const auto& nn = *i; + n.push_back(nn); + ll += nn.spanningLength; + if (ll >= cursorIndex) { + break; + } + } + + std::vector::const_reverse_iterator r = n.rbegin(); + + if (r == n.rend()) { + return ""; + } + + string current = (*r).node->currentKeyValue().key; + ++r; + + s.clear(); + s.str(std::string()); + if (r != n.rend()) { + s << "(" + << (*r).node->currentKeyValue().key + << "," + << (*r).node->currentKeyValue().value + << ")"; + ++r; + } else { + s << "()"; + } + string prev = s.str(); + + s.clear(); + s.str(std::string()); + if (r != n.rend()) { + s << "(" + << (*r).node->currentKeyValue().key + << "," + << (*r).node->currentKeyValue().value + << ")"; + ++r; + } else { + s << "()"; + } + string anterior = s.str(); + + s.clear(); + s.str(std::string()); + s << "(" << anterior << "," << prev << "," << current << ")"; + + return s.str(); +} diff --git a/Source/UserOverrideModel.h b/Source/UserOverrideModel.h new file mode 100644 index 00000000..0b981923 --- /dev/null +++ b/Source/UserOverrideModel.h @@ -0,0 +1,81 @@ +// +// UserOverrideModel.h +// +// Copyright (c) 2017 The McBopomofo Project. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef USEROVERRIDEMODEL_H +#define USEROVERRIDEMODEL_H + +#include +#include +#include + +#include "Gramambular.h" + +namespace McBopomofo { + +using namespace Formosa::Gramambular; + +class UserOverrideModel { +public: + UserOverrideModel(size_t capacity, double decayConstant); + + void observe(const std::vector& walkedNodes, + size_t cursorIndex, + const string& candidate, + double timestamp); + + string suggest(const std::vector& walkedNodes, + size_t cursorIndex, + double timestamp); + +private: + struct Override { + size_t count; + double timestamp; + + Override() : count(0), timestamp(0.0) {} + }; + + struct Observation { + size_t count; + std::map overrides; + + Observation() : count(0) {} + void update(const string& candidate, double timestamp); + }; + + typedef std::pair KeyObservationPair; + + size_t m_capacity; + double m_decayExponent; + std::list m_lruList; + std::map::iterator> m_lruMap; +}; + +}; // namespace McBopomofo + +#endif +