From 502644af9cd286cf869744f10f63eba6eef206f7 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Fri, 28 Jan 2022 12:22:40 +0800 Subject: [PATCH] LMConsolidator // Module Implementation. - We aren't like those cowards living in the upstream who prefer to make LM modules "tolerant". We actively consolidate user-editable files to fix common user-generated mistakes and duplicated entries. - The LMConsolidator has an independent EOF fixer and a comprehensive Content-Consolidator. The Content-Consolidator receives a parameter to decide whether it should sort the contents in the language model file, 'cause some users may prefer their own content sequences in their editable language model files. - We don't introduce HYPY2BPMF conversion module this time until we can find a good solution. --- Source/Engine/LanguageModel/LMConsolidator.h | 32 +++++++ Source/Engine/LanguageModel/LMConsolidator.mm | 87 +++++++++++++++++++ .../LanguageModel/PhraseReplacementMap.mm | 20 +---- Source/Engine/LanguageModel/UserPhrasesLM.mm | 19 +--- vChewing.xcodeproj/project.pbxproj | 6 ++ 5 files changed, 132 insertions(+), 32 deletions(-) create mode 100644 Source/Engine/LanguageModel/LMConsolidator.h create mode 100644 Source/Engine/LanguageModel/LMConsolidator.mm diff --git a/Source/Engine/LanguageModel/LMConsolidator.h b/Source/Engine/LanguageModel/LMConsolidator.h new file mode 100644 index 00000000..6ea8faea --- /dev/null +++ b/Source/Engine/LanguageModel/LMConsolidator.h @@ -0,0 +1,32 @@ +/* + * LMConsolidator.h + * vChewing-Specific module for Consolidating Language Model Data files. + * Copyright 2021-2022 vChewing Project (3-Clause BSD License). + * Some rights reserved. See "LICENSE.TXT" for details. + */ + +#ifndef LMConsolidator_hpp +#define LMConsolidator_hpp + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +namespace vChewing { + +class LMConsolidator +{ +public: + static bool FixEOF(const char *path); + static bool ConsolidateContent(const char *path, bool shouldsort); +}; + +} // namespace vChewing +#endif /* LMConsolidator_hpp */ diff --git a/Source/Engine/LanguageModel/LMConsolidator.mm b/Source/Engine/LanguageModel/LMConsolidator.mm new file mode 100644 index 00000000..bfba1f80 --- /dev/null +++ b/Source/Engine/LanguageModel/LMConsolidator.mm @@ -0,0 +1,87 @@ +/* + * LMConsolidator.mm + * vChewing-Specific module for Consolidating Language Model Data files. + * Copyright 2021-2022 vChewing Project (3-Clause BSD License). + * Some rights reserved. See "LICENSE.TXT" for details. + */ + +#include "LMConsolidator.h" + +namespace vChewing { + +// EOF FIXER. CREDIT: Shiki Suen. +bool LMConsolidator::FixEOF(const char *path) +{ + std::fstream zfdEOFFixerIncomingStream(path); + zfdEOFFixerIncomingStream.seekg(-1,std::ios_base::end); + char z; + zfdEOFFixerIncomingStream.get(z); + if(z!='\n'){ + syslog(LOG_CONS, "// REPORT: Data File not ended with a new line.\n"); + syslog(LOG_CONS, "// DATA FILE: %s", path); + syslog(LOG_CONS, "// PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n"); + std::ofstream zfdEOFFixerOutput(path, std::ios_base::app); + zfdEOFFixerOutput << std::endl; + zfdEOFFixerOutput.close(); + if (zfdEOFFixerOutput.fail()) { + syslog(LOG_CONS, "// REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n"); + syslog(LOG_CONS, "// DATA FILE: %s", path); + return false; + } + } + zfdEOFFixerIncomingStream.close(); + if (zfdEOFFixerIncomingStream.fail()) { + syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for EOF check. Insufficient Privileges?\n"); + syslog(LOG_CONS, "// DATA FILE: %s", path); + return false; + } + return true; +} // END: EOF FIXER. + +// CONTENT CONSOLIDATOR. CREDIT: Shiki Suen. +bool LMConsolidator::ConsolidateContent(const char *path, bool shouldsort) { + ifstream zfdContentConsolidatorIncomingStream(path); + vectorvecEntry; + while(!zfdContentConsolidatorIncomingStream.eof()) + { // Xcode 13 能用的 ObjCpp 與 Cpp 並無原生支援「\h」這個 Regex 參數的能力,只能逐行處理。 + string zfdBuffer; + getline(zfdContentConsolidatorIncomingStream,zfdBuffer); + vecEntry.push_back(zfdBuffer); + } + // 第一遍 for 用來統整每行內的內容。 + regex sedCJKWhiteSpace(" "), sedWhiteSpace("\\s+"), sedLeadingSpace("^\\s"), sedTrailingSpace("\\s$"); // RegEx 先定義好。 + for(int i=0;i #include "KeyValueBlobReader.h" #include "PhraseReplacementMap.h" +#include "LMConsolidator.h" namespace vChewing { @@ -38,22 +39,9 @@ bool PhraseReplacementMap::open(const char *path) if (data) { return false; } - - std::fstream zfd(path); - zfd.seekg(-1,std::ios_base::end); - char z; - zfd.get(z); - if(z!='\n'){ - syslog(LOG_CONS, "REPORT: Phrase Replacement Map File is not ended with a new line.\n"); - syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n"); - std::ofstream zfdo(path, std::ios_base::app); - zfdo << std::endl; - zfdo.close(); - if (zfdo.fail()) { - syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n"); - return false; - } - } + + LMConsolidator::FixEOF(path); + LMConsolidator::ConsolidateContent(path, false); fd = ::open(path, O_RDONLY); if (fd == -1) { diff --git a/Source/Engine/LanguageModel/UserPhrasesLM.mm b/Source/Engine/LanguageModel/UserPhrasesLM.mm index a57dacc3..fe94a305 100644 --- a/Source/Engine/LanguageModel/UserPhrasesLM.mm +++ b/Source/Engine/LanguageModel/UserPhrasesLM.mm @@ -13,7 +13,7 @@ #include #include #include - +#include "LMConsolidator.h" #include "KeyValueBlobReader.h" namespace vChewing { @@ -38,21 +38,8 @@ bool UserPhrasesLM::open(const char *path) return false; } - std::fstream zfd(path); - zfd.seekg(-1,std::ios_base::end); - char z; - zfd.get(z); - if(z!='\n'){ - syslog(LOG_CONS, "REPORT: User Phrase Data File is not ended with a new line.\n"); - syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n"); - std::ofstream zfdo(path, std::ios_base::app); - zfdo << std::endl; - zfdo.close(); - if (zfdo.fail()) { - syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n"); - return false; - } - } + LMConsolidator::FixEOF(path); + LMConsolidator::ConsolidateContent(path, false); fd = ::open(path, O_RDONLY); if (fd == -1) { diff --git a/vChewing.xcodeproj/project.pbxproj b/vChewing.xcodeproj/project.pbxproj index 120bdeba..d171ee00 100644 --- a/vChewing.xcodeproj/project.pbxproj +++ b/vChewing.xcodeproj/project.pbxproj @@ -20,6 +20,7 @@ 5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; }; 5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */; }; 5B6797B52794822C004AC7CE /* PhraseReplacementMap.h in Sources */ = {isa = PBXBuildFile; fileRef = 5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */; }; + 5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */; }; 5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */; }; 5BC2D28B2793B8FB002C0BEC /* EmacsKeyHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28A2793B8FB002C0BEC /* EmacsKeyHelper.swift */; }; 5BC2D28D2793B98F002C0BEC /* PreferencesModule.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28C2793B98F002C0BEC /* PreferencesModule.swift */; }; @@ -118,6 +119,8 @@ 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = UserPhrasesLM.mm; sourceTree = ""; }; 5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PhraseReplacementMap.h; sourceTree = ""; }; 5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = PhraseReplacementMap.mm; sourceTree = ""; }; + 5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LMConsolidator.mm; sourceTree = ""; }; + 5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LMConsolidator.h; sourceTree = ""; }; 5B9781D32763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/InfoPlist.strings"; sourceTree = ""; }; 5B9781D52763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/Localizable.strings"; sourceTree = ""; }; 5B9781D72763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/InfoPlist.strings"; sourceTree = ""; }; @@ -261,6 +264,8 @@ 5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */, 5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */, 5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */, + 5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */, + 5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */, ); path = LanguageModel; sourceTree = ""; @@ -734,6 +739,7 @@ 5B217128279BB22700F91A2B /* frmAboutWindow.swift in Sources */, 5BD13F482794F0A6000E429F /* PhraseReplacementMap.mm in Sources */, 5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */, + 5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */, 5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */, 5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */, 5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */,