LMConsolidator // Module Implementation.

- We aren't like those cowards living in the upstream who prefer to make LM modules "tolerant". We actively consolidate user-editable files to fix common user-generated mistakes and duplicated entries.
- The LMConsolidator has an independent EOF fixer and a comprehensive Content-Consolidator. The Content-Consolidator takes a parameter that decides whether it should sort the contents of the language model file, because some users may prefer to keep their own entry order in their editable language model files.
- We are not introducing the HYPY2BPMF conversion module this time; it will wait until we can find a good solution.
This commit is contained in:
ShikiSuen 2022-01-28 12:22:40 +08:00
parent ecee3ad7bf
commit 502644af9c
5 changed files with 132 additions and 32 deletions

View File

@ -0,0 +1,32 @@
/*
* LMConsolidator.h
* vChewing-Specific module for Consolidating Language Model Data files.
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
* Some rights reserved. See "LICENSE.TXT" for details.
*/
#ifndef LMConsolidator_hpp
#define LMConsolidator_hpp
#include <stdio.h>
#include <syslog.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
namespace vChewing {
/// Stateless collection of helpers that tidy up user-editable language model
/// data files on disk. Both helpers are static; no instance is required.
class LMConsolidator {
public:
    /// Makes sure the file at `path` ends with a line feed, appending one
    /// when the last byte is something else.
    ///
    /// @param path Path of the data file to inspect.
    /// @return true when no stream failure was reported; false otherwise.
    static bool FixEOF(const char *path);

    /// Rewrites the file at `path` in place: whitespace inside every line is
    /// normalised, empty lines are dropped and adjacent identical lines are
    /// merged.
    ///
    /// @param path       Path of the data file to consolidate.
    /// @param shouldsort When true the lines are sorted before duplicates are
    ///                   merged; when false the existing line order is kept.
    /// @return true when no stream failure was reported; false otherwise.
    static bool ConsolidateContent(const char *path, bool shouldsort);
};
} // namespace vChewing
#endif /* LMConsolidator_hpp */

View File

@ -0,0 +1,87 @@
/*
* LMConsolidator.mm
* vChewing-Specific module for Consolidating Language Model Data files.
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
* Some rights reserved. See "LICENSE.TXT" for details.
*/
#include "LMConsolidator.h"
namespace vChewing {
// EOF FIXER. CREDIT: Shiki Suen.
/// Ensures that the file at `path` ends with a line feed ('\n'), appending one
/// when the last byte is anything else.
///
/// An empty file contains no unterminated line, so it is left untouched and
/// reported as success. A file that cannot be opened for reading is reported
/// as a failure and is never created as a side effect.
///
/// NOTE(review): LOG_CONS is documented as an openlog() option rather than a
/// syslog() priority; it is kept here so the logging behaviour stays as it
/// was -- confirm the intended priority.
///
/// @param path Path of the data file to check; a null pointer is rejected.
/// @return true when the file is empty, already ends with '\n', or a line feed
///         was appended successfully; false when the file could not be read
///         or could not be appended to.
bool LMConsolidator::FixEOF(const char *path)
{
    if (path == nullptr) {
        return false;
    }

    // Inspecting the last byte only needs read access, so a read-only file
    // that is already well-formed does not count as a failure.
    std::ifstream input(path, std::ios_base::in | std::ios_base::binary);
    if (!input.is_open()) {
        syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for EOF check. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }

    input.seekg(0, std::ios_base::end);
    const std::streamoff fileSize = input.tellg();
    if (fileSize == 0) {
        // Nothing to terminate; seeking to "end - 1" would fail here.
        return true;
    }

    // Initialised so that a failed read can never be compared as garbage.
    char lastByte = '\0';
    if (fileSize > 0) {
        input.seekg(-1, std::ios_base::end);
        input.get(lastByte);
    }
    if (fileSize < 0 || input.fail()) {
        syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for EOF check. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }
    input.close();

    if (lastByte == '\n') {
        return true;
    }

    syslog(LOG_CONS, "// REPORT: Data File not ended with a new line.\n");
    syslog(LOG_CONS, "// DATA FILE: %s", path);
    syslog(LOG_CONS, "// PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");

    std::ofstream output(path, std::ios_base::app);
    output << '\n';
    output.close(); // Flushes; a failed open or write surfaces through fail().
    if (output.fail()) {
        syslog(LOG_CONS, "// REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }
    return true;
} // END: EOF FIXER.
// CONTENT CONSOLIDATOR. CREDIT: Shiki Suen.
/// Rewrites the data file at `path` in place with its content consolidated.
///
/// Processing steps:
///   1. Every line is read into memory. If the file cannot be opened or read,
///      the function returns before the file is touched.
///   2. Inside each line, ideographic spaces (U+3000) become ASCII spaces,
///      every run of whitespace collapses into one ASCII space, and the
///      leading / trailing space is removed.
///   3. Lines that are empty after step 2 are dropped.
///   4. Optionally the lines are sorted, then adjacent identical lines are
///      merged into one.
///   5. The file is truncated and the remaining lines are written back, each
///      terminated by '\n'.
///
/// NOTE(review): with `shouldsort == false` only *adjacent* duplicates are
/// merged; identical lines separated by other entries are all kept. Which
/// occurrence should win in that case is not visible here -- confirm before
/// making the de-duplication order-independent.
///
/// NOTE(review): LOG_CONS is documented as an openlog() option rather than a
/// syslog() priority; it is kept so the logging behaviour stays as it was.
///
/// @param path       Path of the data file; a null pointer is rejected.
/// @param shouldsort When true the lines are sorted before duplicates are
///                   merged; when false the existing line order is kept.
/// @return true when the file was read and rewritten successfully; false when
///         it could not be read (file left untouched) or written.
bool LMConsolidator::ConsolidateContent(const char *path, bool shouldsort) {
    if (path == nullptr) {
        return false;
    }

    // Read phase. The stream is scoped so that it is closed before the same
    // path gets reopened for writing.
    std::vector<std::string> entries;
    {
        std::ifstream input(path);
        if (!input.is_open()) {
            // Bail out early: looping on !eof() would never end on a stream
            // that failed to open.
            syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for content-consolidation. Insufficient Privileges?\n");
            syslog(LOG_CONS, "// DATA FILE: %s", path);
            return false;
        }
        std::string line;
        while (std::getline(input, line)) {
            entries.push_back(line);
        }
        // Reaching end-of-file sets failbit as well, so only badbit counts as
        // a real read error.
        if (input.bad()) {
            syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for content-consolidation. Insufficient Privileges?\n");
            syslog(LOG_CONS, "// DATA FILE: %s", path);
            return false;
        }
    }

    // Per-line normalisation. The whole file cannot be handled by a single
    // expression because the regex engine lacks "\h", hence line by line.
    // The ideographic space is spelled as explicit UTF-8 bytes so the pattern
    // does not depend on the encoding of this source file.
    const std::regex cjkSpace("\xE3\x80\x80");
    const std::regex whitespaceRun("\\s+");
    const std::regex leadingSpace("^\\s");
    const std::regex trailingSpace("\\s$");
    for (std::string &entry : entries) {
        if (entry.empty()) {
            continue;
        }
        // Order matters: convert ideographic spaces first, then collapse runs
        // (tabs included), and only then trim both ends.
        entry = std::regex_replace(entry, cjkSpace, " ");
        entry = std::regex_replace(entry, whitespaceRun, " ");
        entry = std::regex_replace(entry, leadingSpace, "");
        entry = std::regex_replace(entry, trailingSpace, "");
    }

    // Drop blank lines before de-duplicating; otherwise a blank line sitting
    // between two identical entries would hide them from std::unique and the
    // duplicates would end up adjacent in the output.
    entries.erase(std::remove_if(entries.begin(), entries.end(),
                                 [](const std::string &entry) { return entry.empty(); }),
                  entries.end());

    // Sorting is optional because some users keep their own entry order.
    if (shouldsort) {
        std::sort(entries.begin(), entries.end());
    }
    // Merge adjacent identical lines (this is de-duplication, not sorting).
    entries.erase(std::unique(entries.begin(), entries.end()), entries.end());

    // Write phase: the file is rewritten from scratch, so no append mode.
    std::ofstream output(path);
    for (const std::string &entry : entries) {
        output << entry << '\n'; // One entry per line; no per-line flush.
    }
    output.close(); // Flushes; a failed open or write surfaces through fail().
    if (output.fail()) {
        syslog(LOG_CONS, "// REPORT: Failed to write content-consolidated data to the file. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }
    return true;
} // END: CONTENT CONSOLIDATOR.
} // namespace vChewing

View File

@ -14,6 +14,7 @@
#include <unistd.h>
#include "KeyValueBlobReader.h"
#include "PhraseReplacementMap.h"
#include "LMConsolidator.h"
namespace vChewing {
@ -38,22 +39,9 @@ bool PhraseReplacementMap::open(const char *path)
if (data) {
return false;
}
std::fstream zfd(path);
zfd.seekg(-1,std::ios_base::end);
char z;
zfd.get(z);
if(z!='\n'){
syslog(LOG_CONS, "REPORT: Phrase Replacement Map File is not ended with a new line.\n");
syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");
std::ofstream zfdo(path, std::ios_base::app);
zfdo << std::endl;
zfdo.close();
if (zfdo.fail()) {
syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
return false;
}
}
LMConsolidator::FixEOF(path);
LMConsolidator::ConsolidateContent(path, false);
fd = ::open(path, O_RDONLY);
if (fd == -1) {

View File

@ -13,7 +13,7 @@
#include <fstream>
#include <unistd.h>
#include <syslog.h>
#include "LMConsolidator.h"
#include "KeyValueBlobReader.h"
namespace vChewing {
@ -38,21 +38,8 @@ bool UserPhrasesLM::open(const char *path)
return false;
}
std::fstream zfd(path);
zfd.seekg(-1,std::ios_base::end);
char z;
zfd.get(z);
if(z!='\n'){
syslog(LOG_CONS, "REPORT: User Phrase Data File is not ended with a new line.\n");
syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");
std::ofstream zfdo(path, std::ios_base::app);
zfdo << std::endl;
zfdo.close();
if (zfdo.fail()) {
syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
return false;
}
}
LMConsolidator::FixEOF(path);
LMConsolidator::ConsolidateContent(path, false);
fd = ::open(path, O_RDONLY);
if (fd == -1) {

View File

@ -20,6 +20,7 @@
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */; };
5B6797B52794822C004AC7CE /* PhraseReplacementMap.h in Sources */ = {isa = PBXBuildFile; fileRef = 5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */; };
5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */; };
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */; };
5BC2D28B2793B8FB002C0BEC /* EmacsKeyHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28A2793B8FB002C0BEC /* EmacsKeyHelper.swift */; };
5BC2D28D2793B98F002C0BEC /* PreferencesModule.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28C2793B98F002C0BEC /* PreferencesModule.swift */; };
@ -118,6 +119,8 @@
5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = UserPhrasesLM.mm; sourceTree = "<group>"; };
5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PhraseReplacementMap.h; sourceTree = "<group>"; };
5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = PhraseReplacementMap.mm; sourceTree = "<group>"; };
5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LMConsolidator.mm; sourceTree = "<group>"; };
5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LMConsolidator.h; sourceTree = "<group>"; };
5B9781D32763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
5B9781D52763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
5B9781D72763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
@ -261,6 +264,8 @@
5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */,
5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */,
5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */,
5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */,
5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */,
);
path = LanguageModel;
sourceTree = "<group>";
@ -734,6 +739,7 @@
5B217128279BB22700F91A2B /* frmAboutWindow.swift in Sources */,
5BD13F482794F0A6000E429F /* PhraseReplacementMap.mm in Sources */,
5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */,
5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */,
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */,
5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */,
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */,