LMConsolidator // Module Implementation.
- We aren't like those cowards living in the upstream who prefer to make LM modules "tolerant". We actively consolidate user-editable files to fix common user-generated mistakes and duplicated entries. - The LMConsolidator has an independent EOF fixer and a comprehensive Content-Consolidator. The Content-Consolidator takes a parameter that decides whether it should sort the contents of the language model file, because some users may prefer to keep their own ordering in their editable language model files. - We are not introducing the HYPY2BPMF conversion module this time; it will wait until we can find a good solution.
This commit is contained in:
parent
ecee3ad7bf
commit
502644af9c
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
* LMConsolidator.h
|
||||
* vChewing-Specific module for Consolidating Language Model Data files.
|
||||
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
|
||||
* Some rights reserved. See "LICENSE.TXT" for details.
|
||||
*/
|
||||
|
||||
#ifndef LMConsolidator_hpp
|
||||
#define LMConsolidator_hpp
|
||||
|
||||
#include <syslog.h>
|
||||
#include <stdio.h>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <regex>
|
||||
|
||||
using namespace std;
|
||||
namespace vChewing {
|
||||
|
||||
// Stateless helper for tidying user-editable language model data files.
// Both entry points are static and operate directly on the file at `path`.
class LMConsolidator {
public:
    // Makes sure the file at `path` ends with a newline, appending one when
    // the last byte is something else.
    // Returns false when a read or append failure was detected.
    static bool FixEOF(const char *path);

    // Normalizes the whitespace of every line in the file at `path`, removes
    // repeated and blank lines, and rewrites the file in place.
    // `shouldsort` selects whether the lines are sorted before being written
    // back; see the implementation for the exact de-duplication rules.
    // Returns false when a read or write failure was detected.
    static bool ConsolidateContent(const char *path, bool shouldsort);
};
|
||||
|
||||
} // namespace vChewing
|
||||
#endif /* LMConsolidator_hpp */
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* LMConsolidator.mm
|
||||
* vChewing-Specific module for Consolidating Language Model Data files.
|
||||
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
|
||||
* Some rights reserved. See "LICENSE.TXT" for details.
|
||||
*/
|
||||
|
||||
#include "LMConsolidator.h"
|
||||
|
||||
namespace vChewing {
|
||||
|
||||
// EOF FIXER. CREDIT: Shiki Suen.
|
||||
// Ensures that the data file at `path` ends with a newline character.
//
// The last byte of the file is inspected through a read-only stream; when it
// is not '\n', a single newline is appended. An empty file has no unterminated
// last line and is left untouched.
//
// Parameters:
//   path - path of the data file to check; a null pointer is rejected.
// Returns:
//   true  - the file already ended with a newline, was empty, or a newline
//           was appended successfully.
//   false - the file could not be read, or the newline could not be appended.
//
// NOTE(review): LOG_CONS is documented as an openlog() option rather than a
// syslog() priority; it is kept unchanged here so that log routing stays the
// same as in the rest of the project.
bool LMConsolidator::FixEOF(const char *path)
{
    if (path == nullptr) {
        return false;
    }

    // Shared failure report for every step of the read-only inspection.
    const auto reportReadFailure = [path]() {
        syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for EOF check. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    };

    // Read-only on purpose: std::fstream defaults to in|out, which fails to
    // open files that are readable but not writable.
    std::ifstream input(path, std::ios_base::in | std::ios_base::binary);
    if (!input.is_open()) {
        return reportReadFailure();
    }

    input.seekg(0, std::ios_base::end);
    const std::streamoff length = input.tellg();
    if (length < 0) {
        return reportReadFailure();
    }
    if (length == 0) {
        // Nothing to terminate; seeking to -1 from the end would fail and the
        // comparison below would read an indeterminate byte.
        return true;
    }

    char lastByte = '\0';
    input.seekg(-1, std::ios_base::end);
    if (!input.get(lastByte)) {
        return reportReadFailure();
    }
    input.close();

    if (lastByte == '\n') {
        return true;
    }

    syslog(LOG_CONS, "// REPORT: Data File not ended with a new line.\n");
    syslog(LOG_CONS, "// DATA FILE: %s", path);
    syslog(LOG_CONS, "// PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");

    // Append mode keeps the existing content; close() flushes the buffer, so
    // the fail() check afterwards covers open, write and flush errors alike.
    std::ofstream output(path, std::ios_base::app);
    output << '\n';
    output.close();
    if (output.fail()) {
        syslog(LOG_CONS, "// REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }
    return true;
} // END: EOF FIXER.
|
||||
|
||||
// CONTENT CONSOLIDATOR. CREDIT: Shiki Suen.
|
||||
namespace {

// True for the six ASCII characters that `\s` matches in the "C" locale.
inline bool IsAsciiWhitespace(char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
}

// Returns `raw` with every run of whitespace collapsed into one ASCII space
// and with leading/trailing whitespace removed. The full-width ideographic
// space (U+3000) counts as whitespace; it is matched through its explicit
// UTF-8 byte sequence so the pattern cannot be altered by editors or
// encoding normalization of the source file.
std::string NormalizeEntryWhitespace(const std::string &raw)
{
    static const char kIdeographicSpace[] = "\xE3\x80\x80"; // U+3000 in UTF-8.

    std::string normalized;
    normalized.reserve(raw.size());
    bool pendingSpace = false; // A separator is owed before the next visible byte.
    for (std::string::size_type i = 0; i < raw.size();) {
        std::string::size_type width = 0;
        if (raw.compare(i, 3, kIdeographicSpace) == 0) {
            width = 3;
        } else if (IsAsciiWhitespace(raw[i])) {
            width = 1;
        }
        if (width != 0) {
            // Leading whitespace is dropped because nothing has been emitted
            // yet; trailing whitespace is dropped because the owed separator
            // is never written once the input ends.
            pendingSpace = !normalized.empty();
            i += width;
            continue;
        }
        if (pendingSpace) {
            normalized.push_back(' ');
            pendingSpace = false;
        }
        normalized.push_back(raw[i]);
        ++i;
    }
    return normalized;
}

} // namespace

// Consolidates the user-editable data file at `path` in place.
//
// Each line has its whitespace normalized (full-width spaces and any run of
// whitespace become a single ASCII space, ends are trimmed); lines that end up
// empty are dropped; duplicate lines are removed; the result is written back
// with one '\n'-terminated entry per line.
//
// Parameters:
//   path       - path of the data file; a null pointer is rejected.
//   shouldsort - true: entries are sorted before duplicates are removed.
//                false: the original order is kept and only the first
//                occurrence of each entry survives.
// Returns:
//   true on success; false when the file could not be read (in which case it
//   is left untouched) or the consolidated content could not be written.
//
// NOTE(review): LOG_CONS is documented as an openlog() option rather than a
// syslog() priority; it is kept unchanged here so that log routing stays the
// same as in the rest of the project.
bool LMConsolidator::ConsolidateContent(const char *path, bool shouldsort) {
    if (path == nullptr) {
        return false;
    }

    // Shared failure report for the reading phase.
    const auto reportReadFailure = [path]() {
        syslog(LOG_CONS, "// REPORT: Failed to read lines through the data file for content-consolidation. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    };

    std::ifstream input(path);
    if (!input.is_open()) {
        // Bail out before the file is reopened for writing, which truncates it.
        return reportReadFailure();
    }

    // First pass: read line by line and normalize each line.
    // Looping on getline() itself (not on !eof()) terminates on any stream
    // failure, including a stream that never opened.
    std::vector<std::string> entries;
    std::string line;
    while (std::getline(input, line)) {
        const std::string entry = NormalizeEntryWhitespace(line);
        if (!entry.empty()) { // Blank lines carry no entry and are never written back.
            entries.push_back(entry);
        }
    }
    // Reaching EOF sets failbit as a matter of course; only badbit signals a
    // genuine I/O error.
    if (input.bad()) {
        return reportReadFailure();
    }
    input.close();

    // Remove duplicate entries, sorting first only when requested.
    if (shouldsort) {
        std::sort(entries.begin(), entries.end());
        entries.erase(std::unique(entries.begin(), entries.end()), entries.end());
    } else {
        // std::unique only removes adjacent repeats, so track what has been
        // seen in order to drop duplicates without disturbing the user's order.
        std::set<std::string> seen;
        std::vector<std::string> firstOccurrences;
        firstOccurrences.reserve(entries.size());
        for (const std::string &entry : entries) {
            if (seen.insert(entry).second) {
                firstOccurrences.push_back(entry);
            }
        }
        entries.swap(firstOccurrences);
    }

    // Second pass: rewrite the file from scratch (hence no ios_base::app).
    std::ofstream output(path);
    for (const std::string &entry : entries) {
        output << entry << '\n'; // One entry per line; close() flushes once at the end.
    }
    output.close();
    if (output.fail()) {
        syslog(LOG_CONS, "// REPORT: Failed to write content-consolidated data to the file. Insufficient Privileges?\n");
        syslog(LOG_CONS, "// DATA FILE: %s", path);
        return false;
    }
    return true;
} // END: CONTENT CONSOLIDATOR.
|
||||
|
||||
} // namespace vChewing
|
|
@ -14,6 +14,7 @@
|
|||
#include <unistd.h>
|
||||
#include "KeyValueBlobReader.h"
|
||||
#include "PhraseReplacementMap.h"
|
||||
#include "LMConsolidator.h"
|
||||
|
||||
namespace vChewing {
|
||||
|
||||
|
@ -38,22 +39,9 @@ bool PhraseReplacementMap::open(const char *path)
|
|||
if (data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::fstream zfd(path);
|
||||
zfd.seekg(-1,std::ios_base::end);
|
||||
char z;
|
||||
zfd.get(z);
|
||||
if(z!='\n'){
|
||||
syslog(LOG_CONS, "REPORT: Phrase Replacement Map File is not ended with a new line.\n");
|
||||
syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");
|
||||
std::ofstream zfdo(path, std::ios_base::app);
|
||||
zfdo << std::endl;
|
||||
zfdo.close();
|
||||
if (zfdo.fail()) {
|
||||
syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
LMConsolidator::FixEOF(path);
|
||||
LMConsolidator::ConsolidateContent(path, false);
|
||||
|
||||
fd = ::open(path, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <syslog.h>
|
||||
|
||||
#include "LMConsolidator.h"
|
||||
#include "KeyValueBlobReader.h"
|
||||
|
||||
namespace vChewing {
|
||||
|
@ -38,21 +38,8 @@ bool UserPhrasesLM::open(const char *path)
|
|||
return false;
|
||||
}
|
||||
|
||||
std::fstream zfd(path);
|
||||
zfd.seekg(-1,std::ios_base::end);
|
||||
char z;
|
||||
zfd.get(z);
|
||||
if(z!='\n'){
|
||||
syslog(LOG_CONS, "REPORT: User Phrase Data File is not ended with a new line.\n");
|
||||
syslog(LOG_CONS, "PROCEDURE: Trying to insert a new line as EOF before per-line check process.\n");
|
||||
std::ofstream zfdo(path, std::ios_base::app);
|
||||
zfdo << std::endl;
|
||||
zfdo.close();
|
||||
if (zfdo.fail()) {
|
||||
syslog(LOG_CONS, "REPORT: Failed to append a newline to the data file. Insufficient Privileges?\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
LMConsolidator::FixEOF(path);
|
||||
LMConsolidator::ConsolidateContent(path, false);
|
||||
|
||||
fd = ::open(path, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
|
||||
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */; };
|
||||
5B6797B52794822C004AC7CE /* PhraseReplacementMap.h in Sources */ = {isa = PBXBuildFile; fileRef = 5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */; };
|
||||
5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */; };
|
||||
5BC2D2882793B434002C0BEC /* KeyValueBlobReader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D2862793B434002C0BEC /* KeyValueBlobReader.cpp */; };
|
||||
5BC2D28B2793B8FB002C0BEC /* EmacsKeyHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28A2793B8FB002C0BEC /* EmacsKeyHelper.swift */; };
|
||||
5BC2D28D2793B98F002C0BEC /* PreferencesModule.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC2D28C2793B98F002C0BEC /* PreferencesModule.swift */; };
|
||||
|
@ -118,6 +119,8 @@
|
|||
5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = UserPhrasesLM.mm; sourceTree = "<group>"; };
|
||||
5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PhraseReplacementMap.h; sourceTree = "<group>"; };
|
||||
5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = PhraseReplacementMap.mm; sourceTree = "<group>"; };
|
||||
5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LMConsolidator.mm; sourceTree = "<group>"; };
|
||||
5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LMConsolidator.h; sourceTree = "<group>"; };
|
||||
5B9781D32763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
|
||||
5B9781D52763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
|
||||
5B9781D72763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
|
||||
|
@ -261,6 +264,8 @@
|
|||
5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */,
|
||||
5B6797B42794822C004AC7CE /* PhraseReplacementMap.mm */,
|
||||
5B6797B32794822C004AC7CE /* PhraseReplacementMap.h */,
|
||||
5B810D9D27A3A5E50032C1A9 /* LMConsolidator.mm */,
|
||||
5B810D9E27A3A5E50032C1A9 /* LMConsolidator.h */,
|
||||
);
|
||||
path = LanguageModel;
|
||||
sourceTree = "<group>";
|
||||
|
@ -734,6 +739,7 @@
|
|||
5B217128279BB22700F91A2B /* frmAboutWindow.swift in Sources */,
|
||||
5BD13F482794F0A6000E429F /* PhraseReplacementMap.mm in Sources */,
|
||||
5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */,
|
||||
5B810D9F27A3A5E50032C1A9 /* LMConsolidator.mm in Sources */,
|
||||
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */,
|
||||
5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */,
|
||||
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */,
|
||||
|
|
Loading…
Reference in New Issue