From 78c90cadeab6250e3f676e523695ef8c4c0e98f6 Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Mon, 24 Jan 2022 10:30:49 +0800 Subject: [PATCH] CNS // Phase 2: + CNSLM (with Debug Messaging System). --- Source/Engine/LanguageModel/CNSLM.cpp | 131 +++++++++++++++++++++ Source/Engine/LanguageModel/CNSLM.h | 48 ++++++++ Source/Engine/LanguageModel/vChewingLM.cpp | 8 ++ Source/Engine/LanguageModel/vChewingLM.h | 6 +- Source/LanguageModelManager.h | 3 +- Source/LanguageModelManager.mm | 22 ++-- vChewing.xcodeproj/project.pbxproj | 6 + 7 files changed, 214 insertions(+), 10 deletions(-) create mode 100644 Source/Engine/LanguageModel/CNSLM.cpp create mode 100644 Source/Engine/LanguageModel/CNSLM.h diff --git a/Source/Engine/LanguageModel/CNSLM.cpp b/Source/Engine/LanguageModel/CNSLM.cpp new file mode 100644 index 00000000..6c184821 --- /dev/null +++ b/Source/Engine/LanguageModel/CNSLM.cpp @@ -0,0 +1,131 @@ +/* + * CNSLM.cpp + * + * Copyright 2021-2022 vChewing Project (3-Clause BSD License). + * Derived from 2011-2022 OpenVanilla Project (MIT License). + * Some rights reserved. See "LICENSE.TXT" for details. + */ + +#include "CNSLM.h" + +#include +#include +#include +#include +#include +#include + +#include "KeyValueBlobReader.h" + +namespace vChewing { + +CNSLM::CNSLM() + : fd(-1) + , data(0) + , length(0) +{ +} + +CNSLM::~CNSLM() +{ + if (data) { + close(); + } +} + +bool CNSLM::open(const char *path) +{ + if (data) { + syslog(LOG_CONS, "CNSLM: Failed at Open Step 1.\n"); + return false; + } + + fd = ::open(path, O_RDONLY); + if (fd == -1) { + syslog(LOG_CONS, "CNSLM: Failed at Open Step 2.\n"); + printf("open:: file not exist"); + return false; + } + + struct stat sb; + if (fstat(fd, &sb) == -1) { + syslog(LOG_CONS, "CNSLM: Failed at Open Step 3.\n"); + printf("open:: cannot open file"); + return false; + } + + length = (size_t)sb.st_size; + + data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + if (!data) { + ::close(fd); + syslog(LOG_CONS, "CNSLM: Failed at Open Step 4.\n"); + return false; + } + + KeyValueBlobReader reader(static_cast(data), length); + KeyValueBlobReader::KeyValue keyValue; + KeyValueBlobReader::State state; + while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) { + // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading. + keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key); + } + // 下面這一段或許可以做成開關、來詢問是否對使用者語彙採取寬鬆策略(哪怕有行內容寫錯也會放行) + if (state == KeyValueBlobReader::State::ERROR) { + // close(); + syslog(LOG_CONS, "CNSLM: Failed at Open Step 5. On Error Resume Next.\n"); + // return false; + } + return true; +} + +void CNSLM::close() +{ + if (data) { + munmap(data, length); + ::close(fd); + data = 0; + } + + keyRowMap.clear(); +} + +void CNSLM::dump() +{ + for (const auto& entry : keyRowMap) { + const std::vector& rows = entry.second; + for (const auto& row : rows) { + std::cerr << row.key << " " << row.value << "\n"; + } + } +} + +const std::vector CNSLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key) +{ + return std::vector(); +} + +const std::vector CNSLM::unigramsForKey(const std::string& key) +{ + std::vector v; + auto iter = keyRowMap.find(key); + if (iter != keyRowMap.end()) { + const std::vector& rows = iter->second; + for (const auto& row : rows) { + Taiyan::Gramambular::Unigram g; + g.keyValue.key = row.key; + g.keyValue.value = row.value; + g.score = -17.0; + v.push_back(g); + } + } + + return v; +} + +bool CNSLM::hasUnigramsForKey(const std::string& key) +{ + return keyRowMap.find(key) != keyRowMap.end(); +} + +}; // namespace vChewing diff --git a/Source/Engine/LanguageModel/CNSLM.h b/Source/Engine/LanguageModel/CNSLM.h new file mode 100644 index 00000000..fd2d199b --- /dev/null +++ b/Source/Engine/LanguageModel/CNSLM.h @@ -0,0 +1,48 @@ +/* + * CNSLM.h + * + * Copyright 2021-2022 vChewing Project (3-Clause BSD License). + * Derived from 2011-2022 OpenVanilla Project (MIT License). + * Some rights reserved. See "LICENSE.TXT" for details. + */ + +#ifndef CNSLM_H +#define CNSLM_H + +#include +#include +#include +#include "LanguageModel.h" + +namespace vChewing { + +class CNSLM : public Taiyan::Gramambular::LanguageModel +{ +public: + CNSLM(); + ~CNSLM(); + + bool open(const char *path); + void close(); + void dump(); + + virtual const std::vector bigramsForKeys(const std::string& preceedingKey, const std::string& key); + virtual const std::vector unigramsForKey(const std::string& key); + virtual bool hasUnigramsForKey(const std::string& key); + +protected: + struct Row { + Row(std::string_view& k, std::string_view& v) : key(k), value(v) {} + std::string_view key; + std::string_view value; + }; + + std::map> keyRowMap; + int fd; + void *data; + size_t length; +}; + +} + +#endif diff --git a/Source/Engine/LanguageModel/vChewingLM.cpp b/Source/Engine/LanguageModel/vChewingLM.cpp index 8fd5b82a..d6c77be3 100644 --- a/Source/Engine/LanguageModel/vChewingLM.cpp +++ b/Source/Engine/LanguageModel/vChewingLM.cpp @@ -32,6 +32,14 @@ void vChewingLM::loadLanguageModel(const char* languageModelDataPath) } } +void vChewingLM::loadCNSData(const char* cnsDataPath) +{ + if (cnsDataPath) { + m_cnsData.close(); + m_cnsData.open(cnsDataPath); + } +} + void vChewingLM::loadUserPhrases(const char* userPhrasesDataPath, const char* excludedPhrasesDataPath) { diff --git a/Source/Engine/LanguageModel/vChewingLM.h b/Source/Engine/LanguageModel/vChewingLM.h index ce339db5..06feb42b 100644 --- a/Source/Engine/LanguageModel/vChewingLM.h +++ b/Source/Engine/LanguageModel/vChewingLM.h @@ -10,8 +10,9 @@ #define VCHEWINGLM_H #include -#include "UserPhrasesLM.h" #include "FastLM.h" +#include "CNSLM.h" +#include "UserPhrasesLM.h" #include "PhraseReplacementMap.h" #include @@ -25,7 +26,9 @@ public: ~vChewingLM(); void loadLanguageModel(const char* languageModelPath); + void loadCNSData(const char* cnsDataPath); void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath); + void loadPhraseReplacementMap(const char* phraseReplacementPath); const vector bigramsForKeys(const string& preceedingKey, const string& key); @@ -41,6 +44,7 @@ protected: std::unordered_set& insertedValues); FastLM m_languageModel; + CNSLM m_cnsData; UserPhrasesLM m_userPhrases; UserPhrasesLM m_excludedPhrases; PhraseReplacementMap m_phraseReplacement; diff --git a/Source/LanguageModelManager.h b/Source/LanguageModelManager.h index f186fef9..78b541d1 100644 --- a/Source/LanguageModelManager.h +++ b/Source/LanguageModelManager.h @@ -18,6 +18,7 @@ NS_ASSUME_NONNULL_BEGIN + (void)loadDataModels; + (void)deployZipDataFile:(NSString *)filenameWithoutExtension; ++ (void)loadCNSData; + (void)loadUserPhrases; + (void)loadUserPhraseReplacement; + (BOOL)checkIfUserLanguageModelFilesExist; @@ -25,7 +26,7 @@ NS_ASSUME_NONNULL_BEGIN + (NSString *)userPhrasesDataPath:(NSString *)inputMode; + (NSString *)excludedPhrasesDataPath:(NSString *)inputMode; + (NSString *)phraseReplacementDataPath:(NSString *)inputMode; -+ (NSString *)cnsDataPath:(NSString *)inputMode; ++ (NSString *)cnsDataPath; @property (class, readonly, nonatomic) NSString *dataFolderPath; @property (class, readonly, nonatomic) vChewing::vChewingLM *languageModelCoreCHT; diff --git a/Source/LanguageModelManager.mm b/Source/LanguageModelManager.mm index 751ecf34..2684e352 100644 --- a/Source/LanguageModelManager.mm +++ b/Source/LanguageModelManager.mm @@ -32,13 +32,6 @@ static NSString *const kBopomofoModeIdentifierCHS = @"org.atelierInmu.inputmetho @implementation LanguageModelManager -static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm) -{ - Class cls = NSClassFromString(@"vChewingInputMethodController"); - NSString *dataPath = [[NSBundle bundleForClass:cls] pathForResource:filenameWithoutExtension ofType:@"txt"]; - lm.loadLanguageModel([dataPath UTF8String]); -} - + (void)deployZipDataFile:(NSString *)filenameWithoutExtension { Class cls = NSClassFromString(@"vChewingInputMethodController"); @@ -47,12 +40,25 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing [SSZipArchive unzipFileAtPath:zipPath toDestination:destinationPath]; } +static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm) +{ + Class cls = NSClassFromString(@"vChewingInputMethodController"); + NSString *dataPath = [[NSBundle bundleForClass:cls] pathForResource:filenameWithoutExtension ofType:@"txt"]; + lm.loadLanguageModel([dataPath UTF8String]); +} + + (void)loadDataModels { LTLoadLanguageModelFile(@"data-cht", glanguageModelCoreCHT); LTLoadLanguageModelFile(@"data-chs", glanguageModelCoreCHS); } ++ (void)loadCNSData +{ + glanguageModelCoreCHT.loadCNSData([[self cnsDataPath] UTF8String]); + glanguageModelCoreCHS.loadCNSData([[self cnsDataPath] UTF8String]); +} + + (void)loadUserPhrases { glanguageModelCoreCHT.loadUserPhrases([[self userPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String], [[self excludedPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String]); @@ -201,7 +207,7 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing return [[self dataFolderPath] stringByAppendingPathComponent:fileName]; } -+ (NSString *)cnsDataPath:(NSString *)inputMode ++ (NSString *)cnsDataPath { return [[self dataFolderPath] stringByAppendingPathComponent:@"UNICHARS.csv"]; } diff --git a/vChewing.xcodeproj/project.pbxproj b/vChewing.xcodeproj/project.pbxproj index 7b5a47a9..4b401bef 100644 --- a/vChewing.xcodeproj/project.pbxproj +++ b/vChewing.xcodeproj/project.pbxproj @@ -40,6 +40,7 @@ 5BDD25F8279D6D1200AA18F8 /* zip.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E7279D64FB00AA18F8 /* zip.m */; }; 5BDD25F9279D6D1200AA18F8 /* ioapi.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E8279D64FB00AA18F8 /* ioapi.m */; }; 5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E9279D64FB00AA18F8 /* mztools.m */; }; + 5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */; }; 5BDF2CFE2791BE4400838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; }; 5BDF2CFF2791BECC00838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; }; 5BDF2D012791C03B00838ADB /* PreferencesWindowController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */; }; @@ -158,6 +159,8 @@ 5BDD25F0279D64FB00AA18F8 /* SSZipArchive.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SSZipArchive.m; sourceTree = ""; }; 5BDD25F1279D65CB00AA18F8 /* UNICHARS.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; name = UNICHARS.zip; path = Data/components/common/UNICHARS.zip; sourceTree = ""; }; 5BDD25F3279D677F00AA18F8 /* libz.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libz.tbd; path = usr/lib/libz.tbd; sourceTree = SDKROOT; }; + 5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CNSLM.cpp; sourceTree = ""; }; + 5BDD25FC279D6D6300AA18F8 /* CNSLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CNSLM.h; sourceTree = ""; }; 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputSourceHelper.swift; sourceTree = ""; }; 5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreferencesWindowController.swift; sourceTree = ""; }; 5BDF2D022791C71200838ADB /* NonModalAlertWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NonModalAlertWindowController.swift; sourceTree = ""; }; @@ -276,6 +279,8 @@ 5BA8DAFE27928120009C9FFF /* LanguageModel */ = { isa = PBXGroup; children = ( + 5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */, + 5BDD25FC279D6D6300AA18F8 /* CNSLM.h */, 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */, 5B5F4F8C27928F9300922DC2 /* vChewingLM.h */, 6A0421A615FEF3F50061ED63 /* FastLM.cpp */, @@ -792,6 +797,7 @@ 5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */, 5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */, 5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */, + 5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; };