CNS // Phase 2: + CNSLM (with Debug Messaging System).

This commit is contained in:
ShikiSuen 2022-01-24 10:30:49 +08:00
parent 52140dcfde
commit 78c90cadea
7 changed files with 214 additions and 10 deletions

View File

@ -0,0 +1,131 @@
/*
* CNSLM.cpp
*
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
* Derived from 2011-2022 OpenVanilla Project (MIT License).
* Some rights reserved. See "LICENSE.TXT" for details.
*/
#include "CNSLM.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <unistd.h>
#include <syslog.h>
#include "KeyValueBlobReader.h"
namespace vChewing {
// Constructs a model with nothing attached: invalid descriptor, no mapping,
// zero mapped length.
CNSLM::CNSLM()
    : fd(-1),
      data(nullptr),
      length(0)
{
}
// Releases the mapping and descriptor through close() when a file is still
// attached; does nothing otherwise.
CNSLM::~CNSLM()
{
    if (data == nullptr) {
        return;
    }
    close();
}
// Maps the file at `path` read-only and indexes its key-value pairs into
// keyRowMap.
//
// Returns false when a file is already attached, or when the file cannot be
// opened, stat'ed or mapped. On every one of those failure paths the
// descriptor is closed again and `data` stays null, so the object remains in
// the "nothing attached" state.
//
// Returns true once the file is mapped, even if the reader stopped on a
// malformed line: the pairs read before the error are kept.
//
// NOTE(review): LOG_CONS is documented as an openlog() option flag, not a
// syslog() priority; the calls are left unchanged here - confirm the intended
// log level.
bool CNSLM::open(const char *path)
{
    // Refuse to re-open; the caller has to close() first.
    if (data) {
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 1.\n");
        return false;
    }

    fd = ::open(path, O_RDONLY);
    if (fd == -1) {
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 2.\n");
        printf("open:: file not exist");
        return false;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 3.\n");
        printf("open:: cannot open file");
        // The descriptor was opened successfully above; do not leak it.
        ::close(fd);
        fd = -1;
        return false;
    }

    length = (size_t)sb.st_size;

    // mmap() signals failure with MAP_FAILED, not with a null pointer, so the
    // result is checked before it is published through `data`.
    void *mapping = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED) {
        ::close(fd);
        fd = -1;
        length = 0;
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 4.\n");
        return false;
    }
    data = mapping;

    KeyValueBlobReader reader(static_cast<char*>(data), length);
    KeyValueBlobReader::KeyValue keyValue;
    KeyValueBlobReader::State state;
    while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
        // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
        keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key);
    }

    // The section below could perhaps be made into a switch that asks whether
    // a lenient policy should apply to user phrase data (letting the file
    // through even when some lines are malformed).
    if (state == KeyValueBlobReader::State::ERROR) {
        // close();
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 5. On Error Resume Next.\n");
        // return false;
    }
    return true;
}
// Detaches the current file, if any: unmaps the region, closes the descriptor
// and marks the model as having no mapping. The index is emptied in every
// case, whether or not a file was attached.
void CNSLM::close()
{
    void *const mapped = data;
    if (mapped != nullptr) {
        munmap(mapped, length);
        ::close(fd);
        data = nullptr;
    }

    keyRowMap.clear();
}
void CNSLM::dump()
{
for (const auto& entry : keyRowMap) {
const std::vector<Row>& rows = entry.second;
for (const auto& row : rows) {
std::cerr << row.key << " " << row.value << "\n";
}
}
}
// This model provides no bigrams: the result is always an empty vector,
// whatever keys are passed in.
const std::vector<Taiyan::Gramambular::Bigram> CNSLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
{
    std::vector<Taiyan::Gramambular::Bigram> none;
    return none;
}
const std::vector<Taiyan::Gramambular::Unigram> CNSLM::unigramsForKey(const std::string& key)
{
std::vector<Taiyan::Gramambular::Unigram> v;
auto iter = keyRowMap.find(key);
if (iter != keyRowMap.end()) {
const std::vector<Row>& rows = iter->second;
for (const auto& row : rows) {
Taiyan::Gramambular::Unigram g;
g.keyValue.key = row.key;
g.keyValue.value = row.value;
g.score = -17.0;
v.push_back(g);
}
}
return v;
}
// Reports whether the index holds an entry for `key`.
bool CNSLM::hasUnigramsForKey(const std::string& key)
{
    return keyRowMap.count(key) != 0;
}
}; // namespace vChewing

View File

@ -0,0 +1,48 @@
/*
* CNSLM.h
*
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
* Derived from 2011-2022 OpenVanilla Project (MIT License).
* Some rights reserved. See "LICENSE.TXT" for details.
*/
#ifndef CNSLM_H
#define CNSLM_H
#include <iostream>
#include <map>
#include <string>
#include <string_view>
#include <vector>
#include "LanguageModel.h"
namespace vChewing {
// Language model backed by a read-only, memory-mapped key-value text file.
//
// open() maps the file and fills keyRowMap from it; close() unmaps the file
// and empties keyRowMap. The model serves unigrams only: bigramsForKeys()
// always returns an empty vector.
//
// NOTE(review): the string_views held in keyRowMap presumably point into the
// mapped region (the reader is constructed directly over it), so they should
// not be used after close() - confirm against KeyValueBlobReader.
class CNSLM : public Taiyan::Gramambular::LanguageModel
{
public:
    CNSLM();
    ~CNSLM();

    // Maps the file at `path` and indexes its pairs. Returns false when a file
    // is already open or the file cannot be opened or mapped.
    bool open(const char *path);

    // Unmaps the current file (if any) and clears the index.
    void close();

    // Prints every indexed row to std::cerr.
    void dump();

    virtual const std::vector<Taiyan::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
    virtual const std::vector<Taiyan::Gramambular::Unigram> unigramsForKey(const std::string& key);
    virtual bool hasUnigramsForKey(const std::string& key);

protected:
    // One indexed entry: a (key, value) pair of views.
    struct Row {
        // string_view is a cheap, non-owning handle, so it is taken by value;
        // this accepts temporaries as well as lvalues.
        Row(std::string_view k, std::string_view v) : key(k), value(v) {}
        std::string_view key;
        std::string_view value;
    };

    // Index built by open(): lookup key -> all rows stored under that key.
    std::map<std::string_view, std::vector<Row>> keyRowMap;

    int fd;        // descriptor of the mapped file; initialised to -1
    void *data;    // start of the mapping; null while nothing is mapped
    size_t length; // size in bytes passed to mmap()/munmap()
};
}
#endif

View File

@ -32,6 +32,14 @@ void vChewingLM::loadLanguageModel(const char* languageModelDataPath)
}
}
// Replaces the currently loaded CNS data with the file at `cnsDataPath`.
// A null path leaves the current data untouched. The result of open() is not
// inspected here.
void vChewingLM::loadCNSData(const char* cnsDataPath)
{
    if (!cnsDataPath) {
        return;
    }

    m_cnsData.close();
    m_cnsData.open(cnsDataPath);
}
void vChewingLM::loadUserPhrases(const char* userPhrasesDataPath,
const char* excludedPhrasesDataPath)
{

View File

@ -10,8 +10,9 @@
#define VCHEWINGLM_H
#include <stdio.h>
#include "UserPhrasesLM.h"
#include "FastLM.h"
#include "CNSLM.h"
#include "UserPhrasesLM.h"
#include "PhraseReplacementMap.h"
#include <unordered_set>
@ -25,7 +26,9 @@ public:
~vChewingLM();
void loadLanguageModel(const char* languageModelPath);
void loadCNSData(const char* cnsDataPath);
void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath);
void loadPhraseReplacementMap(const char* phraseReplacementPath);
const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
@ -41,6 +44,7 @@ protected:
std::unordered_set<string>& insertedValues);
FastLM m_languageModel;
CNSLM m_cnsData;
UserPhrasesLM m_userPhrases;
UserPhrasesLM m_excludedPhrases;
PhraseReplacementMap m_phraseReplacement;

View File

@ -18,6 +18,7 @@ NS_ASSUME_NONNULL_BEGIN
+ (void)loadDataModels;
+ (void)deployZipDataFile:(NSString *)filenameWithoutExtension;
+ (void)loadCNSData;
+ (void)loadUserPhrases;
+ (void)loadUserPhraseReplacement;
+ (BOOL)checkIfUserLanguageModelFilesExist;
@ -25,7 +26,7 @@ NS_ASSUME_NONNULL_BEGIN
+ (NSString *)userPhrasesDataPath:(NSString *)inputMode;
+ (NSString *)excludedPhrasesDataPath:(NSString *)inputMode;
+ (NSString *)phraseReplacementDataPath:(NSString *)inputMode;
+ (NSString *)cnsDataPath:(NSString *)inputMode;
+ (NSString *)cnsDataPath;
@property (class, readonly, nonatomic) NSString *dataFolderPath;
@property (class, readonly, nonatomic) vChewing::vChewingLM *languageModelCoreCHT;

View File

@ -32,13 +32,6 @@ static NSString *const kBopomofoModeIdentifierCHS = @"org.atelierInmu.inputmetho
@implementation LanguageModelManager
// Resolves "<filenameWithoutExtension>.txt" inside the bundle that contains
// the vChewingInputMethodController class and hands its path to `lm`.
static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm)
{
    NSBundle *bundle = [NSBundle bundleForClass:NSClassFromString(@"vChewingInputMethodController")];
    // Keep the NSString in a named local so the UTF8String buffer stays valid
    // for the duration of the call.
    NSString *resourcePath = [bundle pathForResource:filenameWithoutExtension ofType:@"txt"];
    lm.loadLanguageModel(resourcePath.UTF8String);
}
+ (void)deployZipDataFile:(NSString *)filenameWithoutExtension
{
Class cls = NSClassFromString(@"vChewingInputMethodController");
@ -47,12 +40,25 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing
[SSZipArchive unzipFileAtPath:zipPath toDestination:destinationPath];
}
// Resolves "<filenameWithoutExtension>.txt" inside the bundle that contains
// the vChewingInputMethodController class and hands its path to `lm`.
static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm)
{
    NSBundle *bundle = [NSBundle bundleForClass:NSClassFromString(@"vChewingInputMethodController")];
    // Keep the NSString in a named local so the UTF8String buffer stays valid
    // for the duration of the call.
    NSString *resourcePath = [bundle pathForResource:filenameWithoutExtension ofType:@"txt"];
    lm.loadLanguageModel(resourcePath.UTF8String);
}
// Loads the bundled core data file into each of the two core language models:
// "data-cht" into the CHT model, then "data-chs" into the CHS model.
+ (void)loadDataModels
{
    NSString *const chtDataName = @"data-cht";
    NSString *const chsDataName = @"data-chs";

    LTLoadLanguageModelFile(chtDataName, glanguageModelCoreCHT);
    LTLoadLanguageModelFile(chsDataName, glanguageModelCoreCHS);
}
// Loads the CNS data file into both core language models, CHT first and then
// CHS. The path is looked up separately for each model.
+ (void)loadCNSData
{
    // Each path is held in a named NSString so its UTF8String buffer stays
    // valid while the model reads it.
    NSString *pathForCHT = [self cnsDataPath];
    glanguageModelCoreCHT.loadCNSData(pathForCHT.UTF8String);

    NSString *pathForCHS = [self cnsDataPath];
    glanguageModelCoreCHS.loadCNSData(pathForCHS.UTF8String);
}
+ (void)loadUserPhrases
{
glanguageModelCoreCHT.loadUserPhrases([[self userPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String], [[self excludedPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String]);
@ -201,7 +207,7 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing
return [[self dataFolderPath] stringByAppendingPathComponent:fileName];
}
+ (NSString *)cnsDataPath:(NSString *)inputMode
// Returns the path of "UNICHARS.csv" inside the data folder.
+ (NSString *)cnsDataPath
{
    NSString *folder = [self dataFolderPath];
    return [folder stringByAppendingPathComponent:@"UNICHARS.csv"];
}

View File

@ -40,6 +40,7 @@
5BDD25F8279D6D1200AA18F8 /* zip.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E7279D64FB00AA18F8 /* zip.m */; };
5BDD25F9279D6D1200AA18F8 /* ioapi.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E8279D64FB00AA18F8 /* ioapi.m */; };
5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E9279D64FB00AA18F8 /* mztools.m */; };
5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */; };
5BDF2CFE2791BE4400838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; };
5BDF2CFF2791BECC00838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; };
5BDF2D012791C03B00838ADB /* PreferencesWindowController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */; };
@ -158,6 +159,8 @@
5BDD25F0279D64FB00AA18F8 /* SSZipArchive.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SSZipArchive.m; sourceTree = "<group>"; };
5BDD25F1279D65CB00AA18F8 /* UNICHARS.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; name = UNICHARS.zip; path = Data/components/common/UNICHARS.zip; sourceTree = "<group>"; };
5BDD25F3279D677F00AA18F8 /* libz.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libz.tbd; path = usr/lib/libz.tbd; sourceTree = SDKROOT; };
5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CNSLM.cpp; sourceTree = "<group>"; };
5BDD25FC279D6D6300AA18F8 /* CNSLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CNSLM.h; sourceTree = "<group>"; };
5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputSourceHelper.swift; sourceTree = "<group>"; };
5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreferencesWindowController.swift; sourceTree = "<group>"; };
5BDF2D022791C71200838ADB /* NonModalAlertWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NonModalAlertWindowController.swift; sourceTree = "<group>"; };
@ -276,6 +279,8 @@
5BA8DAFE27928120009C9FFF /* LanguageModel */ = {
isa = PBXGroup;
children = (
5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */,
5BDD25FC279D6D6300AA18F8 /* CNSLM.h */,
5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */,
5B5F4F8C27928F9300922DC2 /* vChewingLM.h */,
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
@ -792,6 +797,7 @@
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */,
5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */,
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */,
5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};