CNS // Phase 2: + CNSLM (with Debug Messaging System).
This commit is contained in:
parent
52140dcfde
commit
78c90cadea
|
@ -0,0 +1,131 @@
|
|||
/*
|
||||
* CNSLM.cpp
|
||||
*
|
||||
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
|
||||
* Derived from 2011-2022 OpenVanilla Project (MIT License).
|
||||
* Some rights reserved. See "LICENSE.TXT" for details.
|
||||
*/
|
||||
|
||||
#include "CNSLM.h"
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <syslog.h>
|
||||
|
||||
#include "KeyValueBlobReader.h"
|
||||
|
||||
namespace vChewing {
|
||||
|
||||
/// Builds an empty model: no descriptor (-1), no mapping, zero length.
CNSLM::CNSLM()
    : fd(-1), data(nullptr), length(0)
{
}
|
||||
|
||||
/// Releases the loaded data, if any, by delegating to close().
CNSLM::~CNSLM()
{
    // Nothing was ever loaded (or it was already closed): nothing to release.
    if (!data) {
        return;
    }
    close();
}
|
||||
|
||||
bool CNSLM::open(const char *path)
|
||||
{
|
||||
if (data) {
|
||||
syslog(LOG_CONS, "CNSLM: Failed at Open Step 1.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
fd = ::open(path, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
syslog(LOG_CONS, "CNSLM: Failed at Open Step 2.\n");
|
||||
printf("open:: file not exist");
|
||||
return false;
|
||||
}
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) == -1) {
|
||||
syslog(LOG_CONS, "CNSLM: Failed at Open Step 3.\n");
|
||||
printf("open:: cannot open file");
|
||||
return false;
|
||||
}
|
||||
|
||||
length = (size_t)sb.st_size;
|
||||
|
||||
data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
|
||||
if (!data) {
|
||||
::close(fd);
|
||||
syslog(LOG_CONS, "CNSLM: Failed at Open Step 4.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
KeyValueBlobReader reader(static_cast<char*>(data), length);
|
||||
KeyValueBlobReader::KeyValue keyValue;
|
||||
KeyValueBlobReader::State state;
|
||||
while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
|
||||
// We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
|
||||
keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key);
|
||||
}
|
||||
// 下面這一段或許可以做成開關、來詢問是否對使用者語彙採取寬鬆策略(哪怕有行內容寫錯也會放行)
|
||||
if (state == KeyValueBlobReader::State::ERROR) {
|
||||
// close();
|
||||
syslog(LOG_CONS, "CNSLM: Failed at Open Step 5. On Error Resume Next.\n");
|
||||
// return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void CNSLM::close()
|
||||
{
|
||||
if (data) {
|
||||
munmap(data, length);
|
||||
::close(fd);
|
||||
data = 0;
|
||||
}
|
||||
|
||||
keyRowMap.clear();
|
||||
}
|
||||
|
||||
void CNSLM::dump()
|
||||
{
|
||||
for (const auto& entry : keyRowMap) {
|
||||
const std::vector<Row>& rows = entry.second;
|
||||
for (const auto& row : rows) {
|
||||
std::cerr << row.key << " " << row.value << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This model provides no bigrams: the result is always an empty vector,
/// regardless of the two keys passed in.
const std::vector<Taiyan::Gramambular::Bigram> CNSLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
{
    return {};
}
|
||||
|
||||
const std::vector<Taiyan::Gramambular::Unigram> CNSLM::unigramsForKey(const std::string& key)
|
||||
{
|
||||
std::vector<Taiyan::Gramambular::Unigram> v;
|
||||
auto iter = keyRowMap.find(key);
|
||||
if (iter != keyRowMap.end()) {
|
||||
const std::vector<Row>& rows = iter->second;
|
||||
for (const auto& row : rows) {
|
||||
Taiyan::Gramambular::Unigram g;
|
||||
g.keyValue.key = row.key;
|
||||
g.keyValue.value = row.value;
|
||||
g.score = -17.0;
|
||||
v.push_back(g);
|
||||
}
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
bool CNSLM::hasUnigramsForKey(const std::string& key)
|
||||
{
|
||||
return keyRowMap.find(key) != keyRowMap.end();
|
||||
}
|
||||
|
||||
}; // namespace vChewing
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* CNSLM.h
|
||||
*
|
||||
* Copyright 2021-2022 vChewing Project (3-Clause BSD License).
|
||||
* Derived from 2011-2022 OpenVanilla Project (MIT License).
|
||||
* Some rights reserved. See "LICENSE.TXT" for details.
|
||||
*/
|
||||
|
||||
#ifndef CNSLM_H
|
||||
#define CNSLM_H
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include "LanguageModel.h"
|
||||
|
||||
namespace vChewing {
|
||||
|
||||
class CNSLM : public Taiyan::Gramambular::LanguageModel
|
||||
{
|
||||
public:
|
||||
CNSLM();
|
||||
~CNSLM();
|
||||
|
||||
bool open(const char *path);
|
||||
void close();
|
||||
void dump();
|
||||
|
||||
virtual const std::vector<Taiyan::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
|
||||
virtual const std::vector<Taiyan::Gramambular::Unigram> unigramsForKey(const std::string& key);
|
||||
virtual bool hasUnigramsForKey(const std::string& key);
|
||||
|
||||
protected:
|
||||
struct Row {
|
||||
Row(std::string_view& k, std::string_view& v) : key(k), value(v) {}
|
||||
std::string_view key;
|
||||
std::string_view value;
|
||||
};
|
||||
|
||||
std::map<std::string_view, std::vector<Row>> keyRowMap;
|
||||
int fd;
|
||||
void *data;
|
||||
size_t length;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@ -32,6 +32,14 @@ void vChewingLM::loadLanguageModel(const char* languageModelDataPath)
|
|||
}
|
||||
}
|
||||
|
||||
void vChewingLM::loadCNSData(const char* cnsDataPath)
|
||||
{
|
||||
if (cnsDataPath) {
|
||||
m_cnsData.close();
|
||||
m_cnsData.open(cnsDataPath);
|
||||
}
|
||||
}
|
||||
|
||||
void vChewingLM::loadUserPhrases(const char* userPhrasesDataPath,
|
||||
const char* excludedPhrasesDataPath)
|
||||
{
|
||||
|
|
|
@ -10,8 +10,9 @@
|
|||
#define VCHEWINGLM_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include "UserPhrasesLM.h"
|
||||
#include "FastLM.h"
|
||||
#include "CNSLM.h"
|
||||
#include "UserPhrasesLM.h"
|
||||
#include "PhraseReplacementMap.h"
|
||||
#include <unordered_set>
|
||||
|
||||
|
@ -25,7 +26,9 @@ public:
|
|||
~vChewingLM();
|
||||
|
||||
void loadLanguageModel(const char* languageModelPath);
|
||||
void loadCNSData(const char* cnsDataPath);
|
||||
void loadUserPhrases(const char* userPhrasesPath, const char* excludedPhrasesPath);
|
||||
|
||||
void loadPhraseReplacementMap(const char* phraseReplacementPath);
|
||||
|
||||
const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
|
||||
|
@ -41,6 +44,7 @@ protected:
|
|||
std::unordered_set<string>& insertedValues);
|
||||
|
||||
FastLM m_languageModel;
|
||||
CNSLM m_cnsData;
|
||||
UserPhrasesLM m_userPhrases;
|
||||
UserPhrasesLM m_excludedPhrases;
|
||||
PhraseReplacementMap m_phraseReplacement;
|
||||
|
|
|
@ -18,6 +18,7 @@ NS_ASSUME_NONNULL_BEGIN
|
|||
|
||||
+ (void)loadDataModels;
|
||||
+ (void)deployZipDataFile:(NSString *)filenameWithoutExtension;
|
||||
+ (void)loadCNSData;
|
||||
+ (void)loadUserPhrases;
|
||||
+ (void)loadUserPhraseReplacement;
|
||||
+ (BOOL)checkIfUserLanguageModelFilesExist;
|
||||
|
@ -25,7 +26,7 @@ NS_ASSUME_NONNULL_BEGIN
|
|||
+ (NSString *)userPhrasesDataPath:(NSString *)inputMode;
|
||||
+ (NSString *)excludedPhrasesDataPath:(NSString *)inputMode;
|
||||
+ (NSString *)phraseReplacementDataPath:(NSString *)inputMode;
|
||||
+ (NSString *)cnsDataPath:(NSString *)inputMode;
|
||||
+ (NSString *)cnsDataPath;
|
||||
|
||||
@property (class, readonly, nonatomic) NSString *dataFolderPath;
|
||||
@property (class, readonly, nonatomic) vChewing::vChewingLM *languageModelCoreCHT;
|
||||
|
|
|
@ -32,13 +32,6 @@ static NSString *const kBopomofoModeIdentifierCHS = @"org.atelierInmu.inputmetho
|
|||
|
||||
@implementation LanguageModelManager
|
||||
|
||||
/// Looks up the bundled "<filenameWithoutExtension>.txt" resource in the bundle
/// that contains the vChewingInputMethodController class and hands its path to
/// the given language model.
static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm)
{
    NSBundle *bundle = [NSBundle bundleForClass:NSClassFromString(@"vChewingInputMethodController")];
    NSString *dataPath = [bundle pathForResource:filenameWithoutExtension ofType:@"txt"];
    lm.loadLanguageModel([dataPath UTF8String]);
}
|
||||
|
||||
+ (void)deployZipDataFile:(NSString *)filenameWithoutExtension
|
||||
{
|
||||
Class cls = NSClassFromString(@"vChewingInputMethodController");
|
||||
|
@ -47,12 +40,25 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing
|
|||
[SSZipArchive unzipFileAtPath:zipPath toDestination:destinationPath];
|
||||
}
|
||||
|
||||
/// Looks up the bundled "<filenameWithoutExtension>.txt" resource in the bundle
/// that contains the vChewingInputMethodController class and hands its path to
/// the given language model.
static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewingLM &lm)
{
    NSBundle *bundle = [NSBundle bundleForClass:NSClassFromString(@"vChewingInputMethodController")];
    NSString *dataPath = [bundle pathForResource:filenameWithoutExtension ofType:@"txt"];
    lm.loadLanguageModel([dataPath UTF8String]);
}
|
||||
|
||||
/// Loads the bundled "data-cht" and "data-chs" resources into their
/// respective core language models, in that order.
+ (void)loadDataModels
{
    NSString *const resourceForCHT = @"data-cht";
    NSString *const resourceForCHS = @"data-chs";

    LTLoadLanguageModelFile(resourceForCHT, glanguageModelCoreCHT);
    LTLoadLanguageModelFile(resourceForCHS, glanguageModelCoreCHS);
}
|
||||
|
||||
/// Feeds the CNS data file path to both core language models.
+ (void)loadCNSData
{
    // Each path is held in a local so the NSString backing the UTF-8 buffer
    // stays alive for the duration of the call that consumes it.
    NSString *pathForCHT = [self cnsDataPath];
    glanguageModelCoreCHT.loadCNSData([pathForCHT UTF8String]);

    NSString *pathForCHS = [self cnsDataPath];
    glanguageModelCoreCHS.loadCNSData([pathForCHS UTF8String]);
}
|
||||
|
||||
+ (void)loadUserPhrases
|
||||
{
|
||||
glanguageModelCoreCHT.loadUserPhrases([[self userPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String], [[self excludedPhrasesDataPath:kBopomofoModeIdentifierCHT] UTF8String]);
|
||||
|
@ -201,7 +207,7 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing
|
|||
return [[self dataFolderPath] stringByAppendingPathComponent:fileName];
|
||||
}
|
||||
|
||||
+ (NSString *)cnsDataPath:(NSString *)inputMode
|
||||
/// Path of "UNICHARS.csv" inside the data folder.
+ (NSString *)cnsDataPath
{
    NSString *folder = [self dataFolderPath];
    return [folder stringByAppendingPathComponent:@"UNICHARS.csv"];
}
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
5BDD25F8279D6D1200AA18F8 /* zip.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E7279D64FB00AA18F8 /* zip.m */; };
|
||||
5BDD25F9279D6D1200AA18F8 /* ioapi.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E8279D64FB00AA18F8 /* ioapi.m */; };
|
||||
5BDD25FA279D6D1200AA18F8 /* mztools.m in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25E9279D64FB00AA18F8 /* mztools.m */; };
|
||||
5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */; };
|
||||
5BDF2CFE2791BE4400838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; };
|
||||
5BDF2CFF2791BECC00838ADB /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */; };
|
||||
5BDF2D012791C03B00838ADB /* PreferencesWindowController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */; };
|
||||
|
@ -158,6 +159,8 @@
|
|||
5BDD25F0279D64FB00AA18F8 /* SSZipArchive.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SSZipArchive.m; sourceTree = "<group>"; };
|
||||
5BDD25F1279D65CB00AA18F8 /* UNICHARS.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; name = UNICHARS.zip; path = Data/components/common/UNICHARS.zip; sourceTree = "<group>"; };
|
||||
5BDD25F3279D677F00AA18F8 /* libz.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libz.tbd; path = usr/lib/libz.tbd; sourceTree = SDKROOT; };
|
||||
5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CNSLM.cpp; sourceTree = "<group>"; };
|
||||
5BDD25FC279D6D6300AA18F8 /* CNSLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CNSLM.h; sourceTree = "<group>"; };
|
||||
5BDF2CFD2791BE4400838ADB /* InputSourceHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputSourceHelper.swift; sourceTree = "<group>"; };
|
||||
5BDF2D002791C03B00838ADB /* PreferencesWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreferencesWindowController.swift; sourceTree = "<group>"; };
|
||||
5BDF2D022791C71200838ADB /* NonModalAlertWindowController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NonModalAlertWindowController.swift; sourceTree = "<group>"; };
|
||||
|
@ -276,6 +279,8 @@
|
|||
5BA8DAFE27928120009C9FFF /* LanguageModel */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
5BDD25FB279D6D6200AA18F8 /* CNSLM.cpp */,
|
||||
5BDD25FC279D6D6300AA18F8 /* CNSLM.h */,
|
||||
5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */,
|
||||
5B5F4F8C27928F9300922DC2 /* vChewingLM.h */,
|
||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
||||
|
@ -792,6 +797,7 @@
|
|||
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */,
|
||||
5BDF2D032791C71200838ADB /* NonModalAlertWindowController.swift in Sources */,
|
||||
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */,
|
||||
5BDD25FD279D6D6300AA18F8 /* CNSLM.cpp in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue