CNSLM // Instantiation with -11.0 value.

This commit is contained in:
ShikiSuen 2022-02-25 20:59:55 +08:00
parent d3576fc885
commit 8fe16957d9
5 changed files with 31 additions and 183 deletions

View File

@ -1,150 +0,0 @@
// Copyright (c) 2011 and onwards The OpenVanilla Project (MIT License).
// All possible vChewing-specific modifications are (c) 2021 and onwards The vChewing Project (MIT-NTL License).
/*
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1. The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
2. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor,
except as required to fulfill notice requirements above.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CNSLM.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <unistd.h>
#include <syslog.h>
#include "KeyValueBlobReader.h"
namespace vChewing {
// Builds an unloaded model: no file descriptor, no mapping, zero mapped length.
CNSLM::CNSLM()
    : fd(-1), data(nullptr), length(0)
{
}
// Releases the mapping and descriptor through close() when a file is still loaded.
CNSLM::~CNSLM()
{
    if (data == nullptr) {
        return;
    }
    close();
}
// Returns true while `data` points at something, i.e. between a successful open() and close().
bool CNSLM::isLoaded()
{
    return data != nullptr;
}
// Opens the file at `path` read-only, memory-maps it, and indexes every key-value pair
// found by KeyValueBlobReader into keyRowMap.
//
// @param path  Path of the data file to load.
// @return true when the file was mapped (even if the reader ended in the ERROR state,
//         see below); false when the model is already loaded, or when open/fstat/mmap
//         fails. On every failure path the descriptor is closed and fd/length are reset,
//         so the object stays in the "not loaded" state.
bool CNSLM::open(const char *path)
{
    // Refuse to load twice; the caller has to close() the current mapping first.
    if (data) {
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 1.\n");
        return false;
    }

    fd = ::open(path, O_RDONLY);
    if (fd == -1) {
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 2.\n");
        printf("open:: file not exist");
        return false;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        // The descriptor was opened successfully above; release it so it does not leak.
        ::close(fd);
        fd = -1;
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 3.\n");
        printf("open:: cannot open file");
        return false;
    }

    length = static_cast<size_t>(sb.st_size);

    // mmap() signals failure with MAP_FAILED, not with a null pointer. Keep the result in
    // a local so that `data` never holds MAP_FAILED (which would make isLoaded() report
    // true and make close() call munmap() on an invalid address).
    void *mapping = mmap(nullptr, length, PROT_READ, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED) {
        ::close(fd);
        fd = -1;
        length = 0;
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 4.\n");
        return false;
    }
    data = mapping;

    KeyValueBlobReader reader(static_cast<char*>(data), length);
    KeyValueBlobReader::KeyValue keyValue;
    KeyValueBlobReader::State state;
    while ((state = reader.Next(&keyValue)) == KeyValueBlobReader::State::HAS_PAIR) {
        // We invert the key and value, since in user phrases, "key" is the phrase value, and "value" is the BPMF reading.
        keyRowMap[keyValue.value].emplace_back(keyValue.value, keyValue.key);
    }

    // The section below could perhaps be turned into a switch that asks whether to apply a
    // lenient policy to the user's phrases (i.e. let the load go through even when some
    // lines are malformed). For now a reader error is only logged and loading still succeeds.
    if (state == KeyValueBlobReader::State::ERROR) {
        // close();
        syslog(LOG_CONS, "CNSLM: Failed at Open Step 5. On Error Resume Next.\n");
        // return false;
    }
    return true;
}
// Unmaps the file and closes its descriptor when a mapping exists, then always
// empties keyRowMap.
void CNSLM::close()
{
    const bool hasMapping = (data != nullptr);
    if (hasMapping) {
        munmap(data, length);
        ::close(fd);
        data = nullptr;
    }

    keyRowMap.clear();
}
void CNSLM::dump()
{
for (const auto& entry : keyRowMap) {
const std::vector<Row>& rows = entry.second;
for (const auto& row : rows) {
std::cerr << row.key << " " << row.value << "\n";
}
}
}
// This model provides no bigrams: the result is always an empty vector and both
// arguments are ignored.
const std::vector<Taiyan::Gramambular::Bigram> CNSLM::bigramsForKeys(const std::string& preceedingKey, const std::string& key)
{
    std::vector<Taiyan::Gramambular::Bigram> none;
    return none;
}
const std::vector<Taiyan::Gramambular::Unigram> CNSLM::unigramsForKey(const std::string& key)
{
std::vector<Taiyan::Gramambular::Unigram> v;
auto iter = keyRowMap.find(key);
if (iter != keyRowMap.end()) {
const std::vector<Row>& rows = iter->second;
for (const auto& row : rows) {
Taiyan::Gramambular::Unigram g;
g.keyValue.key = row.key;
g.keyValue.value = row.value;
g.score = -17.0;
v.push_back(g);
}
}
return v;
}
// Tells whether keyRowMap holds an entry for `key`.
bool CNSLM::hasUnigramsForKey(const std::string& key)
{
    return keyRowMap.count(key) > 0;
}
}; // namespace vChewing

View File

@ -24,35 +24,19 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR TH
#include <map>
#include <iostream>
#include "LanguageModel.h"
#include "UserPhrasesLM.h"
namespace vChewing {
class CNSLM : public Taiyan::Gramambular::LanguageModel
class CNSLM: public UserPhrasesLM
{
public:
CNSLM();
~CNSLM();
bool isLoaded();
bool open(const char *path);
void close();
void dump();
virtual const std::vector<Taiyan::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
virtual const std::vector<Taiyan::Gramambular::Unigram> unigramsForKey(const std::string& key);
virtual bool hasUnigramsForKey(const std::string& key);
protected:
struct Row {
Row(std::string_view& k, std::string_view& v) : key(k), value(v) {}
std::string_view key;
std::string_view value;
};
std::map<std::string_view, std::vector<Row>> keyRowMap;
int fd;
void *data;
size_t length;
// Override of the base-class hook: this model always answers false.
// In the UserPhrasesLM::open hunk shown in this commit, the LMConsolidator calls
// (FixEOF / ConsolidateContent) run only when this hook returns true.
virtual bool allowConsolidation() override {
return false;
}
// Override of the base-class hook: this model always reports -11.0.
// NOTE(review): the UserPhrasesLM::unigramsForKey hunk in this commit appears to assign
// this value to each unigram's score — confirm against the full file.
virtual float overridedValue() override {
return -11.0;
}
};
}

View File

@ -38,6 +38,14 @@ public:
void close();
void dump();
// Overridable hook; the default answer is true.
// In the UserPhrasesLM::open hunk shown in this commit, LMConsolidator::FixEOF and
// LMConsolidator::ConsolidateContent are called on the path only when this returns true.
virtual bool allowConsolidation() {
return true;
}
// Overridable hook; the default value is 0.0.
// NOTE(review): the UserPhrasesLM::unigramsForKey hunk in this commit appears to use this
// as the score of every unigram it returns — confirm against the full file.
virtual float overridedValue() {
return 0.0;
}
virtual const std::vector<Taiyan::Gramambular::Bigram> bigramsForKeys(const std::string& preceedingKey, const std::string& key);
virtual const std::vector<Taiyan::Gramambular::Unigram> unigramsForKey(const std::string& key);
virtual bool hasUnigramsForKey(const std::string& key);

View File

@ -59,8 +59,10 @@ bool UserPhrasesLM::open(const char *path)
return false;
}
if (allowConsolidation()) {
LMConsolidator::FixEOF(path);
LMConsolidator::ConsolidateContent(path, true);
}
fd = ::open(path, O_RDONLY);
if (fd == -1) {
@ -134,7 +136,7 @@ const std::vector<Taiyan::Gramambular::Unigram> UserPhrasesLM::unigramsForKey(co
Taiyan::Gramambular::Unigram g;
g.keyValue.key = row.key;
g.keyValue.value = row.value;
g.score = 0.0;
g.score = overridedValue();
v.push_back(g);
}
}

View File

@ -18,7 +18,6 @@
5B62A31B27AE73A700A19448 /* SSZipArchive.m in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A31327AE73A700A19448 /* SSZipArchive.m */; };
5B62A31C27AE73A700A19448 /* AWFileHash.m in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A31627AE73A700A19448 /* AWFileHash.m */; };
5B62A32927AE77D100A19448 /* FSEventStreamHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A32827AE77D100A19448 /* FSEventStreamHelper.swift */; };
5B62A32E27AE78B000A19448 /* CNSLM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A32A27AE78B000A19448 /* CNSLM.mm */; };
5B62A32F27AE78B000A19448 /* CoreLM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A32D27AE78B000A19448 /* CoreLM.mm */; };
5B62A33227AE792F00A19448 /* InputSourceHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A33127AE792F00A19448 /* InputSourceHelper.swift */; };
5B62A33627AE795800A19448 /* PreferencesModule.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B62A33527AE795800A19448 /* PreferencesModule.swift */; };
@ -191,7 +190,6 @@
5B62A32627AE77BB00A19448 /* LMConsolidator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LMConsolidator.h; sourceTree = "<group>"; };
5B62A32727AE77BB00A19448 /* LMConsolidator.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LMConsolidator.mm; sourceTree = "<group>"; };
5B62A32827AE77D100A19448 /* FSEventStreamHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FSEventStreamHelper.swift; sourceTree = "<group>"; };
5B62A32A27AE78B000A19448 /* CNSLM.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CNSLM.mm; sourceTree = "<group>"; };
5B62A32B27AE78B000A19448 /* CNSLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CNSLM.h; sourceTree = "<group>"; };
5B62A32C27AE78B000A19448 /* CoreLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CoreLM.h; sourceTree = "<group>"; };
5B62A32D27AE78B000A19448 /* CoreLM.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CoreLM.mm; sourceTree = "<group>"; };
@ -364,6 +362,14 @@
name = MiscRootFiles;
sourceTree = "<group>";
};
5B4D47B627C9186900220DDC /* InstantiatedModels */ = {
isa = PBXGroup;
children = (
5B62A32B27AE78B000A19448 /* CNSLM.h */,
);
path = InstantiatedModels;
sourceTree = "<group>";
};
5B62A30127AE732800A19448 /* 3rdParty */ = {
isa = PBXGroup;
children = (
@ -495,8 +501,7 @@
5B62A32527AE758000A19448 /* SubLanguageModels */ = {
isa = PBXGroup;
children = (
5B62A32B27AE78B000A19448 /* CNSLM.h */,
5B62A32A27AE78B000A19448 /* CNSLM.mm */,
5B4D47B627C9186900220DDC /* InstantiatedModels */,
5B62A32C27AE78B000A19448 /* CoreLM.h */,
5B62A32D27AE78B000A19448 /* CoreLM.mm */,
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.mm */,
@ -1054,7 +1059,6 @@
5B62A34A27AE7CD900A19448 /* NotifierController.swift in Sources */,
5B11328927B94CFB00E58451 /* AppleKeyboardConverter.swift in Sources */,
5B62A31827AE73A700A19448 /* zip.m in Sources */,
5B62A32E27AE78B000A19448 /* CNSLM.mm in Sources */,
D41355DB278E6D17005E5CBD /* LMInstantiator.mm in Sources */,
5B62A31A27AE73A700A19448 /* mztools.m in Sources */,
5B62A32927AE77D100A19448 /* FSEventStreamHelper.swift in Sources */,