Repo // Remove remained ParselessLM files.
This commit is contained in:
parent
07a6a51954
commit
69b8c36186
|
@ -1,168 +0,0 @@
|
|||
// Copyright (c) 2011 and onwards The OpenVanilla Project (MIT License).
|
||||
// All possible vChewing-specific modifications are of:
|
||||
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
1. The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
2. No trademark license is granted to use the trade names, trademarks, service
|
||||
marks, or product names of Contributor, except as required to fulfill notice
|
||||
requirements above.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ParselessLM.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
// Releases whatever close() releases (mapping and file descriptor) when the
// model is destroyed.
vChewing::ParselessLM::~ParselessLM()
{
    this->close();
}
|
||||
|
||||
bool vChewing::ParselessLM::isLoaded()
|
||||
{
|
||||
if (data_)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool vChewing::ParselessLM::open(const std::string_view &path)
|
||||
{
|
||||
if (data_)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
fd_ = ::open(path.data(), O_RDONLY);
|
||||
if (fd_ == -1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd_, &sb) == -1)
|
||||
{
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
return false;
|
||||
}
|
||||
|
||||
length_ = static_cast<size_t>(sb.st_size);
|
||||
|
||||
data_ = mmap(NULL, length_, PROT_READ, MAP_SHARED, fd_, 0);
|
||||
if (data_ == nullptr)
|
||||
{
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
length_ = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
db_ = std::unique_ptr<ParselessPhraseDB>(new ParselessPhraseDB(static_cast<char *>(data_), length_));
|
||||
return true;
|
||||
}
|
||||
|
||||
void vChewing::ParselessLM::close()
|
||||
{
|
||||
if (data_ != nullptr)
|
||||
{
|
||||
munmap(data_, length_);
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
length_ = 0;
|
||||
data_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// This model provides no bigrams: both keys are ignored and the result is
// always an empty vector.
const std::vector<Gramambular::Bigram> vChewing::ParselessLM::bigramsForKeys(const std::string &preceedingKey,
                                                                             const std::string &key)
{
    std::vector<Gramambular::Bigram> none;
    return none;
}
|
||||
|
||||
const std::vector<Gramambular::Unigram> vChewing::ParselessLM::unigramsForKey(const std::string &key)
|
||||
{
|
||||
if (db_ == nullptr)
|
||||
{
|
||||
return std::vector<Gramambular::Unigram>();
|
||||
}
|
||||
|
||||
std::vector<Gramambular::Unigram> results;
|
||||
for (const auto &row : db_->findRows(key + " "))
|
||||
{
|
||||
Gramambular::Unigram unigram;
|
||||
|
||||
// Move ahead until we encounter the first space. This is the key.
|
||||
auto it = row.begin();
|
||||
while (it != row.end() && *it != ' ')
|
||||
{
|
||||
++it;
|
||||
}
|
||||
|
||||
unigram.keyValue.key = std::string(row.begin(), it);
|
||||
|
||||
// Read past the space.
|
||||
if (it != row.end())
|
||||
{
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != row.end())
|
||||
{
|
||||
// Now it is the start of the value portion.
|
||||
auto value_begin = it;
|
||||
|
||||
// Move ahead until we encounter the second space. This is the
|
||||
// value.
|
||||
while (it != row.end() && *it != ' ')
|
||||
{
|
||||
++it;
|
||||
}
|
||||
unigram.keyValue.value = std::string(value_begin, it);
|
||||
}
|
||||
|
||||
// Read past the space. The remainder, if it exists, is the score.
|
||||
if (it != row.end())
|
||||
{
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != row.end())
|
||||
{
|
||||
unigram.score = std::stod(std::string(it, row.end()));
|
||||
}
|
||||
results.push_back(unigram);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
bool vChewing::ParselessLM::hasUnigramsForKey(const std::string &key)
|
||||
{
|
||||
if (db_ == nullptr)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return db_->findFirstMatchingLine(key + " ") != nullptr;
|
||||
}
|
|
@ -1,63 +0,0 @@
|
|||
// Copyright (c) 2011 and onwards The OpenVanilla Project (MIT License).
|
||||
// All possible vChewing-specific modifications are of:
|
||||
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
1. The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
2. No trademark license is granted to use the trade names, trademarks, service
|
||||
marks, or product names of Contributor, except as required to fulfill notice
|
||||
requirements above.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef SOURCE_ENGINE_PARSELESSLM_H_
|
||||
#define SOURCE_ENGINE_PARSELESSLM_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "LanguageModel.h"
|
||||
#include "ParselessPhraseDB.h"
|
||||
|
||||
namespace vChewing
|
||||
{
|
||||
|
||||
class ParselessLM : public Gramambular::LanguageModel
|
||||
{
|
||||
public:
|
||||
~ParselessLM() override;
|
||||
|
||||
bool isLoaded();
|
||||
bool open(const std::string_view &path);
|
||||
void close();
|
||||
|
||||
const std::vector<Gramambular::Bigram> bigramsForKeys(const std::string &preceedingKey,
|
||||
const std::string &key) override;
|
||||
const std::vector<Gramambular::Unigram> unigramsForKey(const std::string &key) override;
|
||||
bool hasUnigramsForKey(const std::string &key) override;
|
||||
|
||||
private:
|
||||
int fd_ = -1;
|
||||
void *data_ = nullptr;
|
||||
size_t length_ = 0;
|
||||
std::unique_ptr<ParselessPhraseDB> db_;
|
||||
};
|
||||
|
||||
}; // namespace vChewing
|
||||
|
||||
#endif // SOURCE_ENGINE_PARSELESSLM_H_
|
|
@ -1,163 +0,0 @@
|
|||
// Copyright (c) 2011 and onwards The OpenVanilla Project (MIT License).
|
||||
// All possible vChewing-specific modifications are of:
|
||||
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
1. The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
2. No trademark license is granted to use the trade names, trademarks, service
|
||||
marks, or product names of Contributor, except as required to fulfill notice
|
||||
requirements above.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ParselessPhraseDB.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
namespace vChewing
|
||||
{
|
||||
|
||||
ParselessPhraseDB::ParselessPhraseDB(const char *buf, size_t length) : begin_(buf), end_(buf + length)
|
||||
{
|
||||
}
|
||||
|
||||
std::vector<std::string_view> ParselessPhraseDB::findRows(const std::string_view &key)
|
||||
{
|
||||
std::vector<std::string_view> rows;
|
||||
|
||||
const char *ptr = findFirstMatchingLine(key);
|
||||
if (ptr == nullptr)
|
||||
{
|
||||
return rows;
|
||||
}
|
||||
|
||||
while (ptr + key.length() <= end_ && memcmp(ptr, key.data(), key.length()) == 0)
|
||||
{
|
||||
const char *eol = ptr;
|
||||
|
||||
while (eol != end_ && *eol != '\n')
|
||||
{
|
||||
++eol;
|
||||
}
|
||||
|
||||
rows.emplace_back(ptr, eol - ptr);
|
||||
if (eol == end_)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
ptr = ++eol;
|
||||
}
|
||||
|
||||
return rows;
|
||||
}
|
||||
|
||||
// Implements a binary search that returns the pointer to the first matching
|
||||
// row. In its core it's just a standard binary search, but we use backtracking
|
||||
// to locate the line start. We also check the previous line to see if the
|
||||
// current line is actually the first matching line: if the previous line is
|
||||
// less to the key and the current line starts exactly with the key, then
|
||||
// the current line is the first matching line.
|
||||
const char *ParselessPhraseDB::findFirstMatchingLine(const std::string_view &key)
|
||||
{
|
||||
if (key.empty())
|
||||
{
|
||||
return begin_;
|
||||
}
|
||||
|
||||
const char *top = begin_;
|
||||
const char *bottom = end_;
|
||||
|
||||
while (top < bottom)
|
||||
{
|
||||
const char *mid = top + (bottom - top) / 2;
|
||||
const char *ptr = mid;
|
||||
|
||||
if (ptr != begin_)
|
||||
{
|
||||
--ptr;
|
||||
}
|
||||
|
||||
while (ptr != begin_ && *ptr != '\n')
|
||||
{
|
||||
--ptr;
|
||||
}
|
||||
|
||||
const char *prev = nullptr;
|
||||
if (*ptr == '\n')
|
||||
{
|
||||
prev = ptr;
|
||||
++ptr;
|
||||
}
|
||||
|
||||
// ptr is now in the "current" line we're interested in.
|
||||
if (ptr + key.length() > end_)
|
||||
{
|
||||
// not enough data to compare at this point, bail.
|
||||
break;
|
||||
}
|
||||
|
||||
int current_cmp = memcmp(ptr, key.data(), key.length());
|
||||
|
||||
if (current_cmp > 0)
|
||||
{
|
||||
bottom = mid - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current_cmp < 0)
|
||||
{
|
||||
top = mid + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!prev)
|
||||
{
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Move the prev so that it reaches the previous line.
|
||||
if (prev != begin_)
|
||||
{
|
||||
--prev;
|
||||
}
|
||||
while (prev != begin_ && *prev != '\n')
|
||||
{
|
||||
--prev;
|
||||
}
|
||||
if (*prev == '\n')
|
||||
{
|
||||
++prev;
|
||||
}
|
||||
|
||||
int prev_cmp = memcmp(prev, key.data(), key.length());
|
||||
|
||||
// This is the first occurrence.
|
||||
if (prev_cmp < 0 && current_cmp == 0)
|
||||
{
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// This is not, which means ptr is "larger" than the keyData.
|
||||
bottom = mid - 1;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
}; // namespace vChewing
|
|
@ -1,61 +0,0 @@
|
|||
// Copyright (c) 2011 and onwards The OpenVanilla Project (MIT License).
|
||||
// All possible vChewing-specific modifications are of:
|
||||
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
1. The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
2. No trademark license is granted to use the trade names, trademarks, service
|
||||
marks, or product names of Contributor, except as required to fulfill notice
|
||||
requirements above.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||
#define SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace vChewing
|
||||
{
|
||||
|
||||
// A phrase database made of (key, value, score) rows, one per line, that are
// pre-sorted by the byte value of their keys. Nothing is parsed up front:
// lookups run a binary search directly over the sorted bytes, which is what
// the original authors credit for it being much faster than FastLM.
//
// The class only stores pointers to the first and one-past-the-last byte of
// the buffer it is given; it does not copy or release that buffer.
class ParselessPhraseDB
{
  public:
    // `buf` points at the first byte of the sorted data, `length` is its size
    // in bytes.
    ParselessPhraseDB(const char *buf, size_t length);

    // Returns the rows that match `key`. Matching is by prefix, so a caller
    // that needs an exact key match has to append the delimiter (usually a
    // space) to the key.
    std::vector<std::string_view> findRows(const std::string_view &key);

    // Returns a pointer to the start of the first line beginning with `key`,
    // or nullptr when no such line is found.
    const char *findFirstMatchingLine(const std::string_view &key);

  private:
    const char *begin_; // first byte of the data
    const char *end_;   // one past the last byte of the data
};
|
||||
|
||||
}; // namespace vChewing
|
||||
|
||||
#endif // SOURCE_ENGINE_PARSELESSPHRASEDB_H_
|
|
@ -315,10 +315,6 @@
|
|||
6ACA41EF15FC1D9000935EF6 /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/Localizable.strings; sourceTree = "<group>"; };
|
||||
6ACA41F215FC1D9000935EF6 /* Installer-Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = "Installer-Info.plist"; path = "Installer/Installer-Info.plist"; sourceTree = SOURCE_ROOT; };
|
||||
6ACA41F315FC1D9000935EF6 /* Installer-Prefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "Installer-Prefix.pch"; path = "Installer/Installer-Prefix.pch"; sourceTree = SOURCE_ROOT; };
|
||||
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = ParselessPhraseDB.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
|
||||
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = ParselessPhraseDB.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
|
||||
6ACC3D422793701600F1B140 /* ParselessLM.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = ParselessLM.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
|
||||
6ACC3D432793701600F1B140 /* ParselessLM.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = ParselessLM.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
|
||||
D427A9BF25ED28CC005D43E0 /* vChewing-Bridging-Header.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = "vChewing-Bridging-Header.h"; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
|
||||
D427F76B278CA1BA004A2160 /* AppDelegate.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = AppDelegate.swift; sourceTree = "<group>"; tabWidth = 2; usesTabs = 0; };
|
||||
D456576D279E4F7B00DF6BC9 /* InputHandler.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = InputHandler.swift; sourceTree = "<group>"; tabWidth = 2; usesTabs = 0; };
|
||||
|
@ -501,10 +497,6 @@
|
|||
5B62A32527AE758000A19448 /* OldFileReferences */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
6ACC3D422793701600F1B140 /* ParselessLM.cpp */,
|
||||
6ACC3D432793701600F1B140 /* ParselessLM.h */,
|
||||
6ACC3D402793701600F1B140 /* ParselessPhraseDB.cpp */,
|
||||
6ACC3D412793701600F1B140 /* ParselessPhraseDB.h */,
|
||||
D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */,
|
||||
D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */,
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue