Remove OpenVanilla dependencies

This removes one overengineered method from BopomofoSyllable and
rewrites a helper using a simpler UTF-8 heuristic.

Also adds the CMake project file and a unit test suite.
This commit is contained in:
Lukhnos Liu 2022-01-31 21:41:12 -08:00
parent 0700e0fc60
commit 4ebe1a1a11
4 changed files with 114 additions and 39 deletions

View File

@ -0,0 +1,31 @@
# Build script for the Mandarin library and its GoogleTest-based unit tests.
cmake_minimum_required(VERSION 3.17)
project(Mandarin)

set(CMAKE_CXX_STANDARD 17)

add_library(MandarinLib Mandarin.h Mandarin.cpp)

# Turn on CTest support for this directory. Without this call the tests that
# gtest_discover_tests() registers below are never written to a CTest file,
# and running `ctest` in the build tree reports that no tests were found.
enable_testing()

# Let CMake fetch Google Test for us.
# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project
include(FetchContent)
FetchContent_Declare(
  googletest
  # Specify the commit you depend on and update it regularly.
  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Test target declarations.
add_executable(MandarinTest MandarinTest.cpp)
target_link_libraries(MandarinTest PRIVATE gtest_main MandarinLib)

include(GoogleTest)
gtest_discover_tests(MandarinTest)

# Convenience target: `cmake --build . --target runTest` builds the test
# binary and runs it directly. $<TARGET_FILE:...> resolves to the real location
# of the executable, which differs from ${CMAKE_CURRENT_BINARY_DIR}/MandarinTest
# under multi-config generators (per-config subdirectory) and on Windows (.exe).
add_custom_target(
  runTest
  COMMAND $<TARGET_FILE:MandarinTest>
  VERBATIM
)
add_dependencies(runTest MandarinTest)

View File

@ -29,14 +29,9 @@
#include <algorithm> #include <algorithm>
#include "Mandarin.h" #include "Mandarin.h"
#include "OVUTF8Helper.h"
#include "OVWildcard.h"
namespace Formosa { namespace Formosa {
namespace Mandarin { namespace Mandarin {
using namespace OpenVanilla;
class PinyinParseHelper { class PinyinParseHelper {
public: public:
static const bool ConsumePrefix(string &target, const string &prefix) static const bool ConsumePrefix(string &target, const string &prefix)
@ -591,15 +586,43 @@ const BPMF BPMF::FromPHT(const string& str)
// Builds a syllable from a UTF-8 string of composed Bopomofo characters.
//
// The input is consumed one UTF-8 sequence at a time. Each sequence is looked
// up in BopomofoCharacterMap's characterToComponent table and, when found,
// merged into the result with operator+=. Parsing stops at the first sequence
// that is not recognized; whatever was merged up to that point is returned.
// An empty or entirely unrecognized input yields a default-constructed BPMF.
const BPMF BPMF::FromComposedString(const string& str)
{
    BPMF syllable;
    const map<string, BPMF::Component>& charToComp = BopomofoCharacterMap::SharedInstance().characterToComponent;

    auto iter = str.begin();
    while (iter != str.end()) {
        // This is a naive implementation and we bail early at anything we don't recognize.
        // A sound implementation would require to either use a trie for the Bopomofo character map
        // or to split the input by codepoints. This suffices for now.

        // Work on an unsigned copy so the bit tests do not depend on whether
        // plain char is signed on this platform.
        const unsigned char lead = static_cast<unsigned char>(*iter);

        size_t utf8_length = 0;
        if ((lead & 0xE0) == 0xC0) {
            // 110xxxxx: lead byte of a 2-byte sequence.
            // These are the code points for the tone markers.
            utf8_length = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 1110xxxx: lead byte of a 3-byte sequence.
            utf8_length = 3;
        } else {
            // Illegal here: ASCII, a stray continuation byte, or a longer sequence.
            break;
        }

        // Truncated sequence: stop before forming an iterator past end().
        // Comparing the remaining byte count covers every short-input case,
        // including a 3-byte lead that is the very last byte of the string.
        if (static_cast<size_t>(str.end() - iter) < utf8_length) {
            break;
        }

        const string component(iter, iter + utf8_length);
        map<string, BPMF::Component>::const_iterator result = charToComp.find(component);
        if (result == charToComp.end()) {
            break;
        }

        syllable += BPMF(result->second);
        iter += utf8_length;
    }

    return syllable;
}
@ -736,29 +759,6 @@ void BopomofoKeyboardLayout::FinalizeLayouts()
#undef FL #undef FL
} }
// Looks up a built-in keyboard layout by name.
//
// `name` is tested against each known layout name in turn using
// OVWildcard::Match, and the first layout whose name matches is returned.
// Four spellings are accepted for the Hanyu Pinyin layout. Returns 0 (a null
// pointer) when no name matches.
// NOTE(review): the declaration's comment says matching is case-insensitive;
// that depends on OVWildcard::Match, which is not visible here -- confirm.
const BopomofoKeyboardLayout* BopomofoKeyboardLayout::LayoutForName(const string& name)
{
if (OVWildcard::Match(name, "standard"))
return StandardLayout();
if (OVWildcard::Match(name, "eten"))
return ETenLayout();
if (OVWildcard::Match(name, "hsu"))
return HsuLayout();
if (OVWildcard::Match(name, "eten26"))
return ETen26Layout();
if (OVWildcard::Match(name, "IBM"))
return IBMLayout();
// Any of these four spellings selects the Hanyu Pinyin layout.
if (OVWildcard::Match(name, "hanyupinyin") || OVWildcard::Match(name, "hanyu pinyin") || OVWildcard::Match(name, "hanyu-pinyin") || OVWildcard::Match(name, "pinyin"))
return HanyuPinyinLayout();
// No known layout name matched.
return 0;
}
#define ASSIGNKEY1(m, vec, k, val) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val), vec) #define ASSIGNKEY1(m, vec, k, val) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val), vec)
#define ASSIGNKEY2(m, vec, k, val1, val2) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec) #define ASSIGNKEY2(m, vec, k, val1, val2) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec)
#define ASSIGNKEY3(m, vec, k, val1, val2, val3) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec.push_back((BPMF::Component)val3), vec) #define ASSIGNKEY3(m, vec, k, val1, val2, val3) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec.push_back((BPMF::Component)val3), vec)

View File

@ -271,9 +271,6 @@ namespace Formosa {
static const BopomofoKeyboardLayout* IBMLayout(); static const BopomofoKeyboardLayout* IBMLayout();
static const BopomofoKeyboardLayout* HanyuPinyinLayout(); static const BopomofoKeyboardLayout* HanyuPinyinLayout();
// recognizes (case-insensitive): standard, eten, hsu, eten26, ibm, and the Hanyu Pinyin aliases (hanyupinyin, "hanyu pinyin", hanyu-pinyin, pinyin)
static const BopomofoKeyboardLayout* LayoutForName(const string& name);
BopomofoKeyboardLayout(const BopomofoKeyToComponentMap& ktcm, const string& name) BopomofoKeyboardLayout(const BopomofoKeyToComponentMap& ktcm, const string& name)
: m_keyToComponent(ktcm) : m_keyToComponent(ktcm)
, m_name(name) , m_name(name)

View File

@ -0,0 +1,47 @@
// Copyright (c) 2022 and onwards Lukhnos Liu
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "Mandarin.h"
#include "gtest/gtest.h"
namespace Formosa {
namespace Mandarin {

// Parses `composedString` with BopomofoSyllable::FromComposedString and
// renders the resulting syllable back to text with composedString().
static std::string RoundTrip(const std::string& composedString) {
  return BopomofoSyllable::FromComposedString(composedString).composedString();
}

TEST(MandarinTest, FromComposedString) {
  // Empty input yields an empty syllable.
  ASSERT_EQ(RoundTrip(""), "");

  // Well-formed input survives the round trip unchanged.
  ASSERT_EQ(RoundTrip("ㄅ"), "ㄅ");
  ASSERT_EQ(RoundTrip("ㄅㄧ"), "ㄅㄧ");
  ASSERT_EQ(RoundTrip("ㄅㄧˇ"), "ㄅㄧˇ");

  // A second consonant replaces the first one when components are merged.
  ASSERT_EQ(RoundTrip("ㄅㄧˇㄆ"), "ㄆㄧˇ");

  // Input that starts with an unrecognized character produces nothing:
  // "e" is plain ASCII, "é" is a 2-byte sequence that is not Bopomofo.
  ASSERT_EQ(RoundTrip("e"), "");
  ASSERT_EQ(RoundTrip("é"), "");

  // Parsing stops at the first unrecognized character but keeps the
  // components that were already parsed, so the leading "ㄅ" survives and
  // the trailing "ㄆ" is never reached.
  ASSERT_EQ(RoundTrip("ㄅéㄆ"), "ㄅ");
  ASSERT_EQ(RoundTrip("ㄅeㄆ"), "ㄅ");
}

}  // namespace Mandarin
}  // namespace Formosa