From 4ebe1a1a11ba40e1edc7c112c0cfb61bfdacff34 Mon Sep 17 00:00:00 2001 From: Lukhnos Liu Date: Mon, 31 Jan 2022 21:41:12 -0800 Subject: [PATCH] Remove OpenVanilla dependencies This removes one overengineered method from BopomofoSyllable and rewrites a helper using a simpler UTF-8 heuristic. Also adds the CMake project file and a unit test suite. --- Source/Engine/Mandarin/CMakeLists.txt | 31 +++++++++++ Source/Engine/Mandarin/Mandarin.cpp | 72 ++++++++++++------------- Source/Engine/Mandarin/Mandarin.h | 3 -- Source/Engine/Mandarin/MandarinTest.cpp | 47 ++++++++++++++++ 4 files changed, 114 insertions(+), 39 deletions(-) create mode 100644 Source/Engine/Mandarin/CMakeLists.txt create mode 100644 Source/Engine/Mandarin/MandarinTest.cpp diff --git a/Source/Engine/Mandarin/CMakeLists.txt b/Source/Engine/Mandarin/CMakeLists.txt new file mode 100644 index 00000000..fbfedea2 --- /dev/null +++ b/Source/Engine/Mandarin/CMakeLists.txt @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.17) +project(Mandarin) + +set(CMAKE_CXX_STANDARD 17) + +add_library(MandarinLib Mandarin.h Mandarin.cpp) + +# Let CMake fetch Google Test for us. +# https://github.com/google/googletest/tree/main/googletest#incorporating-into-an-existing-cmake-project +include(FetchContent) + +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Test target declarations. 
+add_executable(MandarinTest MandarinTest.cpp)
+target_link_libraries(MandarinTest gtest_main MandarinLib)
+include(GoogleTest)
+gtest_discover_tests(MandarinTest)
+
+add_custom_target(
+    runTest
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/MandarinTest
+)
+add_dependencies(runTest MandarinTest)
diff --git a/Source/Engine/Mandarin/Mandarin.cpp b/Source/Engine/Mandarin/Mandarin.cpp
index 2fa0452b..bfd2cb55 100644
--- a/Source/Engine/Mandarin/Mandarin.cpp
+++ b/Source/Engine/Mandarin/Mandarin.cpp
@@ -29,14 +29,9 @@
 #include
 #include "Mandarin.h"
 
-#include "OVUTF8Helper.h"
-#include "OVWildcard.h"
-
 namespace Formosa {
 namespace Mandarin {
 
-using namespace OpenVanilla;
-
 class PinyinParseHelper {
 public:
     static const bool ConsumePrefix(string &target, const string &prefix)
@@ -591,15 +586,43 @@ const BPMF BPMF::FromPHT(const string& str)
 const BPMF BPMF::FromComposedString(const string& str)
 {
     BPMF syllable;
-    vector components = OVUTF8Helper::SplitStringByCodePoint(str);
-    for (vector::iterator iter = components.begin() ; iter != components.end() ; ++iter) {
-
-        const map& charToComp = BopomofoCharacterMap::SharedInstance().characterToComponent;
-        map::const_iterator result = charToComp.find(*iter);
-        if (result != charToComp.end())
-            syllable += BPMF((*result).second);
+    auto iter = str.begin();
+    while (iter != str.end()) {
+        // This is a naive implementation and we bail early at anything we don't recognize.
+        // A sound implementation would require to either use a trie for the Bopomofo character map
+        // or to split the input by codepoints. This suffices for now.
+
+        // Illegal: a byte with the high bit clear (ASCII) is not accepted here.
+        const unsigned char lead = *iter;
+        if (!(lead & 0x80)) {
+            break;
+        }
+
+        // Accept only 2-byte (110xxxxx, e.g. tone markers) and 3-byte (1110xxxx) lead bytes.
+        size_t utf8_length = 0;
+        if ((lead & 0xE0) == 0xC0) {
+            utf8_length = 2;
+        } else if ((lead & 0xF0) == 0xE0) {
+            utf8_length = 3;
+        } else {
+            break;
+        }
+
+        // A truncated trailing sequence ends parsing; never form an iterator past end().
+        const size_t remaining = str.end() - iter;
+        if (remaining < utf8_length) {
+            break;
+        }
+
+        const string component(iter, iter + utf8_length);
+        const auto& charToComp = BopomofoCharacterMap::SharedInstance().characterToComponent;
+        const auto result = charToComp.find(component);
+        if (result == charToComp.end()) {
+            break;
+        }
+        syllable += BPMF(result->second);
+        iter += utf8_length;
     }
-
     return syllable;
 }
 
@@ -736,29 +759,6 @@ void BopomofoKeyboardLayout::FinalizeLayouts()
 #undef FL
 }
 
-const BopomofoKeyboardLayout* BopomofoKeyboardLayout::LayoutForName(const string& name)
-{
-    if (OVWildcard::Match(name, "standard"))
-        return StandardLayout();
-
-    if (OVWildcard::Match(name, "eten"))
-        return ETenLayout();
-
-    if (OVWildcard::Match(name, "hsu"))
-        return HsuLayout();
-
-    if (OVWildcard::Match(name, "eten26"))
-        return ETen26Layout();
-
-    if (OVWildcard::Match(name, "IBM"))
-        return IBMLayout();
-
-    if (OVWildcard::Match(name, "hanyupinyin") || OVWildcard::Match(name, "hanyu pinyin") || OVWildcard::Match(name, "hanyu-pinyin") || OVWildcard::Match(name, "pinyin"))
-        return HanyuPinyinLayout();
-
-    return 0;
-}
-
 #define ASSIGNKEY1(m, vec, k, val) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val), vec)
 #define ASSIGNKEY2(m, vec, k, val1, val2) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec)
 #define ASSIGNKEY3(m, vec, k, val1, val2, val3) m[k] = (vec.clear(), vec.push_back((BPMF::Component)val1), vec.push_back((BPMF::Component)val2), vec.push_back((BPMF::Component)val3), vec)
diff --git a/Source/Engine/Mandarin/Mandarin.h b/Source/Engine/Mandarin/Mandarin.h
index 01ede589..40df0133 100644
--- a/Source/Engine/Mandarin/Mandarin.h
+++ b/Source/Engine/Mandarin/Mandarin.h
@@ -271,9 +271,6 @@ namespace Formosa {
             static const BopomofoKeyboardLayout* IBMLayout();
             static const BopomofoKeyboardLayout* HanyuPinyinLayout();
 
-            // recognizes (case-insensitive): standard, eten, hsu, eten26, ibm
-            static const BopomofoKeyboardLayout* LayoutForName(const string& name);
-
             BopomofoKeyboardLayout(const BopomofoKeyToComponentMap& ktcm, const string& name)
                 : m_keyToComponent(ktcm)
                 , m_name(name)
diff --git a/Source/Engine/Mandarin/MandarinTest.cpp b/Source/Engine/Mandarin/MandarinTest.cpp
new file mode 100644
index 00000000..c0242b7a
--- /dev/null
+++ b/Source/Engine/Mandarin/MandarinTest.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 and onwards Lukhnos Liu
+//
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "Mandarin.h"
+#include "gtest/gtest.h"
+
+namespace Formosa {
+namespace Mandarin {
+
+static std::string RoundTrip(const std::string& composedString) {
+  return BopomofoSyllable::FromComposedString(composedString).composedString();
+}
+
+TEST(MandarinTest, FromComposedString) {
+  ASSERT_EQ(RoundTrip("ㄅ"), "ㄅ");
+  ASSERT_EQ(RoundTrip("ㄅㄧ"), "ㄅㄧ");
+  ASSERT_EQ(RoundTrip("ㄅㄧˇ"), "ㄅㄧˇ");
+  ASSERT_EQ(RoundTrip("ㄅㄧˇㄆ"), "ㄆㄧˇ");
+  ASSERT_EQ(RoundTrip("ㄅ\xE3"), "ㄅ");
+  ASSERT_EQ(RoundTrip("e"), "");
+  ASSERT_EQ(RoundTrip("é"), "");
+  ASSERT_EQ(RoundTrip("ㄅéㄆ"), "ㄅ");
+  ASSERT_EQ(RoundTrip("ㄅeㄆ"), "ㄅ");
+}
+
+}  // namespace Mandarin
+}  // namespace Formosa