From 945eb97f9ff34b92469df349c66ea5ed75deca78 Mon Sep 17 00:00:00 2001 From: Nikita Kniazev Date: Fri, 17 Mar 2023 19:50:27 +0300 Subject: [PATCH] Cease dependence on Boost.Regex `boost/spirit/include/classic_regex.hpp` is an optional header --- CMakeLists.txt | 1 - include/boost/spirit/home/support/utf8.hpp | 96 ++++++++++------ .../spirit/home/x3/support/utility/utf8.hpp | 105 +++++++++++------- 3 files changed, 124 insertions(+), 78 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a46979c89..6217f54c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,6 @@ target_link_libraries(boost_spirit Boost::preprocessor Boost::proto Boost::range - Boost::regex Boost::smart_ptr Boost::static_assert Boost::thread diff --git a/include/boost/spirit/home/support/utf8.hpp b/include/boost/spirit/home/support/utf8.hpp index 59fcb9881..9d0fb23ad 100644 --- a/include/boost/spirit/home/support/utf8.hpp +++ b/include/boost/spirit/home/support/utf8.hpp @@ -1,5 +1,6 @@ /*============================================================================= Copyright (c) 2001-2011 Joel de Guzman + Copyright (c) 2023 Nikita Kniazev Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -11,10 +12,9 @@ #pragma once #endif +#include #include -#include #include -#include #include namespace boost { namespace spirit @@ -24,48 +24,63 @@ namespace boost { namespace spirit typedef std::basic_string ucs4_string; typedef std::basic_string utf8_string; +namespace detail { + inline void utf8_put_encode(utf8_string& out, ucs4_char x) + { + // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90 + if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul))) + x = 0xFFFDul; + + // Table 3-6. 
UTF-8 Bit Distribution + if (x < 0x80ul) { + out.push_back(static_cast(x)); + } + else if (x < 0x800ul) { + out.push_back(static_cast(0xC0ul + (x >> 6))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + else if (x < 0x10000ul) { + out.push_back(static_cast(0xE0ul + (x >> 12))); + out.push_back(static_cast(0x80ul + ((x >> 6) & 0x3Ful))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + else { + out.push_back(static_cast(0xF0ul + (x >> 18))); + out.push_back(static_cast(0x80ul + ((x >> 12) & 0x3Ful))); + out.push_back(static_cast(0x80ul + ((x >> 6) & 0x3Ful))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + } +} + template inline utf8_string to_utf8(Char value) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); typedef typename make_unsigned::type UChar; - *utf8_iter = (UChar)value; + detail::utf8_put_encode(result, static_cast(value)); return result; } template inline utf8_string to_utf8(Char const* str) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); typedef typename make_unsigned::type UChar; while (*str) - *utf8_iter++ = (UChar)*str++; + detail::utf8_put_encode(result, static_cast(*str++)); return result; } template - inline utf8_string + inline utf8_string to_utf8(std::basic_string const& str) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); typedef typename make_unsigned::type UChar; for (Char const* ptr = str.data(), * end = ptr + str.size(); ptr < end; ++ptr) - { - *utf8_iter++ = (UChar)*ptr; - } + detail::utf8_put_encode(result, static_cast(*ptr)); return result; } @@ -74,28 +89,37 @@ namespace boost { namespace spirit inline utf8_string to_utf8(wchar_t value) { 
utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - - u16_to_u32_iterator ucs4_iter(&value); - *utf8_iter++ = *ucs4_iter; - + detail::utf8_put_encode(result, static_cast::type>(value)); return result; } +namespace detail { + inline ucs4_char decode_utf16(wchar_t const*& s) + { + typedef make_unsigned<wchar_t>::type uwchar_t; + // Decodes one code point at s; on return s is on the last unit consumed. + uwchar_t x(*s); + if (x < 0xD800ul || x > 0xDFFFul) + return x; + + // expected high-surrogate, a stray low-surrogate becomes U+FFFD + if (BOOST_UNLIKELY((x >> 10) != 0x36ul)) + return 0xFFFDul; + + uwchar_t y(s[1]); + // expected low-surrogate, else keep y (maybe the terminator) for the caller + if (BOOST_UNLIKELY((y >> 10) != 0x37ul)) + return 0xFFFDul; + ++s; // well-formed pair, consume its low-surrogate as well + return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul; + } +} + inline utf8_string to_utf8(wchar_t const* str) { utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - - u16_to_u32_iterator ucs4_iter(str); - for (ucs4_char c; (c = *ucs4_iter) != ucs4_char(); ++ucs4_iter) { - *utf8_iter++ = c; - } - + for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str) + detail::utf8_put_encode(result, c); return result; } diff --git a/include/boost/spirit/home/x3/support/utility/utf8.hpp b/include/boost/spirit/home/x3/support/utility/utf8.hpp index 94aa401ee..2b0d6dcfc 100644 --- a/include/boost/spirit/home/x3/support/utility/utf8.hpp +++ b/include/boost/spirit/home/x3/support/utility/utf8.hpp @@ -1,5 +1,6 @@ /*============================================================================= Copyright (c) 2001-2014 Joel de Guzman + Copyright (c) 2023 Nikita Kniazev Distributed under the Boost Software License, Version 1.0.
(See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -7,43 +8,62 @@ #if !defined(BOOST_SPIRIT_X3_UC_TYPES_NOVEMBER_23_2008_0840PM) #define BOOST_SPIRIT_X3_UC_TYPES_NOVEMBER_23_2008_0840PM -#include -#include -#include -#include +#include +#include #include namespace boost { namespace spirit { namespace x3 { - typedef ::boost::uint32_t ucs4_char; + typedef char32_t ucs4_char; typedef char utf8_char; typedef std::basic_string ucs4_string; typedef std::basic_string utf8_string; +namespace detail { + inline void utf8_put_encode(utf8_string& out, ucs4_char x) + { + // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90 + if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul))) + x = 0xFFFDul; + + // Table 3-6. UTF-8 Bit Distribution + if (x < 0x80ul) { + out.push_back(static_cast(x)); + } + else if (x < 0x800ul) { + out.push_back(static_cast(0xC0ul + (x >> 6))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + else if (x < 0x10000ul) { + out.push_back(static_cast(0xE0ul + (x >> 12))); + out.push_back(static_cast(0x80ul + ((x >> 6) & 0x3Ful))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + else { + out.push_back(static_cast(0xF0ul + (x >> 18))); + out.push_back(static_cast(0x80ul + ((x >> 12) & 0x3Ful))); + out.push_back(static_cast(0x80ul + ((x >> 6) & 0x3Ful))); + out.push_back(static_cast(0x80ul + (x & 0x3Ful))); + } + } +} + template inline utf8_string to_utf8(Char value) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - typedef typename make_unsigned::type UChar; - *utf8_iter = (UChar)value; + typedef typename std::make_unsigned::type UChar; + detail::utf8_put_encode(result, static_cast(value)); return result; } template inline utf8_string to_utf8(Char const* str) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - 
insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - typedef typename make_unsigned::type UChar; + typedef typename std::make_unsigned::type UChar; while (*str) - *utf8_iter++ = (UChar)*str++; + detail::utf8_put_encode(result, static_cast(*str++)); return result; } @@ -51,16 +71,10 @@ namespace boost { namespace spirit { namespace x3 inline utf8_string to_utf8(std::basic_string const& str) { - // always store as UTF8 utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - typedef typename make_unsigned::type UChar; + typedef typename std::make_unsigned::type UChar; for (Char ch : str) - { - *utf8_iter++ = (UChar)ch; - } + detail::utf8_put_encode(result, static_cast(ch)); return result; } @@ -69,28 +83,37 @@ namespace boost { namespace spirit { namespace x3 inline utf8_string to_utf8(wchar_t value) { utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - - u16_to_u32_iterator ucs4_iter(&value); - *utf8_iter++ = *ucs4_iter; - + detail::utf8_put_encode(result, static_cast::type>(value)); return result; } +namespace detail { + inline ucs4_char decode_utf16(wchar_t const*& s) + { + typedef std::make_unsigned<wchar_t>::type uwchar_t; + // Decodes one code point at s; on return s is on the last unit consumed. + uwchar_t x(*s); + if (x < 0xD800ul || x > 0xDFFFul) + return x; + + // expected high-surrogate, a stray low-surrogate becomes U+FFFD + if (BOOST_UNLIKELY((x >> 10) != 0b110110ul)) + return 0xFFFDul; + + uwchar_t y(s[1]); + // expected low-surrogate, else keep y (maybe the terminator) for the caller + if (BOOST_UNLIKELY((y >> 10) != 0b110111ul)) + return 0xFFFDul; + ++s; // well-formed pair, consume its low-surrogate as well + return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul; + } +} + inline utf8_string to_utf8(wchar_t const* str) { utf8_string result; - typedef std::back_insert_iterator insert_iter; - insert_iter out_iter(result); - utf8_output_iterator utf8_iter(out_iter); - - u16_to_u32_iterator ucs4_iter(str); - for (ucs4_char c; (c = *ucs4_iter) != ucs4_char(); ++ucs4_iter) { -
*utf8_iter++ = c; - } - + for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str) + detail::utf8_put_encode(result, c); return result; }