From 839c4311526f2363b3ee30d309c21ed65403e42a Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 02:34:16 +0300 Subject: [PATCH 1/6] Added x86 SIMD implementation of to_chars. Moved the generic to_chars implementation to a separate header and made to_chars.hpp select the implementation based on the enabled SIMD ISA extensions. Added an x86 implementation leveraging SSSE3 and later vector extensions. Added detection of the said extensions to config.hpp. The performance effect on Intel Golden Cove (Core i7-12700K), gcc 13.3, in millions of to_chars() calls per second with a 16-byte aligned output buffer: Char | Generic | SSE4.1 | AVX2 | AVX-512 =========+=========+==================+==================+================= char | 203.190 | 1059.322 (5.21x) | 1053.352 (5.18x) | 1058.089 (5.21x) char16_t | 184.003 | 848.356 (4.61x) | 1009.489 (5.49x) | 1011.122 (5.50x) char32_t | 202.425 | 484.801 (2.39x) | 676.338 (3.34x) | 462.770 (2.29x) The core of the SIMD implementation is using 128-bit vectors, larger vectors are only used to convert to the target character types. This means that for 1-byte character types all vector implementations are basically the same (barring the extra ISA flexibility added by AVX) and for 2-byte character types AVX2 and AVX-512 are basically the same. For 4-byte character types, AVX-512 showed worse performance than SSE4.1 and AVX2 on the test system. It isn't clear why that is, but it is possible that the CPU throttles 512-bit instructions so much that the performance drops below a 256-bit equivalent. Perhaps, there are just not enough 512-bit instructions for the CPU to power up the full 512-bit pipeline. Therefore, the AVX-512 code path for 4-byte character types is currently disabled and the AVX2 path is used instead (which makes AVX2 and AVX-512 versions basically equivalent). The AVX-512 path can be enabled again if new CPU microarchitectures appear that will benefit from it. 
Higher alignment values of the output buffer were also tested, but they did not meaningfully improve performance. --- include/boost/uuid/detail/config.hpp | 26 +- include/boost/uuid/detail/to_chars.hpp | 62 ++--- .../boost/uuid/detail/to_chars_generic.hpp | 74 +++++ include/boost/uuid/detail/to_chars_x86.hpp | 261 ++++++++++++++++++ test/test_to_chars_cx.cpp | 37 +-- 5 files changed, 397 insertions(+), 63 deletions(-) create mode 100644 include/boost/uuid/detail/to_chars_generic.hpp create mode 100644 include/boost/uuid/detail/to_chars_x86.hpp diff --git a/include/boost/uuid/detail/config.hpp b/include/boost/uuid/detail/config.hpp index 3710f42..902d8c0 100644 --- a/include/boost/uuid/detail/config.hpp +++ b/include/boost/uuid/detail/config.hpp @@ -32,6 +32,10 @@ #define BOOST_UUID_USE_SSE3 #endif +#if defined(__SSSE3__) && !defined(BOOST_UUID_USE_SSSE3) +#define BOOST_UUID_USE_SSSE3 +#endif + #if defined(__SSE4_1__) && !defined(BOOST_UUID_USE_SSE41) #define BOOST_UUID_USE_SSE41 #endif @@ -40,6 +44,10 @@ #define BOOST_UUID_USE_AVX #endif +#if defined(__AVX2__) && !defined(BOOST_UUID_USE_AVX2) +#define BOOST_UUID_USE_AVX2 +#endif + #if ((defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) #define BOOST_UUID_USE_AVX10_1 #endif @@ -54,6 +62,10 @@ #define BOOST_UUID_USE_AVX #endif +#if defined(__AVX2__) && !defined(BOOST_UUID_USE_AVX2) +#define BOOST_UUID_USE_AVX2 +#endif + #if ((defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) #define BOOST_UUID_USE_AVX10_1 #endif @@ -61,7 +73,11 @@ #endif // More advanced ISA extensions imply less advanced are also available -#if !defined(BOOST_UUID_USE_AVX) && defined(BOOST_UUID_USE_AVX10_1) +#if !defined(BOOST_UUID_USE_AVX2) && defined(BOOST_UUID_USE_AVX10_1) +#define BOOST_UUID_USE_AVX2 +#endif + +#if !defined(BOOST_UUID_USE_AVX) && defined(BOOST_UUID_USE_AVX2) #define 
BOOST_UUID_USE_AVX #endif @@ -69,7 +85,11 @@ #define BOOST_UUID_USE_SSE41 #endif -#if !defined(BOOST_UUID_USE_SSE3) && defined(BOOST_UUID_USE_SSE41) +#if !defined(BOOST_UUID_USE_SSSE3) && defined(BOOST_UUID_USE_SSE41) +#define BOOST_UUID_USE_SSSE3 +#endif + +#if !defined(BOOST_UUID_USE_SSE3) && defined(BOOST_UUID_USE_SSSE3) #define BOOST_UUID_USE_SSE3 #endif @@ -79,8 +99,10 @@ #if !defined(BOOST_UUID_NO_SIMD) && \ !defined(BOOST_UUID_USE_AVX10_1) && \ + !defined(BOOST_UUID_USE_AVX2) && \ !defined(BOOST_UUID_USE_AVX) && \ !defined(BOOST_UUID_USE_SSE41) && \ + !defined(BOOST_UUID_USE_SSSE3) && \ !defined(BOOST_UUID_USE_SSE3) && \ !defined(BOOST_UUID_USE_SSE2) #define BOOST_UUID_NO_SIMD diff --git a/include/boost/uuid/detail/to_chars.hpp b/include/boost/uuid/detail/to_chars.hpp index 061acfd..a5f78f1 100644 --- a/include/boost/uuid/detail/to_chars.hpp +++ b/include/boost/uuid/detail/to_chars.hpp @@ -7,59 +7,31 @@ // https://www.boost.org/LICENSE_1_0.txt #include -#include +#include +#include +#include +#if defined(BOOST_UUID_USE_SSSE3) +#include +#endif namespace boost { namespace uuids { namespace detail { -constexpr char const* to_chars_digits( char const* ) noexcept +template BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const& u, Ch* out ) noexcept { - return "0123456789abcdef-"; -} - -constexpr wchar_t const* to_chars_digits( wchar_t const* ) noexcept -{ - return L"0123456789abcdef-"; -} - -constexpr char16_t const* to_chars_digits( char16_t const* ) noexcept -{ - return u"0123456789abcdef-"; -} - -constexpr char32_t const* to_chars_digits( char32_t const* ) noexcept -{ - return U"0123456789abcdef-"; -} - -#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L - -constexpr char8_t const* to_chars_digits( char8_t const* ) noexcept -{ - return u8"0123456789abcdef-"; -} - -#endif - -template BOOST_CXX14_CONSTEXPR inline Ch* to_chars( uuid const& u, Ch* out ) noexcept -{ - constexpr Ch const* digits = to_chars_digits( static_cast( nullptr ) ); - - for( 
std::size_t i = 0; i < 16; ++i ) +#if defined(BOOST_UUID_USE_SSSE3) + if( detail::is_constant_evaluated_rt() ) { - std::uint8_t ch = u.data()[ i ]; - - *out++ = digits[ (ch >> 4) & 0x0F ]; - *out++ = digits[ ch & 0x0F ]; - - if( i == 3 || i == 5 || i == 7 || i == 9 ) - { - *out++ = digits[ 16 ]; - } + return detail::to_chars_generic( u, out ); } - - return out; + else + { + return detail::to_chars_simd( u, out ); + } +#else + return detail::to_chars_generic( u, out ); +#endif } } // namespace detail diff --git a/include/boost/uuid/detail/to_chars_generic.hpp b/include/boost/uuid/detail/to_chars_generic.hpp new file mode 100644 index 0000000..9601752 --- /dev/null +++ b/include/boost/uuid/detail/to_chars_generic.hpp @@ -0,0 +1,74 @@ +#ifndef BOOST_UUID_DETAIL_TO_CHARS_GENERIC_HPP_INCLUDED +#define BOOST_UUID_DETAIL_TO_CHARS_GENERIC_HPP_INCLUDED + +// Copyright 2009 Andy Tompkins +// Copyright 2024 Peter Dimov +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#include +#include + +#if defined(BOOST_UUID_REPORT_IMPLEMENTATION) + +#include +BOOST_PRAGMA_MESSAGE( "Using to_chars_generic.hpp" ) + +#endif + +namespace boost { +namespace uuids { +namespace detail { + +constexpr char const* to_chars_digits( char const* ) noexcept +{ + return "0123456789abcdef-"; +} + +constexpr wchar_t const* to_chars_digits( wchar_t const* ) noexcept +{ + return L"0123456789abcdef-"; +} + +constexpr char16_t const* to_chars_digits( char16_t const* ) noexcept +{ + return u"0123456789abcdef-"; +} + +constexpr char32_t const* to_chars_digits( char32_t const* ) noexcept +{ + return U"0123456789abcdef-"; +} + +#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L + +constexpr char8_t const* to_chars_digits( char8_t const* ) noexcept +{ + return u8"0123456789abcdef-"; +} + +#endif + +template BOOST_CXX14_CONSTEXPR inline Ch* to_chars_generic( uuid const& u, Ch* out ) noexcept +{ + constexpr Ch const* digits = to_chars_digits( 
static_cast( nullptr ) ); + + for( std::size_t i = 0; i < 16; ++i ) + { + std::uint8_t ch = u.data()[ i ]; + + *out++ = digits[ (ch >> 4) & 0x0F ]; + *out++ = digits[ ch & 0x0F ]; + + if( i == 3 || i == 5 || i == 7 || i == 9 ) + { + *out++ = digits[ 16 ]; + } + } + + return out; +} + +}}} //namespace boost::uuids::detail + +#endif // BOOST_UUID_DETAIL_TO_CHARS_GENERIC_HPP_INCLUDED diff --git a/include/boost/uuid/detail/to_chars_x86.hpp b/include/boost/uuid/detail/to_chars_x86.hpp new file mode 100644 index 0000000..246e24d --- /dev/null +++ b/include/boost/uuid/detail/to_chars_x86.hpp @@ -0,0 +1,261 @@ +#ifndef BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED +#define BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED + +// Copyright 2025 Andrey Semashev +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#include + +#if defined(BOOST_UUID_USE_SSSE3) + +#include +#include + +#if defined(BOOST_UUID_REPORT_IMPLEMENTATION) +#include + +#if defined(BOOST_UUID_USE_AVX10_1) +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX10.1" ) + +#elif defined(BOOST_UUID_USE_AVX2) +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX2" ) + +#elif defined(BOOST_UUID_USE_SSE41) +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE4.1" ) + +#else +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" ) + +#endif +#endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) + +#if defined(BOOST_UUID_USE_AVX2) +#include +#elif defined(BOOST_UUID_USE_SSE41) +#include +#else +#include +#endif + +namespace boost { +namespace uuids { +namespace detail { + +template< + typename Char, + bool IsCharASCIICompatible = ('0' == 0x30 && '9' == 0x39 && 'a' == 0x61 && 'f' == 0x66 && '-' == 0x2D), + bool IsWCharASCIICompatible = (L'0' == 0x30 && L'9' == 0x39 && L'a' == 0x61 && L'f' == 0x66 && L'-' == 0x2D) +> +struct to_chars_simd_char_constants +{ + alignas(16) static const std::uint8_t mm_char_table[16]; + alignas(16) static const std::uint8_t mm_char_dash[16]; +}; 
+ +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_table[16] = + { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 }; // 0123456789abcdef in ASCII +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_dash[16] = + { 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D }; // ---------------- in ASCII + +template< bool IsWCharASCIICompatible > +struct to_chars_simd_char_constants< char, false, IsWCharASCIICompatible > +{ + // This requirement is necessary for the _mm_max_epu8 trick in to_chars_simd_core below to work + static_assert(static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('0') && static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('a'), + "Boost.UUID: Unsupported char encoding, '-' character code is expected to be less than any hexadecimal characters"); + + alignas(16) static const std::uint8_t mm_char_table[16]; + alignas(16) static const std::uint8_t mm_char_dash[16]; +}; + +template< bool IsWCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_table[16] = +{ + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('1'), static_cast< std::uint8_t >('2'), static_cast< std::uint8_t >('3'), + static_cast< std::uint8_t >('4'), static_cast< std::uint8_t >('5'), static_cast< std::uint8_t >('6'), static_cast< std::uint8_t >('7'), + static_cast< std::uint8_t >('8'), static_cast< std::uint8_t >('9'), static_cast< std::uint8_t >('a'), static_cast< std::uint8_t >('b'), + static_cast< std::uint8_t >('c'), static_cast< std::uint8_t >('d'), static_cast< std::uint8_t 
>('e'), static_cast< std::uint8_t >('f') +}; +template< bool IsWCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_dash[16] = +{ + static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), + static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), + static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), + static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-'), static_cast< std::uint8_t >('-') +}; + +template< bool IsCharASCIICompatible > +struct to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > +{ + static_assert(static_cast< wchar_t >(static_cast< std::uint8_t >(L'0')) == L'0' && static_cast< wchar_t >(static_cast< std::uint8_t >(L'9')) == L'9' && + static_cast< wchar_t >(static_cast< std::uint8_t >(L'a')) == L'a' && static_cast< wchar_t >(static_cast< std::uint8_t >(L'f')) == L'f' && + static_cast< wchar_t >(static_cast< std::uint8_t >(L'-')) == L'-', + "Boost.UUID: Unsupported wchar_t encoding, hexadecimal and dash character codes are expected to be representable by a single byte"); + + // This requirement is necessary for the _mm_max_epu8 trick in to_chars_simd_core below to work + static_assert(static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'0') && static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'a'), + "Boost.UUID: Unsupported wchar_t encoding, L'-' character code is expected to be less than any hexadecimal characters"); + + alignas(16) static const std::uint8_t mm_char_table[16]; + alignas(16) static const std::uint8_t mm_char_dash[16]; +}; + +template< bool IsCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< 
wchar_t, IsCharASCIICompatible, false >::mm_char_table[16] = +{ + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'1'), static_cast< std::uint8_t >(L'2'), static_cast< std::uint8_t >(L'3'), + static_cast< std::uint8_t >(L'4'), static_cast< std::uint8_t >(L'5'), static_cast< std::uint8_t >(L'6'), static_cast< std::uint8_t >(L'7'), + static_cast< std::uint8_t >(L'8'), static_cast< std::uint8_t >(L'9'), static_cast< std::uint8_t >(L'a'), static_cast< std::uint8_t >(L'b'), + static_cast< std::uint8_t >(L'c'), static_cast< std::uint8_t >(L'd'), static_cast< std::uint8_t >(L'e'), static_cast< std::uint8_t >(L'f') +}; +template< bool IsCharASCIICompatible > +alignas(16) const std::uint8_t to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_dash[16] = +{ + static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), + static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), + static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), + static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-'), static_cast< std::uint8_t >(L'-') +}; + +template< typename > +struct to_chars_simd_constants +{ + alignas(16) static const std::uint8_t mm_15[16]; + alignas(16) static const std::uint8_t mm_shuffle_pattern1[16]; + alignas(16) static const std::uint8_t mm_shuffle_pattern2[16]; +}; + +template< typename T > +alignas(16) const std::uint8_t to_chars_simd_constants< T >::mm_15[16] = + { 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }; +template< typename T > +alignas(16) const std::uint8_t to_chars_simd_constants< T >::mm_shuffle_pattern1[16] = + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x80, 0x08, 0x09, 
0x0A, 0x0B, 0x80, 0x0C, 0x0D }; +template< typename T > +alignas(16) const std::uint8_t to_chars_simd_constants< T >::mm_shuffle_pattern2[16] = + { 0x00, 0x01, 0x80, 0x02, 0x03, 0x04, 0x05, 0x80, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D }; + +//! Converts UUID to a string of 36 characters, where the first 32 characters are returned in mm_chars1 and mm_chars2 and the last 4 in the highest 32 bits of mm_chars3 +BOOST_FORCEINLINE void to_chars_simd_core +( + const std::uint8_t* data, + __m128i const& mm_char_table, __m128i const& mm_char_dash, + __m128i& mm_chars1, __m128i& mm_chars2, __m128i& mm_chars3 +) noexcept +{ + __m128i const& mm_15 = *reinterpret_cast< const __m128i* >(uuids::detail::to_chars_simd_constants< void >::mm_15); + __m128i const& mm_shuffle_pattern1 = *reinterpret_cast< const __m128i* >(uuids::detail::to_chars_simd_constants< void >::mm_shuffle_pattern1); + __m128i const& mm_shuffle_pattern2 = *reinterpret_cast< const __m128i* >(uuids::detail::to_chars_simd_constants< void >::mm_shuffle_pattern2); + + __m128i mm_input = uuids::detail::load_unaligned_si128(data); + + // Split half-bytes + __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), mm_15); + __m128i mm_input_lo = _mm_and_si128(mm_input, mm_15); + + // Stringize each of the halves + mm_input_hi = _mm_shuffle_epi8(mm_char_table, mm_input_hi); + mm_input_lo = _mm_shuffle_epi8(mm_char_table, mm_input_lo); + + // Join them back together + __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo); + __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo); + + // Insert dashes at positions 8, 13, 18 and 23 + // mm_1 mm_2 + // |0123456789abcdef|0123456789abcdef| + // |01234567-89ab-cd|ef-0123-456789ab| + // + // Note that the last "cdef" characters are already available at the end of mm_2 + mm_chars1 = _mm_shuffle_epi8(mm_1, mm_shuffle_pattern1); + mm_chars2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 14), mm_shuffle_pattern2); + + mm_chars1 = _mm_max_epu8(mm_chars1,
mm_char_dash); + mm_chars2 = _mm_max_epu8(mm_chars2, mm_char_dash); + mm_chars3 = mm_2; +} + +template< typename Char > +BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept +{ + __m128i mm_chars1, mm_chars2, mm_chars3; + uuids::detail::to_chars_simd_core + ( + u.data, + *reinterpret_cast< const __m128i* >(uuids::detail::to_chars_simd_char_constants< Char >::mm_char_table), + *reinterpret_cast< const __m128i* >(uuids::detail::to_chars_simd_char_constants< Char >::mm_char_dash), + mm_chars1, mm_chars2, mm_chars3 + ); + + static_assert(sizeof(Char) == 1u || sizeof(Char) == 2u || sizeof(Char) == 4u, "Boost.UUID: Unsupported output character type for to_chars"); + BOOST_IF_CONSTEXPR (sizeof(Char) == 1u) + { + _mm_storeu_si128(reinterpret_cast< __m128i* >(out), mm_chars1); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), mm_chars2); +#if defined(BOOST_UUID_USE_SSE41) + *reinterpret_cast< BOOST_MAY_ALIAS std::uint32_t* >(out + 32) = _mm_extract_epi32(mm_chars3, 3); +#else + *reinterpret_cast< BOOST_MAY_ALIAS std::uint32_t* >(out + 32) = _mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12)); +#endif + } + else BOOST_IF_CONSTEXPR (sizeof(Char) == 2u) + { + const __m128i mm_0 = _mm_setzero_si128(); +#if defined(BOOST_UUID_USE_AVX2) + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out), _mm256_cvtepu8_epi16(mm_chars1)); + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out + 16), _mm256_cvtepu8_epi16(mm_chars2)); +#else + _mm_storeu_si128(reinterpret_cast< __m128i* >(out), _mm_unpacklo_epi8(mm_chars1, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 8), _mm_unpackhi_epi8(mm_chars1, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), _mm_unpacklo_epi8(mm_chars2, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 24), _mm_unpackhi_epi8(mm_chars2, mm_0)); +#endif +#if defined(BOOST_UUID_USE_SSE41) && (defined(__x86_64__) || defined(_M_X64)) + *reinterpret_cast< BOOST_MAY_ALIAS std::uint64_t* >(out + 32) = 
_mm_extract_epi64(_mm_unpackhi_epi8(mm_chars3, mm_0), 1); +#else + _mm_storeh_pd(reinterpret_cast< BOOST_MAY_ALIAS double* >(out + 32), _mm_castsi128_pd(_mm_unpackhi_epi8(mm_chars3, mm_0))); +#endif + } + else + { + const __m128i mm_0 = _mm_setzero_si128(); +#if 0 && defined(BOOST_UUID_USE_AVX10_1) + // Slower than the AVX2 version below on Intel Golden Cove. Perhaps, it will become beneficial on newer microarchitectures. + _mm512_storeu_epi32(out, _mm512_cvtepu8_epi32(mm_chars1)); + _mm512_storeu_epi32(out + 16, _mm512_cvtepu8_epi32(mm_chars2)); +#elif defined(BOOST_UUID_USE_AVX2) + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out), _mm256_cvtepu8_epi32(mm_chars1)); + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out + 8), _mm256_cvtepu8_epi32(_mm_unpackhi_epi64(mm_chars1, mm_chars1))); + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out + 16), _mm256_cvtepu8_epi32(mm_chars2)); + _mm256_storeu_si256(reinterpret_cast< __m256i* >(out + 24), _mm256_cvtepu8_epi32(_mm_unpackhi_epi64(mm_chars2, mm_chars2))); +#else + __m128i mm = _mm_unpacklo_epi8(mm_chars1, mm_0); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out), _mm_unpacklo_epi16(mm, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 4), _mm_unpackhi_epi16(mm, mm_0)); + mm = _mm_unpackhi_epi8(mm_chars1, mm_0); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 8), _mm_unpacklo_epi16(mm, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 12), _mm_unpackhi_epi16(mm, mm_0)); + mm = _mm_unpacklo_epi8(mm_chars2, mm_0); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), _mm_unpacklo_epi16(mm, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 20), _mm_unpackhi_epi16(mm, mm_0)); + mm = _mm_unpackhi_epi8(mm_chars2, mm_0); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 24), _mm_unpacklo_epi16(mm, mm_0)); + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 28), _mm_unpackhi_epi16(mm, mm_0)); +#endif + _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 32), 
_mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0)); + } + + return out + 36; +} + +} // namespace detail +} // namespace uuids +} // namespace boost + +#endif // defined(BOOST_UUID_USE_SSSE3) + +#endif // BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED diff --git a/test/test_to_chars_cx.cpp b/test/test_to_chars_cx.cpp index 862c192..7bb5ec7 100644 --- a/test/test_to_chars_cx.cpp +++ b/test/test_to_chars_cx.cpp @@ -5,14 +5,19 @@ #include #include #include +#include #include #include #include +#if defined(BOOST_UUID_NO_CXX14_CONSTEXPR_RT) +BOOST_PRAGMA_MESSAGE( "Test is not constexpr because BOOST_UUID_NO_CXX14_CONSTEXPR_RT is defined" ) +#endif + using namespace boost::uuids; template -BOOST_CXX14_CONSTEXPR boost::array uuid_to_string( uuid const& u ) +BOOST_UUID_CXX14_CONSTEXPR_RT boost::array uuid_to_string( uuid const& u ) { boost::array r = {{}}; to_chars( u, r.begin(), r.end() ); @@ -25,7 +30,7 @@ int main() BOOST_CXX14_CONSTEXPR uuid u; { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::string const w( "00000000-0000-0000-0000-000000000000" ); @@ -33,7 +38,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::wstring const w( L"00000000-0000-0000-0000-000000000000" ); @@ -41,7 +46,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u16string const w( u"00000000-0000-0000-0000-000000000000" ); @@ -49,7 +54,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u32string const w( U"00000000-0000-0000-0000-000000000000" ); @@ -59,7 +64,7 @@ int main() #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); 
std::u8string const w( u8"00000000-0000-0000-0000-000000000000" ); @@ -73,7 +78,7 @@ int main() BOOST_CXX14_CONSTEXPR uuid u = {{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }}; { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::string const w( "00010203-0405-0607-0809-0a0b0c0d0e0f" ); @@ -81,7 +86,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::wstring const w( L"00010203-0405-0607-0809-0a0b0c0d0e0f" ); @@ -89,7 +94,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u16string const w( u"00010203-0405-0607-0809-0a0b0c0d0e0f" ); @@ -97,7 +102,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u32string const w( U"00010203-0405-0607-0809-0a0b0c0d0e0f" ); @@ -107,7 +112,7 @@ int main() #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u8string const w( u8"00010203-0405-0607-0809-0a0b0c0d0e0f" ); @@ -121,7 +126,7 @@ int main() BOOST_CXX14_CONSTEXPR uuid u = {{ 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef }}; { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::string const w( "12345678-90ab-cdef-1234-567890abcdef" ); @@ -129,7 +134,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::wstring const w( L"12345678-90ab-cdef-1234-567890abcdef" ); @@ -137,7 +142,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + 
BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u16string const w( u"12345678-90ab-cdef-1234-567890abcdef" ); @@ -145,7 +150,7 @@ int main() } { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u32string const w( U"12345678-90ab-cdef-1234-567890abcdef" ); @@ -155,7 +160,7 @@ int main() #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L { - BOOST_CXX14_CONSTEXPR auto v = uuid_to_string( u ); + BOOST_UUID_CXX14_CONSTEXPR_RT auto v = uuid_to_string( u ); std::u8string const w( u8"12345678-90ab-cdef-1234-567890abcdef" ); From 84afcf6372106e6ea3f25c2a9e17b0c58e82a6ea Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 02:44:02 +0300 Subject: [PATCH 2/6] Align buffers for to_chars for better performance of SIMD implementation. --- include/boost/uuid/uuid_io.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/boost/uuid/uuid_io.hpp b/include/boost/uuid/uuid_io.hpp index ebb058b..86186e9 100644 --- a/include/boost/uuid/uuid_io.hpp +++ b/include/boost/uuid/uuid_io.hpp @@ -31,7 +31,7 @@ namespace uuids { template BOOST_CXX14_CONSTEXPR OutputIterator to_chars( uuid const& u, OutputIterator out ) { - char tmp[ 36 ] = {}; + alignas( 16 ) char tmp[ 36 ] = {}; detail::to_chars( u, tmp ); for( std::size_t i = 0; i < 36; ++i ) @@ -79,7 +79,7 @@ BOOST_CXX14_CONSTEXPR inline Ch* to_chars( uuid const& u, Ch (&buffer)[ 36 ] ) n template std::basic_ostream& operator<<( std::basic_ostream& os, uuid const& u ) { - char tmp[ 37 ]; + alignas( 16 ) char tmp[ 37 ]; to_chars( u, tmp ); os << tmp; From 2508c5434e8d93db2b15e60203ee153e2302d123 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 03:29:27 +0300 Subject: [PATCH 3/6] Added running IO and to_chars tests with SIMD disabled. 
--- test/CMakeLists.txt | 12 ++++++++---- test/Jamfile.v2 | 13 +++++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5f2b913..dec4ca7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,10 +29,14 @@ boost_test(TYPE run SOURCES test_comparison.cpp COMPILE_DEFINITIONS BOOST_UUID_N boost_test(TYPE run SOURCES test_include1.cpp test_include2.cpp) -boost_test(TYPE run SOURCES test_io.cpp LINK_LIBRARIES Boost::lexical_cast Boost::predef) -boost_test(TYPE run SOURCES test_io_2.cpp) -boost_test(TYPE run SOURCES test_to_chars.cpp) -boost_test(TYPE run SOURCES test_to_chars_2.cpp) +boost_test(TYPE run SOURCES test_io.cpp LINK_LIBRARIES Boost::lexical_cast Boost::predef COMPILE_DEFINITIONS BOOST_UUID_REPORT_IMPLEMENTATION=1) +boost_test(TYPE run SOURCES test_io.cpp LINK_LIBRARIES Boost::lexical_cast Boost::predef COMPILE_DEFINITIONS BOOST_UUID_NO_SIMD=1 BOOST_UUID_REPORT_IMPLEMENTATION=1 NAME test_io_no_simd) +boost_test(TYPE run SOURCES test_io_2.cpp COMPILE_DEFINITIONS BOOST_UUID_REPORT_IMPLEMENTATION=1) +boost_test(TYPE run SOURCES test_io_2.cpp COMPILE_DEFINITIONS BOOST_UUID_NO_SIMD=1 BOOST_UUID_REPORT_IMPLEMENTATION=1 NAME test_io_2_no_simd) +boost_test(TYPE run SOURCES test_to_chars.cpp COMPILE_DEFINITIONS BOOST_UUID_REPORT_IMPLEMENTATION=1) +boost_test(TYPE run SOURCES test_to_chars.cpp COMPILE_DEFINITIONS BOOST_UUID_NO_SIMD=1 BOOST_UUID_REPORT_IMPLEMENTATION=1 NAME test_to_chars_no_simd) +boost_test(TYPE run SOURCES test_to_chars_2.cpp COMPILE_DEFINITIONS BOOST_UUID_REPORT_IMPLEMENTATION=1) +boost_test(TYPE run SOURCES test_to_chars_2.cpp COMPILE_DEFINITIONS BOOST_UUID_NO_SIMD=1 BOOST_UUID_REPORT_IMPLEMENTATION=1 NAME test_to_chars_2_no_simd) boost_test(TYPE run SOURCES test_uuid_clock.cpp) diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 27ad45c..03b9c74 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -89,11 +89,16 @@ run test_comparison.cpp : : : BOOST_UUID_NO_SIMD 
BOOST_UUID_REPO # test uuid_io.hpp run test_io.cpp - : : : /boost/lexical_cast//boost_lexical_cast /boost/predef//boost_predef -$(WERROR) ; -run test_io_2.cpp ; + : : : /boost/lexical_cast//boost_lexical_cast /boost/predef//boost_predef BOOST_UUID_REPORT_IMPLEMENTATION -$(WERROR) ; +run test_io.cpp + : : : /boost/lexical_cast//boost_lexical_cast /boost/predef//boost_predef BOOST_UUID_NO_SIMD BOOST_UUID_REPORT_IMPLEMENTATION -$(WERROR) : test_io_no_simd ; +run test_io_2.cpp : : : BOOST_UUID_REPORT_IMPLEMENTATION ; +run test_io_2.cpp : : : BOOST_UUID_NO_SIMD BOOST_UUID_REPORT_IMPLEMENTATION : test_io_2_no_simd ; -run test_to_chars.cpp ; -run test_to_chars_2.cpp ; +run test_to_chars.cpp : : : BOOST_UUID_REPORT_IMPLEMENTATION ; +run test_to_chars.cpp : : : BOOST_UUID_NO_SIMD BOOST_UUID_REPORT_IMPLEMENTATION : test_to_chars_no_simd ; +run test_to_chars_2.cpp : : : BOOST_UUID_REPORT_IMPLEMENTATION ; +run test_to_chars_2.cpp : : : BOOST_UUID_NO_SIMD BOOST_UUID_REPORT_IMPLEMENTATION : test_to_chars_2_no_simd ; run test_from_chars.cpp ; run test_from_chars_2.cpp ; From e6fe1c45d9fea7100cc70254776d6284642ba5a3 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 04:07:18 +0300 Subject: [PATCH 4/6] Added GitHub Actions jobs for AVX2-enabled target. 
--- .github/workflows/ci.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9312f0e..f5ccaec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,6 +100,14 @@ jobs: container: ubuntu:24.04 install: g++-13-multilib address-model: 32,64 + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: haswell + cpu-requirements: [ avx2, bmi1, bmi2 ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: gcc-13 cxxstd: "11,14,17,20,2b" instruction-set: skylake-avx512 @@ -230,6 +238,14 @@ jobs: container: ubuntu:24.04 os: ubuntu-latest install: clang-17 + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: haswell + cpu-requirements: [ avx2, bmi1, bmi2 ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang compiler: clang++-17 cxxstd: "11,14,17,20,2b" From ef9c903055ba98e56c52fe83c4c3cc2be528231f Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 12:20:38 +0300 Subject: [PATCH 5/6] Reorder macos parameters in GitHub Actions CI for better readability in web UI. 
--- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5ccaec..b44a238 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -255,14 +255,14 @@ jobs: os: ubuntu-latest install: clang-17 - toolset: clang - cxxstd: "11,14,17,20,2b" os: macos-14 + cxxstd: "11,14,17,20,2b" - toolset: clang - cxxstd: "11,14,17,20,23,2c" os: macos-15 - - toolset: clang cxxstd: "11,14,17,20,23,2c" + - toolset: clang os: macos-26 + cxxstd: "11,14,17,20,23,2c" runs-on: ${{matrix.os}} From 454de03dbd58b753ee49c2664c0c05bb0cd234a4 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Tue, 16 Dec 2025 19:01:29 +0300 Subject: [PATCH 6/6] Updated docs with the new SIMD macros, added a release note for SIMD to_chars. Also clarified the meaning of BOOST_UUID_USE_AVX10_1 in the docs as the previous wording could be taken to mean that it indicates support for a subset of AVX-512 that is supported in Skylake-X. --- doc/uuid/changes.adoc | 1 + doc/uuid/configuration.adoc | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/uuid/changes.adoc b/doc/uuid/changes.adoc index 2bd8123..9674ad7 100644 --- a/doc/uuid/changes.adoc +++ b/doc/uuid/changes.adoc @@ -9,6 +9,7 @@ * Added a `noexcept` `operator()` overload to `string_generator`. * `string_generator` now supports the Unicode character types in addition to `char` and `wchar_t`. * Most `uuid` accessors, operations, and `to_chars` are now `constexpr` when possible (on {cpp}14 and higher and on recent compilers). +* Added a SIMD implementation of `to_chars`, which can offer up to a 5.5x performance improvement in UUID formatting.
== Changes in Boost 1.90.0 diff --git a/doc/uuid/configuration.adoc b/doc/uuid/configuration.adoc index fe36006..42156b5 100644 --- a/doc/uuid/configuration.adoc +++ b/doc/uuid/configuration.adoc @@ -25,15 +25,22 @@ However, there are a few options that can be enabled by defining macros prior to |`BOOST_UUID_USE_SSE3` |If defined, enables optimizations for https://en.wikipedia.org/wiki/SSE3[SSE3] extensions available in x86 processors. +|`BOOST_UUID_USE_SSSE3` +|If defined, enables optimizations for https://en.wikipedia.org/wiki/SSSE3[SSSE3] extensions available in x86 processors. + |`BOOST_UUID_USE_SSE41` |If defined, enables optimizations for https://en.wikipedia.org/wiki/SSE4#SSE4.1[SSE4.1] extensions available in x86 processors. |`BOOST_UUID_USE_AVX` |If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX] extensions available in modern x86 processors. +|`BOOST_UUID_USE_AVX2` +|If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2[AVX2] extensions available in modern x86 processors. + |`BOOST_UUID_USE_AVX10_1` |If defined, enables optimizations for https://en.wikipedia.org/wiki/AVX-512[AVX-512] and https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX10[AVX10.1] extensions available in modern x86 processors. - The library does not require 512-bit vectors and is compatible with CPUs implementing AVX-512F, CD, VL, BW and DQ instruction subsets (i.e. equivalent to Intel Skylake-X). + When defined by the user, this macro indicates support for the full set of instructions defined in AVX10.1. Currently, the library does not require 512-bit vectors and is compatible with CPUs implementing AVX-512F, + CD, VL, BW and DQ instruction subsets (i.e. equivalent to Intel Skylake-X), so it may auto-detect and use AVX-512 even if only those subsets are supported. |===