From 39946fb660d8ac0c10caab837a1cad08e9cee06b Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Fri, 9 Jan 2026 01:44:11 +0300 Subject: [PATCH 1/6] Fix -Wconversion warning in from_chars_x86.hpp. --- include/boost/uuid/detail/from_chars_x86.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/uuid/detail/from_chars_x86.hpp b/include/boost/uuid/detail/from_chars_x86.hpp index 31b5989..6521e0c 100644 --- a/include/boost/uuid/detail/from_chars_x86.hpp +++ b/include/boost/uuid/detail/from_chars_x86.hpp @@ -9,6 +9,7 @@ #if defined(BOOST_UUID_USE_SSE41) +#include #include #include #include @@ -478,7 +479,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 = #pragma GCC diagnostic ignored "-Warray-bounds" #endif -template< typename Char, unsigned int Size = sizeof(Char) > +template< typename Char, std::size_t Size = sizeof(Char) > struct from_chars_simd_load_traits; template< typename Char > From d53a476a7721dfa053690e5ae80105e159b4678f Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Fri, 9 Jan 2026 14:24:42 +0300 Subject: [PATCH 2/6] Silence -Wstrict-aliasing warning in simd_vector.hpp. 
--- include/boost/uuid/detail/simd_vector.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/boost/uuid/detail/simd_vector.hpp b/include/boost/uuid/detail/simd_vector.hpp index 4cee4cf..fde583b 100644 --- a/include/boost/uuid/detail/simd_vector.hpp +++ b/include/boost/uuid/detail/simd_vector.hpp @@ -27,12 +27,22 @@ union simd_vector > BOOST_FORCEINLINE operator Vector () const noexcept { return get< Vector >(); } +#if defined(BOOST_GCC) && (BOOST_GCC >= 40600) +#pragma GCC diagnostic push +// dereferencing type-punned pointer will break strict-aliasing rules +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + template< typename Vector > BOOST_FORCEINLINE typename std::enable_if< sizeof(Vector) <= ByteSize, Vector >::type get() const noexcept { using vector_type = typename std::remove_cv< typename std::remove_reference< Vector >::type >::type; return *reinterpret_cast< const vector_type* >(bytes); } + +#if defined(BOOST_GCC) && (BOOST_GCC >= 40600) +#pragma GCC diagnostic pop +#endif }; template< typename T > From 97dc5c35ba3d15069d97d55ad6bfa094432b436f Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Wed, 7 Jan 2026 04:27:48 +0300 Subject: [PATCH 3/6] Added SSE2 SIMD implementation of to_chars. This adds SSE2 code paths to to_chars_x86.hpp. The performance effect on Intel Golden Cove (Core i7-12700K), gcc 13.3, in millions of to_chars() calls per second with a 16-byte aligned output buffer: Char | Generic | SSE2 | SSE4.1 | AVX2 | AVX10.1 =========+=========+=================+==================+==================+================= char | 202.314 | 564.857 (2.79x) | 1194.772 (5.91x) | 1192.094 (5.89x) | 1191.838 (5.89x) char16_t | 188.532 | 457.281 (2.43x) | 795.798 (4.22x) | 935.016 (4.96x) | 938.368 (4.98x) char32_t | 193.151 | 345.612 (1.79x) | 489.620 (2.53x) | 688.829 (3.57x) | 689.617 (3.57x) Here, Generic column was generated with BOOST_UUID_NO_SIMD defined and SSE2 with -march=x86-64. 
SSE2 support can be useful in cases when users need to be compatible with the base x86-64 ISA. --- include/boost/uuid/detail/to_chars.hpp | 7 +- include/boost/uuid/detail/to_chars_x86.hpp | 185 +++++++++++++++++++-- 2 files changed, 175 insertions(+), 17 deletions(-) diff --git a/include/boost/uuid/detail/to_chars.hpp b/include/boost/uuid/detail/to_chars.hpp index 80f6841..6fcabe1 100644 --- a/include/boost/uuid/detail/to_chars.hpp +++ b/include/boost/uuid/detail/to_chars.hpp @@ -11,7 +11,7 @@ #include #include -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) # include #elif defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -26,7 +26,7 @@ namespace detail { template BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const& u, Ch* out ) noexcept { -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) if( detail::is_constant_evaluated_rt() ) { return detail::to_chars_generic( u, out ); @@ -40,7 +40,6 @@ template BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const #endif } -} // namespace detail -}} //namespace boost::uuids +}}} // namespace boost::uuids::detail #endif // BOOST_UUID_DETAIL_TO_CHARS_HPP_INCLUDED diff --git a/include/boost/uuid/detail/to_chars_x86.hpp b/include/boost/uuid/detail/to_chars_x86.hpp index 818522b..6184b6b 100644 --- a/include/boost/uuid/detail/to_chars_x86.hpp +++ b/include/boost/uuid/detail/to_chars_x86.hpp @@ -7,7 +7,7 @@ #include -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) #include #include @@ -26,9 +26,12 @@ BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX2" ) #elif defined(BOOST_UUID_USE_SSE41) BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE4.1" ) -#else +#elif defined(BOOST_UUID_USE_SSSE3) BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" ) +#else +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE2" ) + #endif #endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -36,8 +39,10 @@ BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" ) #include #elif 
defined(BOOST_UUID_USE_SSE41) #include -#else +#elif defined(BOOST_UUID_USE_SSSE3) #include +#else +#include #endif namespace boost { @@ -51,13 +56,31 @@ template< > struct to_chars_simd_char_constants { +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((0x61 - 10) - 0x30); // ('a' - 10) - '0' in ASCII + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if defined(BOOST_UUID_USE_SSSE3) template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_table = {{ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 }}; // 0123456789abcdef in ASCII +#else +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_0_add = + {{ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30 }}; // 0x30 is '0' in ASCII +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, 
IsWCharASCIICompatible >::mm_char_dash = {{ 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D }}; // ---------------- in ASCII @@ -69,10 +92,17 @@ struct to_chars_simd_char_constants< char, false, IsWCharASCIICompatible > static_assert(static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('0') && static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('a'), "Boost.UUID: Unsupported char encoding, '-' character code is expected to be less than any hexadecimal characters"); +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >(('a' - 10) - '0'); + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if defined(BOOST_UUID_USE_SSSE3) template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_table = {{ @@ -81,6 +111,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, static_cast< std::uint8_t >('8'), static_cast< std::uint8_t >('9'), static_cast< std::uint8_t >('a'), static_cast< std::uint8_t >('b'), static_cast< std::uint8_t >('c'), static_cast< std::uint8_t >('d'), static_cast< std::uint8_t >('e'), static_cast< std::uint8_t >('f') }}; +#else +template< bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_0_add = +{{ + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< 
std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0') +}}; +template< bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_dash = {{ @@ -102,10 +148,17 @@ struct to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > static_assert(static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'0') && static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'a'), "Boost.UUID: Unsupported wchar_t encoding, L'-' character code is expected to be less than any hexadecimal characters"); +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((L'a' - 10) - L'0'); + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if defined(BOOST_UUID_USE_SSSE3) template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_table = {{ @@ -114,6 +167,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCh static_cast< std::uint8_t >(L'8'), static_cast< std::uint8_t >(L'9'), static_cast< std::uint8_t >(L'a'), static_cast< std::uint8_t >(L'b'), static_cast< 
std::uint8_t >(L'c'), static_cast< std::uint8_t >(L'd'), static_cast< std::uint8_t >(L'e'), static_cast< std::uint8_t >(L'f') }}; +#else +template< bool IsCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_0_add = +{{ + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0') +}}; +template< bool IsCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_dash = {{ @@ -127,25 +196,55 @@ template< typename > struct to_chars_simd_constants { static const simd_vector128< std::uint8_t > mm_0F; +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_shuffle_pattern1; static const simd_vector128< std::uint8_t > mm_shuffle_pattern2; +#else + static const simd_vector128< std::uint8_t > mm_9; + static const simd_vector128< std::uint8_t > mm_group1_mask; + static const simd_vector128< std::uint8_t > mm_group2_mask; + static const simd_vector128< std::uint8_t > mm_group3_mask; +#endif }; template< typename T > const 
simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_0F = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }}; +#if defined(BOOST_UUID_USE_SSSE3) template< typename T > const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern1 = {{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x80, 0x0C, 0x0D }}; template< typename T > const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern2 = {{ 0x00, 0x01, 0x80, 0x02, 0x03, 0x04, 0x05, 0x80, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D }}; +#else +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_9 = + {{ 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group1_mask = + {{ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group2_mask = + {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group3_mask = + {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 }}; +#endif -//! Converts UUID to a string of 36 characters, where first 32 craracters are returned in mm_chars1 and mm_chars2 and the last 4 in the highest 32 bits of mm_chars3 +/*! + * Converts UUID to a string of 36 characters, where the first 32 characters are returned in mm_chars1 and mm_chars2. + * When SSSE3 is supported, last 4 characters are returned in the highest 32 bits of mm_chars3, otherwise in the lowest 32 bits. 
+ */ BOOST_FORCEINLINE void to_chars_simd_core ( const std::uint8_t* data, - __m128i const& mm_char_table, __m128i const& mm_char_dash, +#if defined(BOOST_UUID_USE_SSSE3) + __m128i const& mm_char_table, +#else + __m128i const& mm_char_0_add, __m128i const& mm_char_a_add, +#endif + __m128i const& mm_char_dash, __m128i& mm_chars1, __m128i& mm_chars2, __m128i& mm_chars3 ) noexcept { @@ -154,18 +253,31 @@ BOOST_FORCEINLINE void to_chars_simd_core __m128i mm_input = _mm_loadu_si128(reinterpret_cast< const __m128i* >(data)); // Split half-bytes - __m128i const& mm_0F = constants::mm_0F; - __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), mm_0F); - __m128i mm_input_lo = _mm_and_si128(mm_input, mm_0F); + __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), constants::mm_0F); + __m128i mm_input_lo = _mm_and_si128(mm_input, constants::mm_0F); // Stringize each of the halves +#if defined(BOOST_UUID_USE_SSSE3) mm_input_hi = _mm_shuffle_epi8(mm_char_table, mm_input_hi); mm_input_lo = _mm_shuffle_epi8(mm_char_table, mm_input_lo); +#else + { + __m128i mm_add_mask_hi = _mm_cmpgt_epi8(mm_input_hi, constants::mm_9); + __m128i mm_add_mask_lo = _mm_cmpgt_epi8(mm_input_lo, constants::mm_9); + + __m128i mm_add_hi = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_hi, mm_char_a_add)); + __m128i mm_add_lo = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_lo, mm_char_a_add)); + + mm_input_hi = _mm_add_epi8(mm_input_hi, mm_add_hi); + mm_input_lo = _mm_add_epi8(mm_input_lo, mm_add_lo); + } +#endif // Join them back together __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo); __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo); +#if defined(BOOST_UUID_USE_SSSE3) // Insert dashes at positions 8, 13, 18 and 23 // mm_1 mm_2 // |0123456789abcdef|0123456789abcdef| @@ -178,6 +290,32 @@ BOOST_FORCEINLINE void to_chars_simd_core mm_chars1 = _mm_max_epu8(mm_chars1, mm_char_dash); mm_chars2 = _mm_max_epu8(mm_chars2, mm_char_dash); mm_chars3 = mm_2; 
+#else + // Split groups of characters between dashes and shift them into their places + // mm_middle: |89abcdef01234567| + // mm_group1: |Z89abZZZZZZZZZZZ| + // mm_group2: |ZZZZZZcdefZZZZZZ| + // mm_group3: |ZZZZZZZZZZZ0123Z| + __m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_1), _mm_castsi128_pd(mm_2), _MM_SHUFFLE2(0, 1))); + __m128i mm_group1 = _mm_slli_epi64(mm_middle, 8); + __m128i mm_group2 = _mm_slli_si128(mm_middle, 2); + __m128i mm_group3 = _mm_slli_epi64(mm_middle, 24); + mm_group1 = _mm_and_si128(mm_group1, constants::mm_group1_mask); + mm_group2 = _mm_and_si128(mm_group2, constants::mm_group2_mask); + mm_group3 = _mm_and_si128(mm_group3, constants::mm_group3_mask); + + // Merge them back and insert dashes + // mm_middle: |-89ab-cdef-0123-| + mm_middle = _mm_or_si128(_mm_or_si128(mm_group1, mm_group2), mm_group3); + mm_middle = _mm_max_epu8(mm_middle, mm_char_dash); + + // mm_2: |cdef0123456789ab| + mm_2 = _mm_shuffle_epi32(mm_2, _MM_SHUFFLE(2, 1, 0, 3)); + + mm_chars1 = _mm_unpacklo_epi64(mm_1, mm_middle); + mm_chars2 = _mm_unpackhi_epi64(mm_middle, mm_2); + mm_chars3 = mm_2; +#endif } #if defined(BOOST_MSVC) @@ -195,7 +333,12 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept uuids::detail::to_chars_simd_core ( u.data(), +#if defined(BOOST_UUID_USE_SSSE3) char_constants::mm_char_table, +#else + char_constants::mm_char_0_add, + char_constants::mm_char_a_add, +#endif char_constants::mm_char_dash, mm_chars1, mm_chars2, mm_chars3 ); @@ -205,11 +348,17 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept { _mm_storeu_si128(reinterpret_cast< __m128i* >(out), mm_chars1); _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), mm_chars2); + detail::store_native_u32 + ( + out + 32, #if defined(BOOST_UUID_USE_SSE41) - detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3))); + static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3)) +#elif 
defined(BOOST_UUID_USE_SSSE3) + static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12))) #else - detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12)))); + static_cast< std::uint32_t >(_mm_cvtsi128_si32(mm_chars3)) #endif + ); } else BOOST_IF_CONSTEXPR (sizeof(Char) == 2u) { @@ -225,8 +374,10 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept #endif #if defined(BOOST_UUID_USE_SSE41) && (defined(__x86_64__) || defined(_M_X64)) detail::store_native_u64(out + 32, static_cast< std::uint64_t >(_mm_extract_epi64(_mm_unpackhi_epi8(mm_chars3, mm_0), 1))); -#else +#elif defined(BOOST_UUID_USE_SSSE3) _mm_storeh_pd(reinterpret_cast< BOOST_MAY_ALIAS double* >(out + 32), _mm_castsi128_pd(_mm_unpackhi_epi8(mm_chars3, mm_0))); +#else + _mm_storel_epi64(reinterpret_cast< __m128i* >(out + 32), _mm_unpacklo_epi8(mm_chars3, mm_0)); #endif } else @@ -255,7 +406,15 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 24), _mm_unpacklo_epi16(mm, mm_0)); _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 28), _mm_unpackhi_epi16(mm, mm_0)); #endif - _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 32), _mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0)); + _mm_storeu_si128 + ( + reinterpret_cast< __m128i* >(out + 32), +#if defined(BOOST_UUID_USE_SSSE3) + _mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0) +#else + _mm_unpacklo_epi16(_mm_unpacklo_epi8(mm_chars3, mm_0), mm_0) +#endif + ); } return out + 36; @@ -269,6 +428,6 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept } // namespace uuids } // namespace boost -#endif // defined(BOOST_UUID_USE_SSSE3) +#endif // defined(BOOST_UUID_USE_SSE2) #endif // BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED From ba77fe278110836762727040c1ff13641b36a743 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Thu, 8 Jan 2026 
03:10:28 +0300 Subject: [PATCH 4/6] Added SSE2 and SSSE3 SIMD implementations of from_chars. This adds SSE2 and SSSE3 code paths to from_chars_x86.hpp. The performance effect on Intel Golden Cove (Core i7-12700K), gcc 13.3, in millions of successful from_chars() calls per second: Char | Generic | SSE2 | SSSE3 | SSE4.1 | AVX2 | AVX512v1 =========+=========+=================+=================+=================+=================+================ char | 40.475 | 327.791 (8.10x) | 465.857 (11.5x) | 555.346 (13.7x) | 504.648 (12.5x) | 539.700 (13.3x) char16_t | 38.757 | 292.048 (7.54x) | 401.117 (10.3x) | 478.574 (12.3x) | 426.188 (11.0x) | 416.205 (10.7x) char32_t | 50.200 | 150.900 (3.01x) | 204.588 (4.08x) | 389.882 (7.77x) | 359.591 (7.16x) | 349.663 (6.97x) In addition, the workarounds to avoid (v)pblendvb instructions have been extended to Intel Haswell and Broadwell, as these microarchitectures have poor performance with these instructions (including the SSE4.1 pblendvb). Two new experimental control macros were added: BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB and BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB. The former indicates that (v)pblendvb instructions are slow and should be avoided on the target microarchitectures. The latter indicates that (v)pblendvb should be used by the implementation. The latter macro is derived from the former and takes precedence. As before, these macros can be used for experimenting and fine tuning performance for specific target CPUs. By default, BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB is defined for Haswell/Broadwell or if AVX is detected. Lastly, the selection between blend-based and shuffle-based character code conversion in various places has been unified and is now controlled by a single internal macro BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS. 
--- include/boost/uuid/detail/from_chars.hpp | 4 +- include/boost/uuid/detail/from_chars_x86.hpp | 248 +++++++++++++++---- 2 files changed, 195 insertions(+), 57 deletions(-) diff --git a/include/boost/uuid/detail/from_chars.hpp b/include/boost/uuid/detail/from_chars.hpp index d66ec64..2639a72 100644 --- a/include/boost/uuid/detail/from_chars.hpp +++ b/include/boost/uuid/detail/from_chars.hpp @@ -11,7 +11,7 @@ #include #include -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) # include #elif defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -27,7 +27,7 @@ template BOOST_UUID_CXX14_CONSTEXPR_RT inline from_chars_result from_chars( Ch const* first, Ch const* last, uuid& u ) noexcept { -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) if( detail::is_constant_evaluated_rt() ) { return detail::from_chars_generic( first, last, u ); diff --git a/include/boost/uuid/detail/from_chars_x86.hpp b/include/boost/uuid/detail/from_chars_x86.hpp index 6521e0c..c54b6f9 100644 --- a/include/boost/uuid/detail/from_chars_x86.hpp +++ b/include/boost/uuid/detail/from_chars_x86.hpp @@ -7,7 +7,7 @@ #include -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) #include #include @@ -29,22 +29,48 @@ BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX512v1" ) #elif defined(BOOST_UUID_USE_AVX) BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX" ) -#else +#elif defined(BOOST_UUID_USE_SSE41) BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE4.1" ) +#elif defined(BOOST_UUID_USE_SSSE3) +BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSSE3" ) + +#else +BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE2" ) + #endif #endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) #if defined(BOOST_UUID_USE_AVX) #include -#else +#elif defined(BOOST_UUID_USE_SSE41) #include +#elif defined(BOOST_UUID_USE_SSSE3) +#include +#else +#include #endif #if defined(_MSC_VER) && !defined(__clang__) #include #pragma intrinsic(_BitScanForward) #endif +// Unlike the 
legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel Lion Cove, Skymont and older. +// Newer microarchitectures are unknown at the time of this writing. Also, on Intel Haswell/Broadwell, even the SSE4.1 +// pblendvb is slow. +#if !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB) && \ + (defined(__tune_haswell__) || defined(__tune_broadwell__) || defined(BOOST_UUID_USE_AVX)) +#define BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB +#endif + +#if !defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) && defined(BOOST_UUID_USE_SSE41) && !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB) +#define BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB +#endif + +#if defined(BOOST_UUID_USE_AVX512_V1) || (defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB)) || !defined(BOOST_UUID_USE_SSSE3) +#define BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS +#endif + namespace boost { namespace uuids { namespace detail { @@ -110,11 +136,11 @@ struct from_chars_simd_char_constants static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > @@ -147,7 +173,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) 
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_code2_sub = @@ -170,7 +196,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsWCharASCIICompatible > struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible > @@ -225,11 +251,11 @@ struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible > static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; template< bool IsWCharASCIICompatible > @@ -265,7 +291,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_code2_sub = @@ -288,7 +314,7 @@ const simd_vector128< std::uint8_t > 
from_chars_simd_char_constants< char, false char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsCharASCIICompatible > struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > @@ -348,11 +374,11 @@ struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; template< bool IsCharASCIICompatible > @@ -388,7 +414,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_code2_sub = @@ -411,7 +437,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // 
defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< typename > @@ -426,12 +452,18 @@ struct from_chars_simd_constants static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern1; static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern2; static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern3; - static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask; + static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask1; +#if !defined(BOOST_UUID_USE_SSE41) + static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask2; +#endif +#if !defined(BOOST_UUID_USE_SSSE3) + static const simd_vector128< std::uint8_t > mm_split_half_byte_chars_mask; +#endif #endif static const simd_vector128< std::uint8_t > mm_F0; -#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1) +#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_2; #endif }; @@ -457,13 +489,23 @@ template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_pattern3 = {{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x03, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x02 }}; template< typename T > -const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask = +const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask1 = {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF }}; +#if !defined(BOOST_UUID_USE_SSE41) +template< typename T > +const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask2 = + {{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00 }}; +#endif +#if !defined(BOOST_UUID_USE_SSSE3) +template< typename T > +const simd_vector128< std::uint8_t > from_chars_simd_constants< T 
>::mm_split_half_byte_chars_mask = + {{ 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 }}; +#endif #endif template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_F0 = {{ 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0 }}; -#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1) +#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 = {{ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 }}; @@ -594,6 +636,38 @@ struct from_chars_simd_load_traits< Char, 2u > template< typename Char > struct from_chars_simd_load_traits< Char, 4u > { +#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3) + static const simd_vector128< std::uint8_t > mm_deinterleave_epi16_pattern; +#endif + + static BOOST_FORCEINLINE __m128i mm_packus_epi32(__m128i mm1, __m128i mm2) noexcept + { +#if defined(BOOST_UUID_USE_SSE41) + return _mm_packus_epi32(mm1, mm2); +#else // defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSSE3) + mm1 = _mm_shuffle_epi8(mm1, mm_deinterleave_epi16_pattern); + mm2 = _mm_shuffle_epi8(mm2, mm_deinterleave_epi16_pattern); + + __m128i mm_lo = _mm_unpacklo_epi64(mm1, mm2); + __m128i mm_hi = _mm_unpackhi_epi64(mm1, mm2); +#else // defined(BOOST_UUID_USE_SSSE3) + mm1 = _mm_shufflelo_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0)); + mm2 = _mm_shufflelo_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0)); + mm1 = _mm_shufflehi_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0)); + mm2 = _mm_shufflehi_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0)); + + __m128i mm_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0))); + __m128i mm_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1))); +#endif // 
defined(BOOST_UUID_USE_SSSE3) + const __m128i mm_0 = _mm_setzero_si128(); + const __m128i mm_FF = _mm_cmpeq_epi32(mm_0, mm_0); + + __m128i mm_sat = _mm_xor_si128(_mm_cmpeq_epi16(mm_hi, mm_0), mm_FF); + return _mm_or_si128(mm_lo, mm_sat); +#endif // defined(BOOST_UUID_USE_SSE41) + } + static BOOST_FORCEINLINE __m128i load_packed_16(const Char* p) noexcept { #if defined(BOOST_UUID_USE_AVX512_V1) @@ -606,8 +680,8 @@ struct from_chars_simd_load_traits< Char, 4u > return _mm_unpacklo_epi64(mm1, mm2); #endif // defined(BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM) #else - __m128i mm1 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4))); - __m128i mm2 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12))); + __m128i mm1 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4))); + __m128i mm2 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12))); return _mm_packus_epi16(mm1, mm2); #endif } @@ -619,7 +693,7 @@ struct from_chars_simd_load_traits< Char, 4u > #else __m128i mm1 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p)); __m128i mm2 = _mm_setzero_si128(); - return _mm_packus_epi16(_mm_packus_epi32(mm1, mm2), mm2); + return _mm_packus_epi16(mm_packus_epi32(mm1, mm2), mm2); #endif } @@ -668,13 +742,19 @@ struct from_chars_simd_load_traits< Char, 4u > mm_chars1 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p)); mm_chars2 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4)); } - mm_chars1 = _mm_packus_epi32(mm_chars1, mm_chars2); - mm_chars3 = _mm_packus_epi32(mm_chars3, mm_chars4); + mm_chars1 = mm_packus_epi32(mm_chars1, mm_chars2); + mm_chars3 = mm_packus_epi32(mm_chars3, mm_chars4); return _mm_packus_epi16(mm_chars1, mm_chars3); #endif 
} }; +#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3) +template< typename Char > +const simd_vector128< std::uint8_t > from_chars_simd_load_traits< Char, 4u >::mm_deinterleave_epi16_pattern = + {{ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F }}; +#endif + #if defined(BOOST_GCC) && (BOOST_GCC >= 40600) #pragma GCC diagnostic pop #endif @@ -690,7 +770,7 @@ BOOST_FORCEINLINE void from_chars_simd_core __m128i const& mm_expected_dashes, __m128i const& mm_char_code1_cmp, __m128i const& mm_char_code2_cmp, -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) __m128i const& mm_char_code0_sub, __m128i const& mm_char_code1_sub, __m128i const& mm_char_code2_sub, @@ -706,15 +786,24 @@ BOOST_FORCEINLINE void from_chars_simd_core // |01234567-89ab-cd|ef-0123-456789ab|cdefXXXXXXXXXXXX| // // Check if dashes are in the expected positions + // + // mm_middle + // |-89ab-cdef-0123-| + const __m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), _mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1))); { - // mm_dashes - // |-89ab-cdef-0123-| - __m128i mm_dashes = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), _mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1))); - if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_dashes, mm_expected_dashes), constants::mm_dashes_mask))) +#if defined(BOOST_UUID_USE_SSE41) + if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_middle, mm_expected_dashes), constants::mm_dashes_mask))) +#else + __m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask); + std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes))); + if (BOOST_UNLIKELY(dash_mask != 0xFFFF)) +#endif { // Some of the dashes are missing - mm_dashes = _mm_and_si128(mm_dashes, constants::mm_dashes_mask); +#if defined(BOOST_UUID_USE_SSE41) 
+ __m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask); std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes))); +#endif unsigned int pos = detail::countr_zero_nz(~dash_mask) + 8u; if (pos < end_pos) { @@ -733,7 +822,11 @@ BOOST_FORCEINLINE void from_chars_simd_core // mm_chars2: |02468ace13579bdf| mm_chars1 = _mm_permutex2var_epi8(mm_chars1, constants::mm_split_half_bytes_pattern1, mm_chars2); mm_chars2 = _mm_permutex2var_epi8(mm_chars2, constants::mm_split_half_bytes_pattern2, mm_chars3); -#else + + // Group half-byte characters + __m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2); + __m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2); +#elif defined(BOOST_UUID_USE_SSSE3) // mm_chars1: |02468acZ13579bdZ| // mm_chars2: |02468aZe13579bZf| // mm_chars3: |ZZZZZZceZZZZZZdf| @@ -743,20 +836,47 @@ BOOST_FORCEINLINE void from_chars_simd_core // mm_chars1: |02468ace13579bdf| // mm_chars2: |02468ace13579bdf| - // Avoid using vpblendvb, which is slow on Intel #if defined(BOOST_UUID_USE_AVX512_V1) - mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C) -#elif defined(BOOST_UUID_USE_AVX) - mm_chars1 = _mm_or_si128(mm_chars1, _mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask)); + // Avoid using vpblendvb, which is slow on Intel + mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C) +#elif defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) + mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1); #else - mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask); + mm_chars1 = _mm_or_si128(mm_chars1, 
_mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask1)); #endif +#if defined(BOOST_UUID_USE_SSE41) mm_chars2 = _mm_blend_epi16(mm_chars2, mm_chars3, 0x88); +#else + mm_chars2 = _mm_or_si128(_mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask2), mm_chars3); #endif - // Group half-byte digits + // Group half-byte characters __m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2); __m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2); +#else + __m128i mm_lo, mm_hi; + { + // Remove dashes + __m128i mm_group1 = _mm_srli_epi64(mm_middle, 8); + __m128i mm_group2 = _mm_srli_si128(mm_middle, 6); + __m128i mm_group3 = _mm_srli_si128(mm_middle, 11); + + mm_chars1 = _mm_unpacklo_epi64(mm_chars1, _mm_unpacklo_epi32(mm_group1, mm_group2)); + + mm_chars2 = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(mm_chars2), _mm_castsi128_pd(_mm_unpacklo_epi32(mm_group3, mm_chars3)))); + mm_chars2 = _mm_shuffle_epi32(mm_chars2, _MM_SHUFFLE(1, 3, 2, 0)); + + // Deinterleave half-byte characters + __m128i mm_lo1 = _mm_srli_epi16(mm_chars1, 8); + __m128i mm_lo2 = _mm_srli_epi16(mm_chars2, 8); + + __m128i mm_hi1 = _mm_and_si128(mm_chars1, constants::mm_split_half_byte_chars_mask); + __m128i mm_hi2 = _mm_and_si128(mm_chars2, constants::mm_split_half_byte_chars_mask); + + mm_lo = _mm_packus_epi16(mm_lo1, mm_lo2); + mm_hi = _mm_packus_epi16(mm_hi1, mm_hi2); + } +#endif // Convert characters to 8-bit integers. The algorithm is basically as follows: // @@ -779,6 +899,7 @@ BOOST_FORCEINLINE void from_chars_simd_core // Note that there is one caveat due to the fact that there are only signed byte comparisons until AVX-512. This is a problem if the character encoding has // hexadecimal character codes with the highest bit set to 1. This is handled in from_chars_simd_char_constants by constructing mm_char_code1 and // mm_char_code2 in such a way that signed comparisons work as described. We also use signed comparisons in AVX-512 to reuse the same constants. 
+#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) #if defined(BOOST_UUID_USE_AVX512_V1) __mmask16 k_char_code2_mask_lo = _mm_cmpgt_epi8_mask(mm_lo, mm_char_code2_cmp); __mmask16 k_char_code2_mask_hi = _mm_cmpgt_epi8_mask(mm_hi, mm_char_code2_cmp); @@ -791,8 +912,29 @@ BOOST_FORCEINLINE void from_chars_simd_core mm_char_code_sub_lo = _mm_mask_blend_epi8(k_char_code1_mask_lo, mm_char_code0_sub, mm_char_code_sub_lo); mm_char_code_sub_hi = _mm_mask_blend_epi8(k_char_code1_mask_hi, mm_char_code0_sub, mm_char_code_sub_hi); -#elif defined(BOOST_UUID_USE_AVX) - // Unlike the legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel. Use a different approach: +#else + __m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp); + __m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp); + + __m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp); + __m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp); + +#if defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) + __m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo); + __m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi); + + mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo); + mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi); +#else + __m128i mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_lo, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_lo, mm_char_code2_sub)); + __m128i mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_hi, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_hi, mm_char_code2_sub)); + + mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_lo, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_lo, 
mm_char_code_sub_lo)); + mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_hi, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_hi, mm_char_code_sub_hi)); +#endif +#endif +#else // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) + // Use a different approach: // - Each vpcmpgtb produces a mask, where 0 indicates false and -1 - true. // - mm_char_code1_mask_* always overlaps with the corresponding mm_char_code2_mask_*, which means adding them // produces a vector where 0 means none of the vpcmpgtb matched the value, -1 - where mm_char_code1_mask_* matched @@ -815,26 +957,19 @@ BOOST_FORCEINLINE void from_chars_simd_core const __m128i mm_char_code_sub = _mm_cvtsi32_si128(static_cast< int >(char_code_sub)); __m128i mm_char_code_sub_lo = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_lo); __m128i mm_char_code_sub_hi = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_hi); -#else - __m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp); - __m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp); - - __m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp); - __m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp); - - __m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo); - __m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi); - - mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo); - mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi); -#endif +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) mm_lo = _mm_sub_epi8(mm_lo, mm_char_code_sub_lo); mm_hi = _mm_sub_epi8(mm_hi, mm_char_code_sub_hi); // Check hexadecimal character validity. 
Proper hexadecimal characters always convert to values of 0-15 and any other characters convert // to values outside that range. Which means if the upper 4 bits of a resulting integer are non-zero then the corresponding character was invalid. +#if defined(BOOST_UUID_USE_SSE41) if (BOOST_LIKELY(_mm_test_all_zeros(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0))) +#else + const __m128i mm_0 = _mm_setzero_si128(); + if (BOOST_LIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0), mm_0)) == 0xFFFF)) +#endif { if (BOOST_LIKELY(ec == from_chars_error::none)) { @@ -845,12 +980,13 @@ BOOST_FORCEINLINE void from_chars_simd_core else { // Some of the hex digits are invalid +#if defined(BOOST_UUID_USE_SSE41) const __m128i mm_0 = _mm_setzero_si128(); +#endif __m128i mm_hi_bits_lo = _mm_and_si128(mm_lo, constants::mm_F0); __m128i mm_hi_bits_hi = _mm_and_si128(mm_hi, constants::mm_F0); mm_hi_bits_lo = _mm_cmpeq_epi8(mm_hi_bits_lo, mm_0); mm_hi_bits_hi = _mm_cmpeq_epi8(mm_hi_bits_hi, mm_0); - std::uint32_t digits_mask_lo = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_lo)); std::uint32_t digits_mask_hi = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_hi)); @@ -931,7 +1067,7 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c char_constants::mm_expected_dashes, char_constants::mm_char_code1_cmp, char_constants::mm_char_code2_cmp, -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) char_constants::mm_char_code0_sub, char_constants::mm_char_code1_sub, char_constants::mm_char_code2_sub, @@ -948,6 +1084,8 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c } // namespace uuids } // namespace boost -#endif // defined(BOOST_UUID_USE_SSE41) +#undef BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS + +#endif // defined(BOOST_UUID_USE_SSE2) #endif // 
BOOST_UUID_DETAIL_FROM_CHARS_X86_HPP_INCLUDED From b135a5d8164237c1cda63215987db052eb133657 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Fri, 9 Jan 2026 02:58:33 +0300 Subject: [PATCH 5/6] Compile tests without running in CI if the CPU lacks required features. This allows for testing that the ISA-specific code at least compiles, even if running the tests isn't possible. The support is only added to b2, CMake still always compiles and runs the tests to keep using boost_test_jamfile for easier maintenance. In the future, similar support can be added to CMake as well. --- .github/workflows/ci.yml | 4 ++-- test/Jamfile.v2 | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c30af2b..9524ea8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -355,8 +355,8 @@ jobs: if ! [[ "$cpu_flags" =~ $re ]] then echo "CPU lacks required feature: $requirement" - echo "Skipping testing" - exit 0 + echo "Skipping running tests" + export BOOST_UUID_SKIP_RUNNING_TESTS=1 fi done fi diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 1372d80..f2aa304 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -34,6 +34,20 @@ import path ; import regex ; import testing ; +# This rule allows suppressing the running of tests so that they are only compiled instead. +# This is useful e.g. if the tests are compiled for a target ISA that is not supported by the CPU. +local rule run ( sources + : args * : input-files * : requirements * : target-name ? 
: default-build * ) +{ + if [ os.environ BOOST_UUID_SKIP_RUNNING_TESTS ] + { + return [ testing.compile $(sources) : $(requirements) : $(target-name) ] ; + } + else + { + return [ testing.run $(sources) : $(args) : $(input-files) : $(requirements) : $(target-name) : $(default-build) ] ; + } +} + # this rule enumerates through all the headers and ensures # that inclusion of the header by itself is sufficient to # compile successfully, proving the header does not depend From 3af17a423626a3e31d1badde8f3367f9c7783bec Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Fri, 9 Jan 2026 03:10:09 +0300 Subject: [PATCH 6/6] Added GitHub Actions jobs for SSSE3 and AVX targets. The targets verify the respective code paths in SIMD algorithms. The recently added SSE2 paths are already tested in the other, unspecialized jobs. Also added jobs to compile tests with BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM and BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B experimental macros defined. --- .github/workflows/ci.yml | 62 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9524ea8..999ee30 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,14 @@ jobs: os: ubuntu-latest install: g++-15-multilib address-model: 32,64 + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: core2 + cpu-requirements: [ ssse3 ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: gcc-13 cxxstd: "11,14,17,20,2b" instruction-set: nehalem @@ -100,6 +108,14 @@ jobs: container: ubuntu:24.04 install: g++-13-multilib address-model: 32,64 + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: sandy-bridge + cpu-requirements: [ avx ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: gcc-13 cxxstd: "11,14,17,20,2b" instruction-set: haswell @@ -124,6 +140,16 @@ 
jobs: container: ubuntu:24.04 install: g++-13-multilib address-model: 32,64 + # Experimental features + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: rocketlake + cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ] + defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: clang compiler: clang++-3.9 cxxstd: "11,14" @@ -238,6 +264,14 @@ jobs: container: ubuntu:25.10 os: ubuntu-latest install: clang-21 + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: core2 + cpu-requirements: [ ssse3 ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang compiler: clang++-17 cxxstd: "11,14,17,20,2b" @@ -246,6 +280,14 @@ jobs: container: ubuntu:24.04 os: ubuntu-latest install: clang-17 + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: sandy-bridge + cpu-requirements: [ avx ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang compiler: clang++-17 cxxstd: "11,14,17,20,2b" @@ -270,6 +312,16 @@ jobs: container: ubuntu:24.04 os: ubuntu-latest install: clang-17 + # Experimental features + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: rocketlake + cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ] + defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang os: macos-14 cxxstd: "11,14,17,20,2b" @@ -346,6 +398,7 @@ jobs: cd ../boost-root ADDRMD=${{matrix.address-model}} INSTRUCTION_SET=${{matrix.instruction-set}} + b2_args=(-j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} 
${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release) if [ -n "${{matrix.cpu-requirements}}" ] then cpu_flags="$(lscpu | grep -F "Flags:" | sed "s/^Flags:\\s*//")"; @@ -360,7 +413,14 @@ jobs: fi done fi - ./b2 -j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} ${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release + if [ -n "${{matrix.defines}}" ] + then + for define in ${{join(matrix.defines, ' ')}} + do + b2_args+=("define=$define") + done + fi + ./b2 "${b2_args[@]}" windows: strategy: