diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c30af2b..999ee30 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,14 @@ jobs: os: ubuntu-latest install: g++-15-multilib address-model: 32,64 + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: core2 + cpu-requirements: [ ssse3 ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: gcc-13 cxxstd: "11,14,17,20,2b" instruction-set: nehalem @@ -100,6 +108,14 @@ jobs: container: ubuntu:24.04 install: g++-13-multilib address-model: 32,64 + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: sandy-bridge + cpu-requirements: [ avx ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: gcc-13 cxxstd: "11,14,17,20,2b" instruction-set: haswell @@ -124,6 +140,16 @@ jobs: container: ubuntu:24.04 install: g++-13-multilib address-model: 32,64 + # Experimental features + - toolset: gcc-13 + cxxstd: "11,14,17,20,2b" + instruction-set: rocketlake + cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ] + defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ] + os: ubuntu-latest + container: ubuntu:24.04 + install: g++-13-multilib + address-model: 32,64 - toolset: clang compiler: clang++-3.9 cxxstd: "11,14" @@ -238,6 +264,14 @@ jobs: container: ubuntu:25.10 os: ubuntu-latest install: clang-21 + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: core2 + cpu-requirements: [ ssse3 ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang compiler: clang++-17 cxxstd: "11,14,17,20,2b" @@ -246,6 +280,14 @@ jobs: container: ubuntu:24.04 os: ubuntu-latest install: clang-17 + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: sandy-bridge + cpu-requirements: [ 
avx ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang compiler: clang++-17 cxxstd: "11,14,17,20,2b" @@ -270,6 +312,16 @@ jobs: container: ubuntu:24.04 os: ubuntu-latest install: clang-17 + # Experimental features + - toolset: clang + compiler: clang++-17 + cxxstd: "11,14,17,20,2b" + instruction-set: rocketlake + cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ] + defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ] + container: ubuntu:24.04 + os: ubuntu-latest + install: clang-17 - toolset: clang os: macos-14 cxxstd: "11,14,17,20,2b" @@ -346,6 +398,7 @@ jobs: cd ../boost-root ADDRMD=${{matrix.address-model}} INSTRUCTION_SET=${{matrix.instruction-set}} + b2_args=(-j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} ${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release) if [ -n "${{matrix.cpu-requirements}}" ] then cpu_flags="$(lscpu | grep -F "Flags:" | sed "s/^Flags:\\s*//")"; @@ -355,12 +408,19 @@ jobs: if ! 
[[ "$cpu_flags" =~ $re ]] then echo "CPU lacks required feature: $requirement" - echo "Skipping testing" - exit 0 + echo "Skipping running tests" + export BOOST_UUID_SKIP_RUNNING_TESTS=1 fi done fi - ./b2 -j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} ${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release + if [ -n "${{matrix.defines}}" ] + then + for define in ${{join(matrix.defines, ' ')}} + do + b2_args+=("define=$define") + done + fi + ./b2 "${b2_args[@]}" windows: strategy: diff --git a/include/boost/uuid/detail/from_chars.hpp b/include/boost/uuid/detail/from_chars.hpp index d66ec64..2639a72 100644 --- a/include/boost/uuid/detail/from_chars.hpp +++ b/include/boost/uuid/detail/from_chars.hpp @@ -11,7 +11,7 @@ #include #include -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) # include #elif defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -27,7 +27,7 @@ template BOOST_UUID_CXX14_CONSTEXPR_RT inline from_chars_result from_chars( Ch const* first, Ch const* last, uuid& u ) noexcept { -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) if( detail::is_constant_evaluated_rt() ) { return detail::from_chars_generic( first, last, u ); diff --git a/include/boost/uuid/detail/from_chars_x86.hpp b/include/boost/uuid/detail/from_chars_x86.hpp index 31b5989..c54b6f9 100644 --- a/include/boost/uuid/detail/from_chars_x86.hpp +++ b/include/boost/uuid/detail/from_chars_x86.hpp @@ -7,8 +7,9 @@ #include -#if defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSE2) +#include #include #include #include @@ -28,22 +29,48 @@ BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX512v1" ) #elif defined(BOOST_UUID_USE_AVX) BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX" ) -#else +#elif defined(BOOST_UUID_USE_SSE41) BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE4.1" ) +#elif defined(BOOST_UUID_USE_SSSE3) +BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSSE3" 
) + +#else +BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE2" ) + #endif #endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) #if defined(BOOST_UUID_USE_AVX) #include -#else +#elif defined(BOOST_UUID_USE_SSE41) #include +#elif defined(BOOST_UUID_USE_SSSE3) +#include +#else +#include #endif #if defined(_MSC_VER) && !defined(__clang__) #include #pragma intrinsic(_BitScanForward) #endif +// Unlike the legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel Lion Cove, Skymont and older. +// Newer microarchitectures are unknown at the time of this writing. Also, on Intel Haswell/Broadwell, even the SSE4.1 +// pblendvb is slow. +#if !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB) && \ + (defined(__tune_haswell__) || defined(__tune_broadwell__) || defined(BOOST_UUID_USE_AVX)) +#define BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB +#endif + +#if !defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) && defined(BOOST_UUID_USE_SSE41) && !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB) +#define BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB +#endif + +#if defined(BOOST_UUID_USE_AVX512_V1) || (defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB)) || !defined(BOOST_UUID_USE_SSSE3) +#define BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS +#endif + namespace boost { namespace uuids { namespace detail { @@ -109,11 +136,11 @@ struct from_chars_simd_char_constants static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; 
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > @@ -146,7 +173,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_code2_sub = @@ -169,7 +196,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsWCharASCIICompatible > struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible > @@ -224,11 +251,11 @@ struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible > static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; template< bool IsWCharASCIICompatible > @@ -264,7 +291,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false static_cast< std::uint8_t >(char_code1 - 1u), 
static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_code2_sub = @@ -287,7 +314,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsCharASCIICompatible > struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > @@ -347,11 +374,11 @@ struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > static const simd_vector128< std::uint8_t > mm_char_code2_cmp; static const simd_vector128< std::uint8_t > mm_char_code1_cmp; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_char_code2_sub; static const simd_vector128< std::uint8_t > mm_char_code1_sub; static const simd_vector128< std::uint8_t > mm_char_code0_sub; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) }; template< bool IsCharASCIICompatible > @@ -387,7 +414,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u) }}; -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > from_chars_simd_char_constants< 
wchar_t, IsCharASCIICompatible, false >::mm_char_code2_sub = @@ -410,7 +437,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub }}; -#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< typename > @@ -425,12 +452,18 @@ struct from_chars_simd_constants static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern1; static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern2; static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern3; - static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask; + static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask1; +#if !defined(BOOST_UUID_USE_SSE41) + static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask2; +#endif +#if !defined(BOOST_UUID_USE_SSSE3) + static const simd_vector128< std::uint8_t > mm_split_half_byte_chars_mask; +#endif #endif static const simd_vector128< std::uint8_t > mm_F0; -#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1) +#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) static const simd_vector128< std::uint8_t > mm_2; #endif }; @@ -456,13 +489,23 @@ template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_pattern3 = {{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x03, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x02 }}; template< typename T > -const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask = +const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask1 = {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF }}; +#if !defined(BOOST_UUID_USE_SSE41) 
+template< typename T > +const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask2 = + {{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00 }}; +#endif +#if !defined(BOOST_UUID_USE_SSSE3) +template< typename T > +const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_byte_chars_mask = + {{ 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 }}; +#endif #endif template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_F0 = {{ 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0 }}; -#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1) +#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) template< typename T > const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 = {{ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 }}; @@ -478,7 +521,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 = #pragma GCC diagnostic ignored "-Warray-bounds" #endif -template< typename Char, unsigned int Size = sizeof(Char) > +template< typename Char, std::size_t Size = sizeof(Char) > struct from_chars_simd_load_traits; template< typename Char > @@ -593,6 +636,38 @@ struct from_chars_simd_load_traits< Char, 2u > template< typename Char > struct from_chars_simd_load_traits< Char, 4u > { +#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3) + static const simd_vector128< std::uint8_t > mm_deinterleave_epi16_pattern; +#endif + + static BOOST_FORCEINLINE __m128i mm_packus_epi32(__m128i mm1, __m128i mm2) noexcept + { +#if defined(BOOST_UUID_USE_SSE41) + return _mm_packus_epi32(mm1, mm2); +#else // defined(BOOST_UUID_USE_SSE41) +#if defined(BOOST_UUID_USE_SSSE3) + mm1 = _mm_shuffle_epi8(mm1, mm_deinterleave_epi16_pattern); + mm2 = 
_mm_shuffle_epi8(mm2, mm_deinterleave_epi16_pattern); + + __m128i mm_lo = _mm_unpacklo_epi64(mm1, mm2); + __m128i mm_hi = _mm_unpackhi_epi64(mm1, mm2); +#else // defined(BOOST_UUID_USE_SSSE3) + mm1 = _mm_shufflelo_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0)); + mm2 = _mm_shufflelo_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0)); + mm1 = _mm_shufflehi_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0)); + mm2 = _mm_shufflehi_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0)); + + __m128i mm_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0))); + __m128i mm_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1))); +#endif // defined(BOOST_UUID_USE_SSSE3) + const __m128i mm_0 = _mm_setzero_si128(); + const __m128i mm_FF = _mm_cmpeq_epi32(mm_0, mm_0); + + __m128i mm_sat = _mm_xor_si128(_mm_cmpeq_epi16(mm_hi, mm_0), mm_FF); + return _mm_or_si128(mm_lo, mm_sat); +#endif // defined(BOOST_UUID_USE_SSE41) + } + static BOOST_FORCEINLINE __m128i load_packed_16(const Char* p) noexcept { #if defined(BOOST_UUID_USE_AVX512_V1) @@ -605,8 +680,8 @@ struct from_chars_simd_load_traits< Char, 4u > return _mm_unpacklo_epi64(mm1, mm2); #endif // defined(BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM) #else - __m128i mm1 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4))); - __m128i mm2 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12))); + __m128i mm1 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4))); + __m128i mm2 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12))); return _mm_packus_epi16(mm1, mm2); #endif } @@ -618,7 +693,7 @@ struct from_chars_simd_load_traits< Char, 4u > #else __m128i mm1 = 
_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)); __m128i mm2 = _mm_setzero_si128(); - return _mm_packus_epi16(_mm_packus_epi32(mm1, mm2), mm2); + return _mm_packus_epi16(mm_packus_epi32(mm1, mm2), mm2); #endif } @@ -667,13 +742,19 @@ struct from_chars_simd_load_traits< Char, 4u > mm_chars1 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p)); mm_chars2 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4)); } - mm_chars1 = _mm_packus_epi32(mm_chars1, mm_chars2); - mm_chars3 = _mm_packus_epi32(mm_chars3, mm_chars4); + mm_chars1 = mm_packus_epi32(mm_chars1, mm_chars2); + mm_chars3 = mm_packus_epi32(mm_chars3, mm_chars4); return _mm_packus_epi16(mm_chars1, mm_chars3); #endif } }; +#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3) +template< typename Char > +const simd_vector128< std::uint8_t > from_chars_simd_load_traits< Char, 4u >::mm_deinterleave_epi16_pattern = + {{ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F }}; +#endif + #if defined(BOOST_GCC) && (BOOST_GCC >= 40600) #pragma GCC diagnostic pop #endif @@ -689,7 +770,7 @@ BOOST_FORCEINLINE void from_chars_simd_core __m128i const& mm_expected_dashes, __m128i const& mm_char_code1_cmp, __m128i const& mm_char_code2_cmp, -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) __m128i const& mm_char_code0_sub, __m128i const& mm_char_code1_sub, __m128i const& mm_char_code2_sub, @@ -705,15 +786,24 @@ BOOST_FORCEINLINE void from_chars_simd_core // |01234567-89ab-cd|ef-0123-456789ab|cdefXXXXXXXXXXXX| // // Check if dashes are in the expected positions + // + // mm_middle + // |-89ab-cdef-0123-| + const __m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), _mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1))); { - // mm_dashes - // |-89ab-cdef-0123-| - __m128i mm_dashes = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), 
_mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1))); - if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_dashes, mm_expected_dashes), constants::mm_dashes_mask))) +#if defined(BOOST_UUID_USE_SSE41) + if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_middle, mm_expected_dashes), constants::mm_dashes_mask))) +#else + __m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask); + std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes))); + if (BOOST_UNLIKELY(dash_mask != 0xFFFF)) +#endif { // Some of the dashes are missing - mm_dashes = _mm_and_si128(mm_dashes, constants::mm_dashes_mask); +#if defined(BOOST_UUID_USE_SSE41) + __m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask); std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes))); +#endif unsigned int pos = detail::countr_zero_nz(~dash_mask) + 8u; if (pos < end_pos) { @@ -732,7 +822,11 @@ BOOST_FORCEINLINE void from_chars_simd_core // mm_chars2: |02468ace13579bdf| mm_chars1 = _mm_permutex2var_epi8(mm_chars1, constants::mm_split_half_bytes_pattern1, mm_chars2); mm_chars2 = _mm_permutex2var_epi8(mm_chars2, constants::mm_split_half_bytes_pattern2, mm_chars3); -#else + + // Group half-byte characters + __m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2); + __m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2); +#elif defined(BOOST_UUID_USE_SSSE3) // mm_chars1: |02468acZ13579bdZ| // mm_chars2: |02468aZe13579bZf| // mm_chars3: |ZZZZZZceZZZZZZdf| @@ -742,20 +836,47 @@ BOOST_FORCEINLINE void from_chars_simd_core // mm_chars1: |02468ace13579bdf| // mm_chars2: |02468ace13579bdf| - // Avoid using vpblendvb, which is slow on Intel #if defined(BOOST_UUID_USE_AVX512_V1) - mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C) -#elif 
defined(BOOST_UUID_USE_AVX) - mm_chars1 = _mm_or_si128(mm_chars1, _mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask)); + // Avoid using vpblendvb, which is slow on Intel + mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C) +#elif defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) + mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1); #else - mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask); + mm_chars1 = _mm_or_si128(mm_chars1, _mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask1)); #endif +#if defined(BOOST_UUID_USE_SSE41) mm_chars2 = _mm_blend_epi16(mm_chars2, mm_chars3, 0x88); +#else + mm_chars2 = _mm_or_si128(_mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask2), mm_chars3); #endif - // Group half-byte digits + // Group half-byte characters __m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2); __m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2); +#else + __m128i mm_lo, mm_hi; + { + // Remove dashes + __m128i mm_group1 = _mm_srli_epi64(mm_middle, 8); + __m128i mm_group2 = _mm_srli_si128(mm_middle, 6); + __m128i mm_group3 = _mm_srli_si128(mm_middle, 11); + + mm_chars1 = _mm_unpacklo_epi64(mm_chars1, _mm_unpacklo_epi32(mm_group1, mm_group2)); + + mm_chars2 = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(mm_chars2), _mm_castsi128_pd(_mm_unpacklo_epi32(mm_group3, mm_chars3)))); + mm_chars2 = _mm_shuffle_epi32(mm_chars2, _MM_SHUFFLE(1, 3, 2, 0)); + + // Deinterleave half-byte characters + __m128i mm_lo1 = _mm_srli_epi16(mm_chars1, 8); + __m128i mm_lo2 = _mm_srli_epi16(mm_chars2, 8); + + __m128i mm_hi1 = _mm_and_si128(mm_chars1, constants::mm_split_half_byte_chars_mask); + __m128i mm_hi2 = _mm_and_si128(mm_chars2, constants::mm_split_half_byte_chars_mask); + + mm_lo = 
_mm_packus_epi16(mm_lo1, mm_lo2); + mm_hi = _mm_packus_epi16(mm_hi1, mm_hi2); + } +#endif // Convert characters to 8-bit integers. The algorithm is basically as follows: // @@ -778,6 +899,7 @@ BOOST_FORCEINLINE void from_chars_simd_core // Note that there is one caveat due to the fact that there are only signed byte comparisons until AVX-512. This is a problem if the character encoding has // hexadecimal character codes with the highest bit set to 1. This is handled in from_chars_simd_char_constants by constructing mm_char_code1 and // mm_char_code2 in such a way that signed comparisons work as described. We also use signed comparisons in AVX-512 to reuse the same constants. +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) #if defined(BOOST_UUID_USE_AVX512_V1) __mmask16 k_char_code2_mask_lo = _mm_cmpgt_epi8_mask(mm_lo, mm_char_code2_cmp); __mmask16 k_char_code2_mask_hi = _mm_cmpgt_epi8_mask(mm_hi, mm_char_code2_cmp); @@ -790,8 +912,29 @@ BOOST_FORCEINLINE void from_chars_simd_core mm_char_code_sub_lo = _mm_mask_blend_epi8(k_char_code1_mask_lo, mm_char_code0_sub, mm_char_code_sub_lo); mm_char_code_sub_hi = _mm_mask_blend_epi8(k_char_code1_mask_hi, mm_char_code0_sub, mm_char_code_sub_hi); -#elif defined(BOOST_UUID_USE_AVX) - // Unlike the legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel. 
Use a different approach: +#else + __m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp); + __m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp); + + __m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp); + __m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp); + +#if defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) + __m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo); + __m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi); + + mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo); + mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi); +#else + __m128i mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_lo, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_lo, mm_char_code2_sub)); + __m128i mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_hi, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_hi, mm_char_code2_sub)); + + mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_lo, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_lo, mm_char_code_sub_lo)); + mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_hi, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_hi, mm_char_code_sub_hi)); +#endif +#endif +#else // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) + // Use a different approach: // - Each vpcmpgtb produces a mask, where 0 indicates false and -1 - true. 
// - mm_char_code1_mask_* always overlaps with the corresponding mm_char_code2_mask_*, which means adding them // produces a vector where 0 means none of the vpcmpgtb matched the value, -1 - where mm_char_code1_mask_* matched @@ -814,26 +957,19 @@ BOOST_FORCEINLINE void from_chars_simd_core const __m128i mm_char_code_sub = _mm_cvtsi32_si128(static_cast< int >(char_code_sub)); __m128i mm_char_code_sub_lo = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_lo); __m128i mm_char_code_sub_hi = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_hi); -#else - __m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp); - __m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp); - - __m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp); - __m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp); - - __m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo); - __m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi); - - mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo); - mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi); -#endif +#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) mm_lo = _mm_sub_epi8(mm_lo, mm_char_code_sub_lo); mm_hi = _mm_sub_epi8(mm_hi, mm_char_code_sub_hi); // Check hexadecimal character validity. Proper hexadecimal characters always convert to values of 0-15 and any other characters convert // to values outside that range. Which means if the upper 4 bits of a resulting integer are non-zero then the corresponding character was invalid. 
+#if defined(BOOST_UUID_USE_SSE41) if (BOOST_LIKELY(_mm_test_all_zeros(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0))) +#else + const __m128i mm_0 = _mm_setzero_si128(); + if (BOOST_LIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0), mm_0)) == 0xFFFF)) +#endif { if (BOOST_LIKELY(ec == from_chars_error::none)) { @@ -844,12 +980,13 @@ BOOST_FORCEINLINE void from_chars_simd_core else { // Some of the hex digits are invalid +#if defined(BOOST_UUID_USE_SSE41) const __m128i mm_0 = _mm_setzero_si128(); +#endif __m128i mm_hi_bits_lo = _mm_and_si128(mm_lo, constants::mm_F0); __m128i mm_hi_bits_hi = _mm_and_si128(mm_hi, constants::mm_F0); mm_hi_bits_lo = _mm_cmpeq_epi8(mm_hi_bits_lo, mm_0); mm_hi_bits_hi = _mm_cmpeq_epi8(mm_hi_bits_hi, mm_0); - std::uint32_t digits_mask_lo = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_lo)); std::uint32_t digits_mask_hi = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_hi)); @@ -930,7 +1067,7 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c char_constants::mm_expected_dashes, char_constants::mm_char_code1_cmp, char_constants::mm_char_code2_cmp, -#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX) +#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS) char_constants::mm_char_code0_sub, char_constants::mm_char_code1_sub, char_constants::mm_char_code2_sub, @@ -947,6 +1084,8 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c } // namespace uuids } // namespace boost -#endif // defined(BOOST_UUID_USE_SSE41) +#undef BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS + +#endif // defined(BOOST_UUID_USE_SSE2) #endif // BOOST_UUID_DETAIL_FROM_CHARS_X86_HPP_INCLUDED diff --git a/include/boost/uuid/detail/simd_vector.hpp b/include/boost/uuid/detail/simd_vector.hpp index 4cee4cf..fde583b 100644 --- a/include/boost/uuid/detail/simd_vector.hpp +++ b/include/boost/uuid/detail/simd_vector.hpp @@ -27,12 
+27,22 @@ union simd_vector > BOOST_FORCEINLINE operator Vector () const noexcept { return get< Vector >(); } +#if defined(BOOST_GCC) && (BOOST_GCC >= 40600) +#pragma GCC diagnostic push +// dereferencing type-punned pointer will break strict-aliasing rules +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + template< typename Vector > BOOST_FORCEINLINE typename std::enable_if< sizeof(Vector) <= ByteSize, Vector >::type get() const noexcept { using vector_type = typename std::remove_cv< typename std::remove_reference< Vector >::type >::type; return *reinterpret_cast< const vector_type* >(bytes); } + +#if defined(BOOST_GCC) && (BOOST_GCC >= 40600) +#pragma GCC diagnostic pop +#endif }; template< typename T > diff --git a/include/boost/uuid/detail/to_chars.hpp b/include/boost/uuid/detail/to_chars.hpp index 80f6841..6fcabe1 100644 --- a/include/boost/uuid/detail/to_chars.hpp +++ b/include/boost/uuid/detail/to_chars.hpp @@ -11,7 +11,7 @@ #include #include -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) # include #elif defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -26,7 +26,7 @@ namespace detail { template BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const& u, Ch* out ) noexcept { -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) if( detail::is_constant_evaluated_rt() ) { return detail::to_chars_generic( u, out ); @@ -40,7 +40,6 @@ template BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const #endif } -} // namespace detail -}} //namespace boost::uuids +}}} // namespace boost::uuids::detail #endif // BOOST_UUID_DETAIL_TO_CHARS_HPP_INCLUDED diff --git a/include/boost/uuid/detail/to_chars_x86.hpp b/include/boost/uuid/detail/to_chars_x86.hpp index 818522b..6184b6b 100644 --- a/include/boost/uuid/detail/to_chars_x86.hpp +++ b/include/boost/uuid/detail/to_chars_x86.hpp @@ -7,7 +7,7 @@ #include -#if defined(BOOST_UUID_USE_SSSE3) +#if defined(BOOST_UUID_USE_SSE2) #include #include @@ -26,9 +26,12 @@ 
BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX2" ) #elif defined(BOOST_UUID_USE_SSE41) BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE4.1" ) -#else +#elif defined(BOOST_UUID_USE_SSSE3) BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" ) +#else +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE2" ) + #endif #endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) @@ -36,8 +39,10 @@ BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" ) #include #elif defined(BOOST_UUID_USE_SSE41) #include -#else +#elif defined(BOOST_UUID_USE_SSSE3) #include +#else +#include #endif namespace boost { @@ -51,13 +56,31 @@ template< > struct to_chars_simd_char_constants { +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((0x61 - 10) - 0x30); // ('a' - 10) - '0' in ASCII + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if defined(BOOST_UUID_USE_SSSE3) template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_table = {{ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 }}; // 0123456789abcdef in ASCII +#else +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_0_add = + {{ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30 }}; // 0x30 is '0' in ASCII +template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, 
IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_dash = {{ 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D }}; // ---------------- in ASCII @@ -69,10 +92,17 @@ struct to_chars_simd_char_constants< char, false, IsWCharASCIICompatible > static_assert(static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('0') && static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('a'), "Boost.UUID: Unsupported char encoding, '-' character code is expected to be less than any hexadecimal characters"); +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >(('a' - 10) - '0'); + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if defined(BOOST_UUID_USE_SSSE3) template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_table = {{ @@ -81,6 +111,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, static_cast< std::uint8_t >('8'), static_cast< std::uint8_t >('9'), static_cast< std::uint8_t >('a'), static_cast< std::uint8_t >('b'), static_cast< std::uint8_t >('c'), static_cast< std::uint8_t >('d'), static_cast< std::uint8_t >('e'), static_cast< std::uint8_t >('f') }}; +#else +template< bool IsWCharASCIICompatible 
> +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_0_add = +{{ + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), + static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0') +}}; +template< bool IsWCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< bool IsWCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_dash = {{ @@ -102,10 +148,17 @@ struct to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false > static_assert(static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'0') && static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'a'), "Boost.UUID: Unsupported wchar_t encoding, L'-' character code is expected to be less than any hexadecimal characters"); +#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_char_table; +#else + static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((L'a' - 10) - L'0'); + static const simd_vector128< std::uint8_t > mm_char_0_add; + static const simd_vector128< std::uint8_t > mm_char_a_add; +#endif static const simd_vector128< std::uint8_t > mm_char_dash; }; +#if 
defined(BOOST_UUID_USE_SSSE3) template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_table = {{ @@ -114,6 +167,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCh static_cast< std::uint8_t >(L'8'), static_cast< std::uint8_t >(L'9'), static_cast< std::uint8_t >(L'a'), static_cast< std::uint8_t >(L'b'), static_cast< std::uint8_t >(L'c'), static_cast< std::uint8_t >(L'd'), static_cast< std::uint8_t >(L'e'), static_cast< std::uint8_t >(L'f') }}; +#else +template< bool IsCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_0_add = +{{ + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), + static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0') +}}; +template< bool IsCharASCIICompatible > +const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_a_add = +{{ + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, + char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add +}}; +#endif template< bool IsCharASCIICompatible > const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_dash = {{ @@ -127,25 +196,55 @@ template< typename > struct to_chars_simd_constants { static const simd_vector128< std::uint8_t > mm_0F; 
+#if defined(BOOST_UUID_USE_SSSE3) static const simd_vector128< std::uint8_t > mm_shuffle_pattern1; static const simd_vector128< std::uint8_t > mm_shuffle_pattern2; +#else + static const simd_vector128< std::uint8_t > mm_9; + static const simd_vector128< std::uint8_t > mm_group1_mask; + static const simd_vector128< std::uint8_t > mm_group2_mask; + static const simd_vector128< std::uint8_t > mm_group3_mask; +#endif }; template< typename T > const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_0F = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }}; +#if defined(BOOST_UUID_USE_SSSE3) template< typename T > const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern1 = {{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x80, 0x0C, 0x0D }}; template< typename T > const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern2 = {{ 0x00, 0x01, 0x80, 0x02, 0x03, 0x04, 0x05, 0x80, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D }}; +#else +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_9 = + {{ 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group1_mask = + {{ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group2_mask = + {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}; +template< typename T > +const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group3_mask = + {{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 }}; +#endif -//! 
Converts UUID to a string of 36 characters, where first 32 craracters are returned in mm_chars1 and mm_chars2 and the last 4 in the highest 32 bits of mm_chars3 +/*! + * Converts UUID to a string of 36 characters, where the first 32 characters are returned in mm_chars1 and mm_chars2. + * When SSSE3 is supported, last 4 characters are returned in the highest 32 bits of mm_chars3, otherwise in the lowest 32 bits. + */ BOOST_FORCEINLINE void to_chars_simd_core ( const std::uint8_t* data, - __m128i const& mm_char_table, __m128i const& mm_char_dash, +#if defined(BOOST_UUID_USE_SSSE3) + __m128i const& mm_char_table, +#else + __m128i const& mm_char_0_add, __m128i const& mm_char_a_add, +#endif + __m128i const& mm_char_dash, __m128i& mm_chars1, __m128i& mm_chars2, __m128i& mm_chars3 ) noexcept { @@ -154,18 +253,31 @@ BOOST_FORCEINLINE void to_chars_simd_core __m128i mm_input = _mm_loadu_si128(reinterpret_cast< const __m128i* >(data)); // Split half-bytes - __m128i const& mm_0F = constants::mm_0F; - __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), mm_0F); - __m128i mm_input_lo = _mm_and_si128(mm_input, mm_0F); + __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), constants::mm_0F); + __m128i mm_input_lo = _mm_and_si128(mm_input, constants::mm_0F); // Stringize each of the halves +#if defined(BOOST_UUID_USE_SSSE3) mm_input_hi = _mm_shuffle_epi8(mm_char_table, mm_input_hi); mm_input_lo = _mm_shuffle_epi8(mm_char_table, mm_input_lo); +#else + { + __m128i mm_add_mask_hi = _mm_cmpgt_epi8(mm_input_hi, constants::mm_9); + __m128i mm_add_mask_lo = _mm_cmpgt_epi8(mm_input_lo, constants::mm_9); + + __m128i mm_add_hi = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_hi, mm_char_a_add)); + __m128i mm_add_lo = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_lo, mm_char_a_add)); + + mm_input_hi = _mm_add_epi8(mm_input_hi, mm_add_hi); + mm_input_lo = _mm_add_epi8(mm_input_lo, mm_add_lo); + } +#endif // Join them back together __m128i mm_1 = 
_mm_unpacklo_epi8(mm_input_hi, mm_input_lo); __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo); +#if defined(BOOST_UUID_USE_SSSE3) // Insert dashes at positions 8, 13, 18 and 23 // mm_1 mm_2 // |0123456789abcdef|0123456789abcdef| @@ -178,6 +290,32 @@ BOOST_FORCEINLINE void to_chars_simd_core mm_chars1 = _mm_max_epu8(mm_chars1, mm_char_dash); mm_chars2 = _mm_max_epu8(mm_chars2, mm_char_dash); mm_chars3 = mm_2; +#else + // Split groups of characters between dashes and shift them into their places + // mm_middle: |89abcdef01234567| + // mm_group1: |Z89abZZZZZZZZZZZ| + // mm_group2: |ZZZZZZcdefZZZZZZ| + // mm_group3: |ZZZZZZZZZZZ0123Z| + __m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_1), _mm_castsi128_pd(mm_2), _MM_SHUFFLE2(0, 1))); + __m128i mm_group1 = _mm_slli_epi64(mm_middle, 8); + __m128i mm_group2 = _mm_slli_si128(mm_middle, 2); + __m128i mm_group3 = _mm_slli_epi64(mm_middle, 24); + mm_group1 = _mm_and_si128(mm_group1, constants::mm_group1_mask); + mm_group2 = _mm_and_si128(mm_group2, constants::mm_group2_mask); + mm_group3 = _mm_and_si128(mm_group3, constants::mm_group3_mask); + + // Merge them back and insert dashes + // mm_middle: |-89ab-cdef-0123-| + mm_middle = _mm_or_si128(_mm_or_si128(mm_group1, mm_group2), mm_group3); + mm_middle = _mm_max_epu8(mm_middle, mm_char_dash); + + // mm_2: |cdef0123456789ab| + mm_2 = _mm_shuffle_epi32(mm_2, _MM_SHUFFLE(2, 1, 0, 3)); + + mm_chars1 = _mm_unpacklo_epi64(mm_1, mm_middle); + mm_chars2 = _mm_unpackhi_epi64(mm_middle, mm_2); + mm_chars3 = mm_2; +#endif } #if defined(BOOST_MSVC) @@ -195,7 +333,12 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept uuids::detail::to_chars_simd_core ( u.data(), +#if defined(BOOST_UUID_USE_SSSE3) char_constants::mm_char_table, +#else + char_constants::mm_char_0_add, + char_constants::mm_char_a_add, +#endif char_constants::mm_char_dash, mm_chars1, mm_chars2, mm_chars3 ); @@ -205,11 +348,17 @@ BOOST_FORCEINLINE Char* 
to_chars_simd(uuid const& u, Char* out) noexcept { _mm_storeu_si128(reinterpret_cast< __m128i* >(out), mm_chars1); _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), mm_chars2); + detail::store_native_u32 + ( + out + 32, #if defined(BOOST_UUID_USE_SSE41) - detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3))); + static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3)) +#elif defined(BOOST_UUID_USE_SSSE3) + static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12))) #else - detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12)))); + static_cast< std::uint32_t >(_mm_cvtsi128_si32(mm_chars3)) #endif + ); } else BOOST_IF_CONSTEXPR (sizeof(Char) == 2u) { @@ -225,8 +374,10 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept #endif #if defined(BOOST_UUID_USE_SSE41) && (defined(__x86_64__) || defined(_M_X64)) detail::store_native_u64(out + 32, static_cast< std::uint64_t >(_mm_extract_epi64(_mm_unpackhi_epi8(mm_chars3, mm_0), 1))); -#else +#elif defined(BOOST_UUID_USE_SSSE3) _mm_storeh_pd(reinterpret_cast< BOOST_MAY_ALIAS double* >(out + 32), _mm_castsi128_pd(_mm_unpackhi_epi8(mm_chars3, mm_0))); +#else + _mm_storel_epi64(reinterpret_cast< __m128i* >(out + 32), _mm_unpacklo_epi8(mm_chars3, mm_0)); #endif } else @@ -255,7 +406,15 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 24), _mm_unpacklo_epi16(mm, mm_0)); _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 28), _mm_unpackhi_epi16(mm, mm_0)); #endif - _mm_storeu_si128(reinterpret_cast< __m128i* >(out + 32), _mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0)); + _mm_storeu_si128 + ( + reinterpret_cast< __m128i* >(out + 32), +#if defined(BOOST_UUID_USE_SSSE3) + _mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0) +#else + _mm_unpacklo_epi16(_mm_unpacklo_epi8(mm_chars3, 
mm_0), mm_0) +#endif + ); } return out + 36; @@ -269,6 +428,6 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept } // namespace uuids } // namespace boost -#endif // defined(BOOST_UUID_USE_SSSE3) +#endif // defined(BOOST_UUID_USE_SSE2) #endif // BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 1372d80..f2aa304 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -34,6 +34,20 @@ import path ; import regex ; import testing ; +# The rule allows for suppressing running tests and instead only compile them. +# This is useful e.g. if the tests are compiled for a target ISA that is not supported by the CPU. +local rule run ( sources + : args * : input-files * : requirements * : target-name ? : default-build * ) +{ + if [ os.environ BOOST_UUID_SKIP_RUNNING_TESTS ] + { + return [ testing.compile $(sources) : $(requirements) : $(target-name) ] ; + } + else + { + return [ testing.run $(sources) : $(args) : $(input-files) : $(requirements) : $(target-name) : $(default-build) ] ; + } +} + # this rule enumerates through all the headers and ensures # that inclusion of the header by itself is sufficient to # compile successfully, proving the header does not depend