diff --git a/doc/uuid/configuration.adoc b/doc/uuid/configuration.adoc index 42156b5..3cc4cbe 100644 --- a/doc/uuid/configuration.adoc +++ b/doc/uuid/configuration.adoc @@ -32,15 +32,18 @@ However, there are a few options that can be enabled by defining macros prior to |If defined, enables optimizations for https://en.wikipedia.org/wiki/SSE4#SSE4.1[SSE4.1] extensions available in x86 processors. |`BOOST_UUID_USE_AVX` -|If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX] extensions available in modern x86 processors. +|If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX] extensions available in x86 processors. |`BOOST_UUID_USE_AVX2` -|If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2[AVX2] extensions available in modern x86 processors. +|If defined, enables optimizations for https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2[AVX2] extensions available in x86 processors. + +|`BOOST_UUID_USE_AVX512_V1` +|If defined, enables optimizations for https://en.wikipedia.org/wiki/AVX-512[AVX-512] F, VL, CD, BW and DQ extensions available in x86 processors (e.g. in Intel Skylake-X). |`BOOST_UUID_USE_AVX10_1` -|If defined, enables optimizations for https://en.wikipedia.org/wiki/AVX-512[AVX-512] and https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX10[AVX10.1] extensions available in modern x86 processors. - When defined by user, this macro indicates support for the full set of instructions defined in AVX10.1. Currently, the library does not require 512-bit vectors and is compatible with CPUs implementing AVX-512F, - CD, VL, BW and DQ instruction subsets (i.e. equivalent to Intel Skylake-X), so it may auto-detect and use AVX-512 even if only those subsets are supported. 
+|If defined, enables optimizations for https://en.wikipedia.org/wiki/AVX-512[AVX-512] and https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX10[AVX10.1] extensions available in x86 processors. + When defined by the user, this macro indicates support for the full set of instructions defined in AVX10.1. When auto-detected by the library, this macro may be defined even when not all AVX10.1 subsets + are enabled, provided that the detected subset is sufficient for the library. |=== diff --git a/include/boost/uuid/detail/config.hpp b/include/boost/uuid/detail/config.hpp index 902d8c0..e67e86d 100644 --- a/include/boost/uuid/detail/config.hpp +++ b/include/boost/uuid/detail/config.hpp @@ -48,7 +48,11 @@ #define BOOST_UUID_USE_AVX2 #endif -#if ((defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) +#if (defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512CD__)) && !defined(BOOST_UUID_USE_AVX512_V1) +#define BOOST_UUID_USE_AVX512_V1 +#endif + +#if ((defined(BOOST_UUID_USE_AVX512_V1) && defined(__AVX512VBMI__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) #define BOOST_UUID_USE_AVX10_1 #endif @@ -66,14 +70,22 @@ #define BOOST_UUID_USE_AVX2 #endif -#if ((defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) +#if (defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512CD__)) && !defined(BOOST_UUID_USE_AVX512_V1) +#define BOOST_UUID_USE_AVX512_V1 +#endif + +#if ((defined(BOOST_UUID_USE_AVX512_V1) && defined(__AVX512VBMI__)) || defined(__AVX10_1__)) && !defined(BOOST_UUID_USE_AVX10_1) #define BOOST_UUID_USE_AVX10_1 #endif #endif // More advanced ISA extensions imply less advanced are also available -#if !defined(BOOST_UUID_USE_AVX2) && defined(BOOST_UUID_USE_AVX10_1) +#if 
!defined(BOOST_UUID_USE_AVX512_V1) && defined(BOOST_UUID_USE_AVX10_1) +#define BOOST_UUID_USE_AVX512_V1 +#endif + +#if !defined(BOOST_UUID_USE_AVX2) && defined(BOOST_UUID_USE_AVX512_V1) #define BOOST_UUID_USE_AVX2 #endif @@ -99,6 +111,7 @@ #if !defined(BOOST_UUID_NO_SIMD) && \ !defined(BOOST_UUID_USE_AVX10_1) && \ + !defined(BOOST_UUID_USE_AVX512_V1) && \ !defined(BOOST_UUID_USE_AVX2) && \ !defined(BOOST_UUID_USE_AVX) && \ !defined(BOOST_UUID_USE_SSE41) && \ diff --git a/include/boost/uuid/detail/to_chars_x86.hpp b/include/boost/uuid/detail/to_chars_x86.hpp index c3d8bd9..e056d0f 100644 --- a/include/boost/uuid/detail/to_chars_x86.hpp +++ b/include/boost/uuid/detail/to_chars_x86.hpp @@ -16,8 +16,8 @@ #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) #include -#if defined(BOOST_UUID_USE_AVX10_1) -BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX10.1" ) +#if defined(BOOST_UUID_USE_AVX512_V1) +BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX512v1" ) #elif defined(BOOST_UUID_USE_AVX2) BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX2" ) @@ -231,7 +231,7 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept else { const __m128i mm_0 = _mm_setzero_si128(); -#if 0 && defined(BOOST_UUID_USE_AVX10_1) +#if 0 && defined(BOOST_UUID_USE_AVX512_V1) // Slower than the AVX2 version below on Intel Golden Cove. Perhaps, it will become beneficial on newer microarchitectures. 
_mm512_storeu_epi32(out, _mm512_cvtepu8_epi32(mm_chars1)); _mm512_storeu_epi32(out + 16, _mm512_cvtepu8_epi32(mm_chars2)); diff --git a/include/boost/uuid/detail/uuid_x86.ipp b/include/boost/uuid/detail/uuid_x86.ipp index d72f28d..37f9ae1 100644 --- a/include/boost/uuid/detail/uuid_x86.ipp +++ b/include/boost/uuid/detail/uuid_x86.ipp @@ -22,8 +22,8 @@ #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) #include -#if defined(BOOST_UUID_USE_AVX10_1) -BOOST_PRAGMA_MESSAGE( "Using uuid_x86.ipp, AVX10.1" ) +#if defined(BOOST_UUID_USE_AVX512_V1) +BOOST_PRAGMA_MESSAGE( "Using uuid_x86.ipp, AVX512v1" ) #elif defined(BOOST_UUID_USE_SSE41) BOOST_PRAGMA_MESSAGE( "Using uuid_x86.ipp, SSE4.1" ) @@ -38,7 +38,7 @@ BOOST_PRAGMA_MESSAGE( "Using uuid_x86.ipp, SSE2" ) #endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION) // MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set -#if defined(BOOST_UUID_USE_AVX10_1) +#if defined(BOOST_UUID_USE_AVX512_V1) #include #elif defined(BOOST_UUID_USE_SSE41) #include @@ -72,7 +72,7 @@ BOOST_FORCEINLINE void compare(uuid const& lhs, uuid const& rhs, std::uint32_t& // with another XOR to the comparison results. // 3. Until AVX-512, there is only pcmpgtb instruction that compares for "greater" relation, so we swap the arguments to get what we need. 
-#if defined(BOOST_UUID_USE_AVX10_1) +#if defined(BOOST_UUID_USE_AVX512_V1) __mmask16 k_cmp = _mm_cmplt_epu8_mask(mm_left, mm_right); __mmask16 k_rcmp = _mm_cmplt_epu8_mask(mm_right, mm_left); @@ -80,7 +80,7 @@ BOOST_FORCEINLINE void compare(uuid const& lhs, uuid const& rhs, std::uint32_t& cmp = static_cast< std::uint32_t >(_cvtmask16_u32(k_cmp)); rcmp = static_cast< std::uint32_t >(_cvtmask16_u32(k_rcmp)); -#else // defined(BOOST_UUID_USE_AVX10_1) +#else // defined(BOOST_UUID_USE_AVX512_V1) const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right); @@ -92,7 +92,7 @@ BOOST_FORCEINLINE void compare(uuid const& lhs, uuid const& rhs, std::uint32_t& cmp = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_cmp)); rcmp = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_rcmp)); -#endif // defined(BOOST_UUID_USE_AVX10_1) +#endif // defined(BOOST_UUID_USE_AVX512_V1) cmp = (cmp - 1u) ^ cmp; rcmp = (rcmp - 1u) ^ rcmp;