2
0
mirror of https://github.com/boostorg/uuid.git synced 2026-01-19 04:42:16 +00:00

Merge pull request #190 from Lastique/feature/to_from_chars_sse2

Add SSE2 implementations of `to_chars` and `from_chars`
This commit is contained in:
Peter Dimov
2026-01-12 16:54:54 +02:00
committed by GitHub
7 changed files with 459 additions and 78 deletions

View File

@@ -92,6 +92,14 @@ jobs:
os: ubuntu-latest
install: g++-15-multilib
address-model: 32,64
- toolset: gcc-13
cxxstd: "11,14,17,20,2b"
instruction-set: core2
cpu-requirements: [ ssse3 ]
os: ubuntu-latest
container: ubuntu:24.04
install: g++-13-multilib
address-model: 32,64
- toolset: gcc-13
cxxstd: "11,14,17,20,2b"
instruction-set: nehalem
@@ -100,6 +108,14 @@ jobs:
container: ubuntu:24.04
install: g++-13-multilib
address-model: 32,64
- toolset: gcc-13
cxxstd: "11,14,17,20,2b"
instruction-set: sandy-bridge
cpu-requirements: [ avx ]
os: ubuntu-latest
container: ubuntu:24.04
install: g++-13-multilib
address-model: 32,64
- toolset: gcc-13
cxxstd: "11,14,17,20,2b"
instruction-set: haswell
@@ -124,6 +140,16 @@ jobs:
container: ubuntu:24.04
install: g++-13-multilib
address-model: 32,64
# Experimental features
- toolset: gcc-13
cxxstd: "11,14,17,20,2b"
instruction-set: rocketlake
cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ]
defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ]
os: ubuntu-latest
container: ubuntu:24.04
install: g++-13-multilib
address-model: 32,64
- toolset: clang
compiler: clang++-3.9
cxxstd: "11,14"
@@ -238,6 +264,14 @@ jobs:
container: ubuntu:25.10
os: ubuntu-latest
install: clang-21
- toolset: clang
compiler: clang++-17
cxxstd: "11,14,17,20,2b"
instruction-set: core2
cpu-requirements: [ ssse3 ]
container: ubuntu:24.04
os: ubuntu-latest
install: clang-17
- toolset: clang
compiler: clang++-17
cxxstd: "11,14,17,20,2b"
@@ -246,6 +280,14 @@ jobs:
container: ubuntu:24.04
os: ubuntu-latest
install: clang-17
- toolset: clang
compiler: clang++-17
cxxstd: "11,14,17,20,2b"
instruction-set: sandy-bridge
cpu-requirements: [ avx ]
container: ubuntu:24.04
os: ubuntu-latest
install: clang-17
- toolset: clang
compiler: clang++-17
cxxstd: "11,14,17,20,2b"
@@ -270,6 +312,16 @@ jobs:
container: ubuntu:24.04
os: ubuntu-latest
install: clang-17
# Experimental features
- toolset: clang
compiler: clang++-17
cxxstd: "11,14,17,20,2b"
instruction-set: rocketlake
cpu-requirements: [ avx512f, avx512cd, avx512vl, avx512dq, avx512bw, avx512vbmi, avx512_vbmi2, avx512_bitalg, bmi1, bmi2 ]
defines: [ BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM, BOOST_UUID_FROM_CHARS_X86_USE_VPERMI2B ]
container: ubuntu:24.04
os: ubuntu-latest
install: clang-17
- toolset: clang
os: macos-14
cxxstd: "11,14,17,20,2b"
@@ -346,6 +398,7 @@ jobs:
cd ../boost-root
ADDRMD=${{matrix.address-model}}
INSTRUCTION_SET=${{matrix.instruction-set}}
b2_args=(-j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} ${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release)
if [ -n "${{matrix.cpu-requirements}}" ]
then
cpu_flags="$(lscpu | grep -F "Flags:" | sed "s/^Flags:\\s*//")";
@@ -355,12 +408,19 @@ jobs:
if ! [[ "$cpu_flags" =~ $re ]]
then
echo "CPU lacks required feature: $requirement"
echo "Skipping testing"
exit 0
echo "Skipping running tests"
export BOOST_UUID_SKIP_RUNNING_TESTS=1
fi
done
fi
./b2 -j2 libs/$LIBRARY/test toolset=${{matrix.toolset}} cxxstd=${{matrix.cxxstd}} ${ADDRMD:+address-model=$ADDRMD} ${INSTRUCTION_SET:+instruction-set=$INSTRUCTION_SET} variant=debug,release
if [ -n "${{matrix.defines}}" ]
then
for define in ${{join(matrix.defines, ' ')}}
do
b2_args+=("define=$define")
done
fi
./b2 "${b2_args[@]}"
windows:
strategy:

View File

@@ -11,7 +11,7 @@
#include <boost/uuid/detail/config.hpp>
#include <boost/uuid/detail/is_constant_evaluated.hpp>
#if defined(BOOST_UUID_USE_SSE41)
#if defined(BOOST_UUID_USE_SSE2)
# include <boost/uuid/detail/from_chars_x86.hpp>
#elif defined(BOOST_UUID_REPORT_IMPLEMENTATION)
@@ -27,7 +27,7 @@ template<class Ch>
BOOST_UUID_CXX14_CONSTEXPR_RT inline
from_chars_result<Ch> from_chars( Ch const* first, Ch const* last, uuid& u ) noexcept
{
#if defined(BOOST_UUID_USE_SSE41)
#if defined(BOOST_UUID_USE_SSE2)
if( detail::is_constant_evaluated_rt() )
{
return detail::from_chars_generic( first, last, u );

View File

@@ -7,8 +7,9 @@
#include <boost/uuid/detail/config.hpp>
#if defined(BOOST_UUID_USE_SSE41)
#if defined(BOOST_UUID_USE_SSE2)
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <boost/uuid/uuid.hpp>
@@ -28,22 +29,48 @@ BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX512v1" )
#elif defined(BOOST_UUID_USE_AVX)
BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, AVX" )
#else
#elif defined(BOOST_UUID_USE_SSE41)
BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE4.1" )
#elif defined(BOOST_UUID_USE_SSSE3)
BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSSE3" )
#else
BOOST_PRAGMA_MESSAGE( "Using from_chars_x86.hpp, SSE2" )
#endif
#endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION)
#if defined(BOOST_UUID_USE_AVX)
#include <immintrin.h>
#else
#elif defined(BOOST_UUID_USE_SSE41)
#include <smmintrin.h>
#elif defined(BOOST_UUID_USE_SSSE3)
#include <tmmintrin.h>
#else
#include <emmintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#pragma intrinsic(_BitScanForward)
#endif
// Unlike the legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel Lion Cove, Skymont and older
// microarchitectures. Its performance on newer microarchitectures is unknown at the time of this writing. Also, on
// Intel Haswell/Broadwell, even the SSE4.1 pblendvb is slow.
#if !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB) && \
(defined(__tune_haswell__) || defined(__tune_broadwell__) || defined(BOOST_UUID_USE_AVX))
#define BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB
#endif
#if !defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB) && defined(BOOST_UUID_USE_SSE41) && !defined(BOOST_UUID_FROM_CHARS_X86_SLOW_PBLENDVB)
#define BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB
#endif
#if defined(BOOST_UUID_USE_AVX512_V1) || (defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB)) || !defined(BOOST_UUID_USE_SSSE3)
#define BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS
#endif
namespace boost {
namespace uuids {
namespace detail {
@@ -109,11 +136,11 @@ struct from_chars_simd_char_constants
static const simd_vector128< std::uint8_t > mm_char_code2_cmp;
static const simd_vector128< std::uint8_t > mm_char_code1_cmp;
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
static const simd_vector128< std::uint8_t > mm_char_code2_sub;
static const simd_vector128< std::uint8_t > mm_char_code1_sub;
static const simd_vector128< std::uint8_t > mm_char_code0_sub;
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
};
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
@@ -146,7 +173,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha
static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u)
}};
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_code2_sub =
@@ -169,7 +196,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< Char, IsCha
char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub
}};
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< bool IsWCharASCIICompatible >
struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible >
@@ -224,11 +251,11 @@ struct from_chars_simd_char_constants< char, false, IsWCharASCIICompatible >
static const simd_vector128< std::uint8_t > mm_char_code2_cmp;
static const simd_vector128< std::uint8_t > mm_char_code1_cmp;
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
static const simd_vector128< std::uint8_t > mm_char_code2_sub;
static const simd_vector128< std::uint8_t > mm_char_code1_sub;
static const simd_vector128< std::uint8_t > mm_char_code0_sub;
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
};
template< bool IsWCharASCIICompatible >
@@ -264,7 +291,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false
static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u)
}};
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_code2_sub =
@@ -287,7 +314,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< char, false
char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub
}};
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< bool IsCharASCIICompatible >
struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >
@@ -347,11 +374,11 @@ struct from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >
static const simd_vector128< std::uint8_t > mm_char_code2_cmp;
static const simd_vector128< std::uint8_t > mm_char_code1_cmp;
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
static const simd_vector128< std::uint8_t > mm_char_code2_sub;
static const simd_vector128< std::uint8_t > mm_char_code1_sub;
static const simd_vector128< std::uint8_t > mm_char_code0_sub;
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
};
template< bool IsCharASCIICompatible >
@@ -387,7 +414,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is
static_cast< std::uint8_t >(char_code1 - 1u), static_cast< std::uint8_t >(char_code1 - 1u)
}};
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< bool IsCharASCIICompatible >
const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_code2_sub =
@@ -410,7 +437,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_char_constants< wchar_t, Is
char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub, char_code0_sub
}};
#endif // defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< typename >
@@ -425,12 +452,18 @@ struct from_chars_simd_constants
static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern1;
static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern2;
static const simd_vector128< std::uint8_t > mm_split_half_bytes_pattern3;
static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask;
static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask1;
#if !defined(BOOST_UUID_USE_SSE41)
static const simd_vector128< std::uint8_t > mm_split_half_bytes_blend_mask2;
#endif
#if !defined(BOOST_UUID_USE_SSSE3)
static const simd_vector128< std::uint8_t > mm_split_half_byte_chars_mask;
#endif
#endif
static const simd_vector128< std::uint8_t > mm_F0;
#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1)
#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
static const simd_vector128< std::uint8_t > mm_2;
#endif
};
@@ -456,13 +489,23 @@ template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_pattern3 =
{{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x03, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x02 }};
template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask =
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask1 =
{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF }};
#if !defined(BOOST_UUID_USE_SSE41)
template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_bytes_blend_mask2 =
{{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00 }};
#endif
#if !defined(BOOST_UUID_USE_SSSE3)
template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_split_half_byte_chars_mask =
{{ 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 }};
#endif
#endif
template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_F0 =
{{ 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0 }};
#if defined(BOOST_UUID_USE_AVX) && !defined(BOOST_UUID_USE_AVX512_V1)
#if !defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
template< typename T >
const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 =
{{ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 }};
@@ -478,7 +521,7 @@ const simd_vector128< std::uint8_t > from_chars_simd_constants< T >::mm_2 =
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
template< typename Char, unsigned int Size = sizeof(Char) >
template< typename Char, std::size_t Size = sizeof(Char) >
struct from_chars_simd_load_traits;
template< typename Char >
@@ -593,6 +636,38 @@ struct from_chars_simd_load_traits< Char, 2u >
template< typename Char >
struct from_chars_simd_load_traits< Char, 4u >
{
#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3)
static const simd_vector128< std::uint8_t > mm_deinterleave_epi16_pattern;
#endif
// Packs the eight 32-bit elements of mm1 and mm2 (mm1 first) into eight 16-bit elements of the result.
// With SSE4.1 this is the native _mm_packus_epi32; otherwise it is emulated with SSSE3 or plain SSE2 instructions.
//
// The emulation keeps the low 16 bits of every element whose upper 16 bits are zero and produces 0xFFFF for every
// other element. Note that, unlike the native instruction, this also yields 0xFFFF (rather than 0) for elements with
// the sign bit set.
static BOOST_FORCEINLINE __m128i mm_packus_epi32(__m128i mm1, __m128i mm2) noexcept
{
#if defined(BOOST_UUID_USE_SSE41)
return _mm_packus_epi32(mm1, mm2);
#else // defined(BOOST_UUID_USE_SSE41)
#if defined(BOOST_UUID_USE_SSSE3)
// Within each vector, move the low 16-bit words of the four 32-bit elements to the lower 64 bits
// and the high 16-bit words to the upper 64 bits.
mm1 = _mm_shuffle_epi8(mm1, mm_deinterleave_epi16_pattern);
mm2 = _mm_shuffle_epi8(mm2, mm_deinterleave_epi16_pattern);
// mm_lo: low words of mm1 followed by low words of mm2; mm_hi: likewise for the high words
__m128i mm_lo = _mm_unpacklo_epi64(mm1, mm2);
__m128i mm_hi = _mm_unpackhi_epi64(mm1, mm2);
#else // defined(BOOST_UUID_USE_SSSE3)
// Without pshufb, reorder the 16-bit words in each 64-bit half from (w0, w1, w2, w3) to (w0, w2, w1, w3),
// so that every even 32-bit lane holds two low words and every odd 32-bit lane holds two high words.
mm1 = _mm_shufflelo_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0));
mm2 = _mm_shufflelo_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0));
mm1 = _mm_shufflehi_epi16(mm1, _MM_SHUFFLE(3, 1, 2, 0));
mm2 = _mm_shufflehi_epi16(mm2, _MM_SHUFFLE(3, 1, 2, 0));
// Gather the even 32-bit lanes (low words) and the odd 32-bit lanes (high words) of mm1 and mm2
__m128i mm_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
__m128i mm_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1)));
#endif // defined(BOOST_UUID_USE_SSSE3)
const __m128i mm_0 = _mm_setzero_si128();
// All-ones vector, used to invert the comparison result below
const __m128i mm_FF = _mm_cmpeq_epi32(mm_0, mm_0);
// mm_sat is 0xFFFF in every 16-bit lane whose source element had a non-zero high word, i.e. did not fit in 16 bits
__m128i mm_sat = _mm_xor_si128(_mm_cmpeq_epi16(mm_hi, mm_0), mm_FF);
// OR-ing forces the out-of-range lanes to 0xFFFF and leaves the in-range lanes unchanged
return _mm_or_si128(mm_lo, mm_sat);
#endif // defined(BOOST_UUID_USE_SSE41)
}
static BOOST_FORCEINLINE __m128i load_packed_16(const Char* p) noexcept
{
#if defined(BOOST_UUID_USE_AVX512_V1)
@@ -605,8 +680,8 @@ struct from_chars_simd_load_traits< Char, 4u >
return _mm_unpacklo_epi64(mm1, mm2);
#endif // defined(BOOST_UUID_TO_FROM_CHARS_X86_USE_ZMM)
#else
__m128i mm1 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4)));
__m128i mm2 = _mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12)));
__m128i mm1 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4)));
__m128i mm2 = mm_packus_epi32(_mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 8)), _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 12)));
return _mm_packus_epi16(mm1, mm2);
#endif
}
@@ -618,7 +693,7 @@ struct from_chars_simd_load_traits< Char, 4u >
#else
__m128i mm1 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
__m128i mm2 = _mm_setzero_si128();
return _mm_packus_epi16(_mm_packus_epi32(mm1, mm2), mm2);
return _mm_packus_epi16(mm_packus_epi32(mm1, mm2), mm2);
#endif
}
@@ -667,13 +742,19 @@ struct from_chars_simd_load_traits< Char, 4u >
mm_chars1 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
mm_chars2 = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p + 4));
}
mm_chars1 = _mm_packus_epi32(mm_chars1, mm_chars2);
mm_chars3 = _mm_packus_epi32(mm_chars3, mm_chars4);
mm_chars1 = mm_packus_epi32(mm_chars1, mm_chars2);
mm_chars3 = mm_packus_epi32(mm_chars3, mm_chars4);
return _mm_packus_epi16(mm_chars1, mm_chars3);
#endif
}
};
#if !defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_USE_SSSE3)
template< typename Char >
const simd_vector128< std::uint8_t > from_chars_simd_load_traits< Char, 4u >::mm_deinterleave_epi16_pattern =
{{ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F }};
#endif
#if defined(BOOST_GCC) && (BOOST_GCC >= 40600)
#pragma GCC diagnostic pop
#endif
@@ -689,7 +770,7 @@ BOOST_FORCEINLINE void from_chars_simd_core
__m128i const& mm_expected_dashes,
__m128i const& mm_char_code1_cmp,
__m128i const& mm_char_code2_cmp,
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
__m128i const& mm_char_code0_sub,
__m128i const& mm_char_code1_sub,
__m128i const& mm_char_code2_sub,
@@ -705,15 +786,24 @@ BOOST_FORCEINLINE void from_chars_simd_core
// |01234567-89ab-cd|ef-0123-456789ab|cdefXXXXXXXXXXXX|
//
// Check if dashes are in the expected positions
//
// mm_middle
// |-89ab-cdef-0123-|
const __m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), _mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1)));
{
// mm_dashes
// |-89ab-cdef-0123-|
__m128i mm_dashes = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_chars1), _mm_castsi128_pd(mm_chars2), _MM_SHUFFLE2(0, 1)));
if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_dashes, mm_expected_dashes), constants::mm_dashes_mask)))
#if defined(BOOST_UUID_USE_SSE41)
if (BOOST_UNLIKELY(!_mm_test_all_zeros(_mm_xor_si128(mm_middle, mm_expected_dashes), constants::mm_dashes_mask)))
#else
__m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask);
std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes)));
if (BOOST_UNLIKELY(dash_mask != 0xFFFF))
#endif
{
// Some of the dashes are missing
mm_dashes = _mm_and_si128(mm_dashes, constants::mm_dashes_mask);
#if defined(BOOST_UUID_USE_SSE41)
__m128i mm_dashes = _mm_and_si128(mm_middle, constants::mm_dashes_mask);
std::uint32_t dash_mask = static_cast< std::uint32_t >(_mm_movemask_epi8(_mm_cmpeq_epi8(mm_dashes, mm_expected_dashes)));
#endif
unsigned int pos = detail::countr_zero_nz(~dash_mask) + 8u;
if (pos < end_pos)
{
@@ -732,7 +822,11 @@ BOOST_FORCEINLINE void from_chars_simd_core
// mm_chars2: |02468ace13579bdf|
mm_chars1 = _mm_permutex2var_epi8(mm_chars1, constants::mm_split_half_bytes_pattern1, mm_chars2);
mm_chars2 = _mm_permutex2var_epi8(mm_chars2, constants::mm_split_half_bytes_pattern2, mm_chars3);
#else
// Group half-byte characters
__m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2);
__m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2);
#elif defined(BOOST_UUID_USE_SSSE3)
// mm_chars1: |02468acZ13579bdZ|
// mm_chars2: |02468aZe13579bZf|
// mm_chars3: |ZZZZZZceZZZZZZdf|
@@ -742,20 +836,47 @@ BOOST_FORCEINLINE void from_chars_simd_core
// mm_chars1: |02468ace13579bdf|
// mm_chars2: |02468ace13579bdf|
// Avoid using vpblendvb, which is slow on Intel
#if defined(BOOST_UUID_USE_AVX512_V1)
mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C)
#elif defined(BOOST_UUID_USE_AVX)
mm_chars1 = _mm_or_si128(mm_chars1, _mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask));
// Avoid using vpblendvb, which is slow on Intel
mm_chars1 = _mm_ternarylogic_epi64(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1, 0xD8); // (_MM_TERNLOG_A & ~_MM_TERNLOG_C) | (_MM_TERNLOG_B & _MM_TERNLOG_C)
#elif defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB)
mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask1);
#else
mm_chars1 = _mm_blendv_epi8(mm_chars1, mm_chars2, constants::mm_split_half_bytes_blend_mask);
mm_chars1 = _mm_or_si128(mm_chars1, _mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask1));
#endif
#if defined(BOOST_UUID_USE_SSE41)
mm_chars2 = _mm_blend_epi16(mm_chars2, mm_chars3, 0x88);
#else
mm_chars2 = _mm_or_si128(_mm_and_si128(mm_chars2, constants::mm_split_half_bytes_blend_mask2), mm_chars3);
#endif
// Group half-byte digits
// Group half-byte characters
__m128i mm_lo = _mm_unpacklo_epi64(mm_chars1, mm_chars2);
__m128i mm_hi = _mm_unpackhi_epi64(mm_chars1, mm_chars2);
#else
__m128i mm_lo, mm_hi;
{
// Remove dashes
__m128i mm_group1 = _mm_srli_epi64(mm_middle, 8);
__m128i mm_group2 = _mm_srli_si128(mm_middle, 6);
__m128i mm_group3 = _mm_srli_si128(mm_middle, 11);
mm_chars1 = _mm_unpacklo_epi64(mm_chars1, _mm_unpacklo_epi32(mm_group1, mm_group2));
mm_chars2 = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(mm_chars2), _mm_castsi128_pd(_mm_unpacklo_epi32(mm_group3, mm_chars3))));
mm_chars2 = _mm_shuffle_epi32(mm_chars2, _MM_SHUFFLE(1, 3, 2, 0));
// Deinterleave half-byte characters
__m128i mm_lo1 = _mm_srli_epi16(mm_chars1, 8);
__m128i mm_lo2 = _mm_srli_epi16(mm_chars2, 8);
__m128i mm_hi1 = _mm_and_si128(mm_chars1, constants::mm_split_half_byte_chars_mask);
__m128i mm_hi2 = _mm_and_si128(mm_chars2, constants::mm_split_half_byte_chars_mask);
mm_lo = _mm_packus_epi16(mm_lo1, mm_lo2);
mm_hi = _mm_packus_epi16(mm_hi1, mm_hi2);
}
#endif
// Convert characters to 8-bit integers. The algorithm is basically as follows:
//
@@ -778,6 +899,7 @@ BOOST_FORCEINLINE void from_chars_simd_core
// Note that there is one caveat due to the fact that there are only signed byte comparisons until AVX-512. This is a problem if the character encoding has
// hexadecimal character codes with the highest bit set to 1. This is handled in from_chars_simd_char_constants by constructing mm_char_code1 and
// mm_char_code2 in such a way that signed comparisons work as described. We also use signed comparisons in AVX-512 to reuse the same constants.
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
#if defined(BOOST_UUID_USE_AVX512_V1)
__mmask16 k_char_code2_mask_lo = _mm_cmpgt_epi8_mask(mm_lo, mm_char_code2_cmp);
__mmask16 k_char_code2_mask_hi = _mm_cmpgt_epi8_mask(mm_hi, mm_char_code2_cmp);
@@ -790,8 +912,29 @@ BOOST_FORCEINLINE void from_chars_simd_core
mm_char_code_sub_lo = _mm_mask_blend_epi8(k_char_code1_mask_lo, mm_char_code0_sub, mm_char_code_sub_lo);
mm_char_code_sub_hi = _mm_mask_blend_epi8(k_char_code1_mask_hi, mm_char_code0_sub, mm_char_code_sub_hi);
#elif defined(BOOST_UUID_USE_AVX)
// Unlike the legacy SSE4.1 pblendvb instruction, the VEX-coded vpblendvb is slow on Intel. Use a different approach:
#else
__m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp);
__m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp);
__m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp);
__m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp);
#if defined(BOOST_UUID_USE_SSE41) && defined(BOOST_UUID_FROM_CHARS_X86_USE_PBLENDVB)
__m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo);
__m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi);
mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo);
mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi);
#else
__m128i mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_lo, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_lo, mm_char_code2_sub));
__m128i mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code2_mask_hi, mm_char_code1_sub), _mm_and_si128(mm_char_code2_mask_hi, mm_char_code2_sub));
mm_char_code_sub_lo = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_lo, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_lo, mm_char_code_sub_lo));
mm_char_code_sub_hi = _mm_or_si128(_mm_andnot_si128(mm_char_code1_mask_hi, mm_char_code0_sub), _mm_and_si128(mm_char_code1_mask_hi, mm_char_code_sub_hi));
#endif
#endif
#else // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
// Use a different approach:
// - Each vpcmpgtb produces a mask, where 0 indicates false and -1 - true.
// - mm_char_code1_mask_* always overlaps with the corresponding mm_char_code2_mask_*, which means adding them
// produces a vector where 0 means none of the vpcmpgtb matched the value, -1 - where mm_char_code1_mask_* matched
@@ -814,26 +957,19 @@ BOOST_FORCEINLINE void from_chars_simd_core
const __m128i mm_char_code_sub = _mm_cvtsi32_si128(static_cast< int >(char_code_sub));
__m128i mm_char_code_sub_lo = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_lo);
__m128i mm_char_code_sub_hi = _mm_shuffle_epi8(mm_char_code_sub, mm_char_code_pattern_hi);
#else
__m128i mm_char_code2_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code2_cmp);
__m128i mm_char_code2_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code2_cmp);
__m128i mm_char_code1_mask_lo = _mm_cmpgt_epi8(mm_lo, mm_char_code1_cmp);
__m128i mm_char_code1_mask_hi = _mm_cmpgt_epi8(mm_hi, mm_char_code1_cmp);
__m128i mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_lo);
__m128i mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code1_sub, mm_char_code2_sub, mm_char_code2_mask_hi);
mm_char_code_sub_lo = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_lo, mm_char_code1_mask_lo);
mm_char_code_sub_hi = _mm_blendv_epi8(mm_char_code0_sub, mm_char_code_sub_hi, mm_char_code1_mask_hi);
#endif
#endif // defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
mm_lo = _mm_sub_epi8(mm_lo, mm_char_code_sub_lo);
mm_hi = _mm_sub_epi8(mm_hi, mm_char_code_sub_hi);
// Check hexadecimal character validity. Proper hexadecimal characters always convert to values of 0-15 and any other characters convert
// to values outside that range. Which means if the upper 4 bits of a resulting integer are non-zero then the corresponding character was invalid.
#if defined(BOOST_UUID_USE_SSE41)
if (BOOST_LIKELY(_mm_test_all_zeros(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0)))
#else
const __m128i mm_0 = _mm_setzero_si128();
if (BOOST_LIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(_mm_or_si128(mm_lo, mm_hi), constants::mm_F0), mm_0)) == 0xFFFF))
#endif
{
if (BOOST_LIKELY(ec == from_chars_error::none))
{
@@ -844,12 +980,13 @@ BOOST_FORCEINLINE void from_chars_simd_core
else
{
// Some of the hex digits are invalid
#if defined(BOOST_UUID_USE_SSE41)
const __m128i mm_0 = _mm_setzero_si128();
#endif
__m128i mm_hi_bits_lo = _mm_and_si128(mm_lo, constants::mm_F0);
__m128i mm_hi_bits_hi = _mm_and_si128(mm_hi, constants::mm_F0);
mm_hi_bits_lo = _mm_cmpeq_epi8(mm_hi_bits_lo, mm_0);
mm_hi_bits_hi = _mm_cmpeq_epi8(mm_hi_bits_hi, mm_0);
std::uint32_t digits_mask_lo = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_lo));
std::uint32_t digits_mask_hi = static_cast< std::uint32_t >(_mm_movemask_epi8(mm_hi_bits_hi));
@@ -930,7 +1067,7 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c
char_constants::mm_expected_dashes,
char_constants::mm_char_code1_cmp,
char_constants::mm_char_code2_cmp,
#if defined(BOOST_UUID_USE_AVX512_V1) || !defined(BOOST_UUID_USE_AVX)
#if defined(BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS)
char_constants::mm_char_code0_sub,
char_constants::mm_char_code1_sub,
char_constants::mm_char_code2_sub,
@@ -947,6 +1084,8 @@ BOOST_FORCEINLINE from_chars_result< Char > from_chars_simd(const Char* begin, c
} // namespace uuids
} // namespace boost
#endif // defined(BOOST_UUID_USE_SSE41)
#undef BOOST_UUID_DETAIL_FROM_CHARS_X86_USE_BLENDS
#endif // defined(BOOST_UUID_USE_SSE2)
#endif // BOOST_UUID_DETAIL_FROM_CHARS_X86_HPP_INCLUDED

View File

@@ -27,12 +27,22 @@ union simd_vector
>
BOOST_FORCEINLINE operator Vector () const noexcept { return get< Vector >(); }
#if defined(BOOST_GCC) && (BOOST_GCC >= 40600)
#pragma GCC diagnostic push
// dereferencing type-punned pointer will break strict-aliasing rules
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
// Returns the stored bytes reinterpreted as a value of type Vector (with cv-qualifiers and references stripped).
// The function only participates in overload resolution when Vector fits in the storage,
// i.e. when sizeof(Vector) <= ByteSize.
template< typename Vector >
BOOST_FORCEINLINE typename std::enable_if< sizeof(Vector) <= ByteSize, Vector >::type get() const noexcept
{
    using vector_type = typename std::remove_cv< typename std::remove_reference< Vector >::type >::type;
    return *reinterpret_cast< const vector_type* >(bytes);
}
// This guard must match the one around the push above; otherwise, on GCC 4.6/4.7 the diagnostic state
// would be pushed but never popped, and the suppressed warning would leak into code that follows this header.
#if defined(BOOST_GCC) && (BOOST_GCC >= 40600)
#pragma GCC diagnostic pop
#endif
};
template< typename T >

View File

@@ -11,7 +11,7 @@
#include <boost/uuid/detail/is_constant_evaluated.hpp>
#include <boost/uuid/detail/to_chars_generic.hpp>
#if defined(BOOST_UUID_USE_SSSE3)
#if defined(BOOST_UUID_USE_SSE2)
# include <boost/uuid/detail/to_chars_x86.hpp>
#elif defined(BOOST_UUID_REPORT_IMPLEMENTATION)
@@ -26,7 +26,7 @@ namespace detail {
template<class Ch> BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const& u, Ch* out ) noexcept
{
#if defined(BOOST_UUID_USE_SSSE3)
#if defined(BOOST_UUID_USE_SSE2)
if( detail::is_constant_evaluated_rt() )
{
return detail::to_chars_generic( u, out );
@@ -40,7 +40,6 @@ template<class Ch> BOOST_UUID_CXX14_CONSTEXPR_RT inline Ch* to_chars( uuid const
#endif
}
} // namespace detail
}} //namespace boost::uuids
}}} // namespace boost::uuids::detail
#endif // BOOST_UUID_DETAIL_TO_CHARS_HPP_INCLUDED

View File

@@ -7,7 +7,7 @@
#include <boost/uuid/detail/config.hpp>
#if defined(BOOST_UUID_USE_SSSE3)
#if defined(BOOST_UUID_USE_SSE2)
#include <cstdint>
#include <boost/uuid/uuid.hpp>
@@ -26,9 +26,12 @@ BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, AVX2" )
#elif defined(BOOST_UUID_USE_SSE41)
BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE4.1" )
#else
#elif defined(BOOST_UUID_USE_SSSE3)
BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" )
#else
BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSE2" )
#endif
#endif // #if defined(BOOST_UUID_REPORT_IMPLEMENTATION)
@@ -36,8 +39,10 @@ BOOST_PRAGMA_MESSAGE( "Using to_chars_x86.hpp, SSSE3" )
#include <immintrin.h>
#elif defined(BOOST_UUID_USE_SSE41)
#include <smmintrin.h>
#else
#elif defined(BOOST_UUID_USE_SSSE3)
#include <tmmintrin.h>
#else
#include <emmintrin.h>
#endif
namespace boost {
@@ -51,13 +56,31 @@ template<
>
// Vector constants used by the SIMD to_chars implementation. The set of constants depends on
// whether SSSE3 is available; the dash constant is declared in both configurations.
struct to_chars_simd_char_constants
{
#if defined(BOOST_UUID_USE_SSSE3)
// Table of the 16 hexadecimal digit characters
// NOTE(review): presumably indexed by half-byte value via a byte shuffle - confirm against the conversion code
static const simd_vector128< std::uint8_t > mm_char_table;
#else
// Without SSSE3 the characters are produced with additive constants instead of a table
// NOTE(review): presumably mm_char_0_add is added to every half-byte and mm_char_a_add only to values >= 10 - confirm
static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((0x61 - 10) - 0x30); // ('a' - 10) - '0' in ASCII
static const simd_vector128< std::uint8_t > mm_char_0_add;
static const simd_vector128< std::uint8_t > mm_char_a_add;
#endif
// Vector with the dash character in every byte
static const simd_vector128< std::uint8_t > mm_char_dash;
};
// Out-of-class definitions of the static constants of the primary to_chars_simd_char_constants template.
// Which set is defined depends on whether SSSE3 is enabled (table lookup) or not (arithmetic conversion).
#if defined(BOOST_UUID_USE_SSSE3)
// Lookup table: byte i holds the hexadecimal character for half-byte value i
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_table =
{{ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 }}; // 0123456789abcdef in ASCII
#else
// Base offset added to every half-byte value
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_0_add =
{{ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30 }}; // 0x30 is '0' in ASCII
// Additional offset, all 16 bytes equal to char_a_add
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_a_add =
{{
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add,
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add
}};
#endif
// Dash character replicated to all 16 bytes
template< typename Char, bool IsCharASCIICompatible, bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< Char, IsCharASCIICompatible, IsWCharASCIICompatible >::mm_char_dash =
{{ 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D }}; // ---------------- in ASCII
@@ -69,10 +92,17 @@ struct to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >
static_assert(static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('0') && static_cast< std::uint8_t >('-') < static_cast< std::uint8_t >('a'),
"Boost.UUID: Unsupported char encoding, '-' character code is expected to be less than any hexadecimal characters");
#if defined(BOOST_UUID_USE_SSSE3)
static const simd_vector128< std::uint8_t > mm_char_table;
#else
static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >(('a' - 10) - '0');
static const simd_vector128< std::uint8_t > mm_char_0_add;
static const simd_vector128< std::uint8_t > mm_char_a_add;
#endif
static const simd_vector128< std::uint8_t > mm_char_dash;
};
#if defined(BOOST_UUID_USE_SSSE3)
template< bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_table =
{{
@@ -81,6 +111,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false,
static_cast< std::uint8_t >('8'), static_cast< std::uint8_t >('9'), static_cast< std::uint8_t >('a'), static_cast< std::uint8_t >('b'),
static_cast< std::uint8_t >('c'), static_cast< std::uint8_t >('d'), static_cast< std::uint8_t >('e'), static_cast< std::uint8_t >('f')
}};
#else
// Non-SSSE3 constants for the specialization with Char = char and IsCharASCIICompatible = false.
// The values are taken from the native character literals instead of hard-coded ASCII codes.

// Every byte holds the native code of '0'
template< bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_0_add =
{{
static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'),
static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'),
static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'),
static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0'), static_cast< std::uint8_t >('0')
}};
// Every byte holds this specialization's char_a_add
template< bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_a_add =
{{
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add,
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add
}};
#endif
template< bool IsWCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< char, false, IsWCharASCIICompatible >::mm_char_dash =
{{
@@ -102,10 +148,17 @@ struct to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >
static_assert(static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'0') && static_cast< std::uint8_t >(L'-') < static_cast< std::uint8_t >(L'a'),
"Boost.UUID: Unsupported wchar_t encoding, L'-' character code is expected to be less than any hexadecimal characters");
#if defined(BOOST_UUID_USE_SSSE3)
static const simd_vector128< std::uint8_t > mm_char_table;
#else
static constexpr std::uint8_t char_a_add = static_cast< std::uint8_t >((L'a' - 10) - L'0');
static const simd_vector128< std::uint8_t > mm_char_0_add;
static const simd_vector128< std::uint8_t > mm_char_a_add;
#endif
static const simd_vector128< std::uint8_t > mm_char_dash;
};
#if defined(BOOST_UUID_USE_SSSE3)
template< bool IsCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_table =
{{
@@ -114,6 +167,22 @@ const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCh
static_cast< std::uint8_t >(L'8'), static_cast< std::uint8_t >(L'9'), static_cast< std::uint8_t >(L'a'), static_cast< std::uint8_t >(L'b'),
static_cast< std::uint8_t >(L'c'), static_cast< std::uint8_t >(L'd'), static_cast< std::uint8_t >(L'e'), static_cast< std::uint8_t >(L'f')
}};
#else
// Non-SSSE3 constants for the specialization with Char = wchar_t and IsWCharASCIICompatible = false.
// The values are taken from the native wide character literals, converted to std::uint8_t.

// Every byte holds the code of L'0' converted to std::uint8_t
template< bool IsCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_0_add =
{{
static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'),
static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'),
static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'),
static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0'), static_cast< std::uint8_t >(L'0')
}};
// Every byte holds this specialization's char_a_add
template< bool IsCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_a_add =
{{
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add,
char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add, char_a_add
}};
#endif
template< bool IsCharASCIICompatible >
const simd_vector128< std::uint8_t > to_chars_simd_char_constants< wchar_t, IsCharASCIICompatible, false >::mm_char_dash =
{{
@@ -127,25 +196,55 @@ template< typename >
// Character-encoding-independent 128-bit constants used by to_chars_simd_core.
struct to_chars_simd_constants
{
// Mask that keeps the low 4 bits of every byte
static const simd_vector128< std::uint8_t > mm_0F;
#if defined(BOOST_UUID_USE_SSSE3)
// Byte shuffle patterns; the 0x80 entries mark the positions where dashes are inserted
static const simd_vector128< std::uint8_t > mm_shuffle_pattern1;
static const simd_vector128< std::uint8_t > mm_shuffle_pattern2;
#else
// Every byte is 9; half-byte values greater than this need the additional 'a' offset
static const simd_vector128< std::uint8_t > mm_9;
// Byte masks selecting the three 4-character groups that are separated by dashes
static const simd_vector128< std::uint8_t > mm_group1_mask;
static const simd_vector128< std::uint8_t > mm_group2_mask;
static const simd_vector128< std::uint8_t > mm_group3_mask;
#endif
};
// Out-of-class definitions of the to_chars_simd_constants static members.

// Low half-byte mask, applied with _mm_and_si128 to split each input byte into two 4-bit values
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_0F =
{{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
#if defined(BOOST_UUID_USE_SSSE3)
// Shuffle indices for the first 16 output characters; the 0x80 entries (bytes 8 and 13) produce zero bytes
// that are later turned into dashes. NOTE(review): the shuffle call itself is outside the visible hunk - confirm.
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern1 =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x80, 0x0C, 0x0D }};
// Shuffle indices for the second 16 output characters; the 0x80 entries (bytes 2 and 7) correspond to
// positions 18 and 23 of the whole string
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_shuffle_pattern2 =
{{ 0x00, 0x01, 0x80, 0x02, 0x03, 0x04, 0x05, 0x80, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D }};
#else
// Threshold for _mm_cmpgt_epi8: half-byte values above 9 are converted to letters
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_9 =
{{ 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 }};
// Keeps bytes 1..4 (first group of the middle part); bytes 0, 5, 10 and 15 are zero in all three group masks
// so that dashes can be placed there
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group1_mask =
{{ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }};
// Keeps bytes 6..9 (second group)
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group2_mask =
{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }};
// Keeps bytes 11..14 (third group)
template< typename T >
const simd_vector128< std::uint8_t > to_chars_simd_constants< T >::mm_group3_mask =
{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00 }};
#endif
//! Converts UUID to a string of 36 characters, where the first 32 characters are returned in mm_chars1 and mm_chars2 and the last 4 in the highest 32 bits of mm_chars3
/*!
* Converts UUID to a string of 36 characters, where the first 32 characters are returned in mm_chars1 and mm_chars2.
* When SSSE3 is supported, last 4 characters are returned in the highest 32 bits of mm_chars3, otherwise in the lowest 32 bits.
*/
BOOST_FORCEINLINE void to_chars_simd_core
(
const std::uint8_t* data,
__m128i const& mm_char_table, __m128i const& mm_char_dash,
#if defined(BOOST_UUID_USE_SSSE3)
__m128i const& mm_char_table,
#else
__m128i const& mm_char_0_add, __m128i const& mm_char_a_add,
#endif
__m128i const& mm_char_dash,
__m128i& mm_chars1, __m128i& mm_chars2, __m128i& mm_chars3
) noexcept
{
@@ -154,18 +253,31 @@ BOOST_FORCEINLINE void to_chars_simd_core
__m128i mm_input = _mm_loadu_si128(reinterpret_cast< const __m128i* >(data));
// Split half-bytes
__m128i const& mm_0F = constants::mm_0F;
__m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), mm_0F);
__m128i mm_input_lo = _mm_and_si128(mm_input, mm_0F);
__m128i mm_input_hi = _mm_and_si128(_mm_srli_epi32(mm_input, 4), constants::mm_0F);
__m128i mm_input_lo = _mm_and_si128(mm_input, constants::mm_0F);
// Stringize each of the halves
#if defined(BOOST_UUID_USE_SSSE3)
mm_input_hi = _mm_shuffle_epi8(mm_char_table, mm_input_hi);
mm_input_lo = _mm_shuffle_epi8(mm_char_table, mm_input_lo);
#else
{
__m128i mm_add_mask_hi = _mm_cmpgt_epi8(mm_input_hi, constants::mm_9);
__m128i mm_add_mask_lo = _mm_cmpgt_epi8(mm_input_lo, constants::mm_9);
__m128i mm_add_hi = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_hi, mm_char_a_add));
__m128i mm_add_lo = _mm_add_epi8(mm_char_0_add, _mm_and_si128(mm_add_mask_lo, mm_char_a_add));
mm_input_hi = _mm_add_epi8(mm_input_hi, mm_add_hi);
mm_input_lo = _mm_add_epi8(mm_input_lo, mm_add_lo);
}
#endif
// Join them back together
__m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo);
__m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo);
#if defined(BOOST_UUID_USE_SSSE3)
// Insert dashes at positions 8, 13, 18 and 23
// mm_1 mm_2
// |0123456789abcdef|0123456789abcdef|
@@ -178,6 +290,32 @@ BOOST_FORCEINLINE void to_chars_simd_core
mm_chars1 = _mm_max_epu8(mm_chars1, mm_char_dash);
mm_chars2 = _mm_max_epu8(mm_chars2, mm_char_dash);
mm_chars3 = mm_2;
#else
// Split groups of characters between dashes and shift them into their places
// mm_middle: |89abcdef01234567|
// mm_group1: |Z89abZZZZZZZZZZZ|
// mm_group2: |ZZZZZZcdefZZZZZZ|
// mm_group3: |ZZZZZZZZZZZ0123Z|
__m128i mm_middle = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(mm_1), _mm_castsi128_pd(mm_2), _MM_SHUFFLE2(0, 1)));
__m128i mm_group1 = _mm_slli_epi64(mm_middle, 8);
__m128i mm_group2 = _mm_slli_si128(mm_middle, 2);
__m128i mm_group3 = _mm_slli_epi64(mm_middle, 24);
mm_group1 = _mm_and_si128(mm_group1, constants::mm_group1_mask);
mm_group2 = _mm_and_si128(mm_group2, constants::mm_group2_mask);
mm_group3 = _mm_and_si128(mm_group3, constants::mm_group3_mask);
// Merge them back and insert dashes
// mm_middle: |-89ab-cdef-0123-|
mm_middle = _mm_or_si128(_mm_or_si128(mm_group1, mm_group2), mm_group3);
mm_middle = _mm_max_epu8(mm_middle, mm_char_dash);
// mm_2: |cdef0123456789ab|
mm_2 = _mm_shuffle_epi32(mm_2, _MM_SHUFFLE(2, 1, 0, 3));
mm_chars1 = _mm_unpacklo_epi64(mm_1, mm_middle);
mm_chars2 = _mm_unpackhi_epi64(mm_middle, mm_2);
mm_chars3 = mm_2;
#endif
}
#if defined(BOOST_MSVC)
@@ -195,7 +333,12 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept
uuids::detail::to_chars_simd_core
(
u.data(),
#if defined(BOOST_UUID_USE_SSSE3)
char_constants::mm_char_table,
#else
char_constants::mm_char_0_add,
char_constants::mm_char_a_add,
#endif
char_constants::mm_char_dash,
mm_chars1, mm_chars2, mm_chars3
);
@@ -205,11 +348,17 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept
{
_mm_storeu_si128(reinterpret_cast< __m128i* >(out), mm_chars1);
_mm_storeu_si128(reinterpret_cast< __m128i* >(out + 16), mm_chars2);
detail::store_native_u32
(
out + 32,
#if defined(BOOST_UUID_USE_SSE41)
detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3)));
static_cast< std::uint32_t >(_mm_extract_epi32(mm_chars3, 3))
#elif defined(BOOST_UUID_USE_SSSE3)
static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12)))
#else
detail::store_native_u32(out + 32, static_cast< std::uint32_t >(_mm_cvtsi128_si32(_mm_srli_si128(mm_chars3, 12))));
static_cast< std::uint32_t >(_mm_cvtsi128_si32(mm_chars3))
#endif
);
}
else BOOST_IF_CONSTEXPR (sizeof(Char) == 2u)
{
@@ -225,8 +374,10 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept
#endif
#if defined(BOOST_UUID_USE_SSE41) && (defined(__x86_64__) || defined(_M_X64))
detail::store_native_u64(out + 32, static_cast< std::uint64_t >(_mm_extract_epi64(_mm_unpackhi_epi8(mm_chars3, mm_0), 1)));
#else
#elif defined(BOOST_UUID_USE_SSSE3)
_mm_storeh_pd(reinterpret_cast< BOOST_MAY_ALIAS double* >(out + 32), _mm_castsi128_pd(_mm_unpackhi_epi8(mm_chars3, mm_0)));
#else
_mm_storel_epi64(reinterpret_cast< __m128i* >(out + 32), _mm_unpacklo_epi8(mm_chars3, mm_0));
#endif
}
else
@@ -255,7 +406,15 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept
_mm_storeu_si128(reinterpret_cast< __m128i* >(out + 24), _mm_unpacklo_epi16(mm, mm_0));
_mm_storeu_si128(reinterpret_cast< __m128i* >(out + 28), _mm_unpackhi_epi16(mm, mm_0));
#endif
_mm_storeu_si128(reinterpret_cast< __m128i* >(out + 32), _mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0));
_mm_storeu_si128
(
reinterpret_cast< __m128i* >(out + 32),
#if defined(BOOST_UUID_USE_SSSE3)
_mm_unpackhi_epi16(_mm_unpackhi_epi8(mm_chars3, mm_0), mm_0)
#else
_mm_unpacklo_epi16(_mm_unpacklo_epi8(mm_chars3, mm_0), mm_0)
#endif
);
}
return out + 36;
@@ -269,6 +428,6 @@ BOOST_FORCEINLINE Char* to_chars_simd(uuid const& u, Char* out) noexcept
} // namespace uuids
} // namespace boost
#endif // defined(BOOST_UUID_USE_SSSE3)
#endif // defined(BOOST_UUID_USE_SSE2)
#endif // BOOST_UUID_DETAIL_TO_CHARS_X86_HPP_INCLUDED

View File

@@ -34,6 +34,20 @@ import path ;
import regex ;
import testing ;
# Local replacement for the `run` rule that allows suppressing the running of tests and only compiling them instead.
# This is useful e.g. if the tests are compiled for a target ISA that is not supported by the CPU.
#
# If the BOOST_UUID_SKIP_RUNNING_TESTS environment variable yields a non-empty value, the call is forwarded
# to testing.compile with only sources, requirements and target-name; args, input-files and default-build
# are not passed on. Otherwise all arguments are forwarded unchanged to testing.run.
local rule run ( sources + : args * : input-files * : requirements * : target-name ? : default-build * )
{
if [ os.environ BOOST_UUID_SKIP_RUNNING_TESTS ]
{
# Compile-only mode: the test is built but never executed
return [ testing.compile $(sources) : $(requirements) : $(target-name) ] ;
}
else
{
# Normal mode: build and run the test
return [ testing.run $(sources) : $(args) : $(input-files) : $(requirements) : $(target-name) : $(default-build) ] ;
}
}
# this rule enumerates through all the headers and ensures
# that inclusion of the header by itself is sufficient to
# compile successfully, proving the header does not depend