From e0c97bbdc55fa65d334139242c8fa7533ffc7e7e Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Thu, 9 May 2013 16:53:02 +0000 Subject: [PATCH] Implemented AVX2 version of dump. [SVN r84204] --- build/Jamfile.v2 | 4 +- src/dump_avx2.cpp | 174 ++++++++++++++++++++++------------- test/run/util_manip_dump.cpp | 25 +++++ 3 files changed, 136 insertions(+), 67 deletions(-) diff --git a/build/Jamfile.v2 b/build/Jamfile.v2 index 8eb53d8..c101ac0 100644 --- a/build/Jamfile.v2 +++ b/build/Jamfile.v2 @@ -286,7 +286,7 @@ rule avx2-targets-cond ( properties * ) if ! [ version.version-less $(version) : 4 7 ] { - result = "-march=core-avx2 -mavx -mavx2" ; + result = "-march=core-avx2 -mavx -mavx2 -fabi-version=0" ; } } else if msvc in $(properties) @@ -325,7 +325,7 @@ rule avx2-targets-cond ( properties * ) } else { - result = "-march=core-avx2 -mavx -mavx2" ; + result = "-march=core-avx2 -mavx -mavx2 -fabi-version=0" ; } } } diff --git a/src/dump_avx2.cpp b/src/dump_avx2.cpp index 9ca0aae..0e787d8 100644 --- a/src/dump_avx2.cpp +++ b/src/dump_avx2.cpp @@ -40,59 +40,112 @@ BOOST_LOG_ANONYMOUS_NAMESPACE { enum { packs_per_stride = 32, - stride = packs_per_stride * 16 + stride = packs_per_stride * 32 }; -union xmm_constant +union ymm_constant { - uint8_t as_bytes[16]; - __m128i as_mm; + uint8_t as_bytes[32]; + __m256i as_mm; }; -static const xmm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }}; -static const xmm_constant mm_9 = {{ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 }}; -static const xmm_constant mm_char_0 = {{ '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }}; -static const xmm_constant mm_char_space_mask = {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }}; -static const xmm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }}; -static const xmm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }}; -static const xmm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; +static const ymm_constant mm_char_space_mask = {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }}; +static const ymm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }}; +static const ymm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }}; +static const ymm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; +static const ymm_constant mm_shuffle_pattern13 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; //! Dumps a pack of input data into a string of 8 bit ASCII characters -static BOOST_LOG_FORCEINLINE void dump_pack(__m128i mm_char_10_to_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3) +static BOOST_LOG_FORCEINLINE void dump_pack(__m256i mm_char_a, __m256i mm_input, __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3) { // Split half-bytes - __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi16(mm_input, 4), mm_15.as_mm); - __m128i mm_input_lo = _mm_and_si128(mm_input, mm_15.as_mm); + const __m256i mm_15 = _mm256_set1_epi8(0x0F); + __m256i mm_input_hi = _mm256_and_si256(_mm256_srli_epi16(mm_input, 4), mm_15); + __m256i mm_input_lo = _mm256_and_si256(mm_input, mm_15); // Stringize each of the halves - __m128i mm_addend_hi = _mm_cmpgt_epi8(mm_input_hi, mm_9.as_mm); - __m128i mm_addend_lo = _mm_cmpgt_epi8(mm_input_lo, mm_9.as_mm); - mm_addend_hi = _mm_and_si128(mm_char_10_to_a, mm_addend_hi); - mm_addend_lo = _mm_and_si128(mm_char_10_to_a, mm_addend_lo); + const __m256i mm_9 = _mm256_set1_epi8(9); + __m256i mm_addend_hi = _mm256_cmpgt_epi8(mm_input_hi, mm_9); + __m256i mm_addend_lo = _mm256_cmpgt_epi8(mm_input_lo, mm_9); + const __m256i mm_char_0 = _mm256_set1_epi8('0'); + mm_addend_hi = _mm256_blendv_epi8(mm_char_0, mm_char_a, mm_addend_hi); + mm_addend_lo = _mm256_blendv_epi8(mm_char_0, mm_char_a, mm_addend_lo); - mm_input_hi = _mm_add_epi8(mm_input_hi, mm_char_0.as_mm); - mm_input_lo = _mm_add_epi8(mm_input_lo, mm_char_0.as_mm); - - mm_input_hi = _mm_add_epi8(mm_input_hi, mm_addend_hi); - mm_input_lo = _mm_add_epi8(mm_input_lo, mm_addend_lo); + mm_input_hi = _mm256_add_epi8(mm_input_hi, mm_addend_hi); + mm_input_lo = _mm256_add_epi8(mm_input_lo, mm_addend_lo); // Join them back together - __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo); - __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo); + __m256i mm_1 = _mm256_unpacklo_epi8(mm_input_hi, mm_input_lo); + __m256i mm_2 = _mm256_unpackhi_epi8(mm_input_hi, mm_input_lo); // Insert spaces between stringized bytes: // |0123456789abcdef|0123456789abcdef| // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef| - mm_output1 = _mm_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm); - mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm); - mm_output3 = _mm_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm); + __m256i mm_out1 = _mm256_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm); + __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm); + __m256i mm_out3 = _mm256_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm); - __m128i mm_char_space = mm_char_space_mask.as_mm; - mm_output1 = _mm_or_si128(mm_output1, mm_char_space); + __m256i mm_char_space = mm_char_space_mask.as_mm; + mm_out1 = _mm256_or_si256(mm_out1, mm_char_space); + mm_char_space = _mm256_srli_si256(mm_char_space, 1); + mm_out2 = _mm256_or_si256(mm_out2, mm_char_space); + mm_char_space = _mm256_srli_si256(mm_char_space, 1); + mm_out3 = _mm256_or_si256(mm_out3, mm_char_space); + + mm_output1 = _mm256_permute2x128_si256(mm_out1, mm_out2, 2u << 4 | 0u); + mm_output2 = _mm256_permute2x128_si256(mm_out3, mm_out1, 3u << 4 | 0u); + mm_output3 = _mm256_permute2x128_si256(mm_out2, mm_out3, 3u << 4 | 1u); +} + +//! Dumps a pack of input data into a string of 8 bit ASCII characters +static BOOST_LOG_FORCEINLINE void dump_pack(__m256i mm_char_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3) +{ + // Split half-bytes + __m128i mm_input_hi = _mm_srli_epi16(mm_input, 4); + __m256i mm = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi8(mm_input_hi, mm_input)), _mm_unpackhi_epi8(mm_input_hi, mm_input), 1); + mm = _mm256_and_si256(mm, _mm256_set1_epi8(0x0F)); + + // Stringize the halves + __m256i mm_addend = _mm256_cmpgt_epi8(mm, _mm256_set1_epi8(9)); + mm_addend = _mm256_blendv_epi8(_mm256_set1_epi8('0'), mm_char_a, mm_addend); + mm = _mm256_add_epi8(mm, mm_addend); + + // Insert spaces between stringized bytes: + __m256i mm_out13 = _mm256_shuffle_epi8(mm, mm_shuffle_pattern13.as_mm); + __m128i mm_out2 = _mm_shuffle_epi8(_mm_alignr_epi8(_mm256_extractf128_si256(mm, 1), _mm256_castsi256_si128(mm), 10), _mm256_castsi256_si128(mm_shuffle_pattern2.as_mm)); + + __m128i mm_char_space = _mm256_castsi256_si128(mm_char_space_mask.as_mm); + mm_output1 = _mm_or_si128(_mm256_castsi256_si128(mm_out13), mm_char_space); mm_char_space = _mm_srli_si128(mm_char_space, 1); - mm_output2 = _mm_or_si128(mm_output2, mm_char_space); + mm_output2 = _mm_or_si128(mm_out2, mm_char_space); mm_char_space = _mm_srli_si128(mm_char_space, 1); - mm_output3 = _mm_or_si128(mm_output3, mm_char_space); + mm_output3 = _mm_or_si128(_mm256_extractf128_si256(mm_out13, 1), mm_char_space); +} + +template< typename CharT > +BOOST_LOG_FORCEINLINE void store_characters(__m256i mm_chars, CharT* buf) +{ + switch (sizeof(CharT)) + { + case 1: + _mm256_store_si256(reinterpret_cast< __m256i* >(buf), mm_chars); + break; + + case 2: + _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi16(_mm256_castsi256_si128(mm_chars))); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi16(_mm256_extractf128_si256(mm_chars, 1))); + break; + + case 4: + { + __m256i mm = _mm256_unpackhi_epi64(mm_chars, mm_chars); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(mm_chars))); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi32(_mm256_castsi256_si128(mm))); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 2, _mm256_cvtepu8_epi32(_mm256_extractf128_si256(mm_chars, 1))); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 3, _mm256_cvtepu8_epi32(_mm256_extractf128_si256(mm, 1))); + } + break; + } } template< typename CharT > @@ -105,22 +158,14 @@ BOOST_LOG_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf) break; case 2: - { - __m128i mm_0 = _mm_setzero_si128(); - _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi8(mm_chars, mm_0)); - _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi8(mm_chars, mm_0)); - } + _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi16(mm_chars)); break; case 4: { - __m128i mm_0 = _mm_setzero_si128(); - __m128i mm = _mm_unpacklo_epi8(mm_chars, mm_0); - _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi16(mm, mm_0)); - _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi16(mm, mm_0)); - mm = _mm_unpackhi_epi8(mm_chars, mm_0); - _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 2, _mm_unpacklo_epi16(mm, mm_0)); - _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 3, _mm_unpackhi_epi16(mm, mm_0)); + __m128i mm = _mm_unpackhi_epi64(mm_chars, mm_chars); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi32(mm_chars)); + _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi32(mm)); } break; } @@ -131,29 +176,26 @@ BOOST_LOG_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, st { typedef CharT char_type; - char_type buf_storage[stride * 3u + 16u]; - // Align the temporary buffer at 16 bytes - char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (16u - (((uintptr_t)(char_type*)buf_storage) & 15u))); + char_type buf_storage[stride * 3u + 32u]; + // Align the temporary buffer at 32 bytes + char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u))); char_type* buf_begin = buf + 1u; // skip the first space of the first chunk char_type* buf_end = buf + stride * 3u; - __m128i mm_char_10_to_a; - if (strm.flags() & std::ios_base::uppercase) - mm_char_10_to_a = _mm_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters - else - mm_char_10_to_a = _mm_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters + const __m256i mm_char_a = _mm256_set1_epi8(((strm.flags() & std::ios_base::uppercase) ? 'A' : 'a') - 10); // First, check the input alignment const uint8_t* p = static_cast< const uint8_t* >(data); - if (const std::size_t prealign_size = ((16u - ((uintptr_t)p & 15u)) & 15u)) + if (const std::size_t prealign_size = ((32u - ((uintptr_t)p & 31u)) & 31u)) { - __m128i mm_input = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p)); - __m128i mm_output1, mm_output2, mm_output3; - dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3); + __m256i mm_input = _mm256_lddqu_si256(reinterpret_cast< const __m256i* >(p)); + __m256i mm_output1, mm_output2, mm_output3; + dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3); store_characters(mm_output1, buf); - store_characters(mm_output2, buf + 16u); - store_characters(mm_output3, buf + 32u); + store_characters(mm_output2, buf + 32u); + store_characters(mm_output3, buf + 64u); + _mm256_zeroupper(); strm.write(buf_begin, prealign_size * 3u - 1u); buf_begin = buf; size -= prealign_size; @@ -165,16 +207,17 @@ BOOST_LOG_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, st for (std::size_t i = 0; i < stride_count; ++i) { char_type* b = buf; - for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 16u, p += 16u) + for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u) { - __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p)); - __m128i mm_output1, mm_output2, mm_output3; - dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3); + __m256i mm_input = _mm256_load_si256(reinterpret_cast< const __m256i* >(p)); + __m256i mm_output1, mm_output2, mm_output3; + dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3); store_characters(mm_output1, b); - store_characters(mm_output2, b + 16u); - store_characters(mm_output3, b + 32u); + store_characters(mm_output2, b + 32u); + store_characters(mm_output3, b + 64u); } + _mm256_zeroupper(); strm.write(buf_begin, buf_end - buf_begin); buf_begin = buf; } @@ -186,7 +229,7 @@ BOOST_LOG_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, st { __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p)); __m128i mm_output1, mm_output2, mm_output3; - dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3); + dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3); store_characters(mm_output1, b); store_characters(mm_output2, b + 16u); store_characters(mm_output3, b + 32u); @@ -204,6 +247,7 @@ BOOST_LOG_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, st b[2] = static_cast< char_type >(char_table[n & 0x0F]); } + _mm256_zeroupper(); strm.write(buf_begin, b - buf_begin); } } diff --git a/test/run/util_manip_dump.cpp b/test/run/util_manip_dump.cpp index 16a2b9d..13d825a 100644 --- a/test/run/util_manip_dump.cpp +++ b/test/run/util_manip_dump.cpp @@ -130,6 +130,31 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(unbounded_binary_large_dump, CharT, char_types) BOOST_CHECK(equal_strings(strm_dump.str(), strm_correct.str())); } +// Test SIMD tail handling +BOOST_AUTO_TEST_CASE_TEMPLATE(unbounded_binary_tail_dump, CharT, char_types) +{ + typedef CharT char_type; + typedef std::basic_string< char_type > string_type; + typedef std::basic_ostringstream< char_type > ostream_type; + + std::vector< unsigned char > data; + ostream_type strm_correct; + // 1023 makes it very unlikely for the buffer to end at 16 or 32 byte boundary, which makes the dump algorithm to process the tail in a special way + for (unsigned int i = 0; i < 1023; ++i) + { + unsigned char n = static_cast< unsigned char >(i); + data.push_back(n); + if (i > 0) + strm_correct << " "; + strm_correct << std::hex << std::setw(2) << std::setfill(static_cast< char_type >('0')) << static_cast< unsigned int >(n); + } + + ostream_type strm_dump; + strm_dump << logging::dump(&data[0], data.size()); + + BOOST_CHECK(equal_strings(strm_dump.str(), strm_correct.str())); +} + // Test bounded dump BOOST_AUTO_TEST_CASE_TEMPLATE(bounded_binary_dump, CharT, char_types) {