diff --git a/build/Jamfile.v2 b/build/Jamfile.v2
index ae024db..09fa259 100644
--- a/build/Jamfile.v2
+++ b/build/Jamfile.v2
@@ -5,9 +5,13 @@
 # http://www.boost.org/LICENSE_1_0.txt)
 #
 
+import common ;
 import modules ;
 import os ;
 import feature ;
+import version ;
+import property ;
+import architecture ;
 using mc ;
 
 lib psapi ;
@@ -23,9 +27,41 @@ local rule default_logapi ( )
 feature.feature logapi : unix winnt : propagated ;
 feature.set-default logapi : [ default_logapi ] ;
 
+rule select-instruction-set ( properties * ) # Conditional requirement: returns the selected property, "no" for i386/i486, or i686 when nothing was selected
+{
+    local result ;
+
+    if x86 in $(properties) && 32 in $(properties) # NOTE(review): bare tokens such as x86 / 32 / no look like properties whose <feature> prefix is missing in this copy - confirm against the original Jamfile
+    {
+        result = [ property.select : $(properties) ] ;
+        if $(result)
+        {
+            if $(result) = i386 || $(result) = i486
+            {
+                if ! $(.annouced-failure) # the flag is set right after the message, so the message is printed only the first time this branch is reached
+                {
+                    ECHO Boost.Log is not supported on the specified target CPU and will not be built. At least i586 class CPU is required. ;
+                    .annouced-failure = 1 ;
+                }
+                result = no ;
+            }
+        }
+        else
+        {
+            # We build for Pentium Pro and later CPUs by default. This is used as the target in many Linux distributions, and Windows and OS X also seem to not support older CPUs.
+            result = i686 ;
+        }
+    }
+
+    return $(result) ; # empty unless both x86 and 32 were found in the properties
+}
+
 project boost/log
     : source-location ../src
     : requirements
+        [ architecture.architecture ]
+        [ architecture.address-model ]
+        @select-instruction-set
         BOOST_SPIRIT_USE_PHOENIX_V3=1
         BOOST_THREAD_DONT_USE_CHRONO=1 # Don't introduce false dependency on Boost.Chrono
         unix:BOOST_LOG_USE_NATIVE_SYSLOG=1
@@ -95,9 +131,191 @@ local BOOST_LOG_COMMON_SRC =
     dump.cpp
     ;
 
+local BOOST_LOG_COMMON_SSSE3_SRC =
+    dump_ssse3
+    ;
+
+local BOOST_LOG_COMMON_AVX2_SRC =
+    dump_avx2
+    ;
+
+rule ssse3-targets-cond ( properties * ) # Conditional requirement: returns the SSSE3 compiler switches for the toolset found in the properties, or "no"
+{
+    local result = no ; # stays "no" for non-x86 targets, unknown toolsets and toolset versions below the thresholds checked here
+
+    if x86 in $(properties)
+    {
+        if gcc in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-gcc:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ; # version components, split at the dots
+
+            if ! [ version.version-less $(version) : 4 3 ] # i.e. the version is not less than 4.3
+            {
+                result = "-march=core2 -msse -msse2 -msse3 -mssse3" ;
+            }
+        }
+        else if msvc in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-msvc:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            if ! [ version.version-less $(version) : 9 0 ]
+            {
+                # MSVC doesn't really care about these switches, all SSE intrinsics are always available, but still...
+                result = "/arch:SSE2" ;
+            }
+        }
+        else if clang in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-clang:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            # I don't know which version started to support SSSE3, but we don't support versions before 3.2 anyway and it has support for SSSE3
+            if ! [ version.version-less $(version) : 3 2 ]
+            {
+                result = "-march=core2 -msse -msse2 -msse3 -mssse3" ;
+            }
+        }
+        else if intel in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-intel:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            if ! [ version.version-less $(version) : 10 0 ]
+            {
+                if win in $(properties) # the Windows flavour of the compiler takes MSVC-style /Q switches
+                {
+                    if ! [ version.version-less $(version) : 11 0 ]
+                    {
+                        result = "/QxSSSE3" ;
+                    }
+                    else
+                    {
+                        result = "/QxT" ; # switch used for versions 10.x, which are below the 11.0 threshold above
+                    }
+                }
+                else
+                {
+                    result = "-march=core2 -msse -msse2 -msse3 -mssse3" ;
+                }
+            }
+        }
+    }
+
+#    if ! no in $(result)
+#    {
+#        ECHO Boost.Log: Using SSSE3 optimized implementation ;
+#    }
+#    ECHO $(result) ;
+
+    return $(result) ;
+}
+
+for local src in $(BOOST_LOG_COMMON_SSSE3_SRC) # one object target per SSSE3 source, with ssse3-targets-cond as a conditional requirement
+{
+    obj $(src)
+        : ## sources ##
+            $(src).cpp
+        : ## requirements ##
+            @ssse3-targets-cond
+            shared:BOOST_LOG_DLL
+            BOOST_LOG_BUILDING_THE_LIB=1
+        : ## default-build ##
+        : ## usage-requirements ##
+            BOOST_LOG_USE_SSSE3
+        ;
+}
+
+
+rule avx2-targets-cond ( properties * ) # Conditional requirement: returns the AVX2 compiler switches for the toolset found in the properties, or "no"
+{
+    local result = no ; # stays "no" for non-x86 targets, unknown toolsets and toolset versions below the thresholds checked here
+
+    if x86 in $(properties)
+    {
+        if gcc in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-gcc:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ; # version components, split at the dots
+
+            if ! [ version.version-less $(version) : 4 7 ] # i.e. the version is not less than 4.7
+            {
+                result = "-march=core-avx2 -mavx -mavx2" ;
+            }
+        }
+        else if msvc in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-msvc:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            if ! [ version.version-less $(version) : 11 0 ]
+            {
+                result = "/arch:AVX" ; # NOTE(review): this is the AVX switch, not an AVX2 one - confirm it is what the AVX2 source needs
+            }
+        }
+        else if clang in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-clang:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            # I don't know which version started to support AVX2, but we don't support versions before 3.2 anyway and it has support for AVX2
+            if ! [ version.version-less $(version) : 3 2 ]
+            {
+                result = "-march=core-avx2 -mavx -mavx2" ;
+            }
+        }
+        else if intel in $(properties)
+        {
+            local string_version = [ feature.get-values "toolset-intel:version" : $(properties) ] ;
+            local version = [ SPLIT_BY_CHARACTERS $(string_version) : "." ] ;
+
+            # AVX2 support added in Composer XE 2011 Update 7, while the original Composer XE 2011 had icc version 12.
+            # I don't know what version Update 7 had, so to be on the safe side enable this optimization since version 13.
+            if ! [ version.version-less $(version) : 13 0 ]
+            {
+                if win in $(properties) # the Windows flavour of the compiler takes MSVC-style /Q switches
+                {
+                    result = "/QxAVX2" ;
+                }
+                else
+                {
+                    result = "-march=core-avx2 -mavx -mavx2" ;
+                }
+            }
+        }
+    }
+
+#    if ! no in $(result)
+#    {
+#        ECHO Boost.Log: Using AVX2 optimized implementation ;
+#    }
+#    ECHO $(result) ;
+
+    return $(result) ;
+}
+
+for local src in $(BOOST_LOG_COMMON_AVX2_SRC) # one object target per AVX2 source, with avx2-targets-cond as a conditional requirement
+{
+    obj $(src)
+        : ## sources ##
+            $(src).cpp
+        : ## requirements ##
+            @avx2-targets-cond
+            shared:BOOST_LOG_DLL
+            BOOST_LOG_BUILDING_THE_LIB=1
+        : ## default-build ##
+        : ## usage-requirements ##
+            BOOST_LOG_USE_AVX2
+        ;
+}
+
+
 lib boost_log
     : ## sources ##
         $(BOOST_LOG_COMMON_SRC)
+        $(BOOST_LOG_COMMON_SSSE3_SRC)
+        $(BOOST_LOG_COMMON_AVX2_SRC)
         ## winnt sources ##
         $(BOOST_LOG_MC_SRC)
         event_log_backend.cpp
@@ -118,6 +336,8 @@ lib boost_log
 lib boost_log
     : ## sources ##
         $(BOOST_LOG_COMMON_SRC)
+        $(BOOST_LOG_COMMON_SSSE3_SRC)
+        $(BOOST_LOG_COMMON_AVX2_SRC)
         ## unix sources ##
     : ## requirements ##
         shared:BOOST_LOG_DLL
diff --git a/build/architecture.jam b/build/architecture.jam
new file mode 100644
index 0000000..e89e317
--- /dev/null
+++ b/build/architecture.jam
@@ -0,0 +1,80 @@
+# architecture.jam
+#
+# Copyright 2012 Steven Watanabe
+#
+# Distributed under the Boost Software License Version 1.0.
+# (See
+# accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+import configure ;
+import project ;
+import path ;
+import property ;
+
+# Location of this module file; used to find the ../config project relative to it.
+local here = [ modules.binding $(__name__) ] ;
+
+# Load the ../config project, which is addressed below as /boost/architecture, while
+# keeping the current project unchanged for the caller.
+project.push-current [ project.current ] ;
+project.load [ path.join [ path.make $(here:D) ] ../config ] ;
+project.pop-current ;
+
+# Returns the property already selected from the given properties if there is one. Otherwise
+# probes the /boost/architecture//32 and //64 targets with configure.builds and returns 32 or 64
+# for the first one that builds; returns nothing if neither builds.
+# NOTE(review): the argument of property.select and the <feature> prefixes of the returned values
+# appear to be missing in this copy - confirm against the original file.
+rule deduce-address-model ( properties * )
+{
+    local result = [ property.select : $(properties) ] ;
+    if $(result)
+    {
+        return $(result) ;
+    }
+    else
+    {
+        if [ configure.builds /boost/architecture//32 : $(properties) : 32-bit ]
+        {
+            return 32 ;
+        }
+        else if [ configure.builds /boost/architecture//64 : $(properties) : 64-bit ]
+        {
+            return 64 ;
+        }
+    }
+}
+
+# Returns the indirect conditional requirement that invokes deduce-address-model.
+rule address-model ( )
+{
+    return @architecture.deduce-address-model ;
+}
+
+# Returns the property already selected from the given properties if there is one. Otherwise
+# probes the /boost/architecture//<name> targets in the order x86, arm, mips1, power, sparc and
+# returns the name of the first one that builds; returns nothing if none of them builds.
+rule deduce-architecture ( properties * )
+{
+    local result = [ property.select : $(properties) ] ;
+    if $(result)
+    {
+        return $(result) ;
+    }
+    else
+    {
+        if [ configure.builds /boost/architecture//x86 : $(properties) : x86 ]
+        {
+            return x86 ;
+        }
+        else if [ configure.builds /boost/architecture//arm : $(properties) : arm ]
+        {
+            return arm ;
+        }
+        else if [ configure.builds /boost/architecture//mips1 : $(properties) : mips1 ]
+        {
+            return mips1 ;
+        }
+        else if [ configure.builds /boost/architecture//power : $(properties) : power ]
+        {
+            return power ;
+        }
+        else if [ configure.builds /boost/architecture//sparc : $(properties) : sparc ]
+        {
+            return sparc ;
+        }
+    }
+}
+
+# Returns the indirect conditional requirement that invokes deduce-architecture.
+rule architecture ( )
+{
+    return @architecture.deduce-architecture ;
+}
diff --git a/include/boost/log/utility/manipulators/dump.hpp b/include/boost/log/utility/manipulators/dump.hpp
index 06f76c2..7dcaf74 100644
--- a/include/boost/log/utility/manipulators/dump.hpp
+++ b/include/boost/log/utility/manipulators/dump.hpp
@@ -30,8 +30,37 @@ BOOST_LOG_OPEN_NAMESPACE
 
 namespace aux {
 
-template< typename CharT >
-BOOST_LOG_API void dump_data(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
+typedef void dump_data_char_t(const void* data, std::size_t size, std::basic_ostream< char >& strm);
+extern BOOST_LOG_API dump_data_char_t* dump_data_char; // points to the implementation that dump_data forwards to
+BOOST_LOG_FORCEINLINE void dump_data(const void* data, std::size_t size, std::basic_ostream< char >& strm)
+{
+    (dump_data_char)(data, size, strm);
+}
+
+typedef void dump_data_wchar_t(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm);
+extern BOOST_LOG_API dump_data_wchar_t* dump_data_wchar;
+BOOST_LOG_FORCEINLINE void dump_data(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm)
+{
+    (dump_data_wchar)(data, size, strm);
+}
+
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+typedef void dump_data_char16_t(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm);
+extern BOOST_LOG_API dump_data_char16_t* dump_data_char16;
+BOOST_LOG_FORCEINLINE void dump_data(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm)
+{
+    (dump_data_char16)(data, size, strm);
+}
+#endif
+
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+typedef void dump_data_char32_t(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm);
+extern BOOST_LOG_API dump_data_char32_t* dump_data_char32;
+BOOST_LOG_FORCEINLINE void dump_data(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm)
+{
+    (dump_data_char32)(data, size, strm);
+}
+#endif
 
 template< std::size_t SizeV, typename R >
 struct enable_dump_size_based
diff --git a/src/dump.cpp b/src/dump.cpp
index 5c53791..235c9fc 100644
--- a/src/dump.cpp
+++ b/src/dump.cpp
@@ -16,6 +16,11 @@
 #include
 #include
 #include
+#if defined(_MSC_VER)
+#include "windows_version.hpp"
+#include
+#include // __cpuid
+#endif
 #include
 
 namespace boost {
@@ -24,19 +29,40 @@ BOOST_LOG_OPEN_NAMESPACE
 
 namespace aux {
 
-enum { stride = 64 };
+#if defined(BOOST_LOG_USE_SSSE3)
+extern dump_data_char_t dump_data_char_ssse3;
+extern dump_data_wchar_t dump_data_wchar_ssse3;
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+extern dump_data_char16_t dump_data_char16_ssse3;
+#endif
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+extern dump_data_char32_t dump_data_char32_ssse3;
+#endif
+#endif
+#if defined(BOOST_LOG_USE_AVX2)
+extern dump_data_char_t dump_data_char_avx2;
+extern dump_data_wchar_t dump_data_wchar_avx2;
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+extern dump_data_char16_t dump_data_char16_avx2;
+#endif
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+extern dump_data_char32_t dump_data_char32_avx2;
+#endif
+#endif
 
-static const char g_lowercase_char_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
-static const char g_uppercase_char_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+enum { stride = 256 };
+
+extern const char g_lowercase_dump_char_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; // external linkage: also declared by dump_ssse3.cpp and dump_avx2.cpp
+extern const char g_uppercase_dump_char_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 
 template< typename CharT >
-void dump_data(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
+void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
 {
     typedef CharT char_type;
 
     char_type buf[stride * 3u];
 
-    const char* const char_table = (strm.flags() & std::ios_base::uppercase) ? g_uppercase_char_table : g_lowercase_char_table;
+    const char* const char_table = (strm.flags() & std::ios_base::uppercase) ? g_uppercase_dump_char_table : g_lowercase_dump_char_table;
     const std::size_t stride_count = size / stride, tail_size = size % stride;
 
     const uint8_t* p = static_cast< const uint8_t* >(data);
@@ -78,19 +104,163 @@ void dump_data(const void* data, std::size_t size, std::basic_ostream< CharT >&
     }
 }
 
-template BOOST_LOG_API
-void dump_data< char >(const void* data, std::size_t size, std::basic_ostream< char >& strm);
-template BOOST_LOG_API
-void dump_data< wchar_t >(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm);
+BOOST_LOG_API dump_data_char_t* dump_data_char = &dump_data_generic< char >;
+BOOST_LOG_API dump_data_wchar_t* dump_data_wchar = &dump_data_generic< wchar_t >;
 #if !defined(BOOST_NO_CXX11_CHAR16_T)
-template BOOST_LOG_API
-void dump_data< char16_t >(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm);
+BOOST_LOG_API dump_data_char16_t* dump_data_char16 = &dump_data_generic< char16_t >;
 #endif
 #if !defined(BOOST_NO_CXX11_CHAR32_T)
-template BOOST_LOG_API
-void dump_data< char32_t >(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm);
+BOOST_LOG_API dump_data_char32_t* dump_data_char32 = &dump_data_generic< char32_t >;
 #endif
 
+#if defined(BOOST_LOG_USE_SSSE3) || defined(BOOST_LOG_USE_AVX2)
+
+BOOST_LOG_ANONYMOUS_NAMESPACE {
+
+//! Its constructor redirects the dump_data function pointers to the fastest implementation that was built and that cpuid reports as usable
+struct function_pointer_initializer
+{
+    function_pointer_initializer()
+    {
+        // First, let's check for the max supported cpuid function
+        uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
+        cpuid(eax, ebx, ecx, edx);
+
+        const uint32_t max_cpuid_function = eax;
+        if (max_cpuid_function >= 1)
+        {
+            eax = 1;
+            ebx = ecx = edx = 0;
+            cpuid(eax, ebx, ecx, edx);
+
+            // Each optimized implementation is declared only if its BOOST_LOG_USE_* macro is defined, so it must only be referenced under the same macro
+#if defined(BOOST_LOG_USE_SSSE3)
+            // Check for SSSE3 support
+            if (ecx & (1u << 9))
+                enable_ssse3();
+#endif // defined(BOOST_LOG_USE_SSSE3)
+
+#if defined(BOOST_LOG_USE_AVX2)
+            if (max_cpuid_function >= 7)
+            {
+                // To check for AVX2 availability we also need to verify that OS supports it
+                // Check that OSXSAVE is supported by CPU
+                if (ecx & (1u << 27))
+                {
+                    // Check that it is used by the OS
+                    bool mmstate = false;
+#if defined(__GNUC__)
+                    // Get the XFEATURE_ENABLED_MASK register
+                    __asm__ __volatile__
+                    (
+                        "xgetbv\n\t"
+                            : "=a" (eax), "=d" (edx)
+                            : "c" (0)
+                    );
+                    mmstate = (eax & 6U) == 6U;
+#elif defined(_MSC_VER)
+                    // MSVC does not have an intrinsic for xgetbv, we have to query OS
+                    HMODULE hKernel32 = GetModuleHandleA("kernel32.dll");
+                    if (hKernel32)
+                    {
+                        typedef uint64_t (__stdcall* get_enabled_extended_features_t)(uint64_t);
+                        get_enabled_extended_features_t get_enabled_extended_features = (get_enabled_extended_features_t)GetProcAddress(hKernel32, "GetEnabledExtendedFeatures");
+                        if (get_enabled_extended_features)
+                            mmstate = get_enabled_extended_features(6u) == 6u;
+                    }
+#else
+#error Boost.Log: Unexpected compiler
+#endif
+
+                    if (mmstate)
+                    {
+                        // Finally, check for AVX2 support in CPU
+                        eax = 7;
+                        ebx = ecx = edx = 0;
+                        cpuid(eax, ebx, ecx, edx);
+
+                        if (ebx & (1U << 5))
+                            enable_avx2();
+                    }
+                }
+            }
+#endif // defined(BOOST_LOG_USE_AVX2)
+        }
+    }
+
+private:
+    //! Executes cpuid for the function number passed in eax and stores the resulting register values in the four arguments
+    static void cpuid(uint32_t& eax, uint32_t& ebx, uint32_t& ecx, uint32_t& edx)
+    {
+#if defined(__GNUC__)
+#if defined(__i386__) && defined(__PIC__) && __PIC__ != 0
+        // We have to backup ebx in 32 bit PIC code because it is reserved by the ABI
+        uint32_t ebx_backup;
+        __asm__ __volatile__
+        (
+            "movl %%ebx, %0\n\t"
+            "movl %1, %%ebx\n\t"
+            "cpuid\n\t"
+            "movl %%ebx, %1\n\t"
+            "movl %0, %%ebx\n\t"
+                : "=m" (ebx_backup), "+m" (ebx), "+a" (eax), "+c" (ecx), "+d" (edx)
+        );
+#else
+        __asm__ __volatile__
+        (
+            "cpuid\n\t"
+                : "+a" (eax), "+b" (ebx), "+c" (ecx), "+d" (edx)
+        );
+#endif
+#elif defined(_MSC_VER)
+        int regs[4] = {};
+        __cpuid(regs, eax);
+        eax = regs[0];
+        ebx = regs[1];
+        ecx = regs[2];
+        edx = regs[3];
+#else
+#error Boost.Log: Unexpected compiler
+#endif
+    }
+
+#if defined(BOOST_LOG_USE_SSSE3)
+    //! Redirects all dump_data function pointers to the SSSE3 implementations
+    static void enable_ssse3()
+    {
+        dump_data_char = &dump_data_char_ssse3;
+        dump_data_wchar = &dump_data_wchar_ssse3;
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+        dump_data_char16 = &dump_data_char16_ssse3;
+#endif
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+        dump_data_char32 = &dump_data_char32_ssse3;
+#endif
+    }
+#endif // defined(BOOST_LOG_USE_SSSE3)
+
+#if defined(BOOST_LOG_USE_AVX2)
+    //! Redirects all dump_data function pointers to the AVX2 implementations
+    static void enable_avx2()
+    {
+        dump_data_char = &dump_data_char_avx2;
+        dump_data_wchar = &dump_data_wchar_avx2;
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+        dump_data_char16 = &dump_data_char16_avx2;
+#endif
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+        dump_data_char32 = &dump_data_char32_avx2;
+#endif
+    }
+#endif // defined(BOOST_LOG_USE_AVX2)
+};
+
+static function_pointer_initializer g_function_pointer_initializer;
+
+} // namespace
+
+#endif // defined(BOOST_LOG_USE_SSSE3) || defined(BOOST_LOG_USE_AVX2)
+
 } // namespace aux
 
 BOOST_LOG_CLOSE_NAMESPACE // namespace log
@@ -98,4 +268,3 @@ BOOST_LOG_CLOSE_NAMESPACE // namespace log
 } // namespace boost
 
 #include
-
diff --git a/src/dump_avx2.cpp b/src/dump_avx2.cpp
new file mode 100644
index 0000000..dd31b8d
--- /dev/null
+++ b/src/dump_avx2.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright Andrey Semashev 2007 - 2013.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+/*!
+ * \file dump_avx2.cpp
+ * \author Andrey Semashev
+ * \date 05.05.2013
+ *
+ * \brief This file is the Boost.Log library implementation, see the library documentation
+ * at http://www.boost.org/libs/log/doc/log.html.
+ */
+
+// NOTE: You should generally avoid including headers as much as possible here, because this file
+// is compiled with special compiler options, and any included header may result in generation of
+// unintended code with these options and violation of ODR.
+// NOTE(review): the header names of the #include directives below are missing in this copy - restore them from the original file
+#include
+#include
+#include
+#include
+#include
+
+namespace boost {
+
+BOOST_LOG_OPEN_NAMESPACE
+
+namespace aux {
+
+// Hex digit tables and the generic implementation, declared here so that this file can use them for the parts it does not vectorize
+extern const char g_lowercase_dump_char_table[];
+extern const char g_uppercase_dump_char_table[];
+
+template< typename CharT >
+extern void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
+
+BOOST_LOG_ANONYMOUS_NAMESPACE {
+
+enum
+{
+    packs_per_stride = 32, // number of 16-byte input packs formatted before the buffer is written to the stream
+    stride = packs_per_stride * 16 // number of input bytes per stride
+};
+
+//! A 128-bit constant that can be initialized from its individual bytes
+union xmm_constant
+{
+    uint8_t as_bytes[16];
+    __m128i as_mm;
+};
+
+// Mask that keeps the low half-byte of every byte
+static const xmm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
+// Half-bytes greater than this value are printed as letters
+static const xmm_constant mm_9 = {{ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 }};
+// Added to every half-byte to convert it to a character
+static const xmm_constant mm_char_0 = {{ '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }};
+// Space characters at every third position, starting from the first one
+static const xmm_constant mm_char_space_mask = {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }};
+// Byte shuffle patterns for the three 16-character parts of the output; entries equal to 0x80 mark the positions of the spaces
+static const xmm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
+static const xmm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
+static const xmm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
+
+//!
+//! Dumps a pack of input data into a string of 8 bit ASCII characters
+//! \param mm_char_10_to_a Value added, on top of '0', to half-bytes greater than 9; selects upper- or lowercase letters
+//! \param mm_input 16 bytes of input data
+//! \param mm_output1 Receives output characters 0-15; every input byte produces a space followed by two hex digits
+//! \param mm_output2 Receives output characters 16-31
+//! \param mm_output3 Receives output characters 32-47
+static BOOST_LOG_FORCEINLINE void dump_pack(__m128i mm_char_10_to_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3)
+{
+    // Split half-bytes
+    __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi16(mm_input, 4), mm_15.as_mm);
+    __m128i mm_input_lo = _mm_and_si128(mm_input, mm_15.as_mm);
+
+    // Stringize each of the halves
+    __m128i mm_addend_hi = _mm_cmpgt_epi8(mm_input_hi, mm_9.as_mm);
+    __m128i mm_addend_lo = _mm_cmpgt_epi8(mm_input_lo, mm_9.as_mm);
+    mm_addend_hi = _mm_and_si128(mm_char_10_to_a, mm_addend_hi);
+    mm_addend_lo = _mm_and_si128(mm_char_10_to_a, mm_addend_lo);
+
+    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_char_0.as_mm);
+    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_char_0.as_mm);
+
+    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_addend_hi);
+    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_addend_lo);
+
+    // Join them back together
+    __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo);
+    __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo);
+
+    // Insert spaces between stringized bytes:
+    // |0123456789abcdef|0123456789abcdef|
+    // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
+    mm_output1 = _mm_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm);
+    mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm);
+    mm_output3 = _mm_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm);
+
+    // The space mask is shifted by one byte for every next part because 16 is not a multiple of 3
+    __m128i mm_char_space = mm_char_space_mask.as_mm;
+    mm_output1 = _mm_or_si128(mm_output1, mm_char_space);
+    mm_char_space = _mm_srli_si128(mm_char_space, 1);
+    mm_output2 = _mm_or_si128(mm_output2, mm_char_space);
+    mm_char_space = _mm_srli_si128(mm_char_space, 1);
+    mm_output3 = _mm_or_si128(mm_output3, mm_char_space);
+}
+
+//! Stores 16 8-bit characters to \a buf, zero-extending each of them to the size of \c CharT.
+//! Aligned stores are used, so \a buf must be aligned at 16 bytes.
+template< typename CharT >
+BOOST_LOG_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
+{
+    switch (sizeof(CharT))
+    {
+    case 1:
+        _mm_store_si128(reinterpret_cast< __m128i* >(buf), mm_chars);
+        break;
+
+    case 2:
+        {
+            __m128i mm_0 = _mm_setzero_si128();
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi8(mm_chars, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi8(mm_chars, mm_0));
+        }
+        break;
+
+    case 4:
+        {
+            __m128i mm_0 = _mm_setzero_si128();
+            __m128i mm = _mm_unpacklo_epi8(mm_chars, mm_0);
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi16(mm, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi16(mm, mm_0));
+            mm = _mm_unpackhi_epi8(mm_chars, mm_0);
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 2, _mm_unpacklo_epi16(mm, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 3, _mm_unpackhi_epi16(mm, mm_0));
+        }
+        break;
+    }
+}
+
+//! Writes \a size bytes starting at \a data to \a strm as two-digit hex numbers separated with spaces.
+//! The letter case follows the \c uppercase flag of the stream. The unaligned head of the input is read
+//! with a 16-byte load, so the input must be at least 16 bytes long; the callers below require 32.
+template< typename CharT >
+BOOST_LOG_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
+{
+    typedef CharT char_type;
+
+    char_type buf_storage[stride * 3u + 16u];
+    // Align the temporary buffer at 16 bytes. The adjustment is a number of bytes, so it has to be applied
+    // to a byte pointer, otherwise the buffer is misaligned for character types wider than one byte.
+    char_type* const buf = reinterpret_cast< char_type* >(reinterpret_cast< uint8_t* >(buf_storage) + (16u - ((std::size_t)(char_type*)buf_storage & 15u)));
+    char_type* buf_begin = buf + 1u; // skip the first space of the first chunk
+    char_type* buf_end = buf + stride * 3u;
+
+    __m128i mm_char_10_to_a;
+    if (strm.flags() & std::ios_base::uppercase)
+        mm_char_10_to_a = _mm_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters
+    else
+        mm_char_10_to_a = _mm_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters
+
+    // First, check the input alignment
+    const uint8_t* p = static_cast< const uint8_t* >(data);
+    if (const std::size_t prealign_size = ((16u - ((std::size_t)p & 15u)) & 15u))
+    {
+        // Format a whole unaligned pack but output only the characters of the bytes that precede the aligned part
+        __m128i mm_input = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
+        __m128i mm_output1, mm_output2, mm_output3;
+        dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+        store_characters(mm_output1, buf);
+        store_characters(mm_output2, buf + 16u);
+        store_characters(mm_output3, buf + 32u);
+
+        strm.write(buf_begin, prealign_size * 3u - 1u);
+        buf_begin = buf;
+        size -= prealign_size;
+        p += prealign_size;
+    }
+
+    const std::size_t stride_count = size / stride;
+    std::size_t tail_size = size % stride;
+    for (std::size_t i = 0; i < stride_count; ++i)
+    {
+        char_type* b = buf;
+        for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 16u, p += 16u)
+        {
+            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
+            __m128i mm_output1, mm_output2, mm_output3;
+            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            // Store at the current position within the buffer, not at its beginning
+            store_characters(mm_output1, b);
+            store_characters(mm_output2, b + 16u);
+            store_characters(mm_output3, b + 32u);
+        }
+
+        strm.write(buf_begin, buf_end - buf_begin);
+        buf_begin = buf;
+    }
+
+    if (tail_size > 0)
+    {
+        char_type* b = buf;
+        while (tail_size >= 16u)
+        {
+            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
+            __m128i mm_output1, mm_output2, mm_output3;
+            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            // Store at the current position within the buffer, not at its beginning
+            store_characters(mm_output1, b);
+            store_characters(mm_output2, b + 16u);
+            store_characters(mm_output3, b + 32u);
+            b += 3u * 16u;
+            p += 16u;
+            tail_size -= 16u;
+        }
+
+        // Less than a pack is left, format the remaining bytes one by one
+        const char* const char_table = (strm.flags() & std::ios_base::uppercase) ? g_uppercase_dump_char_table : g_lowercase_dump_char_table;
+        for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u)
+        {
+            uint32_t n = *p;
+            b[0] = static_cast< char_type >(' ');
+            b[1] = static_cast< char_type >(char_table[n >> 4]);
+            b[2] = static_cast< char_type >(char_table[n & 0x0F]);
+        }
+
+        strm.write(buf_begin, b - buf_begin);
+    }
+}
+
+} // namespace
+
+//! Dumps the data to a narrow character stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char_avx2(const void* data, std::size_t size, std::basic_ostream< char >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_avx2(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+
+//! Dumps the data to a wide character stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_wchar_avx2(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_avx2(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+//! Dumps the data to a char16_t stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char16_avx2(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_avx2(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+#endif
+
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+//! Dumps the data to a char32_t stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char32_avx2(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_avx2(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+#endif
+
+} // namespace aux
+
+BOOST_LOG_CLOSE_NAMESPACE // namespace log
+
+} // namespace boost
+
+#include
diff --git a/src/dump_ssse3.cpp b/src/dump_ssse3.cpp
new file mode 100644
index 0000000..8d80ad4
--- /dev/null
+++ b/src/dump_ssse3.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright Andrey Semashev 2007 - 2013.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+/*!
+ * \file dump_ssse3.cpp
+ * \author Andrey Semashev
+ * \date 05.05.2013
+ *
+ * \brief This file is the Boost.Log library implementation, see the library documentation
+ * at http://www.boost.org/libs/log/doc/log.html.
+ */
+
+// NOTE: You should generally avoid including headers as much as possible here, because this file
+// is compiled with special compiler options, and any included header may result in generation of
+// unintended code with these options and violation of ODR.
+// NOTE(review): the header names of the #include directives below are missing in this copy - restore them from the original file
+#include
+#include
+#include
+#include
+#include
+
+namespace boost {
+
+BOOST_LOG_OPEN_NAMESPACE
+
+namespace aux {
+
+// Hex digit tables and the generic implementation, declared here so that this file can use them for the parts it does not vectorize
+extern const char g_lowercase_dump_char_table[];
+extern const char g_uppercase_dump_char_table[];
+
+template< typename CharT >
+extern void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
+
+BOOST_LOG_ANONYMOUS_NAMESPACE {
+
+enum
+{
+    packs_per_stride = 32, // number of 16-byte input packs formatted before the buffer is written to the stream
+    stride = packs_per_stride * 16 // number of input bytes per stride
+};
+
+//! A 128-bit constant that can be initialized from its individual bytes
+union xmm_constant
+{
+    uint8_t as_bytes[16];
+    __m128i as_mm;
+};
+
+// Mask that keeps the low half-byte of every byte
+static const xmm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
+// Half-bytes greater than this value are printed as letters
+static const xmm_constant mm_9 = {{ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 }};
+// Added to every half-byte to convert it to a character
+static const xmm_constant mm_char_0 = {{ '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }};
+// Space characters at every third position, starting from the first one
+static const xmm_constant mm_char_space_mask = {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }};
+// Byte shuffle patterns for the three 16-character parts of the output; entries equal to 0x80 mark the positions of the spaces
+static const xmm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
+static const xmm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
+static const xmm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
+
+//!
+//! Dumps a pack of input data into a string of 8 bit ASCII characters
+//! \param mm_char_10_to_a Value added, on top of '0', to half-bytes greater than 9; selects upper- or lowercase letters
+//! \param mm_input 16 bytes of input data
+//! \param mm_output1 Receives output characters 0-15; every input byte produces a space followed by two hex digits
+//! \param mm_output2 Receives output characters 16-31
+//! \param mm_output3 Receives output characters 32-47
+static BOOST_LOG_FORCEINLINE void dump_pack(__m128i mm_char_10_to_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3)
+{
+    // Split half-bytes
+    __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi16(mm_input, 4), mm_15.as_mm);
+    __m128i mm_input_lo = _mm_and_si128(mm_input, mm_15.as_mm);
+
+    // Stringize each of the halves
+    __m128i mm_addend_hi = _mm_cmpgt_epi8(mm_input_hi, mm_9.as_mm);
+    __m128i mm_addend_lo = _mm_cmpgt_epi8(mm_input_lo, mm_9.as_mm);
+    mm_addend_hi = _mm_and_si128(mm_char_10_to_a, mm_addend_hi);
+    mm_addend_lo = _mm_and_si128(mm_char_10_to_a, mm_addend_lo);
+
+    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_char_0.as_mm);
+    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_char_0.as_mm);
+
+    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_addend_hi);
+    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_addend_lo);
+
+    // Join them back together
+    __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo);
+    __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo);
+
+    // Insert spaces between stringized bytes:
+    // |0123456789abcdef|0123456789abcdef|
+    // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
+    mm_output1 = _mm_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm);
+    mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm);
+    mm_output3 = _mm_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm);
+
+    // The space mask is shifted by one byte for every next part because 16 is not a multiple of 3
+    __m128i mm_char_space = mm_char_space_mask.as_mm;
+    mm_output1 = _mm_or_si128(mm_output1, mm_char_space);
+    mm_char_space = _mm_srli_si128(mm_char_space, 1);
+    mm_output2 = _mm_or_si128(mm_output2, mm_char_space);
+    mm_char_space = _mm_srli_si128(mm_char_space, 1);
+    mm_output3 = _mm_or_si128(mm_output3, mm_char_space);
+}
+
+//! Stores 16 8-bit characters to \a buf, zero-extending each of them to the size of \c CharT.
+//! Aligned stores are used, so \a buf must be aligned at 16 bytes.
+template< typename CharT >
+BOOST_LOG_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
+{
+    switch (sizeof(CharT))
+    {
+    case 1:
+        _mm_store_si128(reinterpret_cast< __m128i* >(buf), mm_chars);
+        break;
+
+    case 2:
+        {
+            __m128i mm_0 = _mm_setzero_si128();
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi8(mm_chars, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi8(mm_chars, mm_0));
+        }
+        break;
+
+    case 4:
+        {
+            __m128i mm_0 = _mm_setzero_si128();
+            __m128i mm = _mm_unpacklo_epi8(mm_chars, mm_0);
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi16(mm, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi16(mm, mm_0));
+            mm = _mm_unpackhi_epi8(mm_chars, mm_0);
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 2, _mm_unpacklo_epi16(mm, mm_0));
+            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 3, _mm_unpackhi_epi16(mm, mm_0));
+        }
+        break;
+    }
+}
+
+//! Writes \a size bytes starting at \a data to \a strm as two-digit hex numbers separated with spaces.
+//! The letter case follows the \c uppercase flag of the stream. The unaligned head of the input is read
+//! with a 16-byte load, so the input must be at least 16 bytes long; the callers below require 32.
+template< typename CharT >
+BOOST_LOG_FORCEINLINE void dump_data_ssse3(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
+{
+    typedef CharT char_type;
+
+    char_type buf_storage[stride * 3u + 16u];
+    // Align the temporary buffer at 16 bytes. The adjustment is a number of bytes, so it has to be applied
+    // to a byte pointer, otherwise the buffer is misaligned for character types wider than one byte.
+    char_type* const buf = reinterpret_cast< char_type* >(reinterpret_cast< uint8_t* >(buf_storage) + (16u - ((std::size_t)(char_type*)buf_storage & 15u)));
+    char_type* buf_begin = buf + 1u; // skip the first space of the first chunk
+    char_type* buf_end = buf + stride * 3u;
+
+    __m128i mm_char_10_to_a;
+    if (strm.flags() & std::ios_base::uppercase)
+        mm_char_10_to_a = _mm_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters
+    else
+        mm_char_10_to_a = _mm_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters
+
+    // First, check the input alignment
+    const uint8_t* p = static_cast< const uint8_t* >(data);
+    if (const std::size_t prealign_size = ((16u - ((std::size_t)p & 15u)) & 15u))
+    {
+        // Format a whole unaligned pack but output only the characters of the bytes that precede the aligned part
+        __m128i mm_input = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
+        __m128i mm_output1, mm_output2, mm_output3;
+        dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+        store_characters(mm_output1, buf);
+        store_characters(mm_output2, buf + 16u);
+        store_characters(mm_output3, buf + 32u);
+
+        strm.write(buf_begin, prealign_size * 3u - 1u);
+        buf_begin = buf;
+        size -= prealign_size;
+        p += prealign_size;
+    }
+
+    const std::size_t stride_count = size / stride;
+    std::size_t tail_size = size % stride;
+    for (std::size_t i = 0; i < stride_count; ++i)
+    {
+        char_type* b = buf;
+        for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 16u, p += 16u)
+        {
+            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
+            __m128i mm_output1, mm_output2, mm_output3;
+            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            // Store at the current position within the buffer, not at its beginning
+            store_characters(mm_output1, b);
+            store_characters(mm_output2, b + 16u);
+            store_characters(mm_output3, b + 32u);
+        }
+
+        strm.write(buf_begin, buf_end - buf_begin);
+        buf_begin = buf;
+    }
+
+    if (tail_size > 0)
+    {
+        char_type* b = buf;
+        while (tail_size >= 16u)
+        {
+            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
+            __m128i mm_output1, mm_output2, mm_output3;
+            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            // Store at the current position within the buffer, not at its beginning
+            store_characters(mm_output1, b);
+            store_characters(mm_output2, b + 16u);
+            store_characters(mm_output3, b + 32u);
+            b += 3u * 16u;
+            p += 16u;
+            tail_size -= 16u;
+        }
+
+        // Less than a pack is left, format the remaining bytes one by one
+        const char* const char_table = (strm.flags() & std::ios_base::uppercase) ? g_uppercase_dump_char_table : g_lowercase_dump_char_table;
+        for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u)
+        {
+            uint32_t n = *p;
+            b[0] = static_cast< char_type >(' ');
+            b[1] = static_cast< char_type >(char_table[n >> 4]);
+            b[2] = static_cast< char_type >(char_table[n & 0x0F]);
+        }
+
+        strm.write(buf_begin, b - buf_begin);
+    }
+}
+
+} // namespace
+
+//! Dumps the data to a narrow character stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char_ssse3(const void* data, std::size_t size, std::basic_ostream< char >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_ssse3(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+
+//! Dumps the data to a wide character stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_wchar_ssse3(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_ssse3(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+
+#if !defined(BOOST_NO_CXX11_CHAR16_T)
+//! Dumps the data to a char16_t stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char16_ssse3(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_ssse3(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+#endif
+
+#if !defined(BOOST_NO_CXX11_CHAR32_T)
+//! Dumps the data to a char32_t stream; inputs shorter than 32 bytes are handled by the generic implementation
+void dump_data_char32_ssse3(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm)
+{
+    if (size >= 32)
+    {
+        dump_data_ssse3(data, size, strm);
+    }
+    else
+    {
+        dump_data_generic(data, size, strm);
+    }
+}
+#endif
+
+} // namespace aux
+
+BOOST_LOG_CLOSE_NAMESPACE // namespace log
+
+} // namespace boost
+
+#include
diff --git a/test/performance/Jamfile.v2 b/test/performance/Jamfile.v2
index 4545cbb..b07f415 100644
--- a/test/performance/Jamfile.v2
+++ b/test/performance/Jamfile.v2
@@ -9,3 +9,7 @@ exe record_emission
     : record_emission.cpp ../../build//boost_log
     ;
 
+exe dump
+    : dump.cpp ../../build//boost_log
+    ;
+
diff --git a/test/performance/dump.cpp b/test/performance/dump.cpp
new file mode 100644
index 0000000..baadd1c
--- /dev/null
+++ b/test/performance/dump.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright Andrey Semashev 2007 - 2013.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+/*!
+ * \file dump.cpp
+ * \author Andrey Semashev
+ * \date 05.05.2013
+ *
+ * \brief This code measures the performance of dumping binary data
+ */
+
+#include <cstdlib>
+#include <iomanip>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+#include <boost/cstdint.hpp>
+#include <boost/date_time/microsec_clock.hpp>
+#include <boost/date_time/posix_time/posix_time_types.hpp>
+
+#include <boost/log/utility/formatting_ostream.hpp>
+#include <boost/log/utility/manipulators/dump.hpp>
+
+namespace logging = boost::log;
+
+const unsigned int base_loop_count = 10000; // number of dumps performed between two clock readings
+//! Dumps a random block of \a block_size bytes repeatedly for at least two seconds and prints the measured throughput
+void test(std::size_t block_size)
+{
+    std::cout << "Block size: " << block_size << " bytes.";
+
+    std::vector< boost::uint8_t > data;
+    data.resize(block_size);
+    std::generate_n(data.begin(), block_size, &std::rand);
+
+    std::string str;
+    logging::formatting_ostream strm(str);
+
+    const boost::uint8_t* const p = &data[0];
+
+    boost::uint64_t data_processed = 0, duration = 0;
+    boost::posix_time::ptime start, end;
+    start = boost::date_time::microsec_clock< boost::posix_time::ptime >::universal_time();
+    do
+    {
+        for (unsigned int i = 0; i < base_loop_count; ++i)
+        {
+            strm << logging::dump(p, block_size);
+            str.clear(); // drop the formatted text so that the string does not grow between iterations
+        }
+        end = boost::date_time::microsec_clock< boost::posix_time::ptime >::universal_time();
+        data_processed += static_cast< boost::uint64_t >(base_loop_count) * block_size; // multiply in 64 bits, the product exceeds a 32-bit std::size_t for the largest block
+        duration = (end - start).total_microseconds();
+    }
+    while (duration < 2000000); // keep measuring until at least 2 seconds (in microseconds) have elapsed
+
+    std::cout << " Test duration: " << duration << " us ("
+        << std::fixed << std::setprecision(3) << static_cast< double >(data_processed) / (static_cast< double >(duration) * (1048576.0 / 1000000.0))
+        << " MiB per second)" << std::endl;
+}
+//! Runs the benchmark for a range of block sizes, from 32 bytes to 1 MiB
+int main(int argc, char* argv[])
+{
+    test(32);
+    test(128);
+    test(1024);
+    test(16384);
+    test(1048576);
+
+    return 0;
+}
diff --git a/test/performance/record_emission.cpp b/test/performance/record_emission.cpp
index 8e64f69..46229cd 100644
--- a/test/performance/record_emission.cpp
+++ b/test/performance/record_emission.cpp
@@ -68,7 +68,7 @@ namespace {
         public sinks::basic_sink_backend< sinks::concurrent_feeding >
     {
     public:
-        void consume(logging::record const& rec)
+        void consume(logging::record_view const& rec)
         {
         }
     };