Reenabled popcnt for MSVC, improved perf by leveraging ILP.

On MSVC and compatible compilers popcnt is now enabled if it is known at
compile time that the target CPU supports popcnt instruction. Unfortunately,
MSVC does not have a predefined macro specifically for popcnt, so we have
to test for AVX instead. __POPCNT__ is still tested in case clang or user
defines it.

For MSVC, the 64-bit popcount is now also implemented on 32-bit targets by
issuing two 32-bit popcnt instructions. For gcc and compatible compilers,
16-bit popcount implementation is provided, which is better than the generic
popcount implementation, but still slower than byte-wise do_count.

The do_count algorithm implementations have been improved by leveraging
instruction level parallelism better, which gives 0 to 27% improvement and
no regressions.

Also made code formatting more consistent and reduced code duplication between
do_count implementations.
This commit is contained in:
Andrey Semashev
2019-04-13 18:04:54 +03:00
committed by James E. King III
parent c747bec057
commit 83bdf5a335

View File

@@ -5,6 +5,7 @@
// Copyright (c) 2014 Glen Joseph Fernandes
// (glenjofe@gmail.com)
// Copyright (c) 2018 Evgeny Shulgin
// Copyright (c) 2019 Andrey Semashev
//
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
@@ -85,126 +86,129 @@ namespace boost {
struct count_table<false> { /* no table */ };
const unsigned int table_width = 8;
template <bool b>
const byte_type count_table<b>::table[] =
{
// Automatically generated by GPTableGen.exe v.1.0
//
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
const unsigned int table_width = 8;
template <bool b>
const byte_type count_table<b>::table[] =
{
// Automatically generated by GPTableGen.exe v.1.0
//
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
// overload for access by bytes
//
// Some platforms have fast popcount operation, that allow us to implement
// counting bits much more efficiently
//
template <typename ValueType>
BOOST_FORCEINLINE std::size_t popcount(ValueType value) BOOST_NOEXCEPT
{
std::size_t num = 0u;
while (value) {
num += count_table<>::table[value & ((1u<<table_width) - 1)];
value >>= table_width;
}
return num;
}
template <typename Iterator>
inline std::size_t do_count(Iterator first, std::size_t length,
int /*dummy param*/,
value_to_type<access_by_bytes>* )
{
std::size_t num = 0;
if (length)
{
const byte_type * p = object_representation(&*first);
length *= sizeof(*first);
#if (((defined(BOOST_MSVC) && (BOOST_MSVC >= 1600)) || (defined(__clang__) && defined(__c2__)) || (defined(BOOST_INTEL) && defined(_MSC_VER))) && (defined(_M_IX86) || defined(_M_X64))) \
&& (defined(__POPCNT__) || defined(__AVX__))
do {
num += count_table<>::table[*p];
++p;
--length;
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
{
return static_cast<std::size_t>(__popcnt16(value));
}
} while (length);
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
{
return static_cast<std::size_t>(__popcnt(value));
}
return num;
}
// Some platforms have fast popcount operation, that allow us to implement
// counting bits much more efficiently
//
template <typename ValueType>
BOOST_FORCEINLINE std::size_t popcount(ValueType value) BOOST_NOEXCEPT
{
std::size_t num = 0;
while (value) {
num += count_table<>::table[value & ((1u<<table_width) - 1)];
value >>= table_width;
}
return num;
}
#if ((defined(BOOST_MSVC) && (BOOST_MSVC >= 1600)) || (defined(__clang__) && defined(__c2__)) || (defined(BOOST_INTEL) && defined(_MSC_VER))) && (defined(_M_IX86) || defined(_M_X64))
/*
per https://github.com/boostorg/dynamic_bitset/issues/33
this code does not support older cpus properly...
it needs to check for cpuid support to avoid undefined behavior
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
{
return static_cast<std::size_t>(__popcnt16(value));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
{
return static_cast<std::size_t>(__popcnt(value));
}
#ifdef _M_X64
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned __int64>(unsigned __int64 value) BOOST_NOEXCEPT
{
return static_cast<std::size_t>(__popcnt64(value));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned __int64>(unsigned __int64 value) BOOST_NOEXCEPT
{
#if defined(_M_X64)
return static_cast<std::size_t>(__popcnt64(value));
#else
return static_cast<std::size_t>(__popcnt(static_cast< unsigned int >(value))) + static_cast<std::size_t>(__popcnt(static_cast< unsigned int >(value >> 32)));
#endif
*/
}
#elif defined(BOOST_GCC) || defined(__clang__) || (defined(BOOST_INTEL) && defined(__GNUC__))
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
{
return __builtin_popcount(value);
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned long>(unsigned long value) BOOST_NOEXCEPT
{
return __builtin_popcountl(value);
}
// Note: gcc builtins are implemented by compiler runtime when the target CPU may not support the necessary instructions
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
{
return static_cast<unsigned int>(__builtin_popcount(static_cast<unsigned int>(value)));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
{
return static_cast<unsigned int>(__builtin_popcount(value));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<unsigned long>(unsigned long value) BOOST_NOEXCEPT
{
return static_cast<unsigned int>(__builtin_popcountl(value));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<boost::ulong_long_type>(boost::ulong_long_type value) BOOST_NOEXCEPT
{
return static_cast<unsigned int>(__builtin_popcountll(value));
}
template <>
BOOST_FORCEINLINE std::size_t popcount<boost::ulong_long_type>(boost::ulong_long_type value) BOOST_NOEXCEPT
{
return __builtin_popcountll(value);
}
#endif
// overload for access by blocks
//
template <typename Iterator, typename ValueType>
inline std::size_t do_count(Iterator first, std::size_t length, ValueType,
value_to_type<access_by_blocks>*)
{
std::size_t num = 0;
while (length){
num += popcount<ValueType>(*first);
++first;
--length;
}
// overload for access by blocks
//
template <typename Iterator, typename ValueType>
inline std::size_t do_count(Iterator first, std::size_t length, ValueType,
value_to_type<access_by_blocks>*)
{
std::size_t num1 = 0u, num2 = 0u;
while (length >= 2u) {
num1 += popcount<ValueType>(*first);
++first;
num2 += popcount<ValueType>(*first);
++first;
length -= 2u;
}
return num;
}
if (length > 0u)
num1 += popcount<ValueType>(*first);
return num1 + num2;
}
// overload for access by bytes
//
template <typename Iterator>
inline std::size_t do_count(Iterator first, std::size_t length,
int /*dummy param*/,
value_to_type<access_by_bytes>*)
{
if (length > 0u) {
const byte_type* p = object_representation(&*first);
length *= sizeof(*first);
return do_count(p, length, static_cast<byte_type>(0u),
static_cast< value_to_type<access_by_blocks>* >(0));
}
return 0u;
}
// -------------------------------------------------------