mirror of
https://github.com/boostorg/dynamic_bitset.git
synced 2026-01-19 04:12:09 +00:00
Reenabled popcnt for MSVC, improved perf by leveraging ILP.
On MSVC and compatible compilers popcnt is now enabled if it is known at compile time that the target CPU supports popcnt instruction. Unfortunately, MSVC does not have a predefined macro specifically for popcnt, so we have to test for AVX instead. __POPCNT__ is still tested in case clang or user defines it. For MSVC, the 64-bit popcount is now also implemented on 32-bit targets by issuing two 32-bit popcnt instructions. For gcc and compatible compilers, 16-bit popcount implementation is provided, which is better than the generic popcount implementation, but still slower than byte-wise do_count. The do_count algorithm implementations have been improved by leveraging instruction level parallelism better, which gives 0 to 27% improvement and no regressions. Also made code formatting more consistent and reduced code duplication between do_count implementations.
This commit is contained in:
committed by
James E. King III
parent
c747bec057
commit
83bdf5a335
@@ -5,6 +5,7 @@
|
||||
// Copyright (c) 2014 Glen Joseph Fernandes
|
||||
// (glenjofe@gmail.com)
|
||||
// Copyright (c) 2018 Evgeny Shulgin
|
||||
// Copyright (c) 2019 Andrey Semashev
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE_1_0.txt or copy at
|
||||
@@ -85,126 +86,129 @@ namespace boost {
|
||||
struct count_table<false> { /* no table */ };
|
||||
|
||||
|
||||
const unsigned int table_width = 8;
|
||||
template <bool b>
|
||||
const byte_type count_table<b>::table[] =
|
||||
{
|
||||
// Automatically generated by GPTableGen.exe v.1.0
|
||||
//
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
const unsigned int table_width = 8;
|
||||
template <bool b>
|
||||
const byte_type count_table<b>::table[] =
|
||||
{
|
||||
// Automatically generated by GPTableGen.exe v.1.0
|
||||
//
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
|
||||
// overload for access by bytes
|
||||
//
|
||||
// Some platforms have fast popcount operation, that allow us to implement
|
||||
// counting bits much more efficiently
|
||||
//
|
||||
template <typename ValueType>
|
||||
BOOST_FORCEINLINE std::size_t popcount(ValueType value) BOOST_NOEXCEPT
|
||||
{
|
||||
std::size_t num = 0u;
|
||||
while (value) {
|
||||
num += count_table<>::table[value & ((1u<<table_width) - 1)];
|
||||
value >>= table_width;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
template <typename Iterator>
|
||||
inline std::size_t do_count(Iterator first, std::size_t length,
|
||||
int /*dummy param*/,
|
||||
value_to_type<access_by_bytes>* )
|
||||
{
|
||||
std::size_t num = 0;
|
||||
if (length)
|
||||
{
|
||||
const byte_type * p = object_representation(&*first);
|
||||
length *= sizeof(*first);
|
||||
#if (((defined(BOOST_MSVC) && (BOOST_MSVC >= 1600)) || (defined(__clang__) && defined(__c2__)) || (defined(BOOST_INTEL) && defined(_MSC_VER))) && (defined(_M_IX86) || defined(_M_X64))) \
|
||||
&& (defined(__POPCNT__) || defined(__AVX__))
|
||||
|
||||
do {
|
||||
num += count_table<>::table[*p];
|
||||
++p;
|
||||
--length;
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<std::size_t>(__popcnt16(value));
|
||||
}
|
||||
|
||||
} while (length);
|
||||
}
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<std::size_t>(__popcnt(value));
|
||||
}
|
||||
|
||||
return num;
|
||||
}
|
||||
|
||||
|
||||
// Some platforms have fast popcount operation, that allow us to implement
|
||||
// counting bits much more efficiently
|
||||
//
|
||||
template <typename ValueType>
|
||||
BOOST_FORCEINLINE std::size_t popcount(ValueType value) BOOST_NOEXCEPT
|
||||
{
|
||||
std::size_t num = 0;
|
||||
while (value) {
|
||||
num += count_table<>::table[value & ((1u<<table_width) - 1)];
|
||||
value >>= table_width;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
#if ((defined(BOOST_MSVC) && (BOOST_MSVC >= 1600)) || (defined(__clang__) && defined(__c2__)) || (defined(BOOST_INTEL) && defined(_MSC_VER))) && (defined(_M_IX86) || defined(_M_X64))
|
||||
|
||||
/*
|
||||
per https://github.com/boostorg/dynamic_bitset/issues/33
|
||||
this code does not support older cpus properly...
|
||||
it needs to check for cpuid support to avoid undefined behavior
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<std::size_t>(__popcnt16(value));
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<std::size_t>(__popcnt(value));
|
||||
}
|
||||
|
||||
#ifdef _M_X64
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned __int64>(unsigned __int64 value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<std::size_t>(__popcnt64(value));
|
||||
}
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned __int64>(unsigned __int64 value) BOOST_NOEXCEPT
|
||||
{
|
||||
#if defined(_M_X64)
|
||||
return static_cast<std::size_t>(__popcnt64(value));
|
||||
#else
|
||||
return static_cast<std::size_t>(__popcnt(static_cast< unsigned int >(value))) + static_cast<std::size_t>(__popcnt(static_cast< unsigned int >(value >> 32)));
|
||||
#endif
|
||||
*/
|
||||
}
|
||||
|
||||
#elif defined(BOOST_GCC) || defined(__clang__) || (defined(BOOST_INTEL) && defined(__GNUC__))
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
|
||||
{
|
||||
return __builtin_popcount(value);
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned long>(unsigned long value) BOOST_NOEXCEPT
|
||||
{
|
||||
return __builtin_popcountl(value);
|
||||
}
|
||||
// Note: gcc builtins are implemented by compiler runtime when the target CPU may not support the necessary instructions
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned short>(unsigned short value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_popcount(static_cast<unsigned int>(value)));
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned int>(unsigned int value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_popcount(value));
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<unsigned long>(unsigned long value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_popcountl(value));
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<boost::ulong_long_type>(boost::ulong_long_type value) BOOST_NOEXCEPT
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_popcountll(value));
|
||||
}
|
||||
|
||||
template <>
|
||||
BOOST_FORCEINLINE std::size_t popcount<boost::ulong_long_type>(boost::ulong_long_type value) BOOST_NOEXCEPT
|
||||
{
|
||||
return __builtin_popcountll(value);
|
||||
}
|
||||
#endif
|
||||
|
||||
// overload for access by blocks
|
||||
//
|
||||
template <typename Iterator, typename ValueType>
|
||||
inline std::size_t do_count(Iterator first, std::size_t length, ValueType,
|
||||
value_to_type<access_by_blocks>*)
|
||||
{
|
||||
std::size_t num = 0;
|
||||
while (length){
|
||||
num += popcount<ValueType>(*first);
|
||||
++first;
|
||||
--length;
|
||||
}
|
||||
// overload for access by blocks
|
||||
//
|
||||
template <typename Iterator, typename ValueType>
|
||||
inline std::size_t do_count(Iterator first, std::size_t length, ValueType,
|
||||
value_to_type<access_by_blocks>*)
|
||||
{
|
||||
std::size_t num1 = 0u, num2 = 0u;
|
||||
while (length >= 2u) {
|
||||
num1 += popcount<ValueType>(*first);
|
||||
++first;
|
||||
num2 += popcount<ValueType>(*first);
|
||||
++first;
|
||||
length -= 2u;
|
||||
}
|
||||
|
||||
return num;
|
||||
}
|
||||
if (length > 0u)
|
||||
num1 += popcount<ValueType>(*first);
|
||||
|
||||
return num1 + num2;
|
||||
}
|
||||
|
||||
// overload for access by bytes
|
||||
//
|
||||
template <typename Iterator>
|
||||
inline std::size_t do_count(Iterator first, std::size_t length,
|
||||
int /*dummy param*/,
|
||||
value_to_type<access_by_bytes>*)
|
||||
{
|
||||
if (length > 0u) {
|
||||
const byte_type* p = object_representation(&*first);
|
||||
length *= sizeof(*first);
|
||||
|
||||
return do_count(p, length, static_cast<byte_type>(0u),
|
||||
static_cast< value_to_type<access_by_blocks>* >(0));
|
||||
}
|
||||
|
||||
return 0u;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user