2
0
mirror of https://github.com/boostorg/atomic.git synced 2026-02-01 20:12:09 +00:00

Fixed incorrect code generated by clang for 32-bit x86 PIC.

The compiler, surprisingly, uses ebx for memory operands, which messed up the
save/restore logic in asm blocks, resulting in the memory operand (which was
supposed to be the pointer to the atomic storage) being incorrect.

First, clang (and, apparently, recent gcc as well) is able to deal with ebx
around the asm blocks by itself, which makes it unnecessary to save/restore
the register in the asm blocks. Therefore, for those compilers we now use the
non-PIC branch in PIC mode as well. This sidesteps the original problem with
clang.

Second, since we can't be sure if other compilers are able to pull the same
trick, the PIC branches of code have been updated to avoid any memory operand
constraints and use the explicitly calculated pointer in a register instead. We
also no longer use a scratch slot on the stack to save ebx but instead use esi
for that, which is also conveniently used for one of the inputs. This should
be slightly faster as well. The downside is that we're possibly wasting one
register for storing the pointer to the storage, but there seems to be no way
around it.
This commit is contained in:
Andrey Semashev
2018-02-03 00:46:07 +03:00
parent 13845129c4
commit c91cb67396
2 changed files with 81 additions and 68 deletions

View File

@@ -62,7 +62,13 @@
#if (defined(__i386__) || defined(__x86_64__)) && (defined(__clang__) || (defined(BOOST_GCC) && (BOOST_GCC+0) < 40700) || defined(__SUNPRO_CC))
// This macro indicates that the compiler does not support allocating eax:edx or rax:rdx register pairs ("A") in asm blocks
#define BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS
#define BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS
#endif
#if defined(__i386__) && defined(__PIC__) && !(defined(__clang__) || (defined(BOOST_GCC) && (BOOST_GCC+0) >= 50100))
// This macro indicates that asm blocks should preserve ebx value unchanged. Some compilers are able to maintain ebx themselves
// around the asm blocks. For those compilers we don't need to save/restore ebx in asm blocks.
#define BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX
#endif
#if defined(BOOST_NO_CXX11_HDR_TYPE_TRAITS)

View File

@@ -21,7 +21,7 @@
#include <boost/atomic/detail/config.hpp>
#include <boost/atomic/detail/storage_type.hpp>
#include <boost/atomic/capabilities.hpp>
#if defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS) && !defined(BOOST_ATOMIC_DETAIL_HAS_BUILTIN_MEMCPY)
#if defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS) && !defined(BOOST_ATOMIC_DETAIL_HAS_BUILTIN_MEMCPY)
#include <cstring>
#endif
@@ -33,6 +33,14 @@ namespace boost {
namespace atomics {
namespace detail {
// Note: In the 32-bit PIC code guarded with BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX below we have to avoid using memory
// operand constraints because the compiler may choose to use ebx as the base register for that operand. At least, clang
// is known to do that. For this reason we have to pre-compute a pointer to storage and pass it in edi. For the same reason
// we cannot save ebx to the stack with a mov instruction, so we use esi as a scratch register and restore it afterwards.
// Alternatively, we could push/pop the register to the stack, but exchanging the registers is faster.
// The need to pass a pointer in edi is a bit wasteful because normally the memory operand would use a base pointer
// with an offset (e.g. `this` + offset). But unfortunately, there seems to be no way around it.
#if defined(BOOST_ATOMIC_DETAIL_X86_HAS_CMPXCHG8B)
template< bool Signed >
@@ -80,23 +88,22 @@ struct gcc_dcas_x86
}
else
{
#if defined(__PIC__)
uint32_t scratch;
#if defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
__asm__ __volatile__
(
"movl %%ebx, %[scratch]\n\t"
"xchgl %%ebx, %%esi\n\t"
"movl %%eax, %%ebx\n\t"
"movl %[dest_lo], %%eax\n\t"
"movl %[dest_hi], %%edx\n\t"
"movl (%[dest]), %%eax\n\t"
"movl 4(%[dest]), %%edx\n\t"
".align 16\n\t"
"1: lock; cmpxchg8b %[dest_lo]\n\t"
"1: lock; cmpxchg8b (%[dest])\n\t"
"jne 1b\n\t"
"movl %[scratch], %%ebx\n\t"
: [scratch] "=m" (scratch), [dest_lo] "=m" (storage), [dest_hi] "=m" (reinterpret_cast< volatile uint32_t* >(&storage)[1])
: "a" ((uint32_t)v), "c" ((uint32_t)(v >> 32))
"xchgl %%ebx, %%esi\n\t"
:
: "a" ((uint32_t)v), "c" ((uint32_t)(v >> 32)), [dest] "D" (&storage)
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "edx", "memory"
);
#else // defined(__PIC__)
#else // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
__asm__ __volatile__
(
"movl %[dest_lo], %%eax\n\t"
@@ -108,7 +115,7 @@ struct gcc_dcas_x86
: [value_lo] "b" ((uint32_t)v), "c" ((uint32_t)(v >> 32))
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "eax", "edx", "memory"
);
#endif // defined(__PIC__)
#endif // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
}
}
@@ -154,7 +161,7 @@ struct gcc_dcas_x86
#if defined(__clang__)
// Clang cannot allocate eax:edx register pairs but it has sync intrinsics
value = __sync_val_compare_and_swap(&storage, (storage_type)0, (storage_type)0);
#elif defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#elif defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
uint32_t value_bits[2];
// We don't care for comparison result here; the previous value will be stored into value anyway.
// Also we don't care for ebx and ecx values, they just have to be equal to eax and edx before cmpxchg8b.
@@ -196,54 +203,35 @@ struct gcc_dcas_x86
expected = __sync_val_compare_and_swap(&storage, old_expected, desired);
return expected == old_expected;
#elif defined(__PIC__)
#elif defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
// Make sure ebx is saved and restored properly in case
// of position independent code. To make this work
// setup register constraints such that ebx can not be
// used by accident e.g. as base address for the variable
// to be modified. Accessing "scratch" should always be okay,
// as it can only be placed on the stack (and therefore
// accessed through ebp or esp only).
//
// In theory, could push/pop ebx onto/off the stack, but movs
// to a prepared stack slot turn out to be faster.
uint32_t scratch;
bool success;
#if defined(BOOST_ATOMIC_DETAIL_ASM_HAS_FLAG_OUTPUTS)
__asm__ __volatile__
(
"movl %%ebx, %[scratch]\n\t"
"movl %[desired_lo], %%ebx\n\t"
"xchgl %%ebx, %[desired_lo]\n\t"
"lock; cmpxchg8b (%[dest])\n\t"
"movl %[scratch], %%ebx\n\t"
: "+A" (expected), [scratch] "=m" (scratch), [success] "=@ccz" (success)
: [desired_lo] "Sm" ((uint32_t)desired), "c" ((uint32_t)(desired >> 32)), [dest] "D" (&storage)
"xchgl %%ebx, %[desired_lo]\n\t"
: "+A" (expected), [success] "=@ccz" (success)
: [desired_lo] "S" ((uint32_t)desired), "c" ((uint32_t)(desired >> 32)), [dest] "D" (&storage)
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "memory"
);
#else // defined(BOOST_ATOMIC_DETAIL_ASM_HAS_FLAG_OUTPUTS)
__asm__ __volatile__
(
"movl %%ebx, %[scratch]\n\t"
"movl %[desired_lo], %%ebx\n\t"
"xchgl %%ebx, %[desired_lo]\n\t"
"lock; cmpxchg8b (%[dest])\n\t"
"movl %[scratch], %%ebx\n\t"
"xchgl %%ebx, %[desired_lo]\n\t"
"sete %[success]\n\t"
#if !defined(BOOST_ATOMIC_DETAIL_NO_ASM_CONSTRAINT_ALTERNATIVES)
: "+A,A,A,A,A,A" (expected), [scratch] "=m,m,m,m,m,m" (scratch), [success] "=q,m,q,m,q,m" (success)
: [desired_lo] "S,S,D,D,m,m" ((uint32_t)desired), "c,c,c,c,c,c" ((uint32_t)(desired >> 32)), [dest] "D,D,S,S,D,D" (&storage)
#else
: "+A" (expected), [scratch] "=m" (scratch), [success] "=q" (success)
: "+A" (expected), [success] "=qm" (success)
: [desired_lo] "S" ((uint32_t)desired), "c" ((uint32_t)(desired >> 32)), [dest] "D" (&storage)
#endif
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "memory"
);
#endif // defined(BOOST_ATOMIC_DETAIL_ASM_HAS_FLAG_OUTPUTS)
return success;
#else // defined(__PIC__)
#else // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
bool success;
#if defined(BOOST_ATOMIC_DETAIL_ASM_HAS_FLAG_OUTPUTS)
@@ -272,7 +260,7 @@ struct gcc_dcas_x86
return success;
#endif // defined(__PIC__)
#endif // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
}
static BOOST_FORCEINLINE bool compare_exchange_weak(
@@ -283,26 +271,45 @@ struct gcc_dcas_x86
static BOOST_FORCEINLINE storage_type exchange(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT
{
#if defined(__PIC__)
uint32_t scratch;
#if defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
#if defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
uint32_t old_bits[2];
__asm__ __volatile__
(
"movl %%ebx, %[scratch]\n\t"
"movl %%eax, %%ebx\n\t"
"movl %%edx, %%ecx\n\t"
"movl %[dest_lo], %%eax\n\t"
"movl %[dest_hi], %%edx\n\t"
"xchgl %%ebx, %%esi\n\t"
"movl (%[dest]), %%eax\n\t"
"movl 4(%[dest]), %%edx\n\t"
".align 16\n\t"
"1: lock; cmpxchg8b %[dest_lo]\n\t"
"1: lock; cmpxchg8b (%[dest])\n\t"
"jne 1b\n\t"
"movl %[scratch], %%ebx\n\t"
: "+A" (v), [scratch] "=m" (scratch), [dest_lo] "+m" (storage), [dest_hi] "+m" (reinterpret_cast< volatile uint32_t* >(&storage)[1])
:
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "ecx", "memory"
"xchgl %%ebx, %%esi\n\t"
: "=a" (old_bits[0]), "=d" (old_bits[1])
: "S" ((uint32_t)v), "c" ((uint32_t)(v >> 32)), [dest] "D" (&storage)
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "memory"
);
return v;
#else // defined(__PIC__)
#if defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
storage_type old_value;
BOOST_ATOMIC_DETAIL_MEMCPY(&old_value, old_bits, sizeof(old_value));
return old_value;
#else // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
storage_type old_value;
__asm__ __volatile__
(
"xchgl %%ebx, %%esi\n\t"
"movl (%[dest]), %%eax\n\t"
"movl 4(%[dest]), %%edx\n\t"
".align 16\n\t"
"1: lock; cmpxchg8b (%[dest])\n\t"
"jne 1b\n\t"
"xchgl %%ebx, %%esi\n\t"
: "=A" (old_value)
: "S" ((uint32_t)v), "c" ((uint32_t)(v >> 32)), [dest] "D" (&storage)
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "memory"
);
return old_value;
#endif // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
#else // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
#if defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
uint32_t old_bits[2];
__asm__ __volatile__
(
@@ -319,7 +326,7 @@ struct gcc_dcas_x86
storage_type old_value;
BOOST_ATOMIC_DETAIL_MEMCPY(&old_value, old_bits, sizeof(old_value));
return old_value;
#else // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#else // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
storage_type old_value;
__asm__ __volatile__
(
@@ -333,8 +340,8 @@ struct gcc_dcas_x86
: BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC_COMMA "memory"
);
return old_value;
#endif // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#endif // defined(__PIC__)
#endif // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
#endif // defined(BOOST_ATOMIC_DETAIL_X86_ASM_PRESERVE_EBX)
}
};
@@ -371,7 +378,7 @@ struct gcc_dcas_x86_64
// Clang cannot allocate rax:rdx register pairs but it has sync intrinsics
storage_type value = storage_type();
return __sync_val_compare_and_swap(&storage, value, value);
#elif defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#elif defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
// Some compilers can't allocate rax:rdx register pair either and also don't support 128-bit __sync_val_compare_and_swap
uint64_t value_bits[2];
@@ -390,7 +397,7 @@ struct gcc_dcas_x86_64
storage_type value;
BOOST_ATOMIC_DETAIL_MEMCPY(&value, value_bits, sizeof(value));
return value;
#else // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#else // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
storage_type value;
// We don't care for comparison result here; the previous value will be stored into value anyway.
@@ -419,7 +426,7 @@ struct gcc_dcas_x86_64
expected = __sync_val_compare_and_swap(&storage, old_expected, desired);
return expected == old_expected;
#elif defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#elif defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
// Some compilers can't allocate rax:rdx register pair either but also don't support 128-bit __sync_val_compare_and_swap
bool success;
@@ -434,7 +441,7 @@ struct gcc_dcas_x86_64
return success;
#else // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#else // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
bool success;
#if defined(BOOST_ATOMIC_DETAIL_ASM_HAS_FLAG_OUTPUTS)
@@ -463,7 +470,7 @@ struct gcc_dcas_x86_64
return success;
#endif // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#endif // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
}
static BOOST_FORCEINLINE bool compare_exchange_weak(
@@ -474,7 +481,7 @@ struct gcc_dcas_x86_64
static BOOST_FORCEINLINE storage_type exchange(storage_type volatile& storage, storage_type v, memory_order) BOOST_NOEXCEPT
{
#if defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#if defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
uint64_t old_bits[2];
__asm__ __volatile__
(
@@ -491,7 +498,7 @@ struct gcc_dcas_x86_64
storage_type old_value;
BOOST_ATOMIC_DETAIL_MEMCPY(&old_value, old_bits, sizeof(old_value));
return old_value;
#else // defined(BOOST_ATOMIC_DETAIL_NO_ASM_AX_DX_PAIRS)
#else // defined(BOOST_ATOMIC_DETAIL_X86_NO_ASM_AX_DX_PAIRS)
storage_type old_value;
__asm__ __volatile__
(