From b2c6d37a1afb9c136f5f86f6871e720435159da9 Mon Sep 17 00:00:00 2001 From: Andrey Semashev Date: Fri, 14 Jul 2017 19:25:01 +0300 Subject: [PATCH] Added byte/word-wide implementations of atomic ops on ARM. Use ldrexb/w and strexb/w on ARMv7 and later to implement byte/word-wide atomic ops. On the older ARM versions we still have to use 32-bit widening implementation. Also allowed immediate constants in some of the operations to improve generated code. Common ARM code extracted to a separate header to reuse with extra ops. --- include/boost/atomic/detail/ops_gcc_arm.hpp | 654 +++++++++++++++--- .../atomic/detail/ops_gcc_arm_common.hpp | 133 ++++ 2 files changed, 672 insertions(+), 115 deletions(-) create mode 100644 include/boost/atomic/detail/ops_gcc_arm_common.hpp diff --git a/include/boost/atomic/detail/ops_gcc_arm.hpp b/include/boost/atomic/detail/ops_gcc_arm.hpp index 4513abd..0cea16b 100644 --- a/include/boost/atomic/detail/ops_gcc_arm.hpp +++ b/include/boost/atomic/detail/ops_gcc_arm.hpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #ifdef BOOST_HAS_PRAGMA_ONCE @@ -54,105 +54,6 @@ namespace detail { // (Actually it looks like these are available from version 6k onwards.) // FIXME these are not yet used; should be mostly a matter of copy-and-paste. // I think you can supply an immediate offset to the address. -// -// A memory barrier is effected using a "co-processor 15" instruction, -// though a separate assembler mnemonic is available for it in v7. -// -// "Thumb 1" is a subset of the ARM instruction set that uses a 16-bit encoding. It -// doesn't include all instructions and in particular it doesn't include the co-processor -// instruction used for the memory barrier or the load-locked/store-conditional -// instructions. So, if we're compiling in "Thumb 1" mode, we need to wrap all of our -// asm blocks with code to temporarily change to ARM mode. -// -// You can only change between ARM and Thumb modes when branching using the bx instruction. -// bx takes an address specified in a register. The least significant bit of the address -// indicates the mode, so 1 is added to indicate that the destination code is Thumb. -// A temporary register is needed for the address and is passed as an argument to these -// macros. It must be one of the "low" registers accessible to Thumb code, specified -// using the "l" attribute in the asm statement. -// -// Architecture v7 introduces "Thumb 2", which does include (almost?) all of the ARM -// instruction set. (Actually, there was an extension of v6 called v6T2 which supported -// "Thumb 2" mode, but its architecture manual is no longer available, referring to v7.) -// So in v7 we don't need to change to ARM mode; we can write "universal -// assembler" which will assemble to Thumb 2 or ARM code as appropriate. The only thing -// we need to do to make this "universal" assembler mode work is to insert "IT" instructions -// to annotate the conditional instructions. These are ignored in other modes (e.g. v6), -// so they can always be present. - -// A note about memory_order_consume. Technically, this architecture allows to avoid -// unnecessary memory barrier after consume load since it supports data dependency ordering. -// However, some compiler optimizations may break a seemingly valid code relying on data -// dependency tracking by injecting bogus branches to aid out of order execution. -// This may happen not only in Boost.Atomic code but also in user's code, which we have no -// control of. 
See this thread: http://lists.boost.org/Archives/boost/2014/06/213890.php. -// For this reason we promote memory_order_consume to memory_order_acquire. - -#if defined(__thumb__) && !defined(__thumb2__) -#define BOOST_ATOMIC_DETAIL_ARM_ASM_START(TMPREG) "adr " #TMPREG ", 8f\n" "bx " #TMPREG "\n" ".arm\n" ".align 4\n" "8:\n" -#define BOOST_ATOMIC_DETAIL_ARM_ASM_END(TMPREG) "adr " #TMPREG ", 9f + 1\n" "bx " #TMPREG "\n" ".thumb\n" ".align 2\n" "9:\n" -#define BOOST_ATOMIC_DETAIL_ARM_ASM_TMPREG_CONSTRAINT(var) "=&l" (var) -#else -// The tmpreg may be wasted in this case, which is non-optimal. -#define BOOST_ATOMIC_DETAIL_ARM_ASM_START(TMPREG) -#define BOOST_ATOMIC_DETAIL_ARM_ASM_END(TMPREG) -#define BOOST_ATOMIC_DETAIL_ARM_ASM_TMPREG_CONSTRAINT(var) "=&r" (var) -#endif - -struct gcc_arm_operations_base -{ - static BOOST_CONSTEXPR_OR_CONST bool is_always_lock_free = true; - - static BOOST_FORCEINLINE void fence_before(memory_order order) BOOST_NOEXCEPT - { - if ((order & memory_order_release) != 0) - hardware_full_fence(); - } - - static BOOST_FORCEINLINE void fence_after(memory_order order) BOOST_NOEXCEPT - { - if ((order & (memory_order_consume | memory_order_acquire)) != 0) - hardware_full_fence(); - } - - static BOOST_FORCEINLINE void fence_after_store(memory_order order) BOOST_NOEXCEPT - { - if (order == memory_order_seq_cst) - hardware_full_fence(); - } - - static BOOST_FORCEINLINE void hardware_full_fence() BOOST_NOEXCEPT - { -#if defined(BOOST_ATOMIC_DETAIL_ARM_HAS_DMB) - // Older binutils (supposedly, older than 2.21.1) didn't support symbolic or numeric arguments of the "dmb" instruction such as "ish" or "#11". - // As a workaround we have to inject encoded bytes of the instruction. There are two encodings for the instruction: ARM and Thumb. See ARM Architecture Reference Manual, A8.8.43. - // Since we cannot detect binutils version at compile time, we'll have to always use this hack. 
- __asm__ __volatile__ - ( -#if defined(__thumb2__) - ".short 0xF3BF, 0x8F5B\n" // dmb ish -#else - ".word 0xF57FF05B\n" // dmb ish -#endif - : - : - : "memory" - ); -#else - int tmp; - __asm__ __volatile__ - ( - BOOST_ATOMIC_DETAIL_ARM_ASM_START(%0) - "mcr\tp15, 0, r0, c7, c10, 5\n" - BOOST_ATOMIC_DETAIL_ARM_ASM_END(%0) - : "=&l" (tmp) - : - : "memory" - ); -#endif - } -}; - template< bool Signed > struct operations< 4u, Signed > : @@ -221,7 +122,7 @@ struct operations< 4u, Signed > : [success] "=&r" (success), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [expected] "r" (expected), // %4 + : [expected] "Ir" (expected), // %4 [desired] "r" (desired) // %5 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); @@ -257,7 +158,7 @@ struct operations< 4u, Signed > : [success] "=&r" (success), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [expected] "r" (expected), // %4 + : [expected] "Ir" (expected), // %4 [desired] "r" (desired) // %5 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); @@ -288,7 +189,7 @@ struct operations< 4u, Signed > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -314,7 +215,7 @@ struct operations< 4u, Signed > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -340,7 +241,7 @@ struct operations< 4u, Signed > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -366,7 +267,7 @@ struct operations< 4u, Signed > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -392,7 +293,7 @@ struct operations< 4u, Signed > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -410,6 +311,266 @@ struct operations< 4u, Signed > : } }; +#if defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXB_STREXB) + +template< bool Signed > +struct operations< 1u, Signed > : + public gcc_arm_operations_base +{ + typedef typename make_storage_type< 1u, Signed >::type storage_type; + typedef typename make_storage_type< 1u, Signed >::aligned aligned_storage_type; + typedef typename make_storage_type< 4u, Signed >::type extended_storage_type; + + static BOOST_CONSTEXPR_OR_CONST std::size_t storage_size = 1u; + static BOOST_CONSTEXPR_OR_CONST bool is_signed = Signed; + + static BOOST_FORCEINLINE void store(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + storage = v; + fence_after_store(order); + } + + static BOOST_FORCEINLINE storage_type load(storage_type const volatile& storage, memory_order order) BOOST_NOEXCEPT + { + storage_type v = storage; + fence_after(order); + return v; + } + + static BOOST_FORCEINLINE storage_type exchange(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + extended_storage_type original; + fence_before(order); + uint32_t tmp; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb 
%[original], %[storage]\n" // load the original value + "strexb %[tmp], %[value], %[storage]\n" // store the replacement, tmp = store failed + "teq %[tmp], #0\n" // check if store succeeded + "bne 1b\n" + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [tmp] "=&l" (tmp), [original] "=&r" (original), [storage] "+Q" (storage) + : [value] "r" ((extended_storage_type)v) + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE bool compare_exchange_weak( + storage_type volatile& storage, storage_type& expected, storage_type desired, memory_order success_order, memory_order failure_order) BOOST_NOEXCEPT + { + fence_before(success_order); + uint32_t success; + uint32_t tmp; + extended_storage_type original; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "mov %[success], #0\n" // success = 0 + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "cmp %[original], %[expected]\n" // flags = original==expected + "itt eq\n" // [hint that the following 2 instructions are conditional on flags.equal] + "strexbeq %[success], %[desired], %[storage]\n" // if (flags.equal) *(&storage) = desired, success = store failed + "eoreq %[success], %[success], #1\n" // if (flags.equal) success ^= 1 (i.e. make it 1 if store succeeded) + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [success] "=&r" (success), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [expected] "Ir" ((extended_storage_type)expected), // %4 + [desired] "r" ((extended_storage_type)desired) // %5 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = static_cast< storage_type >(original); + return !!success; + } + + static BOOST_FORCEINLINE bool compare_exchange_strong( + storage_type volatile& storage, storage_type& expected, storage_type desired, memory_order success_order, memory_order failure_order) BOOST_NOEXCEPT + { + fence_before(success_order); + uint32_t success; + uint32_t tmp; + extended_storage_type original; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "mov %[success], #0\n" // success = 0 + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "cmp %[original], %[expected]\n" // flags = original==expected + "bne 2f\n" // if (!flags.equal) goto end + "strexb %[success], %[desired], %[storage]\n" // *(&storage) = desired, success = store failed + "eors %[success], %[success], #1\n" // success ^= 1 (i.e. 
make it 1 if store succeeded); flags.equal = success == 0 + "beq 1b\n" // if (flags.equal) goto retry + "2:\n" + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [success] "=&r" (success), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [expected] "Ir" ((extended_storage_type)expected), // %4 + [desired] "r" ((extended_storage_type)desired) // %5 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = static_cast< storage_type >(original); + return !!success; + } + + static BOOST_FORCEINLINE storage_type fetch_add(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "add %[result], %[original], %[value]\n" // result = original + value + "strexb %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_sub(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "sub %[result], %[original], %[value]\n" // result = original - value + "strexb %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_and(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "and %[result], %[original], %[value]\n" // result = original & value + "strexb %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_or(storage_type volatile& storage, storage_type v, 
memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "orr %[result], %[original], %[value]\n" // result = original | value + "strexb %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_xor(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexb %[original], %[storage]\n" // original = *(&storage) + "eor %[result], %[original], %[value]\n" // result = original ^ value + "strexb %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE bool test_and_set(storage_type volatile& storage, memory_order order) BOOST_NOEXCEPT + { + return !!exchange(storage, (storage_type)1, order); + } + + static BOOST_FORCEINLINE void clear(storage_type volatile& storage, memory_order order) BOOST_NOEXCEPT + { + store(storage, 0, order); + } +}; + +#else // defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXB_STREXB) template< > struct operations< 1u, false > : @@ -438,7 +599,7 @@ struct operations< 1u, false > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -465,7 +626,7 @@ struct operations< 1u, false > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -500,7 +661,7 @@ struct operations< 1u, true > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -527,7 +688,7 @@ struct operations< 1u, true > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -535,6 +696,268 @@ struct operations< 1u, true > : } }; +#endif // defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXB_STREXB) + +#if defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXH_STREXH) + +template< bool Signed > +struct operations< 2u, Signed > : + public gcc_arm_operations_base +{ + typedef typename make_storage_type< 
2u, Signed >::type storage_type; + typedef typename make_storage_type< 2u, Signed >::aligned aligned_storage_type; + typedef typename make_storage_type< 4u, Signed >::type extended_storage_type; + + static BOOST_CONSTEXPR_OR_CONST std::size_t storage_size = 2u; + static BOOST_CONSTEXPR_OR_CONST bool is_signed = Signed; + + static BOOST_FORCEINLINE void store(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + storage = v; + fence_after_store(order); + } + + static BOOST_FORCEINLINE storage_type load(storage_type const volatile& storage, memory_order order) BOOST_NOEXCEPT + { + storage_type v = storage; + fence_after(order); + return v; + } + + static BOOST_FORCEINLINE storage_type exchange(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + extended_storage_type original; + fence_before(order); + uint32_t tmp; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // load the original value + "strexh %[tmp], %[value], %[storage]\n" // store the replacement, tmp = store failed + "teq %[tmp], #0\n" // check if store succeeded + "bne 1b\n" + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [tmp] "=&l" (tmp), [original] "=&r" (original), [storage] "+Q" (storage) + : [value] "r" ((extended_storage_type)v) + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE bool compare_exchange_weak( + storage_type volatile& storage, storage_type& expected, storage_type desired, memory_order success_order, memory_order failure_order) BOOST_NOEXCEPT + { + fence_before(success_order); + uint32_t success; + uint32_t tmp; + extended_storage_type original; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "mov %[success], #0\n" // success = 0 + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "cmp %[original], %[expected]\n" // flags = original==expected + "itt eq\n" // [hint that the following 2 instructions are conditional on flags.equal] + "strexheq %[success], %[desired], %[storage]\n" // if (flags.equal) *(&storage) = desired, success = store failed + "eoreq %[success], %[success], #1\n" // if (flags.equal) success ^= 1 (i.e. 
make it 1 if store succeeded) + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [success] "=&r" (success), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [expected] "Ir" ((extended_storage_type)expected), // %4 + [desired] "r" ((extended_storage_type)desired) // %5 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = static_cast< storage_type >(original); + return !!success; + } + + static BOOST_FORCEINLINE bool compare_exchange_strong( + storage_type volatile& storage, storage_type& expected, storage_type desired, memory_order success_order, memory_order failure_order) BOOST_NOEXCEPT + { + fence_before(success_order); + uint32_t success; + uint32_t tmp; + extended_storage_type original; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "mov %[success], #0\n" // success = 0 + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "cmp %[original], %[expected]\n" // flags = original==expected + "bne 2f\n" // if (!flags.equal) goto end + "strexh %[success], %[desired], %[storage]\n" // *(&storage) = desired, success = store failed + "eors %[success], %[success], #1\n" // success ^= 1 (i.e. make it 1 if store succeeded); flags.equal = success == 0 + "beq 1b\n" // if (flags.equal) goto retry + "2:\n" + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [success] "=&r" (success), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [expected] "Ir" ((extended_storage_type)expected), // %4 + [desired] "r" ((extended_storage_type)desired) // %5 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = static_cast< storage_type >(original); + return !!success; + } + + static BOOST_FORCEINLINE storage_type fetch_add(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "add %[result], %[original], %[value]\n" // result = original + value + "strexh %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_sub(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "sub %[result], %[original], %[value]\n" // result = original - value + "strexh %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), 
// %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_and(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "and %[result], %[original], %[value]\n" // result = original & value + "strexh %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_or(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "orr %[result], %[original], %[value]\n" // result = original | value + "strexh %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE storage_type fetch_xor(storage_type volatile& storage, storage_type v, memory_order order) BOOST_NOEXCEPT + { + fence_before(order); + uint32_t tmp; + extended_storage_type original, result; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%[tmp]) + "1:\n" + "ldrexh %[original], %[storage]\n" // original = *(&storage) + "eor %[result], %[original], %[value]\n" // result = original ^ value + "strexh %[tmp], %[result], %[storage]\n" // *(&storage) = result, tmp = store failed + "teq %[tmp], #0\n" // flags = tmp==0 + "bne 1b\n" // if (!flags.equal) goto retry + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%[tmp]) + : [original] "=&r" (original), // %0 + [result] "=&r" (result), // %1 + [tmp] "=&l" (tmp), // %2 + [storage] "+Q" (storage) // %3 + : [value] "Ir" ((extended_storage_type)v) // %4 + : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC + ); + fence_after(order); + return static_cast< storage_type >(original); + } + + static BOOST_FORCEINLINE bool test_and_set(storage_type volatile& storage, memory_order order) BOOST_NOEXCEPT + { + return !!exchange(storage, (storage_type)1, order); + } + + static BOOST_FORCEINLINE void clear(storage_type volatile& storage, memory_order order) BOOST_NOEXCEPT + { + store(storage, 0, order); + } +}; + +#else // defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXH_STREXH) template< > struct operations< 2u, false > : @@ -563,7 +986,7 @@ struct operations< 2u, 
false > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -590,7 +1013,7 @@ struct operations< 2u, false > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -625,7 +1048,7 @@ struct operations< 2u, true > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -652,7 +1075,7 @@ struct operations< 2u, true > : [result] "=&r" (result), // %1 [tmp] "=&l" (tmp), // %2 [storage] "+Q" (storage) // %3 - : [value] "r" (v) // %4 + : [value] "Ir" (v) // %4 : BOOST_ATOMIC_DETAIL_ASM_CLOBBER_CC ); fence_after(order); @@ -660,6 +1083,7 @@ struct operations< 2u, true > : } }; +#endif // defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXH_STREXH) #if defined(BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXD_STREXD) diff --git a/include/boost/atomic/detail/ops_gcc_arm_common.hpp b/include/boost/atomic/detail/ops_gcc_arm_common.hpp new file mode 100644 index 0000000..9ac08ee --- /dev/null +++ b/include/boost/atomic/detail/ops_gcc_arm_common.hpp @@ -0,0 +1,133 @@ +/* + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + * + * Copyright (c) 2009 Helge Bahmann + * Copyright (c) 2013 Tim Blechmann + * Copyright (c) 2014 Andrey Semashev + */ +/*! + * \file atomic/detail/ops_gcc_arm_common.hpp + * + * This header contains basic utilities for gcc ARM backend. + */ + +#ifndef BOOST_ATOMIC_DETAIL_OPS_GCC_ARM_COMMON_HPP_INCLUDED_ +#define BOOST_ATOMIC_DETAIL_OPS_GCC_ARM_COMMON_HPP_INCLUDED_ + +#include +#include +#include + +#ifdef BOOST_HAS_PRAGMA_ONCE +#pragma once +#endif + +namespace boost { +namespace atomics { +namespace detail { + +// A memory barrier is effected using a "co-processor 15" instruction, +// though a separate assembler mnemonic is available for it in v7. +// +// "Thumb 1" is a subset of the ARM instruction set that uses a 16-bit encoding. It +// doesn't include all instructions and in particular it doesn't include the co-processor +// instruction used for the memory barrier or the load-locked/store-conditional +// instructions. So, if we're compiling in "Thumb 1" mode, we need to wrap all of our +// asm blocks with code to temporarily change to ARM mode. +// +// You can only change between ARM and Thumb modes when branching using the bx instruction. +// bx takes an address specified in a register. The least significant bit of the address +// indicates the mode, so 1 is added to indicate that the destination code is Thumb. +// A temporary register is needed for the address and is passed as an argument to these +// macros. It must be one of the "low" registers accessible to Thumb code, specified +// using the "l" attribute in the asm statement. +// +// Architecture v7 introduces "Thumb 2", which does include (almost?) all of the ARM +// instruction set. (Actually, there was an extension of v6 called v6T2 which supported +// "Thumb 2" mode, but its architecture manual is no longer available, referring to v7.) +// So in v7 we don't need to change to ARM mode; we can write "universal +// assembler" which will assemble to Thumb 2 or ARM code as appropriate. 
The only thing +// we need to do to make this "universal" assembler mode work is to insert "IT" instructions +// to annotate the conditional instructions. These are ignored in other modes (e.g. v6), +// so they can always be present. + +// A note about memory_order_consume. Technically, this architecture allows to avoid +// unnecessary memory barrier after consume load since it supports data dependency ordering. +// However, some compiler optimizations may break a seemingly valid code relying on data +// dependency tracking by injecting bogus branches to aid out of order execution. +// This may happen not only in Boost.Atomic code but also in user's code, which we have no +// control of. See this thread: http://lists.boost.org/Archives/boost/2014/06/213890.php. +// For this reason we promote memory_order_consume to memory_order_acquire. + +#if defined(__thumb__) && !defined(__thumb2__) +#define BOOST_ATOMIC_DETAIL_ARM_ASM_START(TMPREG) "adr " #TMPREG ", 8f\n" "bx " #TMPREG "\n" ".arm\n" ".align 4\n" "8:\n" +#define BOOST_ATOMIC_DETAIL_ARM_ASM_END(TMPREG) "adr " #TMPREG ", 9f + 1\n" "bx " #TMPREG "\n" ".thumb\n" ".align 2\n" "9:\n" +#define BOOST_ATOMIC_DETAIL_ARM_ASM_TMPREG_CONSTRAINT(var) "=&l" (var) +#else +// The tmpreg may be wasted in this case, which is non-optimal. +#define BOOST_ATOMIC_DETAIL_ARM_ASM_START(TMPREG) +#define BOOST_ATOMIC_DETAIL_ARM_ASM_END(TMPREG) +#define BOOST_ATOMIC_DETAIL_ARM_ASM_TMPREG_CONSTRAINT(var) "=&r" (var) +#endif + +struct gcc_arm_operations_base +{ + static BOOST_CONSTEXPR_OR_CONST bool is_always_lock_free = true; + + static BOOST_FORCEINLINE void fence_before(memory_order order) BOOST_NOEXCEPT + { + if ((order & memory_order_release) != 0) + hardware_full_fence(); + } + + static BOOST_FORCEINLINE void fence_after(memory_order order) BOOST_NOEXCEPT + { + if ((order & (memory_order_consume | memory_order_acquire)) != 0) + hardware_full_fence(); + } + + static BOOST_FORCEINLINE void fence_after_store(memory_order order) BOOST_NOEXCEPT + { + if (order == memory_order_seq_cst) + hardware_full_fence(); + } + + static BOOST_FORCEINLINE void hardware_full_fence() BOOST_NOEXCEPT + { +#if defined(BOOST_ATOMIC_DETAIL_ARM_HAS_DMB) + // Older binutils (supposedly, older than 2.21.1) didn't support symbolic or numeric arguments of the "dmb" instruction such as "ish" or "#11". + // As a workaround we have to inject encoded bytes of the instruction. There are two encodings for the instruction: ARM and Thumb. See ARM Architecture Reference Manual, A8.8.43. + // Since we cannot detect binutils version at compile time, we'll have to always use this hack. + __asm__ __volatile__ + ( +#if defined(__thumb2__) + ".short 0xF3BF, 0x8F5B\n" // dmb ish +#else + ".word 0xF57FF05B\n" // dmb ish +#endif + : + : + : "memory" + ); +#else + uint32_t tmp; + __asm__ __volatile__ + ( + BOOST_ATOMIC_DETAIL_ARM_ASM_START(%0) + "mcr\tp15, 0, r0, c7, c10, 5\n" + BOOST_ATOMIC_DETAIL_ARM_ASM_END(%0) + : "=&l" (tmp) + : + : "memory" + ); +#endif + } +}; + +} // namespace detail +} // namespace atomics +} // namespace boost + +#endif // BOOST_ATOMIC_DETAIL_OPS_GCC_ARM_COMMON_HPP_INCLUDED_
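
The gcc_arm_operations_base class extracted into ops_gcc_arm_common.hpp turns C++ memory orders into full hardware barriers around each LL/SC sequence: fence_before() emits a barrier for any order containing release, fence_after() for any order containing acquire or consume (consume being promoted to acquire as explained in the header comment), and fence_after_store() only for seq_cst, and on ARM every one of those barriers is the same full "dmb ish". Boost's internal memory_order constants are bit-testable, hence the mask tests in the source; a rough portable analogue using std::atomic_thread_fence (illustrative names, not the patch's code) is:

    #include <atomic>

    // Sketch only: mirrors the ordering policy of gcc_arm_operations_base.
    inline void fence_before_sketch(std::memory_order order)
    {
        // barrier before the access for release-or-stronger orders
        if (order == std::memory_order_release ||
            order == std::memory_order_acq_rel ||
            order == std::memory_order_seq_cst)
            std::atomic_thread_fence(std::memory_order_seq_cst); // dmb ish
    }

    inline void fence_after_sketch(std::memory_order order)
    {
        // barrier after the access for consume/acquire-or-stronger orders
        if (order == std::memory_order_consume ||
            order == std::memory_order_acquire ||
            order == std::memory_order_acq_rel ||
            order == std::memory_order_seq_cst)
            std::atomic_thread_fence(std::memory_order_seq_cst); // dmb ish
    }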
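
All of the new byte and halfword operations follow the same exclusive-access retry pattern: ldrexb/ldrexh loads the old value (widened into a full 32-bit register, which is why the asm operands are cast to extended_storage_type), the new value is computed in a scratch register, and strexb/strexh attempts the store, branching back to the load if the exclusive reservation was lost. As a semantic sketch only (the patch emits the assembly directly), the fetch_add contract implemented by that loop can be written portably as a weak CAS loop:

    #include <atomic>
    #include <cstdint>

    // Sketch of what the byte-wide ldrexb/add/strexb loop computes.
    std::uint8_t fetch_add_sketch(std::atomic<std::uint8_t>& storage, std::uint8_t v,
                                  std::memory_order order)
    {
        std::uint8_t original = storage.load(std::memory_order_relaxed);   // ldrexb
        std::uint8_t desired;
        do
        {
            desired = static_cast<std::uint8_t>(original + v);             // add
        }
        while (!storage.compare_exchange_weak(original, desired,           // strexb; bne 1b
                                              order, std::memory_order_relaxed));
        return original; // value observed before the addition
    }

The weak form is the natural fit because strex may fail even without contention; the patch's compare_exchange_strong adds its own retry loop (the "beq 1b" back-edge) around the same sequence to hide such spurious failures.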
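
For context, a small usage sketch (assuming an ARMv7 target where BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXB_STREXB and BOOST_ATOMIC_DETAIL_ARM_HAS_LDREXH_STREXH are detected) of operations that now get dedicated sub-word implementations instead of the 32-bit widening fallback:

    #include <boost/atomic.hpp>
    #include <cstdint>

    int main()
    {
        boost::atomic<std::uint8_t>  flags(0);
        boost::atomic<std::uint16_t> count(0);

        flags.fetch_or(0x01, boost::memory_order_acq_rel);  // byte-wide ldrexb/orr/strexb loop
        count.fetch_add(1, boost::memory_order_relaxed);     // halfword-wide ldrexh/add/strexh loop

        std::uint8_t expected = 0x01;
        flags.compare_exchange_strong(expected, 0x03,
                                      boost::memory_order_acquire,
                                      boost::memory_order_relaxed);
        return 0;
    }

The "Ir" constraints introduced throughout the patch additionally allow small constants such as these to be encoded as immediates in the add/orr/eor instructions rather than being loaded into a register first.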