#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif

#ifndef _CUDA_BARRIER
#define _CUDA_BARRIER

#include "atomic"
#include "cstddef"

#include "detail/__config"

#include "detail/__pragma_push"

#include "detail/libcxx/include/barrier"

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template<std::size_t _Alignment>
struct aligned_size_t {
    static constexpr std::size_t align = _Alignment;
    std::size_t value;
    __host__ __device__
    explicit aligned_size_t(size_t __s) : value(__s) { }
    __host__ __device__
    operator size_t() const { return value; }
};
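
// Usage sketch (illustrative, not part of the library): aligned_size_t<N> wraps a
// byte count together with a compile-time promise that the copy is N-byte aligned,
// which memcpy_async can exploit to pick a wider copy path. The names `smem`,
// `gmem`, `bytes`, and `bar` below are hypothetical.
//
//   cuda::memcpy_async(smem, gmem, cuda::aligned_size_t<16>(bytes), bar);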

template<thread_scope _Sco, class _CompletionF = std::__empty_completion>
class barrier : public std::__barrier_base<_CompletionF, _Sco> {
    template<thread_scope>
    friend class pipeline;

    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
    barrier() = default;

    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
    barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
        : std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) {
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected) {
        new (__b) barrier(__expected);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) {
        new (__b) barrier(__expected, __completion);
    }
};
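
// The scoped barrier above mirrors the std::barrier interface, adding a thread
// scope and the init() friends, which placement-construct a barrier into storage
// whose constructor cannot run normally (e.g. __shared__ or pre-allocated device
// memory). Illustrative sketch only; `storage` and `expected` are hypothetical:
//
//   auto * bar = reinterpret_cast<cuda::barrier<cuda::thread_scope_device> *>(storage);
//   init(bar, expected);   // in-place construction instead of a constructor call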

struct __block_scope_barrier_base {};

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
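
// Forward declaration: barrier_native_handle() exposes the 64-bit mbarrier word
// that backs a block-scope barrier (defined below, after the block-scope
// specialization).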

__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b);

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template<>
class barrier<thread_scope_block, std::__empty_completion> : public __block_scope_barrier_base {
    using __barrier_base = std::__barrier_base<std::__empty_completion, (int)thread_scope_block>;
    __barrier_base __barrier;

    __device__
    friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier<thread_scope_block> & b);

public:
    using arrival_token = typename __barrier_base::arrival_token;

private:
    struct __poll_tester {
        barrier const* __this;
        arrival_token __phase;

        _LIBCUDACXX_INLINE_VISIBILITY
        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
            : __this(__this_)
            , __phase(_CUDA_VSTD::move(__phase_))
        {}

        inline _LIBCUDACXX_INLINE_VISIBILITY
        bool operator()() const
        {
            return __this->__try_wait(__phase);
        }
    };

    _LIBCUDACXX_INLINE_VISIBILITY
    bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            int __ready = 0;
            asm volatile ("{\n\t"
                    ".reg .pred p;\n\t"
                    "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t"
                    "selp.b32 %0, 1, 0, p;\n\t"
                    "}"
                : "=r"(__ready)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__phase)
                : "memory");
            return __ready;
        }
        else
#endif
        {
            return __barrier.__try_wait(std::move(__phase));
        }
    }

    template<thread_scope>
    friend class pipeline;

public:
    barrier() = default;

    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY
    barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier), "fatal error: bad barrier layout");
        init(this, __expected, __completion);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    ~barrier() {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.inval.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
#endif
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__b->__barrier)) {
            asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))),
                   "r"(static_cast<std::uint32_t>(__expected))
                : "memory");
        }
        else
#endif
        {
            new (&__b->__barrier) __barrier_base(__expected);
        }
    }

    _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
    arrival_token arrive(std::ptrdiff_t __update = 1)
    {
#if __CUDA_ARCH__
        if (__isShared(&__barrier)) {
            arrival_token __token;
#if __CUDA_ARCH__ >= 800
            if (__update > 1) {
                asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(__token)
                    : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))),
                      "r"(static_cast<std::uint32_t>(__update - 1))
                    : "memory");
            }
            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
                : "=l"(__token)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
#else
            unsigned int __activeA = __match_any_sync(__activemask(), __update);
            unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast<std::uintptr_t>(&__barrier));
            unsigned int __active = __activeA & __activeB;
            int __inc = __popc(__active) * __update;

            unsigned __laneid;
            // Note: "%%" is required in inline PTX to emit a literal '%' for the special register.
            asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__laneid));
            int __leader = __ffs(__active) - 1;

            // The lowest active lane arrives once on behalf of the matching lanes,
            // then broadcasts the token to the rest of them.
            if (__leader == static_cast<int>(__laneid))
            {
                __token = __barrier.arrive(__inc);
            }
            __token = __shfl_sync(__active, __token, __leader);
#endif
            return __token;
        }
        else
#endif
        {
            return __barrier.arrive(__update);
        }
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void wait(arrival_token && __phase) const
    {
        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
    }

    inline _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_wait()
    {
        wait(arrive());
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_drop()
    {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else
#endif
        {
            __barrier.arrive_and_drop();
        }
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    static constexpr ptrdiff_t max() noexcept
    {
        return (1 << 20) - 1;
    }
};
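
// Usage sketch (illustrative, not part of the library): a block-scope barrier in
// shared memory must be initialized with init() by a single thread and made
// visible to the block before use. `example_kernel` is a hypothetical name.
//
//   #include <cuda/barrier>
//
//   __global__ void example_kernel() {
//       __shared__ cuda::barrier<cuda::thread_scope_block> bar;
//       if (threadIdx.x == 0) {
//           init(&bar, blockDim.x);   // one thread constructs the barrier
//       }
//       __syncthreads();              // publish the initialized barrier
//       bar.arrive_and_wait();        // all threads of the block synchronize on it
//   }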

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b) {
    return reinterpret_cast<std::uint64_t *>(&b.__barrier);
}
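
// Illustrative sketch (hypothetical names): the returned pointer is the raw
// 64-bit mbarrier word, which can be handed to inline PTX or other code that
// drives mbarriers directly; it is only useful for a barrier that lives in
// shared memory on sm_80 and up.
//
//   std::uint64_t * raw = cuda::device::barrier_native_handle(bar);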

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template<>
class barrier<thread_scope_thread, std::__empty_completion> : private barrier<thread_scope_block> {
    using __base = barrier<thread_scope_block>;

public:
    using __base::__base;

    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        init(static_cast<__base *>(__b), __expected, __completion);
    }

    using __base::arrive;
    using __base::wait;
    using __base::arrive_and_wait;
    using __base::arrive_and_drop;
    using __base::max;
};
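
// __strided_memcpy (below): each thread of a cooperating group copies
// _Alignment-byte chunks starting at its rank and stepping by the group stride,
// so the group as a whole covers __total_size bytes; a single caller (stride 1)
// degenerates to one plain memcpy.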

template<std::size_t _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) {
    if (__stride == 1) {
        memcpy(__destination, __source, __total_size);
    }
    else {
        for (std::size_t __offset = __rank * _Alignment; __offset < __total_size; __offset += __stride * _Alignment) {
            memcpy(__destination + __offset, __source + __offset, _Alignment);
        }
    }
}
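
// On sm_80 and up, the specializations below issue cp.async copies: 4- and
// 8-byte chunks use cp.async.ca, 16-byte chunks use cp.async.cg, and alignments
// above 16 reuse the 16-byte path. Alignments of 1 or 2 take the generic
// (synchronous) __strided_memcpy path, which is why __copy() returns false there
// and true on the asynchronous paths.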

#if __CUDA_ARCH__ >= 800
template<std::size_t _Alignment, bool _Large = (_Alignment > 16)>
struct __memcpy_async_impl {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        __strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride);
        return false;
    }
};

template<>
struct __memcpy_async_impl<4, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};

template<>
struct __memcpy_async_impl<8, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};

template<>
struct __memcpy_async_impl<16, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};

template<std::size_t _Alignment>
struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { };
#endif
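
// __memcpy_async_synchronize (below): when asynchronous copies were issued, they
// are either attached to a shared-memory mbarrier via cp.async.mbarrier.arrive,
// so their completion is observed through the barrier, or, when no suitable
// mbarrier is available, drained immediately with cp.async.wait_all.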

template<thread_scope _Sco, typename _CompF, bool _Is_mbarrier = (_Sco >= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) {
#if __CUDA_ARCH__ >= 800
    if (__is_async) {
        if (_Is_mbarrier && __isShared(&__barrier)) {
            asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else {
            asm volatile ("cp.async.wait_all;"
                ::: "memory");
        }
    }
#endif
}
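
// __memcpy_async (below): the common dispatcher behind the public overloads. On
// sm_80 and up it uses cp.async only for global-to-shared copies; when the static
// alignment is below 4 it probes the actual run-time alignment of the pointers
// and size (via __ffs of their OR) to pick the widest cp.async variant, and
// everything else falls back to the synchronous strided copy. The final
// __memcpy_async_synchronize call ties completion to __sync.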

template<std::size_t _Native_alignment, typename _Group, typename _Sync>
_LIBCUDACXX_INLINE_VISIBILITY
void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) {
    bool __is_async = false;

#if __CUDA_ARCH__ >= 800
    __is_async = __isShared(__destination) && __isGlobal(__source);

    if (__is_async) {
        if (_Native_alignment < 4) {
            auto __source_address = reinterpret_cast<std::uintptr_t>(__source);
            auto __destination_address = reinterpret_cast<std::uintptr_t>(__destination);

            auto _Alignment = __ffs(__source_address | __destination_address | __size);

            switch (_Alignment) {
            default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            case 4:  __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            case 3:  __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            case 2:
            case 1:  __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            }
        }
        else {
            __is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size());
        }
    }
    else
#endif
    {
        __strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size());
    }

    __memcpy_async_synchronize(__sync, __is_async);
}

struct __single_thread_group {
    _LIBCUDACXX_INLINE_VISIBILITY
    void sync() const {}
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t size() const { return 1; }
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t thread_rank() const { return 0; }
};

template<typename _Group, class _Tp, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {

#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<alignof(_Tp)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}

template<typename _Group, class _Tp, std::size_t _Alignment, thread_scope _Sco, typename _CompF, std::size_t _Larger_alignment = (alignof(_Tp) > _Alignment) ? alignof(_Tp) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {

#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}

template<class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}

template<typename _Group, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}

template<typename _Group, std::size_t _Alignment, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}

template<typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
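
// Usage sketch (illustrative, not part of the library): all threads of a block
// cooperate to stage data from global into shared memory, then wait on the
// barrier for the copy to complete. The kernel, `smem`, `gmem`, and `n` are
// hypothetical; the group comes from <cooperative_groups.h>.
//
//   __global__ void stage(float const * gmem, std::size_t n) {
//       extern __shared__ float smem[];
//       __shared__ cuda::barrier<cuda::thread_scope_block> bar;
//       if (threadIdx.x == 0) init(&bar, blockDim.x);
//       __syncthreads();
//
//       auto block = cooperative_groups::this_thread_block();
//       cuda::memcpy_async(block, smem, gmem, sizeof(float) * n, bar);
//       bar.arrive_and_wait();   // smem holds the data once the wait returns
//   }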

_LIBCUDACXX_END_NAMESPACE_CUDA

#include "detail/__pragma_pop"

#endif // _CUDA_BARRIER