//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif
#ifndef _CUDA_BARRIER
#define _CUDA_BARRIER
#include "atomic"
#include "cstddef"
#include "detail/__config"
#include "detail/__pragma_push"
#include "detail/libcxx/include/barrier"
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
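// aligned_size_t wraps a byte count together with a compile-time alignment
// guarantee; passing it as the size argument of memcpy_async lets the copy
// dispatch on _Alignment instead of the element type's natural alignment.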
template<std::size_t _Alignment>
struct aligned_size_t {
static constexpr std::size_t align = _Alignment;
std::size_t value;
__host__ __device__
explicit aligned_size_t(size_t __s) : value(__s) { }
__host__ __device__
operator size_t() const { return value; }
};
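// Primary template: a scoped barrier layered on std::__barrier_base, with
// init() friends for placement-initialization of barriers that live in
// __shared__ or other storage that is not constructed normally.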
template<thread_scope _Sco, class _CompletionF = std::__empty_completion>
class barrier : public std::__barrier_base<_CompletionF, _Sco> {
template<thread_scope>
friend class pipeline;
using std::__barrier_base<_CompletionF, _Sco>::__try_wait;
public:
barrier() = default;
barrier(const barrier &) = delete;
barrier & operator=(const barrier &) = delete;
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
: std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) {
}
_LIBCUDACXX_INLINE_VISIBILITY
friend void init(barrier * __b, std::ptrdiff_t __expected) {
new (__b) barrier(__expected);
}
_LIBCUDACXX_INLINE_VISIBILITY
friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) {
new (__b) barrier(__expected, __completion);
}
};
struct __block_scope_barrier_base {};
_LIBCUDACXX_END_NAMESPACE_CUDA
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b);
_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
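// Block-scope specialization with an empty completion function. When the object
// lives in shared memory on sm_80 or newer it is driven by mbarrier.* PTX
// instructions; otherwise it falls back to the software __barrier_base.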
template<>
class barrier<thread_scope_block, std::__empty_completion> : public __block_scope_barrier_base {
using __barrier_base = std::__barrier_base<std::__empty_completion, (int)thread_scope_block>;
__barrier_base __barrier;
__device__
friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier<thread_scope_block> & b);
public:
using arrival_token = typename __barrier_base::arrival_token;
private:
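// Predicate handed to __libcpp_thread_poll_with_backoff: returns true once the
// captured phase token has completed.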
struct __poll_tester {
barrier const* __this;
arrival_token __phase;
_LIBCUDACXX_INLINE_VISIBILITY
__poll_tester(barrier const* __this_, arrival_token&& __phase_)
: __this(__this_)
, __phase(_CUDA_VSTD::move(__phase_))
{}
inline _LIBCUDACXX_INLINE_VISIBILITY
bool operator()() const
{
return __this->__try_wait(__phase);
}
};
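// Non-blocking phase test: mbarrier.test_wait for a shared-memory barrier on
// sm_80 or newer, otherwise defer to the software barrier.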
_LIBCUDACXX_INLINE_VISIBILITY
bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
int __ready = 0;
asm volatile ("{\n\t"
".reg .pred p;\n\t"
"mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t"
"selp.b32 %0, 1, 0, p;\n\t"
"}"
: "=r"(__ready)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__phase)
: "memory");
return __ready;
}
else
#endif
{
return __barrier.__try_wait(std::move(__phase));
}
}
template<thread_scope>
friend class pipeline;
public:
barrier() = default;
barrier(const barrier &) = delete;
barrier & operator=(const barrier &) = delete;
_LIBCUDACXX_INLINE_VISIBILITY
barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier), "fatal error: bad barrier layout");
init(this, __expected, __completion);
}
_LIBCUDACXX_INLINE_VISIBILITY
~barrier() {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
asm volatile ("mbarrier.inval.shared.b64 [%0];"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
}
#endif
}
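// Placement-initialize the barrier: mbarrier.init for a shared-memory barrier
// on sm_80 or newer, otherwise construct the software barrier in place.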
_LIBCUDACXX_INLINE_VISIBILITY
friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
#if __CUDA_ARCH__ >= 800
if (__isShared(&__b->__barrier)) {
asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))),
"r"(static_cast<std::uint32_t>(__expected))
: "memory");
}
else
#endif
{
new (&__b->__barrier) __barrier_base(__expected);
}
}
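// arrive(): on sm_80+ shared-memory barriers this maps to mbarrier.arrive
// (preceded by mbarrier.arrive.noComplete when the update is greater than one).
// On sm_70, threads of a warp arriving at the same barrier with the same update
// are grouped via __match_any_sync, a leader performs one aggregated arrival,
// and the resulting token is broadcast to the group with __shfl_sync.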
_LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
arrival_token arrive(std::ptrdiff_t __update = 1)
{
#if __CUDA_ARCH__
if (__isShared(&__barrier)) {
arrival_token __token;
#if __CUDA_ARCH__ >= 800
if (__update > 1) {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<std::uint32_t>(__update - 1))
: "memory");
}
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(__token)
: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
#else
unsigned int __activeA = __match_any_sync(__activemask(), __update);
unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast<std::uintptr_t>(&__barrier));
unsigned int __active = __activeA & __activeB;
int __inc = __popc(__active) * __update;
unsigned __laneid;
asm volatile ("mov.u32 %0, %laneid;" : "=r"(__laneid));
int __leader = __ffs(__active) - 1;
if(__leader == __laneid)
{
__token = __barrier.arrive(__inc);
}
__token = __shfl_sync(__active, __token, __leader);
#endif
return __token;
}
else
#endif
{
return __barrier.arrive(__update);
}
}
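// wait() polls the phase token with backoff until the barrier completes that phase.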
_LIBCUDACXX_INLINE_VISIBILITY
void wait(arrival_token && __phase) const
{
_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
}
inline _LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_wait()
{
wait(arrive());
}
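// arrive_and_drop(): mbarrier.arrive_drop on sm_80+ shared-memory barriers,
// otherwise the software barrier's arrive_and_drop.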
_LIBCUDACXX_INLINE_VISIBILITY
void arrive_and_drop()
{
#if __CUDA_ARCH__ >= 800
if (__isShared(&__barrier)) {
asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
}
else
#endif
{
__barrier.arrive_and_drop();
}
}
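// Maximum number of expected arrivals: 2^20 - 1, matching the arrival-count
// limit of the hardware mbarrier object.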
_LIBCUDACXX_INLINE_VISIBILITY
static constexpr ptrdiff_t max() noexcept
{
return (1 << 20) - 1;
}
};
_LIBCUDACXX_END_NAMESPACE_CUDA
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
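// Expose the underlying 64-bit mbarrier word of a block-scoped barrier so it
// can be used directly with inline PTX (device code only).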
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b) {
return reinterpret_cast<std::uint64_t *>(&b.__barrier);
}
_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
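// Thread-scope barrier: privately reuses the block-scope implementation and
// re-exports its interface.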
template<>
class barrier<thread_scope_thread, std::__empty_completion> : private barrier<thread_scope_block> {
using __base = barrier<thread_scope_block>;
public:
using __base::__base;
_LIBCUDACXX_INLINE_VISIBILITY
friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
init(static_cast<__base *>(__b), __expected, __completion);
}
using __base::arrive;
using __base::wait;
using __base::arrive_and_wait;
using __base::arrive_and_drop;
using __base::max;
};
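// Cooperative fallback copy: each participating rank copies _Alignment-sized
// chunks starting at its rank and advancing by the group stride; a single rank
// (stride == 1) degenerates to one plain memcpy.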
template<std::size_t _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) {
if (__stride == 1) {
memcpy(__destination, __source, __total_size);
}
else {
for (std::size_t __offset = __rank * _Alignment; __offset < __total_size; __offset += __stride * _Alignment) {
memcpy(__destination + __offset, __source + __offset, _Alignment);
}
}
}
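// sm_80+ copy engines: the specializations for 4-, 8- and 16-byte alignment
// issue cp.async.{ca,cg}.shared.global instructions and report the copy as
// asynchronous; alignments above 16 reuse the 16-byte path, and anything else
// falls back to the synchronous strided copy.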
#if __CUDA_ARCH__ >= 800
template<std::size_t _Alignment, bool _Large = (_Alignment > 16)>
struct __memcpy_async_impl {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
__strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride);
return false;
}
};
template<>
struct __memcpy_async_impl<4, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
}
return true;
}
};
template<>
struct __memcpy_async_impl<8, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
}
return true;
}
};
template<>
struct __memcpy_async_impl<16, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
}
return true;
}
};
template<std::size_t _Alignment>
struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { };
#endif
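// Tie an asynchronous copy to its synchronization object: for a shared-memory
// block-scope barrier the outstanding cp.async operations are folded into the
// mbarrier via cp.async.mbarrier.arrive; otherwise cp.async.wait_all blocks
// until they complete.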
template<thread_scope _Sco, typename _CompF, bool _Is_mbarrier = (_Sco >= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) {
#if __CUDA_ARCH__ >= 800
if (__is_async) {
if (_Is_mbarrier && __isShared(&__barrier)) {
asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
}
else {
asm volatile ("cp.async.wait_all;"
::: "memory");
}
}
#endif
}
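// Dispatch helper: the cp.async path is taken only for shared <- global copies
// on sm_80 or newer. When the caller's alignment guarantee is below 4 bytes,
// the common run-time alignment of source, destination and size is derived
// from their lowest set bit (__ffs) and used to select the widest usable copy.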
template<std::size_t _Native_alignment, typename _Group, typename _Sync>
_LIBCUDACXX_INLINE_VISIBILITY
void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) {
bool __is_async = false;
#if __CUDA_ARCH__ >= 800
__is_async = __isShared(__destination) && __isGlobal(__source);
if (__is_async) {
if (_Native_alignment < 4) {
auto __source_address = reinterpret_cast<std::uintptr_t>(__source);
auto __destination_address = reinterpret_cast<std::uintptr_t>(__destination);
// Lowest bit set will tell us what the common alignment of the three values is.
auto _Alignment = __ffs(__source_address | __destination_address | __size);
switch (_Alignment) {
default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
case 4: __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
case 3: __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
case 2: // fallthrough
case 1: __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
}
}
else {
__is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size());
}
}
else
#endif
{
__strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size());
}
__memcpy_async_synchronize(__sync, __is_async);
}
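// Trivial group used by the memcpy_async overloads that take no group: one
// rank, no synchronization.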
struct __single_thread_group {
_LIBCUDACXX_INLINE_VISIBILITY
void sync() const {}
_LIBCUDACXX_INLINE_VISIBILITY
constexpr std::size_t size() const { return 1; };
_LIBCUDACXX_INLINE_VISIBILITY
constexpr std::size_t thread_rank() const { return 0; };
};
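// The overloads below are the public entry points. The following is an
// illustrative sketch (not part of this header) of the intended usage pattern:
// a block-scoped barrier in shared memory synchronizing a cooperative
// global-to-shared copy. The kernel and buffer names are hypothetical.
//
//   #include <cuda/barrier>
//   #include <cooperative_groups.h>
//
//   __global__ void consume_tile(int const* global_in) {
//       __shared__ int tile[256];
//       __shared__ cuda::barrier<cuda::thread_scope_block> bar;
//       auto block = cooperative_groups::this_thread_block();
//       if (block.thread_rank() == 0) {
//           init(&bar, block.size());   // placement-initialize the shared barrier
//       }
//       block.sync();                   // make the initialized barrier visible
//       cuda::memcpy_async(block, tile, global_in, sizeof(tile), bar);
//       bar.arrive_and_wait();          // the copy is complete past this point
//       // ... use tile[] ...
//   }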
template<typename _Group, class _Tp, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
// When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
// incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
// memcpy_async when compiling with GCC 4.8.
// FIXME: remove the #if once GCC 4.8 is no longer supported.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif
__memcpy_async<alignof(_Tp)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
template<typename _Group, class _Tp, std::size_t _Alignment, thread_scope _Sco, typename _CompF, std::size_t _Larger_alignment = (alignof(_Tp) > _Alignment) ? alignof(_Tp) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
// When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
// incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
// memcpy_async when compiling with GCC 4.8.
// FIXME: remove the #if once GCC 4.8 is no longer supported.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif
__memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
template<class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
template<typename _Group, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
__memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
template<typename _Group, std::size_t _Alignment, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
__memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
template<typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
_LIBCUDACXX_END_NAMESPACE_CUDA
#include "detail/__pragma_pop"
#endif //_CUDA_BARRIER