camenduru
/

thrust

Model card Files Files and versions Community

thrust / install /include /cub /util_device.cuh

camenduru

thanks to nvidia ❤

0dc1b04 almost 2 years ago

raw

history blame contribute delete

23 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* Properties of a given CUDA device and the corresponding PTX bundle
	*/

	#pragma once

	#include <cuda/std/utility>

	#include <cub/detail/device_synchronize.cuh>
	#include <cub/util_arch.cuh>
	#include <cub/util_cpp_dialect.cuh>
	#include <cub/util_debug.cuh>
	#include <cub/util_macro.cuh>
	#include <cub/util_namespace.cuh>
	#include <cub/util_type.cuh>

	#include <nv/target>

	#include <atomic>
	#include <array>
	#include <cassert>

	CUB_NAMESPACE_BEGIN


	/**
	* \addtogroup UtilMgmt
	* @{
	*/

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document


	/**
	* \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
	*/
	template <int ALLOCATIONS>
	__host__ __device__ __forceinline__
	cudaError_t AliasTemporaries(
	void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation
	void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed
	size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed
	{
	const int ALIGN_BYTES = 256;
	const int ALIGN_MASK = ~(ALIGN_BYTES - 1);

	// Compute exclusive prefix sum over allocation requests
	size_t allocation_offsets[ALLOCATIONS];
	size_t bytes_needed = 0;
	for (int i = 0; i < ALLOCATIONS; ++i)
	{
	size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
	allocation_offsets[i] = bytes_needed;
	bytes_needed += allocation_bytes;
	}
	bytes_needed += ALIGN_BYTES - 1;

	// Check if the caller is simply requesting the size of the storage allocation
	if (!d_temp_storage)
	{
	temp_storage_bytes = bytes_needed;
	return cudaSuccess;
	}

	// Check if enough storage provided
	if (temp_storage_bytes < bytes_needed)
	{
	return CubDebug(cudaErrorInvalidValue);
	}

	// Alias
	d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
	for (int i = 0; i < ALLOCATIONS; ++i)
	{
	allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
	}

	return cudaSuccess;
	}


	/**
	* \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
	*/
	template <typename T>
	__global__ void EmptyKernel(void) { }

	#endif // DOXYGEN_SHOULD_SKIP_THIS

	/**
	* \brief Returns the current device or -1 if an error occurred.
	*/
	CUB_RUNTIME_FUNCTION inline int CurrentDevice()
	{
	int device = -1;
	if (CubDebug(cudaGetDevice(&device))) return -1;
	return device;
	}

	/**
	* \brief RAII helper which saves the current device and switches to the
	* specified device on construction and switches to the saved device on
	* destruction.
	*/
	struct SwitchDevice
	{
	private:
	int const old_device;
	bool const needs_reset;
	public:
	__host__ inline SwitchDevice(int new_device)
	: old_device(CurrentDevice()), needs_reset(old_device != new_device)
	{
	if (needs_reset)
	CubDebug(cudaSetDevice(new_device));
	}

	__host__ inline ~SwitchDevice()
	{
	if (needs_reset)
	CubDebug(cudaSetDevice(old_device));
	}
	};

	/**
	* \brief Returns the number of CUDA devices available or -1 if an error
	* occurred.
	*/
	CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
	{
	int count = -1;
	if (CubDebug(cudaGetDeviceCount(&count)))
	// CUDA makes no guarantees about the state of the output parameter if
	// `cudaGetDeviceCount` fails; in practice, they don't, but out of
	// paranoia we'll reset `count` to `-1`.
	count = -1;
	return count;
	}

	/**
	* \brief Cache for an arbitrary value produced by a nullary function.
	*/
	template <typename T, T(*Function)()>
	struct ValueCache
	{
	T const value;

	/**
	* \brief Call the nullary function to produce the value and construct the
	* cache.
	*/
	__host__ inline ValueCache() : value(Function()) {}
	};

	// Host code, only safely usable in C++11 or newer, where thread-safe
	// initialization of static locals is guaranteed. This is a separate function
	// to avoid defining a local static in a host/device function.
	__host__ inline int DeviceCountCachedValue()
	{
	static ValueCache<int, DeviceCountUncached> cache;
	return cache.value;
	}

	/**
	* \brief Returns the number of CUDA devices available.
	*
	* \note This function may cache the result internally.
	*
	* \note This function is thread safe.
	*/
	CUB_RUNTIME_FUNCTION inline int DeviceCount()
	{
	int result = -1;

	NV_IF_TARGET(NV_IS_HOST,
	(result = DeviceCountCachedValue();),
	(result = DeviceCountUncached();));

	return result;
	}

	/**
	* \brief Per-device cache for a CUDA attribute value; the attribute is queried
	* and stored for each device upon construction.
	*/
	struct PerDeviceAttributeCache
	{
	struct DevicePayload
	{
	int attribute;
	cudaError_t error;
	};

	// Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
	// `DeviceEntryInitializing` state, and then proceeds to the
	// `DeviceEntryReady` state. These are the only state transitions allowed;
	// e.g. a linear sequence of transitions.
	enum DeviceEntryStatus
	{
	DeviceEntryEmpty = 0,
	DeviceEntryInitializing,
	DeviceEntryReady
	};

	struct DeviceEntry
	{
	std::atomic<DeviceEntryStatus> flag;
	DevicePayload payload;
	};

	private:
	std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;

	public:
	/**
	* \brief Construct the cache.
	*/
	__host__ inline PerDeviceAttributeCache() : entries_()
	{
	assert(DeviceCount() <= CUB_MAX_DEVICES);
	}

	/**
	* \brief Retrieves the payload of the cached function \p f for \p device.
	*
	* \note You must pass a morally equivalent function in to every call or
	* this function has undefined behavior.
	*/
	template <typename Invocable>
	__host__ DevicePayload operator()(Invocable&& f, int device)
	{
	if (device >= DeviceCount())
	return DevicePayload{0, cudaErrorInvalidDevice};

	auto& entry = entries_[device];
	auto& flag = entry.flag;
	auto& payload = entry.payload;

	DeviceEntryStatus old_status = DeviceEntryEmpty;

	// First, check for the common case of the entry being ready.
	if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
	{
	// Assume the entry is empty and attempt to lock it so we can fill
	// it by trying to set the state from `DeviceEntryReady` to
	// `DeviceEntryInitializing`.
	if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
	std::memory_order_acq_rel,
	std::memory_order_acquire))
	{
	// We successfully set the state to `DeviceEntryInitializing`;
	// we have the lock and it's our job to initialize this entry
	// and then release it.

	// We don't use `CubDebug` here because we let the user code
	// decide whether or not errors are hard errors.
	payload.error = ::cuda::std::forward<Invocable>(f)(payload.attribute);
	if (payload.error)
	// Clear the global CUDA error state which may have been
	// set by the last call. Otherwise, errors may "leak" to
	// unrelated kernel launches.
	cudaGetLastError();

	// Release the lock by setting the state to `DeviceEntryReady`.
	flag.store(DeviceEntryReady, std::memory_order_release);
	}

	// If the `compare_exchange_weak` failed, then `old_status` has
	// been updated with the value of `flag` that it observed.

	else if (old_status == DeviceEntryInitializing)
	{
	// Another execution agent is initializing this entry; we need
	// to wait for them to finish; we'll know they're done when we
	// observe the entry status as `DeviceEntryReady`.
	do { old_status = flag.load(std::memory_order_acquire); }
	while (old_status != DeviceEntryReady);
	// FIXME: Use `atomic::wait` instead when we have access to
	// host-side C++20 atomics. We could use libcu++, but it only
	// supports atomics for SM60 and up, even if you're only using
	// them in host code.
	}
	}

	// We now know that the state of our entry is `DeviceEntryReady`, so
	// just return the entry's payload.
	return entry.payload;
	}
	};

	/**
	* \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
	{
	// Instantiate `EmptyKernel<void>` in both host and device code to ensure
	// it can be called.
	typedef void (*EmptyKernelPtr)();
	EmptyKernelPtr empty_kernel = EmptyKernel<void>;

	// This is necessary for unused variable warnings in host compilers. The
	// usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
	(void)reinterpret_cast<void*>(empty_kernel);

	// Define a temporary macro that expands to the current target ptx version
	// in device code.
	// <nv/target> may provide an abstraction for this eventually. For now,
	// we have to keep this usage of __CUDA_ARCH__.
	#if defined(_NVHPC_CUDA)
	#define CUB_TEMP_GET_PTX __builtin_current_device_sm()
	#else
	#define CUB_TEMP_GET_PTX __CUDA_ARCH__
	#endif

	cudaError_t result = cudaSuccess;
	NV_IF_TARGET(
	NV_IS_HOST,
	(
	cudaFuncAttributes empty_kernel_attrs;

	result = cudaFuncGetAttributes(&empty_kernel_attrs,
	reinterpret_cast<void*>(empty_kernel));
	CubDebug(result);

	ptx_version = empty_kernel_attrs.ptxVersion * 10;
	),
	// NV_IS_DEVICE
	(
	// This is necessary to ensure instantiation of EmptyKernel in device
	// code. The `reinterpret_cast` is necessary to suppress a
	// set-but-unused warnings. This is a meme now:
	// https://twitter.com/blelbach/status/1222391615576100864
	(void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);

	ptx_version = CUB_TEMP_GET_PTX;
	));

	#undef CUB_TEMP_GET_PTX

	return result;
	}

	/**
	* \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
	*/
	__host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device)
	{
	SwitchDevice sd(device);
	(void)sd;
	return PtxVersionUncached(ptx_version);
	}

	template <typename Tag>
	__host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache()
	{
	// C++11 guarantees that initialization of static locals is thread safe.
	static PerDeviceAttributeCache cache;
	return cache;
	}

	struct PtxVersionCacheTag {};
	struct SmVersionCacheTag {};

	/**
	* \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
	*
	* \note This function may cache the result internally.
	*
	* \note This function is thread safe.
	*/
	__host__ inline cudaError_t PtxVersion(int& ptx_version, int device)
	{
	auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
	// If this call fails, then we get the error code back in the payload,
	// which we check with `CubDebug` below.
	[=] (int& pv) { return PtxVersionUncached(pv, device); },
	device);

	if (!CubDebug(payload.error))
	ptx_version = payload.attribute;

	return payload.error;
	}

	/**
	* \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
	*
	* \note This function may cache the result internally.
	*
	* \note This function is thread safe.
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version)
	{
	cudaError_t result = cudaErrorUnknown;
	NV_IF_TARGET(
	NV_IS_HOST,
	(
	auto const device = CurrentDevice();
	auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
	// If this call fails, then we get the error code back in the payload,
	// which we check with `CubDebug` below.
	[=](int &pv) { return PtxVersionUncached(pv, device); },
	device);

	if (!CubDebug(payload.error))
	{
	ptx_version = payload.attribute;
	}

	result = payload.error;
	),
	( // NV_IS_DEVICE:
	result = PtxVersionUncached(ptx_version);
	));

	return result;
	}

	/**
	* \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
	{
	cudaError_t error = cudaSuccess;
	do
	{
	int major = 0, minor = 0;
	if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
	if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
	sm_version = major * 100 + minor * 10;
	}
	while (0);

	return error;
	}

	/**
	* \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
	*
	* \note This function may cache the result internally.
	*
	* \note This function is thread safe.
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version,
	int device = CurrentDevice())
	{
	cudaError_t result = cudaErrorUnknown;

	NV_IF_TARGET(
	NV_IS_HOST,
	(
	auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
	// If this call fails, then we get the error code back in
	// the payload, which we check with `CubDebug` below.
	[=](int &pv) { return SmVersionUncached(pv, device); },
	device);

	if (!CubDebug(payload.error))
	{
	sm_version = payload.attribute;
	};

	result = payload.error;
	),
	( // NV_IS_DEVICE
	result = SmVersionUncached(sm_version, device);
	));

	return result;
	}

	/**
	* Synchronize the specified \p stream.
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
	{
	cudaError_t result = cudaErrorNotSupported;

	NV_IF_TARGET(NV_IS_HOST,
	(result = CubDebug(cudaStreamSynchronize(stream));),
	((void)stream;
	result = CubDebug(cub::detail::device_synchronize());));

	return result;
	}

	namespace detail
	{

	/**
	* Same as SyncStream, but intended for use with the debug_synchronous flags
	* in device algorithms. This should not be used if synchronization is required
	* for correctness.
	*
	* If `debug_synchronous` is false, this function will immediately return
	* cudaSuccess. If true, one of the following will occur:
	*
	* If synchronization is supported by the current compilation target and
	* settings, the sync is performed and the sync result is returned.
	*
	* If syncs are not supported then no sync is performed, but a message is logged
	* via _CubLog and cudaSuccess is returned.
	*/
	CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
	{
	#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC

	(void)stream;
	return cudaSuccess;

	#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:

	#define CUB_TMP_SYNC_AVAILABLE \
	_CubLog("%s\n", "Synchronizing..."); \
	return SyncStream(stream)

	#define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \
	(void)stream; \
	_CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
	"device-side sync requires <sm_90, RDC, and CDPv1"); \
	return cudaSuccess

	#ifdef CUB_DETAIL_CDPv1

	// Can sync everywhere but SM_90+
	NV_IF_TARGET(NV_PROVIDES_SM_90,
	(CUB_TMP_DEVICE_SYNC_UNAVAILABLE;),
	(CUB_TMP_SYNC_AVAILABLE;));

	#else // CDPv2 or no CDP:

	// Can only sync on host
	NV_IF_TARGET(NV_IS_HOST,
	(CUB_TMP_SYNC_AVAILABLE;),
	(CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));

	#endif // CDP version

	#undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
	#undef CUB_TMP_SYNC_AVAILABLE

	#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
	}

	/** \brief Gets whether the current device supports unified addressing */
	CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
	{
	has_uva = false;
	cudaError_t error = cudaSuccess;
	int device = -1;
	if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error;
	int uva = 0;
	if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device))
	!= cudaSuccess)
	{
	return error;
	}
	has_uva = uva == 1;
	return error;
	}

	} // namespace detail

	/**
	* \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
	*
	* \par Snippet
	* The code snippet below illustrates the use of the MaxSmOccupancy function.
	* \par
	* \code
	* #include <cub/cub.cuh> // or equivalently <cub/util_device.cuh>
	*
	* template <typename T>
	* __global__ void ExampleKernel()
	* {
	* // Allocate shared memory for BlockScan
	* __shared__ volatile T buffer[4096];
	*
	* ...
	* }
	*
	* ...
	*
	* // Determine SM occupancy for ExampleKernel specialized for unsigned char
	* int max_sm_occupancy;
	* MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
	*
	* // max_sm_occupancy <-- 4 on SM10
	* // max_sm_occupancy <-- 8 on SM20
	* // max_sm_occupancy <-- 12 on SM35
	*
	* \endcode
	*
	*/
	template <typename KernelPtr>
	CUB_RUNTIME_FUNCTION inline
	cudaError_t MaxSmOccupancy(
	int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM
	KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy
	int block_threads, ///< [in] Number of threads per thread block
	int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0.
	{
	return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
	&max_sm_occupancy,
	kernel_ptr,
	block_threads,
	dynamic_smem_bytes));
	}


	/******************************************************************************
	* Policy management
	******************************************************************************/

	/**
	* Kernel dispatch configuration
	*/
	struct KernelConfig
	{
	int block_threads;
	int items_per_thread;
	int tile_size;
	int sm_occupancy;

	CUB_RUNTIME_FUNCTION __forceinline__
	KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

	template <typename AgentPolicyT, typename KernelPtrT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Init(KernelPtrT kernel_ptr)
	{
	block_threads = AgentPolicyT::BLOCK_THREADS;
	items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
	tile_size = block_threads * items_per_thread;
	cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
	return retval;
	}
	};



	/// Helper for dispatching into a policy chain
	template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
	struct ChainedPolicy
	{
	/// The policy for the active compiler pass
	using ActivePolicy =
	cub::detail::conditional_t<(CUB_PTX_ARCH < PTX_VERSION),
	typename PrevPolicyT::ActivePolicy,
	PolicyT>;

	/// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
	template <typename FunctorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t Invoke(int ptx_version, FunctorT& op)
	{
	if (ptx_version < PTX_VERSION) {
	return PrevPolicyT::Invoke(ptx_version, op);
	}
	return op.template Invoke<PolicyT>();
	}
	};

	/// Helper for dispatching into a policy chain (end-of-chain specialization)
	template <int PTX_VERSION, typename PolicyT>
	struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
	{
	/// The policy for the active compiler pass
	typedef PolicyT ActivePolicy;

	/// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
	template <typename FunctorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t Invoke(int /ptx_version/, FunctorT& op) {
	return op.template Invoke<PolicyT>();
	}
	};




	/** @} */ // end group UtilMgmt

	CUB_NAMESPACE_END