|
/****************************************************************************** |
|
* Copyright (c) 2011, Duane Merrill. All rights reserved. |
|
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of the NVIDIA CORPORATION nor the |
|
* names of its contributors may be used to endorse or promote products |
|
* derived from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
******************************************************************************/ |
|
|
|
/** |
|
* \file |
|
 * Callback operator types for supplying BlockScan prefixes, and tile-status
 * interfaces for single-pass (decoupled look-back) device-wide scans
|
*/ |
|
|
|
#pragma once |
|
|
|
#include <iterator> |
|
|
|
#include <cub/config.cuh> |
|
#include <cub/detail/strong_load.cuh> |
|
#include <cub/detail/strong_store.cuh> |
|
#include <cub/detail/uninitialized_copy.cuh> |
|
#include <cub/thread/thread_load.cuh> |
|
#include <cub/thread/thread_store.cuh> |
|
#include <cub/util_device.cuh> |
|
#include <cub/warp/warp_reduce.cuh> |
|
|
|
#include <nv/target> |
|
|
|
CUB_NAMESPACE_BEGIN |
|
|
|
|
|
/****************************************************************************** |
|
* Prefix functor type for maintaining a running prefix while scanning a |
|
* region independent of other thread blocks |
|
******************************************************************************/ |
|
|
|
/** |
|
* Stateful callback operator type for supplying BlockScan prefixes. |
|
* Maintains a running prefix that can be applied to consecutive |
|
* BlockScan operations. |
|
*/ |
|
template < |
|
typename T, ///< BlockScan value type |
|
typename ScanOpT> ///< Wrapped scan operator type |
|
struct BlockScanRunningPrefixOp |
|
{ |
|
ScanOpT op; ///< Wrapped scan operator |
|
T running_total; ///< Running block-wide prefix |
|
|
|
    /// Constructor (the running prefix is left uninitialized)
|
__device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) |
|
: |
|
op(op) |
|
{} |
|
|
|
    /// Constructor with a starting prefix
|
__device__ __forceinline__ BlockScanRunningPrefixOp( |
|
T starting_prefix, |
|
ScanOpT op) |
|
: |
|
op(op), |
|
running_total(starting_prefix) |
|
{} |
|
|
|
/** |
|
* Prefix callback operator. Returns the block-wide running_total in thread-0. |
|
*/ |
|
__device__ __forceinline__ T operator()( |
|
const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs |
|
{ |
|
T retval = running_total; |
|
running_total = op(running_total, block_aggregate); |
|
return retval; |
|
} |
|
}; |
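
// A minimal usage sketch (illustrative only; ExampleKernel, the 128-thread
// block size, int data, and cub::Sum are assumptions, not part of this
// header).  Each iteration scans one tile and the functor carries the running
// total into the next tile:
//
//     __global__ void ExampleKernel(int *d_data, int num_tiles)
//     {
//         using BlockScanT = cub::BlockScan<int, 128>;
//         __shared__ typename BlockScanT::TempStorage temp_storage;
//
//         // Start the running prefix at zero
//         cub::BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());
//
//         for (int tile = 0; tile < num_tiles; ++tile)
//         {
//             int item = d_data[tile * 128 + threadIdx.x];
//
//             // The callback supplies the prefix and folds in this tile's aggregate
//             BlockScanT(temp_storage).ExclusiveScan(item, item, cub::Sum(), prefix_op);
//             __syncthreads();
//
//             d_data[tile * 128 + threadIdx.x] = item;
//         }
//     }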
|
|
|
|
|
/****************************************************************************** |
|
* Generic tile status interface types for block-cooperative scans |
|
******************************************************************************/ |
|
|
|
/** |
|
* Enumerations of tile status |
|
*/ |
|
enum ScanTileStatus |
|
{ |
|
SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) |
|
SCAN_TILE_INVALID = 99, // Not yet processed |
|
SCAN_TILE_PARTIAL, // Tile aggregate is available |
|
SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available |
|
}; |
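
// Lifecycle of a tile descriptor: InitializeStatus() marks every in-range
// descriptor SCAN_TILE_INVALID; the owning thread block later upgrades it to
// SCAN_TILE_PARTIAL via SetPartial() and finally to SCAN_TILE_INCLUSIVE via
// SetInclusive().  The TILE_STATUS_PADDING descriptors preceding tile 0 are
// marked SCAN_TILE_OOB so that look-back windows reaching past the first tile
// read a valid (non-invalid) status instead of requiring a bounds check.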
|
|
|
namespace detail |
|
{ |
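
// Helpers for throttling the polling loops below.  On SM70+, delay() and
// always_delay() map to __nanosleep() (delay() issues a block-scoped fence
// instead when the grid is small and contention is low); on older
// architectures they compile to nothing, while the *_or_prevent_hoisting
// variants fall back to __threadfence_block(), which also keeps the polled
// load from being hoisted out of the spin loop.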
|
|
|
template <int Delay, unsigned int GridThreshold = 500> |
|
__device__ __forceinline__ void delay() |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(if (Delay > 0) |
|
{ |
|
if (gridDim.x < GridThreshold) |
|
{ |
|
__threadfence_block(); |
|
} |
|
else |
|
{ |
|
__nanosleep(Delay); |
|
} |
|
})); |
|
} |
|
|
|
template <unsigned int GridThreshold = 500> |
|
__device__ __forceinline__ void delay(int ns) |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(if (ns > 0) |
|
{ |
|
if (gridDim.x < GridThreshold) |
|
{ |
|
__threadfence_block(); |
|
} |
|
else |
|
{ |
|
__nanosleep(ns); |
|
} |
|
})); |
|
} |
|
|
|
template <int Delay> |
|
__device__ __forceinline__ void always_delay() |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(Delay);)); |
|
} |
|
|
|
__device__ __forceinline__ void always_delay(int ns) |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(ns);), ((void)ns;)); |
|
} |
|
|
|
template <unsigned int Delay = 350, unsigned int GridThreshold = 500> |
|
__device__ __forceinline__ void delay_or_prevent_hoisting() |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(delay<Delay, GridThreshold>();), |
|
(__threadfence_block();)); |
|
} |
|
|
|
template <unsigned int GridThreshold = 500> |
|
__device__ __forceinline__ void delay_or_prevent_hoisting(int ns) |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(delay<GridThreshold>(ns);), |
|
((void)ns; __threadfence_block();)); |
|
} |
|
|
|
template <unsigned int Delay = 350> |
|
__device__ __forceinline__ void always_delay_or_prevent_hoisting() |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(always_delay(Delay);), |
|
(__threadfence_block();)); |
|
} |
|
|
|
__device__ __forceinline__ void always_delay_or_prevent_hoisting(int ns) |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(always_delay(ns);), |
|
((void)ns; __threadfence_block();)); |
|
} |
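
// Each *_constructor_t below follows the same protocol: it is constructed once
// per look-back (the constructor itself performs an initial wait tuned to L2
// write latency, seeded where jitter is used), and its operator() yields a
// delay_t that is invoked once per polling iteration.  A minimal consumer
// sketch (illustrative only; ConstructorT, spin_until_ready, and the polled
// flag are assumptions, not part of this header):
//
//     template <class ConstructorT>
//     __device__ void spin_until_ready(volatile unsigned int *flag, unsigned int seed)
//     {
//         ConstructorT construct_delay(seed);  // initial wait
//         auto delay = construct_delay();      // per-iteration delay functor
//
//         while (*flag == 0)
//         {
//             delay();                         // back off (or fence) before re-polling
//         }
//     }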
|
|
|
template <unsigned int L2WriteLatency> |
|
struct no_delay_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
__device__ __forceinline__ void operator()() |
|
{ |
|
NV_IF_TARGET(NV_PROVIDES_SM_70, |
|
(), |
|
(__threadfence_block();)); |
|
} |
|
}; |
|
|
|
__device__ __forceinline__ no_delay_constructor_t(unsigned int /* seed */) |
|
{ |
|
delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {}; } |
|
}; |
|
|
|
template <unsigned int Delay, unsigned int L2WriteLatency, unsigned int GridThreshold = 500> |
|
struct reduce_by_key_delay_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
__device__ __forceinline__ void operator()() |
|
{ |
|
NV_DISPATCH_TARGET( |
|
NV_IS_EXACTLY_SM_80, (delay<Delay, GridThreshold>();), |
|
        NV_PROVIDES_SM_70, (delay<0, GridThreshold>();),
|
NV_IS_DEVICE, (__threadfence_block();)); |
|
} |
|
}; |
|
|
|
__device__ __forceinline__ reduce_by_key_delay_constructor_t(unsigned int /* seed */) |
|
{ |
|
delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {}; } |
|
}; |
|
|
|
template <unsigned int Delay, unsigned int L2WriteLatency> |
|
struct fixed_delay_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
__device__ __forceinline__ void operator()() { delay_or_prevent_hoisting<Delay>(); } |
|
}; |
|
|
|
__device__ __forceinline__ fixed_delay_constructor_t(unsigned int /* seed */) |
|
{ |
|
delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {}; } |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backoff_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
int delay; |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
always_delay_or_prevent_hoisting(delay); |
|
delay <<= 1; |
|
} |
|
}; |
|
|
|
__device__ __forceinline__ exponential_backoff_constructor_t(unsigned int /* seed */) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {InitialDelay}; } |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backoff_jitter_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
static constexpr unsigned int a = 16807; |
|
static constexpr unsigned int c = 0; |
|
static constexpr unsigned int m = 1u << 31; |
|
|
|
unsigned int max_delay; |
|
unsigned int &seed; |
|
|
|
__device__ __forceinline__ unsigned int next(unsigned int min, unsigned int max) |
|
{ |
|
return (seed = (a * seed + c) % m) % (max + 1 - min) + min; |
|
} |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
always_delay_or_prevent_hoisting(next(0, max_delay)); |
|
max_delay <<= 1; |
|
} |
|
}; |
|
|
|
unsigned int seed; |
|
|
|
__device__ __forceinline__ exponential_backoff_jitter_constructor_t(unsigned int seed) |
|
: seed(seed) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {InitialDelay, seed}; } |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backoff_jitter_window_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
static constexpr unsigned int a = 16807; |
|
static constexpr unsigned int c = 0; |
|
static constexpr unsigned int m = 1u << 31; |
|
|
|
unsigned int max_delay; |
|
unsigned int &seed; |
|
|
|
__device__ __forceinline__ unsigned int next(unsigned int min, unsigned int max) |
|
{ |
|
return (seed = (a * seed + c) % m) % (max + 1 - min) + min; |
|
} |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
unsigned int next_max_delay = max_delay << 1; |
|
always_delay_or_prevent_hoisting(next(max_delay, next_max_delay)); |
|
max_delay = next_max_delay; |
|
} |
|
}; |
|
|
|
unsigned int seed; |
|
__device__ __forceinline__ exponential_backoff_jitter_window_constructor_t(unsigned int seed) |
|
: seed(seed) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() { return {InitialDelay, seed}; } |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backon_jitter_window_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
static constexpr unsigned int a = 16807; |
|
static constexpr unsigned int c = 0; |
|
static constexpr unsigned int m = 1u << 31; |
|
|
|
unsigned int max_delay; |
|
unsigned int &seed; |
|
|
|
__device__ __forceinline__ unsigned int next(unsigned int min, unsigned int max) |
|
{ |
|
return (seed = (a * seed + c) % m) % (max + 1 - min) + min; |
|
} |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
int prev_delay = max_delay >> 1; |
|
always_delay_or_prevent_hoisting(next(prev_delay, max_delay)); |
|
max_delay = prev_delay; |
|
} |
|
}; |
|
|
|
unsigned int seed; |
|
unsigned int max_delay = InitialDelay; |
|
|
|
__device__ __forceinline__ exponential_backon_jitter_window_constructor_t(unsigned int seed) |
|
: seed(seed) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() |
|
{ |
|
max_delay >>= 1; |
|
return {max_delay, seed}; |
|
} |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backon_jitter_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
static constexpr unsigned int a = 16807; |
|
static constexpr unsigned int c = 0; |
|
static constexpr unsigned int m = 1u << 31; |
|
|
|
unsigned int max_delay; |
|
unsigned int &seed; |
|
|
|
__device__ __forceinline__ unsigned int next(unsigned int min, unsigned int max) |
|
{ |
|
return (seed = (a * seed + c) % m) % (max + 1 - min) + min; |
|
} |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
always_delay_or_prevent_hoisting(next(0, max_delay)); |
|
max_delay >>= 1; |
|
} |
|
}; |
|
|
|
unsigned int seed; |
|
unsigned int max_delay = InitialDelay; |
|
|
|
__device__ __forceinline__ exponential_backon_jitter_constructor_t(unsigned int seed) |
|
: seed(seed) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() |
|
{ |
|
max_delay >>= 1; |
|
return {max_delay, seed}; |
|
} |
|
}; |
|
|
|
template <unsigned int InitialDelay, unsigned int L2WriteLatency> |
|
struct exponential_backon_constructor_t |
|
{ |
|
struct delay_t |
|
{ |
|
unsigned int delay; |
|
|
|
__device__ __forceinline__ void operator()() |
|
{ |
|
always_delay_or_prevent_hoisting(delay); |
|
delay >>= 1; |
|
} |
|
}; |
|
|
|
unsigned int max_delay = InitialDelay; |
|
|
|
__device__ __forceinline__ exponential_backon_constructor_t(unsigned int /* seed */) |
|
{ |
|
always_delay<L2WriteLatency>(); |
|
} |
|
|
|
__device__ __forceinline__ delay_t operator()() |
|
{ |
|
max_delay >>= 1; |
|
return {max_delay}; |
|
} |
|
}; |
|
|
|
using default_no_delay_constructor_t = no_delay_constructor_t<450>; |
|
using default_no_delay_t = default_no_delay_constructor_t::delay_t; |
|
|
|
template <class T> |
|
using default_delay_constructor_t = cub::detail::conditional_t<Traits<T>::PRIMITIVE, |
|
fixed_delay_constructor_t<350, 450>, |
|
default_no_delay_constructor_t>; |
|
|
|
template <class T> |
|
using default_delay_t = typename default_delay_constructor_t<T>::delay_t; |
|
|
|
template <class KeyT, class ValueT> |
|
using default_reduce_by_key_delay_constructor_t = |
|
detail::conditional_t<(Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16), |
|
reduce_by_key_delay_constructor_t<350, 450>, |
|
default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>; |
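
// For example (illustrative only): with int keys and float values the pair is
// 8 bytes, so the tuned reduce-by-key constructor is selected; a double/double
// pair is 16 bytes and falls back to the generic default for its KeyValuePair
// type (which is not a primitive, hence the no-delay constructor):
//
//     static_assert(std::is_same<
//                       default_reduce_by_key_delay_constructor_t<int, float>,
//                       reduce_by_key_delay_constructor_t<350, 450>>::value,
//                   "8-byte primitive pairs use the tuned constructor");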
|
} |
|
|
|
/** |
|
* Tile status interface. |
|
*/ |
|
template < |
|
typename T, |
|
bool SINGLE_WORD = Traits<T>::PRIMITIVE> |
|
struct ScanTileState; |
|
|
|
|
|
/** |
|
* Tile status interface specialized for scan status and value types |
|
* that can be combined into one machine word that can be |
|
* read/written coherently in a single access. |
|
*/ |
|
template <typename T> |
|
struct ScanTileState<T, true> |
|
{ |
|
// Status word type |
|
using StatusWord = cub::detail::conditional_t< |
|
sizeof(T) == 8, |
|
unsigned long long, |
|
cub::detail::conditional_t< |
|
sizeof(T) == 4, |
|
unsigned int, |
|
cub::detail::conditional_t<sizeof(T) == 2, unsigned short, unsigned char>>>; |
|
|
|
// Unit word type |
|
using TxnWord = cub::detail::conditional_t< |
|
sizeof(T) == 8, |
|
ulonglong2, |
|
cub::detail::conditional_t< |
|
sizeof(T) == 4, |
|
uint2, |
|
unsigned int>>; |
|
|
|
// Device word type |
|
struct TileDescriptor |
|
{ |
|
StatusWord status; |
|
T value; |
|
}; |
|
|
|
|
|
// Constants |
|
enum |
|
{ |
|
TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, |
|
}; |
|
|
|
|
|
// Device storage |
|
TxnWord *d_tile_descriptors; |
|
|
|
/// Constructor |
|
__host__ __device__ __forceinline__ |
|
ScanTileState() |
|
: |
|
d_tile_descriptors(NULL) |
|
{} |
|
|
|
|
|
/// Initializer |
|
__host__ __device__ __forceinline__ |
|
cudaError_t Init( |
|
int /*num_tiles*/, ///< [in] Number of tiles |
|
void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. |
|
        size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \p d_temp_storage allocation
|
{ |
|
d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage); |
|
return cudaSuccess; |
|
} |
|
|
|
|
|
/** |
|
* Compute device memory needed for tile status |
|
*/ |
|
__host__ __device__ __forceinline__ |
|
static cudaError_t AllocationSize( |
|
int num_tiles, ///< [in] Number of tiles |
|
        size_t &temp_storage_bytes) ///< [out] Size in bytes of \p d_temp_storage allocation
|
{ |
|
temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors |
|
return cudaSuccess; |
|
} |
|
|
|
|
|
/** |
|
* Initialize (from device) |
|
*/ |
|
__device__ __forceinline__ void InitializeStatus(int num_tiles) |
|
{ |
|
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; |
|
|
|
TxnWord val = TxnWord(); |
|
TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val); |
|
|
|
if (tile_idx < num_tiles) |
|
{ |
|
// Not-yet-set |
|
descriptor->status = StatusWord(SCAN_TILE_INVALID); |
|
d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; |
|
} |
|
|
|
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) |
|
{ |
|
// Padding |
|
descriptor->status = StatusWord(SCAN_TILE_OOB); |
|
d_tile_descriptors[threadIdx.x] = val; |
|
} |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's inclusive value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) |
|
{ |
|
TileDescriptor tile_descriptor; |
|
tile_descriptor.status = SCAN_TILE_INCLUSIVE; |
|
tile_descriptor.value = tile_inclusive; |
|
|
|
TxnWord alias; |
|
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; |
|
|
|
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's partial value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) |
|
{ |
|
TileDescriptor tile_descriptor; |
|
tile_descriptor.status = SCAN_TILE_PARTIAL; |
|
tile_descriptor.value = tile_partial; |
|
|
|
TxnWord alias; |
|
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; |
|
|
|
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); |
|
} |
|
|
|
/** |
|
* Wait for the corresponding tile to become non-invalid |
|
*/ |
|
template <class DelayT = detail::default_delay_t<T>> |
|
__device__ __forceinline__ void WaitForValid( |
|
int tile_idx, |
|
StatusWord &status, |
|
T &value, |
|
DelayT delay_or_prevent_hoisting = {}) |
|
{ |
|
TileDescriptor tile_descriptor; |
|
|
|
{ |
|
TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); |
|
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); |
|
} |
|
|
|
while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) |
|
{ |
|
delay_or_prevent_hoisting(); |
|
TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); |
|
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); |
|
} |
|
|
|
status = tile_descriptor.status; |
|
value = tile_descriptor.value; |
|
} |
|
|
|
/** |
|
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or |
|
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. |
|
*/ |
|
__device__ __forceinline__ T LoadValid(int tile_idx) |
|
{ |
|
TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx]; |
|
TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); |
|
return tile_descriptor.value; |
|
} |
|
}; |
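
// A minimal setup sketch (illustrative only; init_kernel, the 128-thread block
// size, and the absence of error handling are assumptions, not part of this
// header):
//
//     // Host side: size, allocate, and wire up the tile-status storage
//     size_t temp_storage_bytes = 0;
//     cub::ScanTileState<int>::AllocationSize(num_tiles, temp_storage_bytes);
//
//     void *d_temp_storage = nullptr;
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//     cub::ScanTileState<int> tile_state;
//     tile_state.Init(num_tiles, d_temp_storage, temp_storage_bytes);
//
//     // Device side: mark every descriptor invalid before the scan kernel runs,
//     // e.g. with a kernel whose body is simply
//     //     tile_state.InitializeStatus(num_tiles);
//     // launched as init_kernel<<<(num_tiles + 127) / 128, 128>>>(tile_state, num_tiles);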
|
|
|
|
|
|
|
/** |
|
* Tile status interface specialized for scan status and value types that |
|
* cannot be combined into one machine word. |
|
*/ |
|
template <typename T> |
|
struct ScanTileState<T, false> |
|
{ |
|
// Status word type |
|
using StatusWord = unsigned int; |
|
|
|
// Constants |
|
enum |
|
{ |
|
TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, |
|
}; |
|
|
|
// Device storage |
|
StatusWord *d_tile_status; |
|
T *d_tile_partial; |
|
T *d_tile_inclusive; |
|
|
|
/// Constructor |
|
__host__ __device__ __forceinline__ |
|
ScanTileState() |
|
: |
|
d_tile_status(NULL), |
|
d_tile_partial(NULL), |
|
d_tile_inclusive(NULL) |
|
{} |
|
|
|
|
|
/// Initializer |
|
__host__ __device__ __forceinline__ |
|
cudaError_t Init( |
|
int num_tiles, ///< [in] Number of tiles |
|
void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. |
|
        size_t temp_storage_bytes) ///< [in] Size in bytes of \p d_temp_storage allocation
|
{ |
|
cudaError_t error = cudaSuccess; |
|
do |
|
{ |
|
void* allocations[3] = {}; |
|
size_t allocation_sizes[3]; |
|
|
|
allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors |
|
allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials |
|
allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives |
|
|
|
// Compute allocation pointers into the single storage blob |
|
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; |
|
|
|
// Alias the offsets |
|
d_tile_status = reinterpret_cast<StatusWord*>(allocations[0]); |
|
d_tile_partial = reinterpret_cast<T*>(allocations[1]); |
|
d_tile_inclusive = reinterpret_cast<T*>(allocations[2]); |
|
} |
|
while (0); |
|
|
|
return error; |
|
} |
|
|
|
|
|
/** |
|
* Compute device memory needed for tile status |
|
*/ |
|
__host__ __device__ __forceinline__ |
|
static cudaError_t AllocationSize( |
|
int num_tiles, ///< [in] Number of tiles |
|
        size_t &temp_storage_bytes) ///< [out] Size in bytes of \p d_temp_storage allocation
|
{ |
|
// Specify storage allocation requirements |
|
size_t allocation_sizes[3]; |
|
allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors |
|
allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for partials |
|
allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>); // bytes needed for inclusives |
|
|
|
// Set the necessary size of the blob |
|
void* allocations[3] = {}; |
|
return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); |
|
} |
|
|
|
|
|
/** |
|
* Initialize (from device) |
|
*/ |
|
__device__ __forceinline__ void InitializeStatus(int num_tiles) |
|
{ |
|
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; |
|
if (tile_idx < num_tiles) |
|
{ |
|
// Not-yet-set |
|
d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); |
|
} |
|
|
|
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) |
|
{ |
|
// Padding |
|
d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); |
|
} |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's inclusive value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) |
|
{ |
|
// Update tile inclusive value |
|
ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); |
|
detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's partial value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) |
|
{ |
|
// Update tile partial value |
|
ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); |
|
detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); |
|
} |
|
|
|
/** |
|
* Wait for the corresponding tile to become non-invalid |
|
*/ |
|
template <class DelayT = detail::default_no_delay_t> |
|
__device__ __forceinline__ void WaitForValid( |
|
int tile_idx, |
|
StatusWord &status, |
|
T &value, |
|
DelayT delay = {}) |
|
{ |
|
do |
|
{ |
|
delay(); |
|
status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); |
|
__threadfence(); |
|
} while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); |
|
|
|
if (status == StatusWord(SCAN_TILE_PARTIAL)) |
|
{ |
|
value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx); |
|
} |
|
else |
|
{ |
|
value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); |
|
} |
|
} |
|
|
|
/** |
|
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or |
|
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. |
|
*/ |
|
__device__ __forceinline__ T LoadValid(int tile_idx) |
|
{ |
|
return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx]; |
|
} |
|
}; |
|
|
|
|
|
/****************************************************************************** |
|
* ReduceByKey tile status interface types for block-cooperative scans |
|
******************************************************************************/ |
|
|
|
/** |
|
* Tile status interface for reduction by key. |
|
* |
|
*/ |
|
template < |
|
typename ValueT, |
|
typename KeyT, |
|
bool SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> |
|
struct ReduceByKeyScanTileState; |
|
|
|
|
|
/** |
|
* Tile status interface for reduction by key, specialized for scan status and value types that |
|
* cannot be combined into one machine word. |
|
*/ |
|
template < |
|
typename ValueT, |
|
typename KeyT> |
|
struct ReduceByKeyScanTileState<ValueT, KeyT, false> : |
|
ScanTileState<KeyValuePair<KeyT, ValueT> > |
|
{ |
|
typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass; |
|
|
|
/// Constructor |
|
__host__ __device__ __forceinline__ |
|
ReduceByKeyScanTileState() : SuperClass() {} |
|
}; |
|
|
|
|
|
/** |
|
* Tile status interface for reduction by key, specialized for scan status and value types that |
|
* can be combined into one machine word that can be read/written coherently in a single access. |
|
*/ |
|
template < |
|
typename ValueT, |
|
typename KeyT> |
|
struct ReduceByKeyScanTileState<ValueT, KeyT, true> |
|
{ |
|
using KeyValuePairT = KeyValuePair<KeyT, ValueT>; |
|
|
|
// Constants |
|
enum |
|
{ |
|
PAIR_SIZE = static_cast<int>(sizeof(ValueT) + sizeof(KeyT)), |
|
TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE, |
|
STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, |
|
|
|
TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, |
|
}; |
|
|
|
// Status word type |
|
using StatusWord = cub::detail::conditional_t< |
|
STATUS_WORD_SIZE == 8, |
|
unsigned long long, |
|
cub::detail::conditional_t< |
|
STATUS_WORD_SIZE == 4, |
|
unsigned int, |
|
cub::detail::conditional_t<STATUS_WORD_SIZE == 2, unsigned short, unsigned char>>>; |
|
|
|
// Status word type |
|
using TxnWord = cub::detail::conditional_t< |
|
TXN_WORD_SIZE == 16, |
|
ulonglong2, |
|
cub::detail::conditional_t<TXN_WORD_SIZE == 8, unsigned long long, unsigned int>>; |
|
|
|
// Device word type (for when sizeof(ValueT) == sizeof(KeyT)) |
|
struct TileDescriptorBigStatus |
|
{ |
|
KeyT key; |
|
ValueT value; |
|
StatusWord status; |
|
}; |
|
|
|
// Device word type (for when sizeof(ValueT) != sizeof(KeyT)) |
|
struct TileDescriptorLittleStatus |
|
{ |
|
ValueT value; |
|
StatusWord status; |
|
KeyT key; |
|
}; |
|
|
|
// Device word type |
|
using TileDescriptor = |
|
cub::detail::conditional_t<sizeof(ValueT) == sizeof(KeyT), |
|
TileDescriptorBigStatus, |
|
TileDescriptorLittleStatus>; |
|
|
|
// Device storage |
|
TxnWord *d_tile_descriptors; |
|
|
|
|
|
/// Constructor |
|
__host__ __device__ __forceinline__ |
|
ReduceByKeyScanTileState() |
|
: |
|
d_tile_descriptors(NULL) |
|
{} |
|
|
|
|
|
/// Initializer |
|
__host__ __device__ __forceinline__ |
|
cudaError_t Init( |
|
int /*num_tiles*/, ///< [in] Number of tiles |
|
void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. |
|
        size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \p d_temp_storage allocation
|
{ |
|
d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage); |
|
return cudaSuccess; |
|
} |
|
|
|
|
|
/** |
|
* Compute device memory needed for tile status |
|
*/ |
|
__host__ __device__ __forceinline__ |
|
static cudaError_t AllocationSize( |
|
int num_tiles, ///< [in] Number of tiles |
|
        size_t &temp_storage_bytes) ///< [out] Size in bytes of \p d_temp_storage allocation
|
{ |
|
temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors |
|
return cudaSuccess; |
|
} |
|
|
|
|
|
/** |
|
* Initialize (from device) |
|
*/ |
|
__device__ __forceinline__ void InitializeStatus(int num_tiles) |
|
{ |
|
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; |
|
TxnWord val = TxnWord(); |
|
TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val); |
|
|
|
if (tile_idx < num_tiles) |
|
{ |
|
// Not-yet-set |
|
descriptor->status = StatusWord(SCAN_TILE_INVALID); |
|
d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; |
|
} |
|
|
|
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) |
|
{ |
|
// Padding |
|
descriptor->status = StatusWord(SCAN_TILE_OOB); |
|
d_tile_descriptors[threadIdx.x] = val; |
|
} |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's inclusive value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) |
|
{ |
|
TileDescriptor tile_descriptor; |
|
tile_descriptor.status = SCAN_TILE_INCLUSIVE; |
|
tile_descriptor.value = tile_inclusive.value; |
|
tile_descriptor.key = tile_inclusive.key; |
|
|
|
TxnWord alias; |
|
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; |
|
|
|
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); |
|
} |
|
|
|
|
|
/** |
|
* Update the specified tile's partial value and corresponding status |
|
*/ |
|
__device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial) |
|
{ |
|
TileDescriptor tile_descriptor; |
|
tile_descriptor.status = SCAN_TILE_PARTIAL; |
|
tile_descriptor.value = tile_partial.value; |
|
tile_descriptor.key = tile_partial.key; |
|
|
|
TxnWord alias; |
|
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor; |
|
|
|
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); |
|
} |
|
|
|
/** |
|
* Wait for the corresponding tile to become non-invalid |
|
*/ |
|
template <class DelayT = detail::fixed_delay_constructor_t<350, 450>::delay_t> |
|
__device__ __forceinline__ void WaitForValid( |
|
int tile_idx, |
|
StatusWord &status, |
|
KeyValuePairT &value, |
|
DelayT delay_or_prevent_hoisting = {}) |
|
{ |
|
|
|
|
TileDescriptor tile_descriptor; |
|
|
|
do |
|
{ |
|
delay_or_prevent_hoisting(); |
|
TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); |
|
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias); |
|
|
|
} while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); |
|
|
|
status = tile_descriptor.status; |
|
value.value = tile_descriptor.value; |
|
value.key = tile_descriptor.key; |
|
} |
|
|
|
}; |
|
|
|
|
|
/****************************************************************************** |
|
* Prefix call-back operator for coupling local block scan within a |
|
* block-cooperative scan |
|
******************************************************************************/ |
|
|
|
/** |
|
 * Stateful block-scan prefix functor. Provides the running prefix for
|
 * the current tile by using the call-back warp to wait on
|
* aggregates/prefixes from predecessor tiles to become available. |
|
* |
|
* @tparam DelayConstructorT |
|
* Implementation detail, do not specify directly, requirements on the |
|
* content of this type are subject to breaking change. |
|
*/ |
|
template < |
|
typename T, |
|
typename ScanOpT, |
|
typename ScanTileStateT, |
|
int LEGACY_PTX_ARCH = 0, |
|
typename DelayConstructorT = detail::default_delay_constructor_t<T>> |
|
struct TilePrefixCallbackOp |
|
{ |
|
// Parameterized warp reduce |
|
typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT; |
|
|
|
// Temporary storage type |
|
struct _TempStorage |
|
{ |
|
typename WarpReduceT::TempStorage warp_reduce; |
|
T exclusive_prefix; |
|
T inclusive_prefix; |
|
T block_aggregate; |
|
}; |
|
|
|
// Alias wrapper allowing temporary storage to be unioned |
|
struct TempStorage : Uninitialized<_TempStorage> {}; |
|
|
|
// Type of status word |
|
typedef typename ScanTileStateT::StatusWord StatusWord; |
|
|
|
// Fields |
|
    _TempStorage& temp_storage; ///< Reference to temporary storage
|
ScanTileStateT& tile_status; ///< Interface to tile status |
|
ScanOpT scan_op; ///< Binary scan operator |
|
int tile_idx; ///< The current tile index |
|
T exclusive_prefix; ///< Exclusive prefix for the tile |
|
T inclusive_prefix; ///< Inclusive prefix for the tile |
|
|
|
// Constructs prefix functor for a given tile index. |
|
// Precondition: thread blocks processing all of the predecessor tiles were scheduled. |
|
__device__ __forceinline__ TilePrefixCallbackOp(ScanTileStateT &tile_status, |
|
TempStorage &temp_storage, |
|
ScanOpT scan_op, |
|
int tile_idx) |
|
: temp_storage(temp_storage.Alias()) |
|
, tile_status(tile_status) |
|
, scan_op(scan_op) |
|
, tile_idx(tile_idx) |
|
{} |
|
|
|
// Computes the tile index and constructs prefix functor with it. |
|
    // Precondition: one thread block is assigned per tile (the tile index is blockIdx.x).
|
__device__ __forceinline__ TilePrefixCallbackOp(ScanTileStateT &tile_status, |
|
TempStorage &temp_storage, |
|
ScanOpT scan_op) |
|
: TilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x) |
|
{} |
|
|
|
// Block until all predecessors within the warp-wide window have non-invalid status |
|
template <class DelayT = detail::default_delay_t<T>> |
|
__device__ __forceinline__ |
|
void ProcessWindow( |
|
int predecessor_idx, ///< Preceding tile index to inspect |
|
StatusWord &predecessor_status, ///< [out] Preceding tile status |
|
T &window_aggregate, ///< [out] Relevant partial reduction from this window of preceding tiles |
|
DelayT delay = {}) |
|
{ |
|
T value; |
|
tile_status.WaitForValid(predecessor_idx, predecessor_status, value, delay); |
|
|
|
// Perform a segmented reduction to get the prefix for the current window. |
|
// Use the swizzled scan operator because we are now scanning *down* towards thread0. |
|
|
|
int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); |
|
window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( |
|
value, |
|
tail_flag, |
|
SwizzleScanOp<ScanOpT>(scan_op)); |
|
} |
|
|
|
|
|
// BlockScan prefix callback functor (called by the first warp) |
|
__device__ __forceinline__ |
|
T operator()(T block_aggregate) |
|
{ |
|
// Update our status with our tile-aggregate |
|
if (threadIdx.x == 0) |
|
{ |
|
detail::uninitialized_copy(&temp_storage.block_aggregate, |
|
block_aggregate); |
|
|
|
tile_status.SetPartial(tile_idx, block_aggregate); |
|
} |
|
|
|
int predecessor_idx = tile_idx - threadIdx.x - 1; |
|
StatusWord predecessor_status; |
|
T window_aggregate; |
|
|
|
// Wait for the warp-wide window of predecessor tiles to become valid |
|
DelayConstructorT construct_delay(tile_idx); |
|
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay()); |
|
|
|
// The exclusive tile prefix starts out as the current window aggregate |
|
exclusive_prefix = window_aggregate; |
|
|
|
// Keep sliding the window back until we come across a tile whose inclusive prefix is known |
|
while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) |
|
{ |
|
predecessor_idx -= CUB_PTX_WARP_THREADS; |
|
|
|
// Update exclusive tile prefix with the window prefix |
|
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay()); |
|
exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); |
|
} |
|
|
|
// Compute the inclusive tile prefix and update the status for this tile |
|
if (threadIdx.x == 0) |
|
{ |
|
inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); |
|
tile_status.SetInclusive(tile_idx, inclusive_prefix); |
|
|
|
detail::uninitialized_copy(&temp_storage.exclusive_prefix, |
|
exclusive_prefix); |
|
|
|
detail::uninitialized_copy(&temp_storage.inclusive_prefix, |
|
inclusive_prefix); |
|
} |
|
|
|
// Return exclusive_prefix |
|
return exclusive_prefix; |
|
} |
|
|
|
// Get the exclusive prefix stored in temporary storage |
|
__device__ __forceinline__ |
|
T GetExclusivePrefix() |
|
{ |
|
return temp_storage.exclusive_prefix; |
|
} |
|
|
|
// Get the inclusive prefix stored in temporary storage |
|
__device__ __forceinline__ |
|
T GetInclusivePrefix() |
|
{ |
|
return temp_storage.inclusive_prefix; |
|
} |
|
|
|
// Get the block aggregate stored in temporary storage |
|
__device__ __forceinline__ |
|
T GetBlockAggregate() |
|
{ |
|
return temp_storage.block_aggregate; |
|
} |
|
|
|
__device__ __forceinline__ |
|
int GetTileIdx() const |
|
{ |
|
return tile_idx; |
|
} |
|
}; |
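
// A minimal consumer sketch (illustrative only; ScanTileKernel, the 128-thread
// block size, and int data are assumptions, not part of this header).  Tile 0
// publishes its inclusive prefix directly; every later tile uses the callback,
// which waits on predecessor tiles and supplies the running prefix:
//
//     __global__ void ScanTileKernel(int *d_data, cub::ScanTileState<int> tile_state)
//     {
//         using BlockScanT = cub::BlockScan<int, 128>;
//         using PrefixOpT  = cub::TilePrefixCallbackOp<int, cub::Sum, cub::ScanTileState<int>>;
//
//         // Both storages are live at the same time during the prefixed scan
//         __shared__ struct {
//             typename PrefixOpT::TempStorage  prefix;
//             typename BlockScanT::TempStorage scan;
//         } temp_storage;
//
//         int tile_idx = blockIdx.x;
//         int item     = d_data[tile_idx * 128 + threadIdx.x];
//
//         if (tile_idx == 0)
//         {
//             // First tile: scan locally and publish the tile aggregate as its inclusive prefix
//             int block_aggregate;
//             BlockScanT(temp_storage.scan).ExclusiveSum(item, item, block_aggregate);
//             if (threadIdx.x == 0)
//             {
//                 tile_state.SetInclusive(0, block_aggregate);
//             }
//         }
//         else
//         {
//             // Later tiles: the callback performs the decoupled look-back
//             PrefixOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
//             BlockScanT(temp_storage.scan).ExclusiveSum(item, item, prefix_op);
//         }
//
//         d_data[tile_idx * 128 + threadIdx.x] = item;
//     }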
|
|
|
|
|
CUB_NAMESPACE_END |
|
|
|
|