|
/****************************************************************************** |
|
* Copyright (c) 2011, Duane Merrill. All rights reserved. |
|
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of the NVIDIA CORPORATION nor the |
|
* names of its contributors may be used to endorse or promote products |
|
* derived from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
******************************************************************************/ |
|
|
|
/** |
|
* @file cub::DeviceSegmentedReduce provides device-wide, parallel operations |
|
* for computing a batched reduction across multiple sequences of data |
|
* items residing within device-accessible memory. |
|
*/ |
|
|
|
#pragma once |
|
|
|
#include <iterator> |
|
|
|
#include <cub/config.cuh> |
|
#include <cub/device/dispatch/dispatch_reduce.cuh> |
|
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh> |
|
#include <cub/iterator/arg_index_input_iterator.cuh> |
|
#include <cub/util_deprecated.cuh> |
|
#include <cub/util_type.cuh> |
|
|
|
CUB_NAMESPACE_BEGIN |
|
|
|
|
|
/** |
|
* @brief DeviceSegmentedReduce provides device-wide, parallel operations for |
|
* computing a reduction across multiple sequences of data items |
|
* residing within device-accessible memory.  |
|
* @ingroup SegmentedModule |
|
* |
|
* @par Overview |
|
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)">*reduction*</a> |
|
* (or *fold*) uses a binary combining operator to compute a single aggregate |
|
* from a sequence of input elements. |
|
* |
|
* @par Usage Considerations |
|
* @cdp_class{DeviceSegmentedReduce} |
|
* |
|
*/ |
|
struct DeviceSegmentedReduce |
|
{ |
|
/** |
|
* @brief Computes a device-wide segmented reduction using the specified |
|
* binary `reduction_op` functor. |
|
* |
|
* @par |
|
* - Does not support binary reduction operators that are non-commutative. |
|
* - Provides "run-to-run" determinism for pseudo-associative reduction |
|
* (e.g., addition of floating point types) on the same GPU device. |
|
* However, results for pseudo-associative reduction may be inconsistent |
|
   * from one device to another device of a different compute-capability
|
* because CUB can employ different tile-sizing for different architectures. |
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased |
|
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates a custom min-reduction of a device |
|
* vector of `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
   * // or equivalently <cub/device/device_segmented_reduce.cuh>
|
* |
|
* // CustomMin functor |
|
* struct CustomMin |
|
* { |
|
* template <typename T> |
|
* CUB_RUNTIME_FUNCTION __forceinline__ |
|
* T operator()(const T &a, const T &b) const { |
|
* return (b < a) ? b : a; |
|
* } |
|
* }; |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_out; // e.g., [-, -, -] |
|
* CustomMin min_op; |
|
* int initial_value; // e.g., INT_MAX |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::Reduce( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1, min_op, initial_value); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run reduction |
|
* cub::DeviceSegmentedReduce::Reduce( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1, min_op, initial_value); |
|
* |
|
* // d_out <-- [6, INT_MAX, 0] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input |
|
* items \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced |
|
* aggregate \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @tparam ReductionOp |
|
* **[inferred]** Binary reduction functor type having member |
|
* `T operator()(const T &a, const T &b)` |
|
* |
|
* @tparam T |
|
* **[inferred]** Data element type that is convertible to the `value` type |
|
* of `InputIteratorT` |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no |
|
* work is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
   * Reference to size in bytes of `d_temp_storage` allocation
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the *i*<sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
   * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
|
* considered empty. |
|
* |
|
* @param[in] reduction_op |
|
* Binary reduction functor |
|
* |
|
* @param[in] initial_value |
|
* Initial value of the reduction for each segment |
|
* |
|
* @param[in] stream |
|
* **[optional]** CUDA stream to launch kernels within. |
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT, |
|
typename ReductionOp, |
|
typename T> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
Reduce(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
ReductionOp reduction_op, |
|
T initial_value, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
return DispatchSegmentedReduce<InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
ReductionOp>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
reduction_op, |
|
initial_value, |
|
stream); |
|
} |
|
|
|
  // Deprecated overload: additionally accepts the historical
  // `debug_synchronous` flag. The flag is discarded (it is not forwarded
  // below); the call is delegated to the stream-only Reduce() overload.
  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename BeginOffsetIteratorT,
            typename EndOffsetIteratorT,
            typename ReductionOp,
            typename T>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION static cudaError_t
  Reduce(void *d_temp_storage,
         size_t &temp_storage_bytes,
         InputIteratorT d_in,
         OutputIteratorT d_out,
         int num_segments,
         BeginOffsetIteratorT d_begin_offsets,
         EndOffsetIteratorT d_end_offsets,
         ReductionOp reduction_op,
         T initial_value,
         cudaStream_t stream,
         bool debug_synchronous)
  {
    // NOTE(review): presumably notifies the user that `debug_synchronous`
    // is no longer honored — confirm against <cub/util_deprecated.cuh>.
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the stream-only overload, dropping `debug_synchronous`.
    return Reduce<InputIteratorT,
                  OutputIteratorT,
                  BeginOffsetIteratorT,
                  EndOffsetIteratorT,
                  ReductionOp,
                  T>(d_temp_storage,
                     temp_storage_bytes,
                     d_in,
                     d_out,
                     num_segments,
                     d_begin_offsets,
                     d_end_offsets,
                     reduction_op,
                     initial_value,
                     stream);
  }
|
|
|
/** |
|
* @brief Computes a device-wide segmented sum using the addition (`+`) |
|
* operator. |
|
* |
|
* @par |
|
* - Uses `0` as the initial value of the reduction for each segment. |
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased |
|
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - Does not support `+` operators that are non-commutative. |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the sum reduction of a device vector of |
|
* `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
   * // or equivalently <cub/device/device_segmented_reduce.cuh>
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_out; // e.g., [-, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::Sum( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sum-reduction |
|
* cub::DeviceSegmentedReduce::Sum( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_out <-- [21, 0, 17] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input |
|
* items \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced aggregate |
|
* \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of `d_temp_storage` allocation |
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the *i*<sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
   * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
   * **[optional]** CUDA stream to launch kernels within.
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
Sum(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
// The output value type |
|
using OutputT = |
|
cub::detail::non_void_value_t<OutputIteratorT, |
|
cub::detail::value_t<InputIteratorT>>; |
|
|
|
return DispatchSegmentedReduce< |
|
InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
cub::Sum>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
cub::Sum(), |
|
OutputT(), // zero-initialize |
|
stream); |
|
} |
|
|
|
  // Deprecated overload: additionally accepts the historical
  // `debug_synchronous` flag. The flag is discarded (it is not forwarded
  // below); the call is delegated to the stream-only Sum() overload.
  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename BeginOffsetIteratorT,
            typename EndOffsetIteratorT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION static cudaError_t
  Sum(void *d_temp_storage,
      size_t &temp_storage_bytes,
      InputIteratorT d_in,
      OutputIteratorT d_out,
      int num_segments,
      BeginOffsetIteratorT d_begin_offsets,
      EndOffsetIteratorT d_end_offsets,
      cudaStream_t stream,
      bool debug_synchronous)
  {
    // NOTE(review): presumably notifies the user that `debug_synchronous`
    // is no longer honored — confirm against <cub/util_deprecated.cuh>.
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the stream-only overload, dropping `debug_synchronous`.
    return Sum<InputIteratorT,
               OutputIteratorT,
               BeginOffsetIteratorT,
               EndOffsetIteratorT>(d_temp_storage,
                                   temp_storage_bytes,
                                   d_in,
                                   d_out,
                                   num_segments,
                                   d_begin_offsets,
                                   d_end_offsets,
                                   stream);
  }
|
|
|
/** |
|
* @brief Computes a device-wide segmented minimum using the less-than |
|
* (`<`) operator. |
|
* |
|
* @par |
|
* - Uses `std::numeric_limits<T>::max()` as the initial value of the |
|
* reduction for each segment. |
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased for both |
|
* the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is |
|
* specified as `segment_offsets + 1`). |
|
* - Does not support `<` operators that are non-commutative. |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the min-reduction of a device vector of |
|
* `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
   * // or equivalently <cub/device/device_segmented_reduce.cuh>
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_out; // e.g., [-, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::Min( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run min-reduction |
|
* cub::DeviceSegmentedReduce::Min( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_out <-- [6, INT_MAX, 0] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input |
|
* items \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced |
|
* aggregate \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of `d_temp_storage` allocation |
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the *i*<sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
   * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* **[optional]** CUDA stream to launch kernels within. |
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
Min(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
// The input value type |
|
using InputT = cub::detail::value_t<InputIteratorT>; |
|
|
|
return DispatchSegmentedReduce< |
|
InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
cub::Min>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
cub::Min(), |
|
Traits<InputT>::Max(), // replace with |
|
// std::numeric_limits<T>::max() |
|
// when C++11 support is more |
|
// prevalent |
|
stream); |
|
} |
|
|
|
  // Deprecated overload: additionally accepts the historical
  // `debug_synchronous` flag. The flag is discarded (it is not forwarded
  // below); the call is delegated to the stream-only Min() overload.
  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename BeginOffsetIteratorT,
            typename EndOffsetIteratorT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION static cudaError_t
  Min(void *d_temp_storage,
      size_t &temp_storage_bytes,
      InputIteratorT d_in,
      OutputIteratorT d_out,
      int num_segments,
      BeginOffsetIteratorT d_begin_offsets,
      EndOffsetIteratorT d_end_offsets,
      cudaStream_t stream,
      bool debug_synchronous)
  {
    // NOTE(review): presumably notifies the user that `debug_synchronous`
    // is no longer honored — confirm against <cub/util_deprecated.cuh>.
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the stream-only overload, dropping `debug_synchronous`.
    return Min<InputIteratorT,
               OutputIteratorT,
               BeginOffsetIteratorT,
               EndOffsetIteratorT>(d_temp_storage,
                                   temp_storage_bytes,
                                   d_in,
                                   d_out,
                                   num_segments,
                                   d_begin_offsets,
                                   d_end_offsets,
                                   stream);
  }
|
|
|
/** |
|
* @brief Finds the first device-wide minimum in each segment using the |
|
* less-than ('<') operator, also returning the in-segment index of |
|
* that item. |
|
* |
|
* @par |
|
* - The output value type of `d_out` is cub::KeyValuePair `<int, T>` |
|
* (assuming the value type of `d_in` is `T`) |
|
* - The minimum of the *i*<sup>th</sup> segment is written to |
|
* `d_out[i].value` and its offset in that segment is written to |
|
* `d_out[i].key`. |
|
* - The `{1, std::numeric_limits<T>::max()}` tuple is produced for |
|
* zero-length inputs |
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased for both |
|
* the `d_begin_offsets` and `d_end_offsets` parameters (where the latter |
|
* is specified as `segment_offsets + 1`). |
|
* - Does not support `<` operators that are non-commutative. |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the argmin-reduction of a device vector |
|
* of `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
   * // or equivalently <cub/device/device_segmented_reduce.cuh>
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::ArgMin( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run argmin-reduction |
|
* cub::DeviceSegmentedReduce::ArgMin( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input items |
|
* (of some type `T`) \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced aggregate |
|
* (having value type `KeyValuePair<int, T>`) \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of `d_temp_storage` allocation |
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the *i*<sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the |
|
   * *i*<sup>th</sup> segment is considered empty.
|
* |
|
* @param[in] stream |
|
* **[optional]** CUDA stream to launch kernels within. |
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
ArgMin(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
// The input type |
|
using InputValueT = cub::detail::value_t<InputIteratorT>; |
|
|
|
// The output tuple type |
|
using OutputTupleT = |
|
cub::detail::non_void_value_t<OutputIteratorT, |
|
KeyValuePair<OffsetT, InputValueT>>; |
|
|
|
// The output value type |
|
using OutputValueT = typename OutputTupleT::Value; |
|
|
|
using AccumT = OutputTupleT; |
|
|
|
using InitT = detail::reduce::empty_problem_init_t<AccumT>; |
|
|
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples |
|
using ArgIndexInputIteratorT = |
|
ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>; |
|
|
|
ArgIndexInputIteratorT d_indexed_in(d_in); |
|
|
|
// Initial value |
|
// TODO Address https://github.com/NVIDIA/cub/issues/651 |
|
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())}; |
|
|
|
return DispatchSegmentedReduce<ArgIndexInputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
cub::ArgMin, |
|
InitT, |
|
AccumT>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_indexed_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
cub::ArgMin(), |
|
initial_value, |
|
stream); |
|
} |
|
|
|
  // Deprecated overload: additionally accepts the historical
  // `debug_synchronous` flag. The flag is discarded (it is not forwarded
  // below); the call is delegated to the stream-only ArgMin() overload.
  template <typename InputIteratorT,
            typename OutputIteratorT,
            typename BeginOffsetIteratorT,
            typename EndOffsetIteratorT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION static cudaError_t
  ArgMin(void *d_temp_storage,
         size_t &temp_storage_bytes,
         InputIteratorT d_in,
         OutputIteratorT d_out,
         int num_segments,
         BeginOffsetIteratorT d_begin_offsets,
         EndOffsetIteratorT d_end_offsets,
         cudaStream_t stream,
         bool debug_synchronous)
  {
    // NOTE(review): presumably notifies the user that `debug_synchronous`
    // is no longer honored — confirm against <cub/util_deprecated.cuh>.
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the stream-only overload, dropping `debug_synchronous`.
    return ArgMin<InputIteratorT,
                  OutputIteratorT,
                  BeginOffsetIteratorT,
                  EndOffsetIteratorT>(d_temp_storage,
                                      temp_storage_bytes,
                                      d_in,
                                      d_out,
                                      num_segments,
                                      d_begin_offsets,
                                      d_end_offsets,
                                      stream);
  }
|
|
|
/** |
|
* @brief Computes a device-wide segmented maximum using the greater-than |
|
* (`>`) operator. |
|
* |
|
* @par |
|
* - Uses `std::numeric_limits<T>::lowest()` as the initial value of the |
|
   * reduction for each segment.
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased |
|
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - Does not support `>` operators that are non-commutative. |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the max-reduction of a device vector |
|
* of `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
   * // or equivalently <cub/device/device_segmented_reduce.cuh>
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_out; // e.g., [-, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::Max( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run max-reduction |
|
* cub::DeviceSegmentedReduce::Max( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_out <-- [8, INT_MIN, 9] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input |
|
* items \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced |
|
* aggregate \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of `d_temp_storage` allocation |
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the *i*<sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
   * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* **[optional]** CUDA stream to launch kernels within. |
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
Max(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
// The input value type |
|
using InputT = cub::detail::value_t<InputIteratorT>; |
|
|
|
return DispatchSegmentedReduce< |
|
InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
cub::Max>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
cub::Max(), |
|
Traits<InputT>::Lowest(), // replace with |
|
// std::numeric_limits<T>::lowest() |
|
// when C++11 support is |
|
// more prevalent |
|
stream); |
|
} |
|
|
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
Max(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return Max<InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Finds the first device-wide maximum in each segment using the |
|
* greater-than ('>') operator, also returning the in-segment index of |
|
* that item |
|
* |
|
* @par |
|
* - The output value type of `d_out` is `cub::KeyValuePair<int, T>` |
|
* (assuming the value type of `d_in` is `T`) |
|
* - The maximum of the *i*<sup>th</sup> segment is written to |
|
* `d_out[i].value` and its offset in that segment is written to |
|
* `d_out[i].key`. |
|
* - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for |
|
* zero-length inputs |
|
* - When input a contiguous sequence of segments, a single sequence |
|
* `segment_offsets` (of length `num_segments + 1`) can be aliased |
|
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - Does not support `>` operators that are non-commutative. |
|
* - Let `s` be in `[0, num_segments)`. The range |
|
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not |
|
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)`. |
|
* - @devicestorage |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the argmax-reduction of a device vector |
|
* of `int` data elements. |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_reduce.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for input and output |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedReduce::ArgMax( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run argmax-reduction |
|
* cub::DeviceSegmentedReduce::ArgMax( |
|
* d_temp_storage, temp_storage_bytes, d_in, d_out, |
|
* num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] |
|
* @endcode |
|
* |
|
* @tparam InputIteratorT |
|
* **[inferred]** Random-access input iterator type for reading input items |
|
* (of some type `T`) \iterator |
|
* |
|
* @tparam OutputIteratorT |
|
* **[inferred]** Output iterator type for recording the reduced aggregate |
|
* (having value type `KeyValuePair<int, T>`) \iterator |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* **[inferred]** Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to `temp_storage_bytes` and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of `d_temp_storage` allocation |
|
* |
|
* @param[in] d_in |
|
* Pointer to the input sequence of data items |
|
* |
|
* @param[out] d_out |
|
* Pointer to the output aggregate |
|
* |
|
* @param[in] num_segments |
|
   * The number of segments that comprise the reduction data
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
   * element of the *i*<sup>th</sup> data segment in `d_in`
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
   * the *i*<sup>th</sup> data segment in `d_in`.
|
   * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup>
   * segment is considered empty.
|
* |
|
* @param[in] stream |
|
* **[optional]** CUDA stream to launch kernels within. |
|
* Default is stream<sub>0</sub>. |
|
*/ |
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
ArgMax(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
// Signed integer type for global offsets |
|
using OffsetT = int; |
|
|
|
// The input type |
|
using InputValueT = cub::detail::value_t<InputIteratorT>; |
|
|
|
// The output tuple type |
|
using OutputTupleT = |
|
cub::detail::non_void_value_t<OutputIteratorT, |
|
KeyValuePair<OffsetT, InputValueT>>; |
|
|
|
using AccumT = OutputTupleT; |
|
|
|
using InitT = detail::reduce::empty_problem_init_t<AccumT>; |
|
|
|
// The output value type |
|
using OutputValueT = typename OutputTupleT::Value; |
|
|
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples |
|
using ArgIndexInputIteratorT = |
|
ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>; |
|
|
|
ArgIndexInputIteratorT d_indexed_in(d_in); |
|
|
|
// Initial value |
|
// TODO Address https://github.com/NVIDIA/cub/issues/651 |
|
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())}; |
|
|
|
return DispatchSegmentedReduce<ArgIndexInputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT, |
|
OffsetT, |
|
cub::ArgMax, |
|
InitT, |
|
AccumT>::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_indexed_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
cub::ArgMax(), |
|
initial_value, |
|
stream); |
|
} |
|
|
|
template <typename InputIteratorT, |
|
typename OutputIteratorT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
ArgMax(void *d_temp_storage, |
|
size_t &temp_storage_bytes, |
|
InputIteratorT d_in, |
|
OutputIteratorT d_out, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return ArgMax<InputIteratorT, |
|
OutputIteratorT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_in, |
|
d_out, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
}; |
|
|
|
CUB_NAMESPACE_END |
|
|
|
|
|
|