/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file cub::DeviceSegmentedReduce provides device-wide, parallel operations
* for computing a batched reduction across multiple sequences of data
* items residing within device-accessible memory.
*/
#pragma once
#include <iterator>
#include <cub/config.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_type.cuh>
CUB_NAMESPACE_BEGIN
/**
* @brief DeviceSegmentedReduce provides device-wide, parallel operations for
* computing a reduction across multiple sequences of data items
* residing within device-accessible memory. ![](reduce_logo.png)
* @ingroup SegmentedModule
*
* @par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)">*reduction*</a>
* (or *fold*) uses a binary combining operator to compute a single aggregate
* from a sequence of input elements.
*
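* @par
* Conceptually, a segmented reduction applies such a fold independently to
* each of `num_segments` segments. A minimal sequential sketch of the
* semantics (illustrative only, not CUB's implementation; names refer to
* the `Reduce` parameters documented below):
* @code
* for (int s = 0; s < num_segments; ++s)
* {
*   T aggregate = initial_value;
*   for (int i = d_begin_offsets[s]; i < d_end_offsets[s]; ++i)
*     aggregate = reduction_op(aggregate, d_in[i]);
*   d_out[s] = aggregate;
* }
* @endcode
*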
* @par Usage Considerations
* @cdp_class{DeviceSegmentedReduce}
*
*/
struct DeviceSegmentedReduce
{
/**
* @brief Computes a device-wide segmented reduction using the specified
* binary `reduction_op` functor.
*
* @par
* - Does not support binary reduction operators that are non-commutative.
* - Provides "run-to-run" determinism for pseudo-associative reduction
* (e.g., addition of floating point types) on the same GPU device.
* However, results for pseudo-associative reduction may be inconsistent
* from one device to another device of a different compute-capability
* because CUB can employ different tile-sizing for different architectures.
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where
* the latter is specified as `segment_offsets + 1`).
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates a custom min-reduction of a device
* vector of `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // CustomMin functor
* struct CustomMin
* {
* template <typename T>
* CUB_RUNTIME_FUNCTION __forceinline__
* T operator()(const T &a, const T &b) const {
* return (b < a) ? b : a;
* }
* };
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* int *d_out; // e.g., [-, -, -]
* CustomMin min_op;
* int initial_value; // e.g., INT_MAX
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::Reduce(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run reduction
* cub::DeviceSegmentedReduce::Reduce(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
*
* // d_out <-- [6, INT_MAX, 0]
* @endcode
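*
* @par
* Each call above returns a `cudaError_t`. A minimal sketch of checking the
* result; the `CheckCuda` helper is ours, not a CUB API:
* @code
* #include <cstdio>
*
* // Hypothetical helper: print any CUDA error and continue
* inline void CheckCuda(cudaError_t e)
* {
*   if (e != cudaSuccess)
*     std::printf("CUDA error: %s\n", cudaGetErrorString(e));
* }
*
* CheckCuda(cub::DeviceSegmentedReduce::Reduce(
*   d_temp_storage, temp_storage_bytes, d_in, d_out,
*   num_segments, d_offsets, d_offsets + 1, min_op, initial_value));
* @endcode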
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input
* items \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @tparam ReductionOp
* **[inferred]** Binary reduction functor type having member
* `T operator()(const T &a, const T &b)`
*
* @tparam T
* **[inferred]** Data element type that is convertible to the `value` type
* of `InputIteratorT`
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no
* work is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] reduction_op
* Binary reduction functor
*
* @param[in] initial_value
* Initial value of the reduction for each segment
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename ReductionOp,
typename T>
CUB_RUNTIME_FUNCTION static cudaError_t
Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
return DispatchSegmentedReduce<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
ReductionOp>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
reduction_op,
initial_value,
stream);
}
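// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// Reduce above.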
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename ReductionOp,
typename T>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return Reduce<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
ReductionOp,
T>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
reduction_op,
initial_value,
stream);
}
/**
* @brief Computes a device-wide segmented sum using the addition (`+`)
* operator.
*
* @par
* - Uses `0` as the initial value of the reduction for each segment.
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where
* the latter is specified as `segment_offsets + 1`).
* - Does not support `+` operators that are non-commutative.
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates the sum reduction of a device vector of
* `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* int *d_out; // e.g., [-, -, -]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::Sum(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run sum-reduction
* cub::DeviceSegmentedReduce::Sum(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // d_out <-- [21, 0, 17]
* @endcode
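*
* @par
* Because `InputIteratorT` can be any random-access input iterator, values
* can be transformed on the fly. A sketch that sums squares using CUB's
* `TransformInputIterator`; the `Square` functor is ours:
* @code
* // Hypothetical functor squaring each input element
* struct Square
* {
*   __host__ __device__ int operator()(int x) const { return x * x; }
* };
*
* cub::TransformInputIterator<int, Square, int*> d_squared(d_in, Square{});
* cub::DeviceSegmentedReduce::Sum(
*   d_temp_storage, temp_storage_bytes, d_squared, d_out,
*   num_segments, d_offsets, d_offsets + 1);
* @endcode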
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input
* items \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced aggregate
* \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
* is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t
Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
// The output value type
using OutputT =
cub::detail::non_void_value_t<OutputIteratorT,
cub::detail::value_t<InputIteratorT>>;
return DispatchSegmentedReduce<
InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
cub::Sum>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Sum(),
OutputT(), // zero-initialize
stream);
}
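// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// Sum above.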
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return Sum<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
stream);
}
/**
* @brief Computes a device-wide segmented minimum using the less-than
* (`<`) operator.
*
* @par
* - Uses `std::numeric_limits<T>::max()` as the initial value of the
* reduction for each segment.
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased for both
* the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
* specified as `segment_offsets + 1`).
* - Does not support `<` operators that are non-commutative.
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates the min-reduction of a device vector of
* `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* int *d_out; // e.g., [-, -, -]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::Min(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run min-reduction
* cub::DeviceSegmentedReduce::Min(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // d_out <-- [6, INT_MAX, 0]
* @endcode
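*
* @par
* For uniformly sized segments, the offsets need not be materialized in
* device memory. A sketch describing fixed-length segments with CUB's
* `CountingInputIterator` and `TransformInputIterator`; the `ScaleBy`
* functor and `segment_size` are ours:
* @code
* // Hypothetical functor mapping segment index i to offset i * factor
* struct ScaleBy
* {
*   int factor;
*   __host__ __device__ int operator()(int i) const { return i * factor; }
* };
*
* int segment_size = 128; // e.g., every segment holds 128 items
* cub::CountingInputIterator<int> identity(0);
* cub::TransformInputIterator<int, ScaleBy, cub::CountingInputIterator<int>>
*   d_offsets_it(identity, ScaleBy{segment_size});
* cub::DeviceSegmentedReduce::Min(
*   d_temp_storage, temp_storage_bytes, d_in, d_out,
*   num_segments, d_offsets_it, d_offsets_it + 1);
* @endcode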
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input
* items \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
* is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t
Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
return DispatchSegmentedReduce<
InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
cub::Min>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Min(),
Traits<InputT>::Max(), // replace with
// std::numeric_limits<T>::max()
// when C++11 support is more
// prevalent
stream);
}
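// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// Min above.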
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return Min<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
stream);
}
/**
* @brief Finds the first device-wide minimum in each segment using the
* less-than (`<`) operator, also returning the in-segment index of
* that item.
*
* @par
* - The output value type of `d_out` is `cub::KeyValuePair<int, T>`
* (assuming the value type of `d_in` is `T`)
* - The minimum of the *i*<sup>th</sup> segment is written to
* `d_out[i].value` and its offset in that segment is written to
* `d_out[i].key`.
* - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
* zero-length inputs
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased for both
* the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
* is specified as `segment_offsets + 1`).
* - Does not support `<` operators that are non-commutative.
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates the argmin-reduction of a device vector
* of `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::ArgMin(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run argmin-reduction
* cub::DeviceSegmentedReduce::ArgMin(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
* @endcode
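*
* @par
* A sketch of copying the `KeyValuePair` results back to the host with
* plain CUDA calls (nothing beyond the snippet above is assumed):
* @code
* #include <vector>
*
* std::vector<cub::KeyValuePair<int, int>> h_out(num_segments);
* cudaMemcpy(h_out.data(), d_out,
*            num_segments * sizeof(cub::KeyValuePair<int, int>),
*            cudaMemcpyDeviceToHost);
* int min_offset = h_out[0].key;   // in-segment offset of the minimum
* int min_value  = h_out[0].value; // the minimum itself
* @endcode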
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input items
* (of some type `T`) \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced aggregate
* (having value type `KeyValuePair<int, T>`) \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
* is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t
ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;
// The output tuple type
using OutputTupleT =
cub::detail::non_void_value_t<OutputIteratorT,
KeyValuePair<OffsetT, InputValueT>>;
// The output value type
using OutputValueT = typename OutputTupleT::Value;
using AccumT = OutputTupleT;
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT =
ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
ArgIndexInputIteratorT d_indexed_in(d_in);
// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
return DispatchSegmentedReduce<ArgIndexInputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
cub::ArgMin,
InitT,
AccumT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMin(),
initial_value,
stream);
}
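// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// ArgMin above.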
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return ArgMin<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
stream);
}
/**
* @brief Computes a device-wide segmented maximum using the greater-than
* (`>`) operator.
*
* @par
* - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
* reduction for each segment.
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where
* the latter is specified as `segment_offsets + 1`).
* - Does not support `>` operators that are non-commutative.
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates the max-reduction of a device vector
* of `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* int *d_out; // e.g., [-, -, -]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::Max(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run max-reduction
* cub::DeviceSegmentedReduce::Max(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // d_out <-- [8, INT_MIN, 9]
* @endcode
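*
* @par
* A sketch of launching the same reduction on a non-default stream using
* standard CUDA stream APIs (error checking omitted for brevity):
* @code
* cudaStream_t stream;
* cudaStreamCreate(&stream);
* cub::DeviceSegmentedReduce::Max(
*   d_temp_storage, temp_storage_bytes, d_in, d_out,
*   num_segments, d_offsets, d_offsets + 1, stream);
* cudaStreamSynchronize(stream);
* cudaStreamDestroy(stream);
* @endcode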
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input
* items \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
* is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t
Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
return DispatchSegmentedReduce<
InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
cub::Max>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Max(),
Traits<InputT>::Lowest(), // replace with
// std::numeric_limits<T>::lowest()
// when C++11 support is
// more prevalent
stream);
}
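// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// Max above.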
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return Max<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
stream);
}
/**
* @brief Finds the first device-wide maximum in each segment using the
* greater-than (`>`) operator, also returning the in-segment index of
* that item.
*
* @par
* - The output value type of `d_out` is `cub::KeyValuePair<int, T>`
* (assuming the value type of `d_in` is `T`)
* - The maximum of the *i*<sup>th</sup> segment is written to
* `d_out[i].value` and its offset in that segment is written to
* `d_out[i].key`.
* - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
* zero-length inputs
* - When the input is a contiguous sequence of segments, a single sequence
* `segment_offsets` (of length `num_segments + 1`) can be aliased
* for both the `d_begin_offsets` and `d_end_offsets` parameters (where
* the latter is specified as `segment_offsets + 1`).
* - Does not support `>` operators that are non-commutative.
* - Let `s` be in `[0, num_segments)`. The range
* `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
* overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor
* `[d_end_offsets, d_end_offsets + num_segments)`.
* - @devicestorage
*
* @par Snippet
* The code snippet below illustrates the argmax-reduction of a device vector
* of `int` data elements.
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/device/device_segmented_reduce.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers
* // for input and output
* int num_segments; // e.g., 3
* int *d_offsets; // e.g., [0, 3, 3, 7]
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
* KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSegmentedReduce::ArgMax(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run argmax-reduction
* cub::DeviceSegmentedReduce::ArgMax(
* d_temp_storage, temp_storage_bytes, d_in, d_out,
* num_segments, d_offsets, d_offsets + 1);
*
* // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
* @endcode
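*
* @par
* A single temporary allocation can serve several reductions as long as it
* is at least as large as each call requires. A sketch sizing one buffer
* for both an argmax and a sum pass; `d_sums` is a hypothetical `int*`
* output:
* @code
* size_t argmax_bytes = 0;
* size_t sum_bytes    = 0;
* cub::DeviceSegmentedReduce::ArgMax(
*   NULL, argmax_bytes, d_in, d_out,
*   num_segments, d_offsets, d_offsets + 1);
* cub::DeviceSegmentedReduce::Sum(
*   NULL, sum_bytes, d_in, d_sums,   // d_sums: hypothetical output array
*   num_segments, d_offsets, d_offsets + 1);
* cudaMalloc(&d_temp_storage,
*            argmax_bytes > sum_bytes ? argmax_bytes : sum_bytes);
* @endcode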
*
* @tparam InputIteratorT
* **[inferred]** Random-access input iterator type for reading input items
* (of some type `T`) \iterator
*
* @tparam OutputIteratorT
* **[inferred]** Output iterator type for recording the reduced aggregate
* (having value type `KeyValuePair<int, T>`) \iterator
*
* @tparam BeginOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* beginning offsets \iterator
*
* @tparam EndOffsetIteratorT
* **[inferred]** Random-access input iterator type for reading segment
* ending offsets \iterator
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
* is done.
*
* @param[in,out] temp_storage_bytes
* Reference to size in bytes of `d_temp_storage` allocation
*
* @param[in] d_in
* Pointer to the input sequence of data items
*
* @param[out] d_out
* Pointer to the output aggregate
*
* @param[in] num_segments
* The number of segments that comprise the input data
*
* @param[in] d_begin_offsets
* Random-access input iterator to the sequence of beginning offsets of
* length `num_segments`, such that `d_begin_offsets[i]` is the first
* element of the *i*<sup>th</sup> data segment in `d_in`
*
* @param[in] d_end_offsets
* Random-access input iterator to the sequence of ending offsets of length
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
* the *i*<sup>th</sup> data segment in `d_in`.
* If `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*<sup>th</sup>
* segment is considered empty.
*
* @param[in] stream
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t
ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
// The input type
using InputValueT = cub::detail::value_t<InputIteratorT>;
// The output tuple type
using OutputTupleT =
cub::detail::non_void_value_t<OutputIteratorT,
KeyValuePair<OffsetT, InputValueT>>;
using AccumT = OutputTupleT;
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
// The output value type
using OutputValueT = typename OutputTupleT::Value;
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT =
ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
ArgIndexInputIteratorT d_indexed_in(d_in);
// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
return DispatchSegmentedReduce<ArgIndexInputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
cub::ArgMax,
InitT,
AccumT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMax(),
initial_value,
stream);
}
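// Overload kept for backward compatibility: `debug_synchronous` is no
// longer supported and is ignored; the call forwards to the stream-only
// ArgMax above.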
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION static cudaError_t
ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
return ArgMax<InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT>(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
stream);
}
};
CUB_NAMESPACE_END