|
/****************************************************************************** |
|
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of the NVIDIA CORPORATION nor the |
|
* names of its contributors may be used to endorse or promote products |
|
* derived from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
******************************************************************************/ |
|
|
|
/** |
|
* @file |
|
* cub::DeviceSegmentedSort provides device-wide, parallel operations for |
|
* computing a batched sort across multiple, non-overlapping sequences of |
|
* data items residing within device-accessible memory. |
|
*/ |
|
|
|
#pragma once |
|
|
|
#include <cub/config.cuh> |
|
#include <cub/device/dispatch/dispatch_segmented_sort.cuh> |
|
#include <cub/util_deprecated.cuh> |
|
#include <cub/util_namespace.cuh> |
|
|
|
CUB_NAMESPACE_BEGIN |
|
|
|
|
|
/** |
|
* @brief DeviceSegmentedSort provides device-wide, parallel operations for |
|
* computing a batched sort across multiple, non-overlapping sequences of |
|
* data items residing within device-accessible memory. |
|
*  |
|
* @ingroup SegmentedModule |
|
* |
|
* @par Overview |
|
* The algorithm arranges items into ascending (or descending) order. |
|
* The underlying sorting algorithm is undefined. Depending on the segment size, |
|
* it might be radix sort, merge sort or something else. Therefore, no |
|
* assumptions on the underlying implementation should be made. |
|
* |
|
* @par Differences from DeviceSegmentedRadixSort |
|
* DeviceSegmentedRadixSort is optimized for significantly large segments (tens |
|
* of thousands of items and more). Nevertheless, some domains produce a wide |
|
* range of segment sizes. DeviceSegmentedSort partitions segments into size |
|
* groups and specialize sorting algorithms for each group. This approach leads |
|
* to better resource utilization in the presence of segment size imbalance or |
|
* moderate segment sizes (up to thousands of items). |
|
* This algorithm is more complex and consists of multiple kernels. This fact |
|
* leads to longer compilation times as well as larger binaries sizes. |
|
* |
|
* @par Supported Types |
|
* The algorithm has to satisfy the underlying algorithms restrictions. Radix |
|
* sort usage restricts the list of supported types. Therefore, |
|
* DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types |
|
* (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` and |
|
* `__nv_bfloat16` 16-bit floating-point types. |
|
* |
|
* @par Segments are not required to be contiguous. Any element of input(s) or |
|
* output(s) outside the specified segments will not be accessed nor modified. |
|
* |
|
* @par A simple example |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_values_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] |
|
* // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] |
|
* @endcode |
|
*/ |
|
struct DeviceSegmentedSort |
|
{ |
|
|
|
/*************************************************************************//** |
|
* @name Keys-only |
|
****************************************************************************/ |
|
//@{ |
|
|
|
/** |
|
* @brief Sorts segments of keys into ascending order. Approximately |
|
* `num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - SortKeys is not guaranteed to be stable. That is, suppose that @p i and |
|
* @p j are equivalent: neither one is less than the other. It is not |
|
* guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap |
|
* `[d_keys_in, d_keys_in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not |
|
* be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible |
|
* // pointers for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length `num_segments`, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = false; |
|
constexpr bool is_overwrite_okay = false; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
cub::NullType, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); |
|
DoubleBuffer<NullType> d_values; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into descending order. Approximately |
|
* `num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments + 1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - SortKeysDescending is not guaranteed to be stable. That is, suppose that |
|
* @p i and @p j are equivalent: neither one is less than the other. It is |
|
* not guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap |
|
* `[d_keys_in, d_keys_in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not |
|
* be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no |
|
* work is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i] - 1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = true; |
|
constexpr bool is_overwrite_okay = false; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
cub::NullType, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); |
|
DoubleBuffer<NullType> d_values; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortKeysDescending<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into ascending order. Approximately |
|
* `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers managed by a |
|
* DoubleBuffer structure that indicates which of the two buffers is |
|
* "current" (and thus contains the input data to be sorted). |
|
* - The contents of both buffers may be altered by the sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within the DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - SortKeys is not guaranteed to be stable. That is, suppose that |
|
* @p i and @p j are equivalent: neither one is less than the other. It is |
|
* not guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. |
|
* The range `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_keys[i].Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible |
|
* // pointers for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a DoubleBuffer to wrap the pair of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no |
|
* work is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` |
|
* and `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i] - 1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = false; |
|
constexpr bool is_overwrite_okay = true; |
|
|
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
cub::NullType, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<NullType> d_values; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into descending order. Approximately |
|
* `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers managed by a |
|
* DoubleBuffer structure that indicates which of the two buffers is |
|
* "current" (and thus contains the input data to be sorted). |
|
* - The contents of both buffers may be altered by the sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within the DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments + 1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets + 1`). |
|
* - SortKeysDescending is not guaranteed to be stable. That is, suppose that |
|
* @p i and @p j are equivalent: neither one is less than the other. It is |
|
* not guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. |
|
* The range `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_keys[i].Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers for |
|
* // sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a DoubleBuffer to wrap the pair of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i] - 1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1<= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = true; |
|
constexpr bool is_overwrite_okay = true; |
|
|
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
cub::NullType, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<NullType> d_values; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortKeysDescending<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into ascending order. Approximately |
|
* `num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortKeys is stable: it preserves the relative ordering of |
|
* equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap |
|
* `[d_keys_in, d_keys_in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not |
|
* be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into descending order. Approximately |
|
* `num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortKeysDescending is stable: it preserves the relative ordering of |
|
* equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap |
|
* `[d_keys_in, d_keys_in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not |
|
* be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortKeysDescending<KeyT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortKeysDescending<KeyT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into ascending order. Approximately |
|
* `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers managed by a |
|
* DoubleBuffer structure that indicates which of the two buffers is |
|
* "current" (and thus contains the input data to be sorted). |
|
* - The contents of both buffers may be altered by the sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within the DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortKeys is stable: it preserves the relative ordering of |
|
* equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. |
|
* The range `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_keys[i].Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a DoubleBuffer to wrap the pair of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortKeys( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i] - 1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeys(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortKeys<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of keys into descending order. Approximately |
|
* `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers managed by a |
|
* DoubleBuffer structure that indicates which of the two buffers is |
|
* "current" (and thus contains the input data to be sorted). |
|
* - The contents of both buffers may be altered by the sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within the DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortKeysDescending is stable: it preserves the relative ordering of |
|
* equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. |
|
* The range `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_keys[i].Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a DoubleBuffer to wrap the pair of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortKeysDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the |
|
* i-th segment is considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortKeysDescending<KeyT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortKeysDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortKeysDescending<KeyT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
//@} end member group |
|
/*************************************************************************//** |
|
* @name Key-value pairs |
|
****************************************************************************/ |
|
//@{ |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into ascending order. |
|
* Approximately `2*num_items + 2*num_segments` auxiliary storage |
|
* required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - SortPairs is not guaranteed to be stable. That is, suppose that @p i and |
|
* @p j are equivalent: neither one is less than the other. It is not |
|
* guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of |
|
* `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall |
|
* not overlap `[in, in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, |
|
* `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_values_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] |
|
* // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] d_values_in |
|
* Device-accessible pointer to the corresponding input sequence of |
|
* associated value items |
|
* |
|
* @param[out] d_values_out |
|
* Device-accessible pointer to the correspondingly-reordered output |
|
* sequence of associated value items |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = false; |
|
constexpr bool is_overwrite_okay = false; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
ValueT, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); |
|
DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out); |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortPairs<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into descending order. Approximately |
|
* `2*num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - SortPairs is not guaranteed to be stable. That is, suppose that @p i and |
|
* @p j are equivalent: neither one is less than the other. It is not |
|
* guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of |
|
* `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall |
|
* not overlap `[in, in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, |
|
* `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers for |
|
* // sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_values_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] |
|
* // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] d_values_in |
|
* Device-accessible pointer to the corresponding input sequence of |
|
* associated value items |
|
* |
|
* @param[out] d_values_out |
|
* Device-accessible pointer to the correspondingly-reordered output |
|
* sequence of associated value items |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = true; |
|
constexpr bool is_overwrite_okay = false; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
ValueT, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); |
|
DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out); |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into ascending order. |
|
* Approximately `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers and a corresponding |
|
* pair of associated value buffers. Each pair is managed by a DoubleBuffer |
|
* structure that indicates which of the two buffers is "current" (and thus |
|
* contains the input data to be sorted). |
|
* - The contents of both buffers within each pair may be altered by the sorting |
|
* operation. |
|
* - Upon completion, the sorting operation will update the "current" indicator |
|
* within each DoubleBuffer wrapper to reference which of the two buffers |
|
* now contains the sorted output sequence (a function of the number of key bits |
|
* specified and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - SortPairs is not guaranteed to be stable. That is, suppose that @p i and |
|
* @p j are equivalent: neither one is less than the other. It is not |
|
* guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` |
|
* be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range |
|
* `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_values.Current()[i]`, `d_keys.Alternate()[i]`, |
|
* `d_values.Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a set of DoubleBuffers to wrap pairs of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortPairs( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] |
|
* // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] |
|
* |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in,out] d_values |
|
* Double-buffer of values whose "current" device-accessible buffer contains |
|
* the unsorted input values and, upon return, is updated to point to the |
|
* sorted output values |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = false; |
|
constexpr bool is_overwrite_okay = true; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
ValueT, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortPairs<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into descending order. |
|
* Approximately `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers and a corresponding |
|
* pair of associated value buffers. Each pair is managed by a DoubleBuffer |
|
* structure that indicates which of the two buffers is "current" (and thus |
|
* contains the input data to be sorted). |
|
* - The contents of both buffers within each pair may be altered by the |
|
* sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within each DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits specified and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as <tt>segment_offsets+1</tt>). |
|
* - SortPairsDescending is not guaranteed to be stable. That is, suppose that |
|
* @p i and @p j are equivalent: neither one is less than the other. It is |
|
* not guaranteed that the relative order of these two elements will be |
|
* preserved by sort. |
|
* - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` |
|
* be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range |
|
* `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_values.Current()[i]`, `d_keys.Alternate()[i]`, |
|
* `d_values.Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers for |
|
* // sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a set of DoubleBuffers to wrap pairs of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::SortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::SortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] |
|
* // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] |
|
* |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in,out] d_values |
|
* Double-buffer of values whose "current" device-accessible buffer contains |
|
* the unsorted input values and, upon return, is updated to point to the |
|
* sorted output values |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
constexpr bool is_descending = true; |
|
constexpr bool is_overwrite_okay = true; |
|
using DispatchT = DispatchSegmentedSort<is_descending, |
|
KeyT, |
|
ValueT, |
|
int, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>; |
|
|
|
return DispatchT::Dispatch(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
is_overwrite_okay, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
SortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return SortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into ascending order. Approximately |
|
* `2*num_items + 2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortPairs is stable: it preserves the relative ordering of |
|
* equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of |
|
* `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall |
|
* not overlap `[in, in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, |
|
* `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_values_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortPairs( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] |
|
* // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When nullptr, the |
|
* required allocation size is written to @p temp_storage_bytes and no work is done. |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] d_values_in |
|
* Device-accessible pointer to the corresponding input sequence of |
|
* associated value items |
|
* |
|
* @param[out] d_values_out |
|
* Device-accessible pointer to the correspondingly-reordered output |
|
* sequence of associated value items |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortPairs<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortPairs<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into descending order. |
|
* Approximately `2*num_items + 2*num_segments` auxiliary |
|
* storage required. |
|
* |
|
* @par |
|
* - The contents of the input data are not altered by the sorting operation. |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortPairsDescending is stable: it preserves the relative ordering |
|
* of equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of |
|
* `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall |
|
* not overlap `[in, in + num_items)`, |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, |
|
* `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_keys_out; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_values_out; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, |
|
* d_keys_in, d_keys_out, d_values_in, d_values_out, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] |
|
* // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in] d_keys_in |
|
* Device-accessible pointer to the input data of key data to sort |
|
* |
|
* @param[out] d_keys_out |
|
* Device-accessible pointer to the sorted output sequence of key data |
|
* |
|
* @param[in] d_values_in |
|
* Device-accessible pointer to the corresponding input sequence of |
|
* associated value items |
|
* |
|
* @param[out] d_values_out |
|
* Device-accessible pointer to the correspondingly-reordered output |
|
* sequence of associated value items |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
const KeyT *d_keys_in, |
|
KeyT *d_keys_out, |
|
const ValueT *d_values_in, |
|
ValueT *d_values_out, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys_in, |
|
d_keys_out, |
|
d_values_in, |
|
d_values_out, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into ascending order. |
|
* Approximately `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers and a corresponding |
|
* pair of associated value buffers. Each pair is managed by a DoubleBuffer |
|
* structure that indicates which of the two buffers is "current" (and thus |
|
* contains the input data to be sorted). |
|
* - The contents of both buffers within each pair may be altered by the |
|
* sorting operation. |
|
* - Upon completion, the sorting operation will update the "current" |
|
* indicator within each DoubleBuffer wrapper to reference which of the two |
|
* buffers now contains the sorted output sequence (a function of the number |
|
* of key bits specified and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortPairs is stable: it preserves the relative ordering |
|
* of equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` |
|
* be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range |
|
* `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_values.Current()[i]`, `d_keys.Alternate()[i]`, |
|
* `d_values.Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a set of DoubleBuffers to wrap pairs of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortPairs( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortPairs( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] |
|
* // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] |
|
* |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in,out] d_values |
|
* Double-buffer of values whose "current" device-accessible buffer contains |
|
* the unsorted input values and, upon return, is updated to point to the |
|
* sorted output values |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortPairs<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairs(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortPairs<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( |
|
d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
/** |
|
* @brief Sorts segments of key-value pairs into descending order. |
|
* Approximately `2*num_segments` auxiliary storage required. |
|
* |
|
* @par |
|
* - The sorting operation is given a pair of key buffers and a corresponding |
|
* pair of associated value buffers. Each pair is managed by a DoubleBuffer |
|
* structure that indicates which of the two buffers is "current" (and thus |
|
* contains the input data to be sorted). |
|
* - The contents of both buffers within each pair may be altered by the sorting |
|
* operation. |
|
* - Upon completion, the sorting operation will update the "current" indicator |
|
* within each DoubleBuffer wrapper to reference which of the two buffers |
|
* now contains the sorted output sequence (a function of the number of key bits |
|
* specified and the targeted device architecture). |
|
* - When the input is a contiguous sequence of segments, a single sequence |
|
* @p segment_offsets (of length `num_segments+1`) can be aliased |
|
* for both the @p d_begin_offsets and @p d_end_offsets parameters (where |
|
* the latter is specified as `segment_offsets+1`). |
|
* - StableSortPairsDescending is stable: it preserves the relative ordering |
|
* of equivalent elements. That is, if @p x and @p y are elements such that |
|
* @p x precedes @p y, and if the two elements are equivalent (neither |
|
* @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that |
|
* @p x still precedes @p y. |
|
* - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` |
|
* be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range |
|
* `[cur, cur + num_items)` shall not overlap |
|
* `[alt, alt + num_items)`. Both ranges shall not overlap |
|
* `[d_begin_offsets, d_begin_offsets + num_segments)` nor |
|
* `[d_end_offsets, d_end_offsets + num_segments)` in any way. |
|
* - Segments are not required to be contiguous. For all index values `i` |
|
* outside the specified segments `d_keys.Current()[i]`, |
|
* `d_values.Current()[i]`, `d_keys.Alternate()[i]`, |
|
* `d_values.Alternate()[i]` will not be accessed nor modified. |
|
* |
|
* @par Snippet |
|
* The code snippet below illustrates the batched sorting of three segments |
|
* (with one zero-length segment) of @p int keys with associated vector of |
|
* @p int values. |
|
* |
|
* @par |
|
* @code |
|
* #include <cub/cub.cuh> |
|
* // or equivalently <cub/device/device_segmented_sort.cuh> |
|
* |
|
* // Declare, allocate, and initialize device-accessible pointers |
|
* // for sorting data |
|
* int num_items; // e.g., 7 |
|
* int num_segments; // e.g., 3 |
|
* int *d_offsets; // e.g., [0, 3, 3, 7] |
|
* int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] |
|
* int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] |
|
* int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] |
|
* ... |
|
* |
|
* // Create a set of DoubleBuffers to wrap pairs of device pointers |
|
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); |
|
* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); |
|
* |
|
* // Determine temporary device storage requirements |
|
* void *d_temp_storage = NULL; |
|
* size_t temp_storage_bytes = 0; |
|
* cub::DeviceSegmentedSort::StableSortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // Allocate temporary storage |
|
* cudaMalloc(&d_temp_storage, temp_storage_bytes); |
|
* |
|
* // Run sorting operation |
|
* cub::DeviceSegmentedSort::StableSortPairsDescending( |
|
* d_temp_storage, temp_storage_bytes, d_keys, d_values, |
|
* num_items, num_segments, d_offsets, d_offsets + 1); |
|
* |
|
* // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] |
|
* // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] |
|
* @endcode |
|
* |
|
* @tparam KeyT |
|
* <b>[inferred]</b> Key type |
|
* |
|
* @tparam ValueT |
|
* <b>[inferred]</b> Value type |
|
* |
|
* @tparam BeginOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* beginning offsets \iterator |
|
* |
|
* @tparam EndOffsetIteratorT |
|
* <b>[inferred]</b> Random-access input iterator type for reading segment |
|
* ending offsets \iterator |
|
* |
|
* @param[in] d_temp_storage |
|
* Device-accessible allocation of temporary storage. When `nullptr`, the |
|
* required allocation size is written to @p temp_storage_bytes and no work |
|
* is done |
|
* |
|
* @param[in,out] temp_storage_bytes |
|
* Reference to size in bytes of @p d_temp_storage allocation |
|
* |
|
* @param[in,out] d_keys |
|
* Reference to the double-buffer of keys whose "current" device-accessible |
|
* buffer contains the unsorted input keys and, upon return, is updated to |
|
* point to the sorted output keys |
|
* |
|
* @param[in,out] d_values |
|
* Double-buffer of values whose "current" device-accessible buffer contains |
|
* the unsorted input values and, upon return, is updated to point to the |
|
* sorted output values |
|
* |
|
* @param[in] num_items |
|
* The total number of items to sort (across all segments) |
|
* |
|
* @param[in] num_segments |
|
* The number of segments that comprise the sorting data |
|
* |
|
* @param[in] d_begin_offsets |
|
* Random-access input iterator to the sequence of beginning offsets of |
|
* length @p num_segments, such that `d_begin_offsets[i]` is the first |
|
* element of the <em>i</em><sup>th</sup> data segment in `d_keys_*` and |
|
* `d_values_*` |
|
* |
|
* @param[in] d_end_offsets |
|
* Random-access input iterator to the sequence of ending offsets of length |
|
* @p num_segments, such that `d_end_offsets[i]-1` is the last element of |
|
* the <em>i</em><sup>th</sup> data segment in `d_keys_*` and `d_values_*`. |
|
* If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is |
|
* considered empty. |
|
* |
|
* @param[in] stream |
|
* <b>[optional]</b> CUDA stream to launch kernels within. Default is |
|
* stream<sub>0</sub>. |
|
*/ |
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream = 0) |
|
{ |
|
return SortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
template <typename KeyT, |
|
typename ValueT, |
|
typename BeginOffsetIteratorT, |
|
typename EndOffsetIteratorT> |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
|
CUB_RUNTIME_FUNCTION static cudaError_t |
|
StableSortPairsDescending(void *d_temp_storage, |
|
std::size_t &temp_storage_bytes, |
|
DoubleBuffer<KeyT> &d_keys, |
|
DoubleBuffer<ValueT> &d_values, |
|
int num_items, |
|
int num_segments, |
|
BeginOffsetIteratorT d_begin_offsets, |
|
EndOffsetIteratorT d_end_offsets, |
|
cudaStream_t stream, |
|
bool debug_synchronous) |
|
{ |
|
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG |
|
|
|
return StableSortPairsDescending<KeyT, |
|
ValueT, |
|
BeginOffsetIteratorT, |
|
EndOffsetIteratorT>(d_temp_storage, |
|
temp_storage_bytes, |
|
d_keys, |
|
d_values, |
|
num_items, |
|
num_segments, |
|
d_begin_offsets, |
|
d_end_offsets, |
|
stream); |
|
} |
|
|
|
//@} end member group |
|
|
|
}; |
|
|
|
|
|
CUB_NAMESPACE_END |
|
|