| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /** |
| * \file |
| * Thread utilities for reading memory using PTX cache modifiers. |
| */ |
| |
| #pragma once |
| |
| #include <iterator> |
| |
| #include "../config.cuh" |
| #include "../util_ptx.cuh" |
| #include "../util_type.cuh" |
| |
| /// Optional outer namespace(s) |
| CUB_NS_PREFIX |
| |
| /// CUB namespace |
| namespace cub { |
| |
| /** |
| * \addtogroup UtilIo |
| * @{ |
| */ |
| |
| //----------------------------------------------------------------------------- |
| // Tags and constants |
| //----------------------------------------------------------------------------- |
| |
| /** |
| * \brief Enumeration of cache modifiers for memory load operations. |
| */ |
| enum CacheLoadModifier |
| { |
| LOAD_DEFAULT, ///< Default (no modifier) |
| LOAD_CA, ///< Cache at all levels |
| LOAD_CG, ///< Cache at global level |
| LOAD_CS, ///< Cache streaming (likely to be accessed once) |
| LOAD_CV, ///< Cache as volatile (including cached system lines) |
| LOAD_LDG, ///< Cache as texture |
| LOAD_VOLATILE, ///< Volatile (any memory space) |
| }; |
| |
| |
| /** |
| * \name Thread I/O (cache modified) |
| * @{ |
| */ |
| |
| /** |
| * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. |
| * |
| * \par Example |
| * \code |
| * #include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh> |
| * |
| * // 32-bit load using cache-global modifier: |
| * int *d_in; |
| * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x); |
| * |
| * // 16-bit load using default modifier |
| * short *d_in; |
| * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x); |
| * |
| * // 256-bit load using cache-volatile modifier |
| * double4 *d_in; |
| * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x); |
| * |
| * // 96-bit load using cache-streaming modifier |
| * struct TestFoo { bool a; short b; }; |
| * TestFoo *d_struct; |
| * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x); |
| * \endcode |
| * |
| * \tparam MODIFIER <b>[inferred]</b> CacheLoadModifier enumeration |
| * \tparam InputIteratorT <b>[inferred]</b> Input iterator type \iterator |
| */ |
| template < |
| CacheLoadModifier MODIFIER, |
| typename InputIteratorT> |
| __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr); |
| |
| |
| //@} end member group |
| |
| |
| #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document |
|
|
|
|
| /// Helper structure for templated load iteration (inductive case) |
| template <int COUNT, int MAX> |
| struct IterateThreadLoad |
| { |
| template <CacheLoadModifier MODIFIER, typename T> |
| static __device__ __forceinline__ void Load(T const *ptr, T *vals) |
| { |
| vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT); |
| IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals); |
| } |
| |
| template <typename InputIteratorT, typename T> |
| static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) |
| { |
| vals[COUNT] = itr[COUNT]; |
| IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals); |
| } |
| }; |
| |
|
|
| /// Helper structure for templated load iteration (termination case) |
| template <int MAX> |
| struct IterateThreadLoad<MAX, MAX> |
| { |
| template <CacheLoadModifier MODIFIER, typename T> |
| static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} |
| |
| template <typename InputIteratorT, typename T> |
| static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} |
| }; |
| |
|
|
| /** |
| * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ |
| template<> \ |
| __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \ |
| { \ |
| uint4 retval; \ |
| asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ |
| "=r"(retval.x), \ |
| "=r"(retval.y), \ |
| "=r"(retval.z), \ |
| "=r"(retval.w) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } \ |
| template<> \ |
| __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \ |
| { \ |
| ulonglong2 retval; \ |
| asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ |
| "=l"(retval.x), \ |
| "=l"(retval.y) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } |
| |
| /** |
| * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ |
| template<> \ |
| __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr) \ |
| { \ |
| ushort4 retval; \ |
| asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ |
| "=h"(retval.x), \ |
| "=h"(retval.y), \ |
| "=h"(retval.z), \ |
| "=h"(retval.w) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } \ |
| template<> \ |
| __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr) \ |
| { \ |
| uint2 retval; \ |
| asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ |
| "=r"(retval.x), \ |
| "=r"(retval.y) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } \ |
| template<> \ |
| __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr) \ |
| { \ |
| unsigned long long retval; \ |
| asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ |
| "=l"(retval) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } |
| |
| /** |
| * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ |
| template<> \ |
| __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \ |
| { \ |
| unsigned int retval; \ |
| asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ |
| "=r"(retval) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } |
| |
|
|
| /** |
| * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ |
| template<> \ |
| __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \ |
| { \ |
| unsigned short retval; \ |
| asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ |
| "=h"(retval) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return retval; \ |
| } |
| |
|
|
| /** |
| * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ |
| template<> \ |
| __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \ |
| { \ |
| unsigned short retval; \ |
| asm volatile ( \ |
| "{" \ |
| " .reg .u8 datum;" \ |
| " ld."#ptx_modifier".u8 datum, [%1];" \ |
| " cvt.u16.u8 %0, datum;" \ |
| "}" : \ |
| "=h"(retval) : \ |
| _CUB_ASM_PTR_(ptr)); \ |
| return (unsigned char) retval; \ |
| } |
| |
|
|
| /** |
| * Define powers-of-two ThreadLoad specializations for the given Cache load modifier |
| */ |
| #define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ |
| _CUB_LOAD_16(cub_modifier, ptx_modifier) \ |
| _CUB_LOAD_8(cub_modifier, ptx_modifier) \ |
| _CUB_LOAD_4(cub_modifier, ptx_modifier) \ |
| _CUB_LOAD_2(cub_modifier, ptx_modifier) \ |
| _CUB_LOAD_1(cub_modifier, ptx_modifier) \ |
| |
| |
| /** |
| * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers |
| */ |
| #if CUB_PTX_ARCH >= 200 |
| _CUB_LOAD_ALL(LOAD_CA, ca) |
| _CUB_LOAD_ALL(LOAD_CG, cg) |
| _CUB_LOAD_ALL(LOAD_CS, cs) |
| _CUB_LOAD_ALL(LOAD_CV, cv) |
| #else |
| _CUB_LOAD_ALL(LOAD_CA, global) |
| // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 |
| _CUB_LOAD_ALL(LOAD_CG, volatile.global) |
| _CUB_LOAD_ALL(LOAD_CS, global) |
| _CUB_LOAD_ALL(LOAD_CV, volatile.global) |
| #endif |
| |
| #if CUB_PTX_ARCH >= 350 |
| _CUB_LOAD_ALL(LOAD_LDG, global.nc) |
| #else |
| _CUB_LOAD_ALL(LOAD_LDG, global) |
| #endif |
| |
| |
| // Macro cleanup |
| #undef _CUB_LOAD_ALL |
| #undef _CUB_LOAD_1 |
| #undef _CUB_LOAD_2 |
| #undef _CUB_LOAD_4 |
| #undef _CUB_LOAD_8 |
| #undef _CUB_LOAD_16 |
| |
| |
| |
| /** |
| * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types |
| */ |
| template <typename InputIteratorT> |
| __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad( |
| InputIteratorT itr, |
| Int2Type<LOAD_DEFAULT> /*modifier*/, |
| Int2Type<false> /*is_pointer*/) |
| { |
| return *itr; |
| } |
|
|
|
|
| /** |
| * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types |
| */ |
| template <typename T> |
| __device__ __forceinline__ T ThreadLoad( |
| T *ptr, |
| Int2Type<LOAD_DEFAULT> /*modifier*/, |
| Int2Type<true> /*is_pointer*/) |
| { |
| return *ptr; |
| } |
|
|
|
|
| /** |
| * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types |
| */ |
| template <typename T> |
| __device__ __forceinline__ T ThreadLoadVolatilePointer( |
| T *ptr, |
| Int2Type<true> /*is_primitive*/) |
| { |
| T retval = *reinterpret_cast<volatile T*>(ptr); |
| return retval; |
| } |
| |
| |
| /** |
| * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types |
| */ |
| template <typename T> |
| __device__ __forceinline__ T ThreadLoadVolatilePointer( |
| T *ptr, |
| Int2Type<false> /*is_primitive*/) |
| { |
| typedef typename UnitWord<T>::VolatileWord VolatileWord; // Word type for memcopying |
| |
| const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); |
| |
| T retval; |
| VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval); |
| IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( |
| reinterpret_cast<volatile VolatileWord*>(ptr), |
| words); |
| return retval; |
| } |
| |
|
|
| /** |
| * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types |
| */ |
| template <typename T> |
| __device__ __forceinline__ T ThreadLoad( |
| T *ptr, |
| Int2Type<LOAD_VOLATILE> /*modifier*/, |
| Int2Type<true> /*is_pointer*/) |
| { |
| // Apply tags for partial-specialization |
| return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>()); |
| } |
| |
| |
| /** |
| * ThreadLoad definition for generic modifiers on pointer types |
| */ |
| template <typename T, int MODIFIER> |
| __device__ __forceinline__ T ThreadLoad( |
| T const *ptr, |
| Int2Type<MODIFIER> /*modifier*/, |
| Int2Type<true> /*is_pointer*/) |
| { |
| typedef typename UnitWord<T>::DeviceWord DeviceWord; |
| |
| const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); |
| |
| DeviceWord words[DEVICE_MULTIPLE]; |
| |
| IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>( |
| reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)), |
| words); |
| |
| return *reinterpret_cast<T*>(words); |
| } |
| |
|
|
| /** |
| * ThreadLoad definition for generic modifiers |
| */ |
| template < |
| CacheLoadModifier MODIFIER, |
| typename InputIteratorT> |
| __device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr) |
| { |
| // Apply tags for partial-specialization |
| return ThreadLoad( |
| itr, |
| Int2Type<MODIFIER>(), |
| Int2Type<IsPointer<InputIteratorT>::VALUE>()); |
| } |
| |
| |
| |
| #endif // DOXYGEN_SHOULD_SKIP_THIS |
| |
| |
| /** @} */ // end group UtilIo |
| |
| |
| } // CUB namespace |
| CUB_NS_POSTFIX // Optional outer namespace(s) |
| |