#pragma once

/*
 * General settings and functions
 */

const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;

// Pick the smallest power-of-two block size (>= 32) that covers nElem,
// capped at MAX_BLOCK_SIZE.
static int getNumThreads(int nElem) {
  int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
  for (int i = 0; i < 6; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return MAX_BLOCK_SIZE;
}

/*
 * Reduction utilities
 */

template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask,
                                           int width = warpSize,
                                           unsigned int mask = 0xffffffff) {
#if CUDART_VERSION >= 9000
  // CUDA 9+ deprecates the unsynchronized shuffles in favor of the *_sync
  // variants, which take an explicit lane-participation mask.
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

// Index of the most significant set bit, e.g. getMSB(32) == 5.
__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }

template <typename T>
struct Pair {
  T v1, v2;
  __device__ Pair() {}
  __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
  __device__ Pair(T v) : v1(v), v2(v) {}
  __device__ Pair(int v) : v1(v), v2(v) {}
  __device__ Pair &operator+=(const Pair &a) {
    v1 += a.v1;
    v2 += a.v2;
    return *this;
  }
};

// Butterfly (XOR) reduction: after log2(WARP_SIZE) shuffle steps, every
// lane in the warp holds the sum of all lanes' values.
template <typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
  }
#else
  // Pre-Kepler fallback: stage each lane's value in shared memory, then
  // have every lane accumulate the other lanes of its own warp.
  __shared__ T values[MAX_BLOCK_SIZE];
  values[threadIdx.x] = val;
  __threadfence_block();
  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
  for (int i = 1; i < WARP_SIZE; i++) {
    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
  }
#endif
  return val;
}

template <typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
  value.v1 = warpSum(value.v1);
  value.v2 = warpSum(value.v2);
  return value;
}
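
/*
 * Example usage (an illustrative sketch, not part of the original header):
 * a kernel that sums n floats into *out. The kernel name, the grid-stride
 * loop, and the atomicAdd aggregation across warps are assumptions made
 * for demonstration; only warpSum(), getNumThreads(), and WARP_SIZE come
 * from this file.
 */
__global__ void sumKernel(const float *in, float *out, int n) {
  float val = 0.f;
  // Grid-stride loop so any launch configuration covers all n elements.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    val += in[i];
  }
  val = warpSum(val);  // every lane now holds its warp's partial sum
  if ((threadIdx.x & (WARP_SIZE - 1)) == 0) {
    atomicAdd(out, val);  // one atomic per warp contributes to the total
  }
}

// Host-side launch sketch: getNumThreads() picks the block size, with
// *out zero-initialized beforehand (d_in, d_out, numBlocks assumed).
//   sumKernel<<<numBlocks, getNumThreads(n)>>>(d_in, d_out, n);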