|
#pragma once |
|
|
|
#include <ATen/detail/FunctionTraits.h> |
|
#include <ATen/native/TensorIterator.h> |
|
#include <ATen/native/TensorIteratorDynamicCasting.h> |
|
#include <ATen/cuda/detail/OffsetCalculator.cuh> |
|
#include <ATen/OpMathType.h> |
|
#include <ATen/native/cuda/thread_constants.h> |
|
|
|
#include <thrust/tuple.h> |
|
|
|
#include <ATen/native/cuda/MemoryAccess.cuh> |
|
|
|
|
|
namespace at { namespace native { |
|
|
|
// Builds an OffsetCalculator over the input operands of `iter`; input i is operand
// i + iter.noutputs().
template<int N>
|
static OffsetCalculator<N> make_input_offset_calculator(const TensorIteratorBase& iter) { |
|
|
|
// Use at least one element so the arrays below stay valid even when N == 0.
constexpr int array_size = std::max<int>(N, 1);
|
TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); |
|
std::array<const int64_t*, array_size> strides; |
|
int64_t element_sizes[array_size]; |
|
for (int i = 0; i < N; i++) { |
|
strides[i] = iter.strides(i + iter.noutputs()).data(); |
|
element_sizes[i] = iter.element_size(i + iter.noutputs()); |
|
} |
|
return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); |
|
} |
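
// Example (illustrative sketch; assumes a binary TensorIterator `iter` with two
// inputs, and that `linear_idx` is a flattened element index):
//
//   auto input_calc = make_input_offset_calculator<2>(iter);
//   auto offsets = input_calc.get(linear_idx);  // byte offsets of element
//                                               // linear_idx in each input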
|
|
|
// Builds an OffsetCalculator over the output operands of `iter` (operands 0..num_outputs-1).
template <int num_outputs = 1>
|
static OffsetCalculator<num_outputs> make_output_offset_calculator(const TensorIteratorBase& iter) { |
|
TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs()); |
|
std::array<const int64_t*, num_outputs> strides; |
|
int64_t element_sizes[num_outputs]; |
|
for (int i = 0; i < num_outputs; i++) { |
|
strides[i] = iter.strides(i).data(); |
|
element_sizes[i] = iter.element_size(i); |
|
} |
|
return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); |
|
} |
|
|
|
// Shared body of the elementwise kernels: load this block's arguments through
// `policy`, apply `f` to every in-bounds element, and store the results back
// through `policy`.
template<typename func_t, typename policy_t>
|
__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { |
|
using traits = function_traits<func_t>; |
|
using return_t = typename traits::result_type; |
|
using args_t = typename traits::ArgsTuple; |
|
|
|
int idx = blockIdx.x;  // each block handles block_work_size() elements
|
|
|
return_t results[thread_work_size()]; |
|
args_t args[thread_work_size()]; |
|
|
|
|
|
// load
policy.load(args, idx);
|
|
|
|
|
// compute
#pragma unroll
|
for (int i = 0; i < thread_work_size(); i++) { |
|
if (policy.check_inbounds(i)) { |
|
results[i] = c10::guts::apply(f, args[i]); |
|
} |
|
} |
|
|
|
|
|
// store
policy.store(results, idx);
|
} |
|
|
|
}}  // namespace at::native

// The elementwise loop/launch implementation differs between CUDA and ROCm builds;
// include the matching backend header.
#if !defined(USE_ROCM) |
|
#include <ATen/native/cuda/CUDALoops.cuh> |
|
#else |
|
#include <ATen/native/cuda/ROCmLoops.cuh> |
|
#endif |
|
|
|
namespace at { namespace native { |
|
|
|
// Launches an elementwise kernel that applies `f` to the inputs of `iter` and writes
// the result to its output.  Falls back to 32-bit-indexable sub-iterators when the
// full iterator cannot use 32-bit indexing.
template <typename func_t>
|
void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { |
|
|
|
for (int arg = 0; arg < iter.ntensors(); arg++) { |
|
TORCH_INTERNAL_ASSERT( |
|
iter.device(arg).is_cuda(), |
|
"argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); |
|
} |
|
|
|
if (iter.numel() == 0) { |
|
return; |
|
} |
|
|
|
if (!iter.can_use_32bit_indexing()) { |
|
for (auto& sub_iter : iter.with_32bit_indexing()) { |
|
gpu_kernel(sub_iter, f); |
|
} |
|
return; |
|
} |
|
|
|
gpu_kernel_impl(iter, f); |
|
} |
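
// Example (illustrative sketch; assumes `iter` is a TensorIterator whose output and
// single input are float CUDA tensors, and that GPU_LAMBDA expands to the usual
// __host__ __device__ annotation).  A reciprocal kernel would be launched as:
//
//   gpu_kernel(iter, [] GPU_LAMBDA (float a) -> float {
//     return 1.0f / a;
//   });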
|
|
|
// Binds a fixed first argument `a` (read at the functor's argument precision) to a
// binary functor, yielding a unary functor over the second argument.  Used when
// input 1 is a CPU scalar.
template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
|
struct AUnaryFunctor { |
|
using traits = function_traits<func_t>; |
|
using opmath_arg1_t = typename traits::template arg<0>::type; |
|
__device__ return_t operator()(arg2_t b) const { |
|
return f(a, b); |
|
} |
|
|
|
AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {} |
|
private: |
|
func_t f; |
|
opmath_arg1_t a; |
|
}; |
|
|
|
// Binds a fixed second argument `b` (read at the functor's argument precision) to a
// binary functor, yielding a unary functor over the first argument.  Used when
// input 2 is a CPU scalar.
template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
|
struct BUnaryFunctor { |
|
using traits = function_traits<func_t>; |
|
using opmath_arg2_t = typename traits::template arg<1>::type; |
|
__device__ return_t operator()(arg1_t a) const { |
|
return f(a, b); |
|
} |
|
|
|
BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {} |
|
private: |
|
func_t f; |
|
opmath_arg2_t b; |
|
}; |
|
|
|
|
|
|
|
// Thin wrapper that forwards both arguments to `f`; used when neither input is a
// CPU scalar, so all three cases pass a functor object through the same launch path.
template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>
|
struct BinaryFunctor { |
|
__device__ return_t operator()(arg1_t a, arg2_t b) const { |
|
return f(a, b); |
|
} |
|
BinaryFunctor(func_t f_): f(f_) {} |
|
private: |
|
func_t f; |
|
};

// Like gpu_kernel_with_scalars, but the tensor element types (arg1_t, arg2_t,
// return_t) are passed explicitly and may differ from the functor's argument types
// (typically a higher-precision opmath type).  CPU scalar inputs are read at the
// functor's argument precision.
template <typename arg1_t, typename arg2_t = arg1_t, typename return_t = arg1_t, typename func_t>
|
void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { |
|
TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); |
|
|
|
using traits = function_traits<func_t>; |
|
using opmath_arg1_t = typename traits::template arg<0>::type; |
|
using opmath_arg2_t = typename traits::template arg<1>::type; |
|
static_assert( |
|
traits::arity == 2, |
|
"gpu_kernel_with_scalars only supports two input arguments"); |
|
|
|
if (iter.is_cpu_scalar(1)) { |
|
AUnaryFunctor<arg1_t, arg2_t, return_t, func_t> af(f, iter.scalar_value<opmath_arg1_t>(1)); |
|
iter.remove_operand(1);

// Operand 1 is now the remaining GPU input; guard onto its device before launching.
const OptionalDeviceGuard device_guard(iter.device(1));
|
gpu_kernel(iter, af); |
|
} else if (iter.is_cpu_scalar(2)) { |
|
BUnaryFunctor<arg1_t, arg2_t, return_t, func_t> bf(f, iter.scalar_value<opmath_arg2_t>(2)); |
|
iter.remove_operand(2); |
|
gpu_kernel(iter, bf); |
|
} else { |
|
gpu_kernel(iter, BinaryFunctor<arg1_t, arg2_t, return_t, func_t>(f)); |
|
} |
|
} |
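
// Example (illustrative sketch; assumes `iter` holds at::Half tensors while the math
// is carried out in float, and that GPU_LAMBDA expands to the usual
// __host__ __device__ annotation):
//
//   opmath_gpu_kernel_with_scalars<at::Half>(iter, [] GPU_LAMBDA (float a, float b) -> float {
//     return a + b;
//   });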
|
|
|
// Like opmath_gpu_kernel_with_scalars, but for symmetric binary functors
// (requires f(a, b) == f(b, a)): a CPU scalar in either input position is handled
// by the same unary functor, which reduces the number of kernel instantiations.
template <typename scalar_t, typename return_t = scalar_t, typename func_t>

void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {

TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
|
|
|
using traits = function_traits<func_t>; |
|
using opmath_arg_t = typename traits::template arg<0>::type; |
|
static_assert( |
|
traits::arity == 2, |
|
"gpu_kernel_with_scalars only supports two input arguments"); |
|
static_assert(std::is_same<opmath_arg_t, typename traits::template arg<1>::type>::value, |
|
"f is not symmetric"); |
|
|
|
OptionalDeviceGuard device_guard; |
|
opmath_arg_t scalar_val{}; |
|
|
|
if (iter.is_cpu_scalar(1)) { |
|
scalar_val = iter.scalar_value<opmath_arg_t>(1); |
|
iter.remove_operand(1);

// Operand 1 is now the remaining GPU input; guard onto its device before launching.
device_guard.reset_device(iter.device(1));
|
} else if (iter.is_cpu_scalar(2)) { |
|
scalar_val = iter.scalar_value<opmath_arg_t>(2); |
|
iter.remove_operand(2); |
|
} |
|
|
|
if (iter.ninputs() == 2) { |
|
gpu_kernel(iter, BinaryFunctor<scalar_t, scalar_t, return_t, func_t>(f)); |
|
} else { |
|
AUnaryFunctor<scalar_t, scalar_t, return_t, func_t> unary_f(f, scalar_val); |
|
gpu_kernel(iter, unary_f); |
|
} |
|
} |
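
// Example (illustrative sketch; assumes `iter` holds at::Half tensors, the math is
// done in float, and that GPU_LAMBDA expands to the usual __host__ __device__
// annotation):
//
//   opmath_symmetric_gpu_kernel_with_scalars<at::Half>(
//       iter, [] GPU_LAMBDA (float a, float b) -> float { return a * b; });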
|
|
|
|
|
|
|
// Launches a binary elementwise kernel, capturing a CPU scalar input by value when
// one is present.  Argument and return types are deduced from the functor.
template <typename func_t>
|
void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { |
|
using traits = function_traits<func_t>; |
|
static_assert( |
|
traits::arity == 2, |
|
"gpu_kernel_with_scalars only supports two input arguments"); |
|
using arg1_t = typename traits::template arg<0>::type; |
|
using arg2_t = typename traits::template arg<1>::type; |
|
using return_t = typename traits::result_type; |
|
opmath_gpu_kernel_with_scalars<arg1_t, arg2_t, return_t, func_t>(iter, f); |
|
} |
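
// Example (illustrative sketch; assumes `iter` is a binary TensorIterator over float
// CUDA tensors, one of which may be a CPU scalar, and that GPU_LAMBDA expands to the
// usual __host__ __device__ annotation):
//
//   gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (float a, float b) -> float {
//     return a + b;
//   });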
|
|
|
namespace { |
|
|
|
|
|
// Detects whether the functor's return type is a thrust::tuple, i.e. whether it
// produces multiple outputs.
template <typename T> struct is_tuple: std::false_type {};
|
|
|
template <typename ...T> struct is_tuple<thrust::tuple<T...>>: std::true_type {}; |
|
|
|
// Kernel for functors that return a thrust::tuple: each block processes
// block_work_size() elements through the multi_outputs_unroll policy.
template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
|
C10_LAUNCH_BOUNDS_1(num_threads()) |
|
__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { |
|
int remaining = N - block_work_size() * blockIdx.x; |
|
elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll<array_t, inp_calc_t, out_calc_t, num_outputs>(data, remaining, ic, oc)); |
|
} |
|
|
|
// Host-side launcher for the multi-output kernel above; N must be positive and fit
// in 32-bit indexing.
template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
|
static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { |
|
TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max()); |
|
int64_t grid = (N + block_work_size() - 1) / block_work_size(); |
|
auto stream = at::cuda::getCurrentCUDAStream(); |
|
unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc); |
|
C10_CUDA_KERNEL_LAUNCH_CHECK(); |
|
} |
|
|
|
// Multi-output analogue of gpu_kernel_impl: gathers the data pointers, then launches
// the unrolled kernel with trivial offset calculators for contiguous iterators and
// strided ones otherwise.
template <typename func_t>
|
void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) { |
|
using traits = function_traits<func_t>; |
|
using output_t = typename traits::result_type; |
|
static_assert(is_tuple<output_t>::value, "f's return type must be `thrust::tuple`"); |
|
constexpr int num_outputs = thrust::tuple_size<output_t>::value; |
|
constexpr int num_inputs = traits::arity; |
|
constexpr int ntensors = num_outputs + num_inputs; |
|
|
|
TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); |
|
TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors); |
|
|
|
at::detail::Array<char*, ntensors> data; |
|
for (int i = 0; i < ntensors; i++) { |
|
data[i] = (char*)iter.data_ptr(i); |
|
} |
|
|
|
int64_t numel = iter.numel(); |
|
|
|
if (iter.is_contiguous()) { |
|
auto input_calc = TrivialOffsetCalculator<num_inputs>(); |
|
auto output_calc = TrivialOffsetCalculator<num_outputs>(); |
|
launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc); |
|
} else { |
|
auto input_calc = make_input_offset_calculator<num_inputs>(iter); |
|
auto output_calc = make_output_offset_calculator<num_outputs>(iter); |
|
launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc); |
|
} |
|
} |
|
}  // anonymous namespace
|
|
|
// Like gpu_kernel, but for functors that return a thrust::tuple and write to the
// multiple outputs of `iter`.
template <typename func_t>
|
void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) { |
|
ASSERT_HOST_DEVICE_LAMBDA(func_t); |
|
|
|
for (int arg = 0; arg < iter.ntensors(); arg++) { |
|
TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda()); |
|
} |
|
|
|
if (iter.numel() == 0) { |
|
return; |
|
} |
|
|
|
if (!iter.can_use_32bit_indexing()) { |
|
for (auto& sub_iter : iter.with_32bit_indexing()) { |
|
gpu_kernel_multiple_outputs(sub_iter, f); |
|
} |
|
return; |
|
} |
|
|
|
gpu_kernel_multiple_outputs_impl(iter, f); |
|
} |
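
// Example (illustrative sketch; assumes `iter` was built with two float CUDA outputs
// and two float CUDA inputs, and that GPU_LAMBDA expands to the usual
// __host__ __device__ annotation):
//
//   gpu_kernel_multiple_outputs(iter, [] GPU_LAMBDA (float a, float b) {
//     return thrust::make_tuple(a + b, a - b);
//   });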
|
|
|
}}  // namespace at::native
|
|