#pragma once
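
// Helpers for writing elementwise CPU kernels on top of TensorIterator:
//
//   cpu_kernel(TensorIteratorBase& iter, <lambda>)
//   cpu_kernel_vec(TensorIteratorBase& iter, <lambda>, <vec_lambda>)
//
// cpu_kernel leaves vectorization to the compiler's auto-vectorizer, while
// cpu_kernel_vec additionally accepts an explicitly vectorized lambda that
// operates on vec::Vectorized<scalar_t> values.
//
// Example usage (a sketch; it assumes `iter` is a TensorIterator configured
// with one float output and two float inputs):
//
//   cpu_kernel(iter, [](float a, float b) { return a + b; });
//
//   cpu_kernel_vec(iter,
//       [](float a, float b) { return a + b; },
//       [](Vectorized<float> a, Vectorized<float> b) { return a + b; });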

#include <stdint.h>

#include <c10/util/C++17.h>
#include <c10/util/Load.h>
#include <c10/util/irange.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/native/cpu/IsContiguous.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/TensorIteratorDynamicCasting.h>
#include <ATen/cpu/vec/vec.h>

namespace at { namespace native { inline namespace CPU_CAPABILITY {

using namespace vec;

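// Load the arguments of `op` (as described by `traits`) from the per-operand
// data pointers at linear index `i`, using per-operand byte strides.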
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
                 std::index_sequence<INDEX...>) {
  return std::make_tuple(
      c10::load<typename traits::template arg<INDEX>::type>(
          data[INDEX] + i * strides[INDEX])...);
}

template <typename traits>
typename traits::ArgsTuple
dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_impl<traits>(data, strides, i, Indices{});
}

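// Vectorized analogue of `dereference`: loads one Vectorized value per
// operand. The operand at (1-based) position `S` is replaced by the broadcast
// scalar `opt_scalar`; `S == 0` means there is no scalar operand.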
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
                     const typename traits::result_type& opt_scalar,
                     size_t S,
                     int64_t i,
                     std::index_sequence<INDEX...>) {
  using Vec = typename traits::result_type;
  using scalar_t = typename Vec::value_type;
  return std::make_tuple(
      S == INDEX + 1 ?
      opt_scalar :
      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}

template <typename traits>
typename traits::ArgsTuple
dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
}

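// Apply `op` element-by-element over indices [i, n). The first overload is
// for ops that return a value, which is stored to operand 0; the second is
// for void-returning ops, whose arguments start at operand 0 because there
// is no separate output operand.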
template <typename func_t,
    typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  using result_type = typename traits::result_type;
  for (; i < n; i++) {
    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
    *out_ptr = c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[1],
        &strides[1],
        i));
  }
}

template <typename func_t,
    typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  for (; i < n; i++) {
    c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[0],
        &strides[0],
        i));
  }
}

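// Basic loop operation (one output, N inputs). May be auto-vectorized by the
// compiler. Supports inputs and outputs of different types.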
template <typename func_t>
static inline void
basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  constexpr int ntensors = traits::arity + 1;

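  // Copy strides into a plain local array; working from a local array helps
  // the compiler auto-vectorize the loop in execute_op.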
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  execute_op(data, strides, i, n, std::forward<func_t>(op));
}

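// Helper for writing a std::tuple of results to the output operands:
// recursively peels off the last tuple element and stores it to data[N - 1].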
template<class T, size_t N>
struct TupleOutput {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    TupleOutput<T, N - 1>::handle(data, strides, i, tuple);

    auto output = std::get<N - 1>(tuple);
    using output_type = decltype(output);
    output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
    *out_ptr = output;
  }
};

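// Base case of the recursion: stores the first tuple element to data[0].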
template<class T>
struct TupleOutput<T, 1> {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    auto output = std::get<0>(tuple);
    using output_type = decltype(output);
    output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
    *out_ptr = output;
  }
};

template<class... Args>
void handle_tuple_outputs(char* C10_RESTRICT data[],
                          const int64_t* strides,
                          int64_t i,
                          const std::tuple<Args...> &tuple) {
  TupleOutput<decltype(tuple), sizeof...(Args)>::handle(data, strides, i, tuple);
}

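// Loop operation for cpu_kernel_multiple_outputs: applies `op` via
// c10::guts::apply and scatters the elements of the returned tuple to the
// output operands through handle_tuple_outputs.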
template <typename func_t>
static inline void
multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;

  using result_type = typename traits::result_type;
  constexpr int num_outputs = std::tuple_size<result_type>::value;
  constexpr int ntensors = traits::arity + num_outputs;

  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  for (; i < n; i++) {
    auto output = c10::guts::apply(op, dereference<traits>(
        &data[num_outputs],
        &strides[num_outputs],
        i));
    handle_tuple_outputs(data, strides, i, output);
  }
}

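// Explicitly vectorized loop implementation. All inputs and outputs must be
// the same type and contiguous, with one exception: a single input may be a
// scalar (stride 0). Its (1-based) position is given by the argument `S`;
// `S == 0` means there are no scalar inputs.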
template <typename func_t, typename vec_func_t>
static inline void
vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
  using traits = function_traits<vec_func_t>;
  using scalar_t = typename function_traits<func_t>::result_type;
  using Vec = Vectorized<scalar_t>;
  constexpr int ntensors = traits::arity + 1;

  char* C10_RESTRICT data[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    data[arg] = data_[arg];
  }

  Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
  int64_t i = 0;
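  // Main loop, unrolled by two vector widths per iteration; any remainder is
  // handled by the scalar basic_loop below.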
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
    auto out1 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args1));
    auto out2 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args2));
    out1.store(data[0] + i * sizeof(scalar_t));
    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
  }
  if (i < n) {
    int64_t strides[ntensors];
    for (const auto arg : c10::irange(ntensors)) {
      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
    }
    basic_loop(data, strides, i, n, std::forward<func_t>(op));
  }
}

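// Walks the argument indices and checks whether the strides describe a
// "contiguous except for one scalar argument" layout. Invokes `cb` with the
// (1-based) index of that scalar argument, or with 0 if no such layout
// matches.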
template <typename traits, typename cb_t>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* ,
    std::index_sequence<>,
    cb_t&& cb) {
  cb(0);
}

template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* strides,
    std::index_sequence<INDEX0, INDEX...>,
    cb_t&& cb) {
  if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
    cb(INDEX0 + 1);
  } else {
    unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
  }
}

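// 2D loop functor passed to TensorIterator::for_each: dispatches each row to
// vectorized_loop when the inner strides are contiguous (possibly with one
// scalar argument), and to basic_loop otherwise.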
template <typename op_t, typename vop_t>
struct VectorizedLoop2d {
  op_t op;
  vop_t vop;

  using traits = function_traits<op_t>;
  static constexpr int ntensors = traits::arity + 1;
  using data_t = std::array<char*, ntensors>;

  VectorizedLoop2d(const op_t &op, const vop_t &vop):
    op(op), vop(vop) {}

  static void advance(data_t &data, const int64_t *outer_strides) {
    for (const auto arg : c10::irange(data.size())) {
      data[arg] += outer_strides[arg];
    }
  }

  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    data_t data;
    std::copy_n(base, ntensors, data.data());
    const int64_t *outer_strides = &strides[ntensors];

    if (is_contiguous<traits>(strides)) {
      for (const auto i C10_UNUSED : c10::irange(size1)) {
        vectorized_loop(data.data(), size0, 0, op, vop);
        advance(data, outer_strides);
      }
    } else {
      using Indices = std::make_index_sequence<traits::arity>;
      unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
        if (idx) {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            vectorized_loop(data.data(), size0, idx, op, vop);
            advance(data, outer_strides);
          }
        } else {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            basic_loop(data.data(), strides, 0, size0, op);
            advance(data, outer_strides);
          }
        }
      });
    }
  }
};

template <typename op_t, typename vop_t>
VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
    const op_t &op, const vop_t &vop) {
  return VectorizedLoop2d<op_t, vop_t>(op, vop);
}

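// Applies `op` elementwise over `iter`. `op` must take iter.ninputs()
// arguments and return a single value; vectorization is left to the
// compiler's auto-vectorizer.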
template <typename func_t>
void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;

  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);

  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
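    // basic_loop can handle 1d slices with arbitrary strides, and 1d slices
    // are all that iter.for_each ever passes to this lambda.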
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}

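// Variant of cpu_kernel for ops that produce multiple outputs: the lambda
// returns a std::tuple with one element per output operand of `iter`, in
// order.
//
// Example usage (a sketch; it assumes `iter` was built with two float outputs
// and two float inputs):
//
//   cpu_kernel_multiple_outputs(iter, [](float a, float b) {
//     return std::tuple<float, float>(a + b, a - b);
//   });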
template <typename func_t>
void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    multiple_outputs_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}

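// Like cpu_kernel, but additionally takes an explicitly vectorized `vop`
// operating on vec::Vectorized values; the scalar `op` handles the
// non-contiguous and remainder cases. Setting `check_dynamic_cast` to false
// lets callers skip the dynamic-casting assertion.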
template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;

  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);

  c10::guts::if_constexpr<check_dynamic_cast>([&] {
    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
  });

  iter.for_each(make_vectorized_loop2d(op, vop), grain_size);
  iter.cast_outputs();
}

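// Serial (single-threaded) variant of cpu_kernel, restricted to `range`.
// `op` may return void, in which case `iter` must have no outputs.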
template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
  using traits = function_traits<func_t>;
  constexpr bool result_void = std::is_void<typename traits::result_type>::value;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));

  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, range);
  iter.cast_outputs();
}

template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
  cpu_serial_kernel(iter, op, {0, iter.numel()});
}

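// Serial (single-threaded) variants of cpu_kernel_vec.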
template <typename func_t, typename vec_func_t>
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
  using traits = function_traits<func_t>;

  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);

  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each(make_vectorized_loop2d(op, vop), range);
  iter.cast_outputs();
}

template <typename func_t, typename vec_func_t>
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
  cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()});
}

}}}