|
#pragma once |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cassert> |
|
#include <cstring> |
|
#include <functional> |
|
#include <cmath> |
|
#include <type_traits> |
|
#include <bitset> |
|
|
|
#include <ATen/cpu/vec/intrinsics.h> |
|
#include <ATen/native/Math.h> |
|
#include <ATen/NumericUtils.h> |
|
#include <c10/util/C++17.h> |
|
#include <c10/util/BFloat16.h> |
|
#include <c10/util/BFloat16-math.h> |
|
#include <c10/util/copysign.h> |
|
#include <c10/util/math_compat.h> |
|
#include <ATen/native/cpu/zmath.h> |
|
#include <c10/util/TypeCast.h> |
|
#include <c10/macros/Macros.h> |
|
#include <c10/util/irange.h> |
|
#include <c10/util/Load.h> |
|
|
|
|
|
// Build-time vector configuration.
//
//   VECTOR_WIDTH  - vector size in bytes (64 when CPU_CAPABILITY_AVX512 is
//                   defined, 32 otherwise).
//   int_vector    - integer intrinsic register type of that width.
//   __at_align__  - attribute aligning storage to VECTOR_WIDTH bytes on
//                   GNU-compatible compilers and on Windows; expands to
//                   nothing on any other toolchain.
#ifdef CPU_CAPABILITY_AVX512
#define VECTOR_WIDTH 64
#define int_vector __m512i
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(64)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(64))
#else
#define __at_align__
#endif
#else
#define VECTOR_WIDTH 32
#define int_vector __m256i
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(32))
#else
#define __at_align__
#endif
#endif
|
|
|
namespace at { |
|
namespace vec { |
|
|
|
inline namespace CPU_CAPABILITY { |
|
|
|
template <typename T> |
|
struct is_floating_point: |
|
std::integral_constant<bool, |
|
std::is_floating_point<T>::value || |
|
std::is_same<T, at::Half>::value || |
|
std::is_same<T, at::BFloat16>::value> { |
|
}; |
|
|
|
// Maps a byte count to the signed integer type of exactly that size.
// Only 1, 2, 4 and 8 bytes are supported; any other size leaves the primary
// template undefined and therefore fails to compile.
template <size_t n>
struct int_of_size;

template <>
struct int_of_size<sizeof(int64_t)> {
  using type = int64_t;
};

template <>
struct int_of_size<sizeof(int32_t)> {
  using type = int32_t;
};

template <>
struct int_of_size<sizeof(int16_t)> {
  using type = int16_t;
};

template <>
struct int_of_size<sizeof(int8_t)> {
  using type = int8_t;
};

// Signed integer type with the same size as T (e.g. float -> int32_t).
template <typename T>
using int_same_size_t = typename int_of_size<sizeof(T)>::type;
|
|
|
|
|
|
|
|
|
#if defined(__s390x__) |
|
template <class T, class TEMP=void> |
|
#else |
|
template <class T> |
|
#endif |
|
struct Vectorized { |
|
private: |
|
__at_align__ T values[VECTOR_WIDTH / sizeof(T)]; |
|
public: |
|
using value_type = T; |
|
using size_type = int; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static constexpr size_type size_T = sizeof(T); |
|
static constexpr size_type size() { |
|
return VECTOR_WIDTH / size_T; |
|
} |
|
Vectorized() : values{static_cast<T>(0)} {} |
|
Vectorized(T val) { |
|
for (int i = 0; i != size(); i++) { |
|
values[i] = val; |
|
} |
|
} |
|
template<typename... Args, |
|
typename = std::enable_if_t<(sizeof...(Args) == size())>> |
|
Vectorized(Args... vals) : values{vals...}{ |
|
} |
|
|
|
inline operator const T*() const { |
|
return values; |
|
} |
|
|
|
inline operator T*() { |
|
return values; |
|
} |
|
|
|
auto as_bytes() const -> const char* { |
|
return reinterpret_cast<const char*>(values); |
|
} |
|
template <int64_t mask_> |
|
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) { |
|
int64_t mask = mask_; |
|
Vectorized vector; |
|
for (const auto i : c10::irange(size())) { |
|
if (mask & 0x01) { |
|
vector[i] = b[i]; |
|
} else { |
|
vector[i] = a[i]; |
|
} |
|
mask = mask >> 1; |
|
} |
|
return vector; |
|
} |
|
static Vectorized<T> blendv(const Vectorized<T>& a, const Vectorized<T>& b, |
|
const Vectorized<T>& mask) { |
|
Vectorized vector; |
|
int_same_size_t<T> buffer[size()]; |
|
mask.store(buffer); |
|
for (const auto i : c10::irange(size())) { |
|
if (buffer[i] & 0x01) |
|
{ |
|
vector[i] = b[i]; |
|
} else { |
|
vector[i] = a[i]; |
|
} |
|
} |
|
return vector; |
|
} |
|
template<typename step_t> |
|
static Vectorized<T> arange(T base = static_cast<T>(0), step_t step = static_cast<step_t>(1)) { |
|
Vectorized vector; |
|
for (const auto i : c10::irange(size())) { |
|
vector.values[i] = base + i * step; |
|
} |
|
return vector; |
|
} |
|
static Vectorized<T> set(const Vectorized<T>& a, const Vectorized<T>& b, int64_t count = size()) { |
|
Vectorized vector; |
|
for (const auto i : c10::irange(size())) { |
|
if (i < count) { |
|
vector[i] = b[i]; |
|
} else { |
|
vector[i] = a[i]; |
|
} |
|
} |
|
return vector; |
|
} |
|
static Vectorized<T> loadu(const void* ptr) { |
|
Vectorized vector; |
|
std::memcpy(vector.values, ptr, VECTOR_WIDTH); |
|
return vector; |
|
} |
|
static Vectorized<T> loadu(const void* ptr, int64_t count) { |
|
Vectorized vector; |
|
std::memcpy(vector.values, ptr, count * sizeof(T)); |
|
return vector; |
|
} |
|
void store(void* ptr, int count = size()) const { |
|
std::memcpy(ptr, values, count * sizeof(T)); |
|
} |
|
int zero_mask() const { |
|
|
|
int mask = 0; |
|
for (int i = 0; i < size(); ++ i) { |
|
if (values[i] == static_cast<T>(0)) { |
|
mask |= (1 << i); |
|
} |
|
} |
|
return mask; |
|
} |
|
Vectorized<T> isnan() const { |
|
Vectorized<T> vector; |
|
for (int64_t i = 0; i != size(); i++) { |
|
if (_isnan(values[i])) { |
|
std::memset(static_cast<void*>(vector.values + i), 0xFF, sizeof(T)); |
|
} else { |
|
std::memset(static_cast<void*>(vector.values + i), 0, sizeof(T)); |
|
} |
|
} |
|
return vector; |
|
} |
|
Vectorized<T> map(T (*const f)(T)) const { |
|
Vectorized<T> ret; |
|
for (int64_t i = 0; i != size(); i++) { |
|
ret[i] = f(values[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> map(T (*const f)(const T &)) const { |
|
Vectorized<T> ret; |
|
for (int64_t i = 0; i != size(); i++) { |
|
ret[i] = f(values[i]); |
|
} |
|
return ret; |
|
} |
|
template <typename other_t_abs = T, |
|
typename std::enable_if<!is_floating_point<other_t_abs>::value && !c10::is_complex<other_t_abs>::value, int>::type = 0> |
|
Vectorized<T> abs() const { |
|
|
|
static_assert(std::is_same<other_t_abs, T>::value, "other_t_abs must be T"); |
|
return map([](T x) -> T { return x < static_cast<T>(0) ? -x : x; }); |
|
} |
|
template <typename float_t_abs = T, |
|
typename std::enable_if<is_floating_point<float_t_abs>::value, int>::type = 0> |
|
Vectorized<T> abs() const { |
|
|
|
static_assert(std::is_same<float_t_abs, T>::value, "float_t_abs must be T"); |
|
|
|
|
|
return map([](T x) -> T { return std::abs(x); }); |
|
} |
|
template <typename complex_t_abs = T, |
|
typename std::enable_if<c10::is_complex<complex_t_abs>::value, int>::type = 0> |
|
Vectorized<T> abs() const { |
|
|
|
static_assert(std::is_same<complex_t_abs, T>::value, "complex_t_abs must be T"); |
|
|
|
return map([](T x) { return static_cast<T>(std::abs(x)); }); |
|
} |
|
|
|
template <typename other_t_sgn = T, |
|
typename std::enable_if<c10::is_complex<other_t_sgn>::value, int>::type = 0> |
|
Vectorized<T> sgn() const { |
|
return map(at::native::sgn_impl); |
|
} |
|
|
|
template <typename other_t_angle = T, |
|
typename std::enable_if<!c10::is_complex<other_t_angle>::value, int>::type = 0> |
|
Vectorized<T> angle() const { |
|
|
|
static_assert(std::is_same<other_t_angle, T>::value, "other_t_angle must be T"); |
|
return map(at::native::angle_impl<T>); |
|
} |
|
template <typename complex_t_angle = T, |
|
typename std::enable_if<c10::is_complex<complex_t_angle>::value, int>::type = 0> |
|
Vectorized<T> angle() const { |
|
|
|
static_assert(std::is_same<complex_t_angle, T>::value, "complex_t_angle must be T"); |
|
return map([](T x) { return static_cast<T>(std::arg(x)); }); |
|
} |
|
template <typename other_t_real = T, |
|
typename std::enable_if<!c10::is_complex<other_t_real>::value, int>::type = 0> |
|
Vectorized<T> real() const { |
|
|
|
static_assert(std::is_same<other_t_real, T>::value, "other_t_real must be T"); |
|
return *this; |
|
} |
|
template <typename complex_t_real = T, |
|
typename std::enable_if<c10::is_complex<complex_t_real>::value, int>::type = 0> |
|
Vectorized<T> real() const { |
|
|
|
static_assert(std::is_same<complex_t_real, T>::value, "complex_t_real must be T"); |
|
return map([](T x) { return static_cast<T>(x.real()); }); |
|
} |
|
template <typename other_t_imag = T, |
|
typename std::enable_if<!c10::is_complex<other_t_imag>::value, int>::type = 0> |
|
Vectorized<T> imag() const { |
|
|
|
static_assert(std::is_same<other_t_imag, T>::value, "other_t_imag must be T"); |
|
return Vectorized(0); |
|
} |
|
template <typename complex_t_imag = T, |
|
typename std::enable_if<c10::is_complex<complex_t_imag>::value, int>::type = 0> |
|
Vectorized<T> imag() const { |
|
|
|
static_assert(std::is_same<complex_t_imag, T>::value, "complex_t_imag must be T"); |
|
return map([](T x) { return static_cast<T>(x.imag()); }); |
|
} |
|
template <typename other_t_conj = T, |
|
typename std::enable_if<!c10::is_complex<other_t_conj>::value, int>::type = 0> |
|
Vectorized<T> conj() const { |
|
|
|
static_assert(std::is_same<other_t_conj, T>::value, "other_t_conj must be T"); |
|
return *this; |
|
} |
|
template <typename complex_t_conj = T, |
|
typename std::enable_if<c10::is_complex<complex_t_conj>::value, int>::type = 0> |
|
Vectorized<T> conj() const { |
|
|
|
static_assert(std::is_same<complex_t_conj, T>::value, "complex_t_conj must be T"); |
|
return map([](T x) { return static_cast<T>(std::conj(x)); }); |
|
} |
|
Vectorized<T> acos() const { |
|
return map(std::acos); |
|
} |
|
Vectorized<T> asin() const { |
|
return map(std::asin); |
|
} |
|
Vectorized<T> atan() const { |
|
return map(std::atan); |
|
} |
|
Vectorized<T> atan2(const Vectorized<T> &exp) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = std::atan2(values[i], exp[i]); |
|
} |
|
return ret; |
|
} |
|
template < |
|
typename U = T, |
|
typename std::enable_if_t<is_floating_point<U>::value, int> = 0> |
|
Vectorized<T> copysign(const Vectorized<T> &sign) const { |
|
Vectorized<T> ret; |
|
for (size_type i = 0; i < size(); i++) { |
|
ret[i] = c10::copysign(values[i], sign[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> erf() const { |
|
return map(std::erf); |
|
} |
|
Vectorized<T> erfc() const { |
|
return map(std::erfc); |
|
} |
|
Vectorized<T> erfinv() const { |
|
return map(calc_erfinv); |
|
} |
|
Vectorized<T> exp() const { |
|
return map(std::exp); |
|
} |
|
Vectorized<T> expm1() const { |
|
return map(std::expm1); |
|
} |
|
Vectorized<T> frac() const { |
|
return *this - this->trunc(); |
|
} |
|
template < |
|
typename U = T, |
|
typename std::enable_if_t<is_floating_point<U>::value, int> = 0> |
|
Vectorized<T> fmod(const Vectorized<T>& q) const { |
|
|
|
static_assert(std::is_same<U, T>::value, "U must be T"); |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = std::fmod(values[i], q[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> log() const { |
|
return map(std::log); |
|
} |
|
Vectorized<T> log10() const { |
|
return map(std::log10); |
|
} |
|
Vectorized<T> log1p() const { |
|
return map(std::log1p); |
|
} |
|
template <typename other_t_log2 = T, |
|
typename std::enable_if<!c10::is_complex<other_t_log2>::value, int>::type = 0> |
|
Vectorized<T> log2() const { |
|
|
|
static_assert(std::is_same<other_t_log2, T>::value, "other_t_log2 must be T"); |
|
return map(std::log2); |
|
} |
|
template <typename complex_t_log2 = T, |
|
typename std::enable_if<c10::is_complex<complex_t_log2>::value, int>::type = 0> |
|
Vectorized<T> log2() const { |
|
|
|
static_assert(std::is_same<complex_t_log2, T>::value, "complex_t_log2 must be T"); |
|
const T log_2 = T(std::log(2.0)); |
|
return Vectorized(map(std::log))/Vectorized(log_2); |
|
} |
|
Vectorized<T> ceil() const { |
|
return map(at::native::ceil_impl); |
|
} |
|
Vectorized<T> cos() const { |
|
return map(std::cos); |
|
} |
|
Vectorized<T> cosh() const { |
|
return map(std::cosh); |
|
} |
|
Vectorized<T> floor() const { |
|
return map(at::native::floor_impl); |
|
} |
|
Vectorized<T> hypot(const Vectorized<T> &b) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = std::hypot(values[i], b[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> i0() const { |
|
return map(calc_i0); |
|
} |
|
Vectorized<T> i0e() const { |
|
return map(calc_i0e); |
|
} |
|
Vectorized<T> igamma(const Vectorized<T> &x) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = calc_igamma(values[i], x[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> igammac(const Vectorized<T> &x) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = calc_igammac(values[i], x[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> neg() const { |
|
|
|
|
|
|
|
return map([](T x) -> T { return -x; }); |
|
} |
|
Vectorized<T> nextafter(const Vectorized<T> &b) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = std::nextafter(values[i], b[i]); |
|
} |
|
return ret; |
|
} |
|
Vectorized<T> round() const { |
|
|
|
return map(at::native::round_impl); |
|
} |
|
Vectorized<T> sin() const { |
|
return map(std::sin); |
|
} |
|
Vectorized<T> sinh() const { |
|
return map(std::sinh); |
|
} |
|
Vectorized<T> tan() const { |
|
return map(std::tan); |
|
} |
|
Vectorized<T> tanh() const { |
|
return map(std::tanh); |
|
} |
|
Vectorized<T> trunc() const { |
|
return map(at::native::trunc_impl); |
|
} |
|
Vectorized<T> lgamma() const { |
|
return map(std::lgamma); |
|
} |
|
Vectorized<T> sqrt() const { |
|
return map(std::sqrt); |
|
} |
|
Vectorized<T> reciprocal() const { |
|
return map([](T x) { return (T)(1) / x; }); |
|
} |
|
Vectorized<T> rsqrt() const { |
|
return map([](T x) { return (T)1 / std::sqrt(x); }); |
|
} |
|
Vectorized<T> pow(const Vectorized<T> &exp) const { |
|
Vectorized<T> ret; |
|
for (const auto i : c10::irange(size())) { |
|
ret[i] = std::pow(values[i], exp[i]); |
|
} |
|
return ret; |
|
} |
|
private: |
|
template <typename Op> |
|
inline Vectorized<T> binary_pred(const Vectorized<T>& other, Op op) const { |
|
|
|
Vectorized<T> vector; |
|
for (int64_t i = 0; i != size(); i++) { |
|
if (op(values[i], other.values[i])) { |
|
std::memset(static_cast<void*>(vector.values + i), 0xFF, sizeof(T)); |
|
} else { |
|
std::memset(static_cast<void*>(vector.values + i), 0, sizeof(T)); |
|
} |
|
} |
|
return vector; |
|
} |
|
|
|
public: |
|
Vectorized<T> operator==(const Vectorized<T>& other) const { return binary_pred(other, std::equal_to<T>()); } |
|
Vectorized<T> operator!=(const Vectorized<T>& other) const { return binary_pred(other, std::not_equal_to<T>()); } |
|
Vectorized<T> operator>=(const Vectorized<T>& other) const { return binary_pred(other, std::greater_equal<T>()); } |
|
Vectorized<T> operator<=(const Vectorized<T>& other) const { return binary_pred(other, std::less_equal<T>()); } |
|
Vectorized<T> operator>(const Vectorized<T>& other) const { return binary_pred(other, std::greater<T>()); } |
|
Vectorized<T> operator<(const Vectorized<T>& other) const { return binary_pred(other, std::less<T>()); } |
|
|
|
private: |
|
template <typename Op> |
|
inline Vectorized<T> binary_pred_bool(const Vectorized<T>& other, Op op) const { |
|
|
|
Vectorized<T> vector; |
|
for (int i = 0; i != size(); ++ i) { |
|
vector[i] = static_cast<T>(op(values[i], other.values[i])); |
|
} |
|
return vector; |
|
} |
|
|
|
public: |
|
Vectorized<T> eq(const Vectorized<T>& other) const { return binary_pred_bool(other, std::equal_to<T>()); } |
|
Vectorized<T> ne(const Vectorized<T>& other) const { return binary_pred_bool(other, std::not_equal_to<T>()); } |
|
Vectorized<T> gt(const Vectorized<T>& other) const { return binary_pred_bool(other, std::greater<T>()); } |
|
Vectorized<T> ge(const Vectorized<T>& other) const { return binary_pred_bool(other, std::greater_equal<T>()); } |
|
Vectorized<T> lt(const Vectorized<T>& other) const { return binary_pred_bool(other, std::less<T>()); } |
|
Vectorized<T> le(const Vectorized<T>& other) const { return binary_pred_bool(other, std::less_equal<T>()); } |
|
}; |
|
|
|
// Lane-wise addition: result[i] = a[i] + b[i].
template <class T>
Vectorized<T> inline operator+(const Vectorized<T>& a, const Vectorized<T>& b) {
  const int lane_count = Vectorized<T>::size();
  Vectorized<T> sum;
  for (int lane = 0; lane < lane_count; ++lane) {
    sum[lane] = a[lane] + b[lane];
  }
  return sum;
}
|
|
|
// Lane-wise subtraction: result[i] = a[i] - b[i].
template <class T>
Vectorized<T> inline operator-(const Vectorized<T>& a, const Vectorized<T>& b) {
  const int lane_count = Vectorized<T>::size();
  Vectorized<T> difference;
  for (int lane = 0; lane < lane_count; ++lane) {
    difference[lane] = a[lane] - b[lane];
  }
  return difference;
}
|
|
|
// Lane-wise multiplication: result[i] = a[i] * b[i].
template <class T>
Vectorized<T> inline operator*(const Vectorized<T>& a, const Vectorized<T>& b) {
  const int lane_count = Vectorized<T>::size();
  Vectorized<T> product;
  for (int lane = 0; lane < lane_count; ++lane) {
    product[lane] = a[lane] * b[lane];
  }
  return product;
}
|
|
|
// Lane-wise division: result[i] = a[i] / b[i].
// The function carries the __ubsan_ignore_float_divide_by_zero__ annotation,
// so a zero divisor in a floating-point lane is not reported by UBSan.
template <class T>
Vectorized<T> inline operator/(const Vectorized<T>& a, const Vectorized<T>& b) __ubsan_ignore_float_divide_by_zero__ {
  const int lane_count = Vectorized<T>::size();
  Vectorized<T> quotient;
  for (int lane = 0; lane < lane_count; ++lane) {
    quotient[lane] = a[lane] / b[lane];
  }
  return quotient;
}
|
|
|
// Lane-wise logical OR: result[i] = a[i] || b[i], with the boolean result
// converted to T on assignment.
template <class T>
Vectorized<T> inline operator||(const Vectorized<T>& a, const Vectorized<T>& b) {
  const int lane_count = Vectorized<T>::size();
  Vectorized<T> either;
  for (int lane = 0; lane < lane_count; ++lane) {
    either[lane] = a[lane] || b[lane];
  }
  return either;
}
|
|
|
|
|
|
|
template <class T, |
|
typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline maximum(const Vectorized<T> &a, const Vectorized<T> &b) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = (a[i] > b[i]) ? a[i] : b[i]; |
|
if (_isnan(a[i])) { |
|
|
|
|
|
|
|
c[i] = a[i]; |
|
} |
|
} |
|
return c; |
|
} |
|
|
|
template <class T, |
|
typename std::enable_if<c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline maximum(const Vectorized<T> &a, const Vectorized<T> &b) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i]; |
|
if (_isnan(a[i])) { |
|
|
|
|
|
|
|
c[i] = a[i]; |
|
} |
|
} |
|
return c; |
|
} |
|
|
|
|
|
|
|
template <class T, |
|
typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline minimum(const Vectorized<T> &a, const Vectorized<T> &b) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = (a[i] < b[i]) ? a[i] : b[i]; |
|
if (_isnan(a[i])) { |
|
|
|
|
|
|
|
c[i] = a[i]; |
|
} |
|
} |
|
return c; |
|
} |
|
|
|
template <class T, |
|
typename std::enable_if<c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline minimum(const Vectorized<T> &a, const Vectorized<T> &b) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = (std::abs(a[i]) < std::abs(b[i])) ? a[i] : b[i]; |
|
if (_isnan(a[i])) { |
|
|
|
|
|
|
|
c[i] = a[i]; |
|
} |
|
} |
|
return c; |
|
} |
|
|
|
template <class T, |
|
typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline clamp(const Vectorized<T> &a, const Vectorized<T> &min_vec, const Vectorized<T> &max_vec) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]); |
|
} |
|
return c; |
|
} |
|
|
|
template <class T, |
|
typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline clamp_max(const Vectorized<T> &a, const Vectorized<T> &max_vec) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i]; |
|
} |
|
return c; |
|
} |
|
|
|
template <class T, |
|
typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0> |
|
Vectorized<T> inline clamp_min(const Vectorized<T> &a, const Vectorized<T> &min_vec) { |
|
Vectorized<T> c; |
|
for (int i = 0; i != Vectorized<T>::size(); i++) { |
|
c[i] = a[i] < min_vec[i] ? min_vec[i] : a[i]; |
|
} |
|
return c; |
|
} |
|
|
|
struct Vectorizedi; |
|
|
|
#if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) |
|
template <class T, typename Op> |
|
static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) { |
|
int_vector buffer; |
|
#if defined(CPU_CAPABILITY_AVX2) |
|
int_vector a_buffer = _mm256_load_si256(reinterpret_cast<const int_vector*>((const T*)a)); |
|
int_vector b_buffer = _mm256_load_si256(reinterpret_cast<const int_vector*>((const T*)b)); |
|
#elif defined(CPU_CAPABILITY_AVX512) |
|
int_vector a_buffer = _mm512_load_si512(reinterpret_cast<const int_vector*>((const T*)a)); |
|
int_vector b_buffer = _mm512_load_si512(reinterpret_cast<const int_vector*>((const T*)b)); |
|
#endif |
|
buffer = op(a_buffer, b_buffer); |
|
__at_align__ T results[Vectorized<T>::size()]; |
|
|
|
#if defined(CPU_CAPABILITY_AVX2) |
|
_mm256_store_si256(reinterpret_cast<int_vector*>(results), buffer); |
|
#elif defined(CPU_CAPABILITY_AVX512) |
|
_mm512_store_si512(reinterpret_cast<int_vector*>(results), buffer); |
|
#endif |
|
return Vectorized<T>::loadu(results); |
|
} |
|
|
|
// Bitwise AND of the raw bits of two vectors. Disabled for vector types that
// derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
  // The intrinsic is wrapped in a lambda so it can be handed to
  // bitwise_binary_op as an ordinary callable.
#if defined(CPU_CAPABILITY_AVX2)
  const auto and_regs = [](int_vector lhs, int_vector rhs) { return _mm256_and_si256(lhs, rhs); };
  return bitwise_binary_op(a, b, and_regs);
#elif defined(CPU_CAPABILITY_AVX512)
  const auto and_regs = [](int_vector lhs, int_vector rhs) { return _mm512_and_si512(lhs, rhs); };
  return bitwise_binary_op(a, b, and_regs);
#endif
}
|
// Bitwise OR of the raw bits of two vectors. Disabled for vector types that
// derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
  // The intrinsic is wrapped in a lambda so it can be handed to
  // bitwise_binary_op as an ordinary callable.
#if defined(CPU_CAPABILITY_AVX2)
  const auto or_regs = [](int_vector lhs, int_vector rhs) { return _mm256_or_si256(lhs, rhs); };
  return bitwise_binary_op(a, b, or_regs);
#elif defined(CPU_CAPABILITY_AVX512)
  const auto or_regs = [](int_vector lhs, int_vector rhs) { return _mm512_or_si512(lhs, rhs); };
  return bitwise_binary_op(a, b, or_regs);
#endif
}
|
// Bitwise XOR of the raw bits of two vectors. Disabled for vector types that
// derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
  // The intrinsic is wrapped in a lambda so it can be handed to
  // bitwise_binary_op as an ordinary callable.
#if defined(CPU_CAPABILITY_AVX2)
  const auto xor_regs = [](int_vector lhs, int_vector rhs) { return _mm256_xor_si256(lhs, rhs); };
  return bitwise_binary_op(a, b, xor_regs);
#elif defined(CPU_CAPABILITY_AVX512)
  const auto xor_regs = [](int_vector lhs, int_vector rhs) { return _mm512_xor_si512(lhs, rhs); };
  return bitwise_binary_op(a, b, xor_regs);
#endif
}
|
|
|
#else |
|
|
|
// Reads a T out of a raw byte buffer.
// memcpy is used instead of a pointer cast, so `data` needs no particular
// alignment; it must point to at least sizeof(T) readable bytes.
template <typename T>
auto load(char const* data) -> T {
  T ret;
  std::memcpy(&ret, data, sizeof(ret));
  return ret;
}
|
|
|
template<class T, typename Op> |
|
static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) { |
|
static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); |
|
__at_align__ intmax_t buffer[element_no]; |
|
static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); |
|
static_assert(sizeof(buffer) == sizeof(Vectorized<T>), "sizeof(buffer) must match sizeof(Vectorized<T>)"); |
|
|
|
|
|
|
|
|
|
const auto* a_data = a.as_bytes(); |
|
const auto* b_data = b.as_bytes(); |
|
|
|
for (auto& out : buffer) { |
|
out = op(load<intmax_t>(a_data), load<intmax_t>(b_data)); |
|
a_data += sizeof(intmax_t); |
|
b_data += sizeof(intmax_t); |
|
} |
|
assert(a_data == a.as_bytes() + sizeof(a)); |
|
assert(b_data == b.as_bytes() + sizeof(b)); |
|
return Vectorized<T>::loadu(buffer); |
|
} |
|
|
|
// Bitwise AND of the raw bits of two vectors (portable fallback). Disabled
// for vector types that derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
  const std::bit_and<intmax_t> and_words{};
  return bitwise_binary_op(a, b, and_words);
}
|
// Bitwise OR of the raw bits of two vectors (portable fallback). Disabled
// for vector types that derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
  const std::bit_or<intmax_t> or_words{};
  return bitwise_binary_op(a, b, or_words);
}
|
// Bitwise XOR of the raw bits of two vectors (portable fallback). Disabled
// for vector types that derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
  const std::bit_xor<intmax_t> xor_words{};
  return bitwise_binary_op(a, b, xor_words);
}
|
|
|
#endif |
|
|
|
// Bitwise NOT of the raw bits of a vector, computed as `a ^ all_ones`.
// Disabled for vector types that derive from Vectorizedi.
template <class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator~(const Vectorized<T>& a) {
  Vectorized<T> ones;
  // Fill every byte of the vector with 0xFF. The pointer is passed as void*
  // so that filling a non-trivial lane type does not go through T's own
  // interface.
  std::memset(static_cast<void*>(static_cast<T*>(ones)), 0xFF, VECTOR_WIDTH);
  return a ^ ones;
}
|
|
|
|
|
template <typename T> |
|
inline Vectorized<T>& operator += (Vectorized<T>& a, const Vectorized<T>& b) { |
|
a = a + b; |
|
return a; |
|
} |
|
template <typename T> |
|
inline Vectorized<T>& operator -= (Vectorized<T>& a, const Vectorized<T>& b) { |
|
a = a - b; |
|
return a; |
|
} |
|
template <typename T> |
|
inline Vectorized<T>& operator /= (Vectorized<T>& a, const Vectorized<T>& b) { |
|
a = a / b; |
|
return a; |
|
} |
|
template <typename T> |
|
inline Vectorized<T>& operator %= (Vectorized<T>& a, const Vectorized<T>& b) { |
|
a = a % b; |
|
return a; |
|
} |
|
template <typename T> |
|
inline Vectorized<T>& operator *= (Vectorized<T>& a, const Vectorized<T>& b) { |
|
a = a * b; |
|
return a; |
|
} |
|
|
|
template <typename T> |
|
inline Vectorized<T> fmadd(const Vectorized<T>& a, const Vectorized<T>& b, const Vectorized<T>& c) { |
|
return a * b + c; |
|
} |
|
|
|
template <int64_t scale = 1, typename T = void> |
|
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<T>> |
|
inline gather(T const* base_addr, const Vectorized<int_same_size_t<T>>& vindex) { |
|
static constexpr int size = Vectorized<T>::size(); |
|
int_same_size_t<T> index_arr[size]; |
|
vindex.store(static_cast<void*>(index_arr)); |
|
T buffer[size]; |
|
for (const auto i : c10::irange(size)) { |
|
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; |
|
} |
|
return Vectorized<T>::loadu(static_cast<void*>(buffer)); |
|
} |
|
|
|
template <int64_t scale = 1, typename T = void> |
|
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<T>> |
|
inline mask_gather(const Vectorized<T>& src, T const* base_addr, |
|
const Vectorized<int_same_size_t<T>>& vindex, Vectorized<T>& mask) { |
|
static constexpr int size = Vectorized<T>::size(); |
|
T src_arr[size]; |
|
int_same_size_t<T> mask_arr[size]; |
|
int_same_size_t<T> index_arr[size]; |
|
src.store(static_cast<void*>(src_arr)); |
|
mask.store(static_cast<void*>(mask_arr)); |
|
vindex.store(static_cast<void*>(index_arr)); |
|
T buffer[size]; |
|
for (const auto i : c10::irange(size)) { |
|
if (mask_arr[i] & 0x01) { |
|
buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; |
|
} else { |
|
buffer[i] = src_arr[i]; |
|
} |
|
} |
|
mask = Vectorized<T>(); |
|
return Vectorized<T>::loadu(static_cast<void*>(buffer)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<typename dst_t, typename src_t> |
|
struct CastImpl { |
|
static inline Vectorized<dst_t> apply(const Vectorized<src_t>& src) { |
|
src_t src_arr[Vectorized<src_t>::size()]; |
|
src.store(static_cast<void*>(src_arr)); |
|
return Vectorized<dst_t>::loadu(static_cast<const void*>(src_arr)); |
|
} |
|
}; |
|
|
|
template<typename scalar_t> |
|
struct CastImpl<scalar_t, scalar_t> { |
|
static inline Vectorized<scalar_t> apply(const Vectorized<scalar_t>& src) { |
|
return src; |
|
} |
|
}; |
|
|
|
template<typename dst_t, typename src_t> |
|
inline Vectorized<dst_t> cast(const Vectorized<src_t>& src) { |
|
return CastImpl<dst_t, src_t>::apply(src); |
|
} |
|
|
|
template <typename T> |
|
inline Vectorized<int_same_size_t<T>> convert_to_int_of_same_size(const Vectorized<T>& src) { |
|
static constexpr int size = Vectorized<T>::size(); |
|
T src_arr[size]; |
|
src.store(static_cast<void*>(src_arr)); |
|
int_same_size_t<T> buffer[size]; |
|
for (const auto i : c10::irange(size)) { |
|
buffer[i] = static_cast<int_same_size_t<T>>(src_arr[i]); |
|
} |
|
return Vectorized<int_same_size_t<T>>::loadu(static_cast<void*>(buffer)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
inline std::enable_if_t<Vectorized<T>::size() % 2 == 0, std::pair<Vectorized<T>, Vectorized<T>>> |
|
deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) { |
|
static constexpr int size = Vectorized<T>::size(); |
|
static constexpr int half_size = size / 2; |
|
T a_arr[size]; |
|
T b_arr[size]; |
|
T buffer1[size]; |
|
T buffer2[size]; |
|
a.store(static_cast<void*>(a_arr)); |
|
b.store(static_cast<void*>(b_arr)); |
|
for (const auto i : c10::irange(half_size)) { |
|
buffer1[i] = a_arr[i * 2]; |
|
buffer1[half_size + i] = b_arr[i * 2]; |
|
buffer2[i] = a_arr[i * 2 + 1]; |
|
buffer2[half_size + i] = b_arr[i * 2 + 1]; |
|
} |
|
return std::make_pair(Vectorized<T>::loadu(static_cast<void*>(buffer1)), |
|
Vectorized<T>::loadu(static_cast<void*>(buffer2))); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
inline std::enable_if_t<Vectorized<T>::size() % 2 == 0, std::pair<Vectorized<T>, Vectorized<T>>> |
|
interleave2(const Vectorized<T>& a, const Vectorized<T>& b) { |
|
static constexpr int size = Vectorized<T>::size(); |
|
static constexpr int half_size = size / 2; |
|
T a_arr[size]; |
|
T b_arr[size]; |
|
T buffer1[size]; |
|
T buffer2[size]; |
|
a.store(static_cast<void*>(a_arr)); |
|
b.store(static_cast<void*>(b_arr)); |
|
for (const auto i : c10::irange(half_size)) { |
|
buffer1[i * 2] = a_arr[i]; |
|
buffer1[i * 2 + 1] = b_arr[i]; |
|
buffer2[i * 2] = a_arr[half_size + i]; |
|
buffer2[i * 2 + 1] = b_arr[half_size + i]; |
|
} |
|
return std::make_pair(Vectorized<T>::loadu(static_cast<void*>(buffer1)), |
|
Vectorized<T>::loadu(static_cast<void*>(buffer2))); |
|
} |
|
|
|
template <typename src_T, typename dst_T> |
|
inline void convert(const src_T *src, dst_T *dst, int64_t n) { |
|
#ifndef _MSC_VER |
|
# pragma unroll |
|
#endif |
|
for (const auto i : c10::irange(n)) { |
|
(void)i; |
|
*dst = c10::convert<dst_T>(c10::load(src)); |
|
src++; |
|
dst++; |
|
} |
|
} |
|
|
|
}}} |
|
|