|
#pragma once |
|
|
|
|
|
|
|
|
|
#include <ATen/cpu/vec/intrinsics.h> |
|
#include <ATen/cpu/vec/vec_base.h> |
|
#include <ATen/native/quantized/AffineQuantizerBase.h> |
|
|
|
#include <c10/util/irange.h> |
|
#include <c10/util/qint32.h> |
|
#include <c10/util/qint8.h> |
|
#include <c10/util/quint8.h> |
|
|
|
#include <array> |
|
#include <iostream> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace at { |
|
namespace vec { |
|
inline namespace CPU_CAPABILITY { |
|
|
|
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) |
|
|
|
// Thin wrapper around a single 256-bit integer register.  Common base for the
// AVX2 quantized Vectorized<> specializations below; derived types interpret
// `vals` as packed 8-bit or 32-bit lanes.
struct Vectorizedqi {
 protected:
  // NOTE(review): aligned(64) over-aligns the 32-byte __m256i — presumably
  // intended for cache-line alignment; confirm the intent.
  __m256i vals __attribute__((aligned(64)));

 public:
  Vectorizedqi() {}
  Vectorizedqi(__m256i v) : vals(v) {}
  // Implicit conversion so the wrapper can be passed directly to intrinsics.
  operator __m256i() const {
    return vals;
  }
};
|
|
|
// Packs two vectors of 16-bit lanes down to 8-bit lanes with saturation, then
// clamps every resulting lane to [min_val, max_val].  Only the int8_t and
// uint8_t specializations below are implemented.
template <typename T>
__m256i pack_saturate_and_clamp(
    __m256i first,
    __m256i second,
    T min_val,
    T max_val);
|
|
|
// int32_t packing is not needed by the quantized kernels; reaching this
// specialization at runtime is a programming error.
template <>
inline __m256i pack_saturate_and_clamp<int32_t>(
    __m256i /*first*/,
    __m256i /*second*/,
    int32_t /*min_val*/,
    int32_t /*max_val*/) {
  AT_ERROR("pack_saturate_and_clamp<int32_t> is not supported");
}
|
|
|
// Saturating pack of 16-bit lanes to signed 8-bit, then clamp each lane into
// [min_val, max_val] using signed byte min/max.
template <>
inline __m256i pack_saturate_and_clamp<int8_t>(
    __m256i first,
    __m256i second,
    int8_t min_val,
    int8_t max_val) {
  __m256i packed_and_sat = _mm256_packs_epi16(first, second);
  return _mm256_max_epi8(
      _mm256_set1_epi8(min_val),
      _mm256_min_epi8(packed_and_sat, _mm256_set1_epi8(max_val)));
}
|
|
|
// Saturating pack of 16-bit lanes to unsigned 8-bit, then clamp each lane into
// [min_val, max_val] using unsigned byte min/max.
template <>
inline __m256i pack_saturate_and_clamp<uint8_t>(
    __m256i first,
    __m256i second,
    uint8_t min_val,
    uint8_t max_val) {
  __m256i packed_and_sat = _mm256_packus_epi16(first, second);
  return _mm256_max_epu8(
      _mm256_set1_epi8(min_val),
      _mm256_min_epu8(packed_and_sat, _mm256_set1_epi8(max_val)));
}
|
|
|
template <typename T> |
|
inline void __attribute__((always_inline)) QuantizeAvx2( |
|
const float* src, |
|
typename T::underlying* dst, |
|
int len, |
|
float inverse_scale, |
|
int64_t zero_point) { |
|
constexpr int VLEN = 8; |
|
constexpr auto min_val = std::numeric_limits<typename T::underlying>::min(); |
|
constexpr auto max_val = std::numeric_limits<typename T::underlying>::max(); |
|
const __m256i min_v = _mm256_set1_epi32(min_val); |
|
const __m256i max_v = _mm256_set1_epi32(max_val); |
|
|
|
constexpr int32_t int32_float_max_val = |
|
std::numeric_limits<int32_t>::max() - 127; |
|
int i = 0; |
|
__m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); |
|
|
|
static const __m256i shuffle_mask_v = _mm256_set_epi8( |
|
0xff, 0xff, 0xff, 0xff, |
|
0xff, 0xff, 0xff, 0xff, |
|
0xff, 0xff, 0xff, 0xff, |
|
0x0c, 0x08, 0x04, 0x00, |
|
0xff, 0xff, 0xff, 0xff, |
|
0xff, 0xff, 0xff, 0xff, |
|
0xff, 0xff, 0xff, 0xff, |
|
0x0c, 0x08, 0x04, 0x00); |
|
|
|
__m256i permute_mask_v = |
|
_mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); |
|
__m256i permute_mask_l8_v = |
|
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); |
|
int len_aligned = len / (VLEN * 4) * (VLEN * 4); |
|
for (; i < len_aligned; i += 4 * VLEN) { |
|
|
|
__m256 x_vals = _mm256_load_ps(src + i); |
|
__m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v); |
|
|
|
|
|
|
|
x_transformed_v = |
|
_mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val)); |
|
|
|
__m256 y_vals = _mm256_load_ps(src + i + VLEN); |
|
__m256 y_transformed_v = _mm256_mul_ps(y_vals, inverse_scale_v); |
|
y_transformed_v = |
|
_mm256_min_ps(y_transformed_v, _mm256_set1_ps(int32_float_max_val)); |
|
|
|
__m256 z_vals = _mm256_load_ps(src + i + 2 * VLEN); |
|
__m256 z_transformed_v = _mm256_mul_ps(z_vals, inverse_scale_v); |
|
z_transformed_v = |
|
_mm256_min_ps(z_transformed_v, _mm256_set1_ps(int32_float_max_val)); |
|
|
|
__m256 w_vals = _mm256_load_ps(src + i + 3 * VLEN); |
|
__m256 w_transformed_v = _mm256_mul_ps(w_vals, inverse_scale_v); |
|
w_transformed_v = |
|
_mm256_min_ps(w_transformed_v, _mm256_set1_ps(int32_float_max_val)); |
|
|
|
__m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v); |
|
__m256i y_rounded_v = _mm256_cvtps_epi32(y_transformed_v); |
|
__m256i z_rounded_v = _mm256_cvtps_epi32(z_transformed_v); |
|
__m256i w_rounded_v = _mm256_cvtps_epi32(w_transformed_v); |
|
|
|
|
|
x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point)); |
|
y_rounded_v = _mm256_add_epi32(y_rounded_v, _mm256_set1_epi32(zero_point)); |
|
z_rounded_v = _mm256_add_epi32(z_rounded_v, _mm256_set1_epi32(zero_point)); |
|
w_rounded_v = _mm256_add_epi32(w_rounded_v, _mm256_set1_epi32(zero_point)); |
|
|
|
__m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); |
|
__m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); |
|
__m256i xyzw_clamped_v = pack_saturate_and_clamp<typename T::underlying>( |
|
xy_packed_v, zw_packed_v, min_val, max_val); |
|
|
|
xyzw_clamped_v = |
|
_mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); |
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), xyzw_clamped_v); |
|
} |
|
|
|
|
|
|
|
for (; i < len / VLEN * VLEN; i += VLEN) { |
|
__m256 x_vals = _mm256_load_ps(src + i); |
|
__m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v); |
|
x_transformed_v = |
|
_mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val)); |
|
__m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v); |
|
x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point)); |
|
__m256i x_clipped_v = |
|
_mm256_max_epi32(min_v, _mm256_min_epi32(max_v, x_rounded_v)); |
|
|
|
x_clipped_v = _mm256_shuffle_epi8(x_clipped_v, shuffle_mask_v); |
|
x_clipped_v = _mm256_permutevar8x32_epi32(x_clipped_v, permute_mask_l8_v); |
|
_mm_storel_epi64( |
|
reinterpret_cast<__m128i*>(dst + i), |
|
_mm256_castsi256_si128(x_clipped_v)); |
|
} |
|
|
|
for (; i < len; ++i) { |
|
float transformed = src[i] * inverse_scale; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
transformed = zero_point + nearbyint(transformed); |
|
float clipped = |
|
std::min(std::max(transformed, float(min_val)), float(max_val)); |
|
dst[i] = clipped; |
|
} |
|
} |
|
|
|
// AVX2 vector of 8 x 32-bit quantized integers (c10::qint32).
template<>
struct Vectorized<c10::qint32> : public Vectorizedqi {
    using size_type = int;
    // 8 x int32 lanes per 256-bit register.
    static constexpr size_type size() {
        return 8;
    }

    // A single Vectorized<float> covers all 8 elements.
    static constexpr int float_num_vecs() {
        return 1;
    }

    static constexpr int int_num_vecs() {
        return 1;
    }

    using float_vec_return_type = std::array<Vectorized<float>, 1>;
    using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
    using value_type = c10::qint32::underlying;

 public:
    using Vectorizedqi::Vectorizedqi;
    Vectorized() {}

    Vectorized(__m256i vals_) { vals = vals_;}

    // Broadcast one quantized value into all lanes.
    Vectorized(const c10::qint32& val) {
        value_type uw = val.val_;
        vals = _mm256_set1_epi32(uw);
    }

    // Store `count` lanes to `ptr` (unaligned); partial stores use memcpy.
    void store(void* ptr, int count = size()) const {
      if (count != size()) {
        memcpy(ptr, &vals, count * sizeof(value_type));
      } else {
        _mm256_storeu_si256((__m256i*)ptr, vals);
      }
    }

    static Vectorized<c10::qint32> loadu(const void* ptr) {
        return Vectorized<c10::qint32>(ptr);
    }

    // Load `count` lanes from `ptr`; lanes past `count` are zero-filled so
    // the result is deterministic.
    static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
        __at_align__ value_type tmp_values[size()];
        for (const auto i : c10::irange(size())) {
          tmp_values[i] = 0;
        }
        std::memcpy(
            tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
        return _mm256_loadu_si256((const __m256i*)tmp_values);
    }

    // Dequantize: scale * float(val) + scale_zp_premul.  The middle
    // (zero_point) argument is unused; its contribution is expected to be
    // pre-folded into scale_zp_premul by the caller -- confirm at call sites.
    float_vec_return_type dequantize(
        Vectorized<float> scale,
        Vectorized<float> /*zero_point*/,
        Vectorized<float> scale_zp_premul) const {
      __m256 float_vals = _mm256_cvtepi32_ps(vals);
      return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
    }

    // Quantize one vector of 8 floats via the scalar reference routine
    // (at::native::quantize_vec).  inverse_scale is unused here.
    static Vectorized<c10::qint32> quantize(
        const float_vec_return_type& rhs,
        float scale,
        int32_t zero_point,
        float /*inverse_scale*/) {
      Vectorized<c10::qint32> retval;
      auto rhs_data = (__m256)rhs[0];
      at::native::quantize_vec<c10::qint32, 32>(
          scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 8);
      return retval;
    }

    // Lane-wise signed maximum.
    Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
      return _mm256_max_epi32(vals, b.vals);
    }

    // Lane-wise signed minimum.
    Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
      return _mm256_min_epi32(vals, b.vals);
    }

    // ReLU in the quantized domain: clamp below at the zero-point lanes.
    Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
        return maximum(zero_point);
    }

    // ReLU6: clamp lanes to [zero_point, q_six].
    Vectorized<c10::qint32> relu6(
        Vectorized<c10::qint32> zero_point,
        Vectorized<c10::qint32> q_six) {
      return _mm256_min_epi32(
          _mm256_max_epi32(vals, zero_point.vals), q_six.vals);
    }

    // Lane-wise (this - b); already 32-bit, so no widening is required.
    int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
      return {_mm256_sub_epi32(vals, b)};
    }

    // Requantize: round(inp * multiplier) + zero_point, lane-wise.
    static Vectorized<c10::qint32> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m256 multiplier_v = _mm256_set1_ps(multiplier);
      __m256i zero_point_v = _mm256_set1_epi32(zero_point);

      __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v);
      __m256i rounded = _mm256_cvtps_epi32(scaled);
      return _mm256_add_epi32(rounded, zero_point_v);
    }

 private:
    // Unaligned load from memory; private -- use loadu() instead.
    Vectorized(const void* ptr) {
      vals = _mm256_loadu_si256((const __m256i*)ptr);
    }
};
|
|
|
// Free-function maximum, dispatching to the member implementation.
template <>
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}
|
|
|
// Lane-wise int32 multiply (keeps the low 32 bits of each product).
template <>
Vectorized<c10::qint32> inline operator*(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  return _mm256_mullo_epi32(a, b);
}
|
|
|
// Lane-wise int32 addition (wrapping, no saturation).
template <>
Vectorized<c10::qint32> inline operator+(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  return _mm256_add_epi32(a, b);
}
|
|
|
|
|
|
|
|
|
// Requantize four vectors of 8 x int32 into one vector of 32 x (u)int8:
//   out = clamp(round(inp * multiplier) + zp, T's min, T's max)
// T selects signed (int8_t) vs. unsigned (uint8_t) output.
template <typename T>
__m256i RequantizeAvx2(
    const std::array<Vectorized<c10::qint32>, 4>& inp,
    __m256 multiplier,
    __m256i zp) {
  static_assert(
      std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
      "Only int8_t/uint8_t are supported");
  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();
  // Restores element order after the 128-bit-lane-wise pack instructions.
  __m256i permute_mask_v =
      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
  // Scale each int32 vector by the requantization multiplier.
  __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier);
  __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[1]), multiplier);
  __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[2]), multiplier);
  __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[3]), multiplier);

  // Round back to int32.
  __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
  __m256i y_rounded_v = _mm256_cvtps_epi32(y_scaled_v);
  __m256i z_rounded_v = _mm256_cvtps_epi32(z_scaled_v);
  __m256i w_rounded_v = _mm256_cvtps_epi32(w_scaled_v);

  // Shift by the zero point.
  __m256i x_v = _mm256_add_epi32(x_rounded_v, zp);
  __m256i y_v = _mm256_add_epi32(y_rounded_v, zp);
  __m256i z_v = _mm256_add_epi32(z_rounded_v, zp);
  __m256i w_v = _mm256_add_epi32(w_rounded_v, zp);

  // Pack int32 -> int16 with saturation.
  __m256i xy_packed_v = _mm256_packs_epi32(x_v, y_v);
  __m256i zw_packed_v = _mm256_packs_epi32(z_v, w_v);

  // Pack int16 -> 8-bit and clamp to [min_val, max_val].
  __m256i xyzw_clamped_v =
      pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);

  // The packs interleave per 128-bit lane; put the bytes back in order.
  xyzw_clamped_v = _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
  return xyzw_clamped_v;
}
|
|
|
// AVX2 vector of 32 x 8-bit signed quantized integers (c10::qint8).
template<>
struct Vectorized<c10::qint8> : public Vectorizedqi {
    // 32 x int8 lanes per 256-bit register.
    static constexpr int size() {
        return 32;
    }

    // 32 elements span four 8-wide float vectors / int32 vectors.
    static constexpr int float_num_vecs() {
        return 4;
    }

    static constexpr int int_num_vecs() {
        return 4;
    }

    using float_vec_return_type = std::array<Vectorized<float>, 4>;
    using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
    using value_type = typename c10::qint8::underlying;

 public:
    using Vectorizedqi::Vectorizedqi;

    Vectorized() {}
    Vectorized(__m256i vals_) { vals = vals_;}

    // Broadcast one quantized value into all 32 lanes.
    Vectorized(const c10::qint8& val) {
        value_type uw = val.val_;
        vals = _mm256_set1_epi8(uw);
    }

    // User-declared copy constructor; clang's -Wdeprecated-copy is suppressed
    // around the declaration.
    C10_CLANG_DIAGNOSTIC_PUSH()
    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
    #endif
    Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) { }
    C10_CLANG_DIAGNOSTIC_POP()

    // Store `count` lanes to `ptr` (unaligned); partial stores use memcpy.
    void store(void* ptr, int count = size()) const {
        if (count != size()) {
            memcpy(ptr, &vals, count * sizeof(value_type));
        } else {
            _mm256_storeu_si256((__m256i*)ptr, vals);
        }
    }

    static Vectorized<c10::qint8> loadu(const void* ptr) {
        return Vectorized<c10::qint8>(ptr);
    }

    // Load `count` lanes from `ptr`; lanes past `count` are zero-filled so
    // the result is deterministic.
    static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
        __at_align__ value_type tmp_values[size()];
        for (const auto i : c10::irange(size())) {
          tmp_values[i] = 0;
        }
        std::memcpy(
            tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
        return _mm256_loadu_si256((const __m256i*)tmp_values);
    }

 private:
    // Sign-extend the low 8 bytes of epi8_vals to 8 x int32.
    __m256i cvtepi8_epi32(__m128i epi8_vals) const {
        return _mm256_cvtepi8_epi32(epi8_vals);
    }

 public:
  // Dequantize all 32 lanes into four float vectors:
  //   out = scale * float(val) + scale_neg_zp_premul
  // The middle (zero_point) argument is unused; its contribution is expected
  // to be pre-folded into scale_neg_zp_premul by the caller.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> /*zero_point*/,
      Vectorized<float> scale_neg_zp_premul) const {
    // Split the 32 bytes into four 8-byte groups.
    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));

    // Sign-extend each group to int32 and convert to float.
    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val0));
    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val1));
    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val2));
    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_neg_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_neg_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_neg_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_neg_zp_premul);
    return {val0, val1, val2, val3};
  }

  // Quantize 32 floats (4 float vectors) using the AVX2 kernel above.
  // The scale argument is unused; inverse_scale is what the kernel needs.
  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float /*scale*/,
      int32_t zero_point,
      float inverse_scale) {
    auto* rhs_data = (float*)rhs.data();
    int8_t quantized_values[32];
    QuantizeAvx2<c10::qint8>(
        rhs_data, quantized_values, 32, inverse_scale, zero_point);
    return Vectorized<c10::qint8>::loadu(quantized_values);
  }

  // Lane-wise signed byte maximum.
  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
      return _mm256_max_epi8(vals, b.vals);
    }

  // Lane-wise signed byte minimum.
  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
      return _mm256_min_epi8(vals, b.vals);
    }

    // ReLU in the quantized domain: clamp below at the zero-point lanes.
    Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
        return maximum(zero_point);
    }

    // ReLU6: clamp lanes to [zero_point, q_six].
    Vectorized<c10::qint8> relu6(
        Vectorized<c10::qint8> zero_point,
        Vectorized<c10::qint8> q_six) {
      return _mm256_min_epi8(
          _mm256_max_epi8(vals, zero_point.vals), q_six.vals);
    }

    // Widen both operands to int32 and subtract lane-wise, producing four
    // int32 vectors covering all 32 byte lanes.
    int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
      __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
      __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
      __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
      __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));

      __m256i int32_val0 = cvtepi8_epi32(int_val0);
      __m256i int32_val1 = cvtepi8_epi32(int_val1);
      __m256i int32_val2 = cvtepi8_epi32(int_val2);
      __m256i int32_val3 = cvtepi8_epi32(int_val3);

      __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
      __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
      __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
      __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));

      __m256i int32_b0 = cvtepi8_epi32(int_b0);
      __m256i int32_b1 = cvtepi8_epi32(int_b1);
      __m256i int32_b2 = cvtepi8_epi32(int_b2);
      __m256i int32_b3 = cvtepi8_epi32(int_b3);

      __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0);
      __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1);
      __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2);
      __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3);

      return {Vectorized<c10::qint32>(res_0),
              Vectorized<c10::qint32>(res_1),
              Vectorized<c10::qint32>(res_2),
              Vectorized<c10::qint32>(res_3)};
    }

    // Requantize four int32 vectors back to 32 x int8 via RequantizeAvx2.
    static Vectorized<c10::qint8> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m256 multiplier_v = _mm256_set1_ps(multiplier);
      __m256i zero_point_v = _mm256_set1_epi32(zero_point);
      return RequantizeAvx2<value_type>(inp, multiplier_v, zero_point_v);
    }

 private:
    // Unaligned load from memory; private -- use loadu() instead.
    Vectorized(const void* ptr) {
        vals = _mm256_loadu_si256((const __m256i*)ptr);
    }
};
|
|
|
// Free-function maximum, dispatching to the member implementation.
template <>
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}
|
|
|
// AVX2 vector of 32 x 8-bit unsigned quantized integers (c10::quint8).
template<>
struct Vectorized<c10::quint8> : public Vectorizedqi {
    // 32 x uint8 lanes per 256-bit register.
    static constexpr int size() {
        return 32;
    }

    // 32 elements span four 8-wide float vectors / int32 vectors.
    static constexpr int float_num_vecs() {
        return 4;
    }

    static constexpr int int_num_vecs() {
        return 4;
    }

    using float_vec_return_type = std::array<Vectorized<float>, 4>;
    using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
    using value_type = typename c10::quint8::underlying;

 public:
    using Vectorizedqi::Vectorizedqi;
    Vectorized() {}

    Vectorized(__m256i vals_) { vals = vals_;}

    // Broadcast one quantized value into all 32 lanes.
    Vectorized(const c10::quint8& val) {
        value_type uw = val.val_;
        vals = _mm256_set1_epi8(uw);
    }

    // User-declared copy constructor; clang's -Wdeprecated-copy is suppressed
    // around the declaration.
    C10_CLANG_DIAGNOSTIC_PUSH()
    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
    #endif
    Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) { }
    C10_CLANG_DIAGNOSTIC_POP()

    // Store `count` lanes to `ptr` (unaligned); partial stores use memcpy.
    void store(void* ptr, int count = size()) const {
        if (count != size()) {
            memcpy(ptr, &vals, count * sizeof(value_type));
        } else {
            _mm256_storeu_si256((__m256i*)ptr, vals);
        }
    }

    static Vectorized<c10::quint8> loadu(const void* ptr) {
        return Vectorized<c10::quint8>(ptr);
    }

    // Load `count` lanes from `ptr`; lanes past `count` are zero-filled so
    // the result is deterministic.
    static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
        __at_align__ value_type tmp_values[size()];
        for (const auto i : c10::irange(size())) {
          tmp_values[i] = 0;
        }
        std::memcpy(
            tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
        return _mm256_loadu_si256((const __m256i*)tmp_values);
    }

 private:
    // Zero-extend the low 8 bytes of epu8_vals to 8 x int32.
    __m256i cvtepu8_epi32(__m128i epu8_vals) const {
        return _mm256_cvtepu8_epi32(epu8_vals);
    }

 public:
  // Dequantize all 32 lanes into four float vectors:
  //   out = scale * float(val) + scale_zp_premul
  // The middle (zero_point) argument is unused; its contribution is expected
  // to be pre-folded into scale_zp_premul by the caller.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> /*zero_point*/,
      Vectorized<float> scale_zp_premul) const {
    // Split the 32 bytes into four 8-byte groups.
    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));

    // Zero-extend each group to int32 and convert to float.
    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0));
    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1));
    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2));
    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_zp_premul);
    return {val0, val1, val2, val3};
  }

  // Quantize 32 floats (4 float vectors) using the AVX2 kernel above.
  // The scale argument is unused; inverse_scale is what the kernel needs.
  static Vectorized<c10::quint8> quantize(
      const float_vec_return_type& rhs,
      float /*scale*/,
      int32_t zero_point,
      float inverse_scale) {
    auto* rhs_data = (float*)rhs.data();
    uint8_t quantized_values[32];
    QuantizeAvx2<c10::quint8>(
        rhs_data, quantized_values, 32, inverse_scale, zero_point);
    return Vectorized<c10::quint8>::loadu(quantized_values);
  }

  // Lane-wise unsigned byte maximum.
  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
      return _mm256_max_epu8(vals, b.vals);
    }

  // Lane-wise unsigned byte minimum.
  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
      return _mm256_min_epu8(vals, b.vals);
    }

    // ReLU in the quantized domain: clamp below at the zero-point lanes.
    Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
        return maximum(zero_point);
    }

    // ReLU6: clamp lanes to [zero_point, q_six].
    Vectorized<c10::quint8> relu6(
        Vectorized<c10::quint8> zero_point,
        Vectorized<c10::quint8> q_six) {
      return _mm256_min_epu8(
          _mm256_max_epu8(vals, zero_point.vals), q_six.vals);
    }

    // Widen both operands to int32 and subtract lane-wise, producing four
    // int32 vectors covering all 32 byte lanes.
    int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
      __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
      __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
      __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
      __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));

      __m256i int32_val0 = cvtepu8_epi32(int_val0);
      __m256i int32_val1 = cvtepu8_epi32(int_val1);
      __m256i int32_val2 = cvtepu8_epi32(int_val2);
      __m256i int32_val3 = cvtepu8_epi32(int_val3);

      __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
      __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
      __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
      __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));

      __m256i int32_b0 = cvtepu8_epi32(int_b0);
      __m256i int32_b1 = cvtepu8_epi32(int_b1);
      __m256i int32_b2 = cvtepu8_epi32(int_b2);
      __m256i int32_b3 = cvtepu8_epi32(int_b3);

      __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0);
      __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1);
      __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2);
      __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3);
      return {Vectorized<c10::qint32>(res_0),
              Vectorized<c10::qint32>(res_1),
              Vectorized<c10::qint32>(res_2),
              Vectorized<c10::qint32>(res_3)};
    }

    // Requantize four int32 vectors back to 32 x uint8 via RequantizeAvx2.
    static Vectorized<c10::quint8> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m256 multiplier_v = _mm256_set1_ps(multiplier);
      __m256i zero_point_v = _mm256_set1_epi32(zero_point);
      return RequantizeAvx2<value_type>(inp, multiplier_v, zero_point_v);
    }

 private:
    // Unaligned load from memory; private -- use loadu() instead.
    Vectorized(const void* ptr) {
        vals = _mm256_loadu_si256((const __m256i*)ptr);
    }
};
|
|
|
// Free-function maximum, dispatching to the member implementation.
template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}
|
|
|
#else |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Scalar fallback used when AVX2 is unavailable (or under MSVC): emulates the
// quantized Vectorized<> interface on a plain std::array of `size_`
// underlying values.  T is the quantized type (c10::qint8 etc.).
template <
    typename T,
    typename float_vec_return_type_,
    typename int_vec_return_type_,
    int size_>
struct VectorizedQuantizedConverter {
  static constexpr int size() {
    return size_;
  }

  // Number of 8-wide Vectorized<float> needed to cover all elements.
  static constexpr int float_num_vecs() {
    return size() / 8;
  }

  static constexpr int int_num_vecs() {
    return size() / 8;
  }

  using float_vec_return_type = float_vec_return_type_;
  using int_vec_return_type = int_vec_return_type_;

  using value_type = typename T::underlying;
  std::array<value_type, size_> vals;

  // Broadcast one quantized value into every slot.
  VectorizedQuantizedConverter(T val) {
    for (const auto i : c10::irange(size())) {
      vals[i] = val.val_;
    }
  }

  // Load size() underlying values from raw memory.
  VectorizedQuantizedConverter(const void* ptr) {
    memcpy(vals.data(), ptr, sizeof(value_type) * size());
  }

  // Store the first `count` values to `ptr`.
  void store(void* ptr, int count = size()) const {
    memcpy(ptr, vals.data(), count * sizeof(value_type));
  }

  // Dequantize element-wise via the scalar reference routine, 8 floats per
  // output vector.  The third (premul) argument is unused in this fallback.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> /*scale_zp_premul*/) const {
    float_vec_return_type rv;
    for (const auto i : c10::irange(float_num_vecs())) {
      float tmp_vals[8];
      for (const auto j : c10::irange(8)) {
        tmp_vals[j] = at::native::dequantize_val<T>(
            scale[j], zero_point[j], T(vals[8 * i + j]));
      }
      rv[i] = Vectorized<float>(tmp_vals[0],
          tmp_vals[1],
          tmp_vals[2],
          tmp_vals[3],
          tmp_vals[4],
          tmp_vals[5],
          tmp_vals[6],
          tmp_vals[7]);
    }
    return rv;
  }

 protected:
  VectorizedQuantizedConverter() {}
};
|
|
|
// Scalar (non-AVX2) vector of 8 x 32-bit quantized integers.
template <>
struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
                                 c10::qint32,
                                 std::array<Vectorized<float>, 1>,
                                 std::array<Vectorized<c10::qint32>, 1>,
                                 8> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            8>() {}
  // Broadcast a single quantized value.
  Vectorized(c10::qint32 val)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            8>(val) {}
  // Load 8 values from raw memory.
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            8>(ptr) {}

  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  // Load `count` values; values past `count` are zero-filled so the result
  // is deterministic.
  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return Vectorized<c10::qint32>(tmp_values);
  }

  // Quantize one vector of 8 floats via the scalar reference routine.
  // The trailing (inverse_scale) argument is unused here.
  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float /*inverse_scale*/) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 8> float_vals;

    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 8], 8);
    }

    at::native::quantize_vec<c10::qint32, 32>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint32*)qvals.data(),
        8 * float_num_vecs());

    return Vectorized<c10::qint32>::loadu(qvals.data());
  }

  // Element-wise maximum.
  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // Element-wise minimum.
  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // ReLU in the quantized domain: clamp below at the zero-point values.
  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  // ReLU6: clamp each element to [zero_point, q_six].
  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  // Element-wise (this - b); qint32 has a single int vector
  // (int_num_vecs() == 1), hence retval[0].
  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    int_vec_return_type retval;
    for (const auto i : c10::irange(size())) {
      retval[0].vals[i] = vals[i] - b.vals[i];
    }
    return retval;
  }

  // Requantize: round(inp * multiplier) + zero_point, element-wise.
  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] =
          nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
          zero_point;
    }
    return retval;
  }
};
|
|
|
// Free-function maximum, dispatching to the member implementation.
template <>
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}
|
|
|
template <> |
|
Vectorized<c10::qint32> inline operator*( |
|
const Vectorized<c10::qint32>& a, |
|
const Vectorized<c10::qint32>& b) { |
|
Vectorized<c10::qint32> retval; |
|
for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) { |
|
retval.vals[i] = a.vals[i] * b.vals[i]; |
|
} |
|
return retval; |
|
} |
|
|
|
template <> |
|
Vectorized<c10::qint32> inline operator+( |
|
const Vectorized<c10::qint32>& a, |
|
const Vectorized<c10::qint32>& b) { |
|
Vectorized<c10::qint32> retval; |
|
for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) { |
|
retval.vals[i] = a.vals[i] + b.vals[i]; |
|
} |
|
return retval; |
|
} |
|
|
|
// Scalar (non-AVX2) vector of 32 x 8-bit signed quantized integers.
template <>
struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
                                c10::qint8,
                                std::array<Vectorized<float>, 4>,
                                std::array<Vectorized<c10::qint32>, 4>,
                                32> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            32>() {}
  // Broadcast a single quantized value.
  Vectorized(c10::qint8 val)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            32>(val) {}
  // Load 32 values from raw memory.
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            32>(ptr) {}

  static Vectorized<c10::qint8> loadu(const void* ptr) {
    return Vectorized<c10::qint8>(ptr);
  }

  // Load `count` values; values past `count` are zero-filled so the result
  // is deterministic.
  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return Vectorized<c10::qint8>(tmp_values);
  }

  // Quantize 32 floats via the scalar reference routine.  The trailing
  // (inverse_scale) argument is unused here.
  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float /*inverse_scale*/) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 8> float_vals;

    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 8], 8);
    }

    at::native::quantize_vec<c10::qint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint8*)qvals.data(),
        8 * float_num_vecs());

    return Vectorized<c10::qint8>::loadu(qvals.data());
  }

  // Element-wise maximum.
  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // Element-wise minimum.
  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // ReLU in the quantized domain: clamp below at the zero-point values.
  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
    return maximum(zero_point);
  }

  // ReLU6: clamp each element to [zero_point, q_six].
  Vectorized<c10::qint8> relu6(
      Vectorized<c10::qint8> zero_point,
      Vectorized<c10::qint8> q_six) {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  // Widen to int32 and subtract, filling int_num_vecs() vectors of
  // elem_per_int_vec results each.
  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }
  // Requantize: clamp(round(inp * multiplier) + zero_point) back to the
  // 8-bit range, element-wise.
  static Vectorized<c10::qint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        int32_t rounded =
            nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};
|
|
|
// Free-function specialization of maximum() for qint8, forwarding to the
// member-wise implementation.
template <>
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}
|
|
|
template <> |
|
struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter< |
|
c10::quint8, |
|
std::array<Vectorized<float>, 4>, |
|
std::array<Vectorized<c10::qint32>, 4>, |
|
32> { |
|
Vectorized() |
|
: VectorizedQuantizedConverter< |
|
c10::quint8, |
|
std::array<Vectorized<float>, 4>, |
|
std::array<Vectorized<c10::qint32>, 4>, |
|
32>() {} |
|
Vectorized(c10::quint8 val) |
|
: VectorizedQuantizedConverter< |
|
c10::quint8, |
|
std::array<Vectorized<float>, 4>, |
|
std::array<Vectorized<c10::qint32>, 4>, |
|
32>(val) {} |
|
Vectorized(const void* ptr) |
|
: VectorizedQuantizedConverter< |
|
c10::quint8, |
|
std::array<Vectorized<float>, 4>, |
|
std::array<Vectorized<c10::qint32>, 4>, |
|
32>(ptr) {} |
|
|
|
static Vectorized<c10::quint8> loadu(const void* ptr) { |
|
return Vectorized<c10::quint8>(ptr); |
|
} |
|
|
|
static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) { |
|
__at_align__ value_type tmp_values[size()]; |
|
|
|
|
|
|
|
for (const auto i : c10::irange(size())) { |
|
tmp_values[i] = 0; |
|
} |
|
std::memcpy( |
|
tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type)); |
|
return Vectorized<c10::quint8>(tmp_values); |
|
} |
|
|
|
static Vectorized<c10::quint8> quantize( |
|
const float_vec_return_type& rhs, |
|
float scale, |
|
int32_t zero_point, |
|
float ) { |
|
std::array<value_type, size()> qvals; |
|
std::array<float, float_num_vecs() * 8> float_vals; |
|
|
|
for (const auto i : c10::irange(float_num_vecs())) { |
|
rhs[i].store(&float_vals[i * 8], 8); |
|
} |
|
|
|
at::native::quantize_vec<c10::quint8>( |
|
scale, |
|
zero_point, |
|
float_vals.data(), |
|
(c10::quint8*)qvals.data(), |
|
8 * float_num_vecs()); |
|
|
|
return Vectorized<c10::quint8>::loadu(qvals.data()); |
|
} |
|
|
|
Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const { |
|
Vectorized<c10::quint8> retval; |
|
for (const auto i : c10::irange(size())) { |
|
retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]); |
|
} |
|
return retval; |
|
} |
|
|
|
Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const { |
|
Vectorized<c10::quint8> retval; |
|
for (const auto i : c10::irange(size())) { |
|
retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]); |
|
} |
|
return retval; |
|
} |
|
|
|
Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const { |
|
return maximum(zero_point); |
|
} |
|
|
|
|
|
Vectorized<c10::quint8> relu6( |
|
Vectorized<c10::quint8> zero_point, |
|
Vectorized<c10::quint8> q_six) { |
|
Vectorized<c10::quint8> retval; |
|
for (const auto i : c10::irange(size())) { |
|
retval.vals[i] = std::min<value_type>( |
|
std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]); |
|
} |
|
return retval; |
|
} |
|
|
|
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const { |
|
int_vec_return_type retval; |
|
constexpr int elem_per_int_vec = size() / int_num_vecs(); |
|
for (const auto i : c10::irange(int_num_vecs())) { |
|
for (const auto j : c10::irange(elem_per_int_vec)) { |
|
retval[i].vals[j] = |
|
static_cast<int32_t>(vals[i * elem_per_int_vec + j]) - |
|
static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]); |
|
} |
|
} |
|
return retval; |
|
} |
|
static Vectorized<c10::quint8> requantize_from_int( |
|
const int_vec_return_type& inp, |
|
float multiplier, |
|
int32_t zero_point) { |
|
constexpr int elem_per_int_vec = size() / int_num_vecs(); |
|
constexpr auto min_val = std::numeric_limits<value_type>::min(); |
|
constexpr auto max_val = std::numeric_limits<value_type>::max(); |
|
Vectorized<c10::quint8> retval; |
|
for (const auto i : c10::irange(int_num_vecs())) { |
|
for (const auto j : c10::irange(elem_per_int_vec)) { |
|
int32_t rounded = |
|
nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) + |
|
zero_point; |
|
retval.vals[i * elem_per_int_vec + j] = |
|
std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val); |
|
} |
|
} |
|
return retval; |
|
} |
|
}; |
|
|
|
// Free-function specialization of maximum() for quint8, forwarding to the
// member-wise implementation.
template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}
|
|
|
#endif |
|
}}} |
|
|