#ifndef LYRA_CODEC_SPARSE_MATMUL_NUMERICS_FAST_TRANSCENDENTALS_H_ |
|
#define LYRA_CODEC_SPARSE_MATMUL_NUMERICS_FAST_TRANSCENDENTALS_H_ |
|
|
|
#include <algorithm>
#include <cstdint>
#include <cstring>

#if defined __ARM_NEON || defined __aarch64__
#include <arm_neon.h>
#endif
|
#if defined __AVX__ || defined __AVX2__ |
|
#include <immintrin.h> |
|
#endif |
|
#include <math.h> |
|
|
|
#include "sparse_matmul/numerics/fixed_types.h" |
|
#include "sparse_matmul/numerics/type_utils.h" |
|
|
|
namespace csrblocksparse { |
|
// Clipping bounds applied to exp() inputs: e^80 is ~5.5e34, so clamping to
// [-80, 80] keeps the approximations well inside float range.
constexpr float kMaxExpInput = 80.f; |
|
constexpr int kMaxExpInputInt = static_cast<int>(kMaxExpInput); |
|
constexpr float kMinExpInput = -80.f; |
|
// tanh is indistinguishable from +/-1 in float precision beyond +/-9.
constexpr float kMaxTanhInput = 9.f; |
|
constexpr float kMinTanhInput = -9.f; |
|
// sigmoid saturates to 0/1 in float precision beyond +/-18.
constexpr float kMaxSigmoidInput = 18.f; |
|
constexpr float kMinSigmoidInput = -18.f; |
|
// Scale and offset of the Schraudolph-style exp bit trick used by fast_exp:
// kAConstant is the float bit pattern of 2^23 / ln(2), and kBConstant the
// float bit pattern of approximately (127 << 23) - 366000, the biased
// exponent offset with an empirical correction.
constexpr uint32_t kAConstant = 0x4b38aa3b; |
|
|
|
constexpr uint32_t kBConstant = 0x4e7de9a9; |
|
// Rational approximation to tanh: numerator coefficients (odd powers).
constexpr float kTanhAlpha1 = 4.89352455891786e-03; |
|
constexpr float kTanhAlpha3 = 6.37261928875436e-04; |
|
constexpr float kTanhAlpha5 = 1.48572235717979e-05; |
|
constexpr float kTanhAlpha7 = 5.12229709037114e-08; |
|
constexpr float kTanhAlpha9 = -8.60467152213735e-11; |
|
constexpr float kTanhAlpha11 = 2.00018790482477e-13; |
|
constexpr float kTanhAlpha13 = -2.76076847742355e-16; |
|
// Rational approximation to tanh: denominator coefficients (even powers).
constexpr float kTanhBeta0 = 4.89352518554385e-03; |
|
constexpr float kTanhBeta2 = 2.26843463243900e-03; |
|
constexpr float kTanhBeta4 = 1.18534705686654e-04; |
|
constexpr float kTanhBeta6 = 1.19825839466702e-06; |
|
// Rational approximation to sigmoid(x) - 0.5: numerator coefficients
// (odd powers).
constexpr float kSigmoidAlpha1 = 2.48287947061529e-01; |
|
constexpr float kSigmoidAlpha3 = 8.51377133304701e-03; |
|
constexpr float kSigmoidAlpha5 = 6.08574864600143e-05; |
|
constexpr float kSigmoidAlpha7 = 1.15627324459942e-07; |
|
constexpr float kSigmoidAlpha9 = 4.37031012579801e-11; |
|
// Rational approximation to sigmoid: denominator coefficients (even powers).
constexpr float kSigmoidBeta0 = 9.93151921023180e-01; |
|
constexpr float kSigmoidBeta2 = 1.16817656904453e-01; |
|
constexpr float kSigmoidBeta4 = 1.70198817374094e-03; |
|
constexpr float kSigmoidBeta6 = 6.29106785017040e-06; |
|
constexpr float kSigmoidBeta8 = 5.76102136993427e-09; |
|
constexpr float kSigmoidBeta10 = 6.10247389755681e-13; |
|
// For |x| below these thresholds, tanh and sigmoid are well approximated by
// a linear fit: x for tanh, and 0.245 x + 0.5 for sigmoid.
constexpr float kTanhLinearRegion = .15f; |
|
constexpr float kSigmoidLinearRegion = .75f; |
|
// Fixed-point representations of 1 / ln(2), used to turn e^x into
// 2^(x / ln 2).
constexpr int kMaxLog2Shift = 30; |
|
static const int kLogFactor = static_cast<int>((1 << kMaxLog2Shift) / log(2.f)); |
|
static const float kOneOverLog2 = 1.0f / log(2.f); |
|
// IEEE-754 single-precision layout constants.
constexpr int kFloatMantissaBits = 23; |
|
// Exponent bias (127) shifted into the exponent field.
constexpr int kFloatExponentOffset = 127 << kFloatMantissaBits; |
|
// Mask of the 23 mantissa bits.
constexpr int kFloatMantissaMask = (1 << kFloatMantissaBits) - 1; |
|
// Mask of the sign and exponent bits.
constexpr int kFloatExponentMask = (-1) ^ kFloatMantissaMask; |
|
// Coefficients of the polynomial corrections to 2^x used by float32_pow2 and
// fixed32_exp below. With f the fractional part of the exponent, the float
// mantissa is approximated as f + (f^2 - f) * P(f), where P is the quartic
// kExpQuarticFactor2 f^2 + kExpQuarticFactor1 f + kExpQuarticFactor0.
constexpr float kExpQuarticFactor2 = 0.0135302434f; |
|
constexpr float kExpQuarticFactor1 = 0.0656107542f; |
|
constexpr float kExpQuarticFactor0 = 0.306963906f; |
|
// Coefficients of the cheaper cubic correction (TM_ORDER3_16BIT), with
// P(f) = kExpCubicFactor1 f + kExpCubicFactor0.
constexpr float kExpCubicFactor1 = 0.0780252018f; |
|
constexpr float kExpCubicFactor0 = 0.304684167f; |
|
// Selects the polynomial order and arithmetic type used by the fixed-point
// transcendental implementations below.
enum TranscendentalMode { |
|
  // Cubic polynomial, 16-bit integer arithmetic.
TM_ORDER3_16BIT, |
|
  // Quartic polynomial, 16-bit integer arithmetic.
TM_ORDER4_16BIT, |
|
  // Quartic polynomial, float arithmetic.
TM_ORDER4_FLOAT, |
|
}; |
|
// Converts a float in [-1, 1) to Q15 fixed point, with rounding.
inline int FloatAsInt16(float x) { |
|
return static_cast<int>(x * (1 << 15) + 0.5f); |
|
} |
|
// Converts a float in [-1, 1) to Q30 fixed point, with rounding.
inline int FloatAsInt32(float x) { |
|
return static_cast<int>(x * (1 << 30) + 0.5f); |
|
} |
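
// Illustrative examples: FloatAsInt16(0.25f) == 8192 (0.25 in Q15) and
// FloatAsInt32(0.25f) == (1 << 28) (0.25 in Q30).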
|
|
|
#if defined __ARM_NEON || defined __aarch64__ |
|
// Integer clip bound for the fixed-point sigmoid input.
constexpr int kMaxSigmoidInputInt = static_cast<int>(kMaxSigmoidInput); |
|
// Computes 2^x of each lane of x, where x is a fixed-point value with
// kFloatMantissaBits fractional bits presented in a float register. The
// integer part of x lands in the float's exponent field; the fractional part
// is refined with a quartic polynomial evaluated in Q15/Q30 arithmetic.
inline float32x4_t float32_pow2(float32x4_t x) { |
|
  // x is already scaled by 2^kFloatMantissaBits, so converting to int yields
  // a Q23 fixed-point value whose bits can be manipulated directly.
int32x4_t exp_int_x = vcvtq_s32_f32(x); |
|
|
|
int32x4_t mantissa_mask16 = vdupq_n_s32(0x7fff00); |
|
|
|
int32x4_t mantissa_mask32 = vdupq_n_s32(0x7fffff); |
|
|
|
int16x4_t x_16 = vshrn_n_s32(vandq_s32(mantissa_mask16, exp_int_x), 8); |
|
|
|
int32x4_t x_32 = vshlq_n_s32(vandq_s32(mantissa_mask32, exp_int_x), 7); |
|
  // Evaluate the quartic correction P(f) = a f^2 + b f + c in Q30 using
  // 16x16 -> 32 bit multiply-accumulates.
int32x4_t x_squared = vmull_s16(x_16, x_16); |
|
int16x4_t b = vdup_n_s16(FloatAsInt16(kExpQuarticFactor1)); |
|
int32x4_t c = vdupq_n_s32(FloatAsInt32(kExpQuarticFactor0)); |
|
int32x4_t bx_plus_c = vmlal_s16(c, b, x_16); |
|
int16x4_t a = vdup_n_s16(FloatAsInt16(kExpQuarticFactor2)); |
|
|
|
int32x4_t result = vmlal_s16(bx_plus_c, a, vshrn_n_s32(x_squared, 15)); |
|
int32x4_t x_squared_minus_x = vsubq_s32(x_squared, x_32); |
|
  // Multiply by (f^2 - f); the doubling rounding high-half multiply turns
  // two Q30 operands into a Q29 result.
result = vqrdmulhq_s32(result, x_squared_minus_x); |
|
  // Scale from Q29 down to the Q23 position of the float mantissa.
result = vshrq_n_s32(result, 6); |
|
  // Add the exponent bias and the mantissa correction to form the result.
int32x4_t exp_offset = vdupq_n_s32(kFloatExponentOffset); |
|
exp_int_x = vaddq_s32(exp_int_x, exp_offset); |
|
exp_int_x = vaddq_s32(exp_int_x, result); |
|
|
|
|
|
return vreinterpretq_f32_s32(exp_int_x); |
|
} |
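
// Usage sketch (illustrative, assumes a NEON build): to compute 2^1.5 in
// every lane, present 1.5 scaled by 2^kFloatMantissaBits as the input:
//
//   float32x4_t x = vdupq_n_f32(1.5f * (1 << kFloatMantissaBits));
//   float32x4_t y = float32_pow2(x);  // each lane ~2.83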
|
// Computes e^x of each lane of x, a fixed-point value with mantissa_bits
// fractional bits that has already been clipped to a safe range and
// converted to float; reduces the problem to 2^(x / ln 2) for float32_pow2.
inline float32x4_t fixed32_exp_float_preclipped(const int mantissa_bits, |
|
float32x4_t x) { |
|
  // Scale by 2^(kFloatMantissaBits - mantissa_bits) / ln(2) so float32_pow2
  // sees x / ln(2) with kFloatMantissaBits fractional bits.
const int kXShift = kFloatMantissaBits - mantissa_bits; |
|
const float kLogFactor = static_cast<float>(1 << kXShift); |
|
float32x4_t factor = vdupq_n_f32(kLogFactor * kOneOverLog2); |
|
float32x4_t y = vmulq_f32(x, factor); |
|
|
|
return float32_pow2(y); |
|
} |
|
// Computes e^x of each lane of x, clipping inputs to
// [kMinExpInput, kMaxExpInput]. With FAST_TRANSCENDENTALS on ARMv8 this uses
// the float bit trick y = as_float(A * x + B); otherwise it falls back to
// expf per lane.
inline float32x4_t fast_exp(float32x4_t x) { |
|
#if defined FAST_TRANSCENDENTALS && __ARM_ARCH >= 800 |
|
  // Bit patterns of the exp scale (2^23 / ln 2) and offset, loaded as floats.
float32x4_t A = vreinterpretq_f32_u32(vdupq_n_u32(kAConstant)); |
|
float32x4_t res = vreinterpretq_f32_u32(vdupq_n_u32(kBConstant)); |
|
|
|
|
|
x = vminq_f32(x, vdupq_n_f32(kMaxExpInput)); |
|
x = vmaxq_f32(x, vdupq_n_f32(kMinExpInput)); |
|
  // A * x + B in float; rounded to int, the result is the bit pattern of an
  // approximation to e^x.
res = vmlaq_f32(res, A, x); |
|
|
|
|
|
|
|
int32x4_t x_int = vcvtnq_s32_f32(res); |
|
|
|
return vreinterpretq_f32_s32(x_int); |
|
#else |
|
float32x4_t return_val = vdupq_n_f32(0.f); |
|
  // Fallback: expf on each lane.
float exponent = expf(vgetq_lane_f32(x, 0)); |
|
return_val = vld1q_lane_f32(&exponent, return_val, 0); |
|
|
|
exponent = expf(vgetq_lane_f32(x, 1)); |
|
return_val = vld1q_lane_f32(&exponent, return_val, 1); |
|
exponent = expf(vgetq_lane_f32(x, 2)); |
|
return_val = vld1q_lane_f32(&exponent, return_val, 2); |
|
exponent = expf(vgetq_lane_f32(x, 3)); |
|
return_val = vld1q_lane_f32(&exponent, return_val, 3); |
|
|
|
return return_val; |
|
#endif |
|
} |
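
// Usage sketch (illustrative, assumes a NEON build; lane values are
// approximate, within a few percent under FAST_TRANSCENDENTALS):
//
//   float in[4] = {0.f, 1.f, -1.f, 2.f};
//   float32x4_t v = vld1q_f32(in);
//   float32x4_t e = fast_exp(v);  // lanes ~ {1.0, 2.7, 0.37, 7.4}
//   vst1q_f32(in, e);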
|
// Computes e^x of each lane of x, a raw fixed32<ExponentBits> value, by
// converting to float first.
template <int ExponentBits> |
|
inline float32x4_t fast_exp(int32x4_t x) { |
|
return fast_exp(vcvtq_n_f32_s32(x, 31 - ExponentBits)); |
|
} |
|
// As fast_exp, but applies the bit trick directly to the fixed-point input,
// skipping the int-to-float conversion.
template <int ExponentBits> |
|
inline float32x4_t fast_exp_fixed(int32x4_t x) { |
|
static_assert(ExponentBits > 8, "Must have more than 8 ExponentBits"); |
|
constexpr int kA = 1.4426950408889634 * (1 << (ExponentBits - 8)); |
|
constexpr int kB = (127 << 23) - 366000; |
|
  // Clip bounds of +/-80 in the input's fixed-point scale.
constexpr int maxInput = 80 << (31 - ExponentBits); |
|
constexpr int minInput = -maxInput; |
|
|
|
int32x4_t A = vdupq_n_s32(kA); |
|
int32x4_t res = vdupq_n_s32(kB); |
|
|
|
|
|
x = vminq_s32(x, vdupq_n_s32(maxInput)); |
|
x = vmaxq_s32(x, vdupq_n_s32(minInput)); |
|
  // A * x + B lands directly on the float bit pattern of e^x.
res = vmlaq_s32(res, A, x); |
|
|
|
return vreinterpretq_f32_s32(res); |
|
} |
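
// Illustrative sketch (fixed32<11> is just an example instantiation): a
// fixed32<11> value has 20 mantissa bits, so the raw integer (1 << 20)
// represents 1.0 and every lane of the result is approximately e:
//
//   float32x4_t e = fast_exp_fixed<11>(vdupq_n_s32(1 << 20));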
|
|
|
|
|
#if __ARM_ARCH >= 800 |
|
namespace detail { |
|
// As fast_exp, but without input clipping; callers must pre-clip x.
inline float32x4_t fast_exp_norange_check(float32x4_t x) { |
|
float32x4_t A = vreinterpretq_f32_u32(vdupq_n_u32(kAConstant)); |
|
float32x4_t res = vreinterpretq_f32_u32(vdupq_n_u32(kBConstant)); |
|
|
|
res = vmlaq_f32(res, A, x); |
|
|
|
int32x4_t x_int = vcvtnq_s32_f32(res); |
|
|
|
return vreinterpretq_f32_s32(x_int); |
|
} |
|
|
|
}  // namespace detail
|
#endif  // __ARM_ARCH >= 800
|
// Clamps each lane of x to [-kLimit, kLimit].
inline float32x4_t ClipToFloatBounds(const float kLimit, const float32x4_t x) { |
|
|
|
float32x4_t clip_limit = vdupq_n_f32(kLimit); |
|
float32x4_t clipped_x = vminq_f32(x, clip_limit); |
|
clip_limit = vnegq_f32(clip_limit); |
|
return vmaxq_f32(clipped_x, clip_limit); |
|
} |
|
// Computes tanh(x) as (e^2x - 1) / (e^2x + 1) via float32_pow2, switching to
// the Taylor series x - x^3/3 near zero, where it is more accurate.
inline float32x4_t float_tanh_float(const float32x4_t& x) { |
|
float32x4_t clipped_x = ClipToFloatBounds(kMaxTanhInput, x); |
|
  // Multiply by 2 / ln(2), scaled by 2^kFloatMantissaBits, so float32_pow2
  // produces e^(2x).
const float kLogFactor = static_cast<float>(1 << (kFloatMantissaBits + 1)); |
|
float32x4_t factor = vdupq_n_f32(kLogFactor * kOneOverLog2); |
|
clipped_x = vmulq_f32(clipped_x, factor); |
|
|
|
float32x4_t exp_result = float32_pow2(clipped_x); |
|
|
|
float32x4_t one = vdupq_n_f32(1.0f); |
|
float32x4_t numerator = vsubq_f32(exp_result, one); |
|
float32x4_t denominator = vaddq_f32(exp_result, one); |
|
float32x4_t recp = vrecpeq_f32(denominator); |
|
|
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, denominator)); |
|
recp = vmulq_f32(recp, numerator); |
|
  // Taylor series of tanh around 0: x - x^3/3.
float32x4_t third = vdupq_n_f32(1.0f / 3.0f); |
|
float32x4_t taylor = vmulq_f32(x, x); |
|
taylor = vmulq_f32(taylor, x); |
|
taylor = vmulq_f32(taylor, third); |
|
taylor = vsubq_f32(x, taylor); |
|
  // Use the Taylor result where |x| <= 1/9 (= (1/3)^2), else the exp form.
float32x4_t ninth = vmulq_f32(third, third); |
|
uint32x4_t cmp_results = vcaleq_f32(x, ninth); |
|
return vbslq_f32(cmp_results, taylor, recp); |
|
} |
|
// Computes tanh of each lane of x. The implementation chosen depends on the
// FASTER_TRANSCENDENTALS, ACCURATE_TRANSCENDENTAL_APPROX and
// FAST_TRANSCENDENTALS macros; without any of them it falls back to tanhf
// per lane.
inline float32x4_t fast_tanh(float32x4_t x) { |
|
#if defined FASTER_TRANSCENDENTALS |
|
return float_tanh_float(x); |
|
#elif defined ACCURATE_TRANSCENDENTAL_APPROX && defined FAST_TRANSCENDENTALS |
|
x = vminq_f32(x, vdupq_n_f32(kMaxTanhInput)); |
|
x = vmaxq_f32(x, vdupq_n_f32(kMinTanhInput)); |
|
  // Numerator coefficients (odd powers).
const float32x4_t alpha_1 = vdupq_n_f32(kTanhAlpha1); |
|
const float32x4_t alpha_3 = vdupq_n_f32(kTanhAlpha3); |
|
const float32x4_t alpha_5 = vdupq_n_f32(kTanhAlpha5); |
|
const float32x4_t alpha_7 = vdupq_n_f32(kTanhAlpha7); |
|
const float32x4_t alpha_9 = vdupq_n_f32(kTanhAlpha9); |
|
const float32x4_t alpha_11 = vdupq_n_f32(kTanhAlpha11); |
|
const float32x4_t alpha_13 = vdupq_n_f32(kTanhAlpha13); |
|
  // Denominator coefficients (even powers).
const float32x4_t beta_0 = vdupq_n_f32(kTanhBeta0); |
|
const float32x4_t beta_2 = vdupq_n_f32(kTanhBeta2); |
|
const float32x4_t beta_4 = vdupq_n_f32(kTanhBeta4); |
|
const float32x4_t beta_6 = vdupq_n_f32(kTanhBeta6); |
|
|
|
|
|
const float32x4_t x2 = vmulq_f32(x, x); |
|
  // Numerator p = x * P(x^2), evaluated by Horner's rule.
float32x4_t p = vmlaq_f32(alpha_11, x2, alpha_13); |
|
p = vmlaq_f32(alpha_9, x2, p); |
|
p = vmlaq_f32(alpha_7, x2, p); |
|
p = vmlaq_f32(alpha_5, x2, p); |
|
p = vmlaq_f32(alpha_3, x2, p); |
|
p = vmlaq_f32(alpha_1, x2, p); |
|
p = vmulq_f32(x, p); |
|
  // Denominator q = Q(x^2), evaluated by Horner's rule.
float32x4_t q = vmlaq_f32(beta_4, x2, beta_6); |
|
q = vmlaq_f32(beta_2, x2, q); |
|
q = vmlaq_f32(beta_0, x2, q); |
|
  // p / q with one Newton-Raphson step on the reciprocal estimate.
float32x4_t recp = vrecpeq_f32(q); |
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, q)); |
|
return vmulq_f32(p, recp); |
|
#elif defined FAST_TRANSCENDENTALS && __ARM_ARCH >= 800 |
|
  // tanh(x) = (e^x - e^-x) / (e^x + e^-x) on the clipped input.
x = vminq_f32(x, vdupq_n_f32(kMaxTanhInput)); |
|
x = vmaxq_f32(x, vdupq_n_f32(kMinTanhInput)); |
|
float32x4_t exp_est = detail::fast_exp_norange_check(x); |
|
float32x4_t neg_exp_est = detail::fast_exp_norange_check(-x); |
|
  // |x| <= kTanhLinearRegion selects the identity approximation below.
uint32x4_t cmp_results = vcaleq_f32(x, vdupq_n_f32(kTanhLinearRegion)); |
|
|
|
float32x4_t diff = vsubq_f32(exp_est, neg_exp_est); |
|
float32x4_t sum = vaddq_f32(exp_est, neg_exp_est); |
|
float32x4_t recp = vrecpeq_f32(sum); |
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, sum)); |
|
float32x4_t tanh_estimate = vmulq_f32(diff, recp); |
|
  // Select x itself in the near-linear region, else the exp-based estimate.
return vbslq_f32(cmp_results, x, tanh_estimate); |
|
#else |
|
float32x4_t return_val = vdupq_n_f32(0.f); |
|
  // Fallback: tanhf on each lane.
float tanh_value = tanhf(vgetq_lane_f32(x, 0)); |
|
return_val = vld1q_lane_f32(&tanh_value, return_val, 0); |
|
tanh_value = tanhf(vgetq_lane_f32(x, 1)); |
|
return_val = vld1q_lane_f32(&tanh_value, return_val, 1); |
|
tanh_value = tanhf(vgetq_lane_f32(x, 2)); |
|
return_val = vld1q_lane_f32(&tanh_value, return_val, 2); |
|
tanh_value = tanhf(vgetq_lane_f32(x, 3)); |
|
return_val = vld1q_lane_f32(&tanh_value, return_val, 3); |
|
|
|
return return_val; |
|
#endif |
|
} |
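
// Illustrative (values approximate):
//
//   float32x4_t t = fast_tanh(vdupq_n_f32(0.5f));  // each lane ~0.46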
|
// Computes the sigmoid 1 / (1 + e^-x) of each lane of x. With
// SIGMOID_AS_TANH it is computed as 0.5 tanh(0.5 x) + 0.5; otherwise the
// accuracy/speed trade-off follows the same macros as fast_tanh.
inline float32x4_t fast_sigmoid(float32x4_t x) { |
|
#ifdef SIGMOID_AS_TANH |
|
float32x4_t half = vdupq_n_f32(0.5f); |
|
return vmlaq_f32(half, half, fast_tanh(vmulq_f32(half, x))); |
|
#else |
|
#if defined FAST_TRANSCENDENTALS && defined ACCURATE_TRANSCENDENTAL_APPROX |
|
x = vminq_f32(x, vdupq_n_f32(kMaxSigmoidInput)); |
|
x = vmaxq_f32(x, vdupq_n_f32(kMinSigmoidInput)); |
|
  // Numerator coefficients (odd powers).
const float32x4_t alpha_1 = vdupq_n_f32(kSigmoidAlpha1); |
|
const float32x4_t alpha_3 = vdupq_n_f32(kSigmoidAlpha3); |
|
const float32x4_t alpha_5 = vdupq_n_f32(kSigmoidAlpha5); |
|
const float32x4_t alpha_7 = vdupq_n_f32(kSigmoidAlpha7); |
|
const float32x4_t alpha_9 = vdupq_n_f32(kSigmoidAlpha9); |
|
  // Denominator coefficients (even powers).
const float32x4_t beta_0 = vdupq_n_f32(kSigmoidBeta0); |
|
const float32x4_t beta_2 = vdupq_n_f32(kSigmoidBeta2); |
|
const float32x4_t beta_4 = vdupq_n_f32(kSigmoidBeta4); |
|
const float32x4_t beta_6 = vdupq_n_f32(kSigmoidBeta6); |
|
const float32x4_t beta_8 = vdupq_n_f32(kSigmoidBeta8); |
|
const float32x4_t beta_10 = vdupq_n_f32(kSigmoidBeta10); |
|
|
|
|
|
const float32x4_t x2 = vmulq_f32(x, x); |
|
  // Numerator p = x * P(x^2), evaluated by Horner's rule.
float32x4_t p = vmlaq_f32(alpha_7, x2, alpha_9); |
|
p = vmlaq_f32(alpha_5, x2, p); |
|
p = vmlaq_f32(alpha_3, x2, p); |
|
p = vmlaq_f32(alpha_1, x2, p); |
|
p = vmulq_f32(x, p); |
|
  // Denominator q = Q(x^2), evaluated by Horner's rule.
float32x4_t q = vmlaq_f32(beta_8, x2, beta_10); |
|
q = vmlaq_f32(beta_6, x2, q); |
|
q = vmlaq_f32(beta_4, x2, q); |
|
q = vmlaq_f32(beta_2, x2, q); |
|
q = vmlaq_f32(beta_0, x2, q); |
|
  // p / q approximates sigmoid(x) - 0.5, so 0.5 is added back at the end.
float32x4_t recp = vrecpeq_f32(q); |
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, q)); |
|
return vmlaq_f32(vdupq_n_f32(0.5f), p, recp); |
|
#elif defined FAST_TRANSCENDENTALS |
|
float32x4_t denom = vaddq_f32(fast_exp(vnegq_f32(x)), vdupq_n_f32(1.f)); |
|
  // Newton-Raphson refined reciprocal of 1 + e^-x.
float32x4_t recp = vrecpeq_f32(denom); |
|
|
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, denom)); |
|
float32x4_t half = vdupq_n_f32(0.5f); |
|
float32x4_t quarter = vdupq_n_f32(0.245f); |
|
float32x4_t linear_approx = vmlaq_f32(half, quarter, x); |
|
uint32x4_t cmp_results = vcaleq_f32(x, vdupq_n_f32(kSigmoidLinearRegion)); |
|
  // Use the linear approximation 0.245 x + 0.5 in the central region.
return vbslq_f32(cmp_results, linear_approx, recp); |
|
#else |
|
float32x4_t return_val = vdupq_n_f32(0.f); |
|
  // Fallback: exact sigmoid on each lane.
float result = 1.f / (1.f + expf(-vgetq_lane_f32(x, 0))); |
|
return_val = vld1q_lane_f32(&result, return_val, 0); |
|
result = 1.f / (1.f + expf(-vgetq_lane_f32(x, 1))); |
|
return_val = vld1q_lane_f32(&result, return_val, 1); |
|
result = 1.f / (1.f + expf(-vgetq_lane_f32(x, 2))); |
|
return_val = vld1q_lane_f32(&result, return_val, 2); |
|
result = 1.f / (1.f + expf(-vgetq_lane_f32(x, 3))); |
|
return_val = vld1q_lane_f32(&result, return_val, 3); |
|
|
|
return return_val; |
|
#endif |
|
#endif |
|
} |
|
// Scalar wrappers that compute a single value via the vector code above.
inline float fast_exp(float x) { |
|
return vgetq_lane_f32(fast_exp(vdupq_n_f32(x)), 0); |
|
} |
|
|
|
template <int ExponentBits> |
|
inline float fast_exp(fixed32<ExponentBits> x) { |
|
return vgetq_lane_f32(fast_exp<ExponentBits>(vdupq_n_s32(x.raw_val())), 0); |
|
} |
|
// Scalar wrapper for the fixed-point exp bit trick.
template <int ExponentBits> |
|
inline float fast_exp_fixed(fixed32<ExponentBits> x) { |
|
return vgetq_lane_f32(fast_exp_fixed<ExponentBits>(vdupq_n_s32(x.raw_val())), |
|
0); |
|
} |
|
|
|
inline float fast_sigmoid(float x) { |
|
return vgetq_lane_f32(fast_sigmoid(vdupq_n_f32(x)), 0); |
|
} |
|
|
|
inline float fast_tanh(float x) { |
|
return vgetq_lane_f32(fast_tanh(vdupq_n_f32(x)), 0); |
|
} |
|
// Clamps each lane of fixed-point x to +/-(kLimit << mantissa_bits).
template <int kLimit> |
|
inline int32x4_t ClipToBounds(const int mantissa_bits, const int32x4_t x) { |
|
|
|
int32x4_t clip_limit = vdupq_n_s32(-(kLimit << mantissa_bits)); |
|
int32x4_t clipped_x = vmaxq_s32(x, clip_limit); |
|
clip_limit = vnegq_s32(clip_limit); |
|
return vminq_s32(clipped_x, clip_limit); |
|
} |
|
// Computes the sigmoid of fixed-point inputs in float arithmetic as
// 1 / (1 + e^-x), using float32_pow2 for the exponential.
inline float32x4_t fixed32_sigmoid_float(const int mantissa_bits, |
|
const int32x4_t x) { |
|
int32x4_t input = vnegq_s32(x); |
|
float32x4_t y = |
|
vcvtq_f32_s32(ClipToBounds<kMaxSigmoidInputInt>(mantissa_bits, input)); |
|
y = fixed32_exp_float_preclipped(mantissa_bits, y); |
|
float32x4_t one = vdupq_n_f32(1.0f); |
|
|
|
float32x4_t denom = vaddq_f32(y, one); |
|
float32x4_t recp = vrecpeq_f32(denom); |
|
|
|
recp = vmulq_f32(recp, vrecpsq_f32(recp, denom)); |
|
return recp; |
|
} |
|
// Fixed-point sigmoid; uses the float-based path under
// FASTER_TRANSCENDENTALS.
template <int ExponentBits> |
|
inline float32x4_t fast_sigmoid(int32x4_t x) { |
|
#if defined FASTER_TRANSCENDENTALS |
|
  // The float path requires the input mantissa to fit in a float's mantissa.
static_assert(kFloatMantissaBits >= fixed32<ExponentBits>::kMantissaBits, |
|
"Mantissa bits must be at most 23!"); |
|
return fixed32_sigmoid_float(fixed32<ExponentBits>::kMantissaBits, x); |
|
#else |
|
return fast_sigmoid(vcvtq_n_f32_s32(x, fixed32<ExponentBits>::kMantissaBits)); |
|
#endif |
|
} |
|
|
|
template <int ExponentBits> |
|
inline float fast_sigmoid(fixed32<ExponentBits> x) { |
|
return vgetq_lane_f32(fast_sigmoid<ExponentBits>(vdupq_n_s32(x.raw_val())), |
|
0); |
|
} |
|
|
|
#else |
|
// Scalar implementations used when NEON is unavailable.
inline float fast_exp(float x) { |
|
#ifdef FAST_TRANSCENDENTALS |
|
if (isnan(x)) return 0.0f; |
|
x = std::max(std::min(x, kMaxExpInput), kMinExpInput); |
|
float AConstant, BConstant; |
|
memcpy(&AConstant, &kAConstant, sizeof(int)); |
|
memcpy(&BConstant, &kBConstant, sizeof(int)); |
|
float y = x * AConstant + BConstant; |
|
int x_int = static_cast<int>(y); |
|
float ret; |
|
memcpy(&ret, &x_int, sizeof(float)); |
|
return ret; |
|
#else |
|
return expf(x); |
|
#endif |
|
} |
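
// Worked example of the bit trick (values approximate): for x = 1,
// y = 12102203.2f + 1064987216.f ~= 1077089419. Read as float bits, that is
// exponent 128 and mantissa ~0.40, i.e. 2^1 * 1.40 ~= 2.80, within a few
// percent of e ~= 2.72. The +/-80 clamp keeps A * x + B inside a valid
// exponent range.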
|
|
|
template <int ExponentBits> |
|
inline float fast_exp(fixed32<ExponentBits> x) { |
|
return fast_exp(static_cast<float>(x)); |
|
} |
|
|
|
template <int ExponentBits> |
|
inline float fast_exp_fixed(fixed32<ExponentBits> x) { |
|
static_assert(ExponentBits > 8, "Must have more than 8 ExponentBits"); |
|
int matched_decimal = |
|
std::max(std::min(x.raw_val(), (80 << (31 - ExponentBits))), |
|
-(80 << (31 - ExponentBits))); |
|
  // The input is fixed point with (31 - ExponentBits) mantissa bits, so A is
  // 1 / ln(2) scaled by 2^(ExponentBits - 8), putting A * x on the float's
  // 23 mantissa bits: (31 - ExponentBits) + (ExponentBits - 8) = 23.
const int16_t A = (1.f / logf(2.f)) * (1 << (ExponentBits - 8)); |
|
  // Exponent offset with the same empirical correction as fast_exp above.
const int B = (127 << 23) - 366000; |
|
matched_decimal = A * matched_decimal + B; |
|
float ret_val; |
|
memcpy(&ret_val, &matched_decimal, sizeof(float)); |
|
return ret_val; |
|
} |
|
|
|
inline float fast_tanh(float x) { |
|
#if defined FAST_TRANSCENDENTALS && defined ACCURATE_TRANSCENDENTAL_APPROX |
|
  // Rational polynomial approximation on the clipped input.
x = std::max(std::min(x, kMaxTanhInput), kMinTanhInput); |
|
|
|
|
|
float x2 = x * x; |
|
  // Numerator p = x * P(x^2), evaluated by Horner's rule.
float p = kTanhAlpha11 + x2 * kTanhAlpha13; |
|
p = kTanhAlpha9 + x2 * p; |
|
p = kTanhAlpha7 + x2 * p; |
|
p = kTanhAlpha5 + x2 * p; |
|
p = kTanhAlpha3 + x2 * p; |
|
p = kTanhAlpha1 + x2 * p; |
|
p = x * p; |
|
  // Denominator q = Q(x^2), evaluated by Horner's rule.
float q = kTanhBeta4 + x2 * kTanhBeta6; |
|
q = kTanhBeta2 + x2 * q; |
|
q = kTanhBeta0 + x2 * q; |
|
|
|
return p / q; |
|
#elif defined FAST_TRANSCENDENTALS |
|
if (std::abs(x) < kTanhLinearRegion) { |
|
return x; |
|
} else { |
|
x = std::max(std::min(x, kMaxTanhInput), kMinTanhInput); |
|
float positive = fast_exp(x); |
|
float negative = fast_exp(-x); |
|
return (positive - negative) / (positive + negative); |
|
} |
|
#else |
|
return tanhf(x); |
|
#endif |
|
} |
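
// Illustrative: with plain FAST_TRANSCENDENTALS, fast_tanh(0.1f) returns
// 0.1f exactly (|x| < kTanhLinearRegion), while fast_tanh(1.f) goes through
// the exp-based path.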
|
|
|
inline float fast_sigmoid(float x) { |
|
#ifdef SIGMOID_AS_TANH |
|
return .5f * fast_tanh(.5f * x) + .5f; |
|
#else |
|
#if defined FAST_TRANSCENDENTALS && defined ACCURATE_TRANSCENDENTAL_APPROX |
|
  // Rational polynomial approximation, as in fast_tanh; the rational part
  // approximates sigmoid(x) - 0.5, so 0.5 is added back at the end.
x = std::max(std::min(x, kMaxSigmoidInput), kMinSigmoidInput); |
|
|
|
|
|
float x2 = x * x; |
|
  // Numerator p = x * P(x^2).
float p = kSigmoidAlpha7 + x2 * kSigmoidAlpha9; |
|
p = kSigmoidAlpha5 + x2 * p; |
|
p = kSigmoidAlpha3 + x2 * p; |
|
p = kSigmoidAlpha1 + x2 * p; |
|
p = x * p; |
|
  // Denominator q = Q(x^2).
float q = kSigmoidBeta8 + x2 * kSigmoidBeta10; |
|
q = kSigmoidBeta6 + x2 * q; |
|
q = kSigmoidBeta4 + x2 * q; |
|
q = kSigmoidBeta2 + x2 * q; |
|
q = kSigmoidBeta0 + x2 * q; |
|
|
|
return p / q + 0.5f; |
|
#elif defined FAST_TRANSCENDENTALS |
|
if (std::abs(x) < kSigmoidLinearRegion) { |
|
    return .245f * x + .5f;
|
} else { |
|
return 1.f / (1.f + fast_exp(-x)); |
|
} |
|
#else |
|
return 1.f / (1.f + expf(-x)); |
|
#endif |
|
#endif |
|
} |
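
// Illustrative: with plain FAST_TRANSCENDENTALS, fast_sigmoid(0.f) takes the
// linear branch and returns exactly 0.5f, while fast_sigmoid(2.f) computes
// 1 / (1 + fast_exp(-2.f)) ~= 0.88.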
|
|
|
template <int ExponentBits> |
|
inline float fast_sigmoid(fixed32<ExponentBits> x) { |
|
return fast_sigmoid(static_cast<float>(x)); |
|
} |
|
|
|
#endif  // defined __ARM_NEON || defined __aarch64__
|
// Parameters of the lookup-table implementations below: the number of
// integer (exponent) bits covered by the tanh and sigmoid tables, and an
// extra input downshift applied for sigmoid.
static constexpr int kNumTanhExpBits = 3; |
|
|
|
static constexpr int kNumSigmoidExpBits = 4; |
|
|
|
static constexpr int kNumExtraSigmoidShiftBits = 1; |
|
// Return fixed-point lookup tables of tanh and sigmoid values for the AVX2
// implementations below, with num_mantissa_bits_out bits of output precision.
const int* TanhTable(int num_mantissa_bits_out); |
|
|
|
const int* SigmoidTable(int num_mantissa_bits_out); |
|
// Computes e^x of a fixed32 scalar in float arithmetic by decomposing
// x / ln(2) into integer and fractional parts: the integer part becomes the
// float exponent, and the fraction is corrected with the polynomial selected
// by kOrder.
template <int kExponentBits, TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
float fixed32_exp(fixed32<kExponentBits> x) { |
|
constexpr int kMantissaBits = MantissaBitsOf<fixed32<kExponentBits>>::value; |
|
|
|
int64_t clipped_x = |
|
std::max(std::min(x.raw_val(), kMaxExpInputInt << kMantissaBits), |
|
-(kMaxExpInputInt << kMantissaBits)); |
|
  // Shift so that after multiplying by kLogFactor (1 / ln 2 in Q30) and
  // dropping 31 bits, the result is aligned on the float's 23 mantissa bits.
constexpr int kXShift = |
|
kFloatMantissaBits + 31 - kMaxLog2Shift - kMantissaBits; |
|
static_assert(kXShift >= 0, |
|
"Mantissa bits > kFloatMantissaBits + 31 - kMaxLog2Shift"); |
|
clipped_x <<= kXShift; |
|
int float_as_int = (kLogFactor * clipped_x >> 31) + kFloatExponentOffset; |
|
  // Split the bits into the exponent part and the fractional mantissa part.
int int_part = float_as_int & kFloatExponentMask; |
|
int float_part = float_as_int & kFloatMantissaMask; |
|
float fraction = static_cast<float>(float_part) / (1 << kFloatMantissaBits); |
|
  // Evaluate the mantissa correction polynomial P(f) for the chosen order.
float mantissa; |
|
if (kOrder == TM_ORDER4_16BIT || kOrder == TM_ORDER4_FLOAT) { |
|
mantissa = (kExpQuarticFactor2 * fraction + kExpQuarticFactor1) * fraction + |
|
kExpQuarticFactor0; |
|
} else if (kOrder == TM_ORDER3_16BIT) { |
|
mantissa = kExpCubicFactor1 * fraction + kExpCubicFactor0; |
|
} |
|
mantissa = fraction - fraction * (1.0f - fraction) * mantissa; |
|
  // mantissa now holds f + (f^2 - f) P(f); apply it to the exponent part.
float result; |
|
memcpy(&result, &int_part, sizeof(float)); |
|
return result * (1.0f + mantissa); |
|
} |
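
// Worked example of the decomposition (values approximate): e^1 =
// 2^(1 / ln 2) = 2^1.4427, so the integer part 1 goes into the float
// exponent and the fraction f = 0.4427 maps through f + (f^2 - f) P(f)
// ~= 0.3592, giving 2 * 1.3592 ~= 2.718 ~= e.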
|
// Computes tanh of a fixed32 scalar via e^2x, with a Taylor-series shortcut
// near zero.
template <int kExponentBits, TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
float fixed32_tanh(fixed32<kExponentBits> x) { |
|
float float_x = static_cast<float>(x); |
|
if (std::abs(float_x) < 1.0f / 9.0f) { |
|
return float_x * (1 - float_x * float_x / 3.0f); |
|
} |
|
x = static_cast<fixed32<kExponentBits>>(x.raw_val() * 2); |
|
float exp_2x = fixed32_exp<kExponentBits, kOrder>(x); |
|
return (exp_2x - 1.0f) / (exp_2x + 1.0f); |
|
} |
|
// Computes the sigmoid of a fixed32 scalar as 1 / (1 + e^-x).
template <int kExponentBits, TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
float fixed32_sigmoid(fixed32<kExponentBits> x) { |
|
x = static_cast<fixed32<kExponentBits>>(-x.raw_val()); |
|
float exp_x = fixed32_exp<kExponentBits, kOrder>(x); |
|
return 1.0f / (exp_x + 1.0f); |
|
} |
|
|
|
#if defined __AVX2__ |
|
// Gathers 32-bit table entries for each lane of x: shifts x down to index
// resolution with rounding, recenters by kTableOffset, and clamps to
// [0, 2 * kTableOffset] before the gather.
template <int kNumShiftBits, int kTableOffset> |
|
inline __m256i index_data_table(const int32_t* data_table, const __m256i& x) { |
|
  // Round and shift the input down to table-index resolution.
__m256i shifted = _mm256_set1_epi32(1 << (kNumShiftBits - 1)); |
|
shifted = _mm256_add_epi32(x, shifted); |
|
shifted = _mm256_srai_epi32(shifted, kNumShiftBits); |
|
|
|
__m256i addend = _mm256_set1_epi32(kTableOffset); |
|
shifted = _mm256_add_epi32(shifted, addend); |
|
  // Clamp the index to [0, 2 * kTableOffset].
addend = _mm256_add_epi32(addend, addend); |
|
shifted = _mm256_min_epi32(shifted, addend); |
|
shifted = _mm256_max_epi32(shifted, _mm256_setzero_si256()); |
|
|
|
return _mm256_i32gather_epi32(data_table, shifted, 4); |
|
} |
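
// Illustrative index arithmetic: with kNumShiftBits = 8 and
// kTableOffset = 512, an input lane of -131072 becomes
// (-131072 + 128) >> 8 = -512, then 0 after recentering: the first table
// entry. Lanes outside the covered range clamp to index 0 or 1024.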
|
// Computes tanh of 8 fixed32 lanes with NumInputMantissaBits via the lookup
// table, returning fixed16 results with NumOutputMantissaBits.
template <int NumInputMantissaBits, int NumOutputMantissaBits> |
|
inline __m256i fixed32_tanh_fixed16(const int* tanh_table, const __m256i& x) { |
|
|
|
constexpr int kNumShiftBits = NumInputMantissaBits - NumOutputMantissaBits; |
|
constexpr int kTableOffset = 1 << (NumOutputMantissaBits + kNumTanhExpBits); |
|
return index_data_table<kNumShiftBits, kTableOffset>(tanh_table, x); |
|
} |
|
// As fixed32_tanh_fixed16, but for sigmoid; the extra shift bits reduce the
// table resolution to cover sigmoid's wider input range.
template <int NumInputMantissaBits, int NumOutputMantissaBits> |
|
inline __m256i fixed32_sigmoid_fixed16(const int* sigmoid_table, |
|
const __m256i& x) { |
|
|
|
constexpr int kNumShiftBits = |
|
kNumExtraSigmoidShiftBits + NumInputMantissaBits - NumOutputMantissaBits; |
|
constexpr int kTableOffset = 1 |
|
<< (NumOutputMantissaBits + kNumSigmoidExpBits - |
|
kNumExtraSigmoidShiftBits); |
|
return index_data_table<kNumShiftBits, kTableOffset>(sigmoid_table, x); |
|
} |
|
// Converts two registers of 8 floats into one register of 16 saturated
// int16 values, restoring element order after the 128-bit-laned pack.
inline __m256i PackFloatsToFixed16(const __m256& x0, const __m256& x1) { |
|
__m256i int0 = _mm256_cvtps_epi32(x0); |
|
__m256i int1 = _mm256_cvtps_epi32(x1); |
|
int0 = _mm256_packs_epi32(int0, int1); |
|
  // packs interleaves 128-bit lanes; the 0xd8 permute restores order.
return _mm256_permute4x64_epi64(int0, 0xd8); |
|
} |
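
// Illustrative: if x0 holds the floats {0..7} and x1 holds {8..15}, the
// result holds the int16 values 0..15 in order; without the final permute
// they would come out interleaved as 0..3, 8..11, 4..7, 12..15.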
|
// Clamps each lane of fixed-point x to +/-(kLimit << mantissa_bits).
template <int kLimit> |
|
inline __m256i ClipToBounds(const int mantissa_bits, const __m256i& x) { |
|
|
|
__m256i clip_limit = _mm256_set1_epi32(-(kLimit << mantissa_bits)); |
|
__m256i clipped_x = _mm256_max_epi32(x, clip_limit); |
|
  // _mm256_sign_epi32(a, a) yields |a|, the positive clip limit.
clip_limit = _mm256_sign_epi32(clip_limit, clip_limit); |
|
return _mm256_min_epi32(clipped_x, clip_limit); |
|
} |
|
// Clamps each lane of x to [-kLimit, kLimit].
inline __m256 ClipToFloatBounds(const float kLimit, const __m256& x) { |
|
__m256 clip_limit = _mm256_set1_ps(kLimit); |
|
__m256 clipped_x = _mm256_min_ps(x, clip_limit); |
|
clip_limit = _mm256_set1_ps(-kLimit); |
|
return _mm256_max_ps(clipped_x, clip_limit); |
|
} |
|
// Computes 2^x of x0, x1 in place, where each lane is a fixed-point value
// with kFloatMantissaBits fractional bits presented in a float register. The
// integer part becomes the float exponent and the fraction is corrected by
// the polynomial selected by kOrder: TM_ORDER4_FLOAT evaluates the quartic
// in float; the 16-bit modes pack both registers' fractions into a single
// register of Q15 values and use rounding 16-bit multiplies.
template <TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
inline void float32_pow2(__m256& x0, __m256& x1) { |
|
  // Convert to int for bit manipulation; lanes are Q23 fixed point.
__m256i exp_int_x0 = _mm256_cvtps_epi32(x0); |
|
__m256i exp_int_x1 = _mm256_cvtps_epi32(x1); |
|
__m256i result_x0, result_x1; |
|
|
|
static_assert(kOrder == TM_ORDER4_FLOAT || kOrder == TM_ORDER4_16BIT || |
|
kOrder == TM_ORDER3_16BIT, |
|
"Invalid order."); |
|
|
|
if (kOrder == TM_ORDER4_FLOAT) { |
|
__m256i mantissa_mask = _mm256_set1_epi32(0x7fffff); |
|
__m256 float_factor = |
|
_mm256_set1_ps(1.0f / static_cast<float>(1 << kFloatMantissaBits)); |
|
__m256i fract0 = _mm256_and_si256(mantissa_mask, exp_int_x0); |
|
__m256i fract1 = _mm256_and_si256(mantissa_mask, exp_int_x1); |
|
__m256 float0 = _mm256_mul_ps(_mm256_cvtepi32_ps(fract0), float_factor); |
|
__m256 float1 = _mm256_mul_ps(_mm256_cvtepi32_ps(fract1), float_factor); |
|
    // Quartic in float: (a f^2 + b f + c) * (f^2 - f), scaled back to Q23.
__m256 x_squared0 = _mm256_mul_ps(float0, float0); |
|
__m256 x_squared1 = _mm256_mul_ps(float1, float1); |
|
__m256 b = _mm256_set1_ps(kExpQuarticFactor1); |
|
__m256 b_x0 = _mm256_mul_ps(b, float0); |
|
__m256 b_x1 = _mm256_mul_ps(b, float1); |
|
__m256 a = _mm256_set1_ps(kExpQuarticFactor2); |
|
__m256 a_x_squared0 = _mm256_mul_ps(a, x_squared0); |
|
__m256 a_x_squared1 = _mm256_mul_ps(a, x_squared1); |
|
__m256 x_squared_minus_x0 = _mm256_sub_ps(x_squared0, float0); |
|
__m256 x_squared_minus_x1 = _mm256_sub_ps(x_squared1, float1); |
|
__m256 c = _mm256_set1_ps(kExpQuarticFactor0); |
|
b_x0 = _mm256_add_ps(b_x0, c); |
|
b_x1 = _mm256_add_ps(b_x1, c); |
|
float_factor = _mm256_set1_ps(static_cast<float>(1 << kFloatMantissaBits)); |
|
a_x_squared0 = _mm256_add_ps(a_x_squared0, b_x0); |
|
a_x_squared1 = _mm256_add_ps(a_x_squared1, b_x1); |
|
a_x_squared0 = _mm256_mul_ps(a_x_squared0, x_squared_minus_x0); |
|
a_x_squared1 = _mm256_mul_ps(a_x_squared1, x_squared_minus_x1); |
|
result_x0 = _mm256_cvtps_epi32(_mm256_mul_ps(a_x_squared0, float_factor)); |
|
result_x1 = _mm256_cvtps_epi32(_mm256_mul_ps(a_x_squared1, float_factor)); |
|
} else { |
|
    // Pack the top 15 fraction bits of both registers into one register of
    // Q15 values: x0 in the low words, x1 in the high words.
__m256i mantissa_mask = _mm256_set1_epi32(0x7fff00); |
|
__m256i x_01 = |
|
_mm256_srli_epi32(_mm256_and_si256(mantissa_mask, exp_int_x0), 8); |
|
x_01 = _mm256_or_si256( |
|
x_01, |
|
_mm256_slli_epi32(_mm256_and_si256(mantissa_mask, exp_int_x1), 8)); |
|
    // All products below are Q15 with rounding (_mm256_mulhrs_epi16).
__m256i x_squared = _mm256_mulhrs_epi16(x_01, x_01); |
|
__m256i result, x_squared_minus_x; |
|
if (kOrder == TM_ORDER4_16BIT) { |
|
__m256i b = _mm256_set1_epi16(FloatAsInt16(kExpQuarticFactor1)); |
|
__m256i b_x = _mm256_mulhrs_epi16(b, x_01); |
|
__m256i a = _mm256_set1_epi16(FloatAsInt16(kExpQuarticFactor2)); |
|
__m256i a_x_squared = _mm256_mulhrs_epi16(a, x_squared); |
|
x_squared_minus_x = _mm256_sub_epi16(x_squared, x_01); |
|
      // Constant term c, then b f + c.
__m256i c = _mm256_set1_epi16(FloatAsInt16(kExpQuarticFactor0)); |
|
b_x = _mm256_add_epi16(b_x, c); |
|
      // result = a f^2 + b f + c, all in Q15.
result = _mm256_add_epi16(a_x_squared, b_x); |
|
} else { |
|
__m256i a = _mm256_set1_epi16(FloatAsInt16(kExpCubicFactor1)); |
|
      __m256i b = _mm256_set1_epi16(FloatAsInt16(kExpCubicFactor0));
|
__m256i a_x = _mm256_mulhrs_epi16(a, x_01); |
|
x_squared_minus_x = _mm256_sub_epi16(x_squared, x_01); |
|
result = _mm256_add_epi16(a_x, b); |
|
} |
|
result = _mm256_mulhrs_epi16(result, x_squared_minus_x); |
|
    // Unpack the interleaved Q15 results back to Q23 in two registers.
result_x0 = _mm256_slli_epi32(result, 16); |
|
result_x0 = _mm256_srai_epi32(result_x0, 8); |
|
result_x1 = _mm256_srai_epi32(result, 16); |
|
result_x1 = _mm256_slli_epi32(result_x1, 8); |
|
} |
|
  // Add the exponent bias and mantissa corrections to form the float bits.
__m256i exp_offset = _mm256_set1_epi32(kFloatExponentOffset); |
|
exp_int_x0 = _mm256_add_epi32(exp_int_x0, exp_offset); |
|
exp_int_x0 = _mm256_add_epi32(exp_int_x0, result_x0); |
|
exp_int_x1 = _mm256_add_epi32(exp_int_x1, exp_offset); |
|
exp_int_x1 = _mm256_add_epi32(exp_int_x1, result_x1); |
|
  // Reinterpret the assembled bits as float.
x0 = _mm256_castsi256_ps(exp_int_x0); |
|
x1 = _mm256_castsi256_ps(exp_int_x1); |
|
} |
|
// Computes e^x of y0, y1 in place, where each lane is a fixed-point value
// with kInputMantissaBits fractional bits, already clipped and converted to
// float: scales by 1 / ln(2) and delegates to float32_pow2.
template <int kInputMantissaBits, TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
inline void float_exp_float_preclipped(__m256& y0, __m256& y1) { |
|
  // Rescale to kFloatMantissaBits and divide by ln(2) in a single multiply.
constexpr int kXShift = kFloatMantissaBits - kInputMantissaBits; |
|
constexpr float kLogFactor = static_cast<float>(1 << kXShift); |
|
__m256 factor = _mm256_set1_ps(kLogFactor * kOneOverLog2); |
|
y0 = _mm256_mul_ps(y0, factor); |
|
y1 = _mm256_mul_ps(y1, factor); |
|
|
|
float32_pow2<kOrder>(y0, y1); |
|
} |

// As float_exp_float_preclipped, but takes raw fixed-point inputs and does
// the clipping and int-to-float conversion here.
template <int kInputMantissaBits, TranscendentalMode kOrder = TM_ORDER4_16BIT> |
|
inline void fixed32_exp_float(const __m256i& x0, const __m256i& x1, __m256& y0, |
|
__m256& y1) { |
|
  // Clip to the valid exp input range and convert to float.
y0 = |
|
_mm256_cvtepi32_ps(ClipToBounds<kMaxExpInputInt>(kInputMantissaBits, x0)); |
|
y1 = |
|
_mm256_cvtepi32_ps(ClipToBounds<kMaxExpInputInt>(kInputMantissaBits, x1)); |
|
float_exp_float_preclipped<kInputMantissaBits, kOrder>(y0, y1); |
|
} |
|
// Computes tanh of x0, x1 (fixed point with kInputMantissaBits, presented in
// float registers) into y0, y1 as (e^2x - 1) / (e^2x + 1), switching to the
// Taylor series x - x^3/3 near zero.
template <int kInputMantissaBits, TranscendentalMode kOrder = TM_ORDER4_FLOAT> |
|
inline void float_tanh_float(const __m256& x0, const __m256& x1, __m256& y0, |
|
__m256& y1) { |
|
  // Scale by 2 / ln(2) (and up to Q23) so float32_pow2 produces e^(2x).
const float kLogFactor = |
|
static_cast<float>(1 << (kFloatMantissaBits - kInputMantissaBits + 1)); |
|
__m256 factor = _mm256_set1_ps(kLogFactor * kOneOverLog2); |
|
  // Clip to +/-kMaxTanhInput in the input's fixed-point scale.
__m256 clip_limit = _mm256_set1_ps(kMaxTanhInput * (1 << kInputMantissaBits)); |
|
__m256 clip0 = _mm256_min_ps(x0, clip_limit); |
|
__m256 clip1 = _mm256_min_ps(x1, clip_limit); |
|
clip_limit = _mm256_set1_ps(-kMaxTanhInput * (1 << kInputMantissaBits)); |
|
clip0 = _mm256_max_ps(clip0, clip_limit); |
|
clip1 = _mm256_max_ps(clip1, clip_limit); |
|
__m256 exp0 = _mm256_mul_ps(clip0, factor); |
|
__m256 exp1 = _mm256_mul_ps(clip1, factor); |
|
|
|
float32_pow2<kOrder>(exp0, exp1); |
|
|
|
__m256 one = _mm256_set1_ps(1.0f); |
|
__m256 numerator = _mm256_sub_ps(exp0, one); |
|
__m256 denominator = _mm256_add_ps(exp0, one); |
|
|
|
exp0 = _mm256_div_ps(numerator, denominator); |
|
numerator = _mm256_sub_ps(exp1, one); |
|
denominator = _mm256_add_ps(exp1, one); |
|
exp1 = _mm256_div_ps(numerator, denominator); |
|
  // Convert the clipped inputs back to real values for the Taylor branch.
factor = _mm256_set1_ps(1.0f / (1 << kInputMantissaBits)); |
|
clip0 = _mm256_mul_ps(clip0, factor); |
|
clip1 = _mm256_mul_ps(clip1, factor); |
|
__m256 third = _mm256_set1_ps(-1.0f / 3.0f); |
|
__m256 taylor0 = _mm256_mul_ps(clip0, clip0); |
|
__m256 taylor1 = _mm256_mul_ps(clip1, clip1); |
|
taylor0 = _mm256_mul_ps(taylor0, clip0); |
|
taylor1 = _mm256_mul_ps(taylor1, clip1); |
|
  // taylor = x - x^3 / 3 (third holds -1/3).
taylor0 = _mm256_mul_ps(taylor0, third); |
|
taylor1 = _mm256_mul_ps(taylor1, third); |
|
taylor0 = _mm256_add_ps(clip0, taylor0); |
|
taylor1 = _mm256_add_ps(clip1, taylor1); |
|
  // Select the Taylor result where |x| <= 1/9; andnot with -0.0 gives |x|.
third = _mm256_mul_ps(third, third); |
|
__m256 neg_zero = _mm256_set1_ps(-0.0f); |
|
clip0 = _mm256_andnot_ps(neg_zero, clip0); |
|
clip1 = _mm256_andnot_ps(neg_zero, clip1); |
|
__m256 cmp_results0 = _mm256_cmp_ps(clip0, third, _CMP_LE_OQ); |
|
__m256 cmp_results1 = _mm256_cmp_ps(clip1, third, _CMP_LE_OQ); |
|
y0 = _mm256_blendv_ps(exp0, taylor0, cmp_results0); |
|
y1 = _mm256_blendv_ps(exp1, taylor1, cmp_results1); |
|
} |
|
// Computes the sigmoid of y0, y1 in place (fixed point with
// kInputMantissaBits, presented in float registers) as 1 / (1 + e^-x).
template <int kInputMantissaBits, TranscendentalMode kOrder = TM_ORDER4_FLOAT> |
|
inline void float_sigmoid_float(__m256& y0, __m256& y1) { |
|
constexpr float kInputFactor = static_cast<float>(1 << kInputMantissaBits); |
|
  // Negate x (xor with -0.0), clip, and exponentiate to get e^-x.
__m256 minus_zero = _mm256_set1_ps(-0.0f); |
|
y0 = _mm256_xor_ps(y0, minus_zero); |
|
y1 = _mm256_xor_ps(y1, minus_zero); |
|
y0 = ClipToFloatBounds(kMaxSigmoidInput * kInputFactor, y0); |
|
y1 = ClipToFloatBounds(kMaxSigmoidInput * kInputFactor, y1); |
|
float_exp_float_preclipped<kInputMantissaBits, kOrder>(y0, y1); |
|
__m256 one = _mm256_set1_ps(1.0f); |
|
|
|
y0 = _mm256_div_ps(one, _mm256_add_ps(y0, one)); |
|
y1 = _mm256_div_ps(one, _mm256_add_ps(y1, one)); |
|
} |
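
// Usage sketch (illustrative; raw0, raw1 and kMantissaBits are hypothetical
// names for 16 fixed32 inputs and their mantissa width):
//
//   __m256 y0 = _mm256_cvtepi32_ps(raw0);
//   __m256 y1 = _mm256_cvtepi32_ps(raw1);
//   float_sigmoid_float<kMantissaBits>(y0, y1);  // in-place sigmoid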
|
|
|
#endif  // defined __AVX2__
|
|
|
}  // namespace csrblocksparse
|
|
|
#endif  // LYRA_CODEC_SPARSE_MATMUL_NUMERICS_FAST_TRANSCENDENTALS_H_
|
|