#pragma once

#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandBase.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <c10/util/MathConstants.h>
#include <c10/util/irange.h>
#include <mutex>

#ifdef CPU_CAPABILITY_AVX2
#include <ATen/native/cpu/avx_mathfun.h>
#endif

namespace at {
namespace native {
namespace templates {
namespace cpu {
namespace {

// ==================================================== Random ========================================================

// Fills the output with integers drawn uniformly from [base, base + range).
template<typename RNG>
void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) {
  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_from_to_kernel_cpu", [&] {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t {
      uniform_int_from_to_distribution<scalar_t> random(range, base);
      return random(generator);
    });
  });
}

// This is the special kernel to handle a single specific case:
// from(inclusive) = std::numeric_limits<int64_t>::lowest()
// to(exclusive) = None (= std::numeric_limits<int64_t>::max() + 1)
template<typename RNG>
void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) {
  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    if (std::is_same<scalar_t, int64_t>::value ||
        std::is_same<scalar_t, double>::value ||
        std::is_same<scalar_t, float>::value ||
        std::is_same<scalar_t, at::BFloat16>::value) {
      cpu_serial_kernel(iter, [generator]() -> scalar_t {
        uniform_int_full_range_distribution<scalar_t> random;
        return random(generator);
      });
    } else {
      TORCH_CHECK(false, "random_full_64_bits_range_kernel_cpu handles only int64, double, float and bfloat16");
    }
  });
}

template<typename RNG>
struct RandomFromToKernel {
  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen) {
    random_from_to_kernel(iter, range, base, check_generator<RNG>(gen));
  }
  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
    random_full_64_bits_range_kernel(iter, check_generator<RNG>(gen));
  }
};

template<typename RNG>
void random_kernel(TensorIteratorBase& iter, RNG generator) {
  std::lock_guard<std::mutex> lock(generator->mutex_);
  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] {
    cpu_serial_kernel(iter, [generator]() -> scalar_t {
      uniform_int_distribution<scalar_t> random;
      return random(generator);
    });
  });
}

template<typename RNG>
struct RandomKernel {
  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
    random_kernel(iter, check_generator<RNG>(gen));
  }
};

// ==================================================== Normal ========================================================
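// Both the AVX2 path and the scalar fallback below generate normal samples
// with the Box-Muller transform: for u1, u2 ~ U(0, 1),
//   z0 = sqrt(-2 * ln(u1)) * cos(2 * pi * u2)
//   z1 = sqrt(-2 * ln(u1)) * sin(2 * pi * u2)
// are two independent standard normal samples, which are then scaled by `std`
// and shifted by `mean`. The buffer is first filled with uniforms and then
// transformed in place, 16 values (8 pairs) at a time; a size that is not a
// multiple of 16 is handled by recomputing the last 16 values.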

#ifdef CPU_CAPABILITY_AVX2
static void normal_fill_16_AVX2(float *data,
                                const __m256* two_pi,
                                const __m256* one,
                                const __m256* minus_two,
                                const __m256* mean,
                                const __m256* std_v) {
  const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data));
  const __m256 u2 = _mm256_loadu_ps(data + 8);
  // sincos256_ps and log256_ps are from avx_mathfun.h
  const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1)));
  const __m256 theta = _mm256_mul_ps(*two_pi, u2);
  __m256 sintheta, costheta;
  sincos256_ps(theta, &sintheta, &costheta);
  const __m256 n1 = _mm256_mul_ps(radius, costheta);
  const __m256 n2 = _mm256_mul_ps(radius, sintheta);
  _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *std_v, *mean));
  _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *std_v, *mean));
}

template<typename RNG>
void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) {
  float *data = self.data_ptr<float>();
  auto size = self.numel();
  std::lock_guard<std::mutex> lock(generator->mutex_);
  for (const auto i : c10::irange(size)) {
    at::uniform_real_distribution<float> uniform(0, 1);
    data[i] = uniform(generator);
  }
  const __m256 two_pi = _mm256_set1_ps(2.0f * c10::pi<double>);
  const __m256 one = _mm256_set1_ps(1.0f);
  const __m256 minus_two = _mm256_set1_ps(-2.0f);
  const __m256 mean_v = _mm256_set1_ps(mean);
  const __m256 std_v = _mm256_set1_ps(std);

  for (int64_t i = 0; i < size - 15; i += 16) {
    normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &std_v);
  }

  if (size % 16 != 0) {
    // Recompute the last 16 values.
    data = data + size - 16;
    for (const auto i : c10::irange(16)) {
      at::uniform_real_distribution<float> uniform(0, 1);
      data[i] = uniform(generator);
    }
    normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &std_v);
  }
}
#endif

template <typename scalar_t>
static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) {
  for (const auto j : c10::irange(8)) {
    const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log.
    const scalar_t u2 = data[j + 8];
    const scalar_t radius = std::sqrt(-2 * std::log(u1));
    const scalar_t theta = 2.0f * c10::pi<double> * u2;
    data[j] = radius * std::cos(theta) * std + mean;
    data[j + 8] = radius * std::sin(theta) * std + mean;
  }
}

template <typename scalar_t, typename RNG>
void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
  scalar_t *data = self.data_ptr<scalar_t>();
  auto size = self.numel();
  std::lock_guard<std::mutex> lock(generator->mutex_);
  for (const auto i : c10::irange(size)) {
    at::uniform_real_distribution<scalar_t> uniform(0, 1);
    data[i] = uniform(generator);
  }

  for (int64_t i = 0; i < size - 15; i += 16) {
    normal_fill_16(data + i, mean, std);
  }
  if (size % 16 != 0) {
    // Recompute the last 16 values.
    data = data + size - 16;
    for (const auto i : c10::irange(16)) {
      at::uniform_real_distribution<scalar_t> uniform(0, 1);
      data[i] = uniform(generator);
    }
    normal_fill_16(data, mean, std);
  }
}

template<typename RNG>
void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) {
  auto size = self.numel();
  // Contiguous tensors with at least 16 elements take the in-place fill path;
  // everything else falls back to a serial nullary kernel.
  if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) {
#ifdef CPU_CAPABILITY_AVX2
    normal_fill_AVX2(self, static_cast<float>(mean), static_cast<float>(std), generator);
#else
    normal_fill(self, static_cast<float>(mean), static_cast<float>(std), generator);
#endif
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "normal_kernel_cpu", [&] {
      if (size >= 16 && self.is_contiguous()) {
        normal_fill<scalar_t>(self, static_cast<scalar_t>(mean), static_cast<scalar_t>(std), generator);
      } else {
        auto iter = TensorIterator::borrowing_nullary_op(self);
        std::lock_guard<std::mutex> lock(generator->mutex_);
        cpu_serial_kernel(iter, [mean, std, generator]() -> scalar_t {
          at::normal_distribution<double> normal(mean, std);
          return static_cast<scalar_t>(normal(generator));
        });
      }
    });
  }
}

template<typename RNG>
struct NormalKernel {
  void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
    normal_kernel(self, mean, std, check_generator<RNG>(gen));
  }
};

// ==================================================== Uniform =======================================================

template<typename RNG>
void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    auto from = static_cast<scalar_t>(from_);
    auto to = static_cast<scalar_t>(to_);
    at::uniform_real_distribution<scalar_t> uniform(from, to);
    cpu_serial_kernel(iter, [&uniform, generator]() -> scalar_t {
      return static_cast<scalar_t>(uniform(generator));
    });
  });
}

template<typename RNG>
struct UniformKernel {
  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
    uniform_kernel(iter, from, to, check_generator<RNG>(gen));
  }
};
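
// The element-wise kernels below (Cauchy, log-normal, geometric, exponential)
// all follow the same pattern: dispatch on the output dtype, take the
// generator's mutex (see Note [Acquire lock when using random generators]),
// build the corresponding distribution helper from
// ATen/core/DistributionsHelper.h in double precision, and fill the output
// serially with cpu_serial_kernel, casting each sample to scalar_t.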

// ==================================================== Cauchy ========================================================

template<typename RNG>
void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    at::cauchy_distribution<double> cauchy(median, sigma);
    cpu_serial_kernel(iter, [&cauchy, generator]() -> scalar_t {
      return static_cast<scalar_t>(cauchy(generator));
    });
  });
}

template<typename RNG>
struct CauchyKernel {
  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
    cauchy_kernel(iter, median, sigma, check_generator<RNG>(gen));
  }
};

// ================================================== LogNormal =======================================================

template<typename RNG>
void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    at::lognormal_distribution<double> logNormal(mean, std);
    cpu_serial_kernel(iter, [&logNormal, generator]() -> scalar_t {
      return static_cast<scalar_t>(logNormal(generator));
    });
  });
}

template<typename RNG>
struct LogNormalKernel {
  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
    log_normal_kernel(iter, mean, std, check_generator<RNG>(gen));
  }
};

// =================================================== Geometric ======================================================

template<typename RNG>
void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) {
  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    at::geometric_distribution<double> geometric(p);
    cpu_serial_kernel(iter, [&geometric, generator]() -> scalar_t {
      return static_cast<scalar_t>(geometric(generator));
    });
  });
}

template<typename RNG>
struct GeometricKernel {
  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
    geometric_kernel(iter, p, check_generator<RNG>(gen));
  }
};

// ================================================== Exponential =====================================================

template<typename RNG>
void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() {
    std::lock_guard<std::mutex> lock(generator->mutex_);
    at::exponential_distribution<double> exponential(lambda);
    cpu_serial_kernel(iter, [&exponential, generator]() -> scalar_t {
      return static_cast<scalar_t>(exponential(generator));
    });
  });
}

template<typename RNG>
struct ExponentialKernel {
  void operator()(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
    exponential_kernel(iter, lambda, check_generator<RNG>(gen));
  }
};

// ================================================== Bernoulli =======================================================
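
// bernoulli_kernel has two overloads: one draws against a per-element
// probability tensor `p_` (copied to CPU and expanded to `self`'s shape via
// expand_inplace, with double-typed probabilities dispatched separately from
// the other floating types), the other draws against a single scalar
// probability `p` over a nullary TensorIterator.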

template<typename RNG>
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) {
  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(generator->mutex_);
    using self_t = scalar_t;
    auto p_cpu = p_.to(kCPU);
    auto p = expand_inplace(self, p_cpu);
    auto iter = TensorIteratorConfig()
        .add_output(self)
        .add_input(*p)
        .check_all_same_dtype(false)
        .build();
    if (p->scalar_type() == kDouble) {
      cpu_serial_kernel(iter, [&](const double p_val) -> self_t {
        at::bernoulli_distribution<double> bernoulli(p_val);
        return static_cast<self_t>(bernoulli(generator));
      });
    } else {
      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
        using p_t = scalar_t;
        cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
          at::bernoulli_distribution<float> bernoulli(p_val);
          return static_cast<self_t>(bernoulli(generator));
        });
      });
    }
  });
}

template<typename RNG>
void bernoulli_kernel(const TensorBase &self, double p, RNG generator) {
  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(generator->mutex_);
    auto iter = TensorIterator::borrowing_nullary_op(self);
    cpu_serial_kernel(iter, [p, generator]() -> scalar_t {
      at::bernoulli_distribution<double> bernoulli(p);
      return static_cast<scalar_t>(bernoulli(generator));
    });
  });
}

template<typename RNG>
struct BernoulliKernel {
  void operator()(const TensorBase &self, double p, c10::optional<Generator> gen) {
    bernoulli_kernel(self, p, check_generator<RNG>(gen));
  }
  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
    bernoulli_kernel(self, p_, check_generator<RNG>(gen));
  }
};

}}}}}