File size: 6,013 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
#include <benchmark/benchmark.h>
#include <fp16.h>
#ifndef EMSCRIPTEN
#include <fp16/psimd.h>
#endif
#include <vector>
#include <random>
#include <chrono>
#include <functional>
#include <algorithm>
#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#endif
/*
 * Benchmark: scalar fp16_alt_to_fp32_bits conversion, one element at a time.
 * state.range(0) is the number of half-precision inputs converted per iteration.
 */
static void fp16_alt_to_fp32_bits(benchmark::State& state) {
    /* Clock-derived seed, so the input data differs from run to run. */
    const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
    auto next_sample = std::bind(std::uniform_int_distribution<uint16_t>(0, 0x7BFF), std::mt19937(seed));

    std::vector<uint16_t> halves(state.range(0));
    std::vector<uint32_t> words(state.range(0));
    /*
     * Every drawn integer is handed to fp16_alt_from_fp32_value to produce one input element.
     * NOTE(review): the 0x7BFF bound looks like a half-precision bit pattern -- confirm that
     * converting the drawn number (rather than using it as raw bits) is intended.
     */
    for (uint16_t& half : halves) {
        half = fp16_alt_from_fp32_value(next_sample());
    }

    while (state.KeepRunning()) {
        uint16_t* src = halves.data();
        /* Route the input pointer through DoNotOptimize before the conversion loop. */
        benchmark::DoNotOptimize(src);

        uint32_t* dst = words.data();
        const size_t count = state.range(0);
        for (size_t idx = 0; idx != count; ++idx) {
            dst[idx] = fp16_alt_to_fp32_bits(src[idx]);
        }

        /* Route the output pointer through DoNotOptimize after the conversion loop. */
        benchmark::DoNotOptimize(dst);
    }

    /* Items processed = iterations x elements per iteration. */
    const int64_t iterations = int64_t(state.iterations());
    const int64_t batch_size = int64_t(state.range(0));
    state.SetItemsProcessed(iterations * batch_size);
}
BENCHMARK(fp16_alt_to_fp32_bits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
/*
 * Benchmark: scalar fp16_alt_to_fp32_value conversion, one element at a time.
 * state.range(0) is the number of half-precision inputs converted per iteration.
 */
static void fp16_alt_to_fp32_value(benchmark::State& state) {
    /* Clock-derived seed, so the input data differs from run to run. */
    const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
    auto next_sample = std::bind(std::uniform_int_distribution<uint16_t>(0, 0x7BFF), std::mt19937(seed));

    std::vector<uint16_t> halves(state.range(0));
    std::vector<float> singles(state.range(0));
    /* Every drawn integer is handed to fp16_alt_from_fp32_value to produce one input element. */
    for (uint16_t& half : halves) {
        half = fp16_alt_from_fp32_value(next_sample());
    }

    while (state.KeepRunning()) {
        uint16_t* src = halves.data();
        /* Route the input pointer through DoNotOptimize before the conversion loop. */
        benchmark::DoNotOptimize(src);

        float* dst = singles.data();
        const size_t count = state.range(0);
        for (size_t idx = 0; idx != count; ++idx) {
            dst[idx] = fp16_alt_to_fp32_value(src[idx]);
        }

        /* Route the output pointer through DoNotOptimize after the conversion loop. */
        benchmark::DoNotOptimize(dst);
    }

    /* Items processed = iterations x elements per iteration. */
    const int64_t iterations = int64_t(state.iterations());
    const int64_t batch_size = int64_t(state.range(0));
    state.SetItemsProcessed(iterations * batch_size);
}
BENCHMARK(fp16_alt_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
#ifndef EMSCRIPTEN
/*
 * Benchmark: fp16_alt_to_fp32_psimd conversion, storing 4 floats per step.
 * state.range(0) is the number of elements; the code below subtracts 4 from it,
 * so it relies on the registered sizes (all >= 1<<10) being at least 4.
 */
static void fp16_alt_to_fp32_psimd(benchmark::State& state) {
    /* Clock-derived seed, so the input data differs from run to run. */
    const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
    auto next_sample = std::bind(std::uniform_int_distribution<uint16_t>(0, 0x7BFF), std::mt19937(seed));

    std::vector<uint16_t> halves(state.range(0));
    std::vector<float> singles(state.range(0));
    /* Every drawn integer is handed to fp16_alt_from_fp32_value to produce one input element. */
    for (uint16_t& half : halves) {
        half = fp16_alt_from_fp32_value(next_sample());
    }

    while (state.KeepRunning()) {
        uint16_t* src = halves.data();
        /* Route the input pointer through DoNotOptimize before the conversion loop. */
        benchmark::DoNotOptimize(src);

        float* dst = singles.data();
        const size_t count = state.range(0);
        /* Index of the first element of the final group of 4. */
        const size_t tail = count - 4;

        /* Main loop: every group of 4 except the last is read with psimd_load_u16. */
        for (size_t pos = 0; pos < tail; pos += 4) {
            const psimd_u16 loaded = psimd_load_u16(&src[pos]);
            psimd_store_f32(&dst[pos], fp16_alt_to_fp32_psimd(loaded));
        }

        /*
         * The last 4 elements are gathered element by element instead of via psimd_load_u16
         * (presumably so the load does not read past the end of the buffer -- TODO confirm).
         */
        const psimd_u16 last_vector = { src[tail], src[tail + 1], src[tail + 2], src[tail + 3] };
        psimd_store_f32(&dst[tail], fp16_alt_to_fp32_psimd(last_vector));

        /* Route the output pointer through DoNotOptimize after the conversion. */
        benchmark::DoNotOptimize(dst);
    }

    /* Items processed = iterations x elements per iteration. */
    const int64_t iterations = int64_t(state.iterations());
    const int64_t batch_size = int64_t(state.range(0));
    state.SetItemsProcessed(iterations * batch_size);
}
BENCHMARK(fp16_alt_to_fp32_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
/*
 * Benchmark: fp16_alt_to_fp32x2_psimd conversion, storing 8 floats (two 4-wide halves) per step.
 * state.range(0) is the number of elements; the loop advances by 8 with no tail handling,
 * so it relies on the registered sizes (powers of two >= 1<<10) being multiples of 8.
 */
static void fp16_alt_to_fp32x2_psimd(benchmark::State& state) {
    /* Clock-derived seed, so the input data differs from run to run. */
    const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
    auto next_sample = std::bind(std::uniform_int_distribution<uint16_t>(0, 0x7BFF), std::mt19937(seed));

    std::vector<uint16_t> halves(state.range(0));
    std::vector<float> singles(state.range(0));
    /* Every drawn integer is handed to fp16_alt_from_fp32_value to produce one input element. */
    for (uint16_t& half : halves) {
        half = fp16_alt_from_fp32_value(next_sample());
    }

    while (state.KeepRunning()) {
        uint16_t* src = halves.data();
        /* Route the input pointer through DoNotOptimize before the conversion loop. */
        benchmark::DoNotOptimize(src);

        float* dst = singles.data();
        const size_t count = state.range(0);
        for (size_t pos = 0; pos < count; pos += 8) {
            const psimd_u16 loaded = psimd_load_u16(&src[pos]);
            const psimd_f32x2 converted = fp16_alt_to_fp32x2_psimd(loaded);
            /* Low half goes to the first 4 outputs, high half to the next 4. */
            psimd_store_f32(&dst[pos], converted.lo);
            psimd_store_f32(&dst[pos + 4], converted.hi);
        }

        /* Route the output pointer through DoNotOptimize after the conversion loop. */
        benchmark::DoNotOptimize(dst);
    }

    /* Items processed = iterations x elements per iteration. */
    const int64_t iterations = int64_t(state.iterations());
    const int64_t batch_size = int64_t(state.range(0));
    state.SetItemsProcessed(iterations * batch_size);
}
BENCHMARK(fp16_alt_to_fp32x2_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
#endif
#if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
/*
 * Benchmark: NEON hardware half->single conversion (vcvt_f32_f16), 4 elements per step,
 * with the floating-point control register switched to the Alternative FP16 format
 * around the conversion loop.
 *
 * state.range(0) is the number of elements; the loop advances by 4 with no tail handling,
 * so it relies on the registered sizes (powers of two >= 1<<10) being multiples of 4.
 *
 * The control register is saved, modified and restored inside every timed iteration,
 * so that cost is part of the measurement.
 */
static void hardware_vcvt_f32_f16(benchmark::State& state) {
    /*
     * Control bits, at the same positions in AArch64 FPCR and AArch32 FPSCR:
     *   bit 24 - FZ  (flush-to-zero)
     *   bit 26 - AHP (alternative half-precision format)
     * The previous mask 0x08000000u set bit 27 rather than bit 26, so the
     * alternative format was never actually enabled.
     */
    const unsigned int flush_to_zero_bit = 1u << 24;
    const unsigned int alt_half_precision_bit = 1u << 26;

    /* Clock-derived seed, so the input data differs from run to run. */
    const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
    auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));

    std::vector<uint16_t> fp16(state.range(0));
    std::vector<float> fp32(state.range(0));
    /* Inputs are random floats drawn from the [-1, 1) distribution, converted with fp16_ieee_from_fp32_value. */
    std::generate(fp16.begin(), fp16.end(),
        [&rng]{ return fp16_ieee_from_fp32_value(rng()); });

    while (state.KeepRunning()) {
        uint16_t* input = fp16.data();
        /* Route the input pointer through DoNotOptimize before the conversion loop. */
        benchmark::DoNotOptimize(input);

        float* output = fp32.data();
        const size_t n = state.range(0);

        #if defined(__aarch64__)
            /* Save FPCR so it can be restored after the loop. */
            const unsigned int fpcr = __builtin_aarch64_get_fpcr();
            /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
            __builtin_aarch64_set_fpcr((fpcr & ~flush_to_zero_bit) | alt_half_precision_bit);
        #else
            /* Save FPSCR so it can be restored after the loop. */
            unsigned int fpscr;
            __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
            /* Disable flush-to-zero (bit 24) and enable Alternative FP16 format (bit 26) */
            __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
                : [fpscr] "r" ((fpscr & ~flush_to_zero_bit) | alt_half_precision_bit));
        #endif

        for (size_t i = 0; i < n; i += 4) {
            /* Load 4 half-precision bit patterns, reinterpret as float16x4_t, widen to 4 floats. */
            vst1q_f32(&output[i],
                vcvt_f32_f16(
                    (float16x4_t) vld1_u16(&input[i])));
        }

        /* Restore the control register saved above. */
        #if defined(__aarch64__)
            __builtin_aarch64_set_fpcr(fpcr);
        #else
            __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
        #endif

        /* Route the output pointer through DoNotOptimize after the conversion loop. */
        benchmark::DoNotOptimize(output);
    }

    /* Items processed = iterations x elements per iteration. */
    state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
}
BENCHMARK(hardware_vcvt_f32_f16)->RangeMultiplier(2)->Range(1<<10, 64<<20);
#endif
/* Google Benchmark macro that supplies the program entry point for the benchmarks registered in this file. */
BENCHMARK_MAIN();
|