|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
|
|
#define USE_ATOMICS 0 |
|
|
|
__attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale( |
|
const __global half *restrict src, |
|
__global float *restrict mean_part, |
|
__global float *restrict power_mean, |
|
__global half *restrict dst, |
|
int W, |
|
int H1, |
|
int across_channels, |
|
int normalize_variance, |
|
int nparts) |
|
{ |
|
__local half src_line[4 * 1024]; |
|
__local half dst_line[4 * 1024]; |
|
|
|
int c = get_group_id(2); |
|
int C = get_global_size(2); |
|
|
|
int h = get_group_id(1); |
|
int H = get_global_size(1); |
|
|
|
event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0); |
|
wait_group_events(1, &e1); |
|
|
|
int idx = (across_channels == 0) ? nparts * c : 0; |
|
float scale = (across_channels == 0) ? H * W : H * W * C; |
|
|
|
#if USE_ATOMICS |
|
float mean = mean_part[idx]; |
|
float variance = power_mean[idx]; |
|
#else |
|
|
|
int total = (across_channels == 0) ? nparts : nparts * C; |
|
float mean = 0.f; |
|
float variance = 0.f; |
|
|
|
for (int i = 0; i < total; i++) { |
|
mean += mean_part[idx + i]; |
|
variance += power_mean[idx + i]; |
|
} |
|
#endif |
|
|
|
mean = mean / scale; |
|
variance = variance / scale; |
|
variance = variance - mean * mean; |
|
variance = native_sqrt(variance) + 1e-9f; |
|
|
|
half hmean = mean; |
|
half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance); |
|
|
|
for (size_t w = 0; w < W; w++) { |
|
dst_line[w] = (src_line[w] - hmean) * hvariance; |
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0); |
|
wait_group_events(1, &e2); |
|
} |
|
|