// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Set to 1 only if output is zerroed before kernel execution
#define USE_ATOMICS 0

__attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale(
    const __global half *restrict src,
    __global float *restrict mean_part,
    __global float *restrict power_mean,
    __global half *restrict dst,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts)
{
    __local half src_line[4 * 1024];
    __local half dst_line[4 * 1024];

    int c = get_group_id(2);
    int C = get_global_size(2);

    int h = get_group_id(1);
    int H = get_global_size(1);

    event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0);
    wait_group_events(1, &e1);

    int idx     = (across_channels == 0) ? nparts * c : 0;
    float scale = (across_channels == 0) ? H * W : H * W * C;

#if USE_ATOMICS
    float mean     = mean_part[idx];
    float variance = power_mean[idx];
#else

    int total      = (across_channels == 0) ? nparts : nparts * C;
    float mean     = 0.f;
    float variance = 0.f;

    for (int i = 0; i < total; i++) {
        mean += mean_part[idx + i];
        variance += power_mean[idx + i];
    }
#endif

    mean     = mean / scale;
    variance = variance / scale;
    variance = variance - mean * mean;
    variance = native_sqrt(variance) + 1e-9f;

    half hmean     = mean;
    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);

    for (size_t w = 0; w < W; w++) {
        dst_line[w] = (src_line[w] - hmean) * hvariance;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0);
    wait_group_events(1, &e2);
}