// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Set to 1 only if output is zerroed before kernel execution
#define USE_ATOMICS 0

void atomic_add_global(volatile __global float *source, const float operand)
{
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;

    do {
        prevVal.floatVal = *source;
        newVal.floatVal  = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

__kernel void reduction_mean(
    __global const half *restrict src,
    __global float *restrict mean,
    __global float *restrict variance,
    int W,
    int H,
    int across_channels)
{
    __local half src_line[4 * 1024];
    event_t e;

    e = async_work_group_copy_2D2D(
        src_line, // dst
        src + get_group_id(1) * get_local_size(1) * W
            + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
        W * get_local_size(1), // num_elements_per_line,
        get_local_size(2), // num_lines,
        W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e);

    int h = get_global_id(1);
    int c = get_global_id(2);

    const int MAX_LOCAL_SIZE = 8;

    __local float mbuf[MAX_LOCAL_SIZE];
    __local float vbuf[MAX_LOCAL_SIZE];

    mbuf[get_local_id(1)] = 0;
    vbuf[get_local_id(1)] = 0;

    if (h < H) {
        float sum  = 0.f;
        float sum2 = 0.f;

        float8 sum4  = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

        const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W));

        #pragma unroll 16
        for (size_t w = 0; w < W / 8; w++) {
            half8 sh    = lsrc[w];
            float8 valf = convert_float8(sh);

            sum4 += valf;
            sum24 += valf * valf;
        }

        for (size_t w = W / 8 * 8; w < W; w++) {
            float val = (float)src_line[get_local_id(1) * W + w];
            sum += val;
            sum2 += val * val;
        }

        mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
        vbuf[get_local_id(1)] =
            sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(1) == 0) {
        float res  = 0;
        float res2 = 0;

        for (int i = 0; i < get_local_size(1); i++) {
            res += mbuf[i];
            res2 += vbuf[i];
        }

// requires memory reset before layer execution
#if USE_ATOMICS
        int idx = (across_channels == 0) ? c : 0;

        atomic_add_global(mean + idx, res);
        atomic_add_global(variance + idx, res2);
#else
        int idx = c * get_num_groups(1) + get_group_id(1);

        mean[idx]     = res;
        variance[idx] = res2;
#endif
    }
}