// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

#define ALLOW_EARLY_RETURN 1

static void inline logistic_activate_hwc(
    __local const half *restrict src,
    __local half *restrict dst,
    int offset,
    int stride)
{
    half val             = src[offset];
    val                  = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset * stride] = val;
}

__kernel void region_hwc(
    __global const half *restrict src,
    __global half *restrict dst,
    int W,
    int H,
    int classes,
    int coords,
    int num,
    int maskSize,
    int doSoftmax)
{
    __local half local_src[13 * 13 * (4 + 1 + 80)];
    __local half local_dst[13 * 13 * (4 + 1 + 80)];

    const int pixel_pos = get_local_id(0);

    const int local_C = classes + coords + 1;
    const int c       = get_group_id(1) * local_C;
    const int h       = get_group_id(0);

    num         = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
    const int C = local_C * num;

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        src + h * W * C + c, // src
        local_C, // num_elements_per_line,
        H * W, // num_lines,
        C - local_C, // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e1);

#if ALLOW_EARLY_RETURN
    if (pixel_pos < W * H)
#endif
    {
        const int w = pixel_pos % W;
        const int h = pixel_pos / W;

        __local const half *restrict src = local_src + h * W * local_C + w * local_C;
        __local half *restrict dst       = local_dst + h * W + w;

        const int stride = H * W;
        logistic_activate_hwc(src, dst, 0, stride);
        logistic_activate_hwc(src, dst, 1, stride);

        //copy plane 2 and 3
        dst[2 * stride] = src[2];
        dst[3 * stride] = src[3];

        logistic_activate_hwc(src, dst, 4, stride);

        src += coords + 1;
        dst += (coords + 1) * stride;

        if (doSoftmax) {
            half max_val = src[0];
            #pragma unroll 4
            for (int c = 1; c < classes; c++) {
                max_val = max(max_val, src[c]);
            }

            half expSum = 0.0h;
            #pragma unroll 4
            for (int c = 0; c < classes; c++) {
                const half e    = src[c] - max_val;
                const half tmp  = exp2(e * log_2_e);
                dst[c * stride] = tmp;
                expSum += tmp;
            }

            const half invExpSum = 1.0h / expSum;
            #pragma unroll 4
            for (int c = 0; c < classes; c++) {
                dst[c * stride] *= invExpSum;
            }
        } else {
            #pragma unroll 4
            for (int c = 0; c < classes; c++) {
                logistic_activate_hwc(src, dst, c, stride);
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    const int box_sz = W * H * (classes + coords + 1);
    event_t e2       = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
    wait_group_events(1, &e2);
}