// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_OPTIMIZED_ROUND

#ifdef USE_OPTIMIZED_ROUND
#define ROUND(x) ((int)((x) + 0.5f))
#else
#define ROUND(x) (int)(round(x))
#endif

inline int out_to_in(float ox, float f)
{
#ifdef USE_OPTIMIZED_ROUND
    return (int)((ox + 0.5f) / f);
#else
    return ROUND((ox + 0.5f) / f - 0.5f);
#endif
}

static inline float triangleCoeff(float x) { return 1.0f - fabs(x); }

static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); }

__kernel void resample_with_antialias(
    __global const half *restrict src,
    __global half *restrict dst,
    int iw,
    int ih,
    float factor,
    int ow,
    int oh,
    int channels)
{
    __local half local_src[20 * 1024];
    __local half local_dst[8 * 1024];

    const int r        = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
    const int iy_last  = min(out_to_in(oy_last, factor) + r, ih - 1);
    const int iy_size  = iy_last - iy_first + 1;

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
        iy_size * iw, // num_elements_per_line,
        get_local_size(2), // num_lines,
        (ih - iy_size) * iw, // src_line_stride,
        0, // dst_line_stride,
        0);
    wait_group_events(1, &e1);

    const int oy     = get_global_id(1);
    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
    const int iy     = ROUND(iy_f);

    __local half const *restrict start_src =
        local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
    __local half *restrict start_dst =
        local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);

    for (int ox = 0; ox < ow; ox++) {
        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
        const int ix_i   = ROUND(ix_f);

        float4 v_sum  = 0.f;
        float4 v_wsum = 0.f;
        for (int y = 0; y < iy_size; y++) {
            float dy  = iy_f - y;
            int x     = max(ix_i - r, 0);
            int end_x = min(ix_i + r, iw - 1);

            float4 dx;
            for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i;

            for (; x < end_x - 3; x += 4, dx -= 4) {
                float4 w =
                    factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
                float4 src_vec = {
                    start_src[y * iw + x + 0],
                    start_src[y * iw + x + 1],
                    start_src[y * iw + x + 2],
                    start_src[y * iw + x + 3]};

                v_sum += w * src_vec;
                v_wsum += w;
            }

            for (; x <= end_x; x++) {
                float dx = ix_f - x;
                float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);

                v_sum[0] += w * start_src[y * iw + x];
                v_wsum[0] += w;
            }
        }

        v_sum[0]  = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];

        start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
            + get_group_id(1) * get_local_size(1) * ow, // dst
        local_dst, // src
        get_local_size(1) * ow, // num_elements_per_line,
        get_local_size(2), // num_lines,
        0, // src_line_stride,
        (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride,
        0);
    wait_group_events(1, &e2);
}