Spaces:
Running
Running
// Copyright (C) 2018-2022 Intel Corporation | |
// SPDX-License-Identifier: Apache-2.0 | |
// | |
__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0)) | |
static void inline logistic_activate_hwc( | |
__local const half *restrict src, | |
__local half *restrict dst, | |
int offset, | |
int stride) | |
{ | |
half val = src[offset]; | |
val = 1.0h / (1.0h + exp2(val * -log_2_e)); | |
dst[offset * stride] = val; | |
} | |
__kernel void region_hwc( | |
__global const half *restrict src, | |
__global half *restrict dst, | |
int W, | |
int H, | |
int classes, | |
int coords, | |
int num, | |
int maskSize, | |
int doSoftmax) | |
{ | |
__local half local_src[13 * 13 * (4 + 1 + 80)]; | |
__local half local_dst[13 * 13 * (4 + 1 + 80)]; | |
const int pixel_pos = get_local_id(0); | |
const int local_C = classes + coords + 1; | |
const int c = get_group_id(1) * local_C; | |
const int h = get_group_id(0); | |
num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize; | |
const int C = local_C * num; | |
event_t e1 = async_work_group_copy_2D2D( | |
local_src, // dst | |
src + h * W * C + c, // src | |
local_C, // num_elements_per_line, | |
H * W, // num_lines, | |
C - local_C, // src_line_stride, | |
0, // dst_line_stride, | |
0); | |
wait_group_events(1, &e1); | |
if (pixel_pos < W * H) | |
{ | |
const int w = pixel_pos % W; | |
const int h = pixel_pos / W; | |
__local const half *restrict src = local_src + h * W * local_C + w * local_C; | |
__local half *restrict dst = local_dst + h * W + w; | |
const int stride = H * W; | |
logistic_activate_hwc(src, dst, 0, stride); | |
logistic_activate_hwc(src, dst, 1, stride); | |
//copy plane 2 and 3 | |
dst[2 * stride] = src[2]; | |
dst[3 * stride] = src[3]; | |
logistic_activate_hwc(src, dst, 4, stride); | |
src += coords + 1; | |
dst += (coords + 1) * stride; | |
if (doSoftmax) { | |
half max_val = src[0]; | |
for (int c = 1; c < classes; c++) { | |
max_val = max(max_val, src[c]); | |
} | |
half expSum = 0.0h; | |
for (int c = 0; c < classes; c++) { | |
const half e = src[c] - max_val; | |
const half tmp = exp2(e * log_2_e); | |
dst[c * stride] = tmp; | |
expSum += tmp; | |
} | |
const half invExpSum = 1.0h / expSum; | |
for (int c = 0; c < classes; c++) { | |
dst[c * stride] *= invExpSum; | |
} | |
} else { | |
for (int c = 0; c < classes; c++) { | |
logistic_activate_hwc(src, dst, c, stride); | |
} | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
const int box_sz = W * H * (classes + coords + 1); | |
event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0); | |
wait_group_events(1, &e2); | |
} | |