Spaces:
Runtime error
Runtime error
// Copyright (C) 2018-2022 Intel Corporation | |
// SPDX-License-Identifier: Apache-2.0 | |
// | |
#pragma OPENCL EXTENSION cl_khr_fp16 : enable | |
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable | |
// Set to 1 only if output is zerroed before kernel execution | |
#define USE_ATOMICS 0 | |
void atomic_add_global(volatile __global float *source, const float operand) | |
{ | |
union { | |
unsigned int intVal; | |
float floatVal; | |
} newVal; | |
union { | |
unsigned int intVal; | |
float floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void reduction_mean( | |
__global const half *restrict src, | |
__global float *restrict mean, | |
__global float *restrict variance, | |
int W, | |
int H, | |
int across_channels) | |
{ | |
__local half src_line[4 * 1024]; | |
event_t e; | |
e = async_work_group_copy_2D2D( | |
src_line, // dst | |
src + get_group_id(1) * get_local_size(1) * W | |
+ get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src | |
W * get_local_size(1), // num_elements_per_line, | |
get_local_size(2), // num_lines, | |
W * (get_global_size(1) - get_local_size(1)), // src_line_stride, | |
0, // dst_line_stride, | |
0); | |
wait_group_events(1, &e); | |
int h = get_global_id(1); | |
int c = get_global_id(2); | |
const int MAX_LOCAL_SIZE = 8; | |
__local float mbuf[MAX_LOCAL_SIZE]; | |
__local float vbuf[MAX_LOCAL_SIZE]; | |
mbuf[get_local_id(1)] = 0; | |
vbuf[get_local_id(1)] = 0; | |
if (h < H) { | |
float sum = 0.f; | |
float sum2 = 0.f; | |
float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; | |
float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; | |
const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W)); | |
#pragma unroll 16 | |
for (size_t w = 0; w < W / 8; w++) { | |
half8 sh = lsrc[w]; | |
float8 valf = convert_float8(sh); | |
sum4 += valf; | |
sum24 += valf * valf; | |
} | |
for (size_t w = W / 8 * 8; w < W; w++) { | |
float val = (float)src_line[get_local_id(1) * W + w]; | |
sum += val; | |
sum2 += val * val; | |
} | |
mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; | |
vbuf[get_local_id(1)] = | |
sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (get_local_id(1) == 0) { | |
float res = 0; | |
float res2 = 0; | |
for (int i = 0; i < get_local_size(1); i++) { | |
res += mbuf[i]; | |
res2 += vbuf[i]; | |
} | |
// requires memory reset before layer execution | |
#if USE_ATOMICS | |
int idx = (across_channels == 0) ? c : 0; | |
atomic_add_global(mean + idx, res); | |
atomic_add_global(variance + idx, res2); | |
#else | |
int idx = c * get_num_groups(1) + get_group_id(1); | |
mean[idx] = res; | |
variance[idx] = res2; | |
#endif | |
} | |
} | |