#include <cstdint>

#ifdef GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <algorithm>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"

#include "merge_semantic_and_instance_maps_op_kernel.h"

namespace tensorflow_models {
namespace deeplab {
namespace deeplab2 {

namespace functor {

namespace {

using ::tensorflow::CudaGridRangeX;
using ::tensorflow::GetGpuLaunchConfig;
using ::tensorflow::GpuLaunchConfig;
using ::tensorflow::Tensor;
using ::tensorflow::TTypes;

using GPUDevice = ::Eigen::GpuDevice;

// Maximum numbers of instances and semantic classes supported by the GPU
// kernels. Labels at or above these bounds are clamped to the last bucket.
constexpr int32_t kMaxNumInstance = 1024;
constexpr int32_t kMaxNumSemantic = 256;

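// CUDA kernel that fills the first `num_threads` elements of `x` with
// `value`; used below to zero the scratch counters before each batch.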
template <typename T>
__global__ void SetToValue(const int num_threads, const T value, T* x) {
  for (int idx : CudaGridRangeX(num_threads)) {
    x[idx] = value;
  }
}

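// CUDA kernel that gathers per-pixel statistics. For every pixel it records
// whether its (clamped) semantic label is a "thing" class, and atomically
// accumulates either the pixel count of its (instance, semantic) pair or the
// total area of its "stuff" class.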
__global__ void CollectPixelStats(const int num_threads,
                                  const int32_t* semantic_data,
                                  const int32_t* instance_data,
                                  const bool* is_thing_per_semantic_id,
                                  bool* is_thing_per_pixel,
                                  int32_t* semantic_count_per_instance,
                                  int32_t* stuff_area) {
  for (int idx : CudaGridRangeX(num_threads)) {
    // Labels beyond the supported maxima are clamped to the last bucket.
    const int32_t semantic_label =
        std::min(semantic_data[idx], kMaxNumSemantic - 1);
    const int32_t instance_label =
        std::min(instance_data[idx], kMaxNumInstance - 1);
    const bool is_thing = is_thing_per_semantic_id[semantic_label];
    is_thing_per_pixel[idx] = is_thing;

    const int offset = instance_label * kMaxNumSemantic + semantic_label;
    if (is_thing) {
      tensorflow::CudaAtomicAdd(semantic_count_per_instance + offset, 1);
    } else {
      tensorflow::CudaAtomicAdd(stuff_area + semantic_label, 1);
    }
  }
}

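// CUDA kernel that writes the merged panoptic prediction. A "thing" pixel
// receives semantic_label * label_divisor + instance_label, using the
// per-instance labels computed on the host; a "stuff" pixel receives
// semantic_label * label_divisor, or void_label * label_divisor if the limit
// is positive and its class area falls at or below stuff_area_limit.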
__global__ void MergePredictions(
    const int num_threads, const int32_t* semantic_data,
    const int32_t* instance_data, const bool* is_thing_per_pixel,
    const int32_t* stuff_area, const int32_t* labels_per_instance,
    const int32_t stuff_area_limit, const int32_t label_divisor,
    const int32_t void_label, int32_t* parsing_maps) {
  for (int idx : CudaGridRangeX(num_threads)) {
    const int32_t semantic_label =
        std::min(semantic_data[idx], kMaxNumSemantic - 1);
    const int32_t instance_label =
        std::min(instance_data[idx], kMaxNumInstance - 1);
    const int32_t is_thing = static_cast<int32_t>(is_thing_per_pixel[idx]);

    // "Thing" pixels take the majority semantic label and the re-ordered
    // instance index assigned to their instance.
    const int32_t semantic_label_if_is_thing =
        labels_per_instance[instance_label * 2];
    const int32_t instance_label_if_is_thing =
        labels_per_instance[instance_label * 2 + 1];
    const int32_t panoptic_label_if_is_thing =
        semantic_label_if_is_thing * label_divisor +
        instance_label_if_is_thing;

    // "Stuff" pixels whose class area is at most stuff_area_limit are
    // relabeled as void.
    const int32_t is_void = static_cast<int32_t>(
        stuff_area_limit > 0 &&
        stuff_area[semantic_label] <= stuff_area_limit);
    const int32_t semantic_label_if_is_stuff =
        is_void * void_label + (1 - is_void) * semantic_label;

    // Select between the thing and stuff labels without branching.
    parsing_maps[idx] =
        is_thing * panoptic_label_if_is_thing +
        (1 - is_thing) * (semantic_label_if_is_stuff * label_divisor);
  }
}

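// Host helper that assigns, for each instance, (1) its majority semantic
// label, i.e. the semantic class covering most of its pixels, and (2) a
// 1-based instance index counted within that semantic class. The results are
// written back to device memory as pairs in `labels_per_instance`.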
void CreateLabelsPerInstance(const GPUDevice& d,
                             const int32_t* semantic_count_per_instance,
                             int32_t* labels_per_instance) {
  // Copy the per-(instance, semantic) pixel counts to the host.
  std::vector<int32_t> semantic_count_per_instance_host(kMaxNumInstance *
                                                        kMaxNumSemantic);
  d.memcpyDeviceToHost(semantic_count_per_instance_host.data(),
                       semantic_count_per_instance,
                       kMaxNumInstance * kMaxNumSemantic * sizeof(int32_t));

  // Entry 2 * i holds the majority semantic label of instance i; entry
  // 2 * i + 1 holds its instance index within that semantic class.
  std::vector<int32_t> labels_per_instance_host(kMaxNumInstance * 2);

  std::unordered_map<int32_t, int32_t> instance_count_per_semantic_class;
  for (int i = 0; i < kMaxNumInstance; ++i) {
    // Find the semantic label with the largest pixel count in this instance.
    int max_pixel_count = 0;
    int max_semantic_label = -1;
    for (int j = 0; j < kMaxNumSemantic; ++j) {
      const int current_count =
          semantic_count_per_instance_host[i * kMaxNumSemantic + j];
      if (current_count > max_pixel_count) {
        max_semantic_label = j;
        max_pixel_count = current_count;
      }
    }

    labels_per_instance_host[2 * i] = std::max(0, max_semantic_label);
    if (max_semantic_label >= 0) {
      labels_per_instance_host[2 * i + 1] =
          ++instance_count_per_semantic_class[max_semantic_label];
    } else {
      labels_per_instance_host[2 * i + 1] = 0;
    }
  }

  d.memcpyHostToDevice(labels_per_instance, labels_per_instance_host.data(),
                       kMaxNumInstance * 2 * sizeof(int32_t));
}

}  // namespace

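// GPU specialization: copies the 1-D int32 tensor contents to the host and
// builds an unordered_set from its values.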
template <>
std::unordered_set<int32_t> Convert1DInt32TensorToSet(const GPUDevice& d,
                                                      const Tensor& tensor) {
  const int n_vals = tensor.dim_size(0);
  std::vector<int32_t> host_buffer(n_vals);
  d.memcpyDeviceToHost(host_buffer.data(), tensor.tensor<int32_t, 1>().data(),
                       n_vals * sizeof(int32_t));

  return std::unordered_set<int32_t>(host_buffer.begin(), host_buffer.end());
}

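// GPU implementation of the functor that merges semantic and instance maps
// into panoptic ("parsing") maps, one batch element at a time.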
template <>
void MergeSemanticAndInstanceMaps<GPUDevice>::operator()(
    const GPUDevice& d, typename TTypes<int32_t, 3>::ConstTensor semantic_maps,
    typename TTypes<int32_t, 3>::ConstTensor instance_maps,
    const std::unordered_set<int32_t>& thing_ids_set, int label_divisor,
    int stuff_area_limit, int void_label,
    typename TTypes<int32_t, 3>::Tensor parsing_maps) {
  const int num_batches = semantic_maps.dimension(0);
  const int height = semantic_maps.dimension(1);
  const int width = semantic_maps.dimension(2);

  // Expand the set of "thing" ids into a lookup table and copy it to device
  // memory.
  bool is_thing_per_semantic_id[kMaxNumSemantic];
  for (int i = 0; i < kMaxNumSemantic; ++i) {
    is_thing_per_semantic_id[i] =
        (thing_ids_set.find(i) != thing_ids_set.end());
  }
  bool* is_thing_per_semantic_id_device =
      reinterpret_cast<bool*>(d.allocate_temp(kMaxNumSemantic * sizeof(bool)));
  d.memcpyHostToDevice(is_thing_per_semantic_id_device,
                       is_thing_per_semantic_id,
                       kMaxNumSemantic * sizeof(bool));

  // Scratch buffers, reused across batches.
  bool* is_thing_per_pixel_device =
      reinterpret_cast<bool*>(d.allocate_temp(height * width * sizeof(bool)));
  int32_t* semantic_count_per_instance_device = reinterpret_cast<int32_t*>(
      d.allocate_temp(kMaxNumInstance * kMaxNumSemantic * sizeof(int32_t)));
  int32_t* stuff_area_device = reinterpret_cast<int32_t*>(
      d.allocate_temp(kMaxNumSemantic * sizeof(int32_t)));
  int32_t* labels_per_instance_device = reinterpret_cast<int32_t*>(
      d.allocate_temp(kMaxNumInstance * 2 * sizeof(int32_t)));

  GpuLaunchConfig config;
  int total_count = 0;
  for (int b = 0; b < num_batches; ++b) {
    const int batch_offset = b * height * width;

    // Reset the per-batch counters to zero.
    total_count = kMaxNumInstance * kMaxNumSemantic;
    config = GetGpuLaunchConfig(total_count, d);
    SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, 0, semantic_count_per_instance_device);

    total_count = kMaxNumSemantic;
    config = GetGpuLaunchConfig(total_count, d);
    SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
        config.virtual_thread_count, 0, stuff_area_device);

    // Accumulate per-pixel statistics: thing/stuff flags, pixel counts per
    // (instance, semantic) pair, and stuff areas.
    total_count = height * width;
    config = GetGpuLaunchConfig(total_count, d);
    CollectPixelStats<<<config.block_count, config.thread_per_block, 0,
                        d.stream()>>>(
        config.virtual_thread_count, semantic_maps.data() + batch_offset,
        instance_maps.data() + batch_offset, is_thing_per_semantic_id_device,
        is_thing_per_pixel_device, semantic_count_per_instance_device,
        stuff_area_device);

    // Resolve each instance's majority semantic label and per-class instance
    // index on the host.
    CreateLabelsPerInstance(d, semantic_count_per_instance_device,
                            labels_per_instance_device);

    // Write the final panoptic labels for this batch.
    total_count = height * width;
    config = GetGpuLaunchConfig(total_count, d);
    MergePredictions<<<config.block_count, config.thread_per_block, 0,
                       d.stream()>>>(
        config.virtual_thread_count, semantic_maps.data() + batch_offset,
        instance_maps.data() + batch_offset, is_thing_per_pixel_device,
        stuff_area_device, labels_per_instance_device, stuff_area_limit,
        label_divisor, void_label, parsing_maps.data() + batch_offset);
  }

  d.deallocate_temp(is_thing_per_semantic_id_device);
  d.deallocate_temp(is_thing_per_pixel_device);
  d.deallocate_temp(semantic_count_per_instance_device);
  d.deallocate_temp(stuff_area_device);
  d.deallocate_temp(labels_per_instance_device);
}

}  // namespace functor

}  // namespace deeplab2
}  // namespace deeplab
}  // namespace tensorflow_models

#endif  // GOOGLE_CUDA