// deeplab2/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cu.cc
// Copyright 2021 The Deeplab2 Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdint>
#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include /*third_party*/"tensorflow/core/framework/op_kernel.h"
#include /*third_party*/"tensorflow/core/framework/register_types.h"
#include /*third_party*/"tensorflow/core/framework/tensor.h"
#include /*third_party*/"tensorflow/core/framework/tensor_shape.h"
#include /*third_party*/"tensorflow/core/framework/types.h"
#include /*third_party*/"tensorflow/core/util/gpu_kernel_helper.h"
#include /*third_party*/"merge_semantic_and_instance_maps_op_kernel.h" // local headers
namespace tensorflow_models {
namespace deeplab {
namespace deeplab2 {
namespace functor {
namespace {
using ::tensorflow::CudaGridRangeX;
using ::tensorflow::GetGpuLaunchConfig;
using ::tensorflow::GpuLaunchConfig;
using ::tensorflow::Tensor;
using ::tensorflow::TTypes;
using GPUDevice = ::Eigen::GpuDevice;
// Maximum number of instances and semantic classes. We default to
// 1024 and 256, respectively. Increase the values if your dataset
// contains more instances per image or more semantic classes.
constexpr int32_t kMaxNumInstance = 1024;
constexpr int32_t kMaxNumSemantic = 256;
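// With these defaults, the per-instance semantic counter buffer
// (semantic_count_per_instance below) occupies kMaxNumInstance *
// kMaxNumSemantic * sizeof(int32_t) = 1024 * 256 * 4 bytes = 1 MiB of
// device scratch memory.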
// CUDA kernel that initializes memory with a constant value.
template <typename T>
__global__ void SetToValue(const int num_threads, const T value, T* x) {
for (int idx : CudaGridRangeX(num_threads)) {
x[idx] = value;
}
}
// CUDA kernel that goes over each pixel and collects the following stats:
// 1. Whether this pixel belongs to a "thing" class.
// 2. The semantic label count inside each instance.
// 3. The total pixel area of each "stuff" class.
// Size of each GPU array:
// semantic_data: [height * width]
// instance_data: [height * width]
// is_thing_per_semantic_id: [kMaxNumSemantic]
// is_thing_per_pixel: [height * width]
// semantic_count_per_instance: [kMaxNumInstance * kMaxNumSemantic]
// stuff_area: [kMaxNumSemantic]
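// Counters are updated with CudaAtomicAdd, since many pixels (handled by
// different CUDA threads) can map to the same instance or the same stuff
// class.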
__global__ void CollectPixelStats(const int num_threads,
const int32_t* semantic_data,
const int32_t* instance_data,
const bool* is_thing_per_semantic_id,
bool* is_thing_per_pixel,
int32_t* semantic_count_per_instance,
int32_t* stuff_area) {
for (int idx : CudaGridRangeX(num_threads)) {
const int32_t semantic_label =
std::min(semantic_data[idx], kMaxNumSemantic - 1);
const int32_t instance_label =
std::min(instance_data[idx], kMaxNumInstance - 1);
const bool is_thing = is_thing_per_semantic_id[semantic_label];
is_thing_per_pixel[idx] = is_thing;
const int offset = instance_label * kMaxNumSemantic + semantic_label;
if (is_thing) {
tensorflow::CudaAtomicAdd(semantic_count_per_instance + offset, 1);
} else {
tensorflow::CudaAtomicAdd(stuff_area + semantic_label, 1);
}
}
}
// CUDA kernel that merges semantic and instance predictions into a panoptic
// map. Merging rules:
// 1. For a "thing" class, its instance label is re-ordered, and its semantic
//    label is the majority semantic label inside this instance.
// 2. For a "stuff" class, its instance label is 0, and its semantic label is
//    a) the void label, if its total area does not exceed stuff_area_limit
//    (and the limit is positive), or b) its original semantic label otherwise.
// Size of each GPU array:
// semantic_data: [height * width]
// instance_data: [height * width]
// is_thing_per_semantic_id: [kMaxNumSemantic]
// is_thing_per_pixel: [height * width]
// stuff_area: [kMaxNumSemantic]
// labels_per_instance: [kMaxNumInstance * 2]
// parsing_maps: [height * width]
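// Example (illustrative values): with label_divisor = 1000, a "thing" pixel
// whose instance was assigned (semantic label 11, re-ordered instance ID 2)
// gets panoptic label 11 * 1000 + 2 = 11002, while a "stuff" pixel of class 23
// whose area exceeds stuff_area_limit keeps panoptic label 23 * 1000 = 23000.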
__global__ void MergePredictions(
const int num_threads, const int32_t* semantic_data,
const int32_t* instance_data, const bool* is_thing_per_pixel,
const int32_t* stuff_area, const int32_t* labels_per_instance,
const int32_t stuff_area_limit, const int32_t label_divisor,
const int32_t void_label, int32_t* parsing_maps) {
for (int idx : CudaGridRangeX(num_threads)) {
const int32_t semantic_label =
std::min(semantic_data[idx], kMaxNumSemantic - 1);
const int32_t instance_label =
std::min(instance_data[idx], kMaxNumInstance - 1);
const int32_t is_thing = static_cast<int32_t>(is_thing_per_pixel[idx]);
const int32_t semantic_label_if_is_thing =
labels_per_instance[instance_label * 2];
const int32_t instance_label_if_is_thing =
labels_per_instance[instance_label * 2 + 1];
const int32_t panoptic_label_if_is_thing =
semantic_label_if_is_thing * label_divisor + instance_label_if_is_thing;
const int32_t is_void = static_cast<int32_t>(
stuff_area_limit > 0 && stuff_area[semantic_label] <= stuff_area_limit);
const int32_t semantic_label_if_is_stuff =
is_void * void_label + (1 - is_void) * semantic_label;
parsing_maps[idx] =
is_thing * panoptic_label_if_is_thing +
(1 - is_thing) * (semantic_label_if_is_stuff * label_divisor);
}
}
// Generates the semantic and instance labels for each predicted instance.
// Size of each GPU array:
// semantic_count_per_instance: [kMaxNumInstance * kMaxNumSemantic]
// labels_per_instance: [kMaxNumInstance * 2]
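// Example (illustrative counts): if instance 5 covers 300 pixels predicted as
// class 13 and 20 pixels predicted as class 8, its new semantic label is 13
// (the majority vote), and its new instance label is the running count of
// instances already assigned to class 13, starting from 1.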
void CreateLabelsPerInstance(const GPUDevice& d,
const int32_t* semantic_count_per_instance,
int32_t* labels_per_instance) {
std::vector<int32_t> semantic_count_per_instance_host(kMaxNumInstance *
kMaxNumSemantic);
d.memcpyDeviceToHost(semantic_count_per_instance_host.data(),
semantic_count_per_instance,
kMaxNumInstance * kMaxNumSemantic * sizeof(int32_t));
// A flat 2D array with shape [kMaxNumInstance, 2], where each row
// represents (new semantic label, new instance label) for each instance.
std::vector<int32_t> labels_per_instance_host(kMaxNumInstance * 2);
  // Maps semantic_label -> number of instances already assigned to that
  // semantic class (i.e., the largest instance label used for it so far).
std::unordered_map<int32_t, int32_t> instance_count_per_semantic_class;
for (int i = 0; i < kMaxNumInstance; ++i) {
int max_pixel_count = 0;
int max_semantic_label = -1;
for (int j = 0; j < kMaxNumSemantic; ++j) {
const int current_count =
semantic_count_per_instance_host[i * kMaxNumSemantic + j];
if (current_count > max_pixel_count) {
max_semantic_label = j;
max_pixel_count = current_count;
}
}
labels_per_instance_host[2 * i] = std::max(0, max_semantic_label);
if (max_semantic_label >= 0) {
labels_per_instance_host[2 * i + 1] =
++instance_count_per_semantic_class[max_semantic_label];
} else {
labels_per_instance_host[2 * i + 1] = 0;
}
}
d.memcpyHostToDevice(labels_per_instance, labels_per_instance_host.data(),
kMaxNumInstance * 2 * sizeof(int32_t));
}
} // namespace
// Specialization of Convert1DInt32TensorToSet for GPU.
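// The 1-D int32 tensor is copied from device memory into a host buffer, and
// the unordered set is built from that buffer.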
template <>
std::unordered_set<int32_t> Convert1DInt32TensorToSet(const GPUDevice& d,
const Tensor& tensor) {
const int n_vals = tensor.dim_size(0);
std::vector<int32_t> host_buffer(n_vals);
d.memcpyDeviceToHost(host_buffer.data(), tensor.tensor<int32_t, 1>().data(),
n_vals * sizeof(int32_t));
return std::unordered_set<int32_t>(host_buffer.begin(), host_buffer.end());
}
// This function merges the semantic segmentation and class-agnostic
// instance segmentation to form the panoptic segmentation. In particular,
// the class label of each instance mask is inferred from the majority
// vote of the corresponding pixels in the semantic segmentation. This
// operation was first proposed in the DeeperLab paper and adopted by
// Panoptic-DeepLab.
// - DeeperLab: Single-Shot Image Parser, T-J Yang, et al. arXiv:1902.05093.
// - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020.
// Specialization of MergeSemanticAndInstanceMaps for GPU.
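// Both semantic_maps and instance_maps are int32 tensors of shape
// [batch, height, width]; parsing_maps has the same shape and receives the
// merged panoptic prediction (semantic_label * label_divisor + instance_label
// for "thing" pixels, semantic_label * label_divisor for "stuff" pixels).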
template <>
void MergeSemanticAndInstanceMaps<GPUDevice>::operator()(
const GPUDevice& d, typename TTypes<int32_t, 3>::ConstTensor semantic_maps,
typename TTypes<int32_t, 3>::ConstTensor instance_maps,
const std::unordered_set<int32_t>& thing_ids_set, int label_divisor,
int stuff_area_limit, int void_label,
typename TTypes<int32_t, 3>::Tensor parsing_maps) {
const int num_batches = semantic_maps.dimension(0);
const int height = semantic_maps.dimension(1);
const int width = semantic_maps.dimension(2);
  // Allocate memory on host, which records whether each semantic class is a
  // "thing" class or not.
bool is_thing_per_semantic_id[kMaxNumSemantic];
for (int i = 0; i < kMaxNumSemantic; ++i) {
is_thing_per_semantic_id[i] =
(thing_ids_set.find(i) != thing_ids_set.end());
}
bool* is_thing_per_semantic_id_device =
reinterpret_cast<bool*>(d.allocate_temp(kMaxNumSemantic * sizeof(bool)));
d.memcpyHostToDevice(is_thing_per_semantic_id_device,
is_thing_per_semantic_id,
kMaxNumSemantic * sizeof(bool));
// Allocate scratch memories on device.
bool* is_thing_per_pixel_device =
reinterpret_cast<bool*>(d.allocate_temp(height * width * sizeof(bool)));
int32_t* semantic_count_per_instance_device = reinterpret_cast<int32_t*>(
d.allocate_temp(kMaxNumInstance * kMaxNumSemantic * sizeof(int32_t)));
int32_t* stuff_area_device = reinterpret_cast<int32_t*>(
d.allocate_temp(kMaxNumSemantic * sizeof(int32_t)));
int32_t* labels_per_instance_device = reinterpret_cast<int32_t*>(
d.allocate_temp(kMaxNumInstance * 2 * sizeof(int32_t)));
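  // The scratch buffers above are allocated once and reused for every image in
  // the batch; the per-instance and per-class counters are re-initialized to
  // zero at the start of each batch iteration below.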
GpuLaunchConfig config;
int total_count = 0;
for (int b = 0; b < num_batches; ++b) {
const int batch_offset = b * height * width;
// Initialize memories that hold counters.
total_count = kMaxNumInstance * kMaxNumSemantic;
config = GetGpuLaunchConfig(total_count, d);
SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, 0, semantic_count_per_instance_device);
total_count = kMaxNumSemantic;
config = GetGpuLaunchConfig(total_count, d);
SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, 0, stuff_area_device);
// Step 1: Collect semantic and instance mask stats. Done on GPU.
total_count = height * width;
config = GetGpuLaunchConfig(total_count, d);
CollectPixelStats<<<config.block_count, config.thread_per_block, 0,
d.stream()>>>(
config.virtual_thread_count, semantic_maps.data() + batch_offset,
instance_maps.data() + batch_offset, is_thing_per_semantic_id_device,
is_thing_per_pixel_device, semantic_count_per_instance_device,
stuff_area_device);
    // Step 2: Loop over instances, find the majority "thing" semantic label
    // for each, and re-order instance IDs so that instances of different
    // thing classes may share the same ID value. This step runs on the CPU.
CreateLabelsPerInstance(d, semantic_count_per_instance_device,
labels_per_instance_device);
// Step 3: Create panoptic prediction.
total_count = width * height;
config = GetGpuLaunchConfig(total_count, d);
MergePredictions<<<config.block_count, config.thread_per_block, 0,
d.stream()>>>(
config.virtual_thread_count, semantic_maps.data() + batch_offset,
instance_maps.data() + batch_offset, is_thing_per_pixel_device,
stuff_area_device, labels_per_instance_device, stuff_area_limit,
label_divisor, void_label, parsing_maps.data() + batch_offset);
}
// Free all temp memories.
d.deallocate_temp(is_thing_per_semantic_id_device);
d.deallocate_temp(is_thing_per_pixel_device);
d.deallocate_temp(semantic_count_per_instance_device);
d.deallocate_temp(stuff_area_device);
d.deallocate_temp(labels_per_instance_device);
}
} // namespace functor
} // namespace deeplab2
} // namespace deeplab
} // namespace tensorflow_models
#endif // GOOGLE_CUDA