# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implements a resized feature fuser for stacked decoders in MaX-DeepLab. | |
Reference: | |
MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, | |
CVPR 2021. https://arxiv.org/abs/2012.00759 | |
Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. | |
""" | |

import tensorflow as tf

from deeplab2.model import utils
from deeplab2.model.layers import activations
from deeplab2.model.layers import convolutions


class ResizedFuse(tf.keras.layers.Layer):
  """Fuses features by resizing and 1x1 convolutions.

  This layer fuses all input features to a desired shape by projecting each
  feature to the desired number of channels, bilinearly resizing the output
  (either upsampling or downsampling), and finally adding the outputs. If the
  number of input channels already equals the desired number of output
  channels, the 1x1 convolutional projection is skipped. If the projection and
  the bilinear resizing can be fused into a stride-2 convolution, we use this
  faster implementation. Other strides are also supported with bilinear
  resizing, but are likely slower than strided convolutions.

  Reference:
    MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
    CVPR 2021. https://arxiv.org/abs/2012.00759
    Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  def __init__(self,
               name,
               height,
               width,
               num_channels,
               activation='relu',
               bn_layer=tf.keras.layers.BatchNormalization,
               conv_kernel_weight_decay=0.0):
    """Initializes a ResizedFuse layer.

    Args:
      name: A string, the name of this layer.
      height: An integer, the desired height of the output.
      width: An integer, the desired width of the output.
      num_channels: An integer, the number of output channels.
      activation: A string, the type of activation function to apply.
      bn_layer: A tf.keras.layers.Layer that computes the normalization
        (default: tf.keras.layers.BatchNormalization).
      conv_kernel_weight_decay: A float, the weight decay for convolution
        kernels.
    """
    super(ResizedFuse, self).__init__(name=name)
    self._height = height
    self._width = width
    self._num_channels = num_channels
    self._activation_fn = activations.get_activation(activation)
    self._bn_layer = bn_layer
    self._conv_kernel_weight_decay = conv_kernel_weight_decay

  def build(self, input_shapes):
    """Builds projection layers for the inputs that require them.

    Args:
      input_shapes: A list of [batch, height, width, channels] shapes, one for
        each input feature to be fused.
    """
    for index, feature_shape in enumerate(input_shapes):
      _, feature_height, feature_width, feature_channels = feature_shape
      if feature_channels == self._num_channels:
        continue
      elif ((feature_height + 1) // 2 == self._height and
            (feature_width + 1) // 2 == self._width):
        # Use a stride-2 convolution to accelerate the operation when it
        # generates the desired spatial shape. Otherwise, the more general 1x1
        # convolution and bilinear resizing are applied.
        # In a stacked decoder, we follow relu-conv-bn because we perform the
        # feature summation before relu and after bn (following the ResNet
        # bottleneck design). This ordering makes the layer easier to
        # implement. Besides, it avoids using many 1x1 convolutions when an
        # input already has the correct shape.
        current_name = '_strided_conv_bn{}'.format(index + 1)
        utils.safe_setattr(
            self, current_name, convolutions.Conv2DSame(
                self._num_channels, 1, current_name[1:],
                strides=2,
                use_bias=False,
                use_bn=True,
                bn_layer=self._bn_layer,
                activation='none',
                conv_kernel_weight_decay=self._conv_kernel_weight_decay))
      else:
        # If the input channels do not match the output channels, and the
        # operation cannot be accelerated by a stride-2 convolution, we perform
        # a more flexible operation: first project the feature to the desired
        # number of channels, then bilinearly resize the output to the desired
        # spatial resolution.
        current_name = '_resized_conv_bn{}'.format(index + 1)
        utils.safe_setattr(
            self, current_name, convolutions.Conv2DSame(
                self._num_channels, 1, current_name[1:],
                use_bias=False,
                use_bn=True,
                bn_layer=self._bn_layer,
                activation='none',
                conv_kernel_weight_decay=self._conv_kernel_weight_decay))

  def call(self, inputs, training=False):
    """Performs a forward pass.

    Args:
      inputs: A list of input [batch, input_height, input_width,
        input_channels] tensors to fuse, where each input tensor may have a
        different spatial resolution and number of channels.
      training: A boolean, whether the model is in training mode.

    Returns:
      output: A fused feature [batch, height, width, num_channels] tensor.
    """
    output_features = []
    for index, feature in enumerate(inputs):
      _, feature_height, feature_width, feature_channels = (
          feature.get_shape().as_list())
      if feature_channels == self._num_channels:
        # Only resize the input feature when its number of channels already
        # equals the output. We do not use a 1x1 convolution in this case
        # because the previous operation and the next operation are usually
        # also 1x1 convolutions. Besides, in a stacked decoder, a feature can
        # be reused many times, so skipping these 1x1 convolutions saves
        # parameters.
        output_features.append(utils.resize_bilinear(
            feature, [self._height, self._width],
            align_corners=True))
      elif ((feature_height + 1) // 2 == self._height and
            (feature_width + 1) // 2 == self._width):
        current_name = '_strided_conv_bn{}'.format(index + 1)
        feature = self._activation_fn(feature)
        feature = getattr(self, current_name)(feature, training=training)
        output_features.append(feature)
      else:
        current_name = '_resized_conv_bn{}'.format(index + 1)
        feature = self._activation_fn(feature)
        feature = getattr(self, current_name)(feature, training=training)
        output_features.append(utils.resize_bilinear(
            feature, [self._height, self._width],
            align_corners=True))
      # Set the spatial shape of each output feature if possible.
      output_features[-1].set_shape(
          [None,
           self._height,
           self._width,
           self._num_channels])
    output = tf.add_n(output_features)
    return self._activation_fn(output)
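

# The demo below is not part of the original module; it is a minimal usage
# sketch, assuming deeplab2 (and its TensorFlow dependency) is importable. It
# exercises the three fusion branches of ResizedFuse with random tensors; the
# layer name and feature shapes are hypothetical examples.
if __name__ == '__main__':
  fuse = ResizedFuse(
      name='resized_fuse_example',
      height=33,
      width=33,
      num_channels=256)
  features = [
      # Channels already match num_channels: bilinear resize only.
      tf.random.uniform([2, 33, 33, 256]),
      # (65 + 1) // 2 == 33, so the fused stride-2 convolution path is used.
      tf.random.uniform([2, 65, 65, 512]),
      # General path: 1x1 convolution, then bilinear upsampling to 33x33.
      tf.random.uniform([2, 17, 17, 128]),
  ]
  fused = fuse(features, training=False)
  print(fused.shape)  # Expected: (2, 33, 33, 256).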