|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Contains Axial-ResNet model instances for Axial-DeepLab and MaX-DeepLab. |
|
|
|
Reference: |
|
Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, |
|
ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 |
|
Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, |
|
Liang-Chieh Chen. |
|
MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", |
|
CVPR 2021. https://arxiv.org/abs/2012.00759 |
|
Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. |
|
""" |
|
|
|
import abc |
|
import collections.abc |
|
import copy |
|
|
|
from absl import logging |
|
import tensorflow as tf |
|
|
|
from deeplab2.model.encoder import axial_resnet |
|
|
|
|
|
def _get_default_config():
    """Builds the baseline settings shared by every Axial-ResNet variant.

    A brand-new dict (with brand-new nested dicts) is constructed on each
    call, so the caller may modify the result freely.

    Returns:
      A nested dict of default settings. `AxialResNetInstance.__init__`
      merges variant and per-instance overrides into it and forwards the
      result as keyword arguments to the parent constructor.
    """
    # Innermost level: defaults under 'axial_layer_config'.
    axial_layer_defaults = {
        'query_shape': (129, 129),
        'key_expansion': 1,
        'value_expansion': 2,
        'memory_flange': (32, 32),
        'double_global_attention': False,
        'num_heads': 8,
        'use_query_rpe_similarity': True,
        'use_key_rpe_similarity': True,
        'use_content_similarity': True,
        'retrieve_value_rpe': True,
        'retrieve_value_content': True,
        'initialization_std_for_query_key_rpe': 1.0,
        'initialization_std_for_value_rpe': 1.0,
        'self_attention_activation': 'softmax',
    }

    # Innermost level: defaults under 'dual_path_transformer_layer_config'.
    dual_path_transformer_defaults = {
        'num_heads': 8,
        'bottleneck_expansion': 2,
        'key_expansion': 1,
        'value_expansion': 2,
        'feed_forward_network_channels': 2048,
        'use_memory_self_attention': True,
        'use_pixel2memory_feedback_attention': True,
        'transformer_activation': 'softmax',
    }

    # Middle level: defaults under 'block_group_config', which embeds the
    # two layer-level dicts above.
    block_group_defaults = {
        'attention_bottleneck_expansion': 2,
        'drop_path_keep_prob': 0.8,
        'drop_path_beyond_stride': 16,
        'drop_path_schedule': 'constant',
        'positional_encoding_type': None,
        'use_global_beyond_stride': 0,
        'use_sac_beyond_stride': 0,
        'use_squeeze_and_excite': False,
        'conv_use_recompute_grad': False,
        'axial_use_recompute_grad': True,
        'recompute_within_stride': 0,
        'transformer_use_recompute_grad': False,
        'axial_layer_config': axial_layer_defaults,
        'dual_path_transformer_layer_config': dual_path_transformer_defaults,
    }

    # Top level: model-wide defaults.
    return {
        'num_blocks': [3, 4, 6, 3],
        'backbone_layer_multiplier': 1.0,
        'width_multiplier': 1.0,
        'stem_width_multiplier': 1.0,
        'output_stride': 16,
        'classification_mode': False,
        'backbone_type': 'resnet_beta',
        'use_axial_beyond_stride': 16,
        'backbone_use_transformer_beyond_stride': 32,
        'extra_decoder_use_transformer_beyond_stride': 32,
        'backbone_decoder_num_stacks': 0,
        'backbone_decoder_blocks_per_stage': 1,
        'extra_decoder_num_stacks': 0,
        'extra_decoder_blocks_per_stage': 1,
        'max_num_mask_slots': 128,
        'num_mask_slots': 128,
        'memory_channels': 256,
        'base_transformer_expansion': 1.0,
        'global_feed_forward_network_channels': 256,
        'high_resolution_output_stride': 4,
        'activation': 'relu',
        'block_group_config': block_group_defaults,
        'bn_layer': tf.keras.layers.BatchNormalization,
        'conv_kernel_weight_decay': 0.0,
    }
|
|
|
|
|
def override(config_dict, override_dict):
    """Recursively overrides a config dict with another.

    Neither argument is modified: the result starts as a deep copy of
    `config_dict`, and entries of `override_dict` are then applied on top.
    Mapping-valued overrides are merged key by key into the corresponding
    base entry; every other override value replaces the base entry outright
    and is stored by reference (it is not copied).

    Args:
      config_dict: The base mapping to start from.
      override_dict: A mapping whose entries take precedence over
        `config_dict`. May be nested.

    Returns:
      A new object of the same type as `config_dict` holding the merged
      configuration.
    """
    output_dict = copy.deepcopy(config_dict)
    for key, value in override_dict.items():
        if isinstance(value, collections.abc.Mapping):
            base_value = config_dict.get(key)
            if not isinstance(base_value, collections.abc.Mapping):
                # The base entry is missing or is not a mapping (e.g. None or
                # a scalar), so there is nothing to merge into. Recursing on
                # it directly would attempt item assignment on a non-mapping
                # and raise TypeError; start from an empty mapping instead.
                base_value = {}
            output_dict[key] = override(base_value, value)
        else:
            output_dict[key] = value
    return output_dict
|
|
|
|
|
class AxialResNetInstance(axial_resnet.AxialResNet):
    """Common base for the named Axial-ResNet variants in this module.

    A subclass describes itself through `_get_config`; the constructor layers
    that description, and then any keyword arguments, on top of the
    module-level defaults before delegating to the parent constructor.
    """

    @classmethod
    @abc.abstractmethod
    def _get_config(cls):
        """Returns the dict of settings that distinguishes this variant.

        Subclasses are expected to override this method. The base
        implementation has no body and therefore returns None.
        """

    def __init__(self, name, **kwargs):
        """Builds an Axial-ResNet model.

        Args:
          name: Forwarded as the first positional argument to the parent
            constructor.
          **kwargs: Per-instance overrides. They take precedence over both
            the module defaults and the variant's own config.
        """
        # Precedence, lowest to highest: defaults < variant config < kwargs.
        variant_config = self._get_config()
        merged_config = override(_get_default_config(), variant_config)
        final_config = override(merged_config, kwargs)

        logging.info('Axial-ResNet final config: %s', final_config)
        super().__init__(name, **final_config)
|
|
|
|
|
class MaXDeepLabS(AxialResNetInstance):
    """MaX-DeepLab-S for panoptic segmentation.

    This variant contributes no overrides of its own, so it is built from the
    module defaults plus whatever keyword arguments the constructor receives.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns an empty dict, i.e. no variant-specific overrides."""
        variant_overrides = {}
        return variant_overrides
|
|
|
|
|
class MaXDeepLabL(AxialResNetInstance):
    """MaX-DeepLab-L for panoptic segmentation.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the settings in which this variant departs from defaults.

        A new dict is built on every call.
        """
        # Overrides nested under block_group_config -> axial_layer_config.
        axial_layer_overrides = {
            'key_expansion': 2,
            'value_expansion': 4,
        }
        # Overrides nested under block_group_config.
        block_group_overrides = {
            'attention_bottleneck_expansion': 4,
            'drop_path_beyond_stride': 4,
            'axial_layer_config': axial_layer_overrides,
        }
        return {
            'num_blocks': [3, 6, 3, 3],
            'backbone_type': 'wider_resnet',
            'backbone_use_transformer_beyond_stride': 16,
            'extra_decoder_use_transformer_beyond_stride': 16,
            'backbone_decoder_num_stacks': 1,
            'extra_decoder_num_stacks': 1,
            'extra_decoder_blocks_per_stage': 3,
            'memory_channels': 512,
            'base_transformer_expansion': 2.0,
            'global_feed_forward_network_channels': 512,
            'block_group_config': block_group_overrides,
        }
|
|
|
|
|
class MaXDeepLabSBackbone(MaXDeepLabS):
    """MaX-DeepLab-S backbone for image classification pretraining.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the MaX-DeepLab-S config adjusted for pretraining.

        The parent config is taken as-is except that `classification_mode`
        is switched on and `backbone_use_transformer_beyond_stride` is set
        to 0.
        """
        return override(
            super()._get_config(),
            {
                'classification_mode': True,
                'backbone_use_transformer_beyond_stride': 0,
            },
        )
|
|
|
|
|
class MaXDeepLabLBackbone(MaXDeepLabL):
    """MaX-DeepLab-L backbone for image classification pretraining.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the MaX-DeepLab-L config adjusted for pretraining.

        The parent config is taken as-is except that `classification_mode`
        is switched on and `backbone_use_transformer_beyond_stride` is set
        to 0.
        """
        return override(
            super()._get_config(),
            {
                'classification_mode': True,
                'backbone_use_transformer_beyond_stride': 0,
            },
        )
|
|
|
|
|
class ResNet50(AxialResNetInstance):
    """A ResNet-50 instance.

    This implementation departs from the original ResNet-50 in two ways:
    (1) The strided convolution sits in the first 3x3 convolution of the
        first residual block of a stage.
    (2) The strided max pooling layer of the stem is replaced by a strided
        convolution in the residual block that immediately follows.
    """

    @classmethod
    def _get_config(cls):
        """Returns the settings in which this variant departs from defaults.

        A new dict is built on every call.
        """
        # Overrides nested under block_group_config.
        block_group_overrides = {
            'drop_path_keep_prob': 1.0,
        }
        return {
            'classification_mode': True,
            'backbone_type': 'resnet',
            'use_axial_beyond_stride': 0,
            'backbone_use_transformer_beyond_stride': 0,
            'block_group_config': block_group_overrides,
        }
|
|
|
|
|
class ResNet50Beta(ResNet50):
    """A ResNet-50 but with inception stem.

    This implementation departs from the original ResNet-50 in two ways:
    (1) The strided convolution sits in the first 3x3 convolution of the
        first residual block of a stage.
    (2) The strided max pooling layer of the stem is replaced by a strided
        convolution in the residual block that immediately follows.
    """

    @classmethod
    def _get_config(cls):
        """Returns the ResNet50 config with `backbone_type` = 'resnet_beta'."""
        return override(
            super()._get_config(),
            {'backbone_type': 'resnet_beta'},
        )
|
|
|
|
|
class AxialResNetL(ResNet50):
    """Axial-ResNet-L for image classification only.

    Axial-ResNet-L is a ResNet50 with use_axial_beyond_stride = 2.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the ResNet50 config with `use_axial_beyond_stride` = 2."""
        return override(
            super()._get_config(),
            {'use_axial_beyond_stride': 2},
        )
|
|
|
|
|
class AxialResNetS(ResNet50):
    """Axial-ResNet-S for image classification only.

    Axial-ResNet-S is a ResNet50 with use_axial_beyond_stride = 2 and
    width_multiplier = 0.5.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the ResNet50 config, half width, axial beyond stride 2."""
        return override(
            super()._get_config(),
            {
                'width_multiplier': 0.5,
                'use_axial_beyond_stride': 2,
            },
        )
|
|
|
|
|
class AxialDeepLabL(ResNet50Beta):
    """Axial-DeepLab-L for panoptic segmentation.

    Axial-DeepLab-L is a ResNet50Beta with use_axial_beyond_stride = 2.
    Put differently, it is Axial-ResNet-L with an inception stem.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the ResNet50Beta config with axial beyond stride 2."""
        return override(
            super()._get_config(),
            {'use_axial_beyond_stride': 2},
        )
|
|
|
|
|
class AxialDeepLabS(ResNet50Beta):
    """Axial-DeepLab-S for panoptic segmentation.

    Axial-DeepLab-S is a ResNet50Beta with use_axial_beyond_stride = 2 and
    width_multiplier = 0.5. Put differently, it is Axial-ResNet-S with an
    inception stem.

    Reference:
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the ResNet50Beta config, half width, axial beyond stride 2."""
        return override(
            super()._get_config(),
            {
                'width_multiplier': 0.5,
                'use_axial_beyond_stride': 2,
            },
        )
|
|
|
|
|
class SWideRNet(AxialResNetInstance):
    """A SWideRNet instance.

    This implementation departs from the original SWideRNet in three ways:
    (1) The strided convolution is applied in the first residual block of a
        stage rather than in the last one.
    (2) The strided max pooling layer of the stem is replaced by a strided
        convolution in the residual block that immediately follows.
    (3) Squeeze and excitation is (optionally) used in all five stages
        rather than in the last four only.

    Reference:
      Scaling Wide Residual Networks for Panoptic Segmentation,
        https://arxiv.org/abs/2011.11675
          Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao.
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the settings in which this variant departs from defaults.

        A new dict is built on every call.
        """
        # Overrides nested under block_group_config.
        block_group_overrides = {
            'drop_path_beyond_stride': 4,
            'conv_use_recompute_grad': True,
        }
        return {
            'num_blocks': [3, 6, 3, 3],
            'classification_mode': True,
            'backbone_type': 'wider_resnet',
            'use_axial_beyond_stride': 0,
            'backbone_use_transformer_beyond_stride': 0,
            'block_group_config': block_group_overrides,
        }
|
|
|
|
|
class AxialSWideRNet(SWideRNet):
    """SWideRNet with axial attention blocks in the last two stages.

    This implementation departs from the original SWideRNet in three ways:
    (1) The strided convolution is applied in the first residual block of a
        stage rather than in the last one.
    (2) The strided max pooling layer of the stem is replaced by a strided
        convolution in the residual block that immediately follows.
    (3) Squeeze and excitation is (optionally) used in all five stages
        rather than in the last four only.

    Reference:
      Scaling Wide Residual Networks for Panoptic Segmentation,
        https://arxiv.org/abs/2011.11675
          Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao.
      Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
        ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
          Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
          Liang-Chieh Chen.
    """

    @classmethod
    def _get_config(cls):
        """Returns the SWideRNet config with axial-attention overrides.

        On top of the parent config, `use_axial_beyond_stride` becomes 16 and
        the attention bottleneck, key and value expansions are raised.
        """
        # Overrides nested under block_group_config -> axial_layer_config.
        axial_layer_overrides = {
            'key_expansion': 2,
            'value_expansion': 4,
        }
        axial_overrides = {
            'use_axial_beyond_stride': 16,
            'block_group_config': {
                'attention_bottleneck_expansion': 4,
                'axial_layer_config': axial_layer_overrides,
            },
        }
        return override(super()._get_config(), axial_overrides)
|
|
|
|
|
def get_model(name, **kwargs):
    """Gets the model instance given the model name.

    Args:
      name: Model name. It is lower-cased before lookup, so matching is
        case-insensitive; the lower-cased form is also what gets passed to
        the model constructor.
      **kwargs: Forwarded unchanged to the selected model's constructor.

    Returns:
      An instance of the model class registered under `name`.

    Raises:
      ValueError: If the lower-cased name matches no supported model.
    """
    # Lower-cased model name -> model class. 'swidernet' and 'wide_resnet41'
    # are two names for the same class.
    model_classes = {
        'max_deeplab_s': MaXDeepLabS,
        'max_deeplab_l': MaXDeepLabL,
        'max_deeplab_s_backbone': MaXDeepLabSBackbone,
        'max_deeplab_l_backbone': MaXDeepLabLBackbone,
        'resnet50': ResNet50,
        'resnet50_beta': ResNet50Beta,
        'swidernet': SWideRNet,
        'wide_resnet41': SWideRNet,
        'axial_swidernet': AxialSWideRNet,
        'axial_resnet_s': AxialResNetS,
        'axial_resnet_l': AxialResNetL,
        'axial_deeplab_s': AxialDeepLabS,
        'axial_deeplab_l': AxialDeepLabL,
    }

    name_lower = name.lower()
    model_class = model_classes.get(name_lower)
    if model_class is None:
        raise ValueError(name_lower + ' is not supported.')
    return model_class(name_lower, **kwargs)
|
|