|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""MobileNetV3 models for Deep Labeling. |
|
|
|
Reference: |
|
Howard, A., Sandler, M., et al. Searching for mobilenetv3. In ICCV, 2019 |
|
""" |
|
from typing import Any, Callable, Mapping, Optional, Sequence |
|
|
|
import tensorflow as tf |
|
|
|
from deeplab2.model import utils |
|
from deeplab2.model.layers import blocks |
|
from deeplab2.model.layers import convolutions |
|
|
|
|
|
# Number of channels in the input images; used as the `in_filters` of the
# first inverted-bottleneck block built in MobileNet._mobilenet_base.
_INPUT_CHANNELS = 3
|
|
|
|
|
# Block specification of the MobileNetV3-Small variant (see the reference in
# the module docstring). Each tuple in 'block_specs' is matched positionally
# against 'block_spec_schema' by _block_spec_decoder. 'is_endpoint' marks the
# last block of a stage, whose output is exported as an endpoint
# ('res1' ... 'res5') by the MobileNet model below.
MNV3Small_BLOCK_SPECS = {
    'spec_name': 'MobileNetV3Small',
    # Field names, in the order the per-block tuples below are read.
    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
                          'activation', 'se_ratio', 'expand_ratio',
                          'is_endpoint'],
    # (block_fn, kernel_size, strides, filters,
    #  activation, se_ratio, expand_ratio, is_endpoint)
    'block_specs': [
        ('conv_bn', 3, 2, 16,
         'hard_swish', None, None, True),
        ('inverted_bottleneck', 3, 2, 16,
         'relu', 0.25, 1, True),
        ('inverted_bottleneck', 3, 2, 24,
         'relu', None, 72. / 16, False),
        ('inverted_bottleneck', 3, 1, 24,
         'relu', None, 88. / 24, True),
        ('inverted_bottleneck', 5, 2, 40,
         'hard_swish', 0.25, 4., False),
        ('inverted_bottleneck', 5, 1, 40,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 40,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 48,
         'hard_swish', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 48,
         'hard_swish', 0.25, 3., True),
        ('inverted_bottleneck', 5, 2, 96,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 96,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 96,
         'hard_swish', 0.25, 6., False),
        ('conv_bn', 1, 1, 576,
         'hard_swish', None, None, True),
    ]
}
|
|
|
|
|
# Block specification of the MobileNetV3-Large variant (see the reference in
# the module docstring). Same schema and 'is_endpoint' semantics as
# MNV3Small_BLOCK_SPECS: tuples are matched positionally against
# 'block_spec_schema' by _block_spec_decoder.
MNV3Large_BLOCK_SPECS = {
    'spec_name': 'MobileNetV3Large',
    # Field names, in the order the per-block tuples below are read.
    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
                          'activation', 'se_ratio', 'expand_ratio',
                          'is_endpoint'],
    # (block_fn, kernel_size, strides, filters,
    #  activation, se_ratio, expand_ratio, is_endpoint)
    'block_specs': [
        ('conv_bn', 3, 2, 16,
         'hard_swish', None, None, False),
        ('inverted_bottleneck', 3, 1, 16,
         'relu', None, 1., True),
        ('inverted_bottleneck', 3, 2, 24,
         'relu', None, 4., False),
        ('inverted_bottleneck', 3, 1, 24,
         'relu', None, 3., True),
        ('inverted_bottleneck', 5, 2, 40,
         'relu', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 40,
         'relu', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 40,
         'relu', 0.25, 3., True),
        ('inverted_bottleneck', 3, 2, 80,
         'hard_swish', None, 6., False),
        ('inverted_bottleneck', 3, 1, 80,
         'hard_swish', None, 2.5, False),
        ('inverted_bottleneck', 3, 1, 80,
         'hard_swish', None, 2.3, False),
        ('inverted_bottleneck', 3, 1, 80,
         'hard_swish', None, 2.3, False),
        ('inverted_bottleneck', 3, 1, 112,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 3, 1, 112,
         'hard_swish', 0.25, 6., True),
        ('inverted_bottleneck', 5, 2, 160,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 160,
         'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 160,
         'hard_swish', 0.25, 6., False),
        ('conv_bn', 1, 1, 960,
         'hard_swish', None, None, True),
    ]
}
|
|
|
|
|
# Maps a supported `model_id` string (as accepted by MobileNet.__init__) to
# its block specification dict.
SUPPORTED_SPECS_MAP = {
    'MobileNetV3Large': MNV3Large_BLOCK_SPECS,
    'MobileNetV3Small': MNV3Small_BLOCK_SPECS,
}
|
|
|
|
|
|
|
def _block_spec_decoder(specs: Mapping[Any, Any],
                        width_multiplier: float,
                        divisible_by: int = 8,
                        min_width: int = 8) -> Sequence[Mapping[str, Any]]:
  """Decodes specs for a block.

  Args:
    specs: A `dict` specification of block specs of a mobilenet version.
    width_multiplier: A `float` multiplier for the filter size for all
      convolution ops. The value must be greater than zero. Typical usage will
      be to set this value in (0, 1) to reduce the number of parameters or
      computation cost of the model.
    divisible_by: An `int` that ensures all inner dimensions are divisible by
      this number.
    min_width: An `int` lower bound for the scaled filter sizes. Defaults to
      8, which was previously hard-coded; exposed as a parameter so callers
      can keep it consistent with the model-level `min_width` setting.

  Returns:
    A list of block spec in dictionary that defines structure of the layers.

  Raises:
    ValueError: If the block specs are empty, or if any block spec does not
      match the schema.
  """
  spec_name = specs['spec_name']
  block_spec_schema = specs['block_spec_schema']
  block_specs = specs['block_specs']

  if not block_specs:
    raise ValueError(
        'The block spec cannot be empty for {} !'.format(spec_name))

  decoded_specs = []
  for spec in block_specs:
    # Validate every spec (not just the first) so that a malformed row fails
    # loudly instead of being silently truncated by zip().
    if len(spec) != len(block_spec_schema):
      raise ValueError('The block spec values {} do not match with '
                       'the schema {}'.format(spec, block_spec_schema))
    spec_dict = dict(zip(block_spec_schema, spec))
    # Scale the number of filters and round it to a hardware-friendly
    # multiple of `divisible_by`, never dropping below `min_width`.
    spec_dict['filters'] = utils.make_divisible(
        value=spec_dict['filters'] * width_multiplier,
        divisor=divisible_by,
        min_value=min_width)
    decoded_specs.append(spec_dict)

  return decoded_specs
|
|
|
|
|
|
|
class MobileNet(tf.keras.Model):
  """Creates a MobileNetV3 family model."""

  def __init__(
      self,
      model_id: str = 'MobileNetV3Small',
      width_multiplier: float = 1.0,
      output_stride: Optional[int] = None,
      min_width: int = 8,
      divisible_by: int = 8,
      regularize_depthwise: bool = False,
      bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
      conv_kernel_weight_decay: float = 0.0,
      name: str = 'MobilenNetV3'):
    """Initializes a MobileNet V3 model.

    Args:
      model_id: A `str` of MobileNet version. The supported values are
        `MobileNetV3Large`, `MobileNetV3Small`.
      width_multiplier: A `float` of multiplier for the filters (number of
        channels) for all convolution ops. The value must be greater than
        zero. Typical usage will be to set this value in (0, 1) to reduce the
        number of parameters or computation cost of the model.
      output_stride: An `int` that specifies the requested ratio of input to
        output spatial resolution. If not None, then we invoke atrous
        convolution if necessary to prevent the network from reducing the
        spatial resolution of activation maps. The output_stride should be
        divisible by 4.
      min_width: An `int` of minimum width (number of channels) for all
        convolution ops. Enforced when width_multiplier < 1, and not an
        active constraint when width_multiplier >= 1.
      divisible_by: An `int` that ensures all intermediate feature dimensions
        are divisible by this number.
      regularize_depthwise: If True, apply regularization on depthwise conv.
      bn_layer: An optional tf.keras.layers.Layer that computes the
        normalization (default: tf.keras.layers.BatchNormalization).
      conv_kernel_weight_decay: A float, the weight decay for convolution
        kernels.
      name: Model name.

    Raises:
      ValueError: The MobileNet version is not supported.
      ValueError: width_multiplier is not greater than zero.
      ValueError: Output stride must be None or a multiple of 4.
      ValueError: Unknown block type i for layer j.
    """
    if model_id not in SUPPORTED_SPECS_MAP:
      raise ValueError('The MobileNet version {} '
                       'is not supported'.format(model_id))

    if width_multiplier <= 0:
      raise ValueError('width_multiplier is not greater than zero.')

    if (output_stride is not None and
        (output_stride <= 1 or (output_stride > 1 and output_stride % 4))):
      raise ValueError('Output stride must be None or a multiple of 4.')

    super().__init__(name=name)

    self._model_id = model_id
    self._width_multiplier = width_multiplier
    # NOTE(review): `min_width` is stored here but _block_spec_decoder is not
    # given it below; confirm whether it should be forwarded to the decoder.
    self._min_width = min_width
    self._output_stride = output_stride
    self._divisible_by = divisible_by
    self._regularize_depthwise = regularize_depthwise
    self._bn_layer = bn_layer
    self._conv_kernel_weight_decay = conv_kernel_weight_decay
    # Bug fix: _mobilenet_base() reads this flag to decide whether to stop
    # building layers after the `res5` endpoint, but it was never defined,
    # which would raise AttributeError for any spec with more than five
    # endpoints. This class is used as a dense-prediction backbone (its
    # `call` returns endpoints rather than logits), so default to False.
    self._classification_mode = False
    self._blocks = []
    self._endpoint_names = []

    block_specs = SUPPORTED_SPECS_MAP.get(model_id)
    self._decoded_specs = _block_spec_decoder(
        specs=block_specs,
        width_multiplier=self._width_multiplier,
        divisible_by=self._divisible_by)

    self._mobilenet_base()

  def _mobilenet_base(self):
    """Builds the base MobileNet architecture from the decoded specs."""
    # Accumulated spatial downsampling applied so far; once it reaches the
    # requested output stride, further striding is converted to atrous rates.
    current_stride = 1
    # The atrous (dilation) rate to apply once striding has been disabled.
    rate = 1

    endpoint_level = 1
    in_filters = _INPUT_CHANNELS
    for i, block_def in enumerate(self._decoded_specs):
      # Backbone mode only needs the endpoints up to `res5`.
      if endpoint_level > 5 and not self._classification_mode:
        break

      block_name = '{}_{}'.format(block_def['block_fn'], i + 1)

      if (self._output_stride is not None and
          current_stride == self._output_stride):
        # The requested output stride has been reached: keep this layer at
        # stride 1 and fold its nominal stride into the atrous rate instead
        # of further reducing the spatial resolution.
        layer_stride = 1
        layer_rate = rate
        rate = (
            rate * block_def['strides']
            if block_def['strides'] is not None else rate)
      else:
        layer_stride = block_def['strides']
        layer_rate = 1
        current_stride = (
            current_stride * block_def['strides']
            if block_def['strides'] is not None else current_stride)

      if block_def['block_fn'] == 'conv_bn':
        self._blocks.append(
            convolutions.Conv2DSame(
                output_channels=block_def['filters'],
                kernel_size=block_def['kernel_size'],
                strides=layer_stride,
                atrous_rate=layer_rate,
                activation=block_def['activation'],
                use_bias=False,
                bn_layer=self._bn_layer,
                use_bn=True,
                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
                name=block_name,
            ))

      elif block_def['block_fn'] == 'inverted_bottleneck':
        # An atrous rate is only meaningful for kernels larger than 1x1.
        atrous_rate = 1
        if layer_rate > 1 and block_def['kernel_size'] != 1:
          atrous_rate = layer_rate
        self._blocks.append(
            blocks.InvertedBottleneckBlock(
                in_filters=in_filters,
                out_filters=block_def['filters'],
                expand_ratio=block_def['expand_ratio'],
                strides=layer_stride,
                kernel_size=block_def['kernel_size'],
                se_ratio=block_def['se_ratio'],
                activation=block_def['activation'],
                expand_se_in_filters=True,
                depthwise_activation=None,
                atrous_rate=atrous_rate,
                divisible_by=self._divisible_by,
                regularize_depthwise=self._regularize_depthwise,
                use_depthwise=True,
                # Only stride-1 blocks keep a residual connection; note the
                # spec stride is used here, not the (possibly atrous-
                # converted) layer_stride.
                use_residual=(block_def['strides'] == 1),
                bn_layer=self._bn_layer,
                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
                name=block_name,
            ))

      else:
        raise ValueError('Unknown block type {} for layer {}'.format(
            block_def['block_fn'], i))

      in_filters = block_def['filters']

      if block_def['is_endpoint']:
        # Endpoints follow the `resN` naming convention ('res1' ... 'res5').
        self._endpoint_names.append('res' + str(endpoint_level))
        endpoint_level += 1
      else:
        self._endpoint_names.append(None)

  def call(self, input_tensor: tf.Tensor, training: bool = False):
    """Performs a forward pass through MobileNet.

    Args:
      input_tensor: The input tf.Tensor; assumed to have 3 channels
        (_INPUT_CHANNELS) in the last dimension.
      training: A bool, whether the model is in training mode.

    Returns:
      A dict mapping endpoint names ('res1' ... 'res5') to the corresponding
      intermediate feature tensors.
    """
    net = input_tensor
    endpoints = {}
    for block, endpoint_name in zip(self._blocks, self._endpoint_names):
      net = block(net, training=training)
      if endpoint_name is not None:
        endpoints[endpoint_name] = net
    return endpoints
|
|
|
|
|
def MobileNetV3Small(
    width_multiplier: float = 1.0,
    output_stride: int = 32,
    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
    conv_kernel_weight_decay: float = 0.0,
    name: str = 'MobileNetV3Small') -> tf.keras.Model:
  """Creates a MobileNetV3Small model.

  Args:
    width_multiplier: A float, depth_multiplier for the whole model.
    output_stride: An optional integer specifying the output stride of the
      network.
    bn_layer: An optional tf.keras.layers.Layer that computes the
      normalization (default: tf.keras.layers.BatchNormalization).
    conv_kernel_weight_decay: A float, the weight decay for convolution
      kernels.
    name: Model name.

  Returns:
    The MobileNetV3Small model as an instance of tf.keras.Model.
  """
  # Thin convenience wrapper: all the architecture logic lives in MobileNet.
  return MobileNet(
      model_id='MobileNetV3Small',
      width_multiplier=width_multiplier,
      output_stride=output_stride,
      bn_layer=bn_layer,
      conv_kernel_weight_decay=conv_kernel_weight_decay,
      name=name)
|
|
|
|
|
def MobileNetV3Large(
    width_multiplier: float = 1.0,
    output_stride: int = 32,
    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
    conv_kernel_weight_decay: float = 0.0,
    name: str = 'MobileNetV3Large') -> tf.keras.Model:
  """Creates a MobileNetV3Large model.

  Args:
    width_multiplier: A float, depth_multiplier for the STEM.
    output_stride: An optional integer specifying the output stride of the
      network.
    bn_layer: An optional tf.keras.layers.Layer that computes the
      normalization (default: tf.keras.layers.BatchNormalization).
    conv_kernel_weight_decay: A float, the weight decay for convolution
      kernels.
    name: Model name.

  Returns:
    The MobileNetV3Large model as an instance of tf.keras.Model.
  """
  # Thin convenience wrapper: all the architecture logic lives in MobileNet.
  return MobileNet(
      model_id='MobileNetV3Large',
      width_multiplier=width_multiplier,
      output_stride=output_stride,
      bn_layer=bn_layer,
      conv_kernel_weight_decay=conv_kernel_weight_decay,
      name=name)
|
|