Comparative-Analysis-of-Speech-Synthesis-Models
/
TensorFlowTTS
/tensorflow_tts
/utils
/group_conv.py
# -*- coding: utf-8 -*- | |
# This code is copied from https://github.com/tensorflow/tensorflow/pull/36773.
"""Group Convolution Modules.""" | |
from tensorflow.python.framework import tensor_shape | |
from tensorflow.python.keras import activations, constraints, initializers, regularizers | |
from tensorflow.python.keras.engine.base_layer import Layer | |
from tensorflow.python.keras.engine.input_spec import InputSpec | |
from tensorflow.python.keras.layers import Conv1D, SeparableConv1D | |
from tensorflow.python.keras.utils import conv_utils | |
from tensorflow.python.ops import array_ops, nn, nn_ops | |
class Convolution(object):
    """Callable helper that prepares a convolution op for fixed static shapes.

    The constructor checks that the static input and filter shapes agree
    (same rank, input channels divisible by the filter's input-channel
    dimension) and builds the underlying op once. Calling the instance
    applies that op to an input/filter pair.

    Note that this class assumes that the shapes of the input and filter
    passed to `__call__` are compatible with the `input_shape` and
    `filter_shape` passed to the constructor.

    Arguments:
      input_shape: static shape of input. i.e. input.get_shape().
      filter_shape: static shape of the filter. i.e. filter.get_shape().
      padding: see convolution.
      strides: see convolution.
      dilation_rate: see convolution.
      name: see convolution.
      data_format: see convolution.

    Raises:
      ValueError: if neither rank is known, if either shape does not have
        the expected rank, or if the number of input channels is not
        divisible by the corresponding filter dimension.
    """

    def __init__(
        self,
        input_shape,
        filter_shape,
        padding,
        strides=None,
        dilation_rate=None,
        name=None,
        data_format=None,
    ):
        """Validate the static shapes and build the convolution op."""
        # The filter's rank is authoritative; fall back to the input's rank.
        rank = filter_shape.ndims
        if rank is None:
            rank = input_shape.ndims
        if rank is None:
            raise ValueError("rank of input or filter must be known")

        # Every dimension except batch and channels is spatial.
        num_spatial_dims = rank - 2

        try:
            input_shape.with_rank(rank)
        except ValueError:
            raise ValueError("input tensor must have rank %d" % rank)

        try:
            filter_shape.with_rank(rank)
        except ValueError:
            raise ValueError("filter tensor must have rank %d" % rank)

        # Data formats beginning with "NC" keep channels right after batch.
        channels_first = data_format is not None and data_format.startswith("NC")
        if channels_first:
            channel_index = 1
            spatial_dims = range(2, num_spatial_dims + 2)
        else:
            channel_index = num_spatial_dims + 1
            spatial_dims = range(1, num_spatial_dims + 1)
        input_channels_dim = tensor_shape.dimension_at_index(
            input_shape, channel_index
        )

        # The filter's second-to-last dimension is its input-channel count;
        # for grouped convolutions it only has to divide the input channels.
        filter_dim = tensor_shape.dimension_at_index(filter_shape, num_spatial_dims)
        remainder = input_channels_dim % filter_dim
        if not remainder.is_compatible_with(0):
            raise ValueError(
                "number of input channels is not divisible by corresponding "
                "dimension of filter, {} % {} != 0".format(
                    input_channels_dim, filter_dim
                )
            )

        strides, dilation_rate = nn_ops._get_strides_and_dilation_rate(
            num_spatial_dims, strides, dilation_rate
        )

        # These attributes are read by `_build_op`, so they must be set
        # before the op wrapper below is constructed.
        self.name = name
        self.padding = padding
        self.strides = strides
        self.dilation_rate = dilation_rate
        self.data_format = data_format
        self.input_shape = input_shape
        self.filter_shape = filter_shape

        self.conv_op = nn_ops._WithSpaceToBatch(
            input_shape,
            dilation_rate=dilation_rate,
            padding=padding,
            build_op=self._build_op,
            filter_shape=filter_shape,
            spatial_dims=spatial_dims,
            data_format=data_format,
        )

    def _build_op(self, _, padding):
        """Return the non-dilated convolution op for the given `padding`.

        The first positional argument supplied by the caller is ignored.
        """
        return nn_ops._NonAtrousConvolution(
            self.input_shape,
            filter_shape=self.filter_shape,
            padding=padding,
            data_format=self.data_format,
            strides=self.strides,
            name=self.name,
        )

    def __call__(self, inp, filter):
        """Apply the prepared convolution op to `inp` using `filter`."""
        return self.conv_op(inp, filter)
class Conv(Layer):
    """Abstract N-D convolution layer (private, used as implementation base).

    This layer creates a convolution kernel that is convolved
    (actually cross-correlated) with the layer input to produce a tensor of
    outputs. If `use_bias` is True, a bias vector is created and added to the
    outputs. Finally, if `activation` is not `None`, it is applied to the
    outputs as well.

    Note: layer attributes cannot be modified after the layer has been called
    once (except the `trainable` attribute).

    Arguments:
      rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
      filters: Integer, the dimensionality of the output space (i.e. the number
        of filters in the convolution).
      kernel_size: An integer or tuple/list of n integers, specifying the
        length of the convolution window.
      strides: An integer or tuple/list of n integers,
        specifying the stride length of the convolution.
        Specifying any stride value != 1 is incompatible with specifying
        any `dilation_rate` value != 1.
      padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive).
        `"causal"` is only accepted for rank-1 (1D) layers.
      data_format: A string, one of `channels_last` (default) or `channels_first`.
        The ordering of the dimensions in the inputs.
        `channels_last` corresponds to inputs with shape
        `(batch_size, ..., channels)` while `channels_first` corresponds to
        inputs with shape `(batch_size, channels, ...)`.
      dilation_rate: An integer or tuple/list of n integers, specifying
        the dilation rate to use for dilated convolution.
        Currently, specifying any `dilation_rate` value != 1 is
        incompatible with specifying any `strides` value != 1.
      groups: Integer, the number of channel groups controlling the connections
        between inputs and outputs. Input channels and `filters` must both be
        divisible by `groups`. A falsy value (`None`, `0`) is treated as 1.
        For example,
          - At `groups=1`, all inputs are convolved to all outputs.
          - At `groups=2`, the operation becomes equivalent to having two
            convolutional layers side by side, each seeing half the input
            channels, and producing half the output channels, and both
            subsequently concatenated.
          - At `groups=input_channels`, each input channel is convolved with its
            own set of filters, of size `filters / input_channels`.
      activation: Activation function to use.
        If you don't specify anything, no activation is applied.
      use_bias: Boolean, whether the layer uses a bias.
      kernel_initializer: An initializer for the convolution kernel.
      bias_initializer: An initializer for the bias vector. If None, the default
        initializer will be used.
      kernel_regularizer: Optional regularizer for the convolution kernel.
      bias_regularizer: Optional regularizer for the bias vector.
      activity_regularizer: Optional regularizer function for the output.
      kernel_constraint: Optional projection function to be applied to the
        kernel after being updated by an `Optimizer` (e.g. used to implement
        norm constraints or value constraints for layer weights). The function
        must take as input the unprojected variable and must return the
        projected variable (which must have the same shape). Constraints are
        not safe to use when doing asynchronous distributed training.
      bias_constraint: Optional projection function to be applied to the
        bias after being updated by an `Optimizer`.
      trainable: Boolean, if `True` the weights of this layer will be marked as
        trainable (and listed in `layer.trainable_weights`).
      name: A string, the name of the layer.

    Raises:
      ValueError: if `filters` is not divisible by `groups`, if `kernel_size`
        contains a zero, or if `"causal"` padding is requested for a layer
        that is not one-dimensional.
    """

    def __init__(
        self,
        rank,
        filters,
        kernel_size,
        strides=1,
        padding="valid",
        data_format=None,
        dilation_rate=1,
        groups=1,
        activation=None,
        use_bias=True,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        kernel_constraint=None,
        bias_constraint=None,
        trainable=True,
        name=None,
        **kwargs
    ):
        super(Conv, self).__init__(
            trainable=trainable,
            name=name,
            activity_regularizer=regularizers.get(activity_regularizer),
            **kwargs
        )
        self.rank = rank
        if filters is not None and not isinstance(filters, int):
            filters = int(filters)
        self.filters = filters
        self.groups = groups or 1
        if filters is not None and filters % self.groups != 0:
            raise ValueError(
                "The number of filters must be evenly divisible by the number of "
                "groups. Received: groups={}, filters={}".format(groups, filters)
            )
        self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank, "kernel_size")
        if not all(self.kernel_size):
            raise ValueError(
                "The argument `kernel_size` cannot contain 0(s). "
                "Received: %s" % (kernel_size,)
            )
        self.strides = conv_utils.normalize_tuple(strides, rank, "strides")
        self.padding = conv_utils.normalize_padding(padding)
        # Causal padding only makes sense along a single (temporal) axis.
        # Checking the rank, rather than only the Keras `Conv1D` classes, lets
        # rank-1 subclasses of this base (e.g. `GroupConv1D`) use it as well.
        if (
            self.padding == "causal"
            and self.rank != 1
            and not isinstance(self, (Conv1D, SeparableConv1D))
        ):
            raise ValueError(
                "Causal padding is only supported for 1D convolution layers "
                "(e.g. `Conv1D`, `SeparableConv1D`, `GroupConv1D`)."
            )
        self.data_format = conv_utils.normalize_data_format(data_format)
        self.dilation_rate = conv_utils.normalize_tuple(
            dilation_rate, rank, "dilation_rate"
        )
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(ndim=self.rank + 2)

    def build(self, input_shape):
        """Create the kernel/bias weights and the convolution op.

        Raises:
          ValueError: if the channel dimension is undefined or is not
            divisible by `groups`.
        """
        input_shape = tensor_shape.TensorShape(input_shape)
        input_channel = self._get_input_channel(input_shape)
        if input_channel % self.groups != 0:
            raise ValueError(
                "The number of input channels must be evenly divisible by the number "
                "of groups. Received groups={}, but the input has {} channels "
                "(full input shape is {}).".format(
                    self.groups, input_channel, input_shape
                )
            )
        # Each filter only spans the input channels of its own group.
        kernel_shape = self.kernel_size + (input_channel // self.groups, self.filters)

        self.kernel = self.add_weight(
            name="kernel",
            shape=kernel_shape,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
            trainable=True,
            dtype=self.dtype,
        )
        if self.use_bias:
            self.bias = self.add_weight(
                name="bias",
                shape=(self.filters,),
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
                trainable=True,
                dtype=self.dtype,
            )
        else:
            self.bias = None
        channel_axis = self._get_channel_axis()
        self.input_spec = InputSpec(
            ndim=self.rank + 2, axes={channel_axis: input_channel}
        )

        # Remember the build-time shape so `call` can detect a changed input
        # shape and rebuild the op (see `_recreate_conv_op`).
        self._build_conv_op_input_shape = input_shape
        self._build_input_channel = input_channel
        self._padding_op = self._get_padding_op()
        self._conv_op_data_format = conv_utils.convert_data_format(
            self.data_format, self.rank + 2
        )
        self._convolution_op = Convolution(
            input_shape,
            filter_shape=self.kernel.shape,
            dilation_rate=self.dilation_rate,
            strides=self.strides,
            padding=self._padding_op,
            data_format=self._conv_op_data_format,
        )
        self.built = True

    def call(self, inputs):
        """Convolve `inputs` with the kernel, then add bias and activation."""
        if self._recreate_conv_op(inputs):
            self._convolution_op = Convolution(
                inputs.get_shape(),
                filter_shape=self.kernel.shape,
                dilation_rate=self.dilation_rate,
                strides=self.strides,
                padding=self._padding_op,
                data_format=self._conv_op_data_format,
            )
            self._build_conv_op_input_shape = inputs.get_shape()

        # The op itself runs with "VALID" padding in causal mode (see
        # `_get_padding_op`), so the left padding is applied here. `__init__`
        # guarantees that causal padding is only configured for 1D layers.
        if self.padding == "causal":
            inputs = array_ops.pad(inputs, self._compute_causal_padding())

        outputs = self._convolution_op(inputs, self.kernel)

        if self.use_bias:
            if self.data_format == "channels_first":
                if self.rank == 1:
                    # nn.bias_add does not accept a 1D input tensor.
                    bias = array_ops.reshape(self.bias, (1, self.filters, 1))
                    outputs += bias
                else:
                    outputs = nn.bias_add(outputs, self.bias, data_format="NCHW")
            else:
                outputs = nn.bias_add(outputs, self.bias, data_format="NHWC")

        if self.activation is not None:
            return self.activation(outputs)
        return outputs

    def compute_output_shape(self, input_shape):
        """Return the output `TensorShape` for the given input shape."""
        input_shape = tensor_shape.TensorShape(input_shape).as_list()
        channels_last = self.data_format == "channels_last"
        # Spatial dims sit between batch and channels (channels_last) or
        # after batch and channels (channels_first).
        space = input_shape[1:-1] if channels_last else input_shape[2:]
        new_space = [
            conv_utils.conv_output_length(
                length,
                self.kernel_size[i],
                padding=self.padding,
                stride=self.strides[i],
                dilation=self.dilation_rate[i],
            )
            for i, length in enumerate(space)
        ]
        if channels_last:
            return tensor_shape.TensorShape(
                [input_shape[0]] + new_space + [self.filters]
            )
        return tensor_shape.TensorShape([input_shape[0], self.filters] + new_space)

    def get_config(self):
        """Return the layer configuration as a serializable dict.

        `rank` is not part of the returned configuration.
        """
        config = {
            "filters": self.filters,
            "kernel_size": self.kernel_size,
            "strides": self.strides,
            "padding": self.padding,
            "data_format": self.data_format,
            "dilation_rate": self.dilation_rate,
            "groups": self.groups,
            "activation": activations.serialize(self.activation),
            "use_bias": self.use_bias,
            "kernel_initializer": initializers.serialize(self.kernel_initializer),
            "bias_initializer": initializers.serialize(self.bias_initializer),
            "kernel_regularizer": regularizers.serialize(self.kernel_regularizer),
            "bias_regularizer": regularizers.serialize(self.bias_regularizer),
            "activity_regularizer": regularizers.serialize(self.activity_regularizer),
            "kernel_constraint": constraints.serialize(self.kernel_constraint),
            "bias_constraint": constraints.serialize(self.bias_constraint),
        }
        base_config = super(Conv, self).get_config()
        # Entries defined here take precedence over the base layer's.
        return dict(list(base_config.items()) + list(config.items()))

    def _compute_causal_padding(self):
        """Calculates padding for 'causal' option for 1-d conv layers."""
        # Pad only on the left of the temporal axis so an output step never
        # sees later input steps.
        left_pad = self.dilation_rate[0] * (self.kernel_size[0] - 1)
        if self.data_format == "channels_last":
            causal_padding = [[0, 0], [left_pad, 0], [0, 0]]
        else:
            causal_padding = [[0, 0], [0, 0], [left_pad, 0]]
        return causal_padding

    def _get_channel_axis(self):
        """Return the index of the channel axis for the layer's data format."""
        if self.data_format == "channels_first":
            return 1
        return -1

    def _get_input_channel(self, input_shape):
        """Return the static channel count of `input_shape` as an int.

        Raises:
          ValueError: if the channel dimension is undefined.
        """
        channel_axis = self._get_channel_axis()
        if input_shape.dims[channel_axis].value is None:
            raise ValueError(
                "The channel dimension of the inputs "
                "should be defined. Found `None`."
            )
        return int(input_shape[channel_axis])

    def _get_padding_op(self):
        """Return the padding argument handed to the convolution op."""
        # Causal padding is applied explicitly in `call`, so the op itself
        # runs without padding.
        if self.padding == "causal":
            op_padding = "valid"
        else:
            op_padding = self.padding
        if not isinstance(op_padding, (list, tuple)):
            op_padding = op_padding.upper()
        return op_padding

    def _recreate_conv_op(self, inputs):
        """Recreate conv_op if necessary.

        Check if the input_shape in call() is different from that in build().
        For the values that are not None, if they are different, recreate
        the _convolution_op to avoid the stateful behavior.

        Args:
          inputs: The input data to call() method.

        Returns:
          `True` or `False` to indicate whether to recreate the conv_op.
        """
        call_input_shape = inputs.get_shape()
        # The batch axis (0) is deliberately skipped.
        for axis in range(1, len(call_input_shape)):
            if (
                call_input_shape[axis] is not None
                and self._build_conv_op_input_shape[axis] is not None
                and call_input_shape[axis] != self._build_conv_op_input_shape[axis]
            ):
                return True
        return False
class GroupConv1D(Conv):
    """1D grouped convolution layer (e.g. temporal convolution).

    A rank-1 specialisation of `Conv`: the kernel is convolved with the
    layer input over a single spatial (or temporal) dimension. If `use_bias`
    is True, a bias vector is created and added to the outputs. Finally, if
    `activation` is not `None`, it is applied to the outputs as well.

    When using this layer as the first layer in a model, provide an
    `input_shape` argument (tuple of integers or `None`), e.g. `(10, 128)`
    for sequences of 10 vectors of 128 dimensions, or `(None, 128)` for
    variable-length sequences of 128-dimensional vectors.

    Example:

      # 128-channel inputs with 10 timesteps and a batch size of 4, split
      # into 4 channel groups.
      layer = GroupConv1D(32, 3, groups=4, activation='relu')
      # layer.compute_output_shape((4, 10, 128)) reports (4, 8, 32).

    Arguments:
      filters: Integer, the dimensionality of the output space
        (i.e. the number of output filters in the convolution).
      kernel_size: An integer or tuple/list of a single integer,
        specifying the length of the 1D convolution window.
      strides: An integer or tuple/list of a single integer,
        specifying the stride length of the convolution.
        Specifying any stride value != 1 is incompatible with specifying
        any `dilation_rate` value != 1.
      padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive).
        `"causal"` requests causal (dilated) convolutions, e.g. `output[t]`
        does not depend on `input[t+1:]`; the value is forwarded unchanged
        to the base `Conv` class, which decides whether it is accepted.
        See [WaveNet: A Generative Model for Raw Audio, section
          2.1](https://arxiv.org/abs/1609.03499).
      data_format: A string,
        one of `channels_last` (default) or `channels_first`.
      dilation_rate: an integer or tuple/list of a single integer, specifying
        the dilation rate to use for dilated convolution.
        Currently, specifying any `dilation_rate` value != 1 is
        incompatible with specifying any `strides` value != 1.
      groups: Integer, the number of channel groups controlling the connections
        between inputs and outputs. Input channels and `filters` must both be
        divisible by `groups`. For example,
          - At `groups=1`, all inputs are convolved to all outputs.
          - At `groups=2`, the operation becomes equivalent to having two
            convolutional layers side by side, each seeing half the input
            channels, and producing half the output channels, and both
            subsequently concatenated.
          - At `groups=input_channels`, each input channel is convolved with its
            own set of filters, of size `filters / input_channels`.
      activation: Activation function to use.
        If you don't specify anything, no activation is applied (
        see `keras.activations`).
      use_bias: Boolean, whether the layer uses a bias vector.
      kernel_initializer: Initializer for the `kernel` weights matrix (
        see `keras.initializers`).
      bias_initializer: Initializer for the bias vector (
        see `keras.initializers`).
      kernel_regularizer: Regularizer function applied to
        the `kernel` weights matrix (see `keras.regularizers`).
      bias_regularizer: Regularizer function applied to the bias vector (
        see `keras.regularizers`).
      activity_regularizer: Regularizer function applied to
        the output of the layer (its "activation") (
        see `keras.regularizers`).
      kernel_constraint: Constraint function applied to the kernel matrix (
        see `keras.constraints`).
      bias_constraint: Constraint function applied to the bias vector (
        see `keras.constraints`).

    Input shape:
      3D tensor with shape: `(batch_size, steps, input_dim)`

    Output shape:
      3D tensor with shape: `(batch_size, new_steps, filters)`
        `steps` value might have changed due to padding or strides.

    Returns:
      A tensor of rank 3 representing
      `activation(conv1d(inputs, kernel) + bias)`.

    Raises:
      ValueError: propagated from the base `Conv` validation, e.g. when
        `filters` is not divisible by `groups` or `kernel_size` contains 0.
    """

    def __init__(
        self,
        filters,
        kernel_size,
        strides=1,
        padding="valid",
        data_format="channels_last",
        dilation_rate=1,
        groups=1,
        activation=None,
        use_bias=True,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        kernel_constraint=None,
        bias_constraint=None,
        **kwargs
    ):
        # Resolve every Keras identifier (string / config / callable) up
        # front, so that an invalid one fails before the base layer is set up.
        activation_fn = activations.get(activation)
        kernel_init = initializers.get(kernel_initializer)
        bias_init = initializers.get(bias_initializer)
        kernel_reg = regularizers.get(kernel_regularizer)
        bias_reg = regularizers.get(bias_regularizer)
        activity_reg = regularizers.get(activity_regularizer)
        kernel_con = constraints.get(kernel_constraint)
        bias_con = constraints.get(bias_constraint)

        # Rank is pinned to 1; everything else is forwarded to `Conv`.
        super().__init__(
            rank=1,
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            dilation_rate=dilation_rate,
            groups=groups,
            activation=activation_fn,
            use_bias=use_bias,
            kernel_initializer=kernel_init,
            bias_initializer=bias_init,
            kernel_regularizer=kernel_reg,
            bias_regularizer=bias_reg,
            activity_regularizer=activity_reg,
            kernel_constraint=kernel_con,
            bias_constraint=bias_con,
            **kwargs
        )