Text-to-Speech

Runtime error

File size: 12,219 Bytes

0d80816

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn

from modules.encoder.position_encoder import PositionEncoder
from modules.general.utils import append_dims, ConvNd, normalization, zero_module
from .attention import AttentionBlock
from .resblock import Downsample, ResBlock, Upsample


class UNet(nn.Module):
    r"""The full UNet model with attention and timestep embedding.

    Args:
        dims: determines if the signal is 1D (temporal), 2D(spatial).
        in_channels: channels in the input Tensor.
        model_channels: base channel count for the model.
        out_channels: channels in the output Tensor.
        h_dim: base ``h_dim`` handed to every ``AttentionBlock``. A block built at
            UNet level ``level`` (0-based) receives ``h_dim // (level + 1)``.
        num_res_blocks: number of residual blocks per downsample.
        channel_mult: channel multiplier for each level of the UNet.
        num_attn_blocks: number of attention blocks at place.
        attention_resolutions: a collection of downsample rates at which attention will
            take place. May be a set, list, or tuple. For example, if this contains 4,
            then at 4x downsampling, attention will be used.
        num_heads: the number of attention heads in each attention layer.
        num_head_channels: if specified, ignore num_heads and instead use a fixed
            channel width per attention head.
        d_context: if specified, use for cross-attention channel project.
        context_hdim: forwarded unchanged to every ``AttentionBlock`` as
            ``encoder_hdim``.
        p_dropout: the dropout probability.
        num_classes: if positive, then this model will be class-conditional
            with ``num_classes`` classes. The default ``-1`` disables it.
        use_extra_film: if specified (``"add"`` or ``"concat"``), use an extra
            FiLM-like conditioning mechanism. Mutually exclusive with ``num_classes``.
        d_emb: if specified, use for FiLM-like conditioning. Required when
            ``use_extra_film`` is set.
        use_scale_shift_norm: use a FiLM-like conditioning mechanism.
        resblock_updown: use residual blocks for up/downsampling.
    """

    def __init__(
        self,
        dims: int = 1,
        in_channels: int = 100,
        model_channels: int = 128,
        out_channels: int = 100,
        h_dim: int = 128,
        num_res_blocks: int = 1,
        channel_mult: tuple = (1, 2, 4),
        num_attn_blocks: int = 1,
        attention_resolutions: tuple = (1, 2, 4),
        num_heads: int = 1,
        num_head_channels: int = -1,
        d_context: int = None,
        context_hdim: int = 128,
        p_dropout: float = 0.0,
        num_classes: int = -1,
        use_extra_film: str = None,
        d_emb: int = None,
        use_scale_shift_norm: bool = True,
        resblock_updown: bool = False,
    ):
        super().__init__()

        self.dims = dims
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.channel_mult = channel_mult
        self.num_attn_blocks = num_attn_blocks
        self.attention_resolutions = attention_resolutions
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.d_context = d_context
        self.p_dropout = p_dropout
        self.num_classes = num_classes
        self.use_extra_film = use_extra_film
        self.d_emb = d_emb
        self.use_scale_shift_norm = use_scale_shift_norm
        self.resblock_updown = resblock_updown

        time_embed_dim = model_channels * 4
        self.pos_enc = PositionEncoder(model_channels, time_embed_dim)

        assert (
            num_classes == -1 or use_extra_film is None
        ), "You cannot set both num_classes and use_extra_film."

        if self.num_classes > 0:
            # TODO: if used for singer, norm should be 1, correct?
            self.label_emb = nn.Embedding(num_classes, time_embed_dim, max_norm=1.0)
        elif use_extra_film is not None:
            assert (
                d_emb is not None
            ), "d_emb must be specified if use_extra_film is not None"
            assert use_extra_film in [
                "add",
                "concat",
            ], f"use_extra_film only supported by add or concat. Your input is {use_extra_film}"
            self.film_emb = ConvNd(dims, d_emb, time_embed_dim, 1)
            if use_extra_film == "concat":
                # forward() concatenates the timestep and FiLM embeddings along the
                # channel axis, so every ResBlock built below sees twice the channels.
                time_embed_dim *= 2

        num_levels = len(channel_mult)

        # Input blocks
        ch = input_ch = int(channel_mult[0] * model_channels)
        self.input_blocks = nn.ModuleList(
            [UNetSequential(ConvNd(dims, in_channels, ch, 3, padding=1))]
        )
        self._feature_size = ch
        # Output channels of every input block, in order; the output path pops
        # them back off to size its skip connections.
        input_block_chans = [ch]
        ds = 1  # current downsample rate, compared against attention_resolutions
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        p_dropout,
                        out_channels=int(mult * model_channels),
                        dims=dims,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = int(mult * model_channels)
                if ds in attention_resolutions:
                    for _ in range(num_attn_blocks):
                        layers.append(
                            AttentionBlock(
                                ch,
                                num_heads=num_heads,
                                num_head_channels=num_head_channels,
                                encoder_channels=d_context,
                                dims=dims,
                                h_dim=h_dim // (level + 1),
                                encoder_hdim=context_hdim,
                                p_dropout=p_dropout,
                            )
                        )
                self.input_blocks.append(UNetSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            # Every level except the last ends with a downsampling block.
            if level != num_levels - 1:
                out_ch = ch
                self.input_blocks.append(
                    UNetSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            p_dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(ch, dims=dims, out_channels=out_ch)
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # Middle blocks
        self.middle_block = UNetSequential(
            ResBlock(
                ch,
                time_embed_dim,
                p_dropout,
                dims=dims,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                num_heads=num_heads,
                num_head_channels=num_head_channels,
                encoder_channels=d_context,
                dims=dims,
                # Same value the deepest level uses: h_dim // (last_level + 1).
                h_dim=h_dim // num_levels,
                encoder_hdim=context_hdim,
                p_dropout=p_dropout,
            ),
            ResBlock(
                ch,
                time_embed_dim,
                p_dropout,
                dims=dims,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch

        # Output blocks
        self.output_blocks = nn.ModuleList([])
        for level, mult in tuple(enumerate(channel_mult))[::-1]:
            # One more block than on the way down so that every entry pushed to
            # input_block_chans is consumed by exactly one skip connection.
            for i in range(num_res_blocks + 1):
                ich = input_block_chans.pop()
                layers = [
                    ResBlock(
                        ch + ich,
                        time_embed_dim,
                        p_dropout,
                        out_channels=int(model_channels * mult),
                        dims=dims,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = int(model_channels * mult)
                if ds in attention_resolutions:
                    for _ in range(num_attn_blocks):
                        layers.append(
                            AttentionBlock(
                                ch,
                                num_heads=num_heads,
                                num_head_channels=num_head_channels,
                                encoder_channels=d_context,
                                dims=dims,
                                h_dim=h_dim // (level + 1),
                                encoder_hdim=context_hdim,
                                p_dropout=p_dropout,
                            )
                        )
                # The last block of every level but the top one upsamples.
                if level and i == num_res_blocks:
                    out_ch = ch
                    layers.append(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            p_dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_scale_shift_norm=use_scale_shift_norm,
                            up=True,
                        )
                        if resblock_updown
                        else Upsample(ch, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(UNetSequential(*layers))
                self._feature_size += ch

        # Final proj out
        # After the output path, ch == input_ch (both are
        # int(channel_mult[0] * model_channels)), so the norm and the conv agree.
        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(ConvNd(dims, input_ch, out_channels, 3, padding=1)),
        )

    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
        r"""Apply the model to an input batch.

        Args:
            x: an [N x C x ...] Tensor of inputs.
            timesteps: a 1-D batch of timesteps, i.e. [N].
            context: conditioning Tensor with shape of [N x ``d_context`` x ...] plugged
                in via cross attention.
            y: an [N] Tensor of labels, if **class-conditional**;
                an [N x ``d_emb`` x ...] Tensor if **film-embed conditional**;
                must be ``None`` otherwise.
            **kwargs: accepted and ignored.

        Returns:
            an [N x C x ...] Tensor of outputs.

        Raises:
            AssertionError: if ``y`` is given to an unconditional model, is missing
                for a conditional one, or has an unexpected shape.
        """
        class_conditional = self.num_classes > 0
        film_conditional = self.use_extra_film is not None
        conditional = class_conditional or film_conditional

        # Validate before any computation so a bad call fails with a clear message
        # instead of an AttributeError on ``y.size()`` further down.
        assert (
            y is None or conditional
        ), f"y may only be given if num_classes > 0 or use_extra_film is not None. \nGot num_classes: {self.num_classes}\t\nuse_extra_film: {self.use_extra_film}\t\n"
        assert (
            y is not None or not conditional
        ), f"y must be specified if num_classes > 0 or use_extra_film is not None. \nGot num_classes: {self.num_classes}\t\nuse_extra_film: {self.use_extra_film}\t\n"

        hs = []
        emb = self.pos_enc(timesteps)

        if class_conditional:
            assert y.size() == (x.size(0),)
            # Add the [N, D] label embedding *before* append_dims reshapes the
            # timestep embedding; adding it afterwards would broadcast the two
            # tensors against mismatched axes.
            # NOTE(review): assumes pos_enc returns [N, time_embed_dim] -- confirm.
            emb = emb + self.label_emb(y)

        emb = append_dims(emb, x.dim())

        if film_conditional:
            assert y.size() == (x.size(0), self.d_emb, *x.size()[2:])
            y = self.film_emb(y)
            if self.use_extra_film == "add":
                emb = emb + y
            elif self.use_extra_film == "concat":
                # torch.cat needs identical sizes on every axis but dim=1, so
                # stretch the timestep embedding over y's trailing axes first
                # (a no-op when they already match).
                emb = torch.cat([emb.expand(-1, -1, *y.shape[2:]), y], dim=1)

        h = x
        for module in self.input_blocks:
            h = module(h, emb, context)
            hs.append(h)  # kept for the matching skip connection
        h = self.middle_block(h, emb, context)
        for module in self.output_blocks:
            h = torch.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)

        return self.out(h)


class UNetSequential(nn.Sequential):
    r"""Sequential container that routes extra inputs to the children that take them.

    ``ResBlock`` children are called with ``(x, emb)``, ``AttentionBlock`` children
    with ``(x, context)``, and every other child with ``x`` alone.
    """

    def forward(self, x, emb=None, context=None):
        r"""Run ``x`` through every child in order and return the final result."""
        out = x
        for child in self:
            # Pick the extra positional argument this kind of child expects.
            if isinstance(child, ResBlock):
                extra = (emb,)
            elif isinstance(child, AttentionBlock):
                extra = (context,)
            else:
                extra = ()
            out = child(out, *extra)
        return out