from torch import nn

from TTS.tts.layers.generic.res_conv_bn import Conv1dBN


class DurationPredictor(nn.Module):
    """Speedy Speech duration predictor model.
    Predicts phoneme durations from encoder outputs.

    Note:
        Outputs interpreted as log(durations)
        To get actual durations, do exp transformation

    conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1

    Args:
        hidden_channels (int): number of channels in the inner layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()

        self.layers = nn.ModuleList(
            [
                Conv1dBN(hidden_channels, hidden_channels, 4, 1),
                Conv1dBN(hidden_channels, hidden_channels, 3, 1),
                Conv1dBN(hidden_channels, hidden_channels, 1, 1),
                nn.Conv1d(hidden_channels, 1, 1),
            ]
        )

    def forward(self, x, x_mask):
        """
        Shapes:
            x: [B, C, T]
            x_mask: [B, 1, T]
        """
        o = x
        for layer in self.layers:
            o = layer(o) * x_mask
        return o