Upload 3 files
- collator.py +68 -0
- modeling_tacotron2.py +323 -0
- processing_tacotron2.py +224 -0
collator.py
ADDED
@@ -0,0 +1,68 @@
from typing import Union

import numpy as np
from transformers.utils import TensorType
from transformers.feature_extraction_utils import BatchFeature


class PadAndSortCollator:
    def __init__(self, processor, return_tensors: Union[str, TensorType] = "pt"):
        self.processor = processor
        self.return_tensors = return_tensors

    def __call__(self, batch):
        """
        Expects a batch produced by the processor with `return_tensors=None`.
        Each item contains: input_ids, length (optional), mel_specgram,
        mel_specgram_length (optional).
        """
        text_batch = {}
        text_batch["input_ids"] = [x["input_ids"] for x in batch]
        if "length" in batch[0]:
            text_batch["length"] = [x["length"] for x in batch]
        else:
            text_batch["length"] = [len(x["input_ids"]) for x in batch]

        audio_batch = {}
        # transpose mel_specgram to (time, n_mels) for padding
        audio_batch["mel_specgram"] = [
            x["mel_specgram"][0].transpose(1, 0) for x in batch
        ]
        if "mel_specgram_length" in batch[0]:
            audio_batch["mel_specgram_length"] = [
                x["mel_specgram_length"] for x in batch
            ]
        else:
            audio_batch["mel_specgram_length"] = [
                x["mel_specgram"][0].shape[1] for x in batch
            ]

        text_batch = self.processor.tokenizer.pad(
            text_batch,
            padding=True,
            return_tensors="np",
            return_attention_mask=False,
        )

        audio_batch = self.processor.feature_extractor.pad(
            audio_batch,
            padding=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        audio_batch["mel_specgram"] = audio_batch["mel_specgram"].transpose(0, 2, 1)

        # build the stop-token target: 0 on valid frames, 1 from the last frame on
        attention_mask = audio_batch.pop("attention_mask")
        gate_padded = 1 - attention_mask
        gate_padded = np.roll(gate_padded, -1, axis=1)
        gate_padded[:, -1] = 1
        gate_padded = gate_padded.astype(np.float32)

        output = {**text_batch, **audio_batch, "gate_padded": gate_padded}

        # sort by text length (descending)
        sort_idx = np.argsort(output["length"])[::-1]

        for key, value in output.items():
            output[key] = value[sort_idx]

        return BatchFeature(output, tensor_type=self.return_tensors)
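A minimal usage sketch for the collator, assuming a `Tacotron2Processor` instance (`processor`) and a dataset (`encoded_dataset`) whose items were produced by that processor with `return_tensors=None`; both objects are assumptions for illustration and are not defined in this commit.

from torch.utils.data import DataLoader

# `processor` and `encoded_dataset` are assumed to exist; each dataset item was
# produced by processor(text=..., audio=..., return_tensors=None) and therefore
# holds input_ids, length, mel_specgram and mel_specgram_length.
collator = PadAndSortCollator(processor, return_tensors="pt")
loader = DataLoader(encoded_dataset, batch_size=8, collate_fn=collator)

batch = next(iter(loader))
# batch contains input_ids, length, mel_specgram, mel_specgram_length and
# gate_padded, padded per batch and sorted by text length in descending order.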
modeling_tacotron2.py
ADDED
@@ -0,0 +1,323 @@
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import Tensor, nn
from torchaudio.models import Tacotron2
from transformers import PretrainedConfig, PreTrainedModel
from transformers.utils import ModelOutput


@dataclass
class Tacotron2Output(ModelOutput):
    """
    mel_outputs_postnet
        The predicted mel spectrogram with shape
        `(n_batch, n_mels, max of mel_specgram_lengths)`.
    mel_specgram_lengths
        The length of the predicted mel spectrogram with shape `(n_batch, )`.
    alignments
        Sequence of attention weights from the decoder with shape
        `(n_batch, max of mel_specgram_lengths, max of lengths)`.
    """

    mel_outputs_postnet: Tensor = None
    mel_specgram_lengths: Tensor = None
    alignments: Tensor = None


@dataclass
class Tacotron2ForPreTrainingOutput(ModelOutput):
    """
    mel_specgram
        Mel spectrogram before Postnet with shape
        `(n_batch, n_mels, max of mel_specgram_lengths)`.
    mel_specgram_postnet
        Mel spectrogram after Postnet with shape
        `(n_batch, n_mels, max of mel_specgram_lengths)`.
    gate_outputs
        The output for the stop token at each time step with shape
        `(n_batch, max of mel_specgram_lengths)`.
    alignments
        Sequence of attention weights from the decoder with shape
        `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
    """

    mel_specgram: Tensor = None
    mel_specgram_postnet: Tensor = None
    gate_outputs: Tensor = None
    alignments: Tensor = None
    loss: Optional[Tensor] = None
    mel_loss: Optional[Tensor] = None
    mel_postnet_loss: Optional[Tensor] = None
    gate_loss: Optional[Tensor] = None


class Tacotron2Config(PretrainedConfig):
    def __init__(
        self,
        mask_padding: bool = False,
        n_mels: int = 80,
        n_symbol: int = 392,
        n_frames_per_step: int = 1,
        symbol_embedding_dim: int = 512,
        encoder_embedding_dim: int = 512,
        encoder_n_convolution: int = 3,
        encoder_kernel_size: int = 5,
        decoder_rnn_dim: int = 1024,
        decoder_max_step: int = 2000,
        decoder_dropout: float = 0.1,
        decoder_early_stopping: bool = True,
        attention_rnn_dim: int = 1024,
        attention_hidden_dim: int = 128,
        attention_location_n_filter: int = 32,
        attention_location_kernel_size: int = 31,
        attention_dropout: float = 0.1,
        prenet_dim: int = 256,
        postnet_n_convolution: int = 5,
        postnet_kernel_size: int = 5,
        postnet_embedding_dim: int = 512,
        gate_threshold: float = 0.5,
        **kwargs,
    ):
        # https://pytorch.org/audio/stable/generated/torchaudio.models.Tacotron2.html#torchaudio.models.Tacotron2  # noqa
        if n_frames_per_step != 1:
            raise ValueError(
                f"n_frames_per_step: only 1 is supported, got {n_frames_per_step}"
            )

        self.mask_padding = mask_padding
        self.n_mels = n_mels
        self.n_symbol = n_symbol
        self.n_frames_per_step = n_frames_per_step
        self.symbol_embedding_dim = symbol_embedding_dim
        self.encoder_embedding_dim = encoder_embedding_dim
        self.encoder_n_convolution = encoder_n_convolution
        self.encoder_kernel_size = encoder_kernel_size
        self.decoder_rnn_dim = decoder_rnn_dim
        self.decoder_max_step = decoder_max_step
        self.decoder_dropout = decoder_dropout
        self.decoder_early_stopping = decoder_early_stopping
        self.attention_rnn_dim = attention_rnn_dim
        self.attention_hidden_dim = attention_hidden_dim
        self.attention_location_n_filter = attention_location_n_filter
        self.attention_location_kernel_size = attention_location_kernel_size
        self.attention_dropout = attention_dropout
        self.prenet_dim = prenet_dim
        self.postnet_n_convolution = postnet_n_convolution
        self.postnet_kernel_size = postnet_kernel_size
        self.postnet_embedding_dim = postnet_embedding_dim
        self.gate_threshold = gate_threshold
        super().__init__(**kwargs)


class Tacotron2PreTrainedModel(PreTrainedModel):
    config_class = Tacotron2Config
    base_model_prefix = "tacotron2"
    main_input_name = "input_ids"


class Tacotron2Model(Tacotron2PreTrainedModel):
    def __init__(self, config: Tacotron2Config):
        super().__init__(config)
        self.tacotron2 = Tacotron2(
            mask_padding=config.mask_padding,
            n_mels=config.n_mels,
            n_symbol=config.n_symbol,
            n_frames_per_step=config.n_frames_per_step,
            symbol_embedding_dim=config.symbol_embedding_dim,
            encoder_embedding_dim=config.encoder_embedding_dim,
            encoder_n_convolution=config.encoder_n_convolution,
            encoder_kernel_size=config.encoder_kernel_size,
            decoder_rnn_dim=config.decoder_rnn_dim,
            decoder_max_step=config.decoder_max_step,
            decoder_dropout=config.decoder_dropout,
            decoder_early_stopping=config.decoder_early_stopping,
            attention_rnn_dim=config.attention_rnn_dim,
            attention_hidden_dim=config.attention_hidden_dim,
            attention_location_n_filter=config.attention_location_n_filter,
            attention_location_kernel_size=config.attention_location_kernel_size,
            attention_dropout=config.attention_dropout,
            prenet_dim=config.prenet_dim,
            postnet_n_convolution=config.postnet_n_convolution,
            postnet_kernel_size=config.postnet_kernel_size,
            postnet_embedding_dim=config.postnet_embedding_dim,
            gate_threshold=config.gate_threshold,
        )

    def forward(
        self,
        input_ids: Tensor,
        length: Optional[Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Runs Tacotron2 inference. The input is a batch of encoded sentences
        (``input_ids``) and their corresponding lengths (``length``). The output
        is the generated mel spectrograms, their corresponding lengths, and the
        attention weights from the decoder.

        The input ``input_ids`` should be padded with zeros to the max of ``length``.

        Args:
            input_ids (Tensor):
                The input tokens to Tacotron2 with shape `(n_batch, max of length)`.
            length (Tensor or None, optional):
                The valid length of each sample in ``input_ids`` with shape `(n_batch, )`.
                If ``None``, it is assumed that all the tokens are valid.
                Default: ``None``

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    The predicted mel spectrogram with shape
                    `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The length of the predicted mel spectrogram with shape
                    `(n_batch, )`.
                Tensor
                    Sequence of attention weights from the decoder with shape
                    `(n_batch, max of mel_specgram_lengths, max of length)`.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.tacotron2.infer(tokens=input_ids, lengths=length)

        if not return_dict:
            return outputs

        return Tacotron2Output(
            mel_outputs_postnet=outputs[0],
            mel_specgram_lengths=outputs[1],
            alignments=outputs[2],
        )


class Tacotron2Loss(nn.Module):
    """Tacotron2 loss function modified from:
    https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/loss_function.py  # noqa
    """

    def __init__(self):
        super().__init__()

        self.mse_loss = nn.MSELoss(reduction="mean")
        self.bce_loss = nn.BCEWithLogitsLoss(reduction="mean")

    def forward(
        self,
        model_outputs: Tuple[Tensor, Tensor, Tensor],
        targets: Tuple[Tensor, Tensor],
    ) -> Tuple[Tensor, Tensor, Tensor]:
        r"""Pass the input through the Tacotron2 loss.
        The original implementation was introduced in
        *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
        [:footcite:`shen2018natural`].

        Args:
            model_outputs (tuple of three Tensors): The outputs of Tacotron2.
                These outputs should include three items:
                (1) the predicted mel spectrogram before the postnet (``mel_specgram``)
                    with shape (batch, mel, time),
                (2) the predicted mel spectrogram after the postnet (``mel_specgram_postnet``)  # noqa
                    with shape (batch, mel, time), and
                (3) the stop token prediction (``gate_out``) with shape (batch, time).
            targets (tuple of two Tensors):
                The ground truth mel spectrogram (batch, mel, time) and
                the stop token target with shape (batch, time).

        Returns:
            mel_loss (Tensor): The mean MSE of the mel_specgram and ground truth mel spectrogram  # noqa
                with shape ``torch.Size([])``.
            mel_postnet_loss (Tensor): The mean MSE of the mel_specgram_postnet and
                ground truth mel spectrogram with shape ``torch.Size([])``.
            gate_loss (Tensor): The mean binary cross entropy loss of
                the prediction on the stop token with shape ``torch.Size([])``.
        """
        mel_target, gate_target = targets[0], targets[1]
        gate_target = gate_target.view(-1, 1)

        mel_specgram, mel_specgram_postnet, gate_out = model_outputs
        gate_out = gate_out.view(-1, 1)
        mel_loss = self.mse_loss(mel_specgram, mel_target)
        mel_postnet_loss = self.mse_loss(mel_specgram_postnet, mel_target)
        gate_loss = self.bce_loss(gate_out, gate_target)
        return mel_loss, mel_postnet_loss, gate_loss


class Tacotron2ForPreTraining(Tacotron2PreTrainedModel):
    def __init__(self, config: Tacotron2Config):
        super().__init__(config)
        self.tacotron2 = Tacotron2(
            mask_padding=config.mask_padding,
            n_mels=config.n_mels,
            n_symbol=config.n_symbol,
            n_frames_per_step=config.n_frames_per_step,
            symbol_embedding_dim=config.symbol_embedding_dim,
            encoder_embedding_dim=config.encoder_embedding_dim,
            encoder_n_convolution=config.encoder_n_convolution,
            encoder_kernel_size=config.encoder_kernel_size,
            decoder_rnn_dim=config.decoder_rnn_dim,
            decoder_max_step=config.decoder_max_step,
            decoder_dropout=config.decoder_dropout,
            decoder_early_stopping=config.decoder_early_stopping,
            attention_rnn_dim=config.attention_rnn_dim,
            attention_hidden_dim=config.attention_hidden_dim,
            attention_location_n_filter=config.attention_location_n_filter,
            attention_location_kernel_size=config.attention_location_kernel_size,
            attention_dropout=config.attention_dropout,
            prenet_dim=config.prenet_dim,
            postnet_n_convolution=config.postnet_n_convolution,
            postnet_kernel_size=config.postnet_kernel_size,
            postnet_embedding_dim=config.postnet_embedding_dim,
            gate_threshold=config.gate_threshold,
        )

        self.loss_fct = Tacotron2Loss()

    def sync_batchnorm(self):
        self.tacotron2 = nn.SyncBatchNorm.convert_sync_batchnorm(self.tacotron2)

    def forward(
        self,
        input_ids: Tensor,
        length: Tensor,
        mel_specgram: Tensor,
        mel_specgram_length: Tensor,
        gate_padded: Optional[Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.tacotron2(
            tokens=input_ids,
            token_lengths=length,
            mel_specgram=mel_specgram,
            mel_specgram_lengths=mel_specgram_length,
        )

        loss = mel_loss = mel_postnet_loss = gate_loss = None
        if gate_padded is not None:
            targets = (mel_specgram, gate_padded)
            targets[0].requires_grad = False
            targets[1].requires_grad = False
            mel_loss, mel_postnet_loss, gate_loss = self.loss_fct(outputs[:3], targets)
            loss = mel_loss + mel_postnet_loss + gate_loss

        if not return_dict:
            if loss is not None:
                return outputs + (loss, mel_loss, mel_postnet_loss, gate_loss)
            return outputs

        return Tacotron2ForPreTrainingOutput(
            mel_specgram=outputs[0],
            mel_specgram_postnet=outputs[1],
            gate_outputs=outputs[2],
            alignments=outputs[3],
            loss=loss,
            mel_loss=mel_loss,
            mel_postnet_loss=mel_postnet_loss,
            gate_loss=gate_loss,
        )
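A minimal training-step sketch for `Tacotron2ForPreTraining`, assuming a `batch` dictionary shaped like the output of `PadAndSortCollator` above; the config values and optimizer settings are illustrative only and not part of this commit.

import torch

config = Tacotron2Config()  # defaults mirror torchaudio's Tacotron2
model = Tacotron2ForPreTraining(config)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)  # illustrative

model.train()
outputs = model(
    input_ids=batch["input_ids"],
    length=batch["length"],
    mel_specgram=batch["mel_specgram"],
    mel_specgram_length=batch["mel_specgram_length"],
    gate_padded=batch["gate_padded"],  # providing the gate target enables the loss
    return_dict=True,
)
outputs.loss.backward()  # loss = mel_loss + mel_postnet_loss + gate_loss
optimizer.step()
optimizer.zero_grad()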
processing_tacotron2.py
ADDED
@@ -0,0 +1,224 @@
import copy
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
from torchaudio.transforms import MelSpectrogram
from transformers import Wav2Vec2PhonemeCTCTokenizer
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from transformers.utils import TensorType, logging

logger = logging.get_logger(__name__)
AudioType = Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]


class Tacotron2FeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["mel_specgram", "mel_specgram_length", "gate_padded"]

    def __init__(
        self,
        feature_size: int = 80,  # n_mels
        sampling_rate: int = 22050,
        n_fft: int = 1024,
        hop_length: int = 256,
        win_length: int = 1024,
        mel_fmin: float = 0.0,
        mel_fmax: float = 8000.0,
        padding_value: float = 0.0,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.feature_size = feature_size
        self.sampling_rate = sampling_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax

    def mel_specgram(self, waveform: torch.Tensor) -> torch.Tensor:
        if not hasattr(self, "_mel_specgram"):
            self._mel_specgram = MelSpectrogram(
                sample_rate=self.sampling_rate,
                n_fft=self.n_fft,
                win_length=self.win_length,
                hop_length=self.hop_length,
                f_min=self.mel_fmin,
                f_max=self.mel_fmax,
                n_mels=self.feature_size,
                mel_scale="slaney",
                normalized=False,
                power=1,
                norm="slaney",
            )
        melspectrogram = self._mel_specgram(waveform)
        # spectral normalization
        output = torch.log(torch.clamp(melspectrogram, min=1e-5))

        # transpose to (time, n_mels) for padding
        return output.permute(1, 0)

    def __call__(
        self,
        audio: AudioType,
        sampling_rate: Optional[int] = None,
        padding: Union[bool, str] = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_length: bool = False,
        return_gate_padded: bool = False,
        **kwargs,
    ) -> BatchFeature:

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided `audio` input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched = bool(
            isinstance(audio, (list, tuple))
            and (
                isinstance(audio[0], np.ndarray) or isinstance(audio[0], (tuple, list))
            )
        )

        if is_batched:
            audio = [np.asarray(speech, dtype=np.float32) for speech in audio]
        elif not is_batched and not isinstance(audio, np.ndarray):
            audio = np.asarray(audio, dtype=np.float32)
        elif isinstance(audio, np.ndarray) and audio.dtype is np.dtype(np.float64):
            audio = audio.astype(np.float32)

        # always return a batch
        if not is_batched:
            audio = [audio]

        features = [
            self.mel_specgram(torch.from_numpy(one_waveform)).numpy()
            for one_waveform in audio
        ]

        encoded_inputs = BatchFeature({"mel_specgram": features})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            return_attention_mask=return_gate_padded,
            **kwargs,
        )

        if return_length:
            mel_specgram_length = [mel.shape[0] for mel in features]
            if len(mel_specgram_length) == 1 and return_tensors is None:
                mel_specgram_length = mel_specgram_length[0]
            padded_inputs["mel_specgram_length"] = mel_specgram_length

        if return_gate_padded:
            # stop-token target: 0 on valid frames, 1 from the last valid frame on
            gate_padded = 1 - padded_inputs.pop("attention_mask")
            gate_padded = np.roll(gate_padded, -1, axis=1)
            gate_padded[:, -1] = 1
            gate_padded = gate_padded.astype(np.float32)
            padded_inputs["gate_padded"] = gate_padded

        mel_specgram = padded_inputs["mel_specgram"]
        if isinstance(mel_specgram[0], list):
            padded_inputs["mel_specgram"] = [
                np.asarray(feature, dtype=np.float32) for feature in mel_specgram
            ]

        # transpose back to (n_mels, time)
        padded_inputs["mel_specgram"] = [
            spec.transpose(1, 0) for spec in padded_inputs["mel_specgram"]
        ]

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes this instance to a Python dictionary.
        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["feature_extractor_type"] = self.__class__.__name__
        output.pop("_mel_specgram", None)

        return output


class Tacotron2Processor(ProcessorMixin):
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "Wav2Vec2PhonemeCTCTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor

    def __call__(
        self,
        text: Optional[str] = None,
        audio: Optional[AudioType] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_length: bool = True,
        **kwargs,
    ) -> Any:
        if text is None and audio is None:
            raise ValueError(
                "You have to specify either text or audio. Both cannot be None."
            )

        if text is not None:
            encoding = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=True,
                return_attention_mask=False,
                return_length=return_length,
            )

        if audio is not None:
            features = self.feature_extractor(
                audio,
                return_tensors=return_tensors,
                return_length=return_length,
                **kwargs,
            )

        if text is not None and audio is not None:
            return BatchFeature({**features, **encoding})
        elif text is not None:
            return encoding
        else:
            return features

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
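Tying the three files together, a hedged end-to-end inference sketch: build the processor from the feature extractor and a phoneme tokenizer, encode a sentence, and run `Tacotron2Model`. The tokenizer checkpoint named below is an assumption for illustration (any `Wav2Vec2PhonemeCTCTokenizer` whose vocabulary matches `n_symbol` would do), and a separate vocoder, not part of this commit, is needed to turn the mel spectrogram into a waveform.

import torch

# Assumed checkpoint; the actual repository may ship its own tokenizer files.
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(
    "facebook/wav2vec2-lv-60-espeak-cv-ft"
)
feature_extractor = Tacotron2FeatureExtractor()
processor = Tacotron2Processor(feature_extractor, tokenizer)

model = Tacotron2Model(Tacotron2Config())  # untrained weights; load a checkpoint in practice
model.eval()

inputs = processor(text="hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(
        input_ids=inputs["input_ids"],
        length=inputs["length"],
        return_dict=True,
    )

mel = outputs.mel_outputs_postnet  # (1, n_mels, time); feed this to a vocoder for audio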