# Hugging Face Space: amphion/NaturalSpeech2 (status: Sleeping) - file size: 27,987 bytes - revision b725c5a

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This code is modified from https://github.com/lifeiteng/vall-e/blob/main/valle/models/valle.py

import random
from typing import Dict, Iterator, List, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy
from utils.util import make_pad_mask
from utils.topk_sampling import topk_sampling
from modules.general import Transpose
from modules.encoder import TokenEmbedding
from modules.general import PromptedFeatures
from modules.transformer import SinePositionalEmbedding
from modules.norms import AdaptiveLayerNorm, LayerNorm
from modules.transformer.transformer import TransformerEncoder, TransformerEncoderLayer


class VALLE(nn.Module):
    def __init__(
        self,
        cfg,
        decoder_cls=TransformerEncoder,
        decoder_layer_cls=TransformerEncoderLayer,
    ):
        """Build the autoregressive (AR) stack and, if needed, the non-autoregressive (NAR) stack.

        Args:
          cfg:
            Configuration object. The attributes read here are `decoder_dim`,
            `nhead`, `nar_scale_factor`, `num_quantizers`, `num_decoder_layers`,
            `text_token_num`, `audio_token_num`, `prepend_bos`, `add_prenet`,
            `norm_first`, `prefix_mode` and (only when `num_quantizers > 1`)
            `share_embedding`.
          decoder_cls:
            Class used to stack decoder layers; called as
            `decoder_cls(layer, num_layers=..., norm=...)`.
          decoder_layer_cls:
            Class used to build a single decoder layer.

        The NAR modules (all attributes prefixed with `nar_`, except
        `nar_text_embedding`) are only created when `num_quantizers > 1`.
        """
        super().__init__()

        dim = cfg.decoder_dim
        heads = cfg.nhead
        nar_scale = cfg.nar_scale_factor
        n_q = cfg.num_quantizers
        depth = cfg.num_decoder_layers
        vocab = cfg.audio_token_num
        nar_dim = int(dim * nar_scale)

        def make_text_prenet(width):
            # Conv prenet over the time axis (3 x Conv1d/BatchNorm/ReLU/Dropout
            # between two Transpose modules, then a Linear); Identity when
            # prenets are disabled in the config.
            if not cfg.add_prenet:
                return nn.Identity()
            stack = [Transpose()]
            for _ in range(3):
                stack.extend(
                    [
                        nn.Conv1d(width, width, kernel_size=5, padding="same"),
                        nn.BatchNorm1d(width),
                        nn.ReLU(),
                        nn.Dropout(0.5),
                    ]
                )
            stack.extend([Transpose(), nn.Linear(width, width)])
            return nn.Sequential(*stack)

        def make_audio_prenet(width):
            # Bottleneck MLP (width -> 256 -> 256 -> width); Identity when
            # prenets are disabled in the config.
            if not cfg.add_prenet:
                return nn.Identity()
            return nn.Sequential(
                nn.Linear(width, 256),
                nn.ReLU(),
                nn.Dropout(0.25),
                nn.Linear(256, 256),
                nn.ReLU(),
                nn.Dropout(0.25),
                nn.Linear(256, width),
            )

        def make_accuracy_metric():
            # Top-10 micro accuracy over `vocab + 1` classes; the extra class
            # index `vocab` is ignored.
            return MulticlassAccuracy(
                vocab + 1,
                top_k=10,
                average="micro",
                multidim_average="global",
                ignore_index=vocab,
            )

        # ---- Embeddings ----
        self.ar_text_embedding = TokenEmbedding(dim, cfg.text_token_num)
        self.nar_text_embedding = TokenEmbedding(nar_dim, cfg.text_token_num)

        self.ar_audio_prepend_bos = cfg.prepend_bos
        # One extra id beyond the codebook (used as EOS by `forward`), plus one
        # more (BOS) when `prepend_bos` is set.
        self.ar_audio_embedding = TokenEmbedding(
            dim, vocab + 1 + int(cfg.prepend_bos)
        )
        self.audio_token_num = vocab

        # ---- AR prenets and positional encodings ----
        self.ar_text_prenet = make_text_prenet(dim)
        self.ar_audio_prenet = make_audio_prenet(dim)

        self.ar_text_position = SinePositionalEmbedding(
            dim, dropout=0.1, scale=False, alpha=True
        )
        self.ar_audio_position = SinePositionalEmbedding(
            dim, dropout=0.1, scale=False, alpha=True
        )

        # ---- AR decoder and output head ----
        ar_layer = decoder_layer_cls(
            dim,
            heads,
            dim_feedforward=dim * 4,
            dropout=0.1,
            batch_first=True,
            norm_first=cfg.norm_first,
        )
        ar_final_norm = LayerNorm(dim) if cfg.norm_first else None
        self.ar_decoder = decoder_cls(
            ar_layer, num_layers=depth, norm=ar_final_norm
        )
        self.ar_predict_layer = nn.Linear(dim, vocab + 1, bias=False)

        self.ar_accuracy_metric = make_accuracy_metric()

        # Private RNG with a fixed seed; used for NAR stage / prompt sampling.
        self.rng = random.Random(0)
        self.num_heads = heads
        self.prefix_mode = cfg.prefix_mode
        self.num_quantizers = n_q

        assert n_q >= 1
        if n_q <= 1:
            # Single codebook: nothing for a NAR decoder to predict.
            return

        # ---- NAR embeddings: one table per quantizer ----
        # The first table has one extra row; the first-codebook inputs fed to
        # it during training come from `pad_y_eos` and can contain the id
        # `audio_token_num`.
        nar_tables = [TokenEmbedding(nar_dim, vocab + 1)]
        nar_tables.extend(TokenEmbedding(nar_dim, vocab) for _ in range(n_q - 1))
        self.nar_audio_embeddings = nn.ModuleList(nar_tables)

        # ---- NAR prenets and positional encodings ----
        self.nar_text_prenet = make_text_prenet(nar_dim)
        self.nar_audio_prenet = make_audio_prenet(nar_dim)

        self.nar_text_position = SinePositionalEmbedding(
            nar_dim, dropout=0.0, scale=False, alpha=False
        )
        self.nar_audio_position = SinePositionalEmbedding(
            nar_dim, dropout=0.1, scale=False, alpha=False
        )

        # ---- NAR decoder (stage-conditioned through adaptive layer norm) ----
        nar_layer = decoder_layer_cls(
            nar_dim,
            int(heads * nar_scale),
            dim_feedforward=nar_dim * 4,
            dropout=0.1,
            batch_first=True,
            norm_first=cfg.norm_first,
            adaptive_layer_norm=True,
        )
        nar_depth = int(depth * nar_scale)
        if cfg.norm_first:
            nar_final_norm = AdaptiveLayerNorm(
                nar_dim, norm=nn.LayerNorm(nar_dim)
            )
        else:
            nar_final_norm = None
        self.nar_decoder = decoder_cls(
            nar_layer, num_layers=nar_depth, norm=nar_final_norm
        )

        # One output head and one single-row stage embedding per NAR stage.
        self.nar_predict_layers = nn.ModuleList(
            [nn.Linear(nar_dim, vocab, bias=False) for _ in range(n_q - 1)]
        )
        self.nar_stage_embeddings = nn.ModuleList(
            [TokenEmbedding(nar_dim, 1) for _ in range(n_q - 1)]
        )

        if cfg.share_embedding:
            # Tie head j to embedding table j + 2; zip stops after the
            # `n_q - 2` available pairs, so the last head stays untied.
            for head, table in zip(
                self.nar_predict_layers, self.nar_audio_embeddings[2:]
            ):
                head.weight = table.weight

        self.nar_accuracy_metric = make_accuracy_metric()

    def forward(
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        y: Union[torch.Tensor, PromptedFeatures],
        y_lens: Union[torch.Tensor, PromptedFeatures],
        reduction: str = "sum",
        train_stage: int = 0,
        **kwargs,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Compute the training loss of the AR and/or NAR decoder.

        Args:
          x:
            A 2-D tensor of shape (N, S) holding text token ids.
          x_lens:
            A 1-D tensor of shape (N,). It contains the number of tokens in `x`
            before padding.
          y:
            A 3-D tensor of audio codes (the last axis indexes the quantizers),
            or a `PromptedFeatures` whose `.data` unpacks to
            `(prompt_codes, codes)`. The latter requires `prefix_mode == 4`.
          y_lens:
            A 1-D tensor of shape (N,). It contains the number of frames in `y`
            before padding. When `y` is a `PromptedFeatures`, this is one too
            and its `.data` unpacks to `(prompt_lens, lens)`; all prompt
            lengths must be equal.
          reduction:
            Reduction mode forwarded to `F.cross_entropy`.
          train_stage:
            0: AR & NAR modules, 1: AR modules, 2: NAR modules
          **kwargs:
            Accepted and ignored.
        Returns:
          A tuple `(total_loss, metrics)`. `total_loss` is the sum of the
          losses of the decoders that ran, halved when `train_stage == 0`.
          `metrics` maps "AR_Top100Acc" / "NAR_Top100Acc" to the accuracy of
          the corresponding decoder multiplied by the total number of frames.
          NOTE(review): the keys say "Top100" but the metrics are built with
          `top_k=10`; the keys are kept because callers may read them.
        """
        assert x.ndim == 2, x.shape
        assert x_lens.ndim == 1, x_lens.shape

        y_prompts_codes = None
        if isinstance(y, PromptedFeatures):
            y_prompts_codes, y = y.data
            prompts_len, y_lens = y_lens.data
            assert prompts_len.min() == prompts_len.max()
            assert self.prefix_mode == 4
            y_prompts_codes = y_prompts_codes.type(torch.int64)

        assert y.ndim == 3, y.shape
        assert y_lens.ndim == 1, y_lens.shape

        x_mask = make_pad_mask(x_lens).to(x.device)
        y_mask = make_pad_mask(y_lens).to(y.device)
        y_mask_int = y_mask.type(torch.int64)

        text = x
        # Zero out the codes at padded frames so that `pad_y_eos` can turn
        # them into the EOS id by addition.
        codes = y.type(torch.int64) * (1 - y_mask_int.unsqueeze(dim=-1))

        # AR inputs / targets are built from the first quantizer only.
        y, targets = self.pad_y_eos(
            codes[..., 0], y_mask_int, eos_id=self.audio_token_num
        )
        # The masks below are stashed on `self` because the decoder helpers
        # read them from there.
        self.y_mask_int = y_mask_int

        metrics = {}
        total_loss = 0.0

        xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
        if self.ar_audio_prepend_bos:
            # The BOS column is never padding.
            ar_xy_padding_mask = torch.concat(
                [x_mask, F.pad(y_mask, (1, 0), value=False)], dim=1
            )
        else:
            ar_xy_padding_mask = xy_padding_mask
        self.xy_padding_mask = xy_padding_mask
        self.ar_xy_padding_mask = ar_xy_padding_mask

        # AR Decoder
        if train_stage in [0, 1]:
            # Pass the per-utterance lengths (not their max): the helper takes
            # `.max()` itself for the mask size and uses `.sum()` to scale the
            # accuracy by the total number of frames, matching the NAR metric.
            ar_loss, ar_metrics = self._forward_ar_decoder(
                text, x_lens.max(), y, y_lens, targets, x_mask, y_mask, reduction
            )
            total_loss += ar_loss
            metrics["AR_Top100Acc"] = ar_metrics

        # NAR Decoder
        if self.ar_audio_prepend_bos:
            # Drop the BOS column again; the NAR decoder does not use it.
            y = y[:, 1:]

        if self.num_quantizers > 1 and train_stage in [0, 2]:
            nar_loss, nar_metrics = self._forward_nar_decoder(
                text,
                x_lens,
                y,
                y_lens,
                codes,
                y_prompts_codes,
                x_mask,
                y_mask,
                reduction,
            )
            total_loss += nar_loss
            metrics["NAR_Top100Acc"] = nar_metrics

        if train_stage == 0:
            total_loss = total_loss / 2.0

        return total_loss, metrics

    def _forward_ar_decoder(
        self, x, x_len, y, y_lens, targets, x_mask, y_mask, reduction
    ):
        x = self.ar_text_embedding(x)
        x = self.ar_text_prenet(x)
        x = self.ar_text_position(x)

        y_len = y_lens.max() + int(self.ar_audio_prepend_bos)

        x_attn_mask = F.pad(
            torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
            (0, y_len),
            value=True,
        )
        y_attn_mask = F.pad(
            torch.triu(
                torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
                diagonal=1,
            ),
            (x_len, 0),
            value=False,
        )
        xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)

        bsz, src_len = x.shape[0], x_len + y_len
        _xy_padding_mask = (
            self.ar_xy_padding_mask.view(bsz, 1, 1, src_len)
            .expand(-1, self.num_heads, -1, -1)
            .reshape(bsz * self.num_heads, 1, src_len)
        )
        xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)

        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
        new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
        xy_attn_mask = new_attn_mask

        y_emb = self.ar_audio_embedding(y)
        y_emb = self.ar_audio_prenet(y_emb)
        y_pos = self.ar_audio_position(y_emb)

        xy_pos = torch.concat([x, y_pos], dim=1)

        xy_dec, _ = self.ar_decoder(
            (xy_pos, None),
            mask=xy_attn_mask,
        )
        logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1)
        ar_loss = F.cross_entropy(logits, targets, reduction=reduction)

        ar_metrics = self.ar_accuracy_metric(
            logits.detach(), targets
        ).item() * y_lens.sum().type(torch.float32)

        return ar_loss, ar_metrics

    def _forward_nar_decoder(
        self, x, x_lens, y, y_lens, codes, y_prompts_codes, x_mask, y_mask, reduction
    ):
        num_nar_layers = self.num_quantizers - 1
        nar_stage = self.rng.choices(
            [_k for _k in range(1, self.num_quantizers)],
            weights=[1.0 / num_nar_layers] * num_nar_layers,
            k=1,
        )[0]

        x = self.nar_text_embedding(x)
        x = self.nar_text_prenet(x)
        x = self.nar_text_position(x)

        y_emb, prefix_len = self._prepare_prompts(
            y, y_lens, codes, nar_stage, y_prompts_codes
        )

        y_len = y_lens.max()
        targets = codes[..., nar_stage] + self.audio_token_num * self.y_mask_int
        if self.prefix_mode in [2, 4]:
            xy_padding_mask = torch.concat(
                [
                    x_mask,
                    F.pad(y_mask, (y_emb.shape[1] - y_len, 0), value=False),
                ],
                dim=1,
            )
        elif self.prefix_mode == 1:
            targets = targets[:, prefix_len:]

        y_pos = self.nar_audio_prenet(y_emb)
        y_pos = self.nar_audio_position(y_pos)
        xy_pos = torch.concat([x, y_pos], dim=1)
        xy_dec, _ = self.nar_decoder(
            (xy_pos, self.nar_stage_embeddings[nar_stage - 1].weight),
            src_key_padding_mask=self.xy_padding_mask,
        )
        xy_dec = xy_dec[:, x_lens.max() + prefix_len :]
        if self.prefix_mode == 4:
            prefix_len = 0
        logits = self.nar_predict_layers[nar_stage - 1](xy_dec).permute(0, 2, 1)

        total_length = (y_lens).sum().type(torch.float32)
        nar_loss = F.cross_entropy(
            logits,
            targets,
            ignore_index=self.audio_token_num,
            reduction=reduction,
        ) * (total_length / (total_length - prefix_len * x.shape[0]))
        nar_metrics = (
            self.nar_accuracy_metric(
                F.pad(
                    logits.detach(),
                    (0, 0, 0, 1, 0, 0),
                    value=logits.min().cpu().item(),
                ),
                targets,
            ).item()
            * total_length
        )
        return nar_loss, nar_metrics

    def inference(
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        y: torch.Tensor,
        enroll_x_lens: torch.Tensor,
        top_k: int = -100,
        temperature: float = 1.0,
    ) -> torch.Tensor:
        """Generate audio codes for `x`, continuing from the audio prompt `y`.

        The first quantizer is sampled token by token with the AR decoder;
        the remaining quantizers are then predicted greedily (argmax), one NAR
        stage per quantizer.

        Args:
          x:
            A 2-D tensor of shape (1, S).
          x_lens:
            A 1-D tensor of shape (1,). It contains the number of tokens in `x`
            before padding.
          y:
            A 3-D tensor of shape (1, T, 8) holding the audio prompt codes.
          enroll_x_lens:
            Only read when `prefix_mode` is 2 or 4; its maximum is used to cut
            the leading (enrolled) part of the text out of `x` before the NAR
            stages.
          top_k: (`optional`) int
            The number of highest probability tokens to keep for top-k-filtering. Default to -100.
          temperature: (`optional`) float
            The value used to module the next token probabilities. Must be strictly positive. Default to 1.0.
        Returns:
          Return the predicted audio code matrix (prompt frames excluded), with
          one entry per quantizer stacked along the last axis.
        Raises:
          SyntaxError: if the AR decoder stops before producing any token.
        """
        assert x.ndim == 2, x.shape
        assert x_lens.ndim == 1, x_lens.shape
        assert y.ndim == 3, y.shape
        assert y.shape[0] == 1, y.shape

        assert torch.all(x_lens > 0)

        text = x
        x = self.ar_text_embedding(text)
        x = self.ar_text_prenet(x)
        x = self.ar_text_position(x)

        text_len = x_lens.max()
        prompts = y
        prefix_len = y.shape[1]

        # AR Decoder
        # Only the first quantizer of the prompt seeds the AR generation.
        y = prompts[..., 0]
        if self.ar_audio_prepend_bos:
            y = F.pad(y, (1, 0), value=self.audio_token_num + 1)

        x_len = x_lens.max()
        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)

        while True:
            # The whole sequence is re-encoded at every step (no caching).
            y_emb = self.ar_audio_embedding(y)
            y_emb = self.ar_audio_prenet(y_emb)
            y_pos = self.ar_audio_position(y_emb)
            xy_pos = torch.concat([x, y_pos], dim=1)

            y_len = y.shape[1]
            # True = blocked: text rows cannot attend to audio positions.
            x_attn_mask_pad = F.pad(
                x_attn_mask,
                (0, y_len),
                value=True,
            )
            # Audio rows attend to all text and causally to audio.
            y_attn_mask = F.pad(
                torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
                (x_len, 0),
                value=False,
            )
            xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
                y.device
            )

            xy_dec, _ = self.ar_decoder(
                (xy_pos, None),
                mask=xy_attn_mask,
            )
            # Predict the next token from the last position only.
            logits = self.ar_predict_layer(xy_dec[:, -1])
            samples = topk_sampling(
                logits, top_k=top_k, top_p=1.0, temperature=temperature
            )

            # Stop when either the greedy choice or the sampled token is the
            # EOS id, or when more than 16 tokens per text token were produced.
            if (
                torch.argmax(logits, dim=-1)[0] == self.audio_token_num
                or samples[0, 0] == self.audio_token_num
                or (y.shape[1] - prompts.shape[1]) > x_lens.max() * 16
            ):
                # NOTE(review): compares against the prompt length without the
                # optional BOS column — confirm this is intended when
                # `prepend_bos` is set. SyntaxError is also an unusual type
                # for a runtime failure; kept because callers may catch it.
                if prompts.shape[1] == y.shape[1]:
                    raise SyntaxError("well trained model shouldn't reach here.")

                break

            y = torch.concat([y, samples], dim=1)

        # Generated first-quantizer tokens, without prompt (and BOS).
        codes = [y[:, prefix_len + int(self.ar_audio_prepend_bos) :]]
        if self.num_quantizers == 1:
            return torch.stack(codes, dim=-1)

        # Non-AR Decoders
        # Embed prompt + generated first-quantizer tokens (BOS stripped).
        y_emb = self.nar_audio_embeddings[0](y[:, int(self.ar_audio_prepend_bos) :])

        if self.prefix_mode in [2, 4]:
            enrolled_len = enroll_x_lens.max().item()
            # SOS + Synthesis Text + EOS
            # Keep the first token and everything from index enrolled_len - 1 on.
            text = torch.concat(
                [
                    text[:, :1],
                    text[:, enrolled_len - 1 :],
                ],
                dim=1,
            )
            text_len = text_len - (enrolled_len - 2)
            assert text.shape[0] == 1

        x = self.nar_text_embedding(text)
        x = self.nar_text_prenet(x)
        x = self.nar_text_position(x)

        if self.prefix_mode == 0:
            # Prompt embeddings of the higher quantizers are added stage by
            # stage, together with the freshly predicted codes.
            for i, (predict_layer, embedding_layer) in enumerate(
                zip(
                    self.nar_predict_layers,
                    self.nar_audio_embeddings[1:],
                )
            ):
                y_pos = self.nar_audio_prenet(y_emb)
                y_pos = self.nar_audio_position(y_pos)
                xy_pos = torch.concat([x, y_pos], dim=1)

                xy_dec, _ = self.nar_decoder(
                    (xy_pos, self.nar_stage_embeddings[i].weight)
                )
                # Score only the generated frames (after text and prompt).
                logits = predict_layer(xy_dec[:, text_len + prefix_len :])

                samples = torch.argmax(logits, dim=-1)
                codes.append(samples)

                # The last stage's output is not fed back anywhere.
                if i < self.num_quantizers - 2:
                    y_emb[:, :prefix_len] += embedding_layer(prompts[..., i + 1])
                    y_emb[:, prefix_len:] += embedding_layer(samples)
        else:
            # All quantizers of the prompt are summed into the prompt frames
            # up front; only the generated frames are updated per stage.
            for j in range(1, self.num_quantizers):
                y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](prompts[..., j])

            for i, (predict_layer, embedding_layer) in enumerate(
                zip(
                    self.nar_predict_layers,
                    self.nar_audio_embeddings[1:],
                )
            ):
                y_pos = self.nar_audio_prenet(y_emb)
                y_pos = self.nar_audio_position(y_pos)
                xy_pos = torch.concat([x, y_pos], dim=1)

                xy_dec, _ = self.nar_decoder(
                    (xy_pos, self.nar_stage_embeddings[i].weight)
                )
                logits = predict_layer(xy_dec[:, text_len + prefix_len :])

                samples = torch.argmax(logits, dim=-1)
                codes.append(samples)

                if i < self.num_quantizers - 2:
                    y_emb[:, prefix_len:] += embedding_layer(samples)

        assert len(codes) == self.num_quantizers
        return torch.stack(codes, dim=-1)

    def continual(
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        y: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
          x:
            A 2-D tensor of shape (1, S).
          x_lens:
            A 1-D tensor of shape (1,). It contains the number of tokens in `x`
            before padding.
          y:
            A 3-D tensor of shape (1, T, 8).
        Returns:
          Return the predicted audio code matrix.
        """
        assert x.ndim == 2, x.shape
        assert x_lens.ndim == 1, x_lens.shape
        assert y.ndim == 3, y.shape
        assert y.shape[0] == 1, y.shape

        assert torch.all(x_lens > 0)
        assert self.num_quantizers == 8

        text = x
        x = self.ar_text_embedding(text)
        x = self.ar_text_prenet(x)
        x = self.ar_text_position(x)

        text_len = x_lens.max()

        prefix_len = min(int(y.shape[1] * 0.5), 3 * 75)

        # AR Decoder
        prompts = y[:, :prefix_len]

        codes = [y[:, prefix_len:, 0]]
        # Non-AR Decoders
        x = self.nar_text_embedding(text)
        x = self.nar_text_prenet(x)
        x = self.nar_text_position(x)

        y_emb = self.nar_audio_embeddings[0](y[..., 0])

        if self.prefix_mode == 0:
            for i, (predict_layer, embedding_layer) in enumerate(
                zip(
                    self.nar_predict_layers,
                    self.nar_audio_embeddings[1:],
                )
            ):
                y_pos = self.nar_audio_position(y_emb)
                y_pos = self.nar_audio_prenet(y_pos)
                xy_pos = torch.concat([x, y_pos], dim=1)

                xy_dec, _ = self.nar_decoder(
                    (xy_pos, self.nar_stage_embeddings[i].weight)
                )
                logits = predict_layer(xy_dec[:, text_len + prefix_len :])

                samples = torch.argmax(logits, dim=-1)
                codes.append(samples)

                if i < 6:
                    y_emb[:, :prefix_len] += embedding_layer(prompts[..., i + 1])
                    y_emb[:, prefix_len:] += embedding_layer(samples)
        else:
            for j in range(1, 8):
                y_emb[:, :prefix_len] += self.nar_audio_embeddings[j](prompts[..., j])

            for i, (predict_layer, embedding_layer) in enumerate(
                zip(
                    self.nar_predict_layers,
                    self.nar_audio_embeddings[1:],
                )
            ):
                y_pos = self.nar_audio_prenet(y_emb)
                y_pos = self.nar_audio_position(y_pos)
                xy_pos = torch.concat([x, y_pos], dim=1)

                xy_dec, _ = self.nar_decoder(
                    (xy_pos, self.nar_stage_embeddings[i].weight)
                )
                logits = predict_layer(xy_dec[:, text_len + prefix_len :])

                samples = torch.argmax(logits, dim=-1)
                codes.append(samples)

                if i < 6:
                    y_emb[:, prefix_len:] += embedding_layer(samples)

        assert len(codes) == 8
        return torch.stack(codes, dim=-1)

    def stage_parameters(self, stage: int = 1) -> Iterator[nn.Parameter]:
        """Yield the parameters that belong to one training stage.

        Stage 1 selects parameters whose name starts with "ar_", stage 2 those
        whose name starts with "nar_"; any other positive stage yields nothing.
        """
        assert stage > 0
        if stage == 1:
            wanted_prefix = "ar_"
        elif stage == 2:
            wanted_prefix = "nar_"
        else:
            return

        yield from (
            param
            for name, param in self.named_parameters()
            if name.startswith(wanted_prefix)
        )

    def stage_named_parameters(
        self, stage: int = 1
    ) -> Iterator[Tuple[str, nn.Parameter]]:
        """Yield `(name, parameter)` pairs that belong to one training stage.

        Stage 1 selects names starting with "ar_", stage 2 names starting with
        "nar_"; any other positive stage yields nothing.
        """
        assert stage > 0
        if stage == 1:
            wanted_prefix = "ar_"
        elif stage == 2:
            wanted_prefix = "nar_"
        else:
            return

        for entry in self.named_parameters():
            name = entry[0]
            if name.startswith(wanted_prefix):
                yield entry

    def pad_y_eos(self, y, y_mask_int, eos_id):
        """Build AR decoder inputs and targets from first-quantizer tokens.

        One column is appended to `y`; `eos_id` is added at that column and at
        every position flagged with 1 in `y_mask_int` (so positions that were
        zeroed beforehand end up holding exactly `eos_id`).

        Returns:
          `(inputs, targets)`. Without BOS, `inputs` is the extended sequence
          minus its last column and `targets` is the extended sequence minus
          its first column. With `ar_audio_prepend_bos`, `inputs` additionally
          gets the id `audio_token_num + 1` prepended and `targets` is the full
          extended sequence.
        """
        extended_tokens = F.pad(y, (0, 1), value=0)
        eos_flags = F.pad(y_mask_int, (0, 1), value=1)
        targets = extended_tokens + eos_id * eos_flags

        decoder_inputs = targets[:, :-1]
        if not self.ar_audio_prepend_bos:
            return decoder_inputs, targets[:, 1:]

        bos_id = self.audio_token_num + 1
        return F.pad(decoder_inputs, (1, 0), value=bos_id), targets

    def _prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes):
        # 5.1 For the NAR acoustic prompt tokens, we select a random segment waveform of 3 seconds
        # from the same utterance.
        # We implement this differently.
        if self.prefix_mode == 0:
            # no prefix
            prefix_len = 0
            y_emb = self.nar_audio_embeddings[0](y)
            for j in range(1, nar_stage):
                # Formula (4) (5)
                y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
        elif self.prefix_mode == 1:
            # prefix at begining
            int_low = (0.25 * y_lens.min()).type(torch.int64).item()
            prefix_len = torch.randint(int_low, int_low * 2, size=()).item()
            prefix_len = min(prefix_len, 225)  # 24000/320 * 3s = 225 frames

            y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
            y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
            for j in range(1, self.num_quantizers):
                y_prompts += self.nar_audio_embeddings[j](codes[:, :prefix_len, j])
                if j < nar_stage:
                    y_emb += self.nar_audio_embeddings[j](codes[:, prefix_len:, j])
            y_emb = torch.concat([y_prompts, y_emb], axis=1)
        elif self.prefix_mode in [2, 4]:
            if self.prefix_mode == 2:
                # random prefix
                prefix_len = min(225, int(0.25 * y_lens.min().item()))

                y_prompts_codes = []
                for b in range(codes.shape[0]):
                    start = self.rng.randint(0, y_lens[b].item() - prefix_len)
                    y_prompts_codes.append(
                        torch.clone(codes[b, start : start + prefix_len])
                    )
                    codes[b, start : start + prefix_len, nar_stage] = NUM_AUDIO_TOKENS
                y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
            else:
                prefix_len = y_prompts_codes.shape[1]

            y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
            y_emb = self.nar_audio_embeddings[0](y)
            for j in range(1, self.num_quantizers):
                y_prompts += self.nar_audio_embeddings[j](y_prompts_codes[..., j])
                if j < nar_stage:
                    y_emb += self.nar_audio_embeddings[j](codes[..., j])
            y_emb = torch.concat([y_prompts, y_emb], axis=1)
        else:
            raise ValueError

        return y_emb, prefix_len