unpairedelectron07 committed on
Commit
982b37b
1 Parent(s): 28d2e51

Upload 11 files

audiocraft/models/audiogen.py ADDED
@@ -0,0 +1,93 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Main model for using AudioGen. This will combine all the required components
+ and provide easy access to the generation API.
+ """
+
+ import typing as tp
+
+ import torch
+
+ from .encodec import CompressionModel
+ from .genmodel import BaseGenModel
+ from .lm import LMModel
+ from .builders import get_debug_compression_model, get_debug_lm_model
+ from .loaders import load_compression_model, load_lm_model
+
+
+ class AudioGen(BaseGenModel):
+     """AudioGen main model with convenient generation API.
+
+     Args:
+         name (str): name of the model.
+         compression_model (CompressionModel): Compression model
+             used to map audio to invertible discrete representations.
+         lm (LMModel): Language model over discrete representations.
+         max_duration (float, optional): maximum duration the model can produce,
+             otherwise, inferred from the training params.
+     """
+     def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+                  max_duration: tp.Optional[float] = None):
+         super().__init__(name, compression_model, lm, max_duration)
+         self.set_generation_params(duration=5)  # default duration
+
+     @staticmethod
+     def get_pretrained(name: str = 'facebook/audiogen-medium', device=None):
+         """Return pretrained model, we provide a single model for now:
+         - facebook/audiogen-medium (1.5B), text to sound,
+           # see: https://huggingface.co/facebook/audiogen-medium
+         """
+         if device is None:
+             if torch.cuda.device_count():
+                 device = 'cuda'
+             else:
+                 device = 'cpu'
+
+         if name == 'debug':
+             # used only for unit tests
+             compression_model = get_debug_compression_model(device, sample_rate=16000)
+             lm = get_debug_lm_model(device)
+             return AudioGen(name, compression_model, lm, max_duration=10)
+
+         compression_model = load_compression_model(name, device=device)
+         lm = load_lm_model(name, device=device)
+         assert 'self_wav' not in lm.condition_provider.conditioners, \
+             "AudioGen do not support waveform conditioning for now"
+         return AudioGen(name, compression_model, lm)
+
+     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
+                               top_p: float = 0.0, temperature: float = 1.0,
+                               duration: float = 10.0, cfg_coef: float = 3.0,
+                               two_step_cfg: bool = False, extend_stride: float = 2):
+         """Set the generation parameters for AudioGen.
+
+         Args:
+             use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
+             top_k (int, optional): top_k used for sampling. Defaults to 250.
+             top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
+             temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
+             duration (float, optional): Duration of the generated waveform. Defaults to 10.0.
+             cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
+             two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
+                 instead of batching together the two. This has some impact on how things
+                 are padded but seems to have little impact in practice.
+             extend_stride: when doing extended generation (i.e. more than 10 seconds), by how much
+                 should we extend the audio each time. Larger values will mean less context is
+                 preserved, and shorter value will require extra computations.
+         """
+         assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
+         self.extend_stride = extend_stride
+         self.duration = duration
+         self.generation_params = {
+             'use_sampling': use_sampling,
+             'temp': temperature,
+             'top_k': top_k,
+             'top_p': top_p,
+             'cfg_coef': cfg_coef,
+             'two_step_cfg': two_step_cfg,
+         }
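For reference, a minimal usage sketch of the generation API defined above; this is only an illustrative sketch, assuming audiocraft is installed, that AudioGen is re-exported from audiocraft.models as in the upstream package, and that the facebook/audiogen-medium checkpoint can be downloaded:

from audiocraft.models import AudioGen  # assumed package-level re-export

# Load pretrained weights (use name='debug' for the lightweight unit-test model).
model = AudioGen.get_pretrained('facebook/audiogen-medium')
model.set_generation_params(duration=5, top_k=250, cfg_coef=3.0)

# generate() returns a float waveform tensor of shape [B, C, T] at model.sample_rate.
wavs = model.generate(['dog barking', 'footsteps on gravel'])
print(wavs.shape, model.sample_rate)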
audiocraft/models/builders.py ADDED
@@ -0,0 +1,261 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ All the functions to build the relevant models and modules
+ from the Hydra config.
+ """
+
+ import typing as tp
+
+ import audiocraft
+ import omegaconf
+ import torch
+
+ from .encodec import CompressionModel, EncodecModel, InterleaveStereoCompressionModel
+ from .lm import LMModel
+ from .lm_magnet import MagnetLMModel
+ from ..modules.codebooks_patterns import (
+     CodebooksPatternProvider,
+     DelayedPatternProvider,
+     MusicLMPattern,
+     ParallelPatternProvider,
+     UnrolledPatternProvider,
+     CoarseFirstPattern,
+ )
+ from ..modules.conditioners import (
+     BaseConditioner,
+     ChromaStemConditioner,
+     CLAPEmbeddingConditioner,
+     ConditionFuser,
+     ConditioningProvider,
+     LUTConditioner,
+     T5Conditioner,
+ )
+ from .unet import DiffusionUnet
+ from .. import quantization as qt
+ from ..utils.utils import dict_from_config
+ from ..modules.diffusion_schedule import MultiBandProcessor, SampleProcessor
+
+
+ def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) -> qt.BaseQuantizer:
+     klass = {
+         'no_quant': qt.DummyQuantizer,
+         'rvq': qt.ResidualVectorQuantizer
+     }[quantizer]
+     kwargs = dict_from_config(getattr(cfg, quantizer))
+     if quantizer != 'no_quant':
+         kwargs['dimension'] = dimension
+     return klass(**kwargs)
+
+
+ def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
+     if encoder_name == 'seanet':
+         kwargs = dict_from_config(getattr(cfg, 'seanet'))
+         encoder_override_kwargs = kwargs.pop('encoder')
+         decoder_override_kwargs = kwargs.pop('decoder')
+         encoder_kwargs = {**kwargs, **encoder_override_kwargs}
+         decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+         encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
+         decoder = audiocraft.modules.SEANetDecoder(**decoder_kwargs)
+         return encoder, decoder
+     else:
+         raise KeyError(f"Unexpected compression model {cfg.compression_model}")
+
+
+ def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
+     """Instantiate a compression model."""
+     if cfg.compression_model == 'encodec':
+         kwargs = dict_from_config(getattr(cfg, 'encodec'))
+         encoder_name = kwargs.pop('autoencoder')
+         quantizer_name = kwargs.pop('quantizer')
+         encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
+         quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
+         frame_rate = kwargs['sample_rate'] // encoder.hop_length
+         renormalize = kwargs.pop('renormalize', False)
+         # deprecated params
+         kwargs.pop('renorm', None)
+         return EncodecModel(encoder, decoder, quantizer,
+                             frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
+     else:
+         raise KeyError(f"Unexpected compression model {cfg.compression_model}")
+
+
+ def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
+     """Instantiate a transformer LM."""
+     if cfg.lm_model in ['transformer_lm', 'transformer_lm_magnet']:
+         kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
+         n_q = kwargs['n_q']
+         q_modeling = kwargs.pop('q_modeling', None)
+         codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
+         attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
+         cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
+         cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
+         fuser = get_condition_fuser(cfg)
+         condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
+         if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-att programmatically
+             kwargs['cross_attention'] = True
+         if codebooks_pattern_cfg.modeling is None:
+             assert q_modeling is not None, \
+                 "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
+             codebooks_pattern_cfg = omegaconf.OmegaConf.create(
+                 {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
+             )
+
+         pattern_provider = get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
+         lm_class = MagnetLMModel if cfg.lm_model == 'transformer_lm_magnet' else LMModel
+         return lm_class(
+             pattern_provider=pattern_provider,
+             condition_provider=condition_provider,
+             fuser=fuser,
+             cfg_dropout=cfg_prob,
+             cfg_coef=cfg_coef,
+             attribute_dropout=attribute_dropout,
+             dtype=getattr(torch, cfg.dtype),
+             device=cfg.device,
+             **kwargs
+         ).to(cfg.device)
+     else:
+         raise KeyError(f"Unexpected LM model {cfg.lm_model}")
+
+
+ def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditioningProvider:
+     """Instantiate a conditioning model."""
+     device = cfg.device
+     duration = cfg.dataset.segment_duration
+     cfg = getattr(cfg, 'conditioners')
+     dict_cfg = {} if cfg is None else dict_from_config(cfg)
+     conditioners: tp.Dict[str, BaseConditioner] = {}
+     condition_provider_args = dict_cfg.pop('args', {})
+     condition_provider_args.pop('merge_text_conditions_p', None)
+     condition_provider_args.pop('drop_desc_p', None)
+
+     for cond, cond_cfg in dict_cfg.items():
+         model_type = cond_cfg['model']
+         model_args = cond_cfg[model_type]
+         if model_type == 't5':
+             conditioners[str(cond)] = T5Conditioner(output_dim=output_dim, device=device, **model_args)
+         elif model_type == 'lut':
+             conditioners[str(cond)] = LUTConditioner(output_dim=output_dim, **model_args)
+         elif model_type == 'chroma_stem':
+             conditioners[str(cond)] = ChromaStemConditioner(
+                 output_dim=output_dim,
+                 duration=duration,
+                 device=device,
+                 **model_args
+             )
+         elif model_type == 'clap':
+             conditioners[str(cond)] = CLAPEmbeddingConditioner(
+                 output_dim=output_dim,
+                 device=device,
+                 **model_args
+             )
+         else:
+             raise ValueError(f"Unrecognized conditioning model: {model_type}")
+     conditioner = ConditioningProvider(conditioners, device=device, **condition_provider_args)
+     return conditioner
+
+
+ def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
+     """Instantiate a condition fuser object."""
+     fuser_cfg = getattr(cfg, 'fuser')
+     fuser_methods = ['sum', 'cross', 'prepend', 'input_interpolate']
+     fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
+     kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
+     fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
+     return fuser
+
+
+ def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
+     """Instantiate a codebooks pattern provider object."""
+     pattern_providers = {
+         'parallel': ParallelPatternProvider,
+         'delay': DelayedPatternProvider,
+         'unroll': UnrolledPatternProvider,
+         'coarse_first': CoarseFirstPattern,
+         'musiclm': MusicLMPattern,
+     }
+     name = cfg.modeling
+     kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
+     klass = pattern_providers[name]
+     return klass(n_q, **kwargs)
+
+
+ def get_debug_compression_model(device='cpu', sample_rate: int = 32000):
+     """Instantiate a debug compression model to be used for unit tests."""
+     assert sample_rate in [16000, 32000], "unsupported sample rate for debug compression model"
+     model_ratios = {
+         16000: [10, 8, 8],  # 25 Hz at 16kHz
+         32000: [10, 8, 16]  # 25 Hz at 32kHz
+     }
+     ratios: tp.List[int] = model_ratios[sample_rate]
+     frame_rate = 25
+     seanet_kwargs: dict = {
+         'n_filters': 4,
+         'n_residual_layers': 1,
+         'dimension': 32,
+         'ratios': ratios,
+     }
+     encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs)
+     decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs)
+     quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4)
+     init_x = torch.randn(8, 32, 128)
+     quantizer(init_x, 1)  # initialize kmeans etc.
+     compression_model = EncodecModel(
+         encoder, decoder, quantizer,
+         frame_rate=frame_rate, sample_rate=sample_rate, channels=1).to(device)
+     return compression_model.eval()
+
+
+ def get_diffusion_model(cfg: omegaconf.DictConfig):
+     # TODO Find a way to infer the channels from dset
+     channels = cfg.channels
+     num_steps = cfg.schedule.num_steps
+     return DiffusionUnet(
+         chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
+
+
+ def get_processor(cfg, sample_rate: int = 24000):
+     sample_processor = SampleProcessor()
+     if cfg.use:
+         kw = dict(cfg)
+         kw.pop('use')
+         kw.pop('name')
+         if cfg.name == "multi_band_processor":
+             sample_processor = MultiBandProcessor(sample_rate=sample_rate, **kw)
+     return sample_processor
+
+
+ def get_debug_lm_model(device='cpu'):
+     """Instantiate a debug LM to be used for unit tests."""
+     pattern = DelayedPatternProvider(n_q=4)
+     dim = 16
+     providers = {
+         'description': LUTConditioner(n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"),
+     }
+     condition_provider = ConditioningProvider(providers)
+     fuser = ConditionFuser(
+         {'cross': ['description'], 'prepend': [],
+          'sum': [], 'input_interpolate': []})
+     lm = LMModel(
+         pattern, condition_provider, fuser,
+         n_q=4, card=400, dim=dim, num_heads=4, custom=True, num_layers=2,
+         cross_attention=True, causal=True)
+     return lm.to(device).eval()
+
+
+ def get_wrapped_compression_model(
+         compression_model: CompressionModel,
+         cfg: omegaconf.DictConfig) -> CompressionModel:
+     if hasattr(cfg, 'interleave_stereo_codebooks'):
+         if cfg.interleave_stereo_codebooks.use:
+             kwargs = dict_from_config(cfg.interleave_stereo_codebooks)
+             kwargs.pop('use')
+             compression_model = InterleaveStereoCompressionModel(compression_model, **kwargs)
+     if hasattr(cfg, 'compression_model_n_q'):
+         if cfg.compression_model_n_q is not None:
+             compression_model.set_num_codebooks(cfg.compression_model_n_q)
+     return compression_model
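A small sketch of how the debug builders above fit together (an illustration only; the expected shapes follow from the 16 kHz SEANet ratios and the n_q=4 quantizer defined above):

import torch
from audiocraft.models.builders import get_debug_compression_model, get_debug_lm_model

# Lightweight debug models, no checkpoint download required.
compression_model = get_debug_compression_model(device='cpu', sample_rate=16000)
lm = get_debug_lm_model(device='cpu')

wav = torch.randn(2, 1, 16000)                  # [B, C, T]: one second of audio at 16 kHz
codes, scale = compression_model.encode(wav)    # expected shape [2, 4, 25]: 4 codebooks at 25 Hz
recon = compression_model.decode(codes, scale)  # back to a [2, 1, 16000] waveform
print(codes.shape, recon.shape, lm.num_codebooks)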
audiocraft/models/encodec.py ADDED
@@ -0,0 +1,506 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """Compression models or wrapper around existing models.
+ Also defines the main interface that a model must follow to be usable as an audio tokenizer.
+ """
+
+ from abc import ABC, abstractmethod
+ import logging
+ import math
+ from pathlib import Path
+ import typing as tp
+
+ from einops import rearrange
+ import numpy as np
+ import torch
+ from torch import nn
+ from transformers import EncodecModel as HFEncodecModel
+
+ from .. import quantization as qt
+
+
+ logger = logging.getLogger()
+
+
+ class CompressionModel(ABC, nn.Module):
+     """Base API for all compression models that aim at being used as audio tokenizers
+     with a language model.
+     """
+
+     @abstractmethod
+     def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+         ...
+
+     @abstractmethod
+     def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         """See `EncodecModel.encode`."""
+         ...
+
+     @abstractmethod
+     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+         """See `EncodecModel.decode`."""
+         ...
+
+     @abstractmethod
+     def decode_latent(self, codes: torch.Tensor):
+         """Decode from the discrete codes to continuous latent space."""
+         ...
+
+     @property
+     @abstractmethod
+     def channels(self) -> int:
+         ...
+
+     @property
+     @abstractmethod
+     def frame_rate(self) -> float:
+         ...
+
+     @property
+     @abstractmethod
+     def sample_rate(self) -> int:
+         ...
+
+     @property
+     @abstractmethod
+     def cardinality(self) -> int:
+         ...
+
+     @property
+     @abstractmethod
+     def num_codebooks(self) -> int:
+         ...
+
+     @property
+     @abstractmethod
+     def total_codebooks(self) -> int:
+         ...
+
+     @abstractmethod
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer."""
+         ...
+
+     @staticmethod
+     def get_pretrained(
+             name: str, device: tp.Union[torch.device, str] = 'cpu'
+             ) -> 'CompressionModel':
+         """Instantiate a CompressionModel from a given pretrained model.
+
+         Args:
+             name (Path or str): name of the pretrained model. See after.
+             device (torch.device or str): Device on which the model is loaded.
+
+         Pretrained models:
+             - dac_44khz (https://github.com/descriptinc/descript-audio-codec)
+             - dac_24khz (same)
+             - facebook/encodec_24khz (https://huggingface.co/facebook/encodec_24khz)
+             - facebook/encodec_32khz (https://huggingface.co/facebook/encodec_32khz)
+             - your own model on Hugging Face. Export instructions to come...
+         """
+
+         from . import builders, loaders
+         model: CompressionModel
+         if name in ['dac_44khz', 'dac_24khz']:
+             model_type = name.split('_')[1]
+             logger.info("Getting pretrained compression model from DAC %s", model_type)
+             model = DAC(model_type)
+         elif name in ['debug_compression_model']:
+             logger.info("Getting pretrained compression model for debug")
+             model = builders.get_debug_compression_model()
+         elif Path(name).exists():
+             # We assume here if the path exists that it is in fact an AC checkpoint
+             # that was exported using `audiocraft.utils.export` functions.
+             model = loaders.load_compression_model(name, device=device)
+         else:
+             logger.info("Getting pretrained compression model from HF %s", name)
+             hf_model = HFEncodecModel.from_pretrained(name)
+             model = HFEncodecCompressionModel(hf_model).to(device)
+         return model.to(device).eval()
+
+
+ class EncodecModel(CompressionModel):
+     """Encodec model operating on the raw waveform.
+
+     Args:
+         encoder (nn.Module): Encoder network.
+         decoder (nn.Module): Decoder network.
+         quantizer (qt.BaseQuantizer): Quantizer network.
+         frame_rate (int): Frame rate for the latent representation.
+         sample_rate (int): Audio sample rate.
+         channels (int): Number of audio channels.
+         causal (bool): Whether to use a causal version of the model.
+         renormalize (bool): Whether to renormalize the audio before running the model.
+     """
+     # we need assignment to override the property in the abstract class,
+     # I couldn't find a better way...
+     frame_rate: float = 0
+     sample_rate: int = 0
+     channels: int = 0
+
+     def __init__(self,
+                  encoder: nn.Module,
+                  decoder: nn.Module,
+                  quantizer: qt.BaseQuantizer,
+                  frame_rate: int,
+                  sample_rate: int,
+                  channels: int,
+                  causal: bool = False,
+                  renormalize: bool = False):
+         super().__init__()
+         self.encoder = encoder
+         self.decoder = decoder
+         self.quantizer = quantizer
+         self.frame_rate = frame_rate
+         self.sample_rate = sample_rate
+         self.channels = channels
+         self.renormalize = renormalize
+         self.causal = causal
+         if self.causal:
+             # we force disabling here to avoid handling linear overlap of segments
+             # as supported in original EnCodec codebase.
+             assert not self.renormalize, 'Causal model does not support renormalize'
+
+     @property
+     def total_codebooks(self):
+         """Total number of quantizer codebooks available."""
+         return self.quantizer.total_codebooks
+
+     @property
+     def num_codebooks(self):
+         """Active number of codebooks used by the quantizer."""
+         return self.quantizer.num_codebooks
+
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer."""
+         self.quantizer.set_num_codebooks(n)
+
+     @property
+     def cardinality(self):
+         """Cardinality of each codebook."""
+         return self.quantizer.bins
+
+     def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         scale: tp.Optional[torch.Tensor]
+         if self.renormalize:
+             mono = x.mean(dim=1, keepdim=True)
+             volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
+             scale = 1e-8 + volume
+             x = x / scale
+             scale = scale.view(-1, 1)
+         else:
+             scale = None
+         return x, scale
+
+     def postprocess(self,
+                     x: torch.Tensor,
+                     scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
+         if scale is not None:
+             assert self.renormalize
+             x = x * scale.view(-1, 1, 1)
+         return x
+
+     def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+         assert x.dim() == 3
+         length = x.shape[-1]
+         x, scale = self.preprocess(x)
+
+         emb = self.encoder(x)
+         q_res = self.quantizer(emb, self.frame_rate)
+         out = self.decoder(q_res.x)
+
+         # remove extra padding added by the encoder and decoder
+         assert out.shape[-1] >= length, (out.shape[-1], length)
+         out = out[..., :length]
+
+         q_res.x = self.postprocess(out, scale)
+
+         return q_res
+
+     def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         """Encode the given input tensor to quantized representation along with scale parameter.
+
+         Args:
+             x (torch.Tensor): Float tensor of shape [B, C, T]
+
+         Returns:
+             codes, scale (tuple of torch.Tensor, torch.Tensor): Tuple composed of:
+                 codes: a float tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
+                 scale: a float tensor containing the scale for audio renormalization.
+         """
+         assert x.dim() == 3
+         x, scale = self.preprocess(x)
+         emb = self.encoder(x)
+         codes = self.quantizer.encode(emb)
+         return codes, scale
+
+     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+         """Decode the given codes to a reconstructed representation, using the scale to perform
+         audio denormalization if needed.
+
+         Args:
+             codes (torch.Tensor): Int tensor of shape [B, K, T]
+             scale (torch.Tensor, optional): Float tensor containing the scale value.
+
+         Returns:
+             out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
+         """
+         emb = self.decode_latent(codes)
+         out = self.decoder(emb)
+         out = self.postprocess(out, scale)
+         # out contains extra padding added by the encoder and decoder
+         return out
+
+     def decode_latent(self, codes: torch.Tensor):
+         """Decode from the discrete codes to continuous latent space."""
+         return self.quantizer.decode(codes)
+
+
+ class DAC(CompressionModel):
+     def __init__(self, model_type: str = "44khz"):
+         super().__init__()
+         try:
+             import dac.utils
+         except ImportError:
+             raise RuntimeError("Could not import dac, make sure it is installed, "
+                                "please run `pip install descript-audio-codec`")
+         self.model = dac.utils.load_model(model_type=model_type)
+         self.n_quantizers = self.total_codebooks
+         self.model.eval()
+
+     def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+         # We don't support training with this.
+         raise NotImplementedError("Forward and training with DAC not supported.")
+
+     def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         codes = self.model.encode(x, self.n_quantizers)[1]
+         return codes[:, :self.n_quantizers], None
+
+     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+         assert scale is None
+         z_q = self.decode_latent(codes)
+         return self.model.decode(z_q)
+
+     def decode_latent(self, codes: torch.Tensor):
+         """Decode from the discrete codes to continuous latent space."""
+         return self.model.quantizer.from_codes(codes)[0]
+
+     @property
+     def channels(self) -> int:
+         return 1
+
+     @property
+     def frame_rate(self) -> float:
+         return self.model.sample_rate / self.model.hop_length
+
+     @property
+     def sample_rate(self) -> int:
+         return self.model.sample_rate
+
+     @property
+     def cardinality(self) -> int:
+         return self.model.codebook_size
+
+     @property
+     def num_codebooks(self) -> int:
+         return self.n_quantizers
+
+     @property
+     def total_codebooks(self) -> int:
+         return self.model.n_codebooks
+
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer.
+         """
+         assert n >= 1
+         assert n <= self.total_codebooks
+         self.n_quantizers = n
+
+
+ class HFEncodecCompressionModel(CompressionModel):
+     """Wrapper around HuggingFace Encodec.
+     """
+     def __init__(self, model: HFEncodecModel):
+         super().__init__()
+         self.model = model
+         bws = self.model.config.target_bandwidths
+         num_codebooks = [
+             bw * 1000 / (self.frame_rate * math.log2(self.cardinality))
+             for bw in bws
+         ]
+         deltas = [nc - int(nc) for nc in num_codebooks]
+         # Checking we didn't do some bad maths and we indeed have integers!
+         assert all(deltas) <= 1e-3, deltas
+         self.possible_num_codebooks = [int(nc) for nc in num_codebooks]
+         self.set_num_codebooks(max(self.possible_num_codebooks))
+
+     def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+         # We don't support training with this.
+         raise NotImplementedError("Forward and training with HF EncodecModel not supported.")
+
+     def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         bandwidth_index = self.possible_num_codebooks.index(self.num_codebooks)
+         bandwidth = self.model.config.target_bandwidths[bandwidth_index]
+         res = self.model.encode(x, None, bandwidth)
+         assert len(res[0]) == 1
+         assert len(res[1]) == 1
+         return res[0][0], res[1][0]
+
+     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+         if scale is None:
+             scales = [None]  # type: ignore
+         else:
+             scales = scale  # type: ignore
+         res = self.model.decode(codes[None], scales)
+         return res[0]
+
+     def decode_latent(self, codes: torch.Tensor):
+         """Decode from the discrete codes to continuous latent space."""
+         return self.model.quantizer.decode(codes.transpose(0, 1))
+
+     @property
+     def channels(self) -> int:
+         return self.model.config.audio_channels
+
+     @property
+     def frame_rate(self) -> float:
+         hop_length = int(np.prod(self.model.config.upsampling_ratios))
+         return self.sample_rate / hop_length
+
+     @property
+     def sample_rate(self) -> int:
+         return self.model.config.sampling_rate
+
+     @property
+     def cardinality(self) -> int:
+         return self.model.config.codebook_size
+
+     @property
+     def num_codebooks(self) -> int:
+         return self._num_codebooks
+
+     @property
+     def total_codebooks(self) -> int:
+         return max(self.possible_num_codebooks)
+
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer.
+         """
+         if n not in self.possible_num_codebooks:
+             raise ValueError(f"Allowed values for num codebooks: {self.possible_num_codebooks}")
+         self._num_codebooks = n
+
+
+ class InterleaveStereoCompressionModel(CompressionModel):
+     """Wraps a CompressionModel to support stereo inputs. The wrapped model
+     will be applied independently to the left and right channels, and both codebooks
+     will be interleaved. If the wrapped model returns a representation `[B, K ,T]` per
+     channel, then the output will be `[B, K * 2, T]` or `[B, K, T * 2]` depending on
+     `per_timestep`.
+
+     Args:
+         model (CompressionModel): Compression model to wrap.
+         per_timestep (bool): Whether to interleave on the timestep dimension
+             or on the codebooks dimension.
+     """
+     def __init__(self, model: CompressionModel, per_timestep: bool = False):
+         super().__init__()
+         self.model = model
+         self.per_timestep = per_timestep
+         assert self.model.channels == 1, "Wrapped model is expected to be for monophonic audio"
+
+     @property
+     def total_codebooks(self):
+         return self.model.total_codebooks
+
+     @property
+     def num_codebooks(self):
+         """Active number of codebooks used by the quantizer.
+
+         ..Warning:: this reports the number of codebooks after the interleaving
+         of the codebooks!
+         """
+         return self.model.num_codebooks if self.per_timestep else self.model.num_codebooks * 2
+
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer.
+
+         ..Warning:: this sets the number of codebooks before the interleaving!
+         """
+         self.model.set_num_codebooks(n)
+
+     @property
+     def num_virtual_steps(self) -> float:
+         """Return the number of virtual steps, e.g. one real step
+         will be split into that many steps.
+         """
+         return 2 if self.per_timestep else 1
+
+     @property
+     def frame_rate(self) -> float:
+         return self.model.frame_rate * self.num_virtual_steps
+
+     @property
+     def sample_rate(self) -> int:
+         return self.model.sample_rate
+
+     @property
+     def channels(self) -> int:
+         return 2
+
+     @property
+     def cardinality(self):
+         """Cardinality of each codebook.
+         """
+         return self.model.cardinality
+
+     def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+         raise NotImplementedError("Not supported, use encode and decode.")
+
+     def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+         B, C, T = x.shape
+         assert C == self.channels, f"Expecting stereo audio but audio num channels is {C}"
+
+         indices_c0, scales_c0 = self.model.encode(x[:, 0, ...].unsqueeze(1))
+         indices_c1, scales_c1 = self.model.encode(x[:, 1, ...].unsqueeze(1))
+         indices = torch.stack([indices_c0, indices_c1], dim=0)
+         scales: tp.Optional[torch.Tensor] = None
+         if scales_c0 is not None and scales_c1 is not None:
+             scales = torch.stack([scales_c0, scales_c1], dim=1)
+
+         if self.per_timestep:
+             indices = rearrange(indices, 'c b k t -> b k (t c)', c=2)
+         else:
+             indices = rearrange(indices, 'c b k t -> b (k c) t', c=2)
+
+         return (indices, scales)
+
+     def get_left_right_codes(self, codes: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+         if self.per_timestep:
+             codes = rearrange(codes, 'b k (t c) -> c b k t', c=2)
+         else:
+             codes = rearrange(codes, 'b (k c) t -> c b k t', c=2)
+         return codes[0], codes[1]
+
+     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+         B, K, T = codes.shape
+         assert T % self.num_virtual_steps == 0, "Provided codes' number of timesteps does not match"
+         assert K == self.num_codebooks, "Provided codes' number of codebooks does not match"
+
+         scale_c0, scale_c1 = None, None
+         if scale is not None:
+             assert scale.size(0) == B and scale.size(1) == 2, f"Scale has unexpected shape: {scale.shape}"
+             scale_c0 = scale[0, ...]
+             scale_c1 = scale[1, ...]
+
+         codes_c0, codes_c1 = self.get_left_right_codes(codes)
+         audio_c0 = self.model.decode(codes_c0, scale_c0)
+         audio_c1 = self.model.decode(codes_c1, scale_c1)
+         return torch.cat([audio_c0, audio_c1], dim=1)
+
+     def decode_latent(self, codes: torch.Tensor):
+         """Decode from the discrete codes to continuous latent space."""
+         raise NotImplementedError("Not supported by interleaved stereo wrapped models.")
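A short sketch of the audio-tokenizer interface defined in this file, using the facebook/encodec_32khz checkpoint listed in get_pretrained's docstring; this is illustrative only and assumes the weights can be fetched from the Hugging Face Hub:

import torch
from audiocraft.models.encodec import CompressionModel

model = CompressionModel.get_pretrained('facebook/encodec_32khz', device='cpu')
wav = torch.randn(1, model.channels, model.sample_rate)  # one second of (random) audio, [B, C, T]
codes, scale = model.encode(wav)                          # [B, K, T] discrete codes plus optional scale
recon = model.decode(codes, scale)                        # back to a [B, C, T'] waveform
print(codes.shape, recon.shape, model.frame_rate, model.cardinality)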
audiocraft/models/genmodel.py ADDED
@@ -0,0 +1,267 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Base implementation for audio generative models. This base implementation
+ combines all the required components to run inference with pretrained audio
+ generative models. It can be easily inherited by downstream model classes to
+ provide easy access to the generation API.
+ """
+
+ from abc import ABC, abstractmethod
+ import typing as tp
+
+ import omegaconf
+ import torch
+
+ from .encodec import CompressionModel
+ from .lm import LMModel
+ from .builders import get_wrapped_compression_model
+ from ..data.audio_utils import convert_audio
+ from ..modules.conditioners import ConditioningAttributes
+ from ..utils.autocast import TorchAutocast
+
+
+ class BaseGenModel(ABC):
+     """Base generative model with convenient generation API.
+
+     Args:
+         name (str): name of the model.
+         compression_model (CompressionModel): Compression model
+             used to map audio to invertible discrete representations.
+         lm (LMModel): Language model over discrete representations.
+         max_duration (float, optional): maximum duration the model can produce,
+             otherwise, inferred from the training params.
+     """
+     def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+                  max_duration: tp.Optional[float] = None):
+         self.name = name
+         self.compression_model = compression_model
+         self.lm = lm
+         self.cfg: tp.Optional[omegaconf.DictConfig] = None
+         # Just to be safe, let's put everything in eval mode.
+         self.compression_model.eval()
+         self.lm.eval()
+
+         if hasattr(lm, 'cfg'):
+             cfg = lm.cfg
+             assert isinstance(cfg, omegaconf.DictConfig)
+             self.cfg = cfg
+
+         if self.cfg is not None:
+             self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
+
+         if max_duration is None:
+             if self.cfg is not None:
+                 max_duration = lm.cfg.dataset.segment_duration  # type: ignore
+             else:
+                 raise ValueError("You must provide max_duration when building directly your GenModel")
+         assert max_duration is not None
+
+         self.max_duration: float = max_duration
+         self.duration = self.max_duration
+
+         # self.extend_stride is the length of audio extension when generating samples longer
+         # than self.max_duration. NOTE: the derived class must set self.extend_stride to a
+         # positive float value when generating with self.duration > self.max_duration.
+         self.extend_stride: tp.Optional[float] = None
+         self.device = next(iter(lm.parameters())).device
+         self.generation_params: dict = {}
+         self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
+         if self.device.type == 'cpu':
+             self.autocast = TorchAutocast(enabled=False)
+         else:
+             self.autocast = TorchAutocast(
+                 enabled=True, device_type=self.device.type, dtype=torch.float16)
+
+     @property
+     def frame_rate(self) -> float:
+         """Roughly the number of AR steps per seconds."""
+         return self.compression_model.frame_rate
+
+     @property
+     def sample_rate(self) -> int:
+         """Sample rate of the generated audio."""
+         return self.compression_model.sample_rate
+
+     @property
+     def audio_channels(self) -> int:
+         """Audio channels of the generated audio."""
+         return self.compression_model.channels
+
+     def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+         """Override the default progress callback."""
+         self._progress_callback = progress_callback
+
+     @abstractmethod
+     def set_generation_params(self, *args, **kwargs):
+         """Set the generation parameters."""
+         raise NotImplementedError("No base implementation for setting generation params.")
+
+     @staticmethod
+     @abstractmethod
+     def get_pretrained(name: str, device=None):
+         raise NotImplementedError("No base implementation for getting pretrained model")
+
+     @torch.no_grad()
+     def _prepare_tokens_and_attributes(
+             self,
+             descriptions: tp.Sequence[tp.Optional[str]],
+             prompt: tp.Optional[torch.Tensor],
+     ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
+         """Prepare model inputs.
+
+         Args:
+             descriptions (list of str): A list of strings used as text conditioning.
+             prompt (torch.Tensor): A batch of waveforms used for continuation.
+         """
+         attributes = [
+             ConditioningAttributes(text={'description': description})
+             for description in descriptions]
+
+         if prompt is not None:
+             if descriptions is not None:
+                 assert len(descriptions) == len(prompt), "Prompt and nb. descriptions doesn't match"
+             prompt = prompt.to(self.device)
+             prompt_tokens, scale = self.compression_model.encode(prompt)
+             assert scale is None
+         else:
+             prompt_tokens = None
+         return attributes, prompt_tokens
+
+     def generate_unconditional(self, num_samples: int, progress: bool = False,
+                                return_tokens: bool = False) -> tp.Union[torch.Tensor,
+                                                                         tp.Tuple[torch.Tensor, torch.Tensor]]:
+         """Generate samples in an unconditional manner.
+
+         Args:
+             num_samples (int): Number of samples to be generated.
+             progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+         """
+         descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
+         attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+         tokens = self._generate_tokens(attributes, prompt_tokens, progress)
+         if return_tokens:
+             return self.generate_audio(tokens), tokens
+         return self.generate_audio(tokens)
+
+     def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False) \
+             -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
+         """Generate samples conditioned on text.
+
+         Args:
+             descriptions (list of str): A list of strings used as text conditioning.
+             progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+         """
+         attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+         assert prompt_tokens is None
+         tokens = self._generate_tokens(attributes, prompt_tokens, progress)
+         if return_tokens:
+             return self.generate_audio(tokens), tokens
+         return self.generate_audio(tokens)
+
+     def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
+                               descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
+                               progress: bool = False, return_tokens: bool = False) \
+             -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
+         """Generate samples conditioned on audio prompts and an optional text description.
+
+         Args:
+             prompt (torch.Tensor): A batch of waveforms used for continuation.
+                 Prompt should be [B, C, T], or [C, T] if only one sample is generated.
+             prompt_sample_rate (int): Sampling rate of the given audio waveforms.
+             descriptions (list of str, optional): A list of strings used as text conditioning. Defaults to None.
+             progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+         """
+         if prompt.dim() == 2:
+             prompt = prompt[None]
+         if prompt.dim() != 3:
+             raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
+         prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
+         if descriptions is None:
+             descriptions = [None] * len(prompt)
+         attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+         assert prompt_tokens is not None
+         tokens = self._generate_tokens(attributes, prompt_tokens, progress)
+         if return_tokens:
+             return self.generate_audio(tokens), tokens
+         return self.generate_audio(tokens)
+
+     def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
+                          prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
+         """Generate discrete audio tokens given audio prompt and/or conditions.
+
+         Args:
+             attributes (list of ConditioningAttributes): Conditions used for generation (here text).
+             prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
+             progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+         Returns:
+             torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
+         """
+         total_gen_len = int(self.duration * self.frame_rate)
+         max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
+         current_gen_offset: int = 0
+
+         def _progress_callback(generated_tokens: int, tokens_to_generate: int):
+             generated_tokens += current_gen_offset
+             if self._progress_callback is not None:
+                 # Note that total_gen_len might be quite wrong depending on the
+                 # codebook pattern used, but with delay it is almost accurate.
+                 self._progress_callback(generated_tokens, tokens_to_generate)
+             else:
+                 print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
+
+         if prompt_tokens is not None:
+             assert max_prompt_len >= prompt_tokens.shape[-1], \
+                 "Prompt is longer than audio to generate"
+
+         callback = None
+         if progress:
+             callback = _progress_callback
+
+         if self.duration <= self.max_duration:
+             # generate by sampling from LM, simple case.
+             with self.autocast:
+                 gen_tokens = self.lm.generate(
+                     prompt_tokens, attributes,
+                     callback=callback, max_gen_len=total_gen_len, **self.generation_params)
+
+         else:
+             assert self.extend_stride is not None, "Stride should be defined to generate beyond max_duration"
+             assert self.extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
+             all_tokens = []
+             if prompt_tokens is None:
+                 prompt_length = 0
+             else:
+                 all_tokens.append(prompt_tokens)
+                 prompt_length = prompt_tokens.shape[-1]
+
+             stride_tokens = int(self.frame_rate * self.extend_stride)
+             while current_gen_offset + prompt_length < total_gen_len:
+                 time_offset = current_gen_offset / self.frame_rate
+                 chunk_duration = min(self.duration - time_offset, self.max_duration)
+                 max_gen_len = int(chunk_duration * self.frame_rate)
+                 with self.autocast:
+                     gen_tokens = self.lm.generate(
+                         prompt_tokens, attributes,
+                         callback=callback, max_gen_len=max_gen_len, **self.generation_params)
+                 if prompt_tokens is None:
+                     all_tokens.append(gen_tokens)
+                 else:
+                     all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
+                 prompt_tokens = gen_tokens[:, :, stride_tokens:]
+                 prompt_length = prompt_tokens.shape[-1]
+                 current_gen_offset += stride_tokens
+
+             gen_tokens = torch.cat(all_tokens, dim=-1)
+         return gen_tokens
+
+     def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
+         """Generate Audio from tokens."""
+         assert gen_tokens.dim() == 3
+         with torch.no_grad():
+             gen_audio = self.compression_model.decode(gen_tokens, None)
+         return gen_audio
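A sketch of the continuation and progress-callback API defined above, exercised through the AudioGen subclass; the prompt below is random noise used purely as a placeholder, and the example assumes the facebook/audiogen-medium checkpoint is available:

import torch
from audiocraft.models import AudioGen  # any BaseGenModel subclass exposes the same API

model = AudioGen.get_pretrained('facebook/audiogen-medium')
model.set_generation_params(duration=8)
model.set_custom_progress_callback(lambda done, total: print(f'{done}/{total} tokens'))

prompt = torch.randn(1, 1, 16000)  # [B, C, T] placeholder prompt, 1 s at 16 kHz
out = model.generate_continuation(prompt, prompt_sample_rate=16000,
                                  descriptions=['rain on a tin roof'], progress=True)
print(out.shape)  # [B, C, T] waveform at model.sample_rate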
audiocraft/models/lm.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass
8
+ from functools import partial
9
+ import logging
10
+ import math
11
+ import typing as tp
12
+
13
+ import torch
14
+ from torch import nn
15
+
16
+ from ..utils import utils
17
+ from ..modules.streaming import StreamingModule, State
18
+ from ..modules.transformer import StreamingTransformer, create_norm_fn
19
+ from ..modules.conditioners import (
20
+ ConditionFuser,
21
+ ClassifierFreeGuidanceDropout,
22
+ AttributeDropout,
23
+ ConditioningProvider,
24
+ ConditioningAttributes,
25
+ ConditionType,
26
+ )
27
+ from ..modules.codebooks_patterns import CodebooksPatternProvider
28
+ from ..modules.activations import get_activation_fn
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+ ConditionTensors = tp.Dict[str, ConditionType]
33
+ CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
34
+
35
+
36
+ def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
37
+ """LM layer initialization.
38
+ Inspired from xlformers: https://github.com/fairinternal/xlformers
39
+
40
+ Args:
41
+ method (str): Method name for init function. Valid options are:
42
+ 'gaussian', 'uniform'.
43
+ input_dim (int): Input dimension of the initialized module.
44
+ init_depth (int, optional): Optional init depth value used to rescale
45
+ the standard deviation if defined.
46
+ """
47
+ # Compute std
48
+ std = 1 / math.sqrt(input_dim)
49
+ # Rescale with depth
50
+ if init_depth is not None:
51
+ std = std / math.sqrt(2 * init_depth)
52
+
53
+ if method == 'gaussian':
54
+ return partial(
55
+ torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
56
+ )
57
+ elif method == 'uniform':
58
+ bound = math.sqrt(3) * std # ensure the standard deviation is `std`
59
+ return partial(torch.nn.init.uniform_, a=-bound, b=bound)
60
+ else:
61
+ raise ValueError("Unsupported layer initialization method")
62
+
63
+
64
+ def init_layer(m: nn.Module,
65
+ method: str,
66
+ init_depth: tp.Optional[int] = None,
67
+ zero_bias_init: bool = False):
68
+ """Wrapper around ``get_init_fn`` for proper initialization of LM modules.
69
+
70
+ Args:
71
+ m (nn.Module): Module to initialize.
72
+ method (str): Method name for the init function.
73
+ init_depth (int, optional): Optional init depth value used to rescale
74
+ the standard deviation if defined.
75
+ zero_bias_init (bool): Whether to initialize the bias to 0 or not.
76
+ """
77
+ if isinstance(m, nn.Linear):
78
+ init_fn = get_init_fn(method, m.in_features, init_depth=init_depth)
79
+ if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
80
+ weight = m.weight.float()
81
+ init_fn(weight)
82
+ m.weight.data[:] = weight.half()
83
+ else:
84
+ init_fn(m.weight)
85
+ if zero_bias_init and m.bias is not None:
86
+ nn.init.constant_(m.bias, 0)
87
+ elif isinstance(m, nn.Embedding):
88
+ init_fn = get_init_fn(method, m.embedding_dim, init_depth=None)
89
+ if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
90
+ weight = m.weight.float()
91
+ init_fn(weight)
92
+ m.weight.data[:] = weight.half()
93
+ else:
94
+ init_fn(m.weight)
95
+
96
+
97
+ class ScaledEmbedding(nn.Embedding):
98
+ """Boost learning rate for embeddings (with `scale`).
99
+ """
100
+ def __init__(self, *args, lr=None, **kwargs):
101
+ super().__init__(*args, **kwargs)
102
+ self.lr = lr
103
+
104
+ def make_optim_group(self):
105
+ group = {"params": list(self.parameters())}
106
+ if self.lr is not None:
107
+ group["lr"] = self.lr
108
+ return group
109
+
110
+
111
+ @dataclass
112
+ class LMOutput:
113
+ # The logits are already re-aligned with the input codes
114
+ # hence no extra shift is required, e.g. when computing CE
115
+ logits: torch.Tensor # [B, K, T, card]
116
+ mask: torch.Tensor # [B, K, T]
117
+
118
+
119
+ class LMModel(StreamingModule):
120
+ """Transformer-based language model on multiple streams of codes.
121
+
122
+ Args:
123
+ pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
124
+ condition_provider (MusicConditioningProvider): Conditioning provider from metadata.
125
+ fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
126
+ n_q (int): Number of parallel streams to model.
127
+ card (int): Cardinality, vocabulary size.
128
+ dim (int): Dimension of the transformer encoder.
129
+ num_heads (int): Number of heads for the transformer encoder.
130
+ hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
131
+ norm (str): Normalization method.
132
+ norm_first (bool): Use pre-norm instead of post-norm.
133
+ emb_lr (float, optional): Embedding-specific learning rate.
134
+ bias_proj (bool): Use bias for output projections.
135
+ weight_init (str, optional): Method for weight initialization.
136
+ depthwise_init (str, optional): Method for depthwise weight initialization.
137
+ zero_bias_init (bool): If true and bias in Linears, initialize bias to zeros.
138
+ cfg_dropout (float): Classifier-free guidance dropout.
139
+ cfg_coef (float): Classifier-free guidance coefficient.
140
+ attribute_dropout (dict): Attribute dropout probabilities.
141
+ two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
142
+ **kwargs: Additional parameters for the transformer encoder.
143
+ """
144
+ def __init__(self, pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider,
145
+ fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
146
+ hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
147
+ emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
148
+ weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
149
+ zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0,
150
+ attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {}, two_step_cfg: bool = False,
151
+ **kwargs):
152
+ super().__init__()
153
+ self.cfg_coef = cfg_coef
154
+ self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
155
+ self.att_dropout = AttributeDropout(p=attribute_dropout)
156
+ self.condition_provider = condition_provider
157
+ self.fuser = fuser
158
+ self.card = card
159
+ embed_dim = self.card + 1
160
+ self.n_q = n_q
161
+ self.dim = dim
162
+ self.pattern_provider = pattern_provider
163
+ self.two_step_cfg = two_step_cfg
164
+ self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
165
+ if 'activation' in kwargs:
166
+ kwargs['activation'] = get_activation_fn(kwargs['activation'])
167
+ self.transformer = StreamingTransformer(
168
+ d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
169
+ norm=norm, norm_first=norm_first, **kwargs)
170
+ self.out_norm: tp.Optional[nn.Module] = None
171
+ if norm_first:
172
+ self.out_norm = create_norm_fn(norm, dim)
173
+ self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
174
+ self._init_weights(weight_init, depthwise_init, zero_bias_init)
175
+ self._fsdp: tp.Optional[nn.Module]
176
+ self.__dict__['_fsdp'] = None
177
+
178
+ def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
179
+ """Initialization of the transformer module weights.
180
+
181
+ Args:
182
+ weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
183
+ depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
184
+ 'current' where the depth corresponds to the current layer index or 'global' where the total number
185
+ of layer is used as depth. If not set, no depthwise initialization strategy is used.
186
+ zero_bias_init (bool): Whether to initialize bias to zero or not.
187
+ """
188
+ assert depthwise_init is None or depthwise_init in ['current', 'global']
189
+ assert depthwise_init is None or weight_init is not None, \
190
+ "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
191
+ assert not zero_bias_init or weight_init is not None, \
192
+ "If 'zero_bias_init', a 'weight_init' method should be provided"
193
+
194
+ if weight_init is None:
195
+ return
196
+
197
+ for emb_layer in self.emb:
198
+ init_layer(emb_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
199
+
200
+ for layer_idx, tr_layer in enumerate(self.transformer.layers):
201
+ depth = None
202
+ if depthwise_init == 'current':
203
+ depth = layer_idx + 1
204
+ elif depthwise_init == 'global':
205
+ depth = len(self.transformer.layers)
206
+ init_fn = partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init)
207
+ tr_layer.apply(init_fn)
208
+
209
+ for linear in self.linears:
210
+ init_layer(linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
211
+
212
+ @property
213
+ def special_token_id(self) -> int:
214
+ return self.card
215
+
216
+ @property
217
+ def num_codebooks(self) -> int:
218
+ return self.n_q
219
+
220
+ def forward(self, sequence: torch.Tensor,
221
+ conditions: tp.List[ConditioningAttributes],
222
+ condition_tensors: tp.Optional[ConditionTensors] = None,
223
+ stage: int = -1) -> torch.Tensor:
224
+ """Apply language model on sequence and conditions.
225
+ Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
226
+ S the sequence steps, return the logits with shape [B, K, S, card].
227
+
228
+ Args:
229
+ sequence (torch.Tensor): Sequence of codes of shape [B, K, S] to model.
230
+ conditions (list of ConditioningAttributes): Conditions to use when modeling
231
+ the given codes. Note that when evaluating multiple times with the same conditioning
232
+ you should pre-compute those and pass them as `condition_tensors`.
233
+ condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
234
+ tensors, see `conditions`.
235
+ stage (int): The codebook level that is being predicted. Relevant for MAGNeT
236
+ in which prediction is done in a codebook-by-codebook manner.
237
+ Takes values in range(n_q), and ignored by default.
238
+ Returns:
239
+ torch.Tensor: Logits.
240
+ """
241
+ B, K, S = sequence.shape
242
+ assert K == self.num_codebooks, "Sequence shape must match the specified number of codebooks"
243
+ input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
244
+ if condition_tensors is None:
245
+ assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
246
+ # apply dropout modules
247
+ conditions = self.cfg_dropout(conditions)
248
+ conditions = self.att_dropout(conditions)
249
+ tokenized = self.condition_provider.tokenize(conditions)
250
+ # encode conditions and fuse, both have a streaming cache to not recompute when generating.
251
+ condition_tensors = self.condition_provider(tokenized)
252
+ else:
253
+ assert not conditions, "Shouldn't pass both conditions and condition_tensors."
254
+
255
+ input_, cross_attention_input = self.fuser(input_, condition_tensors)
256
+
257
+ out = self.transformer(input_, cross_attention_src=cross_attention_input,
258
+ src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None))
259
+ if self.out_norm:
260
+ out = self.out_norm(out)
261
+ logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1) # [B, K, S, card]
262
+
263
+ # remove the prefix from the model outputs
264
+ if len(self.fuser.fuse2cond['prepend']) > 0:
265
+ logits = logits[:, :, -S:]
266
+
267
+ return logits # [B, K, S, card]
268
+
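A minimal, standalone sketch of the codebook-sum embedding step at the top of `forward` above (dimensions are illustrative, not taken from any checkpoint):

import torch
import torch.nn as nn

B, K, S, card, dim = 2, 4, 10, 1024, 128
emb = nn.ModuleList([nn.Embedding(card + 1, dim) for _ in range(K)])  # +1 slot for the special token
sequence = torch.randint(0, card, (B, K, S))                          # [B, K, S] codes
input_ = sum(emb[k](sequence[:, k]) for k in range(K))                # [B, S, dim] transformer input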
269
+ def compute_predictions(
270
+ self, codes: torch.Tensor,
271
+ conditions: tp.List[ConditioningAttributes],
272
+ condition_tensors: tp.Optional[ConditionTensors] = None,
273
+ stage: int = -1,
274
+ keep_only_valid_steps: bool = True) -> LMOutput:
275
+ """Given an input tensor of codes [B, K, T] and list of conditions, runs the model
276
+ forward using the specified codes interleaving pattern.
277
+
278
+ Args:
279
+ codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
280
+ K the number of codebooks and T the number of timesteps.
281
+ conditions (list of ConditioningAttributes): conditionings to use when modeling
282
+ the given codes. Note that when evaluating multiple times with the same conditioning
283
+ you should pre-compute those and pass them as `condition_tensors`.
284
+ condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
285
+ tensors, see `conditions`.
286
+ stage (int): The codebook level that is being predicted. Relevant for MAGNeT
287
+ in which prediction is done in a codebook-by-codebook manner.
288
+ Takes values in range(n_q), and ignored by default.
289
+ keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
290
+ Steps that are beyond valid steps will be replaced by the special_token in that case.
291
+ Returns:
292
+ LMOutput: Language model outputs
293
+ logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
294
+ i.e. the first item corresponds to logits to predict the first code, meaning that
295
+ no additional shifting of codes and logits is required.
296
+ mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
297
+ Given the specified interleaving strategies, parts of the logits and codes should
298
+ not be considered as valid predictions because of invalid context.
299
+ """
300
+ B, K, T = codes.shape
301
+ codes = codes.contiguous()
302
+ # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
303
+ pattern = self.pattern_provider.get_pattern(T)
304
+ sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
305
+ codes, self.special_token_id, keep_only_valid_steps=keep_only_valid_steps,
306
+ )
307
+
308
+ # apply model on pattern sequence
309
+ model = self if self._fsdp is None else self._fsdp
310
+ logits = model(sequence_codes, conditions, condition_tensors, stage=stage) # [B, K, S, card]
311
+ # map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
312
+ # and provide the corresponding mask over invalid positions of tokens
313
+ logits = logits.permute(0, 3, 1, 2) # [B, card, K, S]
314
+ # note: we use nans as special token to make it obvious if we feed unexpected logits
315
+ logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
316
+ logits, float('nan'), keep_only_valid_steps=keep_only_valid_steps
317
+ )
318
+ logits = logits.permute(0, 2, 3, 1) # [B, K, T, card]
319
+ logits_mask = logits_mask[None, :, :].expand(B, -1, -1) # [K, T] -> [B, K, T]
320
+ return LMOutput(logits, logits_mask)
321
+
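A hypothetical training-loss sketch for the LMOutput returned above; the training loop is not part of this file, so masking out invalid positions before the cross-entropy is an assumption about how these outputs would typically be consumed:

import torch
import torch.nn.functional as F

B, K, T, card = 2, 4, 50, 1024
logits = torch.randn(B, K, T, card)        # stand-in for LMOutput.logits
mask = torch.rand(B, K, T) > 0.1           # stand-in for LMOutput.mask (True = valid position)
codes = torch.randint(0, card, (B, K, T))  # ground-truth codes

loss = F.cross_entropy(logits[mask], codes[mask])  # cross-entropy over valid positions only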
322
+ def _sample_next_token(self,
323
+ sequence: torch.Tensor,
324
+ cfg_conditions: CFGConditions,
325
+ unconditional_state: State,
326
+ use_sampling: bool = False,
327
+ temp: float = 1.0,
328
+ top_k: int = 0,
329
+ top_p: float = 0.0,
330
+ cfg_coef: tp.Optional[float] = None,
331
+ two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
332
+ """Sample next token from the model given a sequence and a set of conditions. The model supports
333
+ multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
334
+
335
+ Args:
336
+ sequence (torch.Tensor): Current sequence of shape [B, K, S]
337
+ with K corresponding to the number of codebooks and S the number of sequence steps.
338
+ S = 1 in streaming mode, except for the first step that contains a bigger prompt.
339
+ cfg_conditions (CFGConditions): Set of conditions. If CFG is used,
340
+ should be twice the batch size, being the concatenation of the conditions + null conditions.
341
+ use_sampling (bool): Whether to use a sampling strategy or not.
342
+ temp (float): Sampling temperature.
343
+ top_k (int): K for "top-k" sampling.
344
+ top_p (float): P for "top-p" sampling.
345
+ cfg_coef (float, optional): classifier free guidance coefficient
346
+ Returns:
347
+ next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
348
+ """
349
+ B = sequence.shape[0]
350
+ cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
351
+ model = self if self._fsdp is None else self._fsdp
352
+ two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
353
+ if two_step_cfg and cfg_conditions != {}:
354
+ assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
355
+ condition_tensors, null_condition_tensors = cfg_conditions
356
+ cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
357
+ state = self.get_streaming_state()
358
+ self.set_streaming_state(unconditional_state)
359
+ uncond_logits = model(sequence, conditions=[], condition_tensors=null_condition_tensors)
360
+ unconditional_state.update(self.get_streaming_state())
361
+ self.set_streaming_state(state)
362
+ logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
363
+ else:
364
+ assert isinstance(cfg_conditions, dict)
365
+ condition_tensors = cfg_conditions
366
+ if condition_tensors:
367
+ # Preparing for CFG, predicting both conditional and unconditional logits.
368
+ sequence = torch.cat([sequence, sequence], dim=0)
369
+ all_logits = model(
370
+ sequence,
371
+ conditions=[], condition_tensors=condition_tensors)
372
+ if condition_tensors:
373
+ cond_logits, uncond_logits = all_logits.split(B, dim=0) # [B, K, T, card]
374
+ logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
375
+ else:
376
+ logits = all_logits
377
+
378
+ logits = logits.permute(0, 1, 3, 2) # [B, K, card, T]
379
+ logits = logits[..., -1]  # [B, K, card]
380
+
381
+ # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
382
+ if use_sampling and temp > 0.0:
383
+ probs = torch.softmax(logits / temp, dim=-1)
384
+ if top_p > 0.0:
385
+ next_token = utils.sample_top_p(probs, p=top_p)
386
+ elif top_k > 0:
387
+ next_token = utils.sample_top_k(probs, k=top_k)
388
+ else:
389
+ next_token = utils.multinomial(probs, num_samples=1)
390
+ else:
391
+ next_token = torch.argmax(logits, dim=-1, keepdim=True)
392
+
393
+ return next_token
394
+
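A simplified sketch of the CFG mix and top-k sampling performed above, using plain torch in place of utils.sample_top_k (assumed to have the usual top-k semantics):

import torch

B, K, card = 2, 4, 1024
cond_logits = torch.randn(B, K, card)      # logits for the last step, conditional pass
uncond_logits = torch.randn(B, K, card)    # logits for the last step, unconditional pass
cfg_coef, temp, top_k = 3.0, 1.0, 250

logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
probs = torch.softmax(logits / temp, dim=-1)
topk_probs, topk_idx = probs.topk(top_k, dim=-1)
choice = torch.multinomial(topk_probs.reshape(-1, top_k), num_samples=1)
next_token = topk_idx.reshape(-1, top_k).gather(1, choice).reshape(B, K, 1)  # [B, K, 1]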
395
+ @torch.no_grad()
396
+ def generate(self,
397
+ prompt: tp.Optional[torch.Tensor] = None,
398
+ conditions: tp.List[ConditioningAttributes] = [],
399
+ num_samples: tp.Optional[int] = None,
400
+ max_gen_len: int = 256,
401
+ use_sampling: bool = True,
402
+ temp: float = 1.0,
403
+ top_k: int = 250,
404
+ top_p: float = 0.0,
405
+ cfg_coef: tp.Optional[float] = None,
406
+ two_step_cfg: tp.Optional[bool] = None,
407
+ remove_prompts: bool = False,
408
+ check: bool = False,
409
+ callback: tp.Optional[tp.Callable[[int, int], None]] = None,
410
+ **kwargs) -> torch.Tensor:
411
+ """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
412
+ be performed in a greedy fashion or using sampling with top K and top P strategies.
413
+
414
+ Args:
415
+ prompt (torch.Tensor, optional): Prompt tokens of shape [B, K, T].
416
+ conditions (list of ConditioningAttributes, optional): List of conditions.
417
+ num_samples (int, optional): Number of samples to generate when no prompt and no conditions are given.
418
+ max_gen_len (int): Maximum generation length.
419
+ use_sampling (bool): Whether to use a sampling strategy or not.
420
+ temp (float): Sampling temperature.
421
+ top_k (int): K for "top-k" sampling.
422
+ top_p (float): P for "top-p" sampling.
423
+ cfg_coef (float, optional): Classifier-free guidance coefficient.
424
+ two_step_cfg (bool, optional): Whether to perform classifier-free guidance with two steps generation.
425
+ remove_prompts (bool): Whether to remove prompts from generation or not.
426
+ check (bool): Whether to apply further checks on generated sequence.
427
+ callback (Callback, optional): Callback function to report generation progress.
428
+ Returns:
429
+ torch.Tensor: Generated tokens.
430
+ """
431
+ assert not self.training, "generation shouldn't be used in training mode."
432
+ first_param = next(iter(self.parameters()))
433
+ device = first_param.device
434
+
435
+ # Checking all input shapes are consistent.
436
+ possible_num_samples = []
437
+ if num_samples is not None:
438
+ possible_num_samples.append(num_samples)
439
+ elif prompt is not None:
440
+ possible_num_samples.append(prompt.shape[0])
441
+ elif conditions:
442
+ possible_num_samples.append(len(conditions))
443
+ else:
444
+ possible_num_samples.append(1)
445
+ assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
446
+ num_samples = possible_num_samples[0]
447
+
448
+ # below we create set of conditions: one conditional and one unconditional
449
+ # to do that we merge the regular condition together with the null condition
450
+ # we then do 1 forward pass instead of 2.
451
+ # the reason for that is two-fold:
452
+ # 1. it is about x2 faster than doing 2 forward passes
453
+ # 2. avoid the streaming API treating the 2 passes as part of different time steps
454
+ # We also support doing two different passes, in particular to ensure that
455
+ # the padding structure is exactly the same between train and test.
456
+ # With a batch size of 1, this can be slower though.
457
+ cfg_conditions: CFGConditions
458
+ two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
459
+ if conditions:
460
+ null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
461
+ if two_step_cfg:
462
+ cfg_conditions = (
463
+ self.condition_provider(self.condition_provider.tokenize(conditions)),
464
+ self.condition_provider(self.condition_provider.tokenize(null_conditions)),
465
+ )
466
+ else:
467
+ conditions = conditions + null_conditions
468
+ tokenized = self.condition_provider.tokenize(conditions)
469
+ cfg_conditions = self.condition_provider(tokenized)
470
+ else:
471
+ cfg_conditions = {}
472
+
473
+ if prompt is None:
474
+ assert num_samples > 0
475
+ prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
476
+
477
+ B, K, T = prompt.shape
478
+ start_offset = T
479
+ assert start_offset < max_gen_len
480
+
481
+ pattern = self.pattern_provider.get_pattern(max_gen_len)
482
+ # this token is used as default value for codes that are not generated yet
483
+ unknown_token = -1
484
+
485
+ # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
486
+ gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
487
+ # filling the gen_codes with the prompt if needed
488
+ gen_codes[..., :start_offset] = prompt
489
+ # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
490
+ gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
491
+ # retrieve the start_offset in the sequence:
492
+ # it is the first sequence step that contains the `start_offset` timestep
493
+ start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
494
+ assert start_offset_sequence is not None
495
+
496
+ with self.streaming():
497
+ unconditional_state = self.get_streaming_state()
498
+ prev_offset = 0
499
+ gen_sequence_len = gen_sequence.shape[-1] # gen_sequence shape is [B, K, S]
500
+ for offset in range(start_offset_sequence, gen_sequence_len):
501
+ # get current sequence (note that the streaming API is providing the caching over previous offsets)
502
+ curr_sequence = gen_sequence[..., prev_offset:offset]
503
+ curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
504
+ if check:
505
+ # check coherence between mask and sequence
506
+ assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
507
+ # should never happen as gen_sequence is filled progressively
508
+ assert not (curr_sequence == unknown_token).any()
509
+ # sample next token from the model, next token shape is [B, K, 1]
510
+ next_token = self._sample_next_token(
511
+ curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
512
+ cfg_coef=cfg_coef, two_step_cfg=two_step_cfg)
513
+ # ensure the tokens that should be masked are properly set to special_token_id
514
+ # as the model never output special_token_id
515
+ valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
516
+ next_token[~valid_mask] = self.special_token_id
517
+ # ensure we don't overwrite prompt tokens, we only write over unknown tokens
518
+ # (then mask tokens should be left as is as well, which is correct)
519
+ gen_sequence[..., offset:offset+1] = torch.where(
520
+ gen_sequence[..., offset:offset+1] == unknown_token,
521
+ next_token, gen_sequence[..., offset:offset+1]
522
+ )
523
+ prev_offset = offset
524
+ if callback is not None:
525
+ callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
526
+ unconditional_state.clear()
527
+
528
+ # ensure sequence has been entirely filled
529
+ assert not (gen_sequence == unknown_token).any()
530
+ # ensure gen_sequence pattern and mask are matching
531
+ # which means the gen_sequence is valid according to the pattern
532
+ assert (
533
+ gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
534
+ ).all()
535
+ # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
536
+ out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
537
+
538
+ # sanity checks over the returned codes and corresponding masks
539
+ assert (out_codes[..., :max_gen_len] != unknown_token).all()
540
+ assert (out_mask[..., :max_gen_len] == 1).all()
541
+
542
+ out_start_offset = start_offset if remove_prompts else 0
543
+ out_codes = out_codes[..., out_start_offset:max_gen_len]
544
+
545
+ # ensure the returned codes are all valid
546
+ assert (out_codes >= 0).all() and (out_codes <= self.card).all()
547
+ return out_codes
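A small sketch of a progress callback matching the (generated, to_generate) signature used above; the `lm` handle and conditioning attributes in the commented call are placeholders:

def print_progress(generated: int, to_generate: int) -> None:
    print(f"generated {generated:4d} / {to_generate:4d} steps", end="\r")

# tokens = lm.generate(conditions=attributes, max_gen_len=500, callback=print_progress)  # hypothetical call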
audiocraft/models/lm_magnet.py ADDED
@@ -0,0 +1,498 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import math
9
+ import typing as tp
10
+ import torch
11
+ import numpy as np
12
+
13
+ from ..utils import utils
14
+ from ..modules.conditioners import (
15
+ ClassifierFreeGuidanceDropout,
16
+ ConditioningAttributes,
17
+ ConditionType,
18
+ )
19
+ from .lm import LMModel
20
+
21
+ logger = logging.getLogger(__name__)
22
+ ConditionTensors = tp.Dict[str, ConditionType]
23
+ CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
24
+
25
+
26
+ class MagnetLMModel(LMModel):
27
+ """Transformer-based, non-autoregressive model, operates on multiple streams of audio tokens (MAGNeT).
28
+ Args:
29
+ subcodes_context (int): The number of timesteps attended in the self-attention blocks of codebooks > 0.
30
+ When set to -1, attention is unrestricted and all timesteps are attended. Defaults to 5.
31
+ compression_model_framerate (int): frame rate of the audio tokenizer.
32
+ segment_duration (int): Sample length in seconds.
33
+ span_len (int): Determines the length of masking spans. This is the minimal length of consecutive masked tokens,
34
+ for both training and inference. Defaults to 3.
35
+ **kwargs: Additional parameters for the LMModel.
36
+ """
37
+ def __init__(self, subcodes_context: int = 5, compression_model_framerate: int = 50,
38
+ segment_duration: int = 10, span_len: int = 3, **kwargs):
39
+ super().__init__(**kwargs)
40
+ self.causal = kwargs['causal']
41
+ self.subcodes_context = subcodes_context
42
+ self.span_len = span_len
43
+ self._build_attn_masks(compression_model_framerate=compression_model_framerate,
44
+ segment_duration=segment_duration,
45
+ num_heads=kwargs['num_heads'],
46
+ device=kwargs['device'], dtype=kwargs['dtype'])
47
+
48
+ def restricted_context_attn_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
49
+ """Creates a restricted attention mask (local attention map) where the context
50
+ is determined by self.subcodes_context.
51
+ Args:
52
+ seq_len (int): token sequence length.
53
+ device (torch.device): device of the output tensor.
54
+ dtype (torch.dtype): data type of the output tensor.
55
+ Returns:
56
+ torch.Tensor: The restricted attention mask.
57
+ """
58
+ # Return a context restricted non-causal att mask
59
+ queries_pos = torch.arange(seq_len, device=device).view(-1, 1)
60
+ keys_pos = torch.arange(seq_len, device=device).view(1, -1)
61
+
62
+ delta = queries_pos - keys_pos
63
+ valid = torch.abs(delta) <= self.subcodes_context
64
+ return torch.where(
65
+ valid,
66
+ torch.zeros([], device=device, dtype=dtype),
67
+ torch.full([], float('-inf'), device=device, dtype=dtype))
68
+
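A tiny illustration of the banded mask built above, with seq_len=6 and a context of 2 (positions more than 2 steps away receive -inf):

import torch

seq_len, context = 6, 2
queries_pos = torch.arange(seq_len).view(-1, 1)
keys_pos = torch.arange(seq_len).view(1, -1)
mask = torch.where((queries_pos - keys_pos).abs() <= context,
                   torch.zeros([]),
                   torch.full([], float('-inf')))
print(mask)  # 6x6 band of zeros, -inf outside the +/-2 diagonal band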
69
+ def _stage_attn_mask(self, stage: int, seq_len: int, num_heads: int,
70
+ device: torch.device, dtype: torch.dtype) -> tp.Optional[torch.Tensor]:
71
+ """Creates a restricted attention mask given the stage (codebook index).
72
+ Args:
73
+ stage (int): The codebook index. Takes values in [0, n_q).
74
+ seq_len (int): Token sequence length.
75
+ num_heads (int): Num transformer attention heads.
76
+ device (torch.device): device of the output tensor.
77
+ dtype (torch.dtype): data type of the output tensor.
78
+ Returns:
79
+ torch.Tensor: Either a restricted attention mask or None if stage attention is unrestricted.
80
+ """
81
+ sa_mask = None
82
+
83
+ if stage > 0 and self.subcodes_context > -1:
84
+ # parallel - non-causal - with restricted subcodes context
85
+ sa_mask = self.restricted_context_attn_mask(seq_len, device=device, dtype=dtype)
86
+
87
+ if sa_mask is not None:
88
+ # Repeat for each attention head
89
+ sa_mask = sa_mask.repeat((1, num_heads, 1, 1))
90
+
91
+ # align8 to enable memory efficient attention
92
+ MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR = 8
93
+ seq_len_aligned = \
94
+ int(np.ceil(seq_len / MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR)) * MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR
95
+
96
+ sa_mask_aligned = torch.zeros((1, num_heads, seq_len_aligned, seq_len_aligned), device=device, dtype=dtype)
97
+ sa_mask_aligned[..., :seq_len, :seq_len] = sa_mask
98
+ sa_mask = sa_mask_aligned
99
+
100
+ return sa_mask
101
+
102
+ def _build_attn_masks(self, compression_model_framerate: int, segment_duration: int, num_heads: int,
103
+ device: torch.device, dtype: torch.dtype):
104
+ """Construct attention mask per stage. For each of the RVQ codebook levels in the [0, n_q] range,
105
+ either a local attention map or None would be stored as an entry in the self.attn_mask_per_stage list.
106
+ Args:
107
+ compression_model_framerate (int): The frame rate of the tokenizer.
108
+ segment_duration (int): Sample length in seconds.
109
+ num_heads (int): Num transformer attention heads.
110
+ device (torch.device): device of the output tensor.
111
+ dtype (torch.dtype): data type of the output tensor.
112
+ """
113
+ seq_len = compression_model_framerate * segment_duration
114
+ self.attn_mask_per_stage = [self._stage_attn_mask(stage, seq_len, num_heads,
115
+ device, dtype) for stage in range(self.n_q)]
116
+
117
+ @torch.no_grad()
118
+ def generate(self,
119
+ prompt: tp.Optional[torch.Tensor] = None,
120
+ conditions: tp.List[ConditioningAttributes] = [],
121
+ num_samples: tp.Optional[int] = None,
122
+ max_gen_len: int = 256,
123
+ use_sampling: bool = True,
124
+ temp: float = 1.0,
125
+ top_k: int = 250,
126
+ top_p: float = 0.0,
127
+ cfg_coef: tp.Optional[float] = None,
128
+ two_step_cfg: tp.Optional[bool] = None,
129
+ remove_prompts: bool = False,
130
+ check: bool = False,
131
+ callback: tp.Optional[tp.Callable[[int, int], None]] = None,
132
+ **kwargs) -> torch.Tensor:
133
+
134
+ assert cfg_coef is None, "Unsupported in MAGNeT. Use max_cfg_coef, min_cfg_coef instead."
135
+ assert two_step_cfg is None, "MAGNeT currently doesn't support two step classifier-free-guidance."
136
+ assert remove_prompts is False, "MAGNeT currently doesn't support the remove_prompts arg."
137
+ assert check is False, "MAGNeT currently doesn't support the check arg."
138
+ # Call the MAGNeT-specific generation method
139
+ return self._generate_magnet(prompt=prompt,
140
+ conditions=conditions,
141
+ num_samples=num_samples,
142
+ max_gen_len=max_gen_len,
143
+ use_sampling=use_sampling,
144
+ temp=temp,
145
+ top_k=top_k,
146
+ top_p=top_p,
147
+ callback=callback, **kwargs)
148
+
149
+ @torch.no_grad()
150
+ def _generate_magnet(self,
151
+ prompt: tp.Optional[torch.Tensor] = None,
152
+ conditions: tp.List[ConditioningAttributes] = [],
153
+ num_samples: tp.Optional[int] = None,
154
+ max_gen_len: int = 256,
155
+ use_sampling: bool = True,
156
+ temp: float = 3.0,
157
+ top_k: int = 0,
158
+ top_p: float = 0.9,
159
+ callback: tp.Optional[tp.Callable[[int, int], None]] = None,
160
+ max_cfg_coef: float = 10.0,
161
+ min_cfg_coef: float = 1.0,
162
+ decoding_steps: tp.List[int] = [20, 10, 10, 10],
163
+ anneal_temp: bool = True,
164
+ span_scoring='max',
165
+ span_arrangement='nonoverlap') -> torch.Tensor:
166
+ """Generate audio tokens given textual conditions, and optionally given audio prompts,
167
+ by running MAGNeT's iterative decoding algorithm for each of the n_q RVQ levels.
168
+ Args:
169
+ prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
170
+ conditions (list of ConditioningAttributes): List of conditions.
171
+ num_samples (int): Number of samples to generate when no prompt and no conditions are given.
172
+ max_gen_len (int): Maximum generation length.
173
+ use_sampling (bool): Whether to use a sampling strategy or not.
174
+ temp (float): Initial sampling temperature.
175
+ top_k (int): k for "top-k" sampling.
176
+ top_p (float): p for "top-p" sampling.
177
+ callback (Callback): Callback function to report generation progress.
178
+ max_cfg_coef (float): Initial coefficient used for classifier free guidance.
179
+ min_cfg_coef (float): Final coefficient used for classifier free guidance.
180
+ decoding_steps (list of n_q ints): The number of iterative decoding steps,
181
+ for each of the n_q RVQ codebooks.
182
+ anneal_temp (bool): When set to True, the softmax temperature is linearly decayed to zero at each stage.
183
+ span_scoring (str): Use the maximum probability of each span ('max')
184
+ or the product of probabilities ('prod').
185
+ span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1')
186
+ in the masking scheme.
187
+ Returns:
188
+ torch.Tensor: Generated tokens.
189
+ """
190
+ assert not self.training, "generation shouldn't be used in training mode."
191
+ first_param = next(iter(self.parameters()))
192
+ device = first_param.device
193
+
194
+ # Checking all input shapes are consistent.
195
+ possible_num_samples = []
196
+ if num_samples is not None:
197
+ possible_num_samples.append(num_samples)
198
+ elif prompt is not None:
199
+ possible_num_samples.append(prompt.shape[0])
200
+ elif conditions:
201
+ possible_num_samples.append(len(conditions))
202
+ else:
203
+ possible_num_samples.append(1)
204
+ assert [x == possible_num_samples[0] for x in possible_num_samples], "Inconsistent inputs shapes"
205
+ num_samples = possible_num_samples[0]
206
+
207
+ # below we create set of conditions: one conditional and one unconditional
208
+ # to do that we merge the regular condition together with the null condition
209
+ # we then do 1 forward pass instead of 2.
210
+ cfg_conditions: tp.Optional[ConditionTensors]
211
+ if conditions:
212
+ null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
213
+ conditions = conditions + null_conditions
214
+ tokenized = self.condition_provider.tokenize(conditions)
215
+ cfg_conditions = self.condition_provider(tokenized)
216
+ else:
217
+ cfg_conditions = {}
218
+
219
+ if prompt is None:
220
+ assert num_samples > 0
221
+ prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
222
+
223
+ B, K, prompt_length = prompt.shape
224
+ start_offset = prompt_length
225
+ assert start_offset < max_gen_len
226
+
227
+ mask_id = self.special_token_id
228
+
229
+ # we generate codes with a fixed sequence length
230
+ shape = (B, K, max_gen_len)
231
+
232
+ gen_codes = torch.full(shape, mask_id, dtype=torch.long, device=device)
233
+ # filling the gen_codes with the prompt if needed
234
+ gen_codes[..., :start_offset] = prompt
235
+ # MAGNeT uses no interleaving pattern: the generated sequence is the codes themselves [B, K, T]
236
+ gen_sequence = gen_codes
237
+
238
+ curr_step = 0
239
+ for stage, n_steps in zip(range(self.n_q), decoding_steps):
240
+ gen_sequence, curr_step = self._generate_stage(gen_sequence,
241
+ cfg_conditions,
242
+ stage=stage,
243
+ device=device,
244
+ prompt_length=prompt_length,
245
+ prompt=prompt,
246
+ temp=temp,
247
+ max_cfg_coef=max_cfg_coef,
248
+ min_cfg_coef=min_cfg_coef,
249
+ top_k=top_k,
250
+ top_p=top_p,
251
+ timesteps=n_steps,
252
+ anneal_temp=anneal_temp,
253
+ span_scoring=span_scoring,
254
+ use_sampling=use_sampling,
255
+ span_arrangement=span_arrangement,
256
+ curr_step=curr_step,
257
+ total_steps=sum(decoding_steps),
258
+ callback=callback)
259
+
260
+ return gen_sequence
261
+
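A sketch of the cosine masking schedule that drives each stage above: at time t in [0, 1], a fraction cos(t * pi / 2) of the tokens left to generate is re-masked, so the masked count decays from everything down to one token:

import math
import torch

timesteps, tokens_to_gen = 20, 500
for t in torch.linspace(0, 1, timesteps):
    mask_p = math.cos(float(t) * math.pi * 0.5)
    num_masked = max(int(mask_p * tokens_to_gen), 1)  # 500 at t=0, down to 1 at t=1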
262
+ @torch.no_grad()
263
+ def _generate_stage(self,
264
+ gen_sequence: torch.Tensor,
265
+ condition_tensors: tp.Optional[ConditionTensors],
266
+ stage: int,
267
+ device: torch.device,
268
+ prompt_length: int = 0,
269
+ prompt: tp.Optional[torch.Tensor] = None,
270
+ use_sampling: bool = True,
271
+ temp: float = 3.0,
272
+ max_cfg_coef: float = 10.0,
273
+ min_cfg_coef: float = 1.0,
274
+ top_k: int = 0,
275
+ top_p: float = 0.0,
276
+ timesteps: int = 10,
277
+ anneal_temp: bool = True,
278
+ span_scoring: str = 'max',
279
+ span_arrangement: str = 'nonoverlap',
280
+ curr_step: int = 0,
281
+ total_steps: int = 0,
282
+ callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> tp.Tuple[torch.Tensor, int]:
283
+ """Generate audio tokens of a single RVQ level (stage), given the previously generated stages,
284
+ and the textual conditions.
285
+ Args:
286
+ gen_sequence (torch.Tensor): Previously generated tokens.
287
+ condition_tensors (tp.Optional[ConditionTensors]): pre-computed conditioning tensors.
288
+ stage (int): RVQ level to generate.
289
+ device (torch.device): device of the output tensor.
290
+ prompt_length (int): Temporal length of the audio prompt.
291
+ prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
292
+ use_sampling (bool): Whether to use a sampling strategy or not.
293
+ temp (float): Initial sampling temperature.
294
+ max_cfg_coef (float): Initial coefficient used for classifier free guidance.
295
+ min_cfg_coef (float): Final coefficient used for classifier free guidance.
296
+ top_k (int): k for "top-k" sampling.
297
+ top_p (float): p for "top-p" sampling.
298
+ timesteps (int): Number of iterative decoding steps.
299
+ anneal_temp (bool): When set to True, the softmax temperature is linearly decayed to zero at each stage.
300
+ span_scoring (str): Use the maximum probability of each span ('max')
301
+ or the product of probabilities ('prod').
302
+ span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1')
303
+ in the masking scheme.
304
+ curr_step (int): Global iterative decoding step counter.
305
+ total_steps (int): Total decoding steps.
306
+ callback (Callback): Callback function to report generation progress.
307
+ Returns:
308
+ tuple(torch.Tensor, int): Generated tokens and the current decoding step counter.
309
+ """
310
+ B, K, T = gen_sequence.shape
311
+ shape = (B, 1, T) # generating a single codebook per stage
312
+
313
+ mask_id = self.special_token_id
314
+ stage_gen_seq = torch.full(shape, mask_id, dtype=torch.long, device=device)
315
+
316
+ assert span_arrangement == 'nonoverlap' or span_arrangement == 'stride1'
317
+ chunk_masking = self.span_len > 1 and span_arrangement == 'nonoverlap'
318
+
319
+ DONT_REMASK_ME_SCORE = -1e4
320
+
321
+ model = self if self._fsdp is None else self._fsdp
322
+
323
+ if chunk_masking:
324
+ # span-wise scores
325
+ n_chunks = T // self.span_len
326
+ if T % self.span_len != 0:
327
+ # trim sequence ending to achieve a multiple of span_len
328
+ T = self.span_len * n_chunks
329
+ gen_sequence = gen_sequence[..., :T]
330
+ stage_gen_seq = stage_gen_seq[..., :T]
331
+
332
+ chunked_shape = (B, 1, n_chunks)
333
+ n_prompt_chunks = prompt_length // self.span_len
334
+ scores = torch.zeros(chunked_shape, dtype=torch.float32, device=device)
335
+ scores[..., :n_prompt_chunks] = DONT_REMASK_ME_SCORE
336
+ num_chunks_to_gen = n_chunks - n_prompt_chunks
337
+ else:
338
+ # token-wise scores
339
+ scores = torch.zeros(shape, dtype=torch.float32, device=device)
340
+ scores[..., :prompt_length] = DONT_REMASK_ME_SCORE
341
+ gen_T = T - prompt_length
342
+
343
+ # run MAGNeT iterative decoding for "timesteps" iterations
344
+ for timestep, steps_left in zip(torch.linspace(0, 1, timesteps, device=device), reversed(range(timesteps))):
345
+
346
+ mask_p = torch.cos(timestep * math.pi * 0.5)
347
+
348
+ if chunk_masking:
349
+ num_masked = max(int((mask_p * num_chunks_to_gen).item()), 1)
350
+ else:
351
+ num_masked = max(int((mask_p * gen_T).item()), 1)
352
+
353
+ # masking
354
+ run_lps_masking = (span_arrangement == 'stride1') and self.span_len > 1
355
+ if run_lps_masking:
356
+ # masking of the k least probable overlapping (stride 1) spans
357
+ mask = torch.concat((
358
+ [self._least_probable_span_masking(scores[[i], :, :], num_masked).to(device)
359
+ for i in range(B)]), dim=0)
360
+ stage_gen_seq[mask] = mask_id
361
+ else:
362
+ # masking of the k least probable non-overlapping spans
363
+ masked = scores.topk(num_masked, dim=-1).indices
364
+ if chunk_masking:
365
+ chunks_mask = torch.full(chunked_shape, False, dtype=torch.bool, device=device)
366
+ chunks_mask = chunks_mask.scatter(2, masked, True)
367
+ mask = torch.repeat_interleave(chunks_mask, self.span_len, dim=-1)
368
+ stage_gen_seq[mask] = mask_id
369
+ else:
370
+ stage_gen_seq = stage_gen_seq.scatter(2, masked, mask_id)
371
+
372
+ if prompt is not None:
373
+ stage_gen_seq[..., :prompt_length] = prompt[:, stage, :].unsqueeze(1)
374
+
375
+ gen_sequence[:, [stage], :] = stage_gen_seq
376
+ if condition_tensors:
377
+ # duplicate input for classifier free guidance
378
+ sequence = torch.cat([gen_sequence, gen_sequence], dim=0)
379
+
380
+ all_logits = model(sequence, [], condition_tensors, stage=stage)
381
+
382
+ if condition_tensors:
383
+ # classifier free guidance with annealing
384
+ cond_logits, uncond_logits = all_logits.split(B, dim=0) # [B, K, T, card]
385
+ clsfg_coef = float(mask_p) * max_cfg_coef + (1 - float(mask_p)) * min_cfg_coef
386
+ logits = uncond_logits + (cond_logits - uncond_logits) * clsfg_coef
387
+ else:
388
+ logits = all_logits
389
+
390
+ # temperature annealing - linear
391
+ t = temp * (steps_left / timesteps) if anneal_temp else temp
392
+
393
+ # sampling
394
+ logits = logits[:, stage, :, :].unsqueeze(1)
395
+ probs = torch.softmax(logits / max(t, 1e-2), dim=-1)
396
+ if use_sampling:
397
+ if top_p > 0.0:
398
+ sampled_tokens = utils.sample_top_p(probs, p=top_p)
399
+ elif top_k > 0:
400
+ sampled_tokens = utils.sample_top_k(probs, k=top_k)
401
+ else:
402
+ sampled_tokens = utils.multinomial(probs, num_samples=1)
403
+ else:
404
+ sampled_tokens = torch.argmax(logits, dim=-1, keepdim=True)
405
+
406
+ # place mask_id token in each of the masked positions
407
+ mask = stage_gen_seq == mask_id
408
+ stage_gen_seq = torch.where(mask, sampled_tokens[..., 0], stage_gen_seq)
409
+ gen_sequence[:, [stage], :] = stage_gen_seq
410
+
411
+ # get probs of sampled tokens
412
+ sampled_probs = torch.gather(probs, 3, sampled_tokens)[..., 0]
413
+
414
+ # span scoring
415
+ if chunk_masking:
416
+ if span_scoring == 'max':
417
+ # max in linear space
418
+ scores = 1 - torch.max(sampled_probs.reshape((B, 1, n_chunks, -1)), dim=-1)[0]
419
+ elif span_scoring == 'prod':
420
+ # prod in log space
421
+ scores = torch.sum(-torch.log(sampled_probs).reshape((B, 1, n_chunks, -1)), dim=-1)
422
+ else:
423
+ raise NotImplementedError
424
+ else:
425
+ # prod in log space for lps masking (stride1)
426
+ scores = -torch.log(sampled_probs)
427
+
428
+ # Pin unmasked tokens by assigning them DONT_REMASK_ME_SCORE so they are never re-masked
429
+ if chunk_masking:
430
+ scores = scores.masked_fill(~chunks_mask, DONT_REMASK_ME_SCORE)
431
+ else:
432
+ scores = scores.masked_fill(~mask, DONT_REMASK_ME_SCORE)
433
+
434
+ if callback is not None:
435
+ curr_step += 1
436
+ callback(curr_step, total_steps)
437
+
438
+ return gen_sequence, curr_step
439
+
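The guidance and temperature annealing used inside a stage, written out explicitly (same formulas as above, values are the defaults):

import math

max_cfg_coef, min_cfg_coef, temp, timesteps = 10.0, 1.0, 3.0, 20
for step, steps_left in zip(range(timesteps), reversed(range(timesteps))):
    mask_p = math.cos(step / max(timesteps - 1, 1) * math.pi * 0.5)
    cfg = mask_p * max_cfg_coef + (1 - mask_p) * min_cfg_coef  # anneals 10.0 -> 1.0
    t = temp * (steps_left / timesteps)                        # anneals ~3.0 -> 0.0 (floored at 1e-2 above)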
440
+ def _construct_spans_mask(self, span_starts: torch.Tensor, T: int, device: torch.device) -> torch.Tensor:
441
+ """Build a [1x1xT] boolean mask consists of overlapping spans of True values, where
442
+ span_starts defines the initial index of each span, and the span length is
443
+ defined by self.span_len.
444
+ Args:
445
+ span_starts (torch.Tensor): Indices of the temporal locations at which each span starts.
446
+ T (int): Sequence length.
447
+ device (torch.device): device of the output tensor.
448
+ Returns:
449
+ torch.Tensor: Spans mask of shape [1x1xT]
450
+ """
451
+ mask = torch.full((1, 1, T), False, device=device)
452
+ mask[:, :, span_starts] = True
453
+ shifted_mask = mask.clone()
454
+ for _ in range(self.span_len - 1):
455
+ shifted_mask = torch.concat((torch.full((1, 1, 1), False, device=device), shifted_mask[:, :, :-1]), dim=-1)
456
+ mask = torch.logical_or(mask, shifted_mask)
457
+ return mask
458
+
459
+ def _least_probable_span_masking(self, scores: torch.Tensor, num_masked_trg: int) -> torch.Tensor:
460
+ """Construct a [1x1xT] boolean mask, consists of the u least probable spans,
461
+ where the token probability is determined by -scores, and the total
462
+ number of masked tokens is as close as possible to num_masked_trg.
463
+ Find u using binary search.
464
+ Args:
465
+ scores (torch.Tensor): Per token score [-log(prob)]
466
+ num_masked_trg (int): The desired number of tokens to be masked.
467
+ Returns:
468
+ torch.Tensor: Spans mask of shape [1x1xT]
469
+ """
470
+ T = scores.shape[-1]
471
+ device = scores.device
472
+ scores_unfolded = scores.unfold(2, self.span_len, 1)
473
+ # Span score is the product of probs (sum in log space)
474
+ span_scores = scores_unfolded.sum(dim=-1)
475
+ spans_by_scores = torch.argsort(span_scores[0, 0], descending=True)
476
+
477
+ num_masked_trg = max(num_masked_trg, self.span_len)
478
+
479
+ # Binary search for u - the number of least probable overlapping masked spans s.t.
480
+ # the total masking rate is the closest to num_masked_trg / T.
481
+ min_u = num_masked_trg // self.span_len
482
+ max_u = num_masked_trg - self.span_len + 1
483
+ mid = round(0.5 * (min_u + max_u))
484
+
485
+ if mid == min_u or mid == max_u:
486
+ return self._construct_spans_mask(spans_by_scores[:mid], T, device)
487
+
488
+ while mid > min_u and mid < max_u:
489
+ mask = self._construct_spans_mask(spans_by_scores[:mid], T, device)
490
+ n_masked = mask.sum()
491
+ if n_masked > num_masked_trg:
492
+ max_u = mid
493
+ mid = round(0.5 * (min_u + max_u))
494
+ else:
495
+ min_u = mid
496
+ mid = round(0.5 * (min_u + max_u))
497
+
498
+ return mask
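A standalone sketch of the span scoring used above: overlapping windows of length span_len, scored by summing per-token -log(prob), so higher sums mark less probable spans:

import torch

span_len = 3
scores = torch.rand(1, 1, 10)                              # per-token -log(prob), shape [1, 1, T]
span_scores = scores.unfold(2, span_len, 1).sum(dim=-1)    # [1, 1, T - span_len + 1]
least_probable_first = torch.argsort(span_scores[0, 0], descending=True)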
audiocraft/models/loaders.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Utility functions to load from the checkpoints.
9
+ Each checkpoint is a dict saved with torch.save, with the following keys:
10
+ - 'xp.cfg': the hydra config as dumped during training. This should be used
11
+ to rebuild the object using the audiocraft.models.builders functions,
12
+ - 'model_best_state': a readily loadable best state for the model, including
13
+ the conditioner. The model obtained from `xp.cfg` should be compatible
14
+ with this state dict. In the case of an LM, the encodec model would not be
15
+ bundled along but instead provided separately.
16
+
17
+ Those functions also support loading from a remote location with the Torch Hub API.
18
+ They also support overriding some parameters, in particular the device and dtype
19
+ of the returned model.
20
+ """
21
+
22
+ from pathlib import Path
23
+ from huggingface_hub import hf_hub_download
24
+ import typing as tp
25
+ import os
26
+
27
+ from omegaconf import OmegaConf, DictConfig
28
+ import torch
29
+
30
+ import audiocraft
31
+ from . import builders
32
+ from .encodec import CompressionModel
33
+
34
+
35
+ def get_audiocraft_cache_dir() -> tp.Optional[str]:
36
+ return os.environ.get('AUDIOCRAFT_CACHE_DIR', None)
37
+
38
+
39
+ def _get_state_dict(
40
+ file_or_url_or_id: tp.Union[Path, str],
41
+ filename: tp.Optional[str] = None,
42
+ device='cpu',
43
+ cache_dir: tp.Optional[str] = None,
44
+ ):
45
+ if cache_dir is None:
46
+ cache_dir = get_audiocraft_cache_dir()
47
+ # Return the state dict either from a file or url
48
+ file_or_url_or_id = str(file_or_url_or_id)
49
+ assert isinstance(file_or_url_or_id, str)
50
+
51
+ if os.path.isfile(file_or_url_or_id):
52
+ return torch.load(file_or_url_or_id, map_location=device)
53
+
54
+ if os.path.isdir(file_or_url_or_id):
55
+ file = f"{file_or_url_or_id}/{filename}"
56
+ return torch.load(file, map_location=device)
57
+
58
+ elif file_or_url_or_id.startswith('https://'):
59
+ return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
60
+
61
+ else:
62
+ assert filename is not None, "filename needs to be defined if using HF checkpoints"
63
+
64
+ file = hf_hub_download(
65
+ repo_id=file_or_url_or_id, filename=filename, cache_dir=cache_dir,
66
+ library_name="audiocraft", library_version=audiocraft.__version__)
67
+ return torch.load(file, map_location=device)
68
+
69
+
70
+ def load_compression_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
71
+ return _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
72
+
73
+
74
+ def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
75
+ pkg = load_compression_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
76
+ if 'pretrained' in pkg:
77
+ return CompressionModel.get_pretrained(pkg['pretrained'], device=device)
78
+ cfg = OmegaConf.create(pkg['xp.cfg'])
79
+ cfg.device = str(device)
80
+ model = builders.get_compression_model(cfg)
81
+ model.load_state_dict(pkg['best_state'])
82
+ model.eval()
83
+ return model
84
+
85
+
86
+ def load_lm_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
87
+ return _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
88
+
89
+
90
+ def _delete_param(cfg: DictConfig, full_name: str):
91
+ parts = full_name.split('.')
92
+ for part in parts[:-1]:
93
+ if part in cfg:
94
+ cfg = cfg[part]
95
+ else:
96
+ return
97
+ OmegaConf.set_struct(cfg, False)
98
+ if parts[-1] in cfg:
99
+ del cfg[parts[-1]]
100
+ OmegaConf.set_struct(cfg, True)
101
+
102
+
103
+ def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
104
+ pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
105
+ cfg = OmegaConf.create(pkg['xp.cfg'])
106
+ cfg.device = str(device)
107
+ if cfg.device == 'cpu':
108
+ cfg.dtype = 'float32'
109
+ else:
110
+ cfg.dtype = 'float16'
111
+ _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
112
+ _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
113
+ _delete_param(cfg, 'conditioners.args.drop_desc_p')
114
+ model = builders.get_lm_model(cfg)
115
+ model.load_state_dict(pkg['best_state'])
116
+ model.eval()
117
+ model.cfg = cfg
118
+ return model
119
+
120
+
121
+ def load_lm_model_magnet(file_or_url_or_id: tp.Union[Path, str], compression_model_frame_rate: int,
122
+ device='cpu', cache_dir: tp.Optional[str] = None):
123
+ pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
124
+ cfg = OmegaConf.create(pkg['xp.cfg'])
125
+ cfg.device = str(device)
126
+ if cfg.device == 'cpu':
127
+ cfg.dtype = 'float32'
128
+ else:
129
+ cfg.dtype = 'float16'
130
+ _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
131
+ _delete_param(cfg, 'conditioners.args.drop_desc_p')
132
+
133
+ cfg.transformer_lm.compression_model_framerate = compression_model_frame_rate
134
+ cfg.transformer_lm.segment_duration = cfg.dataset.segment_duration
135
+ cfg.transformer_lm.span_len = cfg.masking.span_len
136
+
137
+ # MAGNeT models v1 support only xformers backend.
138
+ from audiocraft.modules.transformer import set_efficient_attention_backend
139
+ if cfg.transformer_lm.memory_efficient:
140
+ set_efficient_attention_backend("xformers")
141
+
142
+ model = builders.get_lm_model(cfg)
143
+ model.load_state_dict(pkg['best_state'])
144
+ model.eval()
145
+ model.cfg = cfg
146
+ return model
147
+
148
+
149
+ def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
150
+ filename: tp.Optional[str] = None,
151
+ cache_dir: tp.Optional[str] = None):
152
+ return _get_state_dict(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
153
+
154
+
155
+ def load_diffusion_models(file_or_url_or_id: tp.Union[Path, str],
156
+ device='cpu',
157
+ filename: tp.Optional[str] = None,
158
+ cache_dir: tp.Optional[str] = None):
159
+ pkg = load_mbd_ckpt(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
160
+ models = []
161
+ processors = []
162
+ cfgs = []
163
+ sample_rate = pkg['sample_rate']
164
+ for i in range(pkg['n_bands']):
165
+ cfg = pkg[i]['cfg']
166
+ model = builders.get_diffusion_model(cfg)
167
+ model_dict = pkg[i]['model_state']
168
+ model.load_state_dict(model_dict)
169
+ model.to(device)
170
+ processor = builders.get_processor(cfg=cfg.processor, sample_rate=sample_rate)
171
+ processor_dict = pkg[i]['processor_state']
172
+ processor.load_state_dict(processor_dict)
173
+ processor.to(device)
174
+ models.append(model)
175
+ processors.append(processor)
176
+ cfgs.append(cfg)
177
+ return models, processors, cfgs
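A usage sketch for the loaders above; it assumes network access to the Hugging Face Hub and uses 'facebook/musicgen-small', the same repo id referenced elsewhere in this upload:

import torch

from audiocraft.models.loaders import load_compression_model, load_lm_model

device = 'cuda' if torch.cuda.is_available() else 'cpu'
compression_model = load_compression_model('facebook/musicgen-small', device=device)
lm = load_lm_model('facebook/musicgen-small', device=device)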
audiocraft/models/magnet.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Main model for using MAGNeT. This will combine all the required components
9
+ and provide easy access to the generation API.
10
+ """
11
+ import typing as tp
12
+ import torch
13
+
14
+ from .genmodel import BaseGenModel
15
+ from .loaders import load_compression_model, load_lm_model_magnet
16
+
17
+
18
+ class MAGNeT(BaseGenModel):
19
+ """MAGNeT main model with convenient generation API.
20
+ Args:
21
+ See MusicGen class.
22
+ """
23
+ def __init__(self, **kwargs):
24
+ super().__init__(**kwargs)
25
+ # MAGNeT operates over a fixed sequence length defined in its config.
26
+ self.duration = self.lm.cfg.dataset.segment_duration
27
+ self.set_generation_params()
28
+
29
+ @staticmethod
30
+ def get_pretrained(name: str = 'facebook/magnet-small-10secs', device=None):
31
+ """Return pretrained model, we provide six models:
32
+ - facebook/magnet-small-10secs (300M), text to music, 10-second audio samples.
33
+ # see: https://huggingface.co/facebook/magnet-small-10secs
34
+ - facebook/magnet-medium-10secs (1.5B), text to music, 10-second audio samples.
35
+ # see: https://huggingface.co/facebook/magnet-medium-10secs
36
+ - facebook/magnet-small-30secs (300M), text to music, 30-second audio samples.
37
+ # see: https://huggingface.co/facebook/magnet-small-30secs
38
+ - facebook/magnet-medium-30secs (1.5B), text to music, 30-second audio samples.
39
+ # see: https://huggingface.co/facebook/magnet-medium-30secs
40
+ - facebook/audio-magnet-small (300M), text to sound-effect (10-second samples).
41
+ # see: https://huggingface.co/facebook/audio-magnet-small
42
+ - facebook/audio-magnet-medium (1.5B), text to sound-effect (10-second samples).
43
+ # see: https://huggingface.co/facebook/audio-magnet-medium
44
+ """
45
+ if device is None:
46
+ if torch.cuda.device_count():
47
+ device = 'cuda'
48
+ else:
49
+ device = 'cpu'
50
+
51
+ compression_model = load_compression_model(name, device=device)
52
+ lm = load_lm_model_magnet(name, compression_model_frame_rate=int(compression_model.frame_rate), device=device)
53
+
54
+ if 'self_wav' in lm.condition_provider.conditioners:
55
+ lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
56
+
57
+ kwargs = {'name': name, 'compression_model': compression_model, 'lm': lm}
58
+ return MAGNeT(**kwargs)
59
+
60
+ def set_generation_params(self, use_sampling: bool = True, top_k: int = 0,
61
+ top_p: float = 0.9, temperature: float = 3.0,
62
+ max_cfg_coef: float = 10.0, min_cfg_coef: float = 1.0,
63
+ decoding_steps: tp.List[int] = [20, 10, 10, 10],
64
+ span_arrangement: str = 'nonoverlap'):
65
+ """Set the generation parameters for MAGNeT.
66
+
67
+ Args:
68
+ use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
69
+ top_k (int, optional): top_k used for sampling. Defaults to 0.
70
+ top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.9.
71
+ temperature (float, optional): Initial softmax temperature parameter. Defaults to 3.0.
72
+ max_cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 10.0.
73
+ min_cfg_coef (float, optional): End coefficient of classifier free guidance annealing. Defaults to 1.0.
74
+ decoding_steps (list of n_q ints, optional): The number of iterative decoding steps,
75
+ for each of the n_q RVQ codebooks.
76
+ span_arrangement (str, optional): Use either non-overlapping spans ('nonoverlap')
77
+ or overlapping spans ('stride1') in the masking scheme.
78
+ """
79
+ self.generation_params = {
80
+ 'use_sampling': use_sampling,
81
+ 'temp': temperature,
82
+ 'top_k': top_k,
83
+ 'top_p': top_p,
84
+ 'max_cfg_coef': max_cfg_coef,
85
+ 'min_cfg_coef': min_cfg_coef,
86
+ 'decoding_steps': [int(s) for s in decoding_steps],
87
+ 'span_arrangement': span_arrangement
88
+ }
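An end-to-end usage sketch for MAGNeT; it assumes the text-to-audio generate(descriptions) API inherited from BaseGenModel, which is not part of this file:

from audiocraft.models.magnet import MAGNeT

model = MAGNeT.get_pretrained('facebook/magnet-small-10secs')
model.set_generation_params(temperature=3.0, top_p=0.9, decoding_steps=[20, 10, 10, 10])
wav = model.generate(['happy rock song with electric guitar'])  # assumed BaseGenModel API, returns [B, C, T] audio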
audiocraft/models/multibanddiffusion.py ADDED
@@ -0,0 +1,191 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Multi Band Diffusion models as described in
9
+ "From Discrete Tokens to High-Fidelity Audio Using Multi-Band Diffusion"
10
+ (paper link).
11
+ """
12
+
13
+ import typing as tp
14
+
15
+ import torch
16
+ import julius
17
+
18
+ from .unet import DiffusionUnet
19
+ from ..modules.diffusion_schedule import NoiseSchedule
20
+ from .encodec import CompressionModel
21
+ from ..solvers.compression import CompressionSolver
22
+ from .loaders import load_compression_model, load_diffusion_models
23
+
24
+
25
+ class DiffusionProcess:
26
+ """Sampling for a diffusion Model.
27
+
28
+ Args:
29
+ model (DiffusionUnet): Diffusion U-Net model.
30
+ noise_schedule (NoiseSchedule): Noise schedule for diffusion process.
31
+ """
32
+ def __init__(self, model: DiffusionUnet, noise_schedule: NoiseSchedule) -> None:
33
+ self.model = model
34
+ self.schedule = noise_schedule
35
+
36
+ def generate(self, condition: torch.Tensor, initial_noise: torch.Tensor,
37
+ step_list: tp.Optional[tp.List[int]] = None):
38
+ """Perform one diffusion process to generate one of the bands.
39
+
40
+ Args:
41
+ condition (torch.Tensor): The embeddings from the compression model.
42
+ initial_noise (torch.Tensor): The initial noise to start the process.
43
+ """
44
+ return self.schedule.generate_subsampled(model=self.model, initial=initial_noise, step_list=step_list,
45
+ condition=condition)
46
+
47
+
48
+ class MultiBandDiffusion:
49
+ """Sample from multiple diffusion models.
50
+
51
+ Args:
52
+ DPs (list of DiffusionProcess): Diffusion processes.
53
+ codec_model (CompressionModel): Underlying compression model used to obtain discrete tokens.
54
+ """
55
+ def __init__(self, DPs: tp.List[DiffusionProcess], codec_model: CompressionModel) -> None:
56
+ self.DPs = DPs
57
+ self.codec_model = codec_model
58
+ self.device = next(self.codec_model.parameters()).device
59
+
60
+ @property
61
+ def sample_rate(self) -> int:
62
+ return self.codec_model.sample_rate
63
+
64
+ @staticmethod
65
+ def get_mbd_musicgen(device=None):
66
+ """Load our diffusion models trained for MusicGen."""
67
+ if device is None:
68
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
69
+ path = 'facebook/multiband-diffusion'
70
+ filename = 'mbd_musicgen_32khz.th'
71
+ name = 'facebook/musicgen-small'
72
+ codec_model = load_compression_model(name, device=device)
73
+ models, processors, cfgs = load_diffusion_models(path, filename=filename, device=device)
74
+ DPs = []
75
+ for i in range(len(models)):
76
+ schedule = NoiseSchedule(**cfgs[i].schedule, sample_processor=processors[i], device=device)
77
+ DPs.append(DiffusionProcess(model=models[i], noise_schedule=schedule))
78
+ return MultiBandDiffusion(DPs=DPs, codec_model=codec_model)
79
+
80
+ @staticmethod
81
+ def get_mbd_24khz(bw: float = 3.0,
82
+ device: tp.Optional[tp.Union[torch.device, str]] = None,
83
+ n_q: tp.Optional[int] = None):
84
+ """Get the pretrained Models for MultibandDiffusion.
85
+
86
+ Args:
87
+ bw (float): Bandwidth of the compression model.
88
+ device (torch.device or str, optional): Device on which the models are loaded.
89
+ n_q (int, optional): Number of quantizers to use within the compression model.
90
+ """
91
+ if device is None:
92
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
93
+ assert bw in [1.5, 3.0, 6.0], f"bandwidth {bw} not available"
94
+ if n_q is not None:
95
+ assert n_q in [2, 4, 8]
96
+ assert {1.5: 2, 3.0: 4, 6.0: 8}[bw] == n_q, \
97
+ f"bandwidth and number of codebooks missmatch to use n_q = {n_q} bw should be {n_q * (1.5 / 2)}"
98
+ n_q = {1.5: 2, 3.0: 4, 6.0: 8}[bw]
99
+ codec_model = CompressionSolver.model_from_checkpoint(
100
+ '//pretrained/facebook/encodec_24khz', device=device)
101
+ codec_model.set_num_codebooks(n_q)
102
+ codec_model = codec_model.to(device)
103
+ path = 'facebook/multiband-diffusion'
104
+ filename = f'mbd_comp_{n_q}.pt'
105
+ models, processors, cfgs = load_diffusion_models(path, filename=filename, device=device)
106
+ DPs = []
107
+ for i in range(len(models)):
108
+ schedule = NoiseSchedule(**cfgs[i].schedule, sample_processor=processors[i], device=device)
109
+ DPs.append(DiffusionProcess(model=models[i], noise_schedule=schedule))
110
+ return MultiBandDiffusion(DPs=DPs, codec_model=codec_model)
111
+
112
+ @torch.no_grad()
113
+ def get_condition(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
114
+ """Get the conditioning (i.e. latent representations of the compression model) from a waveform.
115
+ Args:
116
+ wav (torch.Tensor): The audio that we want to extract the conditioning from.
117
+ sample_rate (int): Sample rate of the audio."""
118
+ if sample_rate != self.sample_rate:
119
+ wav = julius.resample_frac(wav, sample_rate, self.sample_rate)
120
+ codes, scale = self.codec_model.encode(wav)
121
+ assert scale is None, "Scaled compression models not supported."
122
+ emb = self.get_emb(codes)
123
+ return emb
124
+
125
+ @torch.no_grad()
126
+ def get_emb(self, codes: torch.Tensor):
127
+ """Get latent representation from the discrete codes.
128
+ Args:
129
+ codes (torch.Tensor): Discrete tokens."""
130
+ emb = self.codec_model.decode_latent(codes)
131
+ return emb
132
+
133
+ def generate(self, emb: torch.Tensor, size: tp.Optional[torch.Size] = None,
134
+ step_list: tp.Optional[tp.List[int]] = None):
135
+ """Generate waveform audio from the latent embeddings of the compression model.
136
+ Args:
137
+ emb (torch.Tensor): Conditioning embeddings
138
+ size (torch.Size, optional): Size of the output;
139
+ if None, it is computed from the typical upsampling of the model.
140
+ step_list (list[int], optional): list of Markov chain steps, defaults to 50 linearly spaced steps.
141
+ """
142
+ if size is None:
143
+ upsampling = int(self.codec_model.sample_rate / self.codec_model.frame_rate)
144
+ size = torch.Size([emb.size(0), self.codec_model.channels, emb.size(-1) * upsampling])
145
+ assert size[0] == emb.size(0)
146
+ out = torch.zeros(size).to(self.device)
147
+ for DP in self.DPs:
148
+ out += DP.generate(condition=emb, step_list=step_list, initial_noise=torch.randn_like(out))
149
+ return out
150
+
151
+ def re_eq(self, wav: torch.Tensor, ref: torch.Tensor, n_bands: int = 32, strictness: float = 1):
152
+ """Match the eq to the encodec output by matching the standard deviation of some frequency bands.
153
+ Args:
154
+ wav (torch.Tensor): Audio to equalize.
155
+ ref (torch.Tensor): Reference audio from which we match the spectrogram.
156
+ n_bands (int): Number of bands of the eq.
157
+ strictness (float): How strict the matching is: 0 is no matching, 1 is exact matching.
158
+ """
159
+ split = julius.SplitBands(n_bands=n_bands, sample_rate=self.codec_model.sample_rate).to(wav.device)
160
+ bands = split(wav)
161
+ bands_ref = split(ref)
162
+ out = torch.zeros_like(ref)
163
+ for i in range(n_bands):
164
+ out += bands[i] * (bands_ref[i].std() / bands[i].std()) ** strictness
165
+ return out
166
+
167
+ def regenerate(self, wav: torch.Tensor, sample_rate: int):
168
+ """Regenerate a waveform through compression and diffusion regeneration.
169
+ Args:
170
+ wav (torch.Tensor): Original 'ground truth' audio.
171
+ sample_rate (int): Sample rate of the input (and output) wav.
172
+ """
173
+ if sample_rate != self.codec_model.sample_rate:
174
+ wav = julius.resample_frac(wav, sample_rate, self.codec_model.sample_rate)
175
+ emb = self.get_condition(wav, sample_rate=self.codec_model.sample_rate)
176
+ size = wav.size()
177
+ out = self.generate(emb, size=size)
178
+ if sample_rate != self.codec_model.sample_rate:
179
+ out = julius.resample_frac(out, self.codec_model.sample_rate, sample_rate)
180
+ return out
181
+
182
+ def tokens_to_wav(self, tokens: torch.Tensor, n_bands: int = 32):
183
+ """Generate Waveform audio with diffusion from the discrete codes.
184
+ Args:
185
+ tokens (torch.Tensor): Discrete codes.
186
+ n_bands (int): Bands for the eq matching.
187
+ """
188
+ wav_encodec = self.codec_model.decode(tokens)
189
+ condition = self.get_emb(tokens)
190
+ wav_diffusion = self.generate(emb=condition, size=wav_encodec.size())
191
+ return self.re_eq(wav=wav_diffusion, ref=wav_encodec, n_bands=n_bands)
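For orientation, here is a minimal usage sketch of the MultiBandDiffusion class defined above (not part of this commit). It assumes the class is re-exported from audiocraft.models, that torchaudio is available for I/O, and that 'sample.wav' is a hypothetical input file; the pretrained checkpoints are downloaded on first use.

import torch
import torchaudio

from audiocraft.models import MultiBandDiffusion  # import path assumed

# Diffusion decoder paired with the 24 kHz EnCodec model at 3 kbps (n_q = 4 codebooks).
mbd = MultiBandDiffusion.get_mbd_24khz(bw=3.0)

wav, sr = torchaudio.load('sample.wav')   # hypothetical input file, shape [C, T]
wav = wav.unsqueeze(0).to(mbd.device)     # [B, C, T] as expected by the compression model

with torch.no_grad():
    # Encode to discrete tokens, then decode with the diffusion models instead of EnCodec's decoder.
    regenerated = mbd.regenerate(wav, sample_rate=sr)

# regenerate() resamples back to the input sample rate before returning.
torchaudio.save('regenerated.wav', regenerated.squeeze(0).cpu(), sr)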
audiocraft/models/musicgen.py ADDED
@@ -0,0 +1,308 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Main model for using MusicGen. This will combine all the required components
9
+ and provide easy access to the generation API.
10
+ """
11
+
12
+ import typing as tp
13
+ import warnings
14
+
15
+ import torch
16
+
17
+ from .encodec import CompressionModel
18
+ from .genmodel import BaseGenModel
19
+ from .lm import LMModel
20
+ from .builders import get_debug_compression_model, get_debug_lm_model
21
+ from .loaders import load_compression_model, load_lm_model
22
+ from ..data.audio_utils import convert_audio
23
+ from ..modules.conditioners import ConditioningAttributes, WavCondition
24
+
25
+
26
+ MelodyList = tp.List[tp.Optional[torch.Tensor]]
27
+ MelodyType = tp.Union[torch.Tensor, MelodyList]
28
+
29
+
30
+ # backward compatible names mapping
31
+ _HF_MODEL_CHECKPOINTS_MAP = {
32
+ "small": "facebook/musicgen-small",
33
+ "medium": "facebook/musicgen-medium",
34
+ "large": "facebook/musicgen-large",
35
+ "melody": "facebook/musicgen-melody",
36
+ }
37
+
38
+
39
+ class MusicGen(BaseGenModel):
40
+ """MusicGen main model with convenient generation API.
41
+
42
+ Args:
43
+ name (str): name of the model.
44
+ compression_model (CompressionModel): Compression model
45
+ used to map audio to invertible discrete representations.
46
+ lm (LMModel): Language model over discrete representations.
47
+ max_duration (float, optional): maximum duration the model can produce,
48
+ otherwise, inferred from the training params.
49
+ """
50
+ def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
51
+ max_duration: tp.Optional[float] = None):
52
+ super().__init__(name, compression_model, lm, max_duration)
53
+ self.set_generation_params(duration=15) # default duration
54
+
55
+ @staticmethod
56
+ def get_pretrained(name: str = 'facebook/musicgen-melody', device=None):
57
+ """Return pretrained model, we provide four models:
58
+ - facebook/musicgen-small (300M), text to music,
59
+ # see: https://huggingface.co/facebook/musicgen-small
60
+ - facebook/musicgen-medium (1.5B), text to music,
61
+ # see: https://huggingface.co/facebook/musicgen-medium
62
+ - facebook/musicgen-melody (1.5B) text to music and text+melody to music,
63
+ # see: https://huggingface.co/facebook/musicgen-melody
64
+ - facebook/musicgen-large (3.3B), text to music,
65
+ # see: https://huggingface.co/facebook/musicgen-large
66
+ """
67
+ if device is None:
68
+ if torch.cuda.device_count():
69
+ device = 'cuda'
70
+ else:
71
+ device = 'cpu'
72
+
73
+ if name == 'debug':
74
+ # used only for unit tests
75
+ compression_model = get_debug_compression_model(device)
76
+ lm = get_debug_lm_model(device)
77
+ return MusicGen(name, compression_model, lm, max_duration=30)
78
+
79
+ if name in _HF_MODEL_CHECKPOINTS_MAP:
80
+ warnings.warn(
81
+ "MusicGen pretrained model relying on deprecated checkpoint mapping. " +
82
+ f"Please use full pre-trained id instead: facebook/musicgen-{name}")
83
+ name = _HF_MODEL_CHECKPOINTS_MAP[name]
84
+
85
+ lm = load_lm_model(name, device=device)
86
+ compression_model = load_compression_model(name, device=device)
87
+ if 'self_wav' in lm.condition_provider.conditioners:
88
+ lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
89
+ lm.condition_provider.conditioners['self_wav']._use_masking = False
90
+
91
+ return MusicGen(name, compression_model, lm)
92
+
93
+ def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
94
+ top_p: float = 0.0, temperature: float = 1.0,
95
+ duration: float = 30.0, cfg_coef: float = 3.0,
96
+ two_step_cfg: bool = False, extend_stride: float = 18):
97
+ """Set the generation parameters for MusicGen.
98
+
99
+ Args:
100
+ use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
101
+ top_k (int, optional): top_k used for sampling. Defaults to 250.
102
+ top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
103
+ temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
104
+ duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
105
+ cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
106
+ two_step_cfg (bool, optional): If True, performs 2 forward passes for Classifier Free Guidance,
107
+ instead of batching together the two. This has some impact on how things
108
+ are padded but seems to have little impact in practice.
109
+ extend_stride (float, optional): when doing extended generation (i.e. more than 30 seconds), by how much
110
+ should we extend the audio each time. Larger values will mean less context is
111
+ preserved, and shorter values will require extra computations.
112
+ """
113
+ assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
114
+ self.extend_stride = extend_stride
115
+ self.duration = duration
116
+ self.generation_params = {
117
+ 'use_sampling': use_sampling,
118
+ 'temp': temperature,
119
+ 'top_k': top_k,
120
+ 'top_p': top_p,
121
+ 'cfg_coef': cfg_coef,
122
+ 'two_step_cfg': two_step_cfg,
123
+ }
124
+
125
+ def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
126
+ melody_sample_rate: int, progress: bool = False,
127
+ return_tokens: bool = False) -> tp.Union[torch.Tensor,
128
+ tp.Tuple[torch.Tensor, torch.Tensor]]:
129
+ """Generate samples conditioned on text and melody.
130
+
131
+ Args:
132
+ descriptions (list of str): A list of strings used as text conditioning.
133
+ melody_wavs (torch.Tensor or list of Tensor): A batch of waveforms used as
134
+ melody conditioning. Should have shape [B, C, T] with B matching the description length,
135
+ C=1 or 2. It can be [C, T] if there is a single description. It can also be
136
+ a list of [C, T] tensors.
137
+ melody_sample_rate (int): Sample rate of the melody waveforms.
138
+ progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
139
+ """
140
+ if isinstance(melody_wavs, torch.Tensor):
141
+ if melody_wavs.dim() == 2:
142
+ melody_wavs = melody_wavs[None]
143
+ if melody_wavs.dim() != 3:
144
+ raise ValueError("Melody wavs should have a shape [B, C, T].")
145
+ melody_wavs = list(melody_wavs)
146
+ else:
147
+ for melody in melody_wavs:
148
+ if melody is not None:
149
+ assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
150
+
151
+ melody_wavs = [
152
+ convert_audio(wav, melody_sample_rate, self.sample_rate, self.audio_channels)
153
+ if wav is not None else None
154
+ for wav in melody_wavs]
155
+ attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
156
+ melody_wavs=melody_wavs)
157
+ assert prompt_tokens is None
158
+ tokens = self._generate_tokens(attributes, prompt_tokens, progress)
159
+ if return_tokens:
160
+ return self.generate_audio(tokens), tokens
161
+ return self.generate_audio(tokens)
162
+
163
+ @torch.no_grad()
164
+ def _prepare_tokens_and_attributes(
165
+ self,
166
+ descriptions: tp.Sequence[tp.Optional[str]],
167
+ prompt: tp.Optional[torch.Tensor],
168
+ melody_wavs: tp.Optional[MelodyList] = None,
169
+ ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
170
+ """Prepare model inputs.
171
+
172
+ Args:
173
+ descriptions (list of str): A list of strings used as text conditioning.
174
+ prompt (torch.Tensor): A batch of waveforms used for continuation.
175
+ melody_wavs (torch.Tensor, optional): A batch of waveforms
176
+ used as melody conditioning. Defaults to None.
177
+ """
178
+ attributes = [
179
+ ConditioningAttributes(text={'description': description})
180
+ for description in descriptions]
181
+
182
+ if melody_wavs is None:
183
+ for attr in attributes:
184
+ attr.wav['self_wav'] = WavCondition(
185
+ torch.zeros((1, 1, 1), device=self.device),
186
+ torch.tensor([0], device=self.device),
187
+ sample_rate=[self.sample_rate],
188
+ path=[None])
189
+ else:
190
+ if 'self_wav' not in self.lm.condition_provider.conditioners:
191
+ raise RuntimeError("This model doesn't support melody conditioning. "
192
+ "Use the `melody` model.")
193
+ assert len(melody_wavs) == len(descriptions), \
194
+ f"number of melody wavs must match number of descriptions! " \
195
+ f"got melody len={len(melody_wavs)}, and descriptions len={len(descriptions)}"
196
+ for attr, melody in zip(attributes, melody_wavs):
197
+ if melody is None:
198
+ attr.wav['self_wav'] = WavCondition(
199
+ torch.zeros((1, 1, 1), device=self.device),
200
+ torch.tensor([0], device=self.device),
201
+ sample_rate=[self.sample_rate],
202
+ path=[None])
203
+ else:
204
+ attr.wav['self_wav'] = WavCondition(
205
+ melody[None].to(device=self.device),
206
+ torch.tensor([melody.shape[-1]], device=self.device),
207
+ sample_rate=[self.sample_rate],
208
+ path=[None],
209
+ )
210
+
211
+ if prompt is not None:
212
+ if descriptions is not None:
213
+ assert len(descriptions) == len(prompt), "Prompt and nb. descriptions doesn't match"
214
+ prompt = prompt.to(self.device)
215
+ prompt_tokens, scale = self.compression_model.encode(prompt)
216
+ assert scale is None
217
+ else:
218
+ prompt_tokens = None
219
+ return attributes, prompt_tokens
220
+
221
+ def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
222
+ prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
223
+ """Generate discrete audio tokens given audio prompt and/or conditions.
224
+
225
+ Args:
226
+ attributes (list of ConditioningAttributes): Conditions used for generation (text/melody).
227
+ prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
228
+ progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
229
+ Returns:
230
+ torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
231
+ """
232
+ total_gen_len = int(self.duration * self.frame_rate)
233
+ max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
234
+ current_gen_offset: int = 0
235
+
236
+ def _progress_callback(generated_tokens: int, tokens_to_generate: int):
237
+ generated_tokens += current_gen_offset
238
+ if self._progress_callback is not None:
239
+ # Note that total_gen_len might be quite wrong depending on the
240
+ # codebook pattern used, but with delay it is almost accurate.
241
+ self._progress_callback(generated_tokens, tokens_to_generate)
242
+ else:
243
+ print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
244
+
245
+ if prompt_tokens is not None:
246
+ assert max_prompt_len >= prompt_tokens.shape[-1], \
247
+ "Prompt is longer than audio to generate"
248
+
249
+ callback = None
250
+ if progress:
251
+ callback = _progress_callback
252
+
253
+ if self.duration <= self.max_duration:
254
+ # generate by sampling from LM, simple case.
255
+ with self.autocast:
256
+ gen_tokens = self.lm.generate(
257
+ prompt_tokens, attributes,
258
+ callback=callback, max_gen_len=total_gen_len, **self.generation_params)
259
+
260
+ else:
261
+ # now this gets a bit messier, we need to handle prompts,
262
+ # melody conditioning etc.
263
+ ref_wavs = [attr.wav['self_wav'] for attr in attributes]
264
+ all_tokens = []
265
+ if prompt_tokens is None:
266
+ prompt_length = 0
267
+ else:
268
+ all_tokens.append(prompt_tokens)
269
+ prompt_length = prompt_tokens.shape[-1]
270
+
271
+ assert self.extend_stride is not None, "Stride should be defined to generate beyond max_duration"
272
+ assert self.extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
273
+ stride_tokens = int(self.frame_rate * self.extend_stride)
274
+
275
+ while current_gen_offset + prompt_length < total_gen_len:
276
+ time_offset = current_gen_offset / self.frame_rate
277
+ chunk_duration = min(self.duration - time_offset, self.max_duration)
278
+ max_gen_len = int(chunk_duration * self.frame_rate)
279
+ for attr, ref_wav in zip(attributes, ref_wavs):
280
+ wav_length = ref_wav.length.item()
281
+ if wav_length == 0:
282
+ continue
283
+ # We will extend the wav periodically if it is not long enough.
284
+ # we have to do it here rather than in conditioners.py as otherwise
285
+ # we wouldn't have the full wav.
286
+ initial_position = int(time_offset * self.sample_rate)
287
+ wav_target_length = int(self.max_duration * self.sample_rate)
288
+ positions = torch.arange(initial_position,
289
+ initial_position + wav_target_length, device=self.device)
290
+ attr.wav['self_wav'] = WavCondition(
291
+ ref_wav[0][..., positions % wav_length],
292
+ torch.full_like(ref_wav[1], wav_target_length),
293
+ [self.sample_rate] * ref_wav[0].size(0),
294
+ [None], [0.])
295
+ with self.autocast:
296
+ gen_tokens = self.lm.generate(
297
+ prompt_tokens, attributes,
298
+ callback=callback, max_gen_len=max_gen_len, **self.generation_params)
299
+ if prompt_tokens is None:
300
+ all_tokens.append(gen_tokens)
301
+ else:
302
+ all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
303
+ prompt_tokens = gen_tokens[:, :, stride_tokens:]
304
+ prompt_length = prompt_tokens.shape[-1]
305
+ current_gen_offset += stride_tokens
306
+
307
+ gen_tokens = torch.cat(all_tokens, dim=-1)
308
+ return gen_tokens
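As a usage illustration (not part of this commit), the sketch below drives the MusicGen API defined above for melody-conditioned generation, including windowed generation beyond max_duration via extend_stride. The import path is assumed to be audiocraft.models, 'melody.wav' is a hypothetical reference recording, and the pretrained checkpoint is downloaded on first use.

import torchaudio

from audiocraft.models import MusicGen  # import path assumed

model = MusicGen.get_pretrained('facebook/musicgen-melody')

# Durations above max_duration (typically 30 s for these checkpoints) are generated in
# overlapping windows; extend_stride controls how far each window advances and must stay
# below max_duration.
model.set_generation_params(duration=45, extend_stride=18, top_k=250, cfg_coef=3.0)

melody, sr = torchaudio.load('melody.wav')             # hypothetical reference, shape [C, T]
wavs = model.generate_with_chroma(
    descriptions=['lo-fi hip hop with warm piano chords'],
    melody_wavs=melody[None],                           # [B, C, T], B matching the descriptions
    melody_sample_rate=sr,
    progress=True,
)
torchaudio.save('musicgen_out.wav', wavs[0].cpu(), model.sample_rate)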
audiocraft/models/unet.py ADDED
@@ -0,0 +1,214 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ PyTorch U-Net module used for diffusion.
9
+ """
10
+
11
+ from dataclasses import dataclass
12
+ import typing as tp
13
+
14
+ import torch
15
+ from torch import nn
16
+ from torch.nn import functional as F
17
+ from audiocraft.modules.transformer import StreamingTransformer, create_sin_embedding
18
+
19
+
20
+ @dataclass
21
+ class Output:
22
+ sample: torch.Tensor
23
+
24
+
25
+ def get_model(cfg, channels: int, side: int, num_steps: int):
26
+ if cfg.model == 'unet':
27
+ return DiffusionUnet(
28
+ chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
29
+ else:
30
+ raise RuntimeError('Not Implemented')
31
+
32
+
33
+ class ResBlock(nn.Module):
34
+ def __init__(self, channels: int, kernel: int = 3, norm_groups: int = 4,
35
+ dilation: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
36
+ dropout: float = 0.):
37
+ super().__init__()
38
+ stride = 1
39
+ padding = dilation * (kernel - stride) // 2
40
+ Conv = nn.Conv1d
41
+ Drop = nn.Dropout1d
42
+ self.norm1 = nn.GroupNorm(norm_groups, channels)
43
+ self.conv1 = Conv(channels, channels, kernel, 1, padding, dilation=dilation)
44
+ self.activation1 = activation()
45
+ self.dropout1 = Drop(dropout)
46
+
47
+ self.norm2 = nn.GroupNorm(norm_groups, channels)
48
+ self.conv2 = Conv(channels, channels, kernel, 1, padding, dilation=dilation)
49
+ self.activation2 = activation()
50
+ self.dropout2 = Drop(dropout)
51
+
52
+ def forward(self, x):
53
+ h = self.dropout1(self.conv1(self.activation1(self.norm1(x))))
54
+ h = self.dropout2(self.conv2(self.activation2(self.norm2(h))))
55
+ return x + h
56
+
57
+
58
+ class DecoderLayer(nn.Module):
59
+ def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
60
+ norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
61
+ dropout: float = 0.):
62
+ super().__init__()
63
+ padding = (kernel - stride) // 2
64
+ self.res_blocks = nn.Sequential(
65
+ *[ResBlock(chin, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
66
+ for idx in range(res_blocks)])
67
+ self.norm = nn.GroupNorm(norm_groups, chin)
68
+ ConvTr = nn.ConvTranspose1d
69
+ self.convtr = ConvTr(chin, chout, kernel, stride, padding, bias=False)
70
+ self.activation = activation()
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ x = self.res_blocks(x)
74
+ x = self.norm(x)
75
+ x = self.activation(x)
76
+ x = self.convtr(x)
77
+ return x
78
+
79
+
80
+ class EncoderLayer(nn.Module):
81
+ def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
82
+ norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
83
+ dropout: float = 0.):
84
+ super().__init__()
85
+ padding = (kernel - stride) // 2
86
+ Conv = nn.Conv1d
87
+ self.conv = Conv(chin, chout, kernel, stride, padding, bias=False)
88
+ self.norm = nn.GroupNorm(norm_groups, chout)
89
+ self.activation = activation()
90
+ self.res_blocks = nn.Sequential(
91
+ *[ResBlock(chout, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
92
+ for idx in range(res_blocks)])
93
+
94
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
95
+ B, C, T = x.shape
96
+ stride, = self.conv.stride
97
+ pad = (stride - (T % stride)) % stride
98
+ x = F.pad(x, (0, pad))
99
+
100
+ x = self.conv(x)
101
+ x = self.norm(x)
102
+ x = self.activation(x)
103
+ x = self.res_blocks(x)
104
+ return x
105
+
106
+
107
+ class BLSTM(nn.Module):
108
+ """BiLSTM with same hidden units as input dim.
109
+ """
110
+ def __init__(self, dim, layers=2):
111
+ super().__init__()
112
+ self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
113
+ self.linear = nn.Linear(2 * dim, dim)
114
+
115
+ def forward(self, x):
116
+ x = x.permute(2, 0, 1)
117
+ x = self.lstm(x)[0]
118
+ x = self.linear(x)
119
+ x = x.permute(1, 2, 0)
120
+ return x
121
+
122
+
123
+ class DiffusionUnet(nn.Module):
124
+ def __init__(self, chin: int = 3, hidden: int = 24, depth: int = 3, growth: float = 2.,
125
+ max_channels: int = 10_000, num_steps: int = 1000, emb_all_layers=False, cross_attention: bool = False,
126
+ bilstm: bool = False, transformer: bool = False,
127
+ codec_dim: tp.Optional[int] = None, **kwargs):
128
+ super().__init__()
129
+ self.encoders = nn.ModuleList()
130
+ self.decoders = nn.ModuleList()
131
+ self.embeddings: tp.Optional[nn.ModuleList] = None
132
+ self.embedding = nn.Embedding(num_steps, hidden)
133
+ if emb_all_layers:
134
+ self.embeddings = nn.ModuleList()
135
+ self.condition_embedding: tp.Optional[nn.Module] = None
136
+ for d in range(depth):
137
+ encoder = EncoderLayer(chin, hidden, **kwargs)
138
+ decoder = DecoderLayer(hidden, chin, **kwargs)
139
+ self.encoders.append(encoder)
140
+ self.decoders.insert(0, decoder)
141
+ if emb_all_layers and d > 0:
142
+ assert self.embeddings is not None
143
+ self.embeddings.append(nn.Embedding(num_steps, hidden))
144
+ chin = hidden
145
+ hidden = min(int(chin * growth), max_channels)
146
+ self.bilstm: tp.Optional[nn.Module]
147
+ if bilstm:
148
+ self.bilstm = BLSTM(chin)
149
+ else:
150
+ self.bilstm = None
151
+ self.use_transformer = transformer
152
+ self.cross_attention = False
153
+ if transformer:
154
+ self.cross_attention = cross_attention
155
+ self.transformer = StreamingTransformer(chin, 8, 6, bias_ff=False, bias_attn=False,
156
+ cross_attention=cross_attention)
157
+
158
+ self.use_codec = False
159
+ if codec_dim is not None:
160
+ self.conv_codec = nn.Conv1d(codec_dim, chin, 1)
161
+ self.use_codec = True
162
+
163
+ def forward(self, x: torch.Tensor, step: tp.Union[int, torch.Tensor], condition: tp.Optional[torch.Tensor] = None):
164
+ skips = []
165
+ bs = x.size(0)
166
+ z = x
167
+ view_args = [1]
168
+ if type(step) is torch.Tensor:
169
+ step_tensor = step
170
+ else:
171
+ step_tensor = torch.tensor([step], device=x.device, dtype=torch.long).expand(bs)
172
+
173
+ for idx, encoder in enumerate(self.encoders):
174
+ z = encoder(z)
175
+ if idx == 0:
176
+ z = z + self.embedding(step_tensor).view(bs, -1, *view_args).expand_as(z)
177
+ elif self.embeddings is not None:
178
+ z = z + self.embeddings[idx - 1](step_tensor).view(bs, -1, *view_args).expand_as(z)
179
+
180
+ skips.append(z)
181
+
182
+ if self.use_codec: # insert condition in the bottleneck
183
+ assert condition is not None, "Model defined for conditionnal generation"
184
+ condition_emb = self.conv_codec(condition) # reshape to the bottleneck dim
185
+ assert condition_emb.size(-1) <= 2 * z.size(-1), \
186
+ f"You are downsampling the conditionning with factor >=2 : {condition_emb.size(-1)=} and {z.size(-1)=}"
187
+ if not self.cross_attention:
188
+
189
+ condition_emb = torch.nn.functional.interpolate(condition_emb, z.size(-1))
190
+ assert z.size() == condition_emb.size()
191
+ z += condition_emb
192
+ cross_attention_src = None
193
+ else:
194
+ cross_attention_src = condition_emb.permute(0, 2, 1) # B, T, C
195
+ B, T, C = cross_attention_src.shape
196
+ positions = torch.arange(T, device=x.device).view(1, -1, 1)
197
+ pos_emb = create_sin_embedding(positions, C, max_period=10_000, dtype=cross_attention_src.dtype)
198
+ cross_attention_src = cross_attention_src + pos_emb
199
+ if self.use_transformer:
200
+ z = self.transformer(z.permute(0, 2, 1), cross_attention_src=cross_attention_src).permute(0, 2, 1)
201
+ else:
202
+ if self.bilstm is None:
203
+ z = torch.zeros_like(z)
204
+ else:
205
+ z = self.bilstm(z)
206
+
207
+ for decoder in self.decoders:
208
+ s = skips.pop(-1)
209
+ z = z[:, :, :s.shape[2]]
210
+ z = z + s
211
+ z = decoder(z)
212
+
213
+ z = z[:, :, :x.shape[2]]
214
+ return Output(z)
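To make the expected tensor shapes concrete, here is a small smoke-test sketch (not part of this commit) that runs a forward pass through DiffusionUnet; all hyperparameter values are illustrative. Note that with the defaults shown above (no transformer and no BiLSTM) the bottleneck is zeroed and only the skip connections carry signal.

import torch

from audiocraft.models.unet import DiffusionUnet  # module added in this commit

# Illustrative hyperparameters; hidden channel counts stay divisible by the GroupNorm groups (4).
unet = DiffusionUnet(chin=1, hidden=24, depth=3, num_steps=1000)

x = torch.randn(2, 1, 16_000)            # [B, C, T] noisy waveform batch
step = torch.randint(0, 1000, (2,))      # one diffusion step index per batch element

out = unet(x, step)                      # forward returns the Output dataclass
assert out.sample.shape == x.shape       # the decoder crops back to the input length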