unpairedelectron07 committed
Commit
e3061ad
1 Parent(s): 699b46d

Upload 7 files

audiocraft/solvers/audiogen.py ADDED
@@ -0,0 +1,19 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from . import builders, musicgen
+
+
+ class AudioGenSolver(musicgen.MusicGenSolver):
+     """Solver for the AudioGen re-implementation training task.
+
+     Note that this implementation does not strictly follow
+     the method proposed in https://arxiv.org/abs/2209.15352
+     but is derived from MusicGen's training pipeline.
+
+     More information can be found in the AudioGen model card.
+     """
+     DATASET_TYPE: builders.DatasetType = builders.DatasetType.SOUND
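Nothing else changes relative to MusicGen: the solver inherits the full training pipeline and only swaps the dataset type to sound data. A minimal sketch of how that class attribute is consumed downstream (mirroring `get_audio_datasets` from builders.py below; the config object `cfg` is assumed to come from the usual Hydra/Dora config tree):

    from audiocraft.solvers import builders
    from audiocraft.solvers.audiogen import AudioGenSolver

    # DATASET_TYPE selects SoundDataset instead of MusicDataset when
    # the solver builds its dataloaders.
    dataloaders = builders.get_audio_datasets(cfg, AudioGenSolver.DATASET_TYPE)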
audiocraft/solvers/base.py ADDED
@@ -0,0 +1,631 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from abc import ABC, abstractmethod
+ from contextlib import contextmanager
+ from pathlib import Path
+ import typing as tp
+
+ import flashy
+ import omegaconf
+ import torch
+ from torch import nn
+
+ from .. import optim
+ from ..optim import fsdp
+ from ..utils import checkpoint
+ from ..utils.autocast import TorchAutocast
+ from ..utils.best_state import BestStateDictManager
+ from ..utils.deadlock import DeadlockDetect
+ from ..utils.profiler import Profiler
+ from ..utils.utils import copy_state, dict_from_config, model_hash, with_rank_rng
+
+
+ class StandardSolver(ABC, flashy.BaseSolver):
+     """Standard solver for AudioCraft.
+
+     The standard solver implements a base training loop with the following stages,
+     which are all expected to be defined for solvers in AudioCraft:
+     train, valid, evaluate and generate. It also provides default management of
+     Dora history replay, checkpoint management across epochs, and logging configuration.
+
+     AudioCraft solvers must inherit from the StandardSolver and define the methods
+     associated with each stage as well as the show, build_model and build_dataloaders methods.
+     """
+     def __init__(self, cfg: omegaconf.DictConfig):
+         super().__init__()
+         self.logger.info(f"Instantiating solver {self.__class__.__name__} for XP {self.xp.sig}")
+         self.logger.info(f"All XP logs are stored in {self.xp.folder}")
+         self.cfg = cfg
+         self.device = cfg.device
+         self.model: nn.Module
+         self._continue_best_source_keys = ['best_state', 'fsdp_best_state']
+         self._fsdp_modules: tp.List[fsdp.FSDP] = []
+         self._ema_sources: nn.ModuleDict = nn.ModuleDict()
+         self.ema: tp.Optional[optim.ModuleDictEMA] = None
+         self.dataloaders: tp.Dict[str, torch.utils.data.DataLoader] = dict()
+         self._log_updates = self.cfg.logging.get('log_updates', 10)
+         if self.cfg.logging.log_tensorboard:
+             self.init_tensorboard(**self.cfg.get('tensorboard'))
+         if self.cfg.logging.log_wandb and self:
+             self.init_wandb(**self.cfg.get('wandb'))
+         # keep a copy of the best performing state for stateful objects
+         # used for evaluation and generation stages
+         dtype_best: tp.Optional[torch.dtype] = None
+         if self.cfg.fsdp.use:
+             dtype_best = getattr(torch, self.cfg.fsdp.param_dtype)  # type: ignore
+             assert isinstance(dtype_best, torch.dtype)
+         elif self.cfg.autocast:
+             dtype_best = getattr(torch, self.cfg.autocast_dtype)  # type: ignore
+             assert isinstance(dtype_best, torch.dtype)
+         self.best_state: BestStateDictManager = BestStateDictManager(dtype=dtype_best)
+         # Hacky support for keeping a copy of the full best state in rank0.
+         self.fsdp_best_state: tp.Dict[str, tp.Any] = {}
+         self.register_stateful('best_state', 'fsdp_best_state')  # register best_state object to keep it in state_dict
+         self._new_best_state: bool = False  # should save a new checkpoint
+         # instantiate datasets and appropriate number of updates per epoch
+         self.build_dataloaders()
+         if self.cfg.execute_only is None:
+             assert 'train' in self.dataloaders, "The train dataset split must be provided."
+             assert 'valid' in self.dataloaders, "The valid dataset split must be provided."
+         self.train_updates_per_epoch = len(self.dataloaders['train']) if 'train' in self.dataloaders else 0
+         if self.cfg.optim.updates_per_epoch:
+             self.train_updates_per_epoch = self.cfg.optim.updates_per_epoch
+         self.total_updates = self.train_updates_per_epoch * self.cfg.optim.epochs
+         # instantiate model & exponential moving average on the model
+         self.build_model()
+         self.logger.info("Model hash: %s", model_hash(self.model))
+         assert 'model' in self.stateful.sources, \
+             "Please register the model to stateful with self.register_stateful('model') in build_model."
+         self.profiler = Profiler(self.model, **self.cfg.profiler)
+         self.initialize_ema()
+         self.register_stateful('ema')
+         assert self.ema is None or 'ema' in self.stateful.sources, \
+             "Please register the ema to stateful with self.register_stateful('ema') in build_model."
+         self.deadlock_detect = DeadlockDetect(**self.cfg.deadlock)
+         # basic statistics on the trained model
+         model_size = sum(p.numel() for p in self.model.parameters() if p.requires_grad) / 1e6
+         # one copy of grad, one copy of momentum, one copy of denominator, plus the
+         # model weights themselves, at 4 bytes per float!
+         mem_usage = model_size * 4 * 4 / 1000
+         self.logger.info("Model size: %.2f M params", model_size)
+         self.logger.info("Base memory usage, with model, grad and optim: %.2f GB", mem_usage)
+
+     @property
+     def autocast(self):
+         """Convenient autocast (or not) using the solver configuration."""
+         return TorchAutocast(enabled=self.cfg.autocast, device_type=self.device, dtype=self.autocast_dtype)
+
+     def _get_state_source(self, name) -> flashy.state.StateDictSource:
+         # Internal utility to get a state source from the solver
+         return self.stateful.sources[name]
+
+     @property
+     def best_metric_name(self) -> tp.Optional[str]:
+         """Metric name used to identify the best state. This metric should be stored in the metrics
+         of the stage used for best state identification (most likely, `valid`). If None, then
+         no best state is saved.
+         """
+         return None
+
+     def register_best_state(self, *args: str):
+         """Register state sources in `BestStateDictManager` to keep their best states along with their
+         latest states. The best state will be used at evaluation stages instead of the latest states.
+
+         Shortcut around the `BestStateDictManager.register` method. You can pass any number of
+         attributes, including nested attributes; those will be included in the checkpoints
+         and automatically restored when `BaseSolver.restore` is called.
+         """
+         for name in args:
+             state_source = self._get_state_source(name)
+             assert name in self.stateful.sources, "Registered states in best should be registered in stateful first!"
+             self.best_state.register(name, state_source)
+
+     def register_ema(self, *args: str):
+         """Register state sources for exponential moving average.
+
+         The registered sources are used to instantiate a ModuleDictEMA instance.
+         The ModuleDictEMA keeps a `nn.ModuleDict` module that is updated when self.ema.step() is called
+         and swapped with the original state sources with the self.swap_ema_state() method.
+
+         Usage:
+             self.register_ema('model')
+         """
+         assert self.ema is None, "Cannot register state source to already instantiated EMA."
+         for name in args:
+             self._ema_sources[name] = getattr(self, name)
+
+     def wrap_with_fsdp(self, model: torch.nn.Module, *args, **kwargs):
+         model = fsdp.wrap_with_fsdp(self.cfg.fsdp, model, *args, **kwargs)
+         if isinstance(model, fsdp.FSDP):
+             self._fsdp_modules.append(model)
+         return model
+
+     def update_best_state_from_stage(self, stage_name: str = 'valid'):
+         """Update latest best state based on pending metrics of a given stage. This method relies
+         on the `BestStateDictManager.update` method to update the best state_dict with latest weights
+         if the registered states happen to match the best performing setup.
+         """
+         if self.best_metric_name is None:
+             # when no best metric is defined, the last state is always the best
+             self._new_best_state = True
+             self.logger.info("Updating best state with current state.")
+         else:
+             assert stage_name in self._pending_metrics, f"Metrics for stage {stage_name} not found."
+             assert self.best_metric_name in self._pending_metrics[stage_name], \
+                 f"Best metric not found in {stage_name} metrics. Cannot register best state"
+             current_score = self._pending_metrics[stage_name][self.best_metric_name]
+             all_best_metric_scores = [
+                 past_metrics[stage_name][self.best_metric_name]
+                 for past_metrics in self.history
+             ]
+             all_best_metric_scores.append(current_score)
+             best_score = min(all_best_metric_scores)
+             self._new_best_state = current_score == best_score
+             if self._new_best_state:
+                 old_best = min(all_best_metric_scores[:-1] + [float('inf')])
+                 self.logger.info(
+                     f"New best state with {self.best_metric_name}={current_score:.3f} (was {old_best:.3f})")
+
+         if self._new_best_state:
+             if self.cfg.fsdp.use:
+                 # this will give an empty state dict on all ranks but the rank 0
+                 # which will have a copy in memory of the full model.
+                 with fsdp.switch_to_full_state_dict(self._fsdp_modules):
+                     for name in self.best_state.states.keys():
+                         state_source = self._get_state_source(name)
+                         self.best_state.update(name, state_source)
+                     # we save to a different dict.
+                     self.fsdp_best_state.update(self.best_state.state_dict())
+                 # We cannot efficiently load fsdp_best_state when using FSDP,
+                 # so we have to do a second pass, with the local shards.
+             for name in self.best_state.states.keys():
+                 state_source = self._get_state_source(name)
+                 self.best_state.update(name, state_source)
+
+     def _load_new_state_dict(self, state_dict: dict) -> dict:
+         old_states = {}
+         for name, new_state in state_dict.items():
+             state_source = self._get_state_source(name)
+             old_states[name] = copy_state(state_source.state_dict())
+             state_source.load_state_dict(new_state)
+         return old_states
+
+     @contextmanager
+     def swap_best_state(self):
+         self.logger.debug(f"Swapping to best state for: {', '.join(self.best_state.state_dict().keys())}")
+         old_states = self._load_new_state_dict(self.best_state.state_dict())
+         try:
+             yield
+         finally:
+             self.logger.debug("Swapping back from best to original state")
+             for name, old_state in old_states.items():
+                 state_source = self._get_state_source(name)
+                 state_source.load_state_dict(old_state)
+
+     @contextmanager
+     def swap_ema_state(self):
+         if self.ema is None:
+             yield
+         else:
+             ema_state_dict = self.ema.state_dict()['state']
+             self.logger.debug(f"Swapping to EMA state for: {', '.join(ema_state_dict.keys())}")
+             old_states = self._load_new_state_dict(ema_state_dict)
+             try:
+                 yield
+             finally:
+                 self.logger.debug("Swapping back from EMA state to original state")
+                 for name, old_state in old_states.items():
+                     state_source = self._get_state_source(name)
+                     state_source.load_state_dict(old_state)
+
+     @property
+     def is_training(self):
+         return self.current_stage == 'train'
+
+     def log_model_summary(self, model: nn.Module):
+         """Log model summary, architecture and size of the model."""
+         self.logger.info(model)
+         mb = sum(p.numel() for p in model.parameters()) * 4 / 2 ** 20
+         self.logger.info("Size: %.1f MB", mb)
+
+     @abstractmethod
+     def build_model(self):
+         """Method to implement to initialize model."""
+         ...
+
+     def initialize_ema(self):
+         """Initialize exponential moving average with the registered sources.
+         The EMA object is created if the optim.ema.model.decay value is non-null.
+         """
+         from .builders import get_ema
+         self.ema = get_ema(self._ema_sources, self.cfg.optim.ema)
+         if self.ema is None:
+             self.logger.info('No EMA on the model.')
+         else:
+             assert self.cfg.optim.ema.updates > 0
+             self.logger.info(
+                 f'Initializing EMA on the model with decay = {self.ema.decay}'
+                 f' every {self.cfg.optim.ema.updates} updates'
+             )
+
+     @abstractmethod
+     def build_dataloaders(self):
+         """Method to implement to initialize dataloaders."""
+         ...
+
+     @abstractmethod
+     def show(self):
+         """Method to log any information without running the job."""
+         ...
+
+     @property
+     def log_updates(self):
+         # convenient access to log updates
+         return self._log_updates
+
+     def checkpoint_path(self, **kwargs):
+         kwargs.setdefault('use_fsdp', self.cfg.fsdp.use)
+         return self.folder / checkpoint.checkpoint_name(**kwargs)
+
+     def epoch_checkpoint_path(self, epoch: int, **kwargs):
+         kwargs.setdefault('use_fsdp', self.cfg.fsdp.use)
+         return self.folder / checkpoint.checkpoint_name(str(epoch), **kwargs)
+
+     def checkpoint_path_with_name(self, name: str, **kwargs):
+         kwargs.setdefault('use_fsdp', self.cfg.fsdp.use)
+         return self.folder / checkpoint.checkpoint_name(name=name, **kwargs)
+
+     def save_checkpoints(self):
+         """Save checkpoint, optionally keeping a copy for a given epoch."""
+         is_sharded = self.cfg.fsdp.use
+         if not flashy.distrib.is_rank_zero() and not is_sharded:
+             return
+         self.logger.info("Model hash: %s", model_hash(self.model))
+         state = self.state_dict()
+         epoch = self.epoch - 1  # pushing metrics will increase the epoch in Flashy, so we do -1 here
+
+         # save minimal state_dict as new checkpoint every X epoch
+         if self.cfg.checkpoint.save_every:
+             if epoch % self.cfg.checkpoint.save_every == 0:
+                 minimal_state = state
+                 if self.cfg.checkpoint.keep_every_states is not None and len(self.cfg.checkpoint.keep_every_states) > 0:
+                     minimal_state = {
+                         name: source for name, source in state.items()
+                         if name in self.cfg.checkpoint.keep_every_states
+                     }
+                 epoch_checkpoint_path = self.epoch_checkpoint_path(epoch)
+                 checkpoint.save_checkpoint(minimal_state, epoch_checkpoint_path, is_sharded)
+
+         # save checkpoint as latest checkpoint
+         if self.cfg.checkpoint.save_last:
+             last_checkpoint_path = self.checkpoint_path()
+             checkpoint.save_checkpoint(state, last_checkpoint_path, is_sharded)
+
+         # flush any stale checkpoint to reduce disk footprint
+         checkpoint.flush_stale_checkpoints(self.checkpoint_path())
+
+     def load_from_pretrained(self, name: str) -> dict:
+         raise NotImplementedError("Solver does not provide a way to load pretrained models.")
+
+     def load_checkpoints(self, load_best: bool = False, ignore_state_keys: tp.List[str] = []) -> tp.Optional[dict]:
+         """Load last checkpoint or the one specified in continue_from.
+
+         Args:
+             load_best (bool): Whether to load from best state dict or not.
+                 Best state dict is always used when not loading the current xp.
+             ignore_state_keys (list of str): List of sources to ignore when loading the state, e.g. `optimizer`.
+         Returns:
+             state (dict, optional): The loaded state dictionary.
+         """
+         # load checkpoints from xp folder or cfg.continue_from
+         is_sharded = self.cfg.fsdp.use
+         load_from_path: tp.Optional[Path] = None
+         checkpoint_source: tp.Optional[checkpoint.CheckpointSource] = None
+
+         if load_best:
+             self.logger.info("Trying to load state_dict from best state.")
+
+         state: tp.Optional[dict] = None
+         rank0_checkpoint_path = self.checkpoint_path(use_fsdp=False)
+         current_checkpoint_path = self.checkpoint_path()
+         _pretrained_prefix = '//pretrained/'
+         continue_pretrained = (self.cfg.continue_from or '').startswith(_pretrained_prefix)
+         if rank0_checkpoint_path.exists():
+             self.logger.info(f"Loading existing checkpoint: {current_checkpoint_path}")
+             load_from_path = current_checkpoint_path
+             checkpoint.check_sharded_checkpoint(current_checkpoint_path, rank0_checkpoint_path)
+             checkpoint_source = checkpoint.CheckpointSource.CURRENT_XP
+         elif self.cfg.continue_from and not continue_pretrained:
+             self.logger.info(f"Continuing from provided checkpoint: {self.cfg.continue_from}")
+             # we're always continuing from consolidated checkpoints: self.cfg.use_fsdp and not continue_best
+             load_from_path = checkpoint.resolve_checkpoint_path(self.cfg.continue_from, use_fsdp=False)
+             if load_from_path is None:
+                 self.logger.error('Could not resolve the continue_from checkpoint %s', self.cfg.continue_from)
+                 raise RuntimeError(f'Could not resolve continue_from checkpoint {self.cfg.continue_from}')
+             checkpoint_source = checkpoint.CheckpointSource.OTHER
+
+         if load_from_path is not None:
+             state = checkpoint.load_checkpoint(load_from_path, is_sharded)
+         elif continue_pretrained:
+             self.logger.info("Loading a pretrained model. Ignoring 'load_best' and 'ignore_state_keys' params.")
+             state = self.load_from_pretrained(self.cfg.continue_from[len(_pretrained_prefix):])
+             checkpoint_source = checkpoint.CheckpointSource.PRETRAINED
+             load_best = True
+
+         # checkpoints are not from the current xp, we only retrieve the best state
+         if checkpoint_source is not None and checkpoint_source != checkpoint.CheckpointSource.CURRENT_XP:
+             assert state is not None
+             self.logger.info("Checkpoint source is not the current xp: Load state_dict from best state.")
+             load_best = True
+             state = {key: state[key] for key in self._continue_best_source_keys if key in state}
+             # loaded checkpoints are FSDP checkpoints: we're reading the best state
+             # from FSDP and we drop the regular best_state
+             if 'fsdp_best_state' in state and state['fsdp_best_state']:
+                 state.pop('best_state', None)
+                 self.logger.info("... Loaded checkpoint has FSDP best state")
+             # FSDP is enabled in the solver, if the loaded checkpoints do not have FSDP support
+             # then we're initializing FSDP best state with the regular best state
+             elif self.cfg.fsdp.use:
+                 if 'fsdp_best_state' not in state or not state['fsdp_best_state']:
+                     # we swap non-FSDP checkpoints best_state to FSDP-compatible best state
+                     state['fsdp_best_state'] = state.pop('best_state')
+                     self.logger.info("... Loaded checkpoint does not have FSDP best state. Use regular best state")
+
+         if state is not None:
+             if load_best:
+                 self.logger.info("Ignoring keys when loading best %r", ignore_state_keys)
+                 for key in set(ignore_state_keys):
+                     if key in state:
+                         state.pop(key)
+                 has_best_state = 'best_state' in state or 'fsdp_best_state' in state
+                 assert has_best_state, ("Trying to load best state but neither 'best_state' "
+                                         "nor 'fsdp_best_state' found in checkpoints.")
+             self.load_state_dict(state)
+
+         # for FSDP, let's make extra sure nothing bad happened with out of sync
+         # checkpoints across workers.
+         epoch = float(self.epoch)
+         avg_epoch = flashy.distrib.average_metrics({'epoch': epoch})['epoch']
+         if avg_epoch != epoch:
+             raise RuntimeError(
+                 f"Inconsistent loading of checkpoints happened, our epoch is {epoch} "
+                 f"but average of epochs is {avg_epoch}, at least one gpu must have a "
+                 "different epoch number.")
+
+         # on load_best, properly reinitialize state_dict, best states and ema
+         # otherwise we load from the current xp and don't alter anything
+         if load_best:
+             self.logger.info("Loading state_dict from best state.")
+             if not self.cfg.fsdp.use and self.fsdp_best_state:
+                 # loading from an FSDP checkpoint but with FSDP deactivated
+                 self.logger.info("... Loading from FSDP best state dict.")
+                 self.best_state.load_state_dict(self.fsdp_best_state)
+
+             # if load_best, we permanently override the regular state_dict with the best state
+             if self.cfg.fsdp.use:
+                 self.logger.info("FSDP is used, loading from FSDP best state.")
+                 with fsdp.switch_to_full_state_dict(self._fsdp_modules):
+                     # this might be really fragile but okay for now.
+                     self.load_state_dict(self.fsdp_best_state)
+             else:
+                 # we permanently swap the stateful objects to their best state
+                 self._load_new_state_dict(self.best_state.state_dict())
+
+             # the EMA modules should also be instantiated with best state.
+             # the easiest way to do so is to reinitialize a new EMA with best state loaded.
+             if self.ema is not None:
+                 self.logger.info("Re-initializing EMA from best state")
+                 self.initialize_ema()
+
+             if self.cfg.fsdp.use:
+                 self.logger.info("Re-initializing best state after using FSDP best state.")
+                 for name in self.best_state.states.keys():
+                     state_source = self._get_state_source(name)
+                     self.best_state.update(name, state_source)
+
+         return state
+
+     def restore(self, load_best: bool = False, replay_metrics: bool = False,
+                 ignore_state_keys: tp.List[str] = []) -> bool:
+         """Restore the status of a solver for a given xp.
+
+         Args:
+             load_best (bool): if `True`, load the best state from the checkpoint.
+             replay_metrics (bool): if `True`, logs all the metrics from past epochs.
+             ignore_state_keys (list of str): list of sources to ignore when loading the state, e.g. `optimizer`.
+         """
+         self.logger.info("Restoring weights and history.")
+         restored_checkpoints = self.load_checkpoints(load_best, ignore_state_keys)
+
+         self.logger.info("Model hash: %s", model_hash(self.model))
+
+         if replay_metrics and len(self.history) > 0:
+             self.logger.info("Replaying past metrics...")
+             for epoch, stages in enumerate(self.history):
+                 for stage_name, metrics in stages.items():
+                     # We manually log the metrics summary to the result logger
+                     # as we don't want to add them to the pending metrics
+                     self.result_logger._log_summary(stage_name, metrics, step=epoch + 1, step_name='epoch',
+                                                     formatter=self.get_formatter(stage_name))
+         return restored_checkpoints is not None
+
+     def commit(self, save_checkpoints: bool = True):
+         """Commit metrics to dora and save checkpoints at the end of an epoch."""
+         # we override commit to introduce more complex checkpoint saving behaviors
+         self.history.append(self._pending_metrics)  # This will increase self.epoch
+         if save_checkpoints:
+             self.save_checkpoints()
+         self._start_epoch()
+         if flashy.distrib.is_rank_zero():
+             self.xp.link.update_history(self.history)
+
+     def run_epoch(self):
+         """Run a single epoch with all stages.
+
+         Metrics for a given stage are stored in _pending_metrics and committed by the solver afterwards.
+         Children solvers can extend this method with custom behavior, e.g.:
+
+             def run_epoch(self):
+                 ...  # custom code
+                 super().run_epoch()
+                 ...  # custom code
+         """
+         self.run_stage('train', self.train)
+         with torch.no_grad():
+             with self.swap_ema_state():
+                 self.run_stage('valid', self.valid)
+                 # the best state is updated with EMA states if available
+                 self.update_best_state_from_stage('valid')
+             with self.swap_best_state():
+                 if self.should_run_stage('evaluate'):
+                     self.run_stage('evaluate', self.evaluate)
+                 if self.should_run_stage('generate'):
+                     self.run_stage('generate', with_rank_rng()(self.generate))
+
+     def run(self):
+         """Training loop."""
+         assert len(self.state_dict()) > 0
+         self.restore(replay_metrics=True)  # load checkpoint and replay history
+         self.log_hyperparams(dict_from_config(self.cfg))
+         for epoch in range(self.epoch, self.cfg.optim.epochs + 1):
+             if self.should_stop_training():
+                 return
+             self.run_epoch()
+             # Commit will send the metrics to Dora and save checkpoints by default.
+             self.commit()
+
+     def should_stop_training(self) -> bool:
+         """Check whether we should stop training or not."""
+         return self.epoch > self.cfg.optim.epochs
+
+     def should_run_stage(self, stage_name) -> bool:
+         """Check whether we want to run the specified stages."""
+         stage_every = self.cfg[stage_name].get('every', None)
+         is_last_epoch = self.epoch == self.cfg.optim.epochs
+         is_epoch_every = (stage_every and self.epoch % stage_every == 0)
+         return is_last_epoch or is_epoch_every
+
+     @abstractmethod
+     def run_step(self, idx: int, batch: tp.Any, metrics: dict):
+         """Perform one training or valid step on a given batch."""
+         ...
+
+     def common_train_valid(self, dataset_split: str, **kwargs: tp.Any):
+         """Common logic for train and valid stages."""
+         self.model.train(self.is_training)
+
+         loader = self.dataloaders[dataset_split]
+         # get a different order for distributed training, otherwise this will get ignored
+         if flashy.distrib.world_size() > 1 \
+                 and isinstance(loader.sampler, torch.utils.data.distributed.DistributedSampler):
+             loader.sampler.set_epoch(self.epoch)
+         updates_per_epoch = self.train_updates_per_epoch if self.is_training else len(loader)
+         if self.cfg.benchmark_no_load:
+             self.logger.warning("Fake loading for benchmarking: re-using first batch")
+             batch = next(iter(loader))
+             loader = [batch] * updates_per_epoch  # type: ignore
+         lp = self.log_progress(self.current_stage, loader, total=updates_per_epoch, updates=self.log_updates)
+         average = flashy.averager()  # epoch wise average
+         instant_average = flashy.averager()  # average between two logging
+         metrics: dict = {}
+
+         with self.profiler, self.deadlock_detect:  # profiler will only run for the first 20 updates.
+             for idx, batch in enumerate(lp):
+                 self.deadlock_detect.update('batch')
+                 if idx >= updates_per_epoch:
+                     break
+                 metrics = {}
+                 metrics = self.run_step(idx, batch, metrics)
+                 self.deadlock_detect.update('step')
+                 # run EMA step
+                 if self.ema is not None and self.is_training and (idx + 1) % self.cfg.optim.ema.updates == 0:
+                     self.logger.debug("EMA model step")
+                     self.ema.step()
+                 self.deadlock_detect.update('ema')
+                 self.profiler.step()
+                 instant_metrics = instant_average(metrics)
+                 if lp.update(**instant_metrics):
+                     instant_average = flashy.averager()  # reset averager between two logging
+                 metrics = average(metrics)  # epoch wise average
+                 self.deadlock_detect.update('end_batch')
+
+         metrics = flashy.distrib.average_metrics(metrics, updates_per_epoch)
+         return metrics
+
+     def train(self):
+         """Train stage."""
+         return self.common_train_valid('train')
+
+     def valid(self):
+         """Valid stage."""
+         return self.common_train_valid('valid')
+
+     @abstractmethod
+     def evaluate(self):
+         """Evaluate stage."""
+         ...
+
+     @abstractmethod
+     def generate(self):
+         """Generate stage."""
+         ...
+
+     def run_one_stage(self, stage_name: str):
+         """Run only the specified stage.
+         This method is useful to only generate samples from a trained experiment
+         or rerun the validation or evaluation stages.
+         """
+         fn = {
+             'generate': with_rank_rng()(self.generate),
+             'evaluate': self.evaluate,
+             'valid': self.valid,
+         }
+         if stage_name not in fn:
+             raise ValueError(f'Trying to run stage {stage_name} is not supported.')
+         assert len(self.state_dict()) > 0
+         self._start_epoch()
+         with torch.no_grad(), self.swap_best_state():
+             self.run_stage(stage_name, fn[stage_name])
+         if not self.cfg.execute_inplace:
+             self.commit(save_checkpoints=False)
+
+     @staticmethod
+     def get_eval_solver_from_sig(sig: str, dtype: tp.Optional[str] = None,
+                                  device: tp.Optional[str] = None, autocast: bool = True,
+                                  batch_size: tp.Optional[int] = None,
+                                  override_cfg: tp.Optional[tp.Union[dict, omegaconf.DictConfig]] = None,
+                                  **kwargs):
+         """Mostly a convenience function around audiocraft.train.get_solver_from_sig,
+         populating all the proper params, deactivating EMA and FSDP, and loading the best state:
+         basically all you need to get a solver ready to "play" with in single GPU mode
+         and with minimal memory overhead.
+
+         Args:
+             sig (str): signature to load.
+             dtype (str or None): potential dtype, as a string, e.g. 'float16'.
+             device (str or None): potential device, as a string, e.g. 'cuda'.
+             override_cfg (dict or omegaconf.DictConfig or None): potential extra config overrides,
+                 merged with the overrides set by this function.
+         """
+         from audiocraft import train
+         our_override_cfg: tp.Dict[str, tp.Any] = {'optim': {'ema': {'use': False}}}
+         our_override_cfg['autocast'] = autocast
+         if dtype is not None:
+             our_override_cfg['dtype'] = dtype
+         if device is not None:
+             our_override_cfg['device'] = device
+         if batch_size is not None:
+             our_override_cfg['dataset'] = {'batch_size': batch_size}
+         if override_cfg is None:
+             override_cfg = {}
+         override_cfg = omegaconf.OmegaConf.merge(
+             omegaconf.DictConfig(override_cfg), omegaconf.DictConfig(our_override_cfg))  # type: ignore
+         solver = train.get_solver_from_sig(
+             sig, override_cfg=override_cfg,
+             load_best=True, disable_fsdp=True,
+             ignore_state_keys=['optimizer', 'ema'], **kwargs)
+         solver.model.eval()
+         return solver
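The typical entry point for inspecting a finished run is `get_eval_solver_from_sig` above. A minimal usage sketch (the XP signature below is hypothetical, standing in for a real Dora signature):

    from audiocraft.solvers.base import StandardSolver

    # 'f1a2b3c4' is a hypothetical Dora XP signature of an already trained run.
    solver = StandardSolver.get_eval_solver_from_sig(
        'f1a2b3c4', dtype='float16', device='cuda', batch_size=8)
    # The solver comes back with EMA/FSDP disabled and the best state loaded,
    # so individual stages can be re-run directly, e.g.:
    solver.run_one_stage('generate')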
audiocraft/solvers/builders.py ADDED
@@ -0,0 +1,366 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ All the functions to build the relevant solvers and the objects
+ they use, from the Hydra config.
+ """
+
+ from enum import Enum
+ import logging
+ import typing as tp
+
+ import dora
+ import flashy
+ import omegaconf
+ import torch
+ from torch import nn
+ from torch.optim import Optimizer
+ # LRScheduler was renamed in some torch versions
+ try:
+     from torch.optim.lr_scheduler import LRScheduler  # type: ignore
+ except ImportError:
+     from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+
+ from .base import StandardSolver
+ from .. import adversarial, data, losses, metrics, optim
+ from ..utils.utils import dict_from_config, get_loader
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class DatasetType(Enum):
+     AUDIO = "audio"
+     MUSIC = "music"
+     SOUND = "sound"
+
+
+ def get_solver(cfg: omegaconf.DictConfig) -> StandardSolver:
+     """Instantiate solver from config."""
+     from .audiogen import AudioGenSolver
+     from .compression import CompressionSolver
+     from .musicgen import MusicGenSolver
+     from .diffusion import DiffusionSolver
+     from .magnet import MagnetSolver, AudioMagnetSolver
+     klass = {
+         'compression': CompressionSolver,
+         'musicgen': MusicGenSolver,
+         'audiogen': AudioGenSolver,
+         'magnet': MagnetSolver,
+         'audio_magnet': AudioMagnetSolver,
+         'lm': MusicGenSolver,  # backward compatibility
+         'diffusion': DiffusionSolver,
+         'sound_lm': AudioGenSolver,  # backward compatibility
+     }[cfg.solver]
+     return klass(cfg)  # type: ignore
+
+
+ def get_optim_parameter_groups(model: nn.Module):
+     """Create parameter groups for the model, using each module's
+     `make_optim_group` method when defined to create the different groups.
+
+     Args:
+         model (nn.Module): torch model
+     Returns:
+         List of parameter groups
+     """
+     seen_params: tp.Set[nn.parameter.Parameter] = set()
+     other_params = []
+     groups = []
+     for name, module in model.named_modules():
+         if hasattr(module, 'make_optim_group'):
+             group = module.make_optim_group()
+             params = set(group['params'])
+             assert params.isdisjoint(seen_params)
+             seen_params |= set(params)
+             groups.append(group)
+     for param in model.parameters():
+         if param not in seen_params:
+             other_params.append(param)
+     groups.insert(0, {'params': other_params})
+     parameters = groups
+     return parameters
+
+
+ def get_optimizer(params: tp.Union[nn.Module, tp.Iterable[torch.Tensor]], cfg: omegaconf.DictConfig) -> Optimizer:
+     """Build torch optimizer from config and set of parameters.
+     Supported optimizers: Adam, AdamW
+
+     Args:
+         params (nn.Module or iterable of torch.Tensor): Parameters to optimize.
+         cfg (DictConfig): Optimization-related configuration.
+     Returns:
+         torch.optim.Optimizer.
+     """
+     if 'optimizer' not in cfg:
+         if getattr(cfg, 'optim', None) is not None:
+             raise KeyError("Optimizer not found in config. Try instantiating optimizer from cfg.optim?")
+         else:
+             raise KeyError("Optimizer not found in config.")
+
+     parameters = get_optim_parameter_groups(params) if isinstance(params, nn.Module) else params
+     optimizer: torch.optim.Optimizer
+     if cfg.optimizer == 'adam':
+         optimizer = torch.optim.Adam(parameters, lr=cfg.lr, **cfg.adam)
+     elif cfg.optimizer == 'adamw':
+         optimizer = torch.optim.AdamW(parameters, lr=cfg.lr, **cfg.adam)
+     elif cfg.optimizer == 'dadam':
+         optimizer = optim.DAdaptAdam(parameters, lr=cfg.lr, **cfg.adam)
+     else:
+         raise ValueError(f"Unsupported Optimizer: {cfg.optimizer}")
+     return optimizer
+
+
+ def get_lr_scheduler(optimizer: torch.optim.Optimizer,
+                      cfg: omegaconf.DictConfig,
+                      total_updates: int) -> tp.Optional[LRScheduler]:
+     """Build torch learning rate scheduler from config and associated optimizer.
+     Supported learning rate schedulers: ExponentialLRScheduler, PlateauLRScheduler
+
+     Args:
+         optimizer (torch.optim.Optimizer): Optimizer.
+         cfg (DictConfig): Schedule-related configuration.
+         total_updates (int): Total number of updates.
+     Returns:
+         The learning rate scheduler, or None if not configured.
+     """
+     if 'lr_scheduler' not in cfg:
+         raise KeyError("LR Scheduler not found in config")
+
+     lr_sched: tp.Optional[LRScheduler] = None
+     if cfg.lr_scheduler == 'step':
+         lr_sched = torch.optim.lr_scheduler.StepLR(optimizer, **cfg.step)
+     elif cfg.lr_scheduler == 'exponential':
+         lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=cfg.exponential)
+     elif cfg.lr_scheduler == 'cosine':
+         kwargs = dict_from_config(cfg.cosine)
+         warmup_steps = kwargs.pop('warmup')
+         lr_sched = optim.CosineLRScheduler(
+             optimizer, warmup_steps=warmup_steps, total_steps=total_updates, **kwargs)
+     elif cfg.lr_scheduler == 'polynomial_decay':
+         kwargs = dict_from_config(cfg.polynomial_decay)
+         warmup_steps = kwargs.pop('warmup')
+         lr_sched = optim.PolynomialDecayLRScheduler(
+             optimizer, warmup_steps=warmup_steps, total_steps=total_updates, **kwargs)
+     elif cfg.lr_scheduler == 'inverse_sqrt':
+         kwargs = dict_from_config(cfg.inverse_sqrt)
+         warmup_steps = kwargs.pop('warmup')
+         lr_sched = optim.InverseSquareRootLRScheduler(optimizer, warmup_steps=warmup_steps, **kwargs)
+     elif cfg.lr_scheduler == 'linear_warmup':
+         kwargs = dict_from_config(cfg.linear_warmup)
+         warmup_steps = kwargs.pop('warmup')
+         lr_sched = optim.LinearWarmupLRScheduler(optimizer, warmup_steps=warmup_steps, **kwargs)
+     elif cfg.lr_scheduler is not None:
+         raise ValueError(f"Unsupported LR Scheduler: {cfg.lr_scheduler}")
+     return lr_sched
+
+
+ def get_ema(module_dict: nn.ModuleDict, cfg: omegaconf.DictConfig) -> tp.Optional[optim.ModuleDictEMA]:
+     """Initialize Exponential Moving Average.
+
+     Args:
+         module_dict (nn.ModuleDict): ModuleDict for which to compute the EMA.
+         cfg (omegaconf.DictConfig): Optim EMA configuration.
+     Returns:
+         optim.ModuleDictEMA: EMA version of the ModuleDict.
+     """
+     kw: tp.Dict[str, tp.Any] = dict(cfg)
+     use = kw.pop('use', False)
+     decay = kw.pop('decay', None)
+     device = kw.pop('device', None)
+     if not use:
+         return None
+     if len(module_dict) == 0:
+         raise ValueError("Trying to build EMA but an empty module_dict source is provided!")
+     ema_module = optim.ModuleDictEMA(module_dict, decay=decay, device=device)
+     return ema_module
+
+
+ def get_loss(loss_name: str, cfg: omegaconf.DictConfig):
+     """Instantiate loss from configuration."""
+     klass = {
+         'l1': torch.nn.L1Loss,
+         'l2': torch.nn.MSELoss,
+         'mel': losses.MelSpectrogramL1Loss,
+         'mrstft': losses.MRSTFTLoss,
+         'msspec': losses.MultiScaleMelSpectrogramLoss,
+         'sisnr': losses.SISNR,
+     }[loss_name]
+     kwargs = dict(getattr(cfg, loss_name))
+     return klass(**kwargs)
+
+
+ def get_balancer(loss_weights: tp.Dict[str, float], cfg: omegaconf.DictConfig) -> losses.Balancer:
+     """Instantiate loss balancer from configuration for the provided weights."""
+     kwargs: tp.Dict[str, tp.Any] = dict_from_config(cfg)
+     return losses.Balancer(loss_weights, **kwargs)
+
+
+ def get_adversary(name: str, cfg: omegaconf.DictConfig) -> nn.Module:
+     """Initialize adversary from config."""
+     klass = {
+         'msd': adversarial.MultiScaleDiscriminator,
+         'mpd': adversarial.MultiPeriodDiscriminator,
+         'msstftd': adversarial.MultiScaleSTFTDiscriminator,
+     }[name]
+     adv_cfg: tp.Dict[str, tp.Any] = dict(getattr(cfg, name))
+     return klass(**adv_cfg)
+
+
+ def get_adversarial_losses(cfg) -> nn.ModuleDict:
+     """Initialize dict of adversarial losses from config."""
+     device = cfg.device
+     adv_cfg = getattr(cfg, 'adversarial')
+     adversaries = adv_cfg.get('adversaries', [])
+     adv_loss_name = adv_cfg['adv_loss']
+     feat_loss_name = adv_cfg.get('feat_loss')
+     normalize = adv_cfg.get('normalize', True)
+     feat_loss: tp.Optional[adversarial.FeatureMatchingLoss] = None
+     if feat_loss_name:
+         assert feat_loss_name in ['l1', 'l2'], f"Feature loss only supports L1 or L2 but {feat_loss_name} found."
+         loss = get_loss(feat_loss_name, cfg)
+         feat_loss = adversarial.FeatureMatchingLoss(loss, normalize)
+     loss = adversarial.get_adv_criterion(adv_loss_name)
+     loss_real = adversarial.get_real_criterion(adv_loss_name)
+     loss_fake = adversarial.get_fake_criterion(adv_loss_name)
+     adv_losses = nn.ModuleDict()
+     for adv_name in adversaries:
+         adversary = get_adversary(adv_name, cfg).to(device)
+         optimizer = get_optimizer(adversary.parameters(), cfg.optim)
+         adv_loss = adversarial.AdversarialLoss(
+             adversary,
+             optimizer,
+             loss=loss,
+             loss_real=loss_real,
+             loss_fake=loss_fake,
+             loss_feat=feat_loss,
+             normalize=normalize
+         )
+         adv_losses[adv_name] = adv_loss
+     return adv_losses
+
+
+ def get_visqol(cfg: omegaconf.DictConfig) -> metrics.ViSQOL:
+     """Instantiate ViSQOL metric from config."""
+     kwargs = dict_from_config(cfg)
+     return metrics.ViSQOL(**kwargs)
+
+
+ def get_fad(cfg: omegaconf.DictConfig) -> metrics.FrechetAudioDistanceMetric:
+     """Instantiate Frechet Audio Distance metric from config."""
+     kwargs = dict_from_config(cfg.tf)
+     xp = dora.get_xp()
+     kwargs['log_folder'] = xp.folder
+     return metrics.FrechetAudioDistanceMetric(**kwargs)
+
+
+ def get_kldiv(cfg: omegaconf.DictConfig) -> metrics.KLDivergenceMetric:
+     """Instantiate KL-Divergence metric from config."""
+     kld_metrics = {
+         'passt': metrics.PasstKLDivergenceMetric,
+     }
+     klass = kld_metrics[cfg.model]
+     kwargs = dict_from_config(cfg.get(cfg.model))
+     return klass(**kwargs)
+
+
+ def get_text_consistency(cfg: omegaconf.DictConfig) -> metrics.TextConsistencyMetric:
+     """Instantiate Text Consistency metric from config."""
+     text_consistency_metrics = {
+         'clap': metrics.CLAPTextConsistencyMetric
+     }
+     klass = text_consistency_metrics[cfg.model]
+     kwargs = dict_from_config(cfg.get(cfg.model))
+     return klass(**kwargs)
+
+
+ def get_chroma_cosine_similarity(cfg: omegaconf.DictConfig) -> metrics.ChromaCosineSimilarityMetric:
+     """Instantiate Chroma Cosine Similarity metric from config."""
+     assert cfg.model == 'chroma_base', "Only the 'chroma_base' method is supported for the chroma cosine similarity metric"
+     kwargs = dict_from_config(cfg.get(cfg.model))
+     return metrics.ChromaCosineSimilarityMetric(**kwargs)
+
+
+ def get_audio_datasets(cfg: omegaconf.DictConfig,
+                        dataset_type: DatasetType = DatasetType.AUDIO) -> tp.Dict[str, torch.utils.data.DataLoader]:
+     """Build AudioDataset from configuration.
+
+     Args:
+         cfg (omegaconf.DictConfig): Configuration.
+         dataset_type: The type of dataset to create.
+     Returns:
+         dict[str, torch.utils.data.DataLoader]: Map of dataloader for each data split.
+     """
+     dataloaders: dict = {}
+
+     sample_rate = cfg.sample_rate
+     channels = cfg.channels
+     seed = cfg.seed
+     max_sample_rate = cfg.datasource.max_sample_rate
+     max_channels = cfg.datasource.max_channels
+
+     assert cfg.dataset is not None, "Could not find dataset definition in config"
+
+     dataset_cfg = dict_from_config(cfg.dataset)
+     splits_cfg: dict = {}
+     splits_cfg['train'] = dataset_cfg.pop('train')
+     splits_cfg['valid'] = dataset_cfg.pop('valid')
+     splits_cfg['evaluate'] = dataset_cfg.pop('evaluate')
+     splits_cfg['generate'] = dataset_cfg.pop('generate')
+     execute_only_stage = cfg.get('execute_only', None)
+
+     for split, path in cfg.datasource.items():
+         if not isinstance(path, str):
+             continue  # skipping this as not a path
+         if execute_only_stage is not None and split != execute_only_stage:
+             continue
+         logger.info(f"Loading audio data split {split}: {str(path)}")
+         assert (
+             cfg.sample_rate <= max_sample_rate
+         ), f"Expecting a max sample rate of {max_sample_rate} for datasource but {sample_rate} found."
+         assert (
+             cfg.channels <= max_channels
+         ), f"Expecting a max number of channels of {max_channels} for datasource but {channels} found."
+
+         split_cfg = splits_cfg[split]
+         split_kwargs = {k: v for k, v in split_cfg.items()}
+         kwargs = {**dataset_cfg, **split_kwargs}  # split kwargs overrides default dataset_cfg
+         kwargs['sample_rate'] = sample_rate
+         kwargs['channels'] = channels
+
+         if kwargs.get('permutation_on_files') and cfg.optim.updates_per_epoch:
+             kwargs['num_samples'] = (
+                 flashy.distrib.world_size() * cfg.dataset.batch_size * cfg.optim.updates_per_epoch)
+
+         num_samples = kwargs['num_samples']
+         shuffle = kwargs['shuffle']
+
+         return_info = kwargs.pop('return_info')
+         batch_size = kwargs.pop('batch_size', None)
+         num_workers = kwargs.pop('num_workers')
+
+         if dataset_type == DatasetType.MUSIC:
+             dataset = data.music_dataset.MusicDataset.from_meta(path, **kwargs)
+         elif dataset_type == DatasetType.SOUND:
+             dataset = data.sound_dataset.SoundDataset.from_meta(path, **kwargs)
+         elif dataset_type == DatasetType.AUDIO:
+             dataset = data.info_audio_dataset.InfoAudioDataset.from_meta(path, return_info=return_info, **kwargs)
+         else:
+             raise ValueError(f"Dataset type is unsupported: {dataset_type}")
+
+         loader = get_loader(
+             dataset,
+             num_samples,
+             batch_size=batch_size,
+             num_workers=num_workers,
+             seed=seed,
+             collate_fn=dataset.collater if return_info else None,
+             shuffle=shuffle,
+         )
+         dataloaders[split] = loader
+
+     return dataloaders
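To illustrate how a solver typically consumes these helpers, a build_model implementation might wire them together roughly as below. This is only a sketch: MyModel is a placeholder, and the cfg.schedule key is an assumption following the config keys referenced in get_lr_scheduler above.

    # Inside a hypothetical StandardSolver subclass's build_model():
    self.model = MyModel(...).to(self.device)  # MyModel is a placeholder
    self.optimizer = builders.get_optimizer(self.model, self.cfg.optim)
    self.lr_scheduler = builders.get_lr_scheduler(
        self.optimizer, self.cfg.schedule, self.total_updates)
    # Registering makes these objects part of the solver's checkpointed state.
    self.register_stateful('model', 'optimizer', 'lr_scheduler')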
audiocraft/solvers/compression.py ADDED
@@ -0,0 +1,328 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import multiprocessing
+ from pathlib import Path
+ import typing as tp
+
+ import flashy
+ import omegaconf
+ import torch
+ from torch import nn
+
+ from . import base, builders
+ from .. import models, quantization
+ from ..utils import checkpoint
+ from ..utils.samples.manager import SampleManager
+ from ..utils.utils import get_pool_executor
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class CompressionSolver(base.StandardSolver):
+     """Solver for compression task.
+
+     The compression task combines a set of perceptual and objective losses
+     to train an EncodecModel (composed of an encoder-decoder and a quantizer)
+     to perform high fidelity audio reconstruction.
+     """
+     def __init__(self, cfg: omegaconf.DictConfig):
+         super().__init__(cfg)
+         self.rng: torch.Generator  # set at each epoch
+         self.adv_losses = builders.get_adversarial_losses(self.cfg)
+         self.aux_losses = nn.ModuleDict()
+         self.info_losses = nn.ModuleDict()
+         assert not cfg.fsdp.use, "FSDP not supported by CompressionSolver."
+         loss_weights = dict()
+         for loss_name, weight in self.cfg.losses.items():
+             if loss_name in ['adv', 'feat']:
+                 for adv_name, _ in self.adv_losses.items():
+                     loss_weights[f'{loss_name}_{adv_name}'] = weight
+             elif weight > 0:
+                 self.aux_losses[loss_name] = builders.get_loss(loss_name, self.cfg)
+                 loss_weights[loss_name] = weight
+             else:
+                 self.info_losses[loss_name] = builders.get_loss(loss_name, self.cfg)
+         self.balancer = builders.get_balancer(loss_weights, self.cfg.balancer)
+         self.register_stateful('adv_losses')
+
+     @property
+     def best_metric_name(self) -> tp.Optional[str]:
+         # best model is the last for the compression model
+         return None
+
+     def build_model(self):
+         """Instantiate model and optimizer."""
+         # Model and optimizer
+         self.model = models.builders.get_compression_model(self.cfg).to(self.device)
+         self.optimizer = builders.get_optimizer(self.model.parameters(), self.cfg.optim)
+         self.register_stateful('model', 'optimizer')
+         self.register_best_state('model')
+         self.register_ema('model')
+
+     def build_dataloaders(self):
+         """Instantiate audio dataloaders for each stage."""
+         self.dataloaders = builders.get_audio_datasets(self.cfg)
+
+     def show(self):
+         """Show the compression model and employed adversarial loss."""
+         self.logger.info(f"Compression model with {self.model.quantizer.total_codebooks} codebooks:")
+         self.log_model_summary(self.model)
+         self.logger.info("Adversarial loss:")
+         self.log_model_summary(self.adv_losses)
+         self.logger.info("Auxiliary losses:")
+         self.logger.info(self.aux_losses)
+         self.logger.info("Info losses:")
+         self.logger.info(self.info_losses)
+
+     def run_step(self, idx: int, batch: torch.Tensor, metrics: dict):
+         """Perform one training or valid step on a given batch."""
+         x = batch.to(self.device)
+         y = x.clone()
+
+         qres = self.model(x)
+         assert isinstance(qres, quantization.QuantizedResult)
+         y_pred = qres.x
+         # Log bandwidth in kb/s
+         metrics['bandwidth'] = qres.bandwidth.mean()
+
+         if self.is_training:
+             d_losses: dict = {}
+             if len(self.adv_losses) > 0 and torch.rand(1, generator=self.rng).item() <= 1 / self.cfg.adversarial.every:
+                 for adv_name, adversary in self.adv_losses.items():
+                     disc_loss = adversary.train_adv(y_pred, y)
+                     d_losses[f'd_{adv_name}'] = disc_loss
+                 metrics['d_loss'] = torch.sum(torch.stack(list(d_losses.values())))
+             metrics.update(d_losses)
+
+         balanced_losses: dict = {}
+         other_losses: dict = {}
+
+         # penalty from quantization
+         if qres.penalty is not None and qres.penalty.requires_grad:
+             other_losses['penalty'] = qres.penalty  # penalty term from the quantizer
+
+         # adversarial losses
+         for adv_name, adversary in self.adv_losses.items():
+             adv_loss, feat_loss = adversary(y_pred, y)
+             balanced_losses[f'adv_{adv_name}'] = adv_loss
+             balanced_losses[f'feat_{adv_name}'] = feat_loss
+
+         # auxiliary losses
+         for loss_name, criterion in self.aux_losses.items():
+             loss = criterion(y_pred, y)
+             balanced_losses[loss_name] = loss
+
+         # weighted losses
+         metrics.update(balanced_losses)
+         metrics.update(other_losses)
+         metrics.update(qres.metrics)
+
+         if self.is_training:
+             # backprop losses that are not handled by balancer
+             other_loss = torch.tensor(0., device=self.device)
+             if 'penalty' in other_losses:
+                 other_loss += other_losses['penalty']
+             if other_loss.requires_grad:
+                 other_loss.backward(retain_graph=True)
+                 ratio1 = sum(p.grad.data.norm(p=2).pow(2)
+                              for p in self.model.parameters() if p.grad is not None)
+                 assert isinstance(ratio1, torch.Tensor)
+                 metrics['ratio1'] = ratio1.sqrt()
+
+             # balancer losses backward, returns effective training loss
+             # with effective weights at the current batch.
+             metrics['g_loss'] = self.balancer.backward(balanced_losses, y_pred)
+             # add metrics corresponding to weight ratios
+             metrics.update(self.balancer.metrics)
+             ratio2 = sum(p.grad.data.norm(p=2).pow(2)
+                          for p in self.model.parameters() if p.grad is not None)
+             assert isinstance(ratio2, torch.Tensor)
+             metrics['ratio2'] = ratio2.sqrt()
+
+             # optim
+             flashy.distrib.sync_model(self.model)
+             if self.cfg.optim.max_norm:
+                 torch.nn.utils.clip_grad_norm_(
+                     self.model.parameters(), self.cfg.optim.max_norm
+                 )
+             self.optimizer.step()
+             self.optimizer.zero_grad()
+
+         # informative losses only
+         info_losses: dict = {}
+         with torch.no_grad():
+             for loss_name, criterion in self.info_losses.items():
+                 loss = criterion(y_pred, y)
+                 info_losses[loss_name] = loss
+
+         metrics.update(info_losses)
+
+         # aggregated GAN losses: this is useful to report adv and feat across different adversarial loss setups
+         adv_losses = [loss for loss_name, loss in metrics.items() if loss_name.startswith('adv')]
+         if len(adv_losses) > 0:
+             metrics['adv'] = torch.sum(torch.stack(adv_losses))
+         feat_losses = [loss for loss_name, loss in metrics.items() if loss_name.startswith('feat')]
+         if len(feat_losses) > 0:
+             metrics['feat'] = torch.sum(torch.stack(feat_losses))
+
+         return metrics
+
176
+ def run_epoch(self):
177
+ # reset random seed at the beginning of the epoch
178
+ self.rng = torch.Generator()
179
+ self.rng.manual_seed(1234 + self.epoch)
180
+ # run epoch
181
+ super().run_epoch()
182
+
183
+ def evaluate(self):
184
+ """Evaluate stage. Runs audio reconstruction evaluation."""
185
+ self.model.eval()
186
+ evaluate_stage_name = str(self.current_stage)
187
+
188
+ loader = self.dataloaders['evaluate']
189
+ updates = len(loader)
190
+ lp = self.log_progress(f'{evaluate_stage_name} inference', loader, total=updates, updates=self.log_updates)
191
+ average = flashy.averager()
192
+
193
+ pendings = []
194
+ ctx = multiprocessing.get_context('spawn')
195
+ with get_pool_executor(self.cfg.evaluate.num_workers, mp_context=ctx) as pool:
196
+ for idx, batch in enumerate(lp):
197
+ x = batch.to(self.device)
198
+ with torch.no_grad():
199
+ qres = self.model(x)
200
+
201
+ y_pred = qres.x.cpu()
202
+ y = batch.cpu() # should already be on CPU but just in case
203
+ pendings.append(pool.submit(evaluate_audio_reconstruction, y_pred, y, self.cfg))
204
+
205
+ metrics_lp = self.log_progress(f'{evaluate_stage_name} metrics', pendings, updates=self.log_updates)
206
+ for pending in metrics_lp:
207
+ metrics = pending.result()
208
+ metrics = average(metrics)
209
+
210
+ metrics = flashy.distrib.average_metrics(metrics, len(loader))
211
+ return metrics
212
+
213
+ def generate(self):
214
+ """Generate stage."""
215
+ self.model.eval()
216
+ sample_manager = SampleManager(self.xp, map_reference_to_sample_id=True)
217
+ generate_stage_name = str(self.current_stage)
218
+
219
+ loader = self.dataloaders['generate']
220
+ updates = len(loader)
221
+ lp = self.log_progress(generate_stage_name, loader, total=updates, updates=self.log_updates)
222
+
223
+ for batch in lp:
224
+ reference, _ = batch
225
+ reference = reference.to(self.device)
226
+ with torch.no_grad():
227
+ qres = self.model(reference)
228
+ assert isinstance(qres, quantization.QuantizedResult)
229
+
230
+ reference = reference.cpu()
231
+ estimate = qres.x.cpu()
232
+ sample_manager.add_samples(estimate, self.epoch, ground_truth_wavs=reference)
233
+
234
+ flashy.distrib.barrier()
235
+
236
+ def load_from_pretrained(self, name: str) -> dict:
237
+ model = models.CompressionModel.get_pretrained(name)
238
+ if isinstance(model, models.DAC):
239
+ raise RuntimeError("Cannot fine tune a DAC model.")
240
+ elif isinstance(model, models.HFEncodecCompressionModel):
241
+ self.logger.warning('Trying to automatically convert a HuggingFace model '
242
+ 'to AudioCraft, this might fail!')
243
+ state = model.model.state_dict()
244
+ new_state = {}
245
+ for k, v in state.items():
246
+ if k.startswith('decoder.layers') and '.conv.' in k and '.block.' not in k:
247
+ # We need to determine if this a convtr or a regular conv.
248
+ layer = int(k.split('.')[2])
249
+ if isinstance(model.model.decoder.layers[layer].conv, torch.nn.ConvTranspose1d):
250
+ k = k.replace('.conv.', '.convtr.')
252
+ k = k.replace('encoder.layers.', 'encoder.model.')
253
+ k = k.replace('decoder.layers.', 'decoder.model.')
254
+ k = k.replace('conv.', 'conv.conv.')
255
+ k = k.replace('convtr.', 'convtr.convtr.')
256
+ k = k.replace('quantizer.layers.', 'quantizer.vq.layers.')
257
+ k = k.replace('.codebook.', '._codebook.')
258
+ new_state[k] = v
259
+ state = new_state
260
+ elif isinstance(model, models.EncodecModel):
261
+ state = model.state_dict()
262
+ else:
263
+ raise RuntimeError(f"Cannot fine tune model type {type(model)}.")
264
+ return {
265
+ 'best_state': {'model': state}
266
+ }
267
+
268
+ @staticmethod
269
+ def model_from_checkpoint(checkpoint_path: tp.Union[Path, str],
270
+ device: tp.Union[torch.device, str] = 'cpu') -> models.CompressionModel:
271
+ """Instantiate a CompressionModel from a given checkpoint path or dora sig.
272
+ This method is a convenient endpoint to load a CompressionModel to use in other solvers.
273
+
274
+ Args:
275
+ checkpoint_path (Path or str): Path to checkpoint or dora sig from where the checkpoint is resolved.
276
+ This also supports pre-trained models by using a path of the form //pretrained/NAME.
277
+ See `model_from_pretrained` for a list of supported pretrained models.
278
+ device (torch.device or str): Device on which the model is loaded.
280
+ """
281
+ checkpoint_path = str(checkpoint_path)
282
+ if checkpoint_path.startswith('//pretrained/'):
283
+ name = checkpoint_path.split('/', 3)[-1]
284
+ return models.CompressionModel.get_pretrained(name, device)
285
+ logger = logging.getLogger(__name__)
286
+ logger.info(f"Loading compression model from checkpoint: {checkpoint_path}")
287
+ _checkpoint_path = checkpoint.resolve_checkpoint_path(checkpoint_path, use_fsdp=False)
288
+ assert _checkpoint_path is not None, f"Could not resolve compression model checkpoint path: {checkpoint_path}"
289
+ state = checkpoint.load_checkpoint(_checkpoint_path)
290
+ assert state is not None and 'xp.cfg' in state, f"Could not load compression model from ckpt: {checkpoint_path}"
291
+ cfg = state['xp.cfg']
292
+ cfg.device = device
293
+ compression_model = models.builders.get_compression_model(cfg).to(device)
294
+ assert compression_model.sample_rate == cfg.sample_rate, "Compression model sample rate should match"
295
+
296
+ assert 'best_state' in state and state['best_state'] != {}
297
+ assert 'exported' not in state, "When loading an exported checkpoint, use the //pretrained/ prefix."
298
+ compression_model.load_state_dict(state['best_state']['model'])
299
+ compression_model.eval()
300
+ logger.info("Compression model loaded!")
301
+ return compression_model
302
+
303
+ @staticmethod
304
+ def wrapped_model_from_checkpoint(cfg: omegaconf.DictConfig,
305
+ checkpoint_path: tp.Union[Path, str],
306
+ device: tp.Union[torch.device, str] = 'cpu') -> models.CompressionModel:
307
+ """Instantiate a wrapped CompressionModel from a given checkpoint path or dora sig.
308
+
309
+ Args:
310
+ cfg (omegaconf.DictConfig): Configuration to read from for wrapped mode.
311
+ checkpoint_path (Path or str): Path to checkpoint or dora sig from where the checkpoint is resolved.
312
+ device (torch.device or str): Device on which the model is loaded.
314
+ """
315
+ compression_model = CompressionSolver.model_from_checkpoint(checkpoint_path, device)
316
+ compression_model = models.builders.get_wrapped_compression_model(compression_model, cfg)
317
+ return compression_model
318
+
319
+
320
+ def evaluate_audio_reconstruction(y_pred: torch.Tensor, y: torch.Tensor, cfg: omegaconf.DictConfig) -> dict:
321
+ """Audio reconstruction evaluation method that can be conveniently pickled."""
322
+ metrics = {}
323
+ if cfg.evaluate.metrics.visqol:
324
+ visqol = builders.get_visqol(cfg.metrics.visqol)
325
+ metrics['visqol'] = visqol(y_pred, y, cfg.sample_rate)
326
+ sisnr = builders.get_loss('sisnr', cfg)
327
+ metrics['sisnr'] = sisnr(y_pred, y)
328
+ return metrics
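
A minimal sketch of how these endpoints might be used from user code, assuming the audiocraft package is installed; the pretrained name below is only an example:

    import torch
    from audiocraft.solvers.compression import CompressionSolver

    # '//pretrained/...' paths are resolved through CompressionModel.get_pretrained.
    model = CompressionSolver.model_from_checkpoint('//pretrained/facebook/encodec_32khz')
    wav = torch.randn(1, model.channels, model.sample_rate)  # one second of noise
    with torch.no_grad():
        codes, scale = model.encode(wav)
        decoded = model.decode(codes, scale)
    print(codes.shape, decoded.shape)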
audiocraft/solvers/diffusion.py ADDED
@@ -0,0 +1,279 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import flashy
10
+ import julius
11
+ import omegaconf
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+ from . import builders
16
+ from . import base
17
+ from .. import models
18
+ from ..modules.diffusion_schedule import NoiseSchedule
19
+ from ..metrics import RelativeVolumeMel
20
+ from ..models.builders import get_processor
21
+ from ..utils.samples.manager import SampleManager
22
+ from ..solvers.compression import CompressionSolver
23
+
24
+
25
+ class PerStageMetrics:
26
+ """Handle computing the metrics per stage.
27
+ It outputs the metrics per range of diffusion steps.
28
+ e.g. avg loss when t in [250, 500]
29
+ """
30
+ def __init__(self, num_steps: int, num_stages: int = 4):
31
+ self.num_steps = num_steps
32
+ self.num_stages = num_stages
33
+
34
+ def __call__(self, losses: dict, step: tp.Union[int, torch.Tensor]):
35
+ if type(step) is int:
36
+ stage = int((step / self.num_steps) * self.num_stages)
37
+ return {f"{name}_{stage}": loss for name, loss in losses.items()}
38
+ elif type(step) is torch.Tensor:
39
+ stage_tensor = ((step / self.num_steps) * self.num_stages).long()
40
+ out: tp.Dict[str, float] = {}
41
+ for stage_idx in range(self.num_stages):
42
+ mask = (stage_tensor == stage_idx)
43
+ N = mask.sum()
44
+ stage_out = {}
45
+ if N > 0:  # skip stages with no elements
46
+ for name, loss in losses.items():
47
+ stage_loss = (mask * loss).sum() / N
48
+ stage_out[f"{name}_{stage_idx}"] = stage_loss
49
+ out = {**out, **stage_out}
50
+ return out
51
+
52
+
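
A self-contained sketch of how PerStageMetrics buckets per-example losses by diffusion step (the values below are dummies, not from a real run):

    import torch

    per_stage = PerStageMetrics(num_steps=1000, num_stages=4)
    losses = {'loss': torch.tensor([0.9, 0.5, 0.2, 0.1])}
    steps = torch.tensor([100, 400, 600, 900])  # one diffusion step per example
    print(per_stage(losses, steps))
    # -> {'loss_0': tensor(0.9000), 'loss_1': tensor(0.5000),
    #     'loss_2': tensor(0.2000), 'loss_3': tensor(0.1000)}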
53
+ class DataProcess:
54
+ """Apply filtering or resampling.
55
+
56
+ Args:
57
+ initial_sr (int): Initial sample rate.
58
+ target_sr (int): Target sample rate.
59
+ use_resampling (bool): Whether to resample from initial_sr to target_sr.
60
+ use_filter (bool): Whether to keep only one frequency band of the signal.
61
+ n_bands (int): Number of bands to split the signal into when filtering.
62
+ idx_band (int): Index of the frequency band to keep (0 is the lowest).
63
+ device (torch.device or str): Device on which the band filter runs.
64
+ cutoffs (list, optional): Cutoff frequencies of the band filter; mel-scale bands if None.
65
+ boost (bool): Whether to rescale the data to match the music dataset scale.
66
+ """
67
+ def __init__(self, initial_sr: int = 24000, target_sr: int = 16000, use_resampling: bool = False,
68
+ use_filter: bool = False, n_bands: int = 4,
69
+ idx_band: int = 0, device: torch.device = torch.device('cpu'), cutoffs=None, boost=False):
70
+ """Apply filtering or resampling
71
+ Args:
72
+ initial_sr (int): sample rate of the dataset
73
+ target_sr (int): sample rate after resampling
74
+ use_resampling (bool): whether or not performs resampling
75
+ use_filter (bool): when True filter the data to keep only one frequency band
76
+ n_bands (int): Number of bands used
77
+ cutoffs (None or list): The cutoff frequencies of the band filtering
78
+ if None then we use mel scale bands.
79
+ idx_band (int): index of the frequency band. 0 are lows ... (n_bands - 1) highs
80
+ boost (bool): make the data scale match our music dataset.
81
+ """
82
+ assert idx_band < n_bands
83
+ self.idx_band = idx_band
84
+ if use_filter:
85
+ if cutoffs is not None:
86
+ self.filter = julius.SplitBands(sample_rate=initial_sr, cutoffs=cutoffs).to(device)
87
+ else:
88
+ self.filter = julius.SplitBands(sample_rate=initial_sr, n_bands=n_bands).to(device)
89
+ self.use_filter = use_filter
90
+ self.use_resampling = use_resampling
91
+ self.target_sr = target_sr
92
+ self.initial_sr = initial_sr
93
+ self.boost = boost
94
+
95
+ def process_data(self, x, metric=False):
96
+ if x is None:
97
+ return None
98
+ if self.boost:
99
+ x /= torch.clamp(x.std(dim=(1, 2), keepdim=True), min=1e-4)
100
+ x *= 0.22  # rescale to match the music dataset
101
+ if self.use_filter and not metric:
102
+ x = self.filter(x)[self.idx_band]
103
+ if self.use_resampling:
104
+ x = julius.resample_frac(x, old_sr=self.initial_sr, new_sr=self.target_sr)
105
+ return x
106
+
107
+ def inverse_process(self, x):
108
+ """Upsampling only."""
109
+ if self.use_resampling:
110
+ x = julius.resample_frac(x, old_sr=self.target_sr, new_sr=self.initial_sr)
111
+ return x
112
+
113
+
114
+ class DiffusionSolver(base.StandardSolver):
115
+ """Solver for the diffusion task.
116
+
117
+ The diffusion task allows for MultiBand diffusion model training.
118
+
119
+ Args:
120
+ cfg (DictConfig): Configuration.
121
+ """
122
+ def __init__(self, cfg: omegaconf.DictConfig):
123
+ super().__init__(cfg)
124
+ self.cfg = cfg
125
+ self.device = cfg.device
126
+ self.sample_rate: int = self.cfg.sample_rate
127
+ self.codec_model = CompressionSolver.model_from_checkpoint(
128
+ cfg.compression_model_checkpoint, device=self.device)
129
+
130
+ self.codec_model.set_num_codebooks(cfg.n_q)
131
+ assert self.codec_model.sample_rate == self.cfg.sample_rate, (
132
+ f"Codec model sample rate is {self.codec_model.sample_rate} but "
133
+ f"Solver sample rate is {self.cfg.sample_rate}."
134
+ )
135
+
139
+ self.sample_processor = get_processor(cfg.processor, sample_rate=self.sample_rate)
140
+ self.register_stateful('sample_processor')
141
+ self.sample_processor.to(self.device)
142
+
143
+ self.schedule = NoiseSchedule(
144
+ **cfg.schedule, device=self.device, sample_processor=self.sample_processor)
145
+
146
+ self.eval_metric: tp.Optional[torch.nn.Module] = None
147
+
148
+ self.rvm = RelativeVolumeMel()
149
+ self.data_processor = DataProcess(initial_sr=self.sample_rate, target_sr=cfg.resampling.target_sr,
150
+ use_resampling=cfg.resampling.use, cutoffs=cfg.filter.cutoffs,
151
+ use_filter=cfg.filter.use, n_bands=cfg.filter.n_bands,
152
+ idx_band=cfg.filter.idx_band, device=self.device)
153
+
154
+ @property
155
+ def best_metric_name(self) -> tp.Optional[str]:
156
+ if self._current_stage == "evaluate":
157
+ return 'rvm'
158
+ else:
159
+ return 'loss'
160
+
161
+ @torch.no_grad()
162
+ def get_condition(self, wav: torch.Tensor) -> torch.Tensor:
163
+ codes, scale = self.codec_model.encode(wav)
164
+ assert scale is None, "Scaled compression models not supported."
165
+ emb = self.codec_model.decode_latent(codes)
166
+ return emb
167
+
168
+ def build_model(self):
169
+ """Build model and optimizer as well as optional Exponential Moving Average of the model.
170
+ """
171
+ # Model and optimizer
172
+ self.model = models.builders.get_diffusion_model(self.cfg).to(self.device)
173
+ self.optimizer = builders.get_optimizer(self.model.parameters(), self.cfg.optim)
174
+ self.register_stateful('model', 'optimizer')
175
+ self.register_best_state('model')
176
+ self.register_ema('model')
177
+
178
+ def build_dataloaders(self):
179
+ """Build audio dataloaders for each stage."""
180
+ self.dataloaders = builders.get_audio_datasets(self.cfg)
181
+
182
+ def show(self):
183
+ # TODO
184
+ raise NotImplementedError()
185
+
186
+ def run_step(self, idx: int, batch: torch.Tensor, metrics: dict):
187
+ """Perform one training or valid step on a given batch."""
188
+ x = batch.to(self.device)
189
+ loss_fun = F.mse_loss if self.cfg.loss.kind == 'mse' else F.l1_loss
190
+
191
+ condition = self.get_condition(x) # [bs, 128, T/hop, n_emb]
192
+ sample = self.data_processor.process_data(x)
193
+
194
+ input_, target, step = self.schedule.get_training_item(sample,
195
+ tensor_step=self.cfg.schedule.variable_step_batch)
196
+ out = self.model(input_, step, condition=condition).sample
197
+
198
+ base_loss = loss_fun(out, target, reduction='none').mean(dim=(1, 2))
199
+ reference_loss = loss_fun(input_, target, reduction='none').mean(dim=(1, 2))
200
+ loss = base_loss / reference_loss ** self.cfg.loss.norm_power
201
+
202
+ if self.is_training:
203
+ loss.mean().backward()
204
+ flashy.distrib.sync_model(self.model)
205
+ self.optimizer.step()
206
+ self.optimizer.zero_grad()
207
+ metrics = {
208
+ 'loss': loss.mean(), 'normed_loss': (base_loss / reference_loss).mean(),
209
+ }
210
+ metrics.update(self.per_stage({'loss': loss, 'normed_loss': base_loss / reference_loss}, step))
211
+ metrics.update({
212
+ 'std_in': input_.std(), 'std_out': out.std()})
213
+ return metrics
214
+
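
To make the loss normalization above concrete: each example's loss is divided by the loss the raw noisy input would get, raised to cfg.loss.norm_power. A toy sketch with random tensors (norm_power chosen for illustration):

    import torch
    import torch.nn.functional as F

    out = torch.randn(2, 1, 100)                    # model prediction
    target = torch.randn(2, 1, 100)                 # diffusion target
    input_ = target + 0.1 * torch.randn(2, 1, 100)  # noisy input

    base_loss = F.mse_loss(out, target, reduction='none').mean(dim=(1, 2))
    reference_loss = F.mse_loss(input_, target, reduction='none').mean(dim=(1, 2))
    norm_power = 1.0
    loss = base_loss / reference_loss ** norm_power  # per-example, shape [2]
    print(loss.mean())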
215
+ def run_epoch(self):
216
+ # reset random seed at the beginning of the epoch
217
+ self.rng = torch.Generator()
218
+ self.rng.manual_seed(1234 + self.epoch)
219
+ self.per_stage = PerStageMetrics(self.schedule.num_steps, self.cfg.metrics.num_stage)
220
+ # run epoch
221
+ super().run_epoch()
222
+
223
+ def evaluate(self):
224
+ """Evaluate stage.
225
+ Runs audio reconstruction evaluation.
226
+ """
227
+ self.model.eval()
228
+ evaluate_stage_name = f'{self.current_stage}'
229
+ loader = self.dataloaders['evaluate']
230
+ updates = len(loader)
231
+ lp = self.log_progress(f'{evaluate_stage_name} estimate', loader, total=updates, updates=self.log_updates)
232
+
233
+ metrics = {}
234
+ n = 1
235
+ for idx, batch in enumerate(lp):
236
+ x = batch.to(self.device)
237
+ with torch.no_grad():
238
+ y_pred = self.regenerate(x)
239
+
240
+ y_pred = y_pred.cpu()
241
+ y = batch.cpu() # should already be on CPU but just in case
242
+ rvm = self.rvm(y_pred, y)
243
+ lp.update(**rvm)
244
+ if len(metrics) == 0:
245
+ metrics = rvm
246
+ else:
247
+ for key in rvm.keys():
248
+ metrics[key] = (metrics[key] * n + rvm[key]) / (n + 1)
+ n += 1  # count accumulated batches so this stays a true running mean
249
+ metrics = flashy.distrib.average_metrics(metrics)
250
+ return metrics
251
+
252
+ @torch.no_grad()
253
+ def regenerate(self, wav: torch.Tensor, step_list: tp.Optional[list] = None):
254
+ """Regenerate the given waveform."""
255
+ condition = self.get_condition(wav)
256
+ initial = self.schedule.get_initial_noise(self.data_processor.process_data(wav)) # sampling rate changes.
257
+ result = self.schedule.generate_subsampled(self.model, initial=initial, condition=condition,
258
+ step_list=step_list)
259
+ result = self.data_processor.inverse_process(result)
260
+ return result
261
+
262
+ def generate(self):
263
+ """Generate stage."""
264
+ sample_manager = SampleManager(self.xp)
265
+ self.model.eval()
266
+ generate_stage_name = f'{self.current_stage}'
267
+
268
+ loader = self.dataloaders['generate']
269
+ updates = len(loader)
270
+ lp = self.log_progress(generate_stage_name, loader, total=updates, updates=self.log_updates)
271
+
272
+ for batch in lp:
273
+ reference, _ = batch
274
+ reference = reference.to(self.device)
275
+ estimate = self.regenerate(reference)
276
+ reference = reference.cpu()
277
+ estimate = estimate.cpu()
278
+ sample_manager.add_samples(estimate, self.epoch, ground_truth_wavs=reference)
279
+ flashy.distrib.barrier()
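
The evaluation loop above reduces to calling the relative volume mel metric on pairs of waveforms; a minimal sketch with random tensors standing in for real audio:

    import torch
    from audiocraft.metrics import RelativeVolumeMel

    rvm = RelativeVolumeMel()
    y_pred = torch.randn(1, 1, 24000)  # one second of fake audio
    y = torch.randn(1, 1, 24000)
    print(rvm(y_pred, y))  # dict of RVM values; exact keys depend on the metric config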
audiocraft/solvers/magnet.py ADDED
@@ -0,0 +1,276 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from omegaconf import DictConfig
8
+ from . import builders, musicgen
9
+ from einops import rearrange
10
+ from torch.nn import functional as F
11
+ from ..modules.conditioners import SegmentWithAttributes
12
+
13
+ import torch
14
+ import numpy as np
15
+ import random
16
+ import typing as tp
17
+ import math
18
+ import flashy
19
+
20
+
21
+ class MagnetSolver(musicgen.MusicGenSolver):
22
+ """Solver for MAGNeT - Masked Audio Generation using
23
+ a single Non-autoregressive Transformer https://arxiv.org/abs/2401.04577.
24
+ """
25
+ def __init__(self, cfg: DictConfig):
26
+ super().__init__(cfg)
27
+
28
+ # initialize generation parameters by config
29
+ self.generation_params = {
30
+ 'use_sampling': self.cfg.generate.lm.use_sampling,
31
+ 'temp': self.cfg.generate.lm.temp,
32
+ 'top_k': self.cfg.generate.lm.top_k,
33
+ 'top_p': self.cfg.generate.lm.top_p,
34
+ 'max_cfg_coef': self.cfg.generate.lm.max_cfg_coef,
35
+ 'min_cfg_coef': self.cfg.generate.lm.min_cfg_coef,
36
+ 'decoding_steps': list(self.cfg.generate.lm.decoding_steps),
37
+ 'anneal_temp': self.cfg.generate.lm.anneal_temp,
38
+ 'span_scoring': self.cfg.generate.lm.span_scoring,
39
+ 'span_arrangement': self.cfg.generate.lm.span_arrangement
40
+ }
41
+
42
+ sequence_len = int(cfg.dataset.segment_duration * self.compression_model.frame_rate)
43
+ self.mean_maskrate_to_u = torch.tensor(self._calc_mean_maskrate_to_u_LUT(sequence_len), device=self.device)
44
+ self.ce_per_codebook = [torch.log(torch.tensor(self.compression_model.cardinality, device=self.device))
45
+ for _ in range(cfg.transformer_lm.n_q)]
46
+
47
+ def build_model(self) -> None:
48
+ self.cfg.transformer_lm.segment_duration = self.cfg.dataset.segment_duration
49
+ self.cfg.transformer_lm.span_len = self.cfg.masking.span_len
50
+ assert self.cfg.efficient_attention_backend == "xformers", "MAGNeT v1 models support only xformers backend."
51
+ super().build_model()
52
+
53
+ def _calc_mean_maskrate_to_u_LUT(self, T: int):
54
+ """ Create a Look Up Table (LUT) transforming a discrete masking percentage m in 0,1,...,100 to u,
55
+ the number of overlapping spans of length L to place s.t. the masking rate is approximately m/float(100).
56
+ It first creates the inverse transformation, of the masking rate as function of u,
57
+ using the expression choose(T - L, u) / choose(T, u), where L is the atomic span length used
58
+ during masking. See https://arxiv.org/abs/2401.04577,
59
+ appendix C, for the mean mask rate derivation.
60
+
61
+ We leverage the fact that:
62
+ choose(T - L, u) / choose(T, u) = Prod_{j = 0}^{u - 1}((T - L - j)/(T - j))
63
+ in the provided implementation, in order to avoid overflow.
64
+ Args:
65
+ T (int): Sequence length.
66
+ Returns:
67
+ (List) A LUT transforming m in 0,1,...,100 to u,
68
+ s.t. the masking rate of the span-L mask is approximately m/float(100).
69
+ """
70
+
71
+ L = self.cfg.masking.span_len
72
+
73
+ u2mean = [0.0] # mean mask rate is 0.0 for u = 0
74
+ v = (T - L) / float(T)
75
+ for u in range(1, T):
76
+ u2mean.append(1 - v)
77
+ v *= (T - L - u) / (T - u) # Overflow-safe implementation of choose(T - L, u) / choose(T, u).
78
+
79
+ mean2u = []
80
+ for maskperc in range(101):
81
+ maskrate = maskperc / float(100)
82
+ u = int(np.searchsorted(u2mean, maskrate))
83
+ mean2u.append(u)
84
+
85
+ return mean2u
86
+
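
The overflow-safe recurrence in the docstring can be checked against the binomial ratio directly; a standalone sketch with toy values:

    from math import comb

    T, L = 10, 3  # toy sequence length and span length
    u2mean = [0.0]
    v = (T - L) / float(T)
    for u in range(1, T):
        u2mean.append(1 - v)        # mean mask rate with u span starts
        v *= (T - L - u) / (T - u)  # incremental choose(T - L, u) / choose(T, u)

    assert abs(u2mean[2] - (1 - comb(T - L, 2) / comb(T, 2))) < 1e-9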
87
+ def _non_spans_mask(self, mask_probs: torch.Tensor, B: int, T: int, device: torch.device) -> torch.Tensor:
88
+ """ Construct a boolean mask of shape [B, T], with masking rates defined by mask_probs.
89
+ The masked tokens are singletons, placed uniformly at random.
90
+ Args:
91
+ mask_probs (torch.Tensor): The desired masking rate per sample, of shape [B,]
92
+ B (int): Batch size.
93
+ T (int): Sequence length.
94
+ device (torch.device): device of the output tensor
95
+ Returns:
96
+ (torch.Tensor): A mask of shape [B, T]
97
+ """
98
+ num_token_masked = (T * mask_probs).round().clamp(min=1)
99
+ batch_randperm = torch.rand((B, T), device=device).argsort(dim=-1)
100
+ return batch_randperm < rearrange(num_token_masked, 'b -> b 1')
101
+
102
+ def _spans_mask(self, mask_probs: torch.Tensor, B: int, T: int, device: torch.device) -> torch.Tensor:
103
+ """ Construct a spans mask with masking rates defined by mask_probs,
104
+ where the atomic span length ( > 1 ) is defined by cfg.masking.span_len.
105
+ Args:
106
+ mask_probs (torch.Tensor): The desired masking rate per sample, of shape [B,]
107
+ B (int): Batch size.
108
+ T (int): Sequence length.
109
+ device (torch.device): device of the output tensor
110
+ Returns:
111
+ (torch.Tensor): A spans mask of shape [B, T]
112
+ """
113
+ rounded_probs = torch.round(100 * mask_probs).long()
114
+ k = self.mean_maskrate_to_u[rounded_probs].clamp(min=1) # k is the number of span starts
115
+
116
+ # sample random span starts
117
+ batch_randperm = torch.rand((B, T), device=device).argsort(dim=-1)
118
+ mask = batch_randperm < rearrange(k, 'b -> b 1')
119
+ B, T = mask.shape
120
+ shifted_mask = mask.clone()
121
+ for _ in range(self.cfg.masking.span_len - 1):
122
+ shifted_mask = torch.concat((torch.full((B, 1), False, device=device), shifted_mask[:, :-1]), dim=1)
123
+ mask = torch.logical_or(mask, shifted_mask)
124
+
125
+ return mask
126
+
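
The shifted-OR trick above grows each sampled start into a full span; on a tiny example (one start at position 2, span length 3):

    import torch

    B, T, span_len = 1, 8, 3
    mask = torch.zeros(B, T, dtype=torch.bool)
    mask[0, 2] = True  # one sampled span start
    shifted = mask.clone()
    for _ in range(span_len - 1):
        shifted = torch.cat((torch.zeros(B, 1, dtype=torch.bool), shifted[:, :-1]), dim=1)
        mask = mask | shifted
    print(mask.int())  # tensor([[0, 0, 1, 1, 1, 0, 0, 0]])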
127
+ def _get_mask(self, mask_probs: torch.Tensor, B: int, T: int, device: torch.device) -> torch.Tensor:
128
+ """ Construct a boolean mask with masking rates defined by mask_probs, and atomic
129
+ span length defined by cfg.masking.span_len.
130
+ Args:
131
+ mask_probs (torch.Tensor): The desired masking rate per sample, of shape [B,]
132
+ B (int): Batch size.
133
+ T (int): Sequence length.
134
+ device (torch.device): device of the output tensor
135
+ Returns:
136
+ (torch.Tensor): A boolean tensor of shape [B, T]
137
+ """
138
+ if self.cfg.masking.span_len <= 1:
139
+ return self._non_spans_mask(mask_probs, B, T, device)
140
+
141
+ return self._spans_mask(mask_probs, B, T, device)
142
+
143
+ def _compute_cross_entropy_magnet(self, logits: torch.Tensor,
144
+ targets: torch.Tensor, mask: torch.Tensor, stage: torch.Tensor) -> torch.Tensor:
145
+ """ Compute cross entropy between multi-codebook targets and model's logits.
146
+ The cross entropy is computed only on a specific codebook, defined by the stage argument.
147
+ Valid timesteps for each codebook are pulled from the mask, where invalid
148
+ timesteps are set to 0.
149
+
150
+ Args:
151
+ logits (torch.Tensor): Model's logits of shape [B, K, T, card].
152
+ targets (torch.Tensor): Target codes, of shape [B, K, T].
153
+ mask (torch.Tensor): Mask for valid target codes, of shape [B, K, T].
154
+ stage (torch.Tensor): The codebook (idx) that is being optimized, as a scalar tensor.
155
+ Returns:
156
+ ce (torch.Tensor): Cross entropy of the codebook that is being optimized.
157
+ """
158
+ assert logits.shape[:-1] == targets.shape
159
+ assert mask.shape == targets.shape
160
+ ce = torch.zeros([], device=targets.device)
161
+ logits_k = logits[:, stage, ...].contiguous().view(-1, logits.size(-1)) # [B x T, card]
162
+ targets_k = targets[:, stage, ...].contiguous().view(-1) # [B x T]
163
+ mask_k = mask[:, stage, ...].contiguous().view(-1) # [B x T]
164
+
165
+ IGNORE_IDX = -1
166
+ targets_k[~mask_k] = IGNORE_IDX
167
+ q_ce = F.cross_entropy(logits_k, targets_k, ignore_index=IGNORE_IDX)
168
+
169
+ ce += q_ce
170
+ return ce
171
+
172
+ def run_step(self, idx: int, batch: tp.Tuple[torch.Tensor, tp.List[SegmentWithAttributes]], metrics: dict) -> dict:
173
+ """Perform one training or valid step on a given batch."""
174
+ check_synchronization_points = idx == 1 and self.device == 'cuda'
175
+
176
+ condition_tensors, audio_tokens, padding_mask = self._prepare_tokens_and_attributes(
177
+ batch, check_synchronization_points)
178
+
179
+ self.deadlock_detect.update('tokens_and_conditions')
180
+
181
+ if check_synchronization_points:
182
+ torch.cuda.set_sync_debug_mode('warn')
183
+
184
+ B, K, T = audio_tokens.shape
185
+ device = self.device
186
+
187
+ # Choose the stage (codebook idx) for update, uniformly at random.
188
+ stage_ = random.randint(0, K - 1)
189
+ stage = torch.full((1, ), stage_, device=device)
190
+
191
+ # masking
192
+ rand_time = torch.zeros((B,), device=device).float().uniform_(0, 1)
193
+ rand_mask_probs = torch.cos(rand_time * math.pi * 0.5)
194
+
195
+ # stage mask
196
+ stage_mask = self._get_mask(rand_mask_probs, B, T, device) # [B, T]
197
+ stage_mask = stage_mask.unsqueeze(1) # [B, 1, T]
198
+
199
+ # Keep all preceding codebooks.
200
+ mask = torch.full((B, K, T), False, device=device)
201
+ mask[:, stage, :] = stage_mask
202
+
203
+ # Mask all codebooks larger than stage_
204
+ mask_id = self.model.special_token_id
205
+ mask[:, (stage_+1):, :] = torch.full((B, K - stage_ - 1, T), True, device=device)
206
+ input_tokens = torch.where(mask, mask_id, audio_tokens)
207
+
208
+ # Take loss only on the chosen stage, and only on the masked tokens.
209
+ loss_mask = torch.full((B, K, T), False, device=device)
210
+ loss_mask[:, stage, :] = stage_mask
211
+
212
+ with self.autocast:
213
+ model_output = self.model.compute_predictions(input_tokens, [], condition_tensors, stage=stage_)
214
+ logits = model_output.logits
215
+ loss_mask &= padding_mask
216
+ ce = self._compute_cross_entropy_magnet(logits, audio_tokens, loss_mask, stage)
217
+ loss = ce
218
+ self.deadlock_detect.update('loss')
219
+
220
+ if check_synchronization_points:
221
+ torch.cuda.set_sync_debug_mode('default')
222
+
223
+ if self.is_training:
224
+ metrics['lr'] = self.optimizer.param_groups[0]['lr']
225
+ if self.scaler is not None:
226
+ loss = self.scaler.scale(loss)
227
+ self.deadlock_detect.update('scale')
228
+ if self.cfg.fsdp.use:
229
+ loss.backward()
230
+ flashy.distrib.average_tensors(self.model.buffers())
231
+ elif self.cfg.optim.eager_sync:
232
+ with flashy.distrib.eager_sync_model(self.model):
233
+ loss.backward()
234
+ else:
235
+ # this should always be slower but can be useful
236
+ # for weird use cases like multiple backwards.
237
+ loss.backward()
238
+ flashy.distrib.sync_model(self.model)
239
+ self.deadlock_detect.update('backward')
240
+
241
+ if self.scaler is not None:
242
+ self.scaler.unscale_(self.optimizer)
243
+ if self.cfg.optim.max_norm:
244
+ if self.cfg.fsdp.use:
245
+ metrics['grad_norm'] = self.model.clip_grad_norm_(self.cfg.optim.max_norm) # type: ignore
246
+ else:
247
+ metrics['grad_norm'] = torch.nn.utils.clip_grad_norm_(
248
+ self.model.parameters(), self.cfg.optim.max_norm
249
+ )
250
+ if self.scaler is None:
251
+ self.optimizer.step()
252
+ else:
253
+ self.scaler.step(self.optimizer)
254
+ self.scaler.update()
255
+ if self.lr_scheduler:
256
+ self.lr_scheduler.step()
257
+ self.optimizer.zero_grad()
258
+ self.deadlock_detect.update('optim')
259
+ if self.scaler is not None:
260
+ scale = self.scaler.get_scale()
261
+ metrics['grad_scale'] = scale
262
+ if not loss.isfinite().all():
263
+ raise RuntimeError("Model probably diverged.")
264
+
265
+ metrics['ce'] = ce
266
+ metrics['ppl'] = torch.exp(ce)
267
+
268
+ return metrics
269
+
270
+
271
+ class AudioMagnetSolver(MagnetSolver):
272
+ """Solver for audio-MAGNeT. A MAGNeT model for sound generation.
273
+
274
+ More information can be found in the MAGNeT model card.
275
+ """
276
+ DATASET_TYPE: builders.DatasetType = builders.DatasetType.SOUND
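
For reference, the cosine schedule used in MagnetSolver.run_step maps a uniform time in [0, 1] to a masking rate cos(t * pi / 2), so early times mask nearly everything and late times nearly nothing; a quick sketch:

    import math
    import torch

    rand_time = torch.tensor([0.0, 0.25, 0.5, 0.75, 1.0])
    mask_probs = torch.cos(rand_time * math.pi * 0.5)
    print(mask_probs)  # approximately [1.00, 0.92, 0.71, 0.38, 0.00]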
audiocraft/solvers/musicgen.py ADDED
@@ -0,0 +1,721 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from pathlib import Path
8
+ import time
9
+ import typing as tp
10
+ import warnings
11
+
12
+ import flashy
13
+ import math
14
+ import omegaconf
15
+ import torch
16
+ from torch.nn import functional as F
17
+
18
+ from . import base, builders
19
+ from .compression import CompressionSolver
20
+ from .. import metrics as eval_metrics
21
+ from .. import models
22
+ from ..data.audio_dataset import AudioDataset
23
+ from ..data.music_dataset import MusicDataset, MusicInfo, AudioInfo
24
+ from ..data.audio_utils import normalize_audio
25
+ from ..modules.conditioners import JointEmbedCondition, SegmentWithAttributes, WavCondition
26
+ from ..utils.cache import CachedBatchWriter, CachedBatchLoader
27
+ from ..utils.samples.manager import SampleManager
28
+ from ..utils.utils import get_dataset_from_loader, is_jsonable, warn_once, model_hash
29
+
30
+
31
+ class MusicGenSolver(base.StandardSolver):
32
+ """Solver for MusicGen training task.
33
+
34
+ Used in: https://arxiv.org/abs/2306.05284
35
+ """
36
+ DATASET_TYPE: builders.DatasetType = builders.DatasetType.MUSIC
37
+
38
+ def __init__(self, cfg: omegaconf.DictConfig):
39
+ super().__init__(cfg)
40
+ # easier access to sampling parameters
41
+ self.generation_params = {
42
+ 'use_sampling': self.cfg.generate.lm.use_sampling,
43
+ 'temp': self.cfg.generate.lm.temp,
44
+ 'top_k': self.cfg.generate.lm.top_k,
45
+ 'top_p': self.cfg.generate.lm.top_p,
46
+ }
47
+ self._best_metric_name: tp.Optional[str] = 'ce'
48
+
49
+ self._cached_batch_writer = None
50
+ self._cached_batch_loader = None
51
+ if cfg.cache.path:
52
+ if cfg.cache.write:
53
+ self._cached_batch_writer = CachedBatchWriter(Path(cfg.cache.path))
54
+ if self.cfg.cache.write_num_shards:
55
+ self.logger.warning("Multiple shard cache, best_metric_name will be set to None.")
56
+ self._best_metric_name = None
57
+ else:
58
+ self._cached_batch_loader = CachedBatchLoader(
59
+ Path(cfg.cache.path), cfg.dataset.batch_size, cfg.dataset.num_workers,
60
+ min_length=self.cfg.optim.updates_per_epoch or 1)
61
+ self.dataloaders['original_train'] = self.dataloaders['train']
62
+ self.dataloaders['train'] = self._cached_batch_loader # type: ignore
63
+
64
+ @staticmethod
65
+ def get_eval_solver_from_sig(sig: str, dtype: tp.Optional[str] = None,
66
+ device: tp.Optional[str] = None, autocast: bool = True,
67
+ batch_size: tp.Optional[int] = None,
68
+ override_cfg: tp.Optional[tp.Union[dict, omegaconf.DictConfig]] = None,
69
+ **kwargs):
70
+ """Mostly a convenience function around audiocraft.train.get_solver_from_sig,
71
+ populating all the proper params, deactivating EMA and FSDP, loading the best state,
72
+ basically all you need to get a solver ready to "play" with in single GPU mode
73
+ and with minimal memory overhead.
74
+
75
+ Args:
76
+ sig (str): signature to load.
77
+ dtype (str or None): potential dtype, as a string, i.e. 'float16'.
78
+ device (str or None): potential device, as a string, i.e. 'cuda'.
79
+ override_cfg (dict or omegaconf.DictConfig or None): optional configuration overrides merged into the solver config.
80
+ """
81
+ from audiocraft import train
82
+ our_override_cfg: tp.Dict[str, tp.Any] = {'optim': {'ema': {'use': False}}}
83
+ our_override_cfg['autocast'] = autocast
84
+ if dtype is not None:
85
+ our_override_cfg['dtype'] = dtype
86
+ if device is not None:
87
+ our_override_cfg['device'] = device
88
+ if batch_size is not None:
89
+ our_override_cfg['dataset'] = {'batch_size': batch_size}
90
+ if override_cfg is None:
91
+ override_cfg = {}
92
+ override_cfg = omegaconf.OmegaConf.merge(
93
+ omegaconf.DictConfig(override_cfg), omegaconf.DictConfig(our_override_cfg)) # type: ignore
94
+ solver = train.get_solver_from_sig(
95
+ sig, override_cfg=override_cfg,
96
+ load_best=True, disable_fsdp=True,
97
+ ignore_state_keys=['optimizer', 'ema'], **kwargs)
98
+ solver.model.eval()
99
+ return solver
100
+
101
+ def get_formatter(self, stage_name: str) -> flashy.Formatter:
102
+ return flashy.Formatter({
103
+ 'lr': '.2E',
104
+ 'ce': '.3f',
105
+ 'ppl': '.3f',
106
+ 'grad_norm': '.3E',
107
+ }, exclude_keys=['ce_q*', 'ppl_q*'])
108
+
109
+ @property
110
+ def best_metric_name(self) -> tp.Optional[str]:
111
+ return self._best_metric_name
112
+
113
+ def build_model(self) -> None:
114
+ """Instantiate models and optimizer."""
115
+ # we can potentially not use all quantizers with which the EnCodec model was trained
116
+ # (e.g. we trained the model with quantizers dropout)
117
+ self.compression_model = CompressionSolver.wrapped_model_from_checkpoint(
118
+ self.cfg, self.cfg.compression_model_checkpoint, device=self.device)
119
+ assert self.compression_model.sample_rate == self.cfg.sample_rate, (
120
+ f"Compression model sample rate is {self.compression_model.sample_rate} but "
121
+ f"Solver sample rate is {self.cfg.sample_rate}."
122
+ )
123
+ # ensure we have matching configuration between LM and compression model
124
+ assert self.cfg.transformer_lm.card == self.compression_model.cardinality, (
125
+ "Cardinalities of the LM and compression model don't match: ",
126
+ f"LM cardinality is {self.cfg.transformer_lm.card} vs ",
127
+ f"compression model cardinality is {self.compression_model.cardinality}"
128
+ )
129
+ assert self.cfg.transformer_lm.n_q == self.compression_model.num_codebooks, (
130
+ "Numbers of codebooks of the LM and compression models don't match: ",
131
+ f"LM number of codebooks is {self.cfg.transformer_lm.n_q} vs ",
132
+ f"compression model number of codebooks is {self.compression_model.num_codebooks}"
133
+ )
134
+ self.logger.info("Compression model has %d codebooks with %d cardinality, and a framerate of %d",
135
+ self.compression_model.num_codebooks, self.compression_model.cardinality,
136
+ self.compression_model.frame_rate)
137
+ # instantiate LM model
138
+ self.model: models.LMModel = models.builders.get_lm_model(self.cfg).to(self.device)
139
+ if self.cfg.fsdp.use:
140
+ assert not self.cfg.autocast, "Cannot use autocast with fsdp"
141
+ self.model = self.wrap_with_fsdp(self.model)
142
+ self.register_ema('model')
143
+ # initialize optimization
144
+ self.optimizer = builders.get_optimizer(builders.get_optim_parameter_groups(self.model), self.cfg.optim)
145
+ self.lr_scheduler = builders.get_lr_scheduler(self.optimizer, self.cfg.schedule, self.total_updates)
146
+ self.register_stateful('model', 'optimizer', 'lr_scheduler')
147
+ self.register_best_state('model')
148
+ self.autocast_dtype = {
149
+ 'float16': torch.float16, 'bfloat16': torch.bfloat16
150
+ }[self.cfg.autocast_dtype]
151
+ self.scaler: tp.Optional[torch.cuda.amp.GradScaler] = None
152
+ if self.cfg.fsdp.use:
153
+ need_scaler = self.cfg.fsdp.param_dtype == 'float16'
154
+ else:
155
+ need_scaler = self.cfg.autocast and self.autocast_dtype is torch.float16
156
+ if need_scaler:
157
+ if self.cfg.fsdp.use:
158
+ from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
159
+ self.scaler = ShardedGradScaler() # type: ignore
160
+ else:
161
+ self.scaler = torch.cuda.amp.GradScaler()
162
+ self.register_stateful('scaler')
163
+
164
+ def build_dataloaders(self) -> None:
165
+ """Instantiate audio dataloaders for each stage."""
166
+ self.dataloaders = builders.get_audio_datasets(self.cfg, dataset_type=self.DATASET_TYPE)
167
+
168
+ def show(self) -> None:
169
+ """Show the compression model and LM model."""
170
+ self.logger.info("Compression model:")
171
+ self.log_model_summary(self.compression_model)
172
+ self.logger.info("LM model:")
173
+ self.log_model_summary(self.model)
174
+
175
+ def load_state_dict(self, state: dict) -> None:
176
+ if 'condition_provider' in state:
177
+ model_state = state['model']
178
+ condition_provider_state = state.pop('condition_provider')
179
+ prefix = 'condition_provider.'
180
+ for key, value in condition_provider_state.items():
181
+ key = prefix + key
182
+ assert key not in model_state
183
+ model_state[key] = value
184
+ if 'compression_model' in state:
185
+ # We used to store the `compression_model` state in the checkpoint, however
186
+ # this is in general not needed, as the compression model should always be readable
187
+ # from the original `cfg.compression_model_checkpoint` location.
188
+ compression_model_state = state.pop('compression_model')
189
+ before_hash = model_hash(self.compression_model)
190
+ self.compression_model.load_state_dict(compression_model_state)
191
+ after_hash = model_hash(self.compression_model)
192
+ if before_hash != after_hash:
193
+ raise RuntimeError(
194
+ "The compression model state inside the checkpoint is different"
195
+ " from the one obtained from compression_model_checkpoint. "
196
+ "We do not support altering the compression model inside the LM "
197
+ "checkpoint as parts of the code, in particular for running eval post-training "
198
+ "will use the compression_model_checkpoint as the source of truth.")
199
+
200
+ super().load_state_dict(state)
201
+
202
+ def load_from_pretrained(self, name: str):
203
+ # TODO: support native HF versions of MusicGen.
204
+ lm_pkg = models.loaders.load_lm_model_ckpt(name)
205
+ state: dict = {
206
+ 'best_state': {
207
+ 'model': lm_pkg['best_state'],
208
+ },
209
+ }
210
+ return state
211
+
212
+ def _compute_cross_entropy(
213
+ self, logits: torch.Tensor, targets: torch.Tensor, mask: torch.Tensor
214
+ ) -> tp.Tuple[torch.Tensor, tp.List[torch.Tensor]]:
215
+ """Compute cross entropy between multi-codebook targets and model's logits.
216
+ The cross entropy is computed per codebook to provide codebook-level cross entropy.
217
+ Valid timesteps for each of the codebook are pulled from the mask, where invalid
218
+ timesteps are set to 0.
219
+
220
+ Args:
221
+ logits (torch.Tensor): Model's logits of shape [B, K, T, card].
222
+ targets (torch.Tensor): Target codes, of shape [B, K, T].
223
+ mask (torch.Tensor): Mask for valid target codes, of shape [B, K, T].
224
+ Returns:
225
+ ce (torch.Tensor): Cross entropy averaged over the codebooks
226
+ ce_per_codebook (list of torch.Tensor): Cross entropy per codebook (detached).
227
+ """
228
+ B, K, T = targets.shape
229
+ assert logits.shape[:-1] == targets.shape
230
+ assert mask.shape == targets.shape
231
+ ce = torch.zeros([], device=targets.device)
232
+ ce_per_codebook: tp.List[torch.Tensor] = []
233
+ for k in range(K):
234
+ logits_k = logits[:, k, ...].contiguous().view(-1, logits.size(-1)) # [B x T, card]
235
+ targets_k = targets[:, k, ...].contiguous().view(-1) # [B x T]
236
+ mask_k = mask[:, k, ...].contiguous().view(-1) # [B x T]
237
+ ce_targets = targets_k[mask_k]
238
+ ce_logits = logits_k[mask_k]
239
+ q_ce = F.cross_entropy(ce_logits, ce_targets)
240
+ ce += q_ce
241
+ ce_per_codebook.append(q_ce.detach())
242
+ # average cross entropy across codebooks
243
+ ce = ce / K
244
+ return ce, ce_per_codebook
245
+
246
+ def _prepare_tokens_and_attributes(
247
+ self, batch: tp.Tuple[torch.Tensor, tp.List[SegmentWithAttributes]],
248
+ check_synchronization_points: bool = False
249
+ ) -> tp.Tuple[dict, torch.Tensor, torch.Tensor]:
250
+ """Prepare input batches for language model training.
251
+
252
+ Args:
253
+ batch (tuple[torch.Tensor, list[SegmentWithAttributes]]): Input batch with audio tensor of shape [B, C, T]
254
+ and corresponding metadata as SegmentWithAttributes (with B items).
255
+ check_synchronization_points (bool): Whether to check for synchronization points slowing down training.
256
+ Returns:
257
+ Condition tensors (dict[str, any]): Preprocessed condition attributes.
258
+ Tokens (torch.Tensor): Audio tokens from compression model, of shape [B, K, T_s],
259
+ with B the batch size, K the number of codebooks, T_s the token timesteps.
260
+ Padding mask (torch.Tensor): Mask with valid positions in the tokens tensor, of shape [B, K, T_s].
261
+ """
262
+ if self.model.training:
263
+ warnings.warn(
264
+ "Up to version 1.0.1, the _prepare_tokens_and_attributes was evaluated with `torch.no_grad()`. "
265
+ "This is inconsistent with how the models were trained in the MusicGen paper. We removed the "
266
+ "`torch.no_grad()` in version 1.1.0. Small changes to the final performance are expected. "
267
+ "Really sorry about that.")
268
+ if self._cached_batch_loader is None or self.current_stage != "train":
269
+ audio, infos = batch
270
+ audio = audio.to(self.device)
271
+ audio_tokens = None
272
+ assert audio.size(0) == len(infos), (
273
+ f"Mismatch between number of items in audio batch ({audio.size(0)})",
274
+ f" and in metadata ({len(infos)})"
275
+ )
276
+ else:
277
+ audio = None
278
+ # In that case the batch will be a tuple coming from the _cached_batch_writer bit below.
279
+ infos, = batch # type: ignore
280
+ assert all([isinstance(info, AudioInfo) for info in infos])
281
+ assert all([info.audio_tokens is not None for info in infos]) # type: ignore
282
+ audio_tokens = torch.stack([info.audio_tokens for info in infos]).to(self.device) # type: ignore
283
+ audio_tokens = audio_tokens.long()
284
+ for info in infos:
285
+ if isinstance(info, MusicInfo):
286
+ # Careful here, if you want to use this condition_wav (e.g. chroma conditioning),
287
+ # then you must be using the chroma cache! otherwise the code will try
288
+ # to use this segment and fail (by that I mean you will see NaN everywhere).
289
+ info.self_wav = WavCondition(
290
+ torch.full([1, info.channels, info.total_frames], float('NaN')),
291
+ length=torch.tensor([info.n_frames]),
292
+ sample_rate=[info.sample_rate],
293
+ path=[info.meta.path],
294
+ seek_time=[info.seek_time])
295
+ dataset = get_dataset_from_loader(self.dataloaders['original_train'])
296
+ assert isinstance(dataset, MusicDataset), type(dataset)
297
+ if dataset.paraphraser is not None and info.description is not None:
298
+ # Hackily reapply the paraphraser when using the cache.
299
+ info.description = dataset.paraphraser.sample_paraphrase(
300
+ info.meta.path, info.description)
301
+ # prepare attributes
302
+ attributes = [info.to_condition_attributes() for info in infos]
303
+ attributes = self.model.cfg_dropout(attributes)
304
+ attributes = self.model.att_dropout(attributes)
305
+ tokenized = self.model.condition_provider.tokenize(attributes)
306
+
307
+ # Now we should be synchronization free.
308
+ if self.device == "cuda" and check_synchronization_points:
309
+ torch.cuda.set_sync_debug_mode("warn")
310
+
311
+ if audio_tokens is None:
312
+ with torch.no_grad():
313
+ audio_tokens, scale = self.compression_model.encode(audio)
314
+ assert scale is None, "Scaled compression model not supported with LM."
315
+
316
+ with self.autocast:
317
+ condition_tensors = self.model.condition_provider(tokenized)
318
+
319
+ # create a padding mask to hold valid vs invalid positions
320
+ padding_mask = torch.ones_like(audio_tokens, dtype=torch.bool, device=audio_tokens.device)
321
+ # replace encodec tokens from padded audio with special_token_id
322
+ if self.cfg.tokens.padding_with_special_token:
323
+ audio_tokens = audio_tokens.clone()
324
+ padding_mask = padding_mask.clone()
325
+ token_sample_rate = self.compression_model.frame_rate
326
+ B, K, T_s = audio_tokens.shape
327
+ for i in range(B):
328
+ n_samples = infos[i].n_frames
329
+ audio_sample_rate = infos[i].sample_rate
330
+ # take the last token generated from actual audio frames (non-padded audio)
331
+ valid_tokens = math.floor(float(n_samples) / audio_sample_rate * token_sample_rate)
332
+ audio_tokens[i, :, valid_tokens:] = self.model.special_token_id
333
+ padding_mask[i, :, valid_tokens:] = 0
334
+
335
+ if self.device == "cuda" and check_synchronization_points:
336
+ torch.cuda.set_sync_debug_mode("default")
337
+
338
+ if self._cached_batch_writer is not None and self.current_stage == 'train':
339
+ assert self._cached_batch_loader is None
340
+ assert audio_tokens is not None
341
+ for info, one_audio_tokens in zip(infos, audio_tokens):
342
+ assert isinstance(info, AudioInfo)
343
+ if isinstance(info, MusicInfo):
344
+ assert not info.joint_embed, "joint_embed and cache not supported yet."
345
+ info.self_wav = None
346
+ assert one_audio_tokens.max() < 2**15, one_audio_tokens.max().item()
347
+ info.audio_tokens = one_audio_tokens.short().cpu()
348
+ self._cached_batch_writer.save(infos)
349
+
350
+ return condition_tensors, audio_tokens, padding_mask
351
+
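
The padding logic above converts audio frame counts into token counts; a worked example with round numbers (the rates below are hypothetical):

    import math

    n_samples = 48000          # 1.5 s of real (non-padded) audio in the segment
    audio_sample_rate = 32000
    token_sample_rate = 50     # compression model frame rate in Hz
    valid_tokens = math.floor(n_samples / audio_sample_rate * token_sample_rate)
    print(valid_tokens)  # 75; tokens past this index get special_token_id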
352
+ def run_step(self, idx: int, batch: tp.Tuple[torch.Tensor, tp.List[SegmentWithAttributes]], metrics: dict) -> dict:
353
+ """Perform one training or valid step on a given batch."""
354
+ check_synchronization_points = idx == 1 and self.device == 'cuda'
355
+
356
+ condition_tensors, audio_tokens, padding_mask = self._prepare_tokens_and_attributes(
357
+ batch, check_synchronization_points)
358
+
359
+ self.deadlock_detect.update('tokens_and_conditions')
360
+
361
+ if check_synchronization_points:
362
+ torch.cuda.set_sync_debug_mode('warn')
363
+
364
+ with self.autocast:
365
+ model_output = self.model.compute_predictions(audio_tokens, [], condition_tensors) # type: ignore
366
+ logits = model_output.logits
367
+ mask = padding_mask & model_output.mask
368
+ ce, ce_per_codebook = self._compute_cross_entropy(logits, audio_tokens, mask)
369
+ loss = ce
370
+ self.deadlock_detect.update('loss')
371
+
372
+ if check_synchronization_points:
373
+ torch.cuda.set_sync_debug_mode('default')
374
+
375
+ if self.is_training:
376
+ metrics['lr'] = self.optimizer.param_groups[0]['lr']
377
+ if self.scaler is not None:
378
+ loss = self.scaler.scale(loss)
379
+ self.deadlock_detect.update('scale')
380
+ if self.cfg.fsdp.use:
381
+ loss.backward()
382
+ flashy.distrib.average_tensors(self.model.buffers())
383
+ elif self.cfg.optim.eager_sync:
384
+ with flashy.distrib.eager_sync_model(self.model):
385
+ loss.backward()
386
+ else:
387
+ # this should always be slower but can be useful
388
+ # for weird use cases like multiple backwards.
389
+ loss.backward()
390
+ flashy.distrib.sync_model(self.model)
391
+ self.deadlock_detect.update('backward')
392
+
393
+ if self.scaler is not None:
394
+ self.scaler.unscale_(self.optimizer)
395
+ if self.cfg.optim.max_norm:
396
+ if self.cfg.fsdp.use:
397
+ metrics['grad_norm'] = self.model.clip_grad_norm_(self.cfg.optim.max_norm) # type: ignore
398
+ else:
399
+ metrics['grad_norm'] = torch.nn.utils.clip_grad_norm_(
400
+ self.model.parameters(), self.cfg.optim.max_norm
401
+ )
402
+ if self.scaler is None:
403
+ self.optimizer.step()
404
+ else:
405
+ self.scaler.step(self.optimizer)
406
+ self.scaler.update()
407
+ if self.lr_scheduler:
408
+ self.lr_scheduler.step()
409
+ self.optimizer.zero_grad()
410
+ self.deadlock_detect.update('optim')
411
+ if self.scaler is not None:
412
+ scale = self.scaler.get_scale()
413
+ metrics['grad_scale'] = scale
414
+ if not loss.isfinite().all():
415
+ raise RuntimeError("Model probably diverged.")
416
+
417
+ metrics['ce'] = ce
418
+ metrics['ppl'] = torch.exp(ce)
419
+ for k, ce_q in enumerate(ce_per_codebook):
420
+ metrics[f'ce_q{k + 1}'] = ce_q
421
+ metrics[f'ppl_q{k + 1}'] = torch.exp(ce_q)
422
+
423
+ return metrics
424
+
425
+ @torch.no_grad()
426
+ def run_generate_step(self, batch: tp.Tuple[torch.Tensor, tp.List[SegmentWithAttributes]],
427
+ gen_duration: float, prompt_duration: tp.Optional[float] = None,
428
+ remove_prompt: bool = False,
429
+ **generation_params) -> dict:
430
+ """Run generate step on a batch of optional audio tensor and corresponding attributes.
431
+
432
+ Args:
433
+ batch (tuple[torch.Tensor, list[SegmentWithAttributes]]): Batch of audio tensor and corresponding metadata.
435
+ gen_duration (float): Target audio duration for the generation.
436
+ prompt_duration (float, optional): Duration for the audio prompt to use for continuation.
437
+ remove_prompt (bool, optional): Whether to remove the prompt from the generated audio.
438
+ generation_params: Additional generation parameters.
439
+ Returns:
440
+ gen_outputs (dict): Generation outputs, consisting of audio and audio tokens from both the generation
441
+ and the prompt along with additional information.
442
+ """
443
+ bench_start = time.time()
444
+ audio, meta = batch
445
+ assert audio.size(0) == len(meta), (
446
+ f"Mismatch between number of items in audio batch ({audio.size(0)})",
447
+ f" and in metadata ({len(meta)})"
448
+ )
449
+ # prepare attributes
450
+ attributes = [x.to_condition_attributes() for x in meta]
451
+ # TODO: Add dropout for chroma?
452
+
453
+ # prepare audio prompt
454
+ if prompt_duration is None:
455
+ prompt_audio = None
456
+ else:
457
+ assert prompt_duration < gen_duration, "Prompt duration must be lower than target generation duration"
458
+ prompt_audio_frames = int(prompt_duration * self.compression_model.sample_rate)
459
+ prompt_audio = audio[..., :prompt_audio_frames]
460
+
461
+ # get audio tokens from compression model
462
+ if prompt_audio is None or prompt_audio.nelement() == 0:
463
+ num_samples = len(attributes)
464
+ prompt_tokens = None
465
+ else:
466
+ num_samples = None
467
+ prompt_audio = prompt_audio.to(self.device)
468
+ prompt_tokens, scale = self.compression_model.encode(prompt_audio)
469
+ assert scale is None, "Compression model in MusicGen should not require rescaling."
470
+
471
+ # generate by sampling from the LM
472
+ with self.autocast:
473
+ total_gen_len = math.ceil(gen_duration * self.compression_model.frame_rate)
474
+ gen_tokens = self.model.generate(
475
+ prompt_tokens, attributes, max_gen_len=total_gen_len,
476
+ num_samples=num_samples, **self.generation_params)
477
+
478
+ # generate audio from tokens
479
+ assert gen_tokens.dim() == 3
480
+ gen_audio = self.compression_model.decode(gen_tokens, None)
481
+
482
+ bench_end = time.time()
483
+ gen_outputs = {
484
+ 'rtf': (bench_end - bench_start) / gen_duration,
485
+ 'ref_audio': audio,
486
+ 'gen_audio': gen_audio,
487
+ 'gen_tokens': gen_tokens,
488
+ 'prompt_audio': prompt_audio,
489
+ 'prompt_tokens': prompt_tokens,
490
+ }
491
+ return gen_outputs
492
+
493
+ def generate_audio(self) -> dict:
494
+ """Audio generation stage."""
495
+ generate_stage_name = f'{self.current_stage}'
496
+ sample_manager = SampleManager(self.xp)
497
+ self.logger.info(f"Generating samples in {sample_manager.base_folder}")
498
+ loader = self.dataloaders['generate']
499
+ updates = len(loader)
500
+ lp = self.log_progress(generate_stage_name, loader, total=updates, updates=self.log_updates)
501
+
502
+ dataset = get_dataset_from_loader(loader)
503
+ dataset_duration = dataset.segment_duration
504
+ assert dataset_duration is not None
505
+ assert isinstance(dataset, AudioDataset)
506
+ target_duration = self.cfg.generate.lm.gen_duration
507
+ prompt_duration = self.cfg.generate.lm.prompt_duration
508
+ if target_duration is None:
509
+ target_duration = dataset_duration
510
+ if prompt_duration is None:
511
+ prompt_duration = dataset_duration / 4
512
+ assert prompt_duration < dataset_duration, (
513
+ f"Specified prompt duration ({prompt_duration}s) is longer",
514
+ f" than reference audio duration ({dataset_duration}s)"
515
+ )
516
+
517
+ def get_hydrated_conditions(meta: tp.List[SegmentWithAttributes]):
518
+ hydrated_conditions = []
519
+ for sample in [x.to_condition_attributes() for x in meta]:
520
+ cond_dict = {}
521
+ for cond_type in sample.__annotations__.keys():
522
+ for cond_key, cond_val in getattr(sample, cond_type).items():
523
+ if cond_key not in self.model.condition_provider.conditioners.keys():
524
+ continue
525
+ if is_jsonable(cond_val):
526
+ cond_dict[cond_key] = cond_val
527
+ elif isinstance(cond_val, WavCondition):
528
+ cond_dict[cond_key] = cond_val.path
529
+ elif isinstance(cond_val, JointEmbedCondition):
530
+ cond_dict[cond_key] = cond_val.text # only support text at inference for now
531
+ else:
532
+ # if we reached this point, it is not clear how to log the condition
533
+ # so we just log the type.
534
+ cond_dict[cond_key] = str(type(cond_val))
535
+ continue
536
+ hydrated_conditions.append(cond_dict)
537
+ return hydrated_conditions
538
+
539
+ metrics: dict = {}
540
+ average = flashy.averager()
541
+ for batch in lp:
542
+ audio, meta = batch
543
+ # metadata for sample manager
544
+ hydrated_conditions = get_hydrated_conditions(meta)
545
+ sample_generation_params = {
546
+ **{f'classifier_free_guidance_{k}': v for k, v in self.cfg.classifier_free_guidance.items()},
547
+ **self.generation_params
548
+ }
549
+ if self.cfg.generate.lm.unprompted_samples:
550
+ if self.cfg.generate.lm.gen_gt_samples:
551
+ # get the ground truth instead of generation
552
+ self.logger.warning(
553
+ "Use ground truth instead of audio generation as generate.lm.gen_gt_samples=true")
554
+ gen_unprompted_audio = audio
555
+ rtf = 1.
556
+ else:
557
+ gen_unprompted_outputs = self.run_generate_step(
558
+ batch, gen_duration=target_duration, prompt_duration=None,
559
+ **self.generation_params)
560
+ gen_unprompted_audio = gen_unprompted_outputs['gen_audio'].cpu()
561
+ rtf = gen_unprompted_outputs['rtf']
562
+ sample_manager.add_samples(
563
+ gen_unprompted_audio, self.epoch, hydrated_conditions,
564
+ ground_truth_wavs=audio, generation_args=sample_generation_params)
565
+
566
+ if self.cfg.generate.lm.prompted_samples:
567
+ gen_outputs = self.run_generate_step(
568
+ batch, gen_duration=target_duration, prompt_duration=prompt_duration,
569
+ **self.generation_params)
570
+ gen_audio = gen_outputs['gen_audio'].cpu()
571
+ prompt_audio = gen_outputs['prompt_audio'].cpu()
572
+ sample_manager.add_samples(
573
+ gen_audio, self.epoch, hydrated_conditions,
574
+ prompt_wavs=prompt_audio, ground_truth_wavs=audio,
575
+ generation_args=sample_generation_params)
576
+
577
+ metrics['rtf'] = rtf
578
+ metrics = average(metrics)
579
+
580
+ flashy.distrib.barrier()
581
+ return metrics
582
+
583
+ def generate(self) -> dict:
584
+ """Generate stage."""
585
+ self.model.eval()
586
+ with torch.no_grad():
587
+ return self.generate_audio()
588
+
589
+ def run_epoch(self):
590
+ if self.cfg.cache.write:
591
+ if ((self.epoch - 1) % self.cfg.cache.write_num_shards) != self.cfg.cache.write_shard:
592
+ return
593
+ super().run_epoch()
594
+
595
+ def train(self):
596
+ """Train stage.
597
+ """
598
+ if self._cached_batch_writer is not None:
599
+ self._cached_batch_writer.start_epoch(self.epoch)
600
+ if self._cached_batch_loader is None:
601
+ dataset = get_dataset_from_loader(self.dataloaders['train'])
602
+ assert isinstance(dataset, AudioDataset)
603
+ dataset.current_epoch = self.epoch
604
+ else:
605
+ self._cached_batch_loader.start_epoch(self.epoch)
606
+ return super().train()
607
+
+     def evaluate_audio_generation(self) -> dict:
+         """Evaluate audio generation with off-the-shelf metrics."""
+         evaluate_stage_name = f'{self.current_stage}_generation'
+         # instantiate the evaluation metrics; if at least one metric is enabled,
+         # run the audio generation evaluation
+         fad: tp.Optional[eval_metrics.FrechetAudioDistanceMetric] = None
+         kldiv: tp.Optional[eval_metrics.KLDivergenceMetric] = None
+         text_consistency: tp.Optional[eval_metrics.TextConsistencyMetric] = None
+         chroma_cosine: tp.Optional[eval_metrics.ChromaCosineSimilarityMetric] = None
+         should_run_eval = False
+         eval_chroma_wavs: tp.Optional[torch.Tensor] = None
+         if self.cfg.evaluate.metrics.fad:
+             fad = builders.get_fad(self.cfg.metrics.fad).to(self.device)
+             should_run_eval = True
+         if self.cfg.evaluate.metrics.kld:
+             kldiv = builders.get_kldiv(self.cfg.metrics.kld).to(self.device)
+             should_run_eval = True
+         if self.cfg.evaluate.metrics.text_consistency:
+             text_consistency = builders.get_text_consistency(self.cfg.metrics.text_consistency).to(self.device)
+             should_run_eval = True
+         if self.cfg.evaluate.metrics.chroma_cosine:
+             chroma_cosine = builders.get_chroma_cosine_similarity(self.cfg.metrics.chroma_cosine).to(self.device)
+             # if the chroma conditioner has predefined eval wavs, clear them while computing
+             # the cosine metric so the evaluation set's own chromas are used
+             has_predefined_eval_chromas = 'self_wav' in self.model.condition_provider.conditioners and \
+                 self.model.condition_provider.conditioners['self_wav'].has_eval_wavs()
+             if has_predefined_eval_chromas:
+                 warn_once(self.logger, "Attempting to run cosine eval for config with pre-defined eval chromas! "
+                                        "Resetting eval chromas to None for evaluation.")
+                 eval_chroma_wavs = self.model.condition_provider.conditioners.self_wav.eval_wavs  # type: ignore
+                 self.model.condition_provider.conditioners.self_wav.reset_eval_wavs(None)  # type: ignore
+             should_run_eval = True
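The switches above live under evaluate.metrics, while each metric's own options (such as use_gt) live under metrics.<name>. A hedged sketch of the corresponding config fragment, built with omegaconf so the attribute accesses in the code resolve; the values are placeholders:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    'evaluate': {'metrics': {'base': True, 'fad': True, 'kld': False,
                             'text_consistency': False, 'chroma_cosine': False}},
    'metrics': {'fad': {'use_gt': False}},  # per-metric options, e.g. the use_gt switch
})
print(cfg.evaluate.metrics.fad)  # True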
+
+         def get_compressed_audio(audio: torch.Tensor) -> torch.Tensor:
+             audio_tokens, scale = self.compression_model.encode(audio.to(self.device))
+             compressed_audio = self.compression_model.decode(audio_tokens, scale)
+             return compressed_audio[..., :audio.shape[-1]]
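get_compressed_audio round-trips ground-truth audio through the compression model: encode to discrete tokens, decode back, then trim the decoder output to the input length. A toy stand-in illustrating just the trimming logic (ToyCodec is hypothetical, not the real compression model API):

import torch

class ToyCodec:
    """Hypothetical stand-in for the compression model, for illustration only."""
    def encode(self, audio: torch.Tensor):
        # pretend to tokenize: (batch, codebooks, frames), no rescaling
        return torch.zeros(audio.shape[0], 4, 10, dtype=torch.long), None

    def decode(self, tokens: torch.Tensor, scale=None) -> torch.Tensor:
        # decoders may emit a few extra frames of padding
        return torch.zeros(tokens.shape[0], 1, 16384)

audio = torch.randn(2, 1, 16000)
codec = ToyCodec()
tokens, scale = codec.encode(audio)
roundtrip = codec.decode(tokens, scale)[..., :audio.shape[-1]]  # trim to the input length
print(roundtrip.shape)  # torch.Size([2, 1, 16000])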
+
+         metrics: dict = {}
+         if should_run_eval:
+             loader = self.dataloaders['evaluate']
+             updates = len(loader)
+             lp = self.log_progress(f'{evaluate_stage_name} inference', loader, total=updates, updates=self.log_updates)
+             average = flashy.averager()
+             dataset = get_dataset_from_loader(loader)
+             assert isinstance(dataset, AudioDataset)
+             self.logger.info(f"Computing evaluation metrics on {len(dataset)} samples")
+
+             for batch in lp:
+                 audio, meta = batch
+                 assert all(self.cfg.sample_rate == m.sample_rate for m in meta)
+
+                 target_duration = audio.shape[-1] / self.cfg.sample_rate
+                 if self.cfg.evaluate.fixed_generation_duration:
+                     target_duration = self.cfg.evaluate.fixed_generation_duration
+
+                 gen_outputs = self.run_generate_step(
+                     batch, gen_duration=target_duration,
+                     **self.generation_params
+                 )
+                 y_pred = gen_outputs['gen_audio'].detach()
+                 y_pred = y_pred[..., :audio.shape[-1]]
+
+                 normalize_kwargs = dict(self.cfg.generate.audio)
+                 normalize_kwargs.pop('format', None)
+                 y_pred = torch.stack([normalize_audio(w, **normalize_kwargs) for w in y_pred], dim=0).cpu()
+                 y = audio.cpu()  # should already be on CPU but just in case
+                 sizes = torch.tensor([m.n_frames for m in meta])  # actual sizes without padding
+                 sample_rates = torch.tensor([m.sample_rate for m in meta])  # sample rates for audio samples
+                 audio_stems = [Path(m.meta.path).stem + f"_{m.seek_time}" for m in meta]
+
+                 # when <metric>.use_gt is set, the metric is fed the ground truth
+                 # round-tripped through the compression model instead of the generations
+                 if fad is not None:
+                     if self.cfg.metrics.fad.use_gt:
+                         y_pred = get_compressed_audio(y).cpu()
+                     fad.update(y_pred, y, sizes, sample_rates, audio_stems)
+                 if kldiv is not None:
+                     if self.cfg.metrics.kld.use_gt:
+                         y_pred = get_compressed_audio(y).cpu()
+                     kldiv.update(y_pred, y, sizes, sample_rates)
+                 if text_consistency is not None:
+                     texts = [m.description for m in meta]
+                     if self.cfg.metrics.text_consistency.use_gt:
+                         y_pred = y
+                     text_consistency.update(y_pred, texts, sizes, sample_rates)
+                 if chroma_cosine is not None:
+                     if self.cfg.metrics.chroma_cosine.use_gt:
+                         y_pred = get_compressed_audio(y).cpu()
+                     chroma_cosine.update(y_pred, y, sizes, sample_rates)
+             # restore the chroma conditioner's predefined eval chroma wavs
+             if eval_chroma_wavs is not None:
+                 self.model.condition_provider.conditioners['self_wav'].reset_eval_wavs(eval_chroma_wavs)
+
+             flashy.distrib.barrier()
+             if fad is not None:
+                 metrics['fad'] = fad.compute()
+             if kldiv is not None:
+                 kld_metrics = kldiv.compute()
+                 metrics.update(kld_metrics)
+             if text_consistency is not None:
+                 metrics['text_consistency'] = text_consistency.compute()
+             if chroma_cosine is not None:
+                 metrics['chroma_cosine'] = chroma_cosine.compute()
+             metrics = average(metrics)
+             metrics = flashy.distrib.average_metrics(metrics, len(loader))
+
+         return metrics
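flashy.distrib.average_metrics then reconciles the per-worker averages across the distributed job; conceptually it is a count-weighted mean, illustrated here in plain Python (this shows the idea, not flashy's implementation):

# Two workers with per-worker mean metrics and their sample counts.
worker_metrics = [({'fad': 2.0}, 10), ({'fad': 4.0}, 30)]
total = sum(count for _, count in worker_metrics)
fad = sum(m['fad'] * count for m, count in worker_metrics) / total
print(fad)  # 3.5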
+
+     def evaluate(self) -> dict:
+         """Evaluate stage."""
+         self.model.eval()
+         with torch.no_grad():
+             metrics: dict = {}
+             if self.cfg.evaluate.metrics.base:
+                 metrics.update(self.common_train_valid('evaluate'))
+             gen_metrics = self.evaluate_audio_generation()
+             return {**metrics, **gen_metrics}
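Putting it together, the evaluate stage returns one flat metrics dict: the base language-model metrics merged with the generation metrics. A minimal sketch of the merge semantics, with made-up keys and values:

base_metrics = {'ce': 3.1, 'ppl': 22.2}  # e.g. from common_train_valid('evaluate'); values made up
gen_metrics = {'fad': 1.8}               # e.g. from evaluate_audio_generation(); values made up
print({**base_metrics, **gen_metrics})
# {'ce': 3.1, 'ppl': 22.2, 'fad': 1.8}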