Dionyssos committed on
Commit 8639464 · 1 Parent(s): d8e2a3d

prompt_token defaults to torch.zeros()
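In short: LMModel.generate no longer infers the batch size from its inputs, and when no audio prompt is given, generation starts from an empty torch.zeros token tensor. A minimal sketch of that default, distilled from the lm.py diff below (the codebook count is an assumption; EnCodec-based checkpoints typically use 4):

    import torch

    # With no audio prompt, generation starts from an empty
    # [num_samples, num_codebooks, 0] LongTensor and sampled tokens
    # are appended from there.
    num_samples = 1     # one sample is one full wav (see lm.py diff)
    num_codebooks = 4   # assumption: typical EnCodec codebook count
    prompt = torch.zeros((num_samples, num_codebooks, 0), dtype=torch.long)
    B, K, T = prompt.shape
    print(B, K, T)      # -> 1 4 0, i.e. zero prompt tokens, generate from scratch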
audiocraft/audiogen.py CHANGED
@@ -16,37 +16,6 @@ from audiocraft.encodec import CompressionModel
 from audiocraft.genmodel import BaseGenModel
 from audiocraft.lm import LMModel
 from audiocraft.loaders import load_compression_model, load_lm_model
-from .utils.audio_utils import f32_pcm, normalize_audio
-
-
-def audio_write(stem_name,
-                wav,
-                sample_rate,
-                format='wav',
-                mp3_rate=320,
-                ogg_rate=None,
-                normalize=True,
-                strategy='peak',
-                peak_clip_headroom_db=1,
-                rms_headroom_db=18,
-                loudness_headroom_db=14,
-                loudness_compressor=False,
-                log_clipping=True,
-                make_parent_dir=True,
-                add_suffix=True):
-
-    assert wav.dtype.is_floating_point, "wav is not floating point"
-    if wav.dim() == 1:
-        wav = wav[None]
-    elif wav.dim() > 2:
-        raise ValueError("Input wav should be at most 2 dimension.")
-    assert wav.isfinite().all()
-    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
-                          log_clipping=log_clipping, sample_rate=sample_rate,
-                          stem_name=str(stem_name))
-    return wav
-# ===
 
 class AudioGen(BaseGenModel):
     """AudioGen main model with convenient generation API.
audiocraft/genmodel.py CHANGED
@@ -1,12 +1,10 @@
-from abc import ABC, abstractmethod
 import typing as tp
-
 import omegaconf
 import torch
 
+from abc import ABC, abstractmethod
 from .encodec import CompressionModel
 from .lm import LMModel
-from .utils.audio_utils import convert_audio
 from .conditioners import ConditioningAttributes
 from .utils.autocast import TorchAutocast
 
@@ -16,11 +14,9 @@ class BaseGenModel(ABC):
 
     Args:
         name (str): name of the model.
-        compression_model (CompressionModel): Compression model
-            used to map audio to invertible discrete representations.
-        lm (LMModel): Language model over discrete representations.
-        max_duration (float, optional): maximum duration the model can produce,
-            otherwise, inferred from the training params.
+        compression_model (CompressionModel): EnCodec compression model with a SEANet decoder.
+        lm (LMModel): Language model over discrete representations.
+        max_duration (float, optional): as sampling draws each token from the top-250 logits,
+            we can generate xN different sequences.
     """
     def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
                  max_duration: tp.Optional[float] = None):
@@ -46,19 +42,16 @@ class BaseGenModel(ABC):
 
         self.max_duration: float = max_duration
         self.duration = self.max_duration
-
-        # self.extend_stride is the length of audio extension when generating samples longer
-        # than self.max_duration. NOTE: the derived class must set self.extend_stride to a
-        # positive float value when generating with self.duration > self.max_duration.
-        self.extend_stride: tp.Optional[float] = None
         self.device = next(iter(lm.parameters())).device
-        self.generation_params: dict = {}
-        self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
+        self.generation_params = {}
+        self._progress_callback = None
         if self.device.type == 'cpu':
             self.autocast = TorchAutocast(enabled=False)
         else:
             self.autocast = TorchAutocast(
-                enabled=True, device_type=self.device.type, dtype=torch.float16)
+                enabled=True,
+                device_type=self.device.type,
+                dtype=torch.float16)
 
     @property
     def frame_rate(self) -> float:
@@ -92,64 +85,36 @@ class BaseGenModel(ABC):
     @torch.no_grad()
     def _prepare_tokens_and_attributes(
             self,
-            descriptions: tp.Sequence[tp.Optional[str]],
-            prompt: tp.Optional[torch.Tensor],
-    ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
-        """Prepare model inputs.
-
-        Args:
-            descriptions (list of str): A list of strings used as text conditioning.
-            prompt (torch.Tensor): A batch of waveforms used for continuation.
-        """
+            descriptions,
+            prompt,
+            ):
         attributes = [
-            ConditioningAttributes(text={'description': description})
-            for description in descriptions]
-
-        if prompt is not None:
-            if descriptions is not None:
-                assert len(descriptions) == len(prompt), "Prompt and nb. descriptions doesn't match"
-            prompt = prompt.to(self.device)
-            prompt_tokens, scale = self.compression_model.encode(prompt)
-            assert scale is None
-        else:
-            prompt_tokens = None
+            ConditioningAttributes(text={'description': description}) for description in descriptions]
+        prompt_tokens = None
         return attributes, prompt_tokens
 
-    def generate_unconditional(self, num_samples: int, progress: bool = False,
-                               return_tokens: bool = False) -> tp.Union[torch.Tensor,
-                                                                        tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples in an unconditional manner.
-
-        Args:
-            num_samples (int): Number of samples to be generated.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-        """
+    def generate_unconditional(self,
+                               num_samples,
+                               progress=False,
+                               return_tokens=False):
         descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress)
+        attributes, _ = self._prepare_tokens_and_attributes(descriptions, None)
+        tokens = self._generate_tokens(attributes)
         if return_tokens:
             return self.generate_audio(tokens), tokens
         return self.generate_audio(tokens)
 
     def generate(self, descriptions, progress=False, return_tokens=False):
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        assert prompt_tokens is None
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress)
+        attributes, _ = self._prepare_tokens_and_attributes(descriptions, None)
+        tokens = self._generate_tokens(attributes)
         if return_tokens:
             return self.generate_audio(tokens), tokens
         return self.generate_audio(tokens)
 
-    def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
-                         prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
-        """Generate discrete audio tokens given audio prompt and/or conditions.
-
-        Args:
-            attributes (list of ConditioningAttributes): Conditions used for generation (here text).
-            prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-        Returns:
-            torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
-        """
+    def _generate_tokens(self, attributes,
+                         prompt_tokens=None,
+                         progress=False):
+
         total_gen_len = int(self.duration * self.frame_rate)
         max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
         current_gen_offset: int = 0
@@ -163,49 +128,20 @@ class BaseGenModel(ABC):
             else:
                 print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
 
-        if prompt_tokens is not None:
-            assert max_prompt_len >= prompt_tokens.shape[-1], \
-                "Prompt is longer than audio to generate"
-
         callback = None
         if progress:
             callback = _progress_callback
 
         if self.duration <= self.max_duration:
             # generate by sampling from LM, simple case.
+
             with self.autocast:
-                gen_tokens = self.lm.generate(
-                    prompt_tokens, attributes,
-                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
-
+                gen_tokens = self.lm.generate(conditions=attributes,
+                                              callback=callback,
+                                              max_gen_len=total_gen_len,
+                                              **self.generation_params)
         else:
-            assert self.extend_stride is not None, "Stride should be defined to generate beyond max_duration"
-            assert self.extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
-            all_tokens = []
-            if prompt_tokens is None:
-                prompt_length = 0
-            else:
-                all_tokens.append(prompt_tokens)
-                prompt_length = prompt_tokens.shape[-1]
-
-            stride_tokens = int(self.frame_rate * self.extend_stride)
-            while current_gen_offset + prompt_length < total_gen_len:
-                time_offset = current_gen_offset / self.frame_rate
-                chunk_duration = min(self.duration - time_offset, self.max_duration)
-                max_gen_len = int(chunk_duration * self.frame_rate)
-                with self.autocast:
-                    gen_tokens = self.lm.generate(
-                        prompt_tokens, attributes,
-                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
-                if prompt_tokens is None:
-                    all_tokens.append(gen_tokens)
-                else:
-                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
-                prompt_tokens = gen_tokens[:, :, stride_tokens:]
-                prompt_length = prompt_tokens.shape[-1]
-                current_gen_offset += stride_tokens
-
-            gen_tokens = torch.cat(all_tokens, dim=-1)
+            print('<>Long gen ?<>')
         return gen_tokens
 
     def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
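Since the long-generation striding branch is now a stub, only requests with self.duration <= self.max_duration produce audio. A usage sketch against the simplified API (assumes `model` is an already-loaded AudioGen instance, e.g. built via the loaders imported in audiogen.py):

    # After this commit only self.duration <= self.max_duration generates;
    # the else branch just prints '<>Long gen ?<>' and would then fail at
    # `return gen_tokens`, since gen_tokens is never assigned there.
    model.duration = min(model.duration, model.max_duration)  # stay on the simple path
    wav = model.generate(['dogs barking in the rain'])        # decoded audio, [B, C, T]
    wav, tokens = model.generate(['a siren'], return_tokens=True)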
audiocraft/lm.py CHANGED
@@ -323,7 +323,7 @@ class LMModel(StreamingModule):
     def generate(self,
                  prompt=None,
                  conditions=[],
-                 num_samples=None,
+                 num_samples=1,  # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
                  max_gen_len: int = 256,
                  use_sampling: bool = True,
                  temp: float = 1.0,
@@ -335,30 +335,10 @@ class LMModel(StreamingModule):
                  check: bool = False,
                  callback: tp.Optional[tp.Callable[[int, int], None]] = None,
                  **kwargs) -> torch.Tensor:
-        """Default generation takes random token of top_250 logits
-
-        Args:
-
-        Returns:
-            torch.Tensor: tokens
-        """
-        assert not self.training, "generation shouldn't be used in training mode."
+        print(f'{num_samples=}')
         first_param = next(iter(self.parameters()))
         device = first_param.device
-
-        # Checking all input shapes are consistent.
-        possible_num_samples = []
-        if num_samples is not None:
-            possible_num_samples.append(num_samples)
-        elif prompt is not None:
-            possible_num_samples.append(prompt.shape[0])
-        elif conditions:
-            possible_num_samples.append(len(conditions))
-        else:
-            possible_num_samples.append(1)
-        assert [x == possible_num_samples[0] for x in possible_num_samples], "Inconsistent inputs shapes"
-        num_samples = possible_num_samples[0]
-
+
         # below we create set of conditions: one conditional and one unconditional
         # to do that we merge the regular condition together with the null condition
         # we then do 1 forward pass instead of 2.
@@ -380,6 +360,7 @@ class LMModel(StreamingModule):
         if prompt is None:
             assert num_samples > 0
             prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
+            print('\n\n\n\n DEFAULT PROMPT ZERO \n\n-')
 
         B, K, T = prompt.shape
         start_offset = T
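Note the changed contract: the shape-consistency checks are gone and num_samples now defaults to 1 rather than being inferred from prompt or conditions. In genmodel.py above, _generate_tokens does not pass num_samples at all, so it always falls back to 1 regardless of how many descriptions were given; a caller invoking the LM directly should keep the two in sync itself. A sketch (the surrounding names are assumed from the code above):

    # Keep num_samples consistent with the conditions yourself,
    # since LMModel.generate no longer infers or validates it.
    gen_tokens = lm.generate(conditions=attributes,        # one entry per sample
                             num_samples=len(attributes),  # no longer inferred
                             max_gen_len=int(duration * frame_rate))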
audiocraft/utils/audio_utils.py DELETED
@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Various utilities for audio conversion (pcm format, sample rate and channels),
-and volume normalization."""
-import sys
-import typing as tp
-
-import julius
-import torch
-import torchaudio
-
-
-def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
-    """Convert audio to the given number of channels.
-
-    Args:
-        wav (torch.Tensor): Audio wave of shape [B, C, T].
-        channels (int): Expected number of channels as output.
-    Returns:
-        torch.Tensor: Downmixed or unchanged audio wave [B, C, T].
-    """
-    *shape, src_channels, length = wav.shape
-    if src_channels == channels:
-        pass
-    elif channels == 1:
-        # Case 1:
-        # The caller asked 1-channel audio, and the stream has multiple
-        # channels, downmix all channels.
-        wav = wav.mean(dim=-2, keepdim=True)
-    elif src_channels == 1:
-        # Case 2:
-        # The caller asked for multiple channels, but the input file has
-        # a single channel, replicate the audio over all channels.
-        wav = wav.expand(*shape, channels, length)
-    elif src_channels >= channels:
-        # Case 3:
-        # The caller asked for multiple channels, and the input file has
-        # more channels than requested. In that case return the first channels.
-        wav = wav[..., :channels, :]
-    else:
-        # Case 4: What is a reasonable choice here?
-        raise ValueError('The audio file has less channels than requested but is not mono.')
-    return wav
-
-
-def convert_audio(wav: torch.Tensor, from_rate: float,
-                  to_rate: float, to_channels: int) -> torch.Tensor:
-    """Convert audio to new sample rate and number of audio channels."""
-    wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
-    wav = convert_audio_channels(wav, to_channels)
-    return wav
-
-
-def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
-                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
-    """Normalize an input signal to a user loudness in dB LKFS.
-    Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
-
-    Args:
-        wav (torch.Tensor): Input multichannel audio data.
-        sample_rate (int): Sample rate.
-        loudness_headroom_db (float): Target loudness of the output in dB LUFS.
-        loudness_compressor (bool): Uses tanh for soft clipping.
-        energy_floor (float): anything below that RMS level will not be rescaled.
-    Returns:
-        torch.Tensor: Loudness normalized output data.
-    """
-    energy = wav.pow(2).mean().sqrt().item()
-    if energy < energy_floor:
-        return wav
-    transform = torchaudio.transforms.Loudness(sample_rate)
-    input_loudness_db = transform(wav).item()
-    # calculate the gain needed to scale to the desired loudness level
-    delta_loudness = -loudness_headroom_db - input_loudness_db
-    gain = 10.0 ** (delta_loudness / 20.0)
-    output = gain * wav
-    if loudness_compressor:
-        output = torch.tanh(output)
-    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
-    return output
-
-
-def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
-    """Utility function to clip the audio with logging if specified."""
-    max_scale = wav.abs().max()
-    if log_clipping and max_scale > 1:
-        clamp_prob = (wav.abs() > 1).float().mean().item()
-        print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
-              clamp_prob, "maximum scale: ", max_scale.item(), file=sys.stderr)
-    wav.clamp_(-1, 1)
-
-
-def normalize_audio(wav: torch.Tensor, normalize: bool = True,
-                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                    loudness_compressor: bool = False, log_clipping: bool = False,
-                    sample_rate: tp.Optional[int] = None,
-                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
-    """Normalize the audio according to the prescribed strategy (see after).
-
-    Args:
-        wav (torch.Tensor): Audio data.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): If True, uses tanh based soft clipping.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-        sample_rate (int): Sample rate for the audio data (required for loudness).
-        stem_name (str, optional): Stem name for clipping logging.
-    Returns:
-        torch.Tensor: Normalized audio.
-    """
-    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
-    scale_rms = 10 ** (-rms_headroom_db / 20)
-    if strategy == 'peak':
-        rescaling = (scale_peak / wav.abs().max())
-        if normalize or rescaling < 1:
-            wav = wav * rescaling
-    elif strategy == 'clip':
-        wav = wav.clamp(-scale_peak, scale_peak)
-    elif strategy == 'rms':
-        mono = wav.mean(dim=0)
-        rescaling = scale_rms / mono.pow(2).mean().sqrt()
-        if normalize or rescaling < 1:
-            wav = wav * rescaling
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    elif strategy == 'loudness':
-        assert sample_rate is not None, "Loudness normalization requires sample rate."
-        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    else:
-        assert wav.abs().max() < 1
-        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
-    return wav
-
-
-def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to float 32 bits PCM format.
-    """
-    if wav.dtype.is_floating_point:
-        return wav
-    elif wav.dtype == torch.int16:
-        return wav.float() / 2**15
-    elif wav.dtype == torch.int32:
-        return wav.float() / 2**31
-    raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
-
-
-def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to int 16 bits PCM format.
-
-    ..Warning:: There exist many formulas for doing this conversion. None are perfect
-    due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
-    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
-    it is possible that `i16_pcm(f32_pcm(wav)) != wav`.
-    """
-    if wav.dtype.is_floating_point:
-        assert wav.abs().max() <= 1
-        candidate = (wav * 2 ** 15).round()
-        if candidate.max() >= 2 ** 15:  # clipping would occur
-            candidate = (wav * (2 ** 15 - 1)).round()
-        return candidate.short()
-    else:
-        assert wav.dtype == torch.int16
-        return wav
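Deleting audio_utils.py takes the resampling, loudness, and PCM helpers with it. If loudness normalization is still needed downstream, the deleted normalize_loudness reduces to a gain computed from the measured ITU-R BS.1770-4 loudness; a sketch of that core, keeping the deleted defaults:

    import torch
    import torchaudio

    def loudness_gain(wav: torch.Tensor, sample_rate: int,
                      loudness_headroom_db: float = 14.0) -> torch.Tensor:
        """Core of the deleted normalize_loudness(): rescale to -headroom dB LUFS."""
        input_db = torchaudio.transforms.Loudness(sample_rate)(wav).item()
        delta = -loudness_headroom_db - input_db   # dB to move by
        return wav * (10.0 ** (delta / 20.0))      # dB -> linear gain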
audiocraft/utils/samples/__init__.py DELETED
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
audiocraft/utils/samples/manager.py DELETED
@@ -1,386 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-API that can manage the storage and retrieval of generated samples produced by experiments.
-
-It offers the following benefits:
-* Samples are stored in a consistent way across epochs
-* Metadata about the samples can be stored and retrieved
-* Can retrieve audio
-* Identifiers are reliable and deterministic for prompted and conditioned samples
-* Can request the samples for multiple XPs, grouped by sample identifier
-* For no-input samples (no prompt and no conditions), samples across XPs are matched
-  by sorting their identifiers
-"""
-
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import asdict, dataclass
-from functools import lru_cache
-import hashlib
-import json
-import logging
-from pathlib import Path
-import re
-import typing as tp
-import unicodedata
-import uuid
-
-import dora
-import torch
-
-from ...data.audio import audio_read, audio_write
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ReferenceSample:
-    id: str
-    path: str
-    duration: float
-
-
-@dataclass
-class Sample:
-    id: str
-    path: str
-    epoch: int
-    duration: float
-    conditioning: tp.Optional[tp.Dict[str, tp.Any]]
-    prompt: tp.Optional[ReferenceSample]
-    reference: tp.Optional[ReferenceSample]
-    generation_args: tp.Optional[tp.Dict[str, tp.Any]]
-
-    def __hash__(self):
-        return hash(self.id)
-
-    def audio(self) -> tp.Tuple[torch.Tensor, int]:
-        return audio_read(self.path)
-
-    def audio_prompt(self) -> tp.Optional[tp.Tuple[torch.Tensor, int]]:
-        return audio_read(self.prompt.path) if self.prompt is not None else None
-
-    def audio_reference(self) -> tp.Optional[tp.Tuple[torch.Tensor, int]]:
-        return audio_read(self.reference.path) if self.reference is not None else None
-
-
-class SampleManager:
-    """Audio samples IO handling within a given dora xp.
-
-    The sample manager handles the dumping and loading logic for generated and
-    reference samples across epochs for a given xp, providing a simple API to
-    store, retrieve and compare audio samples.
-
-    Args:
-        xp (dora.XP): Dora experiment object. The XP contains information on the XP folder
-            where all outputs are stored and the configuration of the experiment,
-            which is useful to retrieve audio-related parameters.
-        map_reference_to_sample_id (bool): Whether to use the sample_id for all reference samples
-            instead of generating a dedicated hash id. This is useful to allow easier comparison
-            with ground truth samples from the files directly without having to read the JSON metadata
-            to do the mapping (at the cost of potentially dumping duplicate prompts/references
-            depending on the task).
-    """
-    def __init__(self, xp: dora.XP, map_reference_to_sample_id: bool = False):
-        self.xp = xp
-        self.base_folder: Path = xp.folder / xp.cfg.generate.path
-        self.reference_folder = self.base_folder / 'reference'
-        self.map_reference_to_sample_id = map_reference_to_sample_id
-        self.samples: tp.List[Sample] = []
-        self._load_samples()
-
-    @property
-    def latest_epoch(self):
-        """Latest epoch across all samples."""
-        return max(self.samples, key=lambda x: x.epoch).epoch if self.samples else 0
-
-    def _load_samples(self):
-        """Scan the sample folder and load existing samples."""
-        jsons = self.base_folder.glob('**/*.json')
-        with ThreadPoolExecutor(6) as pool:
-            self.samples = list(pool.map(self._load_sample, jsons))
-
-    @staticmethod
-    @lru_cache(2**26)
-    def _load_sample(json_file: Path) -> Sample:
-        with open(json_file, 'r') as f:
-            data: tp.Dict[str, tp.Any] = json.load(f)
-        # fetch prompt data
-        prompt_data = data.get('prompt')
-        prompt = ReferenceSample(id=prompt_data['id'], path=prompt_data['path'],
-                                 duration=prompt_data['duration']) if prompt_data else None
-        # fetch reference data
-        reference_data = data.get('reference')
-        reference = ReferenceSample(id=reference_data['id'], path=reference_data['path'],
-                                    duration=reference_data['duration']) if reference_data else None
-        # build sample object
-        return Sample(id=data['id'], path=data['path'], epoch=data['epoch'], duration=data['duration'],
-                      prompt=prompt, conditioning=data.get('conditioning'), reference=reference,
-                      generation_args=data.get('generation_args'))
-
-    def _init_hash(self):
-        return hashlib.sha1()
-
-    def _get_tensor_id(self, tensor: torch.Tensor) -> str:
-        hash_id = self._init_hash()
-        hash_id.update(tensor.numpy().data)
-        return hash_id.hexdigest()
-
-    def _get_sample_id(self, index: int, prompt_wav: tp.Optional[torch.Tensor],
-                       conditions: tp.Optional[tp.Dict[str, str]]) -> str:
-        """Computes an id for a sample given its input data.
-        This id is deterministic if prompt and/or conditions are provided by using a sha1 hash on the input.
-        Otherwise, a random id of the form "noinput_{uuid4().hex}" is returned.
-
-        Args:
-            index (int): Batch index, helpful to differentiate samples from the same batch.
-            prompt_wav (torch.Tensor): Prompt used during generation.
-            conditions (dict[str, str]): Conditioning used during generation.
-        """
-        # For totally unconditioned generations we will just use a random UUID.
-        # The function get_samples_for_xps will do a simple ordered match with a custom key.
-        if prompt_wav is None and not conditions:
-            return f"noinput_{uuid.uuid4().hex}"
-
-        # Human readable portion
-        hr_label = ""
-        # Create a deterministic id using hashing
-        hash_id = self._init_hash()
-        hash_id.update(f"{index}".encode())
-        if prompt_wav is not None:
-            hash_id.update(prompt_wav.numpy().data)
-            hr_label += "_prompted"
-        else:
-            hr_label += "_unprompted"
-        if conditions:
-            encoded_json = json.dumps(conditions, sort_keys=True).encode()
-            hash_id.update(encoded_json)
-            cond_str = "-".join([f"{key}={slugify(value)}"
-                                 for key, value in sorted(conditions.items())])
-            cond_str = cond_str[:100]  # some raw text might be too long to be a valid filename
-            cond_str = cond_str if len(cond_str) > 0 else "unconditioned"
-            hr_label += f"_{cond_str}"
-        else:
-            hr_label += "_unconditioned"
-
-        return hash_id.hexdigest() + hr_label
-
-    def _store_audio(self, wav: torch.Tensor, stem_path: Path, overwrite: bool = False) -> Path:
-        """Stores the audio with the given stem path using the XP's configuration.
-
-        Args:
-            wav (torch.Tensor): Audio to store.
-            stem_path (Path): Path in sample output directory with file stem to use.
-            overwrite (bool): When False (default), skips storing an existing audio file.
-        Returns:
-            Path: The path at which the audio is stored.
-        """
-        existing_paths = [
-            path for path in stem_path.parent.glob(stem_path.stem + '.*')
-            if path.suffix != '.json'
-        ]
-        exists = len(existing_paths) > 0
-        if exists and overwrite:
-            logger.warning(f"Overwriting existing audio file with stem path {stem_path}")
-        elif exists:
-            return existing_paths[0]
-
-        audio_path = audio_write(stem_path, wav, **self.xp.cfg.generate.audio)
-        return audio_path
-
-    def add_sample(self, sample_wav: torch.Tensor, epoch: int, index: int = 0,
-                   conditions: tp.Optional[tp.Dict[str, str]] = None, prompt_wav: tp.Optional[torch.Tensor] = None,
-                   ground_truth_wav: tp.Optional[torch.Tensor] = None,
-                   generation_args: tp.Optional[tp.Dict[str, tp.Any]] = None) -> Sample:
-        """Adds a single sample.
-        The sample is stored in the XP's sample output directory, under a corresponding epoch folder.
-        Each sample is assigned an id which is computed using the input data. In addition to the
-        sample itself, a json file containing associated metadata is stored next to it.
-
-        Args:
-            sample_wav (torch.Tensor): sample audio to store. Tensor of shape [channels, shape].
-            epoch (int): current training epoch.
-            index (int): helpful to differentiate samples from the same batch.
-            conditions (dict[str, str], optional): conditioning used during generation.
-            prompt_wav (torch.Tensor, optional): prompt used during generation. Tensor of shape [channels, shape].
-            ground_truth_wav (torch.Tensor, optional): reference audio where prompt was extracted from.
-                Tensor of shape [channels, shape].
-            generation_args (dict[str, any], optional): dictionary of other arguments used during generation.
-        Returns:
-            Sample: The saved sample.
-        """
-        sample_id = self._get_sample_id(index, prompt_wav, conditions)
-        reuse_id = self.map_reference_to_sample_id
-        prompt, ground_truth = None, None
-        if prompt_wav is not None:
-            prompt_id = sample_id if reuse_id else self._get_tensor_id(prompt_wav.sum(0, keepdim=True))
-            prompt_duration = prompt_wav.shape[-1] / self.xp.cfg.sample_rate
-            prompt_path = self._store_audio(prompt_wav, self.base_folder / str(epoch) / 'prompt' / prompt_id)
-            prompt = ReferenceSample(prompt_id, str(prompt_path), prompt_duration)
-        if ground_truth_wav is not None:
-            ground_truth_id = sample_id if reuse_id else self._get_tensor_id(ground_truth_wav.sum(0, keepdim=True))
-            ground_truth_duration = ground_truth_wav.shape[-1] / self.xp.cfg.sample_rate
-            ground_truth_path = self._store_audio(ground_truth_wav, self.base_folder / 'reference' / ground_truth_id)
-            ground_truth = ReferenceSample(ground_truth_id, str(ground_truth_path), ground_truth_duration)
-        sample_path = self._store_audio(sample_wav, self.base_folder / str(epoch) / sample_id, overwrite=True)
-        duration = sample_wav.shape[-1] / self.xp.cfg.sample_rate
-        sample = Sample(sample_id, str(sample_path), epoch, duration, conditions, prompt, ground_truth, generation_args)
-        self.samples.append(sample)
-        with open(sample_path.with_suffix('.json'), 'w') as f:
-            json.dump(asdict(sample), f, indent=2)
-        return sample
-
-    def add_samples(self, samples_wavs: torch.Tensor, epoch: int,
-                    conditioning: tp.Optional[tp.List[tp.Dict[str, tp.Any]]] = None,
-                    prompt_wavs: tp.Optional[torch.Tensor] = None,
-                    ground_truth_wavs: tp.Optional[torch.Tensor] = None,
-                    generation_args: tp.Optional[tp.Dict[str, tp.Any]] = None) -> tp.List[Sample]:
-        """Adds a batch of samples.
-        The samples are stored in the XP's sample output directory, under a corresponding
-        epoch folder. Each sample is assigned an id which is computed using the input data and their batch index.
-        In addition to the sample itself, a json file containing associated metadata is stored next to it.
-
-        Args:
-            samples_wavs (torch.Tensor): Batch of audio wavs to store. Tensor of shape [batch_size, channels, shape].
-            epoch (int): Current training epoch.
-            conditioning (list of dict[str, str], optional): List of conditions used during generation,
-                one per sample in the batch.
-            prompt_wavs (torch.Tensor, optional): Prompts used during generation. Tensor of shape
-                [batch_size, channels, shape].
-            ground_truth_wavs (torch.Tensor, optional): Reference audio where prompts were extracted from.
-                Tensor of shape [batch_size, channels, shape].
-            generation_args (dict[str, Any], optional): Dictionary of other arguments used during generation.
-        Returns:
-            samples (list of Sample): The saved audio samples with prompts, ground truth and metadata.
-        """
-        samples = []
-        for idx, wav in enumerate(samples_wavs):
-            prompt_wav = prompt_wavs[idx] if prompt_wavs is not None else None
-            gt_wav = ground_truth_wavs[idx] if ground_truth_wavs is not None else None
-            conditions = conditioning[idx] if conditioning is not None else None
-            samples.append(self.add_sample(wav, epoch, idx, conditions, prompt_wav, gt_wav, generation_args))
-        return samples
-
-    def get_samples(self, epoch: int = -1, max_epoch: int = -1, exclude_prompted: bool = False,
-                    exclude_unprompted: bool = False, exclude_conditioned: bool = False,
-                    exclude_unconditioned: bool = False) -> tp.Set[Sample]:
-        """Returns a set of samples for this XP. Optionally, you can filter which samples to obtain.
-        Please note that existing samples are loaded during the manager's initialization, and samples
-        added through this manager are also tracked. Any other external changes are not tracked
-        automatically, so creating a new manager is the only way to detect them.
-
-        Args:
-            epoch (int): If provided, only return samples corresponding to this epoch.
-            max_epoch (int): If provided, only return samples corresponding to the latest epoch that is <= max_epoch.
-            exclude_prompted (bool): If True, does not include samples that used a prompt.
-            exclude_unprompted (bool): If True, does not include samples that did not use a prompt.
-            exclude_conditioned (bool): If True, excludes samples that used conditioning.
-            exclude_unconditioned (bool): If True, excludes samples that did not use conditioning.
-        Returns:
-            Samples (set of Sample): The retrieved samples matching the provided filters.
-        """
-        if max_epoch >= 0:
-            samples_epoch = max(sample.epoch for sample in self.samples if sample.epoch <= max_epoch)
-        else:
-            samples_epoch = self.latest_epoch if epoch < 0 else epoch
-        samples = {
-            sample
-            for sample in self.samples
-            if (
-                (sample.epoch == samples_epoch) and
-                (not exclude_prompted or sample.prompt is None) and
-                (not exclude_unprompted or sample.prompt is not None) and
-                (not exclude_conditioned or not sample.conditioning) and
-                (not exclude_unconditioned or sample.conditioning)
-            )
-        }
-        return samples
-
-
-def slugify(value: tp.Any, allow_unicode: bool = False):
-    """Process string for safer file naming.
-
-    Taken from https://github.com/django/django/blob/master/django/utils/text.py
-
-    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
-    dashes to single dashes. Remove characters that aren't alphanumerics,
-    underscores, or hyphens. Convert to lowercase. Also strip leading and
-    trailing whitespace, dashes, and underscores.
-    """
-    value = str(value)
-    if allow_unicode:
-        value = unicodedata.normalize("NFKC", value)
-    else:
-        value = (
-            unicodedata.normalize("NFKD", value)
-            .encode("ascii", "ignore")
-            .decode("ascii")
-        )
-    value = re.sub(r"[^\w\s-]", "", value.lower())
-    return re.sub(r"[-\s]+", "-", value).strip("-_")
-
-
-def _match_stable_samples(samples_per_xp: tp.List[tp.Set[Sample]]) -> tp.Dict[str, tp.List[Sample]]:
-    # Create a dictionary of stable id -> sample per XP
-    stable_samples_per_xp = [{
-        sample.id: sample for sample in samples
-        if sample.prompt is not None or sample.conditioning
-    } for samples in samples_per_xp]
-    # Set of all stable ids
-    stable_ids = {id for samples in stable_samples_per_xp for id in samples.keys()}
-    # Dictionary of stable id -> list of samples. If an XP does not have it, assign None
-    stable_samples = {id: [xp.get(id) for xp in stable_samples_per_xp] for id in stable_ids}
-    # Filter out ids that contain None values (we only want matched samples after all)
-    # cast is necessary to avoid mypy linter errors.
-    return {id: tp.cast(tp.List[Sample], samples) for id, samples in stable_samples.items() if None not in samples}
-
-
-def _match_unstable_samples(samples_per_xp: tp.List[tp.Set[Sample]]) -> tp.Dict[str, tp.List[Sample]]:
-    # For unstable ids, we use a sorted list since we'll match them in order
-    unstable_samples_per_xp = [[
-        sample for sample in sorted(samples, key=lambda x: x.id)
-        if sample.prompt is None and not sample.conditioning
-    ] for samples in samples_per_xp]
-    # Trim samples per xp so all samples can have a match
-    min_len = min([len(samples) for samples in unstable_samples_per_xp])
-    unstable_samples_per_xp = [samples[:min_len] for samples in unstable_samples_per_xp]
-    # Dictionary of index -> list of matched samples
-    return {
-        f'noinput_{i}': [samples[i] for samples in unstable_samples_per_xp] for i in range(min_len)
-    }
-
-
-def get_samples_for_xps(xps: tp.List[dora.XP], **kwargs) -> tp.Dict[str, tp.List[Sample]]:
-    """Gets a dictionary of matched samples across the given XPs.
-    Each dictionary entry maps a sample id to a list of samples for that id. The number of samples per id
-    will always match the number of XPs provided and will correspond to each XP in the same order given.
-    In other words, only samples that can be matched across all provided XPs will be returned
-    in order to satisfy this rule.
-
-    There are two types of ids that can be returned: stable and unstable.
-    * Stable IDs are deterministic ids that were computed by the SampleManager given a sample's inputs
-      (prompts/conditioning). This is why we can match them across XPs.
-    * Unstable IDs are of the form "noinput_{idx}" and are generated on-the-fly, in order to map samples
-      that used non-deterministic, random ids. This is the case for samples that did not use prompts or
-      conditioning for their generation. This function will sort these samples by their id and match them
-      by their index.
-
-    Args:
-        xps: a list of XPs to match samples from.
-        start_epoch (int): If provided, only return samples corresponding to this epoch or newer.
-        end_epoch (int): If provided, only return samples corresponding to this epoch or older.
-        exclude_prompted (bool): If True, does not include samples that used a prompt.
-        exclude_unprompted (bool): If True, does not include samples that did not use a prompt.
-        exclude_conditioned (bool): If True, excludes samples that used conditioning.
-        exclude_unconditioned (bool): If True, excludes samples that did not use conditioning.
-    """
-    managers = [SampleManager(xp) for xp in xps]
-    samples_per_xp = [manager.get_samples(**kwargs) for manager in managers]
-    stable_samples = _match_stable_samples(samples_per_xp)
-    unstable_samples = _match_unstable_samples(samples_per_xp)
-    return dict(stable_samples, **unstable_samples)
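The deleted SampleManager derived deterministic sample ids by sha1-hashing the batch index, the raw prompt tensor bytes, and the sorted conditioning JSON. If similar bookkeeping is needed without it, the core is small; a sketch distilled from the deleted _get_sample_id (dropping the human-readable suffix):

    import hashlib
    import json
    import uuid

    def sample_id(index, prompt_wav=None, conditions=None) -> str:
        """Deterministic id for prompted/conditioned samples, random otherwise."""
        if prompt_wav is None and not conditions:
            return f"noinput_{uuid.uuid4().hex}"
        h = hashlib.sha1()
        h.update(f"{index}".encode())
        if prompt_wav is not None:
            h.update(prompt_wav.numpy().data)  # hash the raw tensor bytes
        if conditions:
            h.update(json.dumps(conditions, sort_keys=True).encode())
        return h.hexdigest()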