Dionyssos committed
Commit d912185 · 1 Parent(s): 60fbcf9

instantiate audiogen in demo

audiocraft/audiogen.py CHANGED
@@ -1,9 +1,3 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
 import typing as tp
 import torch
 from audiocraft.loaders import load_compression_model, load_lm_model
@@ -11,7 +5,6 @@ import typing as tp
 import omegaconf
 import torch
 import numpy as np
-from abc import ABC, abstractmethod
 from .lm import LMModel
 from .conditioners import ConditioningAttributes
 from .utils.autocast import TorchAutocast
@@ -25,7 +18,7 @@ def _shift(x):
     return x
 
 
-class BaseGenModel(ABC):
+class AudioGen():
     """Base generative model with convenient generation API.
 
     Args:
@@ -90,21 +83,13 @@ class BaseGenModel(ABC):
         attributes = [
             ConditioningAttributes(text={'description': d}) for d in descriptions]
         tokens = self._generate_tokens(attributes)
+        print(f'\n{tokens.shape=}\n{tokens=} FINAL 5 AUD')
         return self.generate_audio(tokens)
 
     def _generate_tokens(self, attributes):
 
         total_gen_len = int(self.duration * self.frame_rate)
 
-
-        # # print(f'{self.generation_params=}')
-        # self.generation_params={'use_sampling': True,
-        #                         'temp': 1.0, 'top_k': 250,
-        #                         'top_p': 0.0, 'cfg_coef': 2.4, 'two_step_cfg': False}
-
-
-
-
         if self.duration <= self.max_duration:
             # generate by sampling from LM, simple case.
 
@@ -127,66 +112,18 @@ class BaseGenModel(ABC):
         gen_audio = self.compression_model.decode(gen_tokens, None)
         return gen_audio
 
-
-
-
-
-
-class AudioGen(BaseGenModel):
 
-    def __init__(self, name, compression_model, lm, max_duration=None):
-        # print(f'Using {compression_model=}\n-----=-----')
-        super().__init__(name, compression_model, lm, max_duration)
-        self.set_generation_params(duration=5)  # default duration
-
-    @staticmethod
-    def get_pretrained(name: str = 'facebook/audiogen-medium', device=None):
-        """Return pretrained model, we provide a single model for now:
-        - facebook/audiogen-medium (1.5B), text to sound,
-        # see: https://huggingface.co/facebook/audiogen-medium
-        """
-        if device is None:
-            if torch.cuda.device_count():
-                device = 'cuda'
-            else:
-                device = 'cpu'
 
-
 
-        compression_model = load_compression_model(name, device=device)
-        lm = load_lm_model(name, device=device)
-        assert 'self_wav' not in lm.condition_provider.conditioners, \
-            "AudioGen do not support waveform conditioning for now"
-        return AudioGen(name, compression_model, lm)
-
-    def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
-                              top_p: float = 0.0, temperature: float = 1.0,
-                              duration: float = 10.0, cfg_coef: float = 2.4,
-                              two_step_cfg: bool = False, extend_stride: float = 2):
-        """Set the generation parameters for AudioGen.
-
-        Args:
-            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
-            top_k (int, optional): top_k used for sampling. Defaults to 250.
-            top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
-            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
-            duration (float, optional): Duration of the generated waveform. Defaults to 10.0.
-            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
-            two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
-                instead of batching together the two. This has some impact on how things
-                are padded but seems to have little impact in practice.
-            extend_stride: when doing extended generation (i.e. more than 10 seconds), by how much
-                should we extend the audio each time. Larger values will mean less context is
-                preserved, and shorter value will require extra computations.
-        """
-        assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
-        self.extend_stride = extend_stride
-        self.duration = duration
-        self.generation_params = {
-            'use_sampling': use_sampling,
-            'temp': temperature,
-            'top_k': top_k,
-            'top_p': top_p,
-            'cfg_coef': cfg_coef,
-            'two_step_cfg': two_step_cfg,
-        }
+def get_pretrained(name='facebook/audiogen-medium',
+                   device=None):
+    """Return pretrained model, we provide a single model for now:
+    - facebook/audiogen-medium (1.5B), text to sound,
+    # see: https://huggingface.co/facebook/audiogen-medium
+    """
+    compression_model = load_compression_model(name, device=device)
+    lm = load_lm_model(name, device=device)
+    assert 'self_wav' not in lm.condition_provider.conditioners, \
+        "AudioGen do not support waveform conditioning for now"
+    return AudioGen(name, compression_model, lm)
+
audiocraft/builders.py CHANGED
@@ -12,16 +12,18 @@ from .lm import LMModel
 from .seanet import SEANetDecoder
 from .codebooks_patterns import DelayedPatternProvider
 from .conditioners import (
-    BaseConditioner,
     ConditionFuser,
     ConditioningProvider,
     T5Conditioner,
 )
 from .unet import DiffusionUnet
 from .vq import ResidualVectorQuantizer
-from .utils.utils import dict_from_config
+
 from .diffusion_schedule import MultiBandProcessor, SampleProcessor
 
+def dict_from_config(cfg):
+    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
+    return dct
 
 def get_quantizer(quantizer, cfg, dimension):
     klass = {
@@ -118,7 +120,7 @@ def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> Cond
     duration = cfg.dataset.segment_duration
     cfg = getattr(cfg, 'conditioners')
     dict_cfg = {} if cfg is None else dict_from_config(cfg)
-    conditioners: tp.Dict[str, BaseConditioner] = {}
+    conditioners: tp.Dict[str, T5Conditioner] = {}
     condition_provider_args = dict_cfg.pop('args', {})
     condition_provider_args.pop('merge_text_conditions_p', None)
     condition_provider_args.pop('drop_desc_p', None)
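
Side note (not part of the commit): the inlined dict_from_config mirrors the helper removed from audiocraft/utils/utils.py, minus its isinstance assertion. A minimal sketch of the expected behaviour, assuming the diff applies as shown:

    import omegaconf
    from audiocraft.builders import dict_from_config

    cfg = omegaconf.OmegaConf.create({'name': 't5-large', 'dim': 1024})
    # OmegaConf.to_container with resolve=True returns a plain dict with interpolations resolved
    assert dict_from_config(cfg) == {'name': 't5-large', 'dim': 1024}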
audiocraft/conditioners.py CHANGED
@@ -8,12 +8,6 @@ import soundfile
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 from torch import nn
-
-
-from .utils.autocast import TorchAutocast
-
-
-
 logger = logging.getLogger(__name__)
 TextCondition = tp.Optional[str]  # a text condition can be a string or None (if doesn't exist)
 ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask
@@ -21,7 +15,6 @@ ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask
 
 
 
-
 class JointEmbedCondition(tp.NamedTuple):
     wav: torch.Tensor
     text: tp.List[tp.Optional[str]]
@@ -76,18 +69,6 @@ class ConditioningAttributes:
         return out
 
 
-
-
-
-
-
-
-
-
-
-
-
-
 class Tokenizer:
     """Base tokenizer implementation
     (in case we want to introduce more advances tokenizers in the future).
@@ -95,59 +76,10 @@ class Tokenizer:
     def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError()
 
-
-
-
-
-
-
-
-class BaseConditioner(nn.Module):
-    """Base model for all conditioner modules.
-    We allow the output dim to be different than the hidden dim for two reasons:
-    1) keep our LUTs small when the vocab is large;
-    2) make all condition dims consistent.
-
-    Args:
-        dim (int): Hidden dim of the model.
-        output_dim (int): Output dim of the conditioner.
-    """
-    def __init__(self, dim: int, output_dim: int):
-        super().__init__()
-        self.dim = dim
-        self.output_dim = output_dim
-        self.output_proj = nn.Linear(dim, output_dim)
-
-    def tokenize(self, *args, **kwargs) -> tp.Any:
-        """Should be any part of the processing that will lead to a synchronization
-        point, e.g. BPE tokenization with transfer to the GPU.
-
-        The returned value will be saved and return later when calling forward().
-        """
-        raise NotImplementedError()
-
 
 
-class TextConditioner(BaseConditioner):
-    ...
-
-
-
-
-
-class T5Conditioner(TextConditioner):
-    """T5-based TextConditioner.
-
-    Args:
-        name (str): Name of the T5 model.
-        output_dim (int): Output dim of the conditioner.
-        finetune (bool): Whether to fine-tune T5 at train time.
-        device (str): Device for T5 Conditioner.
-        autocast_dtype (tp.Optional[str], optional): Autocast dtype.
-        word_dropout (float, optional): Word dropout probability.
-        normalize_text (bool, optional): Whether to apply text normalization.
-    """
+class T5Conditioner(nn.Module):
+
     MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
               "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
               "google/flan-t5-xl", "google/flan-t5-xxl"]
@@ -164,24 +96,23 @@ class T5Conditioner(TextConditioner):
        "google/flan-t5-11b": 1024,
    }
 
-    def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
-                 autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
-                 normalize_text: bool = False):
+    def __init__(self,
+                 name: str,
+                 output_dim: int,
+                 device: str,
+                 word_dropout: float = 0.,
+                 normalize_text: bool = False,
+                 finetune=False):
+        print(f'{finetune=}')
         assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
-        super().__init__(self.MODELS_DIMS[name], output_dim)
+        super().__init__()
+        self.dim = self.MODELS_DIMS[name]
+        self.output_dim = output_dim
+        self.output_proj = nn.Linear(self.dim, output_dim)
         self.device = device
         self.name = name
-        self.finetune = finetune
         self.word_dropout = word_dropout
-        if autocast_dtype is None or self.device == 'cpu':
-            self.autocast = TorchAutocast(enabled=False)
-            if self.device != 'cpu':
-                logger.warning("T5 has no autocast, this might lead to NaN")
-        else:
-            dtype = getattr(torch, autocast_dtype)
-            assert isinstance(dtype, torch.dtype)
-            logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
-            self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
+
         # Let's disable logging temporarily because T5 will vomit some errors otherwise.
         # thanks https://gist.github.com/simon-weber/7853144
         previous_level = logging.root.manager.disable
@@ -190,7 +121,7 @@ class T5Conditioner(TextConditioner):
             warnings.simplefilter("ignore")
             try:
                 self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
-                t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
+                t5 = T5EncoderModel.from_pretrained(name).eval()  # .train(mode=finetune)
             finally:
                 logging.disable(previous_level)
             if finetune:
@@ -223,15 +154,15 @@ class T5Conditioner(TextConditioner):
         mask[empty_idx, :] = 0  # zero-out index where the input is non-existant
         return inputs
 
-    def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
+    def forward(self, inputs):
         mask = inputs['attention_mask']
-        with torch.set_grad_enabled(self.finetune), self.autocast:
+        with torch.no_grad():
             embeds = self.t5(**inputs).last_hidden_state
         embeds = self.output_proj(embeds.to(self.output_proj.weight))
         embeds = (embeds * mask.unsqueeze(-1))
 
         # T5 torch.Size([2, 4, 1536]) dict_keys(['input_ids', 'attention_mask'])
-        # print(f'{inputs["input_ids"].shape=}')  # inputs["input_ids"].shape=torch.Size([2, 4])
+        print(f'{embeds.dtype=}')  # inputs["input_ids"].shape=torch.Size([2, 4])
         return embeds, mask
 
 
@@ -248,7 +179,9 @@ class ConditioningProvider(nn.Module):
         conditioners (dict): Dictionary of conditioners.
         device (torch.device or str, optional): Device for conditioners and output condition types.
     """
-    def __init__(self, conditioners: tp.Dict[str, BaseConditioner], device: tp.Union[torch.device, str] = "cpu"):
+    def __init__(self,
+                 conditioners,
+                 device="cpu"):
         super().__init__()
         self.device = device
         self.conditioners = nn.ModuleDict(conditioners)
@@ -263,7 +196,7 @@ class ConditioningProvider(nn.Module):
 
     @property
     def text_conditions(self):
-        return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
+        return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)]
 
 
 
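For orientation (not part of the commit): after this change T5Conditioner always runs the frozen T5 encoder under torch.no_grad() and projects it to output_dim. A minimal sketch of the call contract, assuming tokenize() (unchanged and only partially shown above) still returns the usual input_ids/attention_mask dict:

    from audiocraft.conditioners import T5Conditioner

    cond = T5Conditioner(name='t5-base', output_dim=1536, device='cpu')
    inputs = cond.tokenize(['dogs barking in the street', ''])  # dict with 'input_ids', 'attention_mask'
    embeds, mask = cond(inputs)                                 # embeds: [B, T, 1536], mask: [B, T]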
audiocraft/lm.py CHANGED
@@ -10,9 +10,35 @@ from audiocraft.transformer import StreamingTransformer, create_norm_fn
 from dataclasses import dataclass
 from functools import partial
 from torch import nn
-from audiocraft.utils import utils
 from audiocraft.activations import get_activation_fn
 
+def sample_top_k(p, k=250, n_draw=None):
+    """
+    p probabs 2048 ?
+    num_draw : how many tokens to sample (for duplicate elongation)
+    """
+
+    p = torch.softmax(p, dim=-1)  # p/temp
+
+
+
+    top_k_value, i250 = torch.topk(p, k, dim=-1)  # probs: [1, 4, 2048]
+    # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
+    min_value_top_k = top_k_value[..., [-1]]  #
+    p *= (p >= min_value_top_k).float()
+    p.div_(p.sum(dim=-1, keepdim=True))
+    # -- next_token = multinomial(probs, num_samples=num_draw)
+
+    # RESHAPED into bs, 4, 250
+    p_ = p.reshape(-1, p.shape[-1])
+
+
+    out = torch.multinomial(p_,
+                            num_samples=n_draw,
+                            replacement=False)  # [4, num_draw]
+    return out.transpose(0, 1)[:, :, None]  # [num_draw, 4, 1]
+
+
 
 # ============================================== From LM.py
 
@@ -147,7 +173,7 @@ class LMModel(nn.Module):
         super().__init__()
         self.cfg_coef = cfg_coef
 
-        self.n_draw = 2
+        self.n_draw = 1
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -332,7 +358,7 @@ class LMModel(nn.Module):
                                  token_count=offset)
 
            # print(f'BEF {logits.shape=} BEF utils.SampleTop5')  # AGREES 4 BEF logits.shape=torch.Size([1, 4, 1, 2048]) BEF utils.SampleTop5
-            next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
+            next_token = sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
 
 
 
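For orientation (not part of the commit): per the debug comment above, logits enter the sampler with shape [1, 4, 1, 2048] and the new module-level sample_top_k returns [n_draw, 4, 1]. A minimal shape check, assuming the diff applies as shown:

    import torch
    from audiocraft.lm import sample_top_k

    logits = torch.randn(1, 4, 1, 2048)                 # [batch, n_codebooks, 1, card]
    next_token = sample_top_k(logits, k=250, n_draw=2)
    print(next_token.shape)                             # torch.Size([2, 4, 1]) -> [n_draw, n_codebooks, 1]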
audiocraft/loaders.py CHANGED
@@ -120,32 +120,7 @@ def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu',
     return model
 
 
-def load_lm_model_magnet(file_or_url_or_id: tp.Union[Path, str], compression_model_frame_rate: int,
-                         device='cpu', cache_dir: tp.Optional[str] = None):
-    pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-    _delete_param(cfg, 'conditioners.args.drop_desc_p')
-
-    cfg.transformer_lm.compression_model_framerate = compression_model_frame_rate
-    cfg.transformer_lm.segment_duration = cfg.dataset.segment_duration
-    cfg.transformer_lm.span_len = cfg.masking.span_len
 
-    # MAGNeT models v1 support only xformers backend.
-    from .transformer import set_efficient_attention_backend
-    if cfg.transformer_lm.memory_efficient:
-        set_efficient_attention_backend("xformers")
-
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
 
 
 def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
audiocraft/utils/__init__.py DELETED
@@ -1,6 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
- """Utilities."""
 
 
 
 
 
 
 
audiocraft/utils/autocast.py DELETED
@@ -1,40 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- import torch
8
-
9
-
10
- class TorchAutocast:
11
- """TorchAutocast utility class.
12
- Allows you to enable and disable autocast. This is specially useful
13
- when dealing with different architectures and clusters with different
14
- levels of support.
15
-
16
- Args:
17
- enabled (bool): Whether to enable torch.autocast or not.
18
- args: Additional args for torch.autocast.
19
- kwargs: Additional kwargs for torch.autocast
20
- """
21
- def __init__(self, enabled: bool, *args, **kwargs):
22
- self.autocast = torch.autocast(*args, **kwargs) if enabled else None
23
-
24
- def __enter__(self):
25
- if self.autocast is None:
26
- return
27
- try:
28
- self.autocast.__enter__()
29
- except RuntimeError:
30
- device = self.autocast.device
31
- dtype = self.autocast.fast_dtype
32
- raise RuntimeError(
33
- f"There was an error autocasting with dtype={dtype} device={device}\n"
34
- "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
35
- )
36
-
37
- def __exit__(self, *args, **kwargs):
38
- if self.autocast is None:
39
- return
40
- self.autocast.__exit__(*args, **kwargs)
audiocraft/utils/cache.py DELETED
@@ -1,324 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- from concurrent.futures import ThreadPoolExecutor
8
- from collections import deque
9
- from functools import partial
10
- from hashlib import sha1
11
- import logging
12
- from pathlib import Path
13
- import sys
14
- import typing as tp
15
- import zipfile
16
-
17
- import flashy
18
- import torch
19
-
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
- def get_full_embed(full_embed: torch.Tensor, x: tp.Any, idx: int, device: tp.Union[str, torch.device]) -> torch.Tensor:
25
- """Utility function for the EmbeddingCache, returning the full embedding without any chunking.
26
- This method can be used in case there is no need in extracting a chunk of the full embedding
27
- read from the cache.
28
-
29
- Args:
30
- full_embed (torch.Tensor): The full embedding.
31
- x (any): Batch object from which the full embedding is derived.
32
- idx (torch.Tensor): Index of object to consider in the batch object.
33
- Returns:
34
- full_embed (torch.Tensor): The full embedding
35
- """
36
- return full_embed.to(device)
37
-
38
-
39
- class EmbeddingCache:
40
- """Cache around embeddings computation for faster execution.
41
- The EmbeddingCache is storing pre-computed embeddings on disk and provides a simple API
42
- to retrieve the pre-computed embeddings on full inputs and extract only a given chunk
43
- using a user-provided function. When the cache is warm (all embeddings are pre-computed),
44
- the EmbeddingCache allows for faster training as it removes the need of computing the embeddings.
45
- Additionally, it provides in-memory cache around the loaded embeddings to limit IO footprint
46
- and synchronization points in the forward calls.
47
-
48
- Args:
49
- cache_path (Path): Path to folder where all pre-computed embeddings are saved on disk.
50
- device (str or torch.device): Device on which the embedding is returned.
51
- compute_embed_fn (callable[[Path, any, int], torch.Tensor], optional): Function to compute
52
- the embedding from a given object and path. This user provided function can compute the
53
- embedding from the provided object or using the provided path as entry point. The last parameter
54
- specify the index corresponding to the current embedding in the object that can represent batch metadata.
55
- extract_embed_fn (callable[[torch.Tensor, any, int], torch.Tensor], optional): Function to extract
56
- the desired embedding chunk from the full embedding loaded from the cache. The last parameter
57
- specify the index corresponding to the current embedding in the object that can represent batch metadata.
58
- If not specified, will return the full embedding unmodified.
59
- """
60
- def __init__(self, cache_path: tp.Union[str, Path], device: tp.Union[str, torch.device],
61
- compute_embed_fn: tp.Callable[[Path, tp.Any, int], torch.Tensor],
62
- extract_embed_fn: tp.Optional[tp.Callable[[torch.Tensor, tp.Any, int], torch.Tensor]] = None):
63
- self.cache_path = Path(cache_path)
64
- self.device = device
65
- self._compute_embed_fn = compute_embed_fn
66
- self._extract_embed_fn: tp.Callable[[torch.Tensor, tp.Any, int], torch.Tensor]
67
- if extract_embed_fn is not None:
68
- self._extract_embed_fn = extract_embed_fn
69
- else:
70
- self._extract_embed_fn = partial(get_full_embed, device=device)
71
- if self.cache_path is not None:
72
- self.cache_path.mkdir(exist_ok=True, parents=True)
73
- logger.info(f"Cache instantiated at: {self.cache_path}")
74
- self.pool = ThreadPoolExecutor(8)
75
- self.pool.__enter__()
76
- self._current_batch_cache: dict = {}
77
- self._memory_cache: dict = {}
78
-
79
- def _get_cache_path(self, path: tp.Union[Path, str]):
80
- """Get cache path for the given file path."""
81
- sig = sha1(str(path).encode()).hexdigest()
82
- return self.cache_path / sig
83
-
84
- @staticmethod
85
- def _get_full_embed_from_cache(cache: Path):
86
- """Loads full pre-computed embedding from the cache."""
87
- try:
88
- embed = torch.load(cache, 'cpu')
89
- except Exception as exc:
90
- logger.error("Error loading %s: %r", cache, exc)
91
- embed = None
92
- return embed
93
-
94
- def get_embed_from_cache(self, paths: tp.List[Path], x: tp.Any) -> torch.Tensor:
95
- """Get embedding from cache, computing and storing it to cache if not already cached.
96
- The EmbeddingCache first tries to load the embedding from the in-memory cache
97
- containing the pre-computed chunks populated through `populate_embed_cache`.
98
- If not found, the full embedding is computed and stored on disk to be later accessed
99
- to populate the in-memory cache, and the desired embedding chunk is extracted and returned.
100
-
101
- Args:
102
- paths (list[Path or str]): List of paths from where the embeddings can be loaded.
103
- x (any): Object from which the embedding is extracted.
104
- """
105
- embeds = []
106
- for idx, path in enumerate(paths):
107
- cache = self._get_cache_path(path)
108
- if cache in self._current_batch_cache:
109
- embed = self._current_batch_cache[cache]
110
- else:
111
- full_embed = self._compute_embed_fn(path, x, idx)
112
- try:
113
- with flashy.utils.write_and_rename(cache, pid=True) as f:
114
- torch.save(full_embed.cpu(), f)
115
- except Exception as exc:
116
- logger.error('Error saving embed %s (%s): %r', cache, full_embed.shape, exc)
117
- else:
118
- logger.info('New embed cache saved: %s (%s)', cache, full_embed.shape)
119
- embed = self._extract_embed_fn(full_embed, x, idx)
120
- embeds.append(embed)
121
- embed = torch.stack(embeds, dim=0)
122
- return embed
123
-
124
- def populate_embed_cache(self, paths: tp.List[Path], x: tp.Any) -> None:
125
- """Populate in-memory caches for embeddings reading from the embeddings stored on disk.
126
- The in-memory caches consist in a cache for the full embedding and another cache for the
127
- final embedding chunk. Such caches are used to limit the IO access when computing the actual embeddings
128
- and reduce the IO footprint and synchronization points during forward passes.
129
-
130
- Args:
131
- paths (list[Path]): List of paths from where the embeddings can be loaded.
132
- x (any): Object from which the embedding is extracted.
133
- """
134
- self._current_batch_cache.clear()
135
- if self.cache_path is not None:
136
- futures: list = []
137
- for path in paths:
138
- assert path is not None, "Path is required for computation from cache"
139
- cache = self._get_cache_path(path)
140
- if cache in self._memory_cache or not cache.exists():
141
- futures.append(None)
142
- else:
143
- futures.append(self.pool.submit(EmbeddingCache._get_full_embed_from_cache, cache))
144
- for idx, (path, future) in enumerate(zip(paths, futures)):
145
- assert path is not None
146
- cache = self._get_cache_path(path)
147
- full_embed = None
148
- if future is None:
149
- if cache in self._memory_cache:
150
- full_embed = self._memory_cache[cache]
151
- else:
152
- full_embed = future.result()
153
- if full_embed is not None:
154
- self._memory_cache[cache] = full_embed
155
- full_embed = full_embed.to(self.device)
156
- if full_embed is not None:
157
- embed = self._extract_embed_fn(full_embed, x, idx)
158
- self._current_batch_cache[cache] = embed
159
-
160
-
161
- class CachedBatchWriter:
162
- """Write pre computed caches for mini batches. This can
163
- make loading a lot more efficient depending on your filesystem.
164
-
165
- Args:
166
- cache_folder (Path): folder in which the cached minibatches
167
- will be stored.
168
-
169
- Inside cache folder, the structure is the following:
170
- `epoch_number / update_number.zip`
171
- And the zip file contains one entry per batch item.
172
-
173
- It is possible to use the cache with a batch size smaller than
174
- created with but obviously not larger. Make sure to call the
175
- `start_epoch(epoch)` method for indicating changes of epochs.
176
-
177
- See the grid `audiocraft/grids/musicgen/musicgen_warmup_cache.py`
178
- for an example of how to warmup the cache.
179
- """
180
- def __init__(self, cache_folder: Path):
181
- self.cache_folder = cache_folder
182
- self._current_epoch: tp.Optional[int] = None
183
- self._current_index = 0
184
-
185
- def start_epoch(self, epoch: int):
186
- """Call at the beginning of each epoch.
187
- """
188
- self._current_epoch = epoch
189
- self._current_index = 0
190
- self._zip_path.parent.mkdir(exist_ok=True, parents=True)
191
-
192
- @staticmethod
193
- def _get_zip_path(cache_folder: Path, epoch: int, index: int):
194
- return cache_folder / f"{epoch:05d}" / f"{index:06d}.zip"
195
-
196
- @property
197
- def _zip_path(self):
198
- assert self._current_epoch is not None
199
- return CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch, self._current_index)
200
-
201
- def save(self, *content):
202
- """Save one mini batch. This function is distributed-aware
203
- and will automatically merge all the items from the different
204
- workers.
205
- """
206
- all_contents = []
207
- for rank in range(flashy.distrib.world_size()):
208
- their_content = flashy.distrib.broadcast_object(content, src=rank)
209
- all_contents.append(their_content)
210
-
211
- if flashy.distrib.is_rank_zero():
212
- idx = 0
213
- with flashy.utils.write_and_rename(self._zip_path) as tmp:
214
- with zipfile.ZipFile(tmp, 'w') as zf:
215
- for content in all_contents:
216
- for vals in zip(*content):
217
- with zf.open(f'{idx}', 'w') as f: # type: ignore
218
- torch.save(vals, f)
219
- idx += 1
220
- flashy.distrib.barrier()
221
- self._current_index += 1
222
-
223
-
224
- class CachedBatchLoader:
225
- """Loader for cached mini-batches dumped with `CachedBatchWriter`.
226
-
227
- Args:
228
- cache_folder (Path): folder in which the cached minibatches are stored.
229
- batch_size (int): batch size (per GPU) expected.
230
- num_workers (int): number of workers to use for loading.
231
- min_length (int): minimum expected length for each epoch. If some
232
- mini-batches are missing, and error is raised.
233
-
234
- This is iterable just like a regular DataLoader.
235
- """
236
-
237
- def __init__(self, cache_folder: Path, batch_size: int,
238
- num_workers: int = 10, min_length: int = 1):
239
- self.cache_folder = cache_folder
240
- self.batch_size = batch_size
241
- self.num_workers = num_workers
242
- self.min_length = min_length
243
- self._current_epoch: tp.Optional[int] = None
244
- self.sampler = None # for compatibility with the regular DataLoader
245
-
246
- def __len__(self):
247
- path = CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch or 0, 0).parent
248
- return len([p for p in path.iterdir() if p.suffix == ".zip"])
249
-
250
- def start_epoch(self, epoch: int):
251
- """Call at the beginning of each epoch.
252
- """
253
- self._current_epoch = epoch
254
-
255
- def _zip_path(self, index: int):
256
- assert self._current_epoch is not None
257
- return CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch, index)
258
-
259
- def _load_one(self, index: int):
260
- zip_path = self._zip_path(index)
261
- if not zip_path.exists():
262
- if index < self.min_length:
263
- raise RuntimeError(f"Cache should have at least {self.min_length} batches, but {index} doesn't exist")
264
-
265
- return None
266
- mode = "rb" if sys.version_info >= (3, 9) else "r"
267
- try:
268
- with zipfile.ZipFile(zip_path, 'r') as zf:
269
- rank = flashy.distrib.rank()
270
- world_size = flashy.distrib.world_size()
271
- root = zipfile.Path(zf)
272
- items = list(root.iterdir())
273
- total_batch_size = self.batch_size * world_size
274
- if len(items) < total_batch_size:
275
- raise RuntimeError(
276
- f"The cache can handle a max batch size of {len(items)}, "
277
- f"but {total_batch_size} is needed.")
278
- start = rank * self.batch_size
279
- items = items[start: start + self.batch_size]
280
- assert len(items) == self.batch_size
281
- entries = []
282
- entries = [torch.load(item.open(mode), 'cpu') for item in items] # type: ignore
283
- transposed = zip(*entries)
284
- out = []
285
- for part in transposed:
286
- assert len(part) > 0
287
- if isinstance(part[0], torch.Tensor):
288
- out.append(torch.stack(part))
289
- else:
290
- assert isinstance(part, torch.Tensor)
291
- out.append(part)
292
- return out
293
- except Exception:
294
- logger.error("Error when reading zip path %s", zip_path)
295
- raise
296
-
297
- def __iter__(self):
298
- """This will yields tuples, exactly as provided to the
299
- `CachedBatchWriter.save` method.
300
- """
301
- pool = ThreadPoolExecutor(self.num_workers)
302
- next_index = 0
303
- queue = deque()
304
-
305
- def _get_next():
306
- nonlocal next_index
307
- r = queue.popleft().result()
308
- if r is None:
309
- return None
310
- else:
311
- queue.append(pool.submit(self._load_one, next_index))
312
- next_index += 1
313
- return r
314
-
315
- with pool:
316
- # fill the buffer of fetching jobs.
317
- for _ in range(2 * self.num_workers):
318
- queue.append(pool.submit(self._load_one, next_index))
319
- next_index += 1
320
- while True:
321
- batch = _get_next()
322
- if batch is None:
323
- return
324
- yield batch
audiocraft/utils/checkpoint.py DELETED
@@ -1,161 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- from enum import Enum
8
- import logging
9
- from pathlib import Path
10
- import re
11
- import typing as tp
12
-
13
- import flashy
14
- import torch
15
-
16
- from ..environment import AudioCraftEnvironment
17
-
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class CheckpointSource(Enum):
23
- CURRENT_XP = "current_xp"
24
- PRETRAINED = "pretrained"
25
- OTHER = "other"
26
-
27
-
28
- def checkpoint_name(name: tp.Optional[str] = None, rank: tp.Optional[int] = None, use_fsdp: bool = False) -> str:
29
- """Checkpoint name formatted for all use in AudioCraft codebase and has the following format:
30
- `checkpoint_<name>.th(.<rank>)`. By convention, name is expected to be empty for last checkpoint,
31
- 'best' for the best checkpoint or the epoch number.
32
-
33
- Args:
34
- name (str, optional): Name suffix for the checkpoint file stem.
35
- rank (optional, int): Rank for distributed processing, retrieved with flashy if not provided.
36
- use_fsdp (bool): Whether the calling solver relies on FSDP.
37
- Returns:
38
- str: The checkpoint name.
39
- """
40
- suffix = ''
41
- if rank is None:
42
- rank = flashy.distrib.rank()
43
- if rank > 0 and use_fsdp:
44
- suffix = '.' + str(rank)
45
- name_part = ''
46
- if name is not None:
47
- name_part = f'_{name}'
48
- return f'checkpoint{name_part}.th{suffix}'
49
-
50
-
51
- def is_sharded_checkpoint(path: Path) -> bool:
52
- """Whether the checkpoint at the given path corresponds to a sharded checkpoint across rank."""
53
- return re.search(r'\.th\.\d+$', path.name) is not None
54
-
55
-
56
- def resolve_checkpoint_path(sig_or_path: tp.Union[Path, str], name: tp.Optional[str] = None,
57
- use_fsdp: bool = False) -> tp.Optional[Path]:
58
- """Resolve a given checkpoint path for a provided dora sig or path.
59
-
60
- Args:
61
- sig_or_path (Path or str): Checkpoint path or dora signature.
62
- name (str, optional): Name suffix for the checkpoint file stem.
63
- rank (optional, int): Rank for distributed processing, retrieved with flashy if not provided.
64
- use_fsdp (bool): Whether the calling solver relies on FSDP.
65
- Returns:
66
- Path, optional: Resolved checkpoint path, if it exists.
67
- """
68
- from audiocraft import train
69
- xps_root = train.main.dora.dir / 'xps'
70
- sig_or_path = str(sig_or_path)
71
- if sig_or_path.startswith('//sig/'):
72
- sig = sig_or_path[len('//sig/'):]
73
- path = xps_root / sig
74
- else:
75
- path = Path(sig_or_path)
76
- path = AudioCraftEnvironment.resolve_reference_path(path)
77
-
78
- if path.is_dir():
79
- path = path / checkpoint_name(name, use_fsdp=use_fsdp)
80
-
81
- if path.exists():
82
- return path
83
- else:
84
- return None
85
-
86
-
87
- def load_checkpoint(checkpoint_path: Path, is_sharded: bool = False) -> tp.Any:
88
- """Load state from checkpoints at the specified checkpoint path."""
89
- if is_sharded:
90
- rank0_checkpoint_path = checkpoint_path.parent / checkpoint_name(use_fsdp=False)
91
- if rank0_checkpoint_path.exists():
92
- check_sharded_checkpoint(checkpoint_path, rank0_checkpoint_path)
93
- state = torch.load(checkpoint_path, 'cpu')
94
- logger.info("Checkpoint loaded from %s", checkpoint_path)
95
- return state
96
-
97
-
98
- def save_checkpoint(state: tp.Any, checkpoint_path: Path, is_sharded: bool = False) -> None:
99
- """Save state to disk to the specified checkpoint_path."""
100
- _safe_save_checkpoint(state, checkpoint_path, is_sharded)
101
- logger.info("Checkpoint saved to %s", checkpoint_path)
102
-
103
-
104
- def flush_stale_checkpoints(checkpoint_path: Path, keep_last: tp.Optional[int] = None) -> None:
105
- """Flush checkpoints to only keep last N checkpoints."""
106
- if keep_last is None or keep_last <= 0:
107
- return
108
- checkpoint_dir = checkpoint_path.parent
109
- suffix = ''
110
- if flashy.distrib.rank() > 0:
111
- suffix = f'.{flashy.distrib.rank()}'
112
- checkpoint_files_with_epoch = []
113
- for path in Path(checkpoint_dir).glob(f'checkpoint_*.th{suffix}'):
114
- epoch_part = path.name.split('.', 1)[0].split('_', 1)[1]
115
- if epoch_part.isdigit():
116
- checkpoint_files_with_epoch.append((path, int(epoch_part)))
117
- checkpoint_files = [path for path, _ in list(sorted(checkpoint_files_with_epoch, key=lambda t: t[1]))]
118
- total_to_flush = max(0, len(checkpoint_files) - keep_last)
119
- files_to_flush = checkpoint_files[:total_to_flush]
120
- for path in files_to_flush:
121
- logger.debug("Removing checkpoint: %s", str(path))
122
- path.unlink(missing_ok=True)
123
-
124
-
125
- def check_sharded_checkpoint(checkpoint_path: Path, rank0_checkpoint_path: Path) -> None:
126
- """Check sharded checkpoint state, ensuring the checkpoints are not corrupted."""
127
- # Finish the work of a previous run that got interrupted while dumping.
128
- old_path = Path(str(checkpoint_path) + '.old')
129
- if old_path.exists():
130
- raise RuntimeError(
131
- f"Old checkpoint {old_path} from previous version of this code exist, cannot safely proceed.")
132
- token = Path(str(rank0_checkpoint_path) + '.tmp.done')
133
- tmp_path = Path(str(checkpoint_path) + '.tmp')
134
- if token.exists():
135
- if tmp_path.exists():
136
- tmp_path.rename(checkpoint_path)
137
- flashy.distrib.barrier()
138
- if flashy.distrib.is_rank_zero() and token.exists():
139
- token.unlink()
140
-
141
-
142
- def _safe_save_checkpoint(state: tp.Any, checkpoint_path: Path, is_sharded: bool = False) -> None:
143
- """Save checkpoints in a safe manner even with when sharded checkpoints across nodes."""
144
- def _barrier_if_sharded():
145
- if is_sharded:
146
- flashy.distrib.barrier()
147
-
148
- if flashy.distrib.is_rank_zero():
149
- token = Path(str(checkpoint_path) + '.tmp.done')
150
- if token.exists():
151
- token.unlink()
152
- _barrier_if_sharded()
153
- with flashy.utils.write_and_rename(checkpoint_path) as f:
154
- torch.save(state, f)
155
- _barrier_if_sharded()
156
- if flashy.distrib.is_rank_zero():
157
- token.touch()
158
- _barrier_if_sharded()
159
- _barrier_if_sharded()
160
- if flashy.distrib.rank() == 0:
161
- token.unlink()
audiocraft/utils/export.py DELETED
@@ -1,79 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- Utility to export a training checkpoint to a lightweight release checkpoint.
9
- """
10
-
11
- from pathlib import Path
12
- import typing as tp
13
-
14
- from omegaconf import OmegaConf
15
- import torch
16
-
17
- from audiocraft import __version__
18
-
19
-
20
- def export_encodec(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
21
- """Export only the best state from the given EnCodec checkpoint. This
22
- should be used if you trained your own EnCodec model.
23
- """
24
- pkg = torch.load(checkpoint_path, 'cpu')
25
- new_pkg = {
26
- 'best_state': pkg['best_state']['model'],
27
- 'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
28
- 'version': __version__,
29
- 'exported': True,
30
- }
31
- Path(out_file).parent.mkdir(exist_ok=True, parents=True)
32
- torch.save(new_pkg, out_file)
33
- return out_file
34
-
35
-
36
- def export_pretrained_compression_model(pretrained_encodec: str, out_file: tp.Union[Path, str]):
37
- """Export a compression model (potentially EnCodec) from a pretrained model.
38
- This is required for packaging the audio tokenizer along a MusicGen or AudioGen model.
39
- Do not include the //pretrained/ prefix. For instance if you trained a model
40
- with `facebook/encodec_32khz`, just put that as a name. Same for `dac_44khz`.
41
-
42
- In that case, this will not actually include a copy of the model, simply the reference
43
- to the model used.
44
- """
45
- if Path(pretrained_encodec).exists():
46
- pkg = torch.load(pretrained_encodec)
47
- assert 'best_state' in pkg
48
- assert 'xp.cfg' in pkg
49
- assert 'version' in pkg
50
- assert 'exported' in pkg
51
- else:
52
- pkg = {
53
- 'pretrained': pretrained_encodec,
54
- 'exported': True,
55
- 'version': __version__,
56
- }
57
- Path(out_file).parent.mkdir(exist_ok=True, parents=True)
58
- torch.save(pkg, out_file)
59
-
60
-
61
- def export_lm(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
62
- """Export only the best state from the given MusicGen or AudioGen checkpoint.
63
- """
64
- pkg = torch.load(checkpoint_path, 'cpu')
65
- if pkg['fsdp_best_state']:
66
- best_state = pkg['fsdp_best_state']['model']
67
- else:
68
- assert pkg['best_state']
69
- best_state = pkg['best_state']['model']
70
- new_pkg = {
71
- 'best_state': best_state,
72
- 'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
73
- 'version': __version__,
74
- 'exported': True,
75
- }
76
-
77
- Path(out_file).parent.mkdir(exist_ok=True, parents=True)
78
- torch.save(new_pkg, out_file)
79
- return out_file
audiocraft/utils/export_legacy.py DELETED
@@ -1,70 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- Legacy functions used at the time of the first release, kept for referencd.
9
- """
10
-
11
- from pathlib import Path
12
- import typing as tp
13
-
14
- from omegaconf import OmegaConf, DictConfig
15
- import torch
16
-
17
- from audiocraft import __version__
18
-
19
-
20
- def _clean_lm_cfg(cfg: DictConfig):
21
- OmegaConf.set_struct(cfg, False)
22
- # This used to be set automatically in the LM solver, need a more robust solution
23
- # for the future.
24
- cfg['transformer_lm']['card'] = 2048
25
- n_q = 4
26
- stereo_cfg = getattr(cfg, 'interleave_stereo_codebooks', None)
27
- if stereo_cfg is not None and stereo_cfg.use:
28
- if 'downsample' in stereo_cfg:
29
- del stereo_cfg['downsample']
30
- n_q = 8
31
- cfg['transformer_lm']['n_q'] = n_q
32
- # Experimental params no longer supported.
33
- bad_params = ['spectral_norm_attn_iters', 'spectral_norm_ff_iters',
34
- 'residual_balancer_attn', 'residual_balancer_ff', 'layer_drop']
35
- for name in bad_params:
36
- del cfg['transformer_lm'][name]
37
- OmegaConf.set_struct(cfg, True)
38
- return cfg
39
-
40
-
41
- def export_encodec(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
42
- pkg = torch.load(checkpoint_path, 'cpu')
43
- new_pkg = {
44
- 'best_state': pkg['ema']['state']['model'],
45
- 'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
46
- # The following params were NOT exported for the first release of MusicGen.
47
- 'version': __version__,
48
- 'exported': True,
49
- }
50
- Path(out_file).parent.mkdir(exist_ok=True, parents=True)
51
- torch.save(new_pkg, out_file)
52
- return out_file
53
-
54
-
55
- def export_lm(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
56
- pkg = torch.load(checkpoint_path, 'cpu')
57
- if pkg['fsdp_best_state']:
58
- best_state = pkg['fsdp_best_state']['model']
59
- else:
60
- best_state = pkg['best_state']['model']
61
- new_pkg = {
62
- 'best_state': best_state,
63
- 'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg'])),
64
- # The following params were NOT exported for the first release of MusicGen.
65
- 'version': __version__,
66
- 'exported': True,
67
- }
68
- Path(out_file).parent.mkdir(exist_ok=True, parents=True)
69
- torch.save(new_pkg, out_file)
70
- return out_file
audiocraft/utils/notebook.py DELETED
@@ -1,32 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- try:
8
- import IPython.display as ipd # type: ignore
9
- except ImportError:
10
- # Note in a notebook...
11
- pass
12
-
13
-
14
- import torch
15
-
16
-
17
- def display_audio(samples: torch.Tensor, sample_rate: int):
18
- """Renders an audio player for the given audio samples.
19
-
20
- Args:
21
- samples (torch.Tensor): a Tensor of decoded audio samples
22
- with shapes [B, C, T] or [C, T]
23
- sample_rate (int): sample rate audio should be displayed with.
24
- """
25
- assert samples.dim() == 2 or samples.dim() == 3
26
-
27
- samples = samples.detach().cpu()
28
- if samples.dim() == 2:
29
- samples = samples[None, ...]
30
-
31
- for audio in samples:
32
- ipd.display(ipd.Audio(audio, rate=sample_rate))
audiocraft/utils/profiler.py DELETED
@@ -1,38 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- import logging
8
- import typing as tp
9
-
10
- import dora
11
- import torch
12
-
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
-
17
- class Profiler:
18
- """Context manager wrapper for xformers profiler.
19
- """
20
- def __init__(self, module: torch.nn.Module, enabled: bool = False):
21
- self.profiler: tp.Optional[tp.Any] = None
22
- if enabled:
23
- from xformers.profiler import profile
24
- output_dir = dora.get_xp().folder / 'profiler_data'
25
- logger.info("Profiling activated, results with be saved to %s", output_dir)
26
- self.profiler = profile(output_dir=output_dir, module=module)
27
-
28
- def step(self):
29
- if self.profiler is not None:
30
- self.profiler.step() # type: ignore
31
-
32
- def __enter__(self):
33
- if self.profiler is not None:
34
- return self.profiler.__enter__() # type: ignore
35
-
36
- def __exit__(self, exc_type, exc_value, exc_tb):
37
- if self.profiler is not None:
38
- return self.profiler.__exit__(exc_type, exc_value, exc_tb) # type: ignore
audiocraft/utils/utils.py DELETED
@@ -1,98 +0,0 @@
1
- import hashlib
2
- import json
3
- import logging
4
- import typing as tp
5
- import flashy
6
- import flashy.distrib
7
- import omegaconf
8
- import torch
9
-
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- def model_hash(model: torch.nn.Module) -> str:
15
- """Return a model hash. This should allow us to track regressions in model init
16
- from the logs of past experiments.
17
- """
18
- hasher = hashlib.sha1()
19
- for p in model.parameters():
20
- hasher.update(p.data.cpu().numpy().tobytes())
21
- return hasher.hexdigest()
22
-
23
-
24
- def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
25
- """Convenience function to map an omegaconf configuration to a dictionary.
26
-
27
- Args:
28
- cfg (omegaconf.DictConfig): Original configuration to map to dict.
29
- Returns:
30
- dict: Config as dictionary object.
31
- """
32
- dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
33
- assert isinstance(dct, dict)
34
- return dct
35
-
36
-
37
-
38
-
39
-
40
- def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
41
- num_workers: int, seed: int, **kwargs) -> torch.utils.data.DataLoader:
42
- """Convenience function to load dataset into a dataloader with optional subset sampling.
43
-
44
- Args:
45
- dataset: Dataset to load.
46
- num_samples (Optional[int]): Number of samples to limit subset size.
47
- batch_size (int): Batch size.
48
- num_workers (int): Number of workers for data loading.
49
- seed (int): Random seed.
50
- """
51
- if num_samples is not None:
52
- dataset = random_subset(dataset, num_samples, seed)
53
-
54
- dataloader = flashy.distrib.loader(
55
- dataset,
56
- batch_size=batch_size,
57
- num_workers=num_workers,
58
- **kwargs
59
- )
60
- return dataloader
61
-
62
-
63
- def get_dataset_from_loader(dataloader):
64
- dataset = dataloader.dataset
65
- if isinstance(dataset, torch.utils.data.Subset):
66
- return dataset.dataset
67
- else:
68
- return dataset
69
-
70
-
71
-
72
-
73
-
74
- def sample_top_k(p, k=250, n_draw=None):
75
- """
76
- p probabs 2048 ?
77
- num_draw : how many tokens to sample (for duplicate elongation)
78
- """
79
-
80
- p = torch.softmax(p, dim=-1) # p/temp
81
-
82
-
83
-
84
- top_k_value, i250 = torch.topk(p, k, dim=-1) # probs: [1, 4, 2048]
85
- # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
86
- min_value_top_k = top_k_value[..., [-1]] #
87
- p *= (p >= min_value_top_k).float()
88
- p.div_(p.sum(dim=-1, keepdim=True))
89
- # -- next_token = multinomial(probs, num_samples=num_draw)
90
-
91
- # RESHAPED into bs, 4, 250
92
- p_ = p.reshape(-1, p.shape[-1])
93
-
94
-
95
- out = torch.multinomial(p_,
96
- num_samples=n_draw,
97
- replacement=False) # [4, num_draw]
98
- return out.transpose(0, 1)[:, :, None] # [num_draw, 4, 1]
demo.py CHANGED
@@ -1,14 +1,84 @@
-from audiocraft.audiogen import AudioGen  #, audio_write
 import audiofile
 import numpy as np
+import typing as tp
+import torch
+
+from audiocraft.loaders import load_compression_model, load_lm_model
+from audiocraft.lm import LMModel
+from audiocraft.conditioners import ConditioningAttributes
+
+
+
+
+class AudioGen():
+
+    def __init__(self,
+                 compression_model=None,
+                 lm=None,
+                 duration=.04,
+                 top_k=249):
+
+        self.compression_model = compression_model
+        self.lm = lm
+        self.top_k = top_k
+        self.compression_model.eval()
+        self.lm.eval()
+        self.duration = duration
+        self.device = next(iter(lm.parameters())).device
+
+    @property
+    def frame_rate(self) -> float:
+        """Roughly the number of AR steps per seconds."""
+        return self.compression_model.frame_rate
+
+    @property
+    def sample_rate(self) -> int:
+        """Sample rate of the generated audio."""
+        return self.compression_model.sample_rate
+
+    def generate(self, descriptions):
+        attributes = [
+            ConditioningAttributes(text={'description': d}) for d in descriptions]
+        tokens = self._generate_tokens(attributes)
+        print(f'\n{tokens.shape=}\n{tokens=} FINAL 5 AUD')
+        return self.generate_audio(tokens)
+
+    def _generate_tokens(self, attributes):
+        total_gen_len = int(self.duration * self.frame_rate)
+        gen_tokens = self.lm.generate(conditions=attributes,
+                                      max_gen_len=total_gen_len)
+        gen_tokens = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
+        return gen_tokens
+
+    def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
+        """Generate Audio from tokens."""
+        assert gen_tokens.dim() == 3
+        with torch.no_grad():
+            gen_audio = self.compression_model.decode(gen_tokens, None)
+        return gen_audio
+
+device = 'cuda:0'
+# https://huggingface.co/facebook/audiogen-medium
+
+
+sound_generator = AudioGen(
+    compression_model=load_compression_model('facebook/audiogen-medium', device=device),
+    lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float),
+    duration=.04,
+    top_k=1)
+
+
+
 
 print('\n\n\n\n___________________')
 
 txt = 'dogs barging in the street'
 
-sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=.46)  # why is generating so long at 14 seconds
-
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
 
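
The hunk above stops right after normalization; a sketch of how the waveform could be written out with the audiofile package already imported at the top of demo.py (the filename is illustrative, not from the commit):

    # (1, samples) array at the sound generator's sample rate
    audiofile.write('generated.wav', x[None, :], sound_generator.sample_rate)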