AudioGen class

Browse files

Files changed (8)

audiocraft/__init__.py +1 -0
audiocraft/builders.py +241 -132
audiocraft/conditioners.py +4 -32
audiocraft/lm.py +5 -5
audiocraft/loaders.py +0 -130
audiocraft/lstm.py +0 -25
audiocraft/seanet.py +21 -3
demo.py +3 -57

audiocraft/__init__.py CHANGED

	@@ -0,0 +1 @@


1	+ from .builders import AudioGen

audiocraft/builders.py CHANGED

@@ -1,12 +1,11 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
 import typing as tp
 import omegaconf
 import torch
 from .encodec import EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
@@ -15,143 +14,253 @@ from .conditioners import (
     ConditionFuser,
     ConditioningProvider,
     T5Conditioner,
 )
 from .vq import ResidualVectorQuantizer
 def dict_from_config(cfg):
     dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
     return dct
-def get_quantizer(quantizer, cfg, dimension):
-    klass = {
-        'no_quant': None,
-        'rvq': ResidualVectorQuantizer
-    }[quantizer]
-    kwargs = dict_from_config(getattr(cfg, quantizer))
-    if quantizer != 'no_quant':
-        kwargs['dimension'] = dimension
-    return klass(**kwargs)
-def get_encodec_autoencoder(cfg):
-    kwargs = dict_from_config(getattr(cfg, 'seanet'))
-    _ = kwargs.pop('encoder')
-    decoder_override_kwargs = kwargs.pop('decoder')
-    decoder_kwargs = {**kwargs, **decoder_override_kwargs}
-    decoder = SEANetDecoder(**decoder_kwargs)
-    return decoder
-def get_compression_model(cfg):
-    """Instantiate a compression model."""
-    if cfg.compression_model == 'encodec':
-        kwargs = dict_from_config(getattr(cfg, 'encodec'))
-        quantizer_name = kwargs.pop('quantizer')
-        decoder = get_encodec_autoencoder(cfg)
-        quantizer = get_quantizer(quantizer_name, cfg, 128)
-        renormalize = kwargs.pop('renormalize', False)
-        # deprecated params
-        # print(f'{frame_rate=} {encoder.dimension=}')  frame_rate=50 encoder.dimension=128
-        kwargs.pop('renorm', None)
-        # print('\n______!____________\n', kwargs, '\n______!____________\n')
-        #     ______!____________
-        #     {'autoencoder': 'seanet', 'sample_rate': 16000, 'channels': 1, 'causal': False}
-        #     ______!____________
-        return EncodecModel(decoder=decoder,
-                            quantizer=quantizer,
-                            frame_rate=50,
-                            renormalize=renormalize,
-                            sample_rate=16000,
-                            channels=1,
-                            causal=False
-                            ).to(cfg.device)
-    else:
-        raise KeyError(f"Unexpected compression model {cfg.compression_model}")
-def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
-    """Instantiate a transformer LM."""
-    if cfg.lm_model in ['transformer_lm', 'transformer_lm_magnet']:
-        kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
-        n_q = kwargs['n_q']
-        q_modeling = kwargs.pop('q_modeling', None)
-        codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
-        attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
-        cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
-        cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
-        fuser = get_condition_fuser(cfg)
-        condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
-        if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-att programmatically
-            kwargs['cross_attention'] = True
-        if codebooks_pattern_cfg.modeling is None:
-            assert q_modeling is not None, \
-                "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
-            codebooks_pattern_cfg = omegaconf.OmegaConf.create(
-                {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
-            )
-        pattern_provider = get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
-        # lm_class = MagnetLMModel if cfg.lm_model == 'transformer_lm_magnet' else LMModel
-        lm_class = LMModel # hard coded D
-        print(f'{lm_class=}\n\n\n\n=====================')
-        return lm_class(
-            pattern_provider=pattern_provider,
-            condition_provider=condition_provider,
-            fuser=fuser,
-            cfg_dropout=cfg_prob,
-            cfg_coef=cfg_coef,
-            attribute_dropout=attribute_dropout,
-            dtype=getattr(torch, cfg.dtype),
-            device=cfg.device,
-            **kwargs
-        ).to(cfg.device)
-    else:
-        raise KeyError(f"Unexpected LM model {cfg.lm_model}")
-def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditioningProvider:
-    """Instantiate a conditioning model."""
-    device = cfg.device
-    duration = cfg.dataset.segment_duration
-    cfg = getattr(cfg, 'conditioners')
-    dict_cfg = {} if cfg is None else dict_from_config(cfg)
-    conditioners: tp.Dict[str, T5Conditioner] = {}
-    condition_provider_args = dict_cfg.pop('args', {})
-    condition_provider_args.pop('merge_text_conditions_p', None)
-    condition_provider_args.pop('drop_desc_p', None)
-    for cond, cond_cfg in dict_cfg.items():
-        model_type = cond_cfg['model']
-        model_args = cond_cfg[model_type]
-        if model_type == 't5':
-            conditioners[str(cond)] = T5Conditioner(output_dim=output_dim, device=device, **model_args)
         else:
-            raise ValueError(f"Unrecognized conditioning model: {model_type}")
-    conditioner = ConditioningProvider(conditioners, device=device, **condition_provider_args)
-    return conditioner
-def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
-    """Instantiate a condition fuser object."""
-    fuser_cfg = getattr(cfg, 'fuser')
-    fuser_methods = ['sum', 'cross', 'prepend', 'input_interpolate']
-    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
-    kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
-    fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
-    return fuser
-def get_codebooks_pattern_provider(n_q, cfg):
-    pattern_providers = {
-        'delay': DelayedPatternProvider,  # THIS
-    }
-    name = cfg.modeling
-    kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
-    klass = pattern_providers[name]
-    return klass(n_q, **kwargs)

 import typing as tp
 import omegaconf
+from torch import nn
 import torch
+from huggingface_hub import hf_hub_download
+import os
+from omegaconf import OmegaConf, DictConfig
 from .encodec import EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
     ConditionFuser,
     ConditioningProvider,
     T5Conditioner,
+    ConditioningAttributes
 )
 from .vq import ResidualVectorQuantizer
+def _delete_param(cfg: DictConfig, full_name: str):
+    parts = full_name.split('.')
+    for part in parts[:-1]:
+        if part in cfg:
+            cfg = cfg[part]
+        else:
+            return
+    OmegaConf.set_struct(cfg, False)
+    if parts[-1] in cfg:
+        del cfg[parts[-1]]
+    OmegaConf.set_struct(cfg, True)
 def dict_from_config(cfg):
     dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
     return dct
+# ============================================== DEFINE AUDIOGEN
+class AudioGen(nn.Module):
+    # https://huggingface.co/facebook/audiogen-medium
+    def __init__(self,
+                 duration=0.024,
+                 device='cpu'):
+        super().__init__()
+        self.device = device  # needed for loading & select float16 LM
+        self.load_compression_model()
+        self.load_lm_model()
+        self.duration = duration
+    @property
+    def frame_rate(self):
+        return self.compression_model.frame_rate
+    def generate(self,
+                 descriptions):
+        with torch.no_grad():
+            attributes = [
+                ConditioningAttributes(text={'description': d}) for d in descriptions]
+            gen_tokens = self.lm.generate(
+                conditions=attributes,
+                max_gen_len=int(self.duration * self.frame_rate)) #[n_draw, 4, 37]
+            x = self.compression_model.decode(gen_tokens, None)   #[n_draw, 1, 11840]
+            n_draw, _, n_time_samples = x.shape
+            x = x.reshape(1, n_draw * n_time_samples)  # linearise n_draw
+            print('______________\nGENTOk 5', gen_tokens)
+            print('GENAUD 5', x.sum())
+        return x
+    # == BUILD Fn
+    def get_quantizer(self, quantizer, cfg, dimension):
+        klass = {
+            'no_quant': None,
+            'rvq': ResidualVectorQuantizer
+        }[quantizer]
+        kwargs = dict_from_config(getattr(cfg, quantizer))
+        if quantizer != 'no_quant':
+            kwargs['dimension'] = dimension
+        return klass(**kwargs)
+    def get_encodec_autoencoder(self, cfg):
+        kwargs = dict_from_config(getattr(cfg, 'seanet'))
+        _ = kwargs.pop('encoder')
+        decoder_override_kwargs = kwargs.pop('decoder')
+        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+        decoder = SEANetDecoder(**decoder_kwargs)
+        return decoder
+    def get_compression_model(self, cfg):
+        """Instantiate a compression model."""
+        if cfg.compression_model == 'encodec':
+            kwargs = dict_from_config(getattr(cfg, 'encodec'))
+            quantizer_name = kwargs.pop('quantizer')
+            decoder = self.get_encodec_autoencoder(cfg)
+            quantizer = self.get_quantizer(quantizer_name, cfg, 128)
+            renormalize = kwargs.pop('renormalize', False)
+            # deprecated params
+            # print(f'{frame_rate=} {encoder.dimension=}')  frame_rate=50 encoder.dimension=128
+            kwargs.pop('renorm', None)
+            # print('\n______!____________\n', kwargs, '\n______!____________\n')
+            #     ______!____________
+            #     {'autoencoder': 'seanet', 'sample_rate': 16000, 'channels': 1, 'causal': False}
+            #     ______!____________
+            return EncodecModel(decoder=decoder,
+                                quantizer=quantizer,
+                                frame_rate=50,
+                                renormalize=renormalize,
+                                sample_rate=16000,
+                                channels=1,
+                                causal=False
+                                ).to(cfg.device)
         else:
+            raise KeyError(f"Unexpected compression model {cfg.compression_model}")
+    def get_lm_model(self, cfg):
+        """Instantiate a transformer LM."""
+        if cfg.lm_model in ['transformer_lm',
+                            'transformer_lm_magnet']:
+            kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
+            n_q = kwargs['n_q']
+            q_modeling = kwargs.pop('q_modeling', None)
+            codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
+            attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
+            cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
+            cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
+            fuser = self.get_condition_fuser(cfg)
+            condition_provider = self.get_conditioner_provider(kwargs["dim"], cfg
+                                                               ).to(self.device)
+            if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-att programmatically
+                kwargs['cross_attention'] = True
+            if codebooks_pattern_cfg.modeling is None:
+                print('Q MODELING\n=\n=><')
+                assert q_modeling is not None, \
+                    "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
+                codebooks_pattern_cfg = omegaconf.OmegaConf.create(
+                    {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
+                )
+            pattern_provider = self.get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
+            return LMModel(
+                pattern_provider=pattern_provider,
+                condition_provider=condition_provider,
+                fuser=fuser,
+                cfg_dropout=cfg_prob,
+                cfg_coef=cfg_coef,
+                attribute_dropout=attribute_dropout,
+                dtype=getattr(torch, cfg.dtype),
+                device=self.device,
+                **kwargs
+            ).to(cfg.device)
+        else:
+            raise KeyError(f"Unexpected LM model {cfg.lm_model}")
+    def get_conditioner_provider(self, output_dim,
+                                cfg):
+        """Instantiate T5 text"""
+        cfg = getattr(cfg, 'conditioners')
+        dict_cfg = {} if cfg is None else dict_from_config(cfg)
+        conditioners={}
+        condition_provider_args = dict_cfg.pop('args', {})
+        condition_provider_args.pop('merge_text_conditions_p', None)
+        condition_provider_args.pop('drop_desc_p', None)
+        for cond, cond_cfg in dict_cfg.items():
+            model_type = cond_cfg['model']
+            model_args = cond_cfg[model_type]
+            if model_type == 't5':
+                conditioners[str(cond)] = T5Conditioner(output_dim=output_dim,
+                                                        device=self.device,
+                                                        **model_args)
+            else:
+                raise ValueError(f"Unrecognized conditioning model: {model_type}")
+        # print(f'{condition_provider_args=}')
+        return ConditioningProvider(conditioners)
+    def get_condition_fuser(self, cfg):
+        """Instantiate a condition fuser object."""
+        fuser_cfg = getattr(cfg, 'fuser')
+        fuser_methods = ['sum', 'cross', 'prepend', 'input_interpolate']
+        fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
+        kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
+        fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
+        return fuser
+    def get_codebooks_pattern_provider(self, n_q, cfg):
+        pattern_providers = {
+            'delay': DelayedPatternProvider,  # THIS
+        }
+        name = cfg.modeling
+        kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
+        klass = pattern_providers[name]
+        return klass(n_q, **kwargs)
+    # ======================
+    def load_compression_model(self):
+        file = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="compression_state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version= '1.3.0a1')  # Found at __init__.py #audiocraft.__version__)
+        pkg = torch.load(file, map_location='cpu')
+        # if 'pretrained' in pkg:
+        #     print('NO RPtrained\n=\n=\n=\n=\n=')
+        #     return EncodecModel.get_pretrained(pkg['pretrained'], device='cpu')
+        cfg = OmegaConf.create(pkg['xp.cfg'])
+        cfg.device = 'cpu'
+        model = self.get_compression_model(cfg)
+        model.load_state_dict(pkg['best_state'], strict=False)  # ckpt has also unused encoder weights
+        # return model
+        self.compression_model = model
+    def load_lm_model(self):
+        file = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version= '1.3.0a1')  # Found at __init__.py #audiocraft.__version__)
+        pkg = torch.load(file,
+                        map_location=self.device) #'cpu')
+        cfg = OmegaConf.create(pkg['xp.cfg'])
+        # cfg.device = 'cpu'
+        if self.device == 'cpu':
+            cfg.dtype = 'float32'
+        else:
+            cfg.dtype = 'float16'
+        _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
+        _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
+        _delete_param(cfg, 'conditioners.args.drop_desc_p')
+        model = self.get_lm_model(cfg)
+        model.load_state_dict(pkg['best_state'])
+        model.cfg = cfg
+        # return model
+        self.lm = model.to(torch.float)

audiocraft/conditioners.py CHANGED

@@ -173,27 +173,12 @@ class T5Conditioner(nn.Module):
 class ConditioningProvider(nn.Module):
-    """Prepare and provide conditions given all the supported conditioners.
-    Args:
-        conditioners (dict): Dictionary of conditioners.
-        device (torch.device or str, optional): Device for conditioners and output condition types.
-    """
     def __init__(self,
-                 conditioners,
-                 device="cpu"):
         super().__init__()
-        self.device = device
         self.conditioners = nn.ModuleDict(conditioners)
-    # @property
-    # def joint_embed_conditions(self):
-    #     return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
-    # @property
-    # def has_joint_embed_conditions(self):
-    #     return len(self.joint_embed_conditions) > 0
     @property
     def text_conditions(self):
         return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)]
@@ -201,19 +186,6 @@ class ConditioningProvider(nn.Module):
     def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
-        """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
-        This should be called before starting any real GPU work to avoid synchronization points.
-        This will return a dict matching conditioner names to their arbitrary tokenized representations.
-        Args:
-            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
-                text and wav conditions.
-        """
-        assert all([isinstance(x, ConditioningAttributes) for x in inputs]), (
-            "Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]",
-            f" but types were {set([type(x) for x in inputs])}"
-        )
         output = {}
         text = self._collate_text(inputs)
         # wavs = self._collate_wavs(inputs)
@@ -223,9 +195,9 @@ class ConditioningProvider(nn.Module):
         #     f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
         #     f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
         # )
         for attribute, batch in text.items(): #, joint_embeds.items()):
             output[attribute] = self.conditioners[attribute].tokenize(batch)
         return output
     def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
@@ -246,7 +218,7 @@ class ConditioningProvider(nn.Module):
             output[attribute] = (condition, mask)
         return output
-    def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
         """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
         are the attributes and the values are the aggregated input per attribute.
         For example:

 class ConditioningProvider(nn.Module):
     def __init__(self,
+                 conditioners):
         super().__init__()
         self.conditioners = nn.ModuleDict(conditioners)
     @property
     def text_conditions(self):
         return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)]
     def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
         output = {}
         text = self._collate_text(inputs)
         # wavs = self._collate_wavs(inputs)
         #     f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
         #     f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
         # )
         for attribute, batch in text.items(): #, joint_embeds.items()):
             output[attribute] = self.conditioners[attribute].tokenize(batch)
+        print(f'COndProvToknz {output=}\n==')
         return output
     def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
             output[attribute] = (condition, mask)
         return output
+    def _collate_text(self, samples):
         """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
         are the attributes and the values are the aggregated input per attribute.
         For example:

audiocraft/lm.py CHANGED

@@ -10,7 +10,7 @@ from functools import partial
 from torch import nn
 from audiocraft.activations import get_activation_fn
-def sample_top_k(p, k=250, n_draw=None):
     """
         p probabs 2048 ?
         num_draw : how many tokens to sample (for duplicate elongation)
@@ -32,8 +32,8 @@ def sample_top_k(p, k=250, n_draw=None):
     out = torch.multinomial(p_,
-                             num_samples=n_draw,
-                             replacement=False)  # [4, num_draw]
     return out.transpose(0, 1)[:, :, None]       # [num_draw, 4, 1]
@@ -171,7 +171,7 @@ class LMModel(nn.Module):
         super().__init__()
         self.cfg_coef = cfg_coef
-        self.n_draw = 3
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -265,7 +265,7 @@ class LMModel(nn.Module):
         # input_, cross_attention_input = self.fuser(input_, condition_tensors)
         cross_attention_input = condition_tensors['description'][0]
-        print(f'{input_.shape=}')
         out = self.transformer(input_,
                                cross_attention_src=cross_attention_input,
                                token_count=token_count)

 from torch import nn
 from audiocraft.activations import get_activation_fn
+def sample_top_k(p, k=1, n_draw=None):
     """
         p probabs 2048 ?
         num_draw : how many tokens to sample (for duplicate elongation)
     out = torch.multinomial(p_,
+                            num_samples=n_draw,
+                            replacement=False)  # [4, num_draw]
     return out.transpose(0, 1)[:, :, None]       # [num_draw, 4, 1]
         super().__init__()
         self.cfg_coef = cfg_coef
+        self.n_draw = 1
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
         # input_, cross_attention_input = self.fuser(input_, condition_tensors)
         cross_attention_input = condition_tensors['description'][0]
+        # print(f'{input_.shape=}')
         out = self.transformer(input_,
                                cross_attention_src=cross_attention_input,
                                token_count=token_count)

audiocraft/loaders.py DELETED

@@ -1,130 +0,0 @@
-from pathlib import Path
-from huggingface_hub import hf_hub_download
-import typing as tp
-import os
-from omegaconf import OmegaConf, DictConfig
-import torch
-from . import builders
-from .encodec import EncodecModel
-def get_audiocraft_cache_dir() -> tp.Optional[str]:
-    return os.environ.get('AUDIOCRAFT_CACHE_DIR', None)
-def _get_state_dict(
-    file_or_url_or_id: tp.Union[Path, str],
-    filename: tp.Optional[str] = None,
-    device='cpu',
-    cache_dir: tp.Optional[str] = None,
-):
-    if cache_dir is None:
-        cache_dir = get_audiocraft_cache_dir()
-    # Return the state dict either from a file or url
-    file_or_url_or_id = str(file_or_url_or_id)
-    assert isinstance(file_or_url_or_id, str)
-    if os.path.isfile(file_or_url_or_id):
-        return torch.load(file_or_url_or_id, map_location=device)
-    if os.path.isdir(file_or_url_or_id):
-        file = f"{file_or_url_or_id}/{filename}"
-        return torch.load(file, map_location=device)
-    elif file_or_url_or_id.startswith('https://'):
-        return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
-    else:
-        assert filename is not None, "filename needs to be defined if using HF checkpoints"
-        file = hf_hub_download(
-            repo_id=file_or_url_or_id, filename=filename, cache_dir=cache_dir,
-            library_name="audiocraft",
-            library_version= '1.3.0a1')  # Found at __init__.py #audiocraft.__version__)
-        return torch.load(file, map_location=device)
-def load_compression_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
-    return _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
-def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
-    pkg = load_compression_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    if 'pretrained' in pkg:
-        return EncodecModel.get_pretrained(pkg['pretrained'], device=device)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    model = builders.get_compression_model(cfg)
-    model.load_state_dict(pkg['best_state'], strict=False)  # ckpt contains uninstantiated encoder
-    model.eval()
-    return model
-def load_lm_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
-    return _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
-def _delete_param(cfg: DictConfig, full_name: str):
-    parts = full_name.split('.')
-    for part in parts[:-1]:
-        if part in cfg:
-            cfg = cfg[part]
-        else:
-            return
-    OmegaConf.set_struct(cfg, False)
-    if parts[-1] in cfg:
-        del cfg[parts[-1]]
-    OmegaConf.set_struct(cfg, True)
-def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu',
-                  cache_dir: tp.Optional[str] = None):
-    pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
-    _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-    _delete_param(cfg, 'conditioners.args.drop_desc_p')
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
-                  filename: tp.Optional[str] = None,
-                  cache_dir: tp.Optional[str] = None):
-    return _get_state_dict(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
-def load_diffusion_models(file_or_url_or_id: tp.Union[Path, str],
-                          device='cpu',
-                          filename: tp.Optional[str] = None,
-                          cache_dir: tp.Optional[str] = None):
-    pkg = load_mbd_ckpt(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
-    models = []
-    processors = []
-    cfgs = []
-    sample_rate = pkg['sample_rate']
-    for i in range(pkg['n_bands']):
-        cfg = pkg[i]['cfg']
-        model = builders.get_diffusion_model(cfg)
-        model_dict = pkg[i]['model_state']
-        model.load_state_dict(model_dict)
-        model.to(device)
-        processor = builders.get_processor(cfg=cfg.processor, sample_rate=sample_rate)
-        processor_dict = pkg[i]['processor_state']
-        processor.load_state_dict(processor_dict)
-        processor.to(device)
-        models.append(model)
-        processors.append(processor)
-        cfgs.append(cfg)
-    return models, processors, cfgs

audiocraft/lstm.py DELETED

@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from torch import nn
-class StreamableLSTM(nn.Module):
-    """LSTM without worrying about the hidden state, nor the layout of the data.
-    Expects input as convolutional layout.
-    """
-    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
-        super().__init__()
-        self.skip = skip
-        self.lstm = nn.LSTM(dimension, dimension, num_layers)
-    def forward(self, x):
-        x = x.permute(2, 0, 1)
-        y, _ = self.lstm(x)
-        if self.skip:
-            y = y + x
-        y = y.permute(1, 2, 0)
-        return y

audiocraft/seanet.py CHANGED

@@ -5,12 +5,30 @@
 # LICENSE file in the root directory of this source tree.
 import typing as tp
 import numpy as np
 import torch.nn as nn
 from .conv import StreamableConv1d, StreamableConvTranspose1d
-from .lstm import StreamableLSTM
 class SEANetResnetBlock(nn.Module):

 # LICENSE file in the root directory of this source tree.
 import typing as tp
 import numpy as np
 import torch.nn as nn
 from .conv import StreamableConv1d, StreamableConvTranspose1d
+class StreamableLSTM(nn.Module):
+    """LSTM without worrying about the hidden state, nor the layout of the data.
+    Expects input as convolutional layout.
+    """
+    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
+        super().__init__()
+        self.skip = skip
+        self.lstm = nn.LSTM(dimension, dimension, num_layers)
+    def forward(self, x):
+        print('LSTM called 1c')
+        x = x.permute(2, 0, 1)
+        y, _ = self.lstm(x)
+        if self.skip:
+            y = y + x
+        y = y.permute(1, 2, 0)
+        return y
 class SEANetResnetBlock(nn.Module):

demo.py CHANGED

@@ -1,64 +1,10 @@
 import audiofile
 import numpy as np
-import torch
-from audiocraft.loaders import load_compression_model, load_lm_model
-from audiocraft.conditioners import ConditioningAttributes
-class AudioGen():
-    def __init__(self,
-                 compression_model=None,
-                 lm=None,
-                 duration=.74):
-        self.compression_model = compression_model
-        self.lm = lm
-        self.duration = duration
-    @property
-    def frame_rate(self):
-        return self.compression_model.frame_rate
-    def generate(self,
-                 descriptions):
-        with torch.no_grad():
-            attributes = [
-                ConditioningAttributes(text={'description': d}) for d in descriptions]
-            gen_tokens = self.lm.generate(
-                conditions=attributes,
-                max_gen_len=int(self.duration * self.frame_rate)) #[n_draw, 4, 37]
-            x = self.compression_model.decode(gen_tokens, None)   #[n_draw, 1, 11840]
-            n_draw, _, n_time_samples = x.shape
-            x = x.reshape(1, n_draw * n_time_samples)  # linearise n_draw
-        return x
-device = 'cuda:0'
-  # https://huggingface.co/facebook/audiogen-medium
-sound_generator = AudioGen(
-    compression_model=load_compression_model('facebook/audiogen-medium', device=device).eval(),
-    lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float).eval(),
-    duration=.74)
-print('\n\n\n\n___________________')
 txt = 'dogs barging in the street'
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()
 x /= np.abs(x).max() + 1e-7

 import audiofile
 import numpy as np
+from audiocraft import AudioGen
 txt = 'dogs barging in the street'
+sound_generator = AudioGen(duration=.04,
+                           device='cuda:0').to('cuda:0').eval()
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()
 x /= np.abs(x).max() + 1e-7