from typing import Dict, Optional, Tuple, Union from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel from transformers.models.bark.generation_configuration_bark import ( BarkCoarseGenerationConfig, BarkFineGenerationConfig, BarkSemanticGenerationConfig, ) from transformers import BarkConfig, AutoModel from transformers.modeling_utils import get_parameter_device from transformers.utils import ( is_accelerate_available, ) import torch class BarkModel(BarkPreTrainedModel): config_class = BarkConfig def __init__(self, config): super().__init__(config) self.semantic = BarkSemanticModel(config.semantic_config) self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config) self.fine_acoustics = BarkFineModel(config.fine_acoustics_config) self.codec_model = AutoModel.from_config(config.codec_config) self.config = config @property def device(self) -> torch.device: """ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same device). """ # for bark_model, device must be verified on its sub-models # if has _hf_hook, has been offloaded so the device has to be found in the hook if not hasattr(self.semantic, "_hf_hook"): return get_parameter_device(self) for module in self.semantic.modules(): if ( hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device") and module._hf_hook.execution_device is not None ): return torch.device(module._hf_hook.execution_device) def enable_cpu_offload(self, gpu_id: Optional[int] = 0): r""" Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until the next sub-model runs. Args: gpu_id (`int`, *optional*, defaults to 0): GPU id on which the sub-models will be loaded and offloaded. """ if is_accelerate_available(): from accelerate import cpu_offload_with_hook else: raise ImportError("`enable_model_cpu_offload` requires `accelerate`.") device = torch.device(f"cuda:{gpu_id}") if self.device.type != "cpu": self.to("cpu") torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) # this layer is used outside the first foward pass of semantic so need to be loaded before semantic self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device) hook = None for cpu_offloaded_model in [ self.semantic, self.coarse_acoustics, self.fine_acoustics, ]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) self.fine_acoustics_hook = hook _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook) # We'll offload the last model manually. self.codec_model_hook = hook def codec_decode(self, fine_output): """Turn quantized audio codes into audio array using encodec.""" fine_output = fine_output.transpose(0, 1) emb = self.codec_model.quantizer.decode(fine_output) out = self.codec_model.decoder(emb) audio_arr = out.squeeze(1) # squeeze the codebook dimension return audio_arr @torch.no_grad() def generate( self, input_ids: Optional[torch.Tensor] = None, history_prompt: Optional[Dict[str, torch.Tensor]] = None, **kwargs, ) -> torch.LongTensor: """ Generates audio from an input prompt and an additional optional `Bark` speaker prompt. Args: input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*): Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the longest generation among the batch. history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*): Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch. kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types: - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model. - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the semantic, coarse and fine respectively. It has the priority over the keywords without a prefix. This means you can, for example, specify a generation strategy for all sub-models except one. Returns: torch.LongTensor: Output generated audio. Example: ```python >>> from transformers import AutoProcessor, BarkModel >>> processor = AutoProcessor.from_pretrained("suno/bark-small") >>> model = BarkModel.from_pretrained("suno/bark-small") >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)` >>> voice_preset = "v2/en_speaker_6" >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset) >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100) >>> audio_array = audio_array.cpu().numpy().squeeze() ``` """ # TODO (joao):workaround until nested generation config is compatible with PreTrained Model # todo: dict semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config) coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config) fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config) kwargs_semantic = { # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel "attention_mask": kwargs.pop("attention_mask", None) } kwargs_coarse = {} kwargs_fine = {} for key, value in kwargs.items(): if key.startswith("semantic_"): key = key[len("semantic_") :] kwargs_semantic[key] = value elif key.startswith("coarse_"): key = key[len("coarse_") :] kwargs_coarse[key] = value elif key.startswith("fine_"): key = key[len("fine_") :] kwargs_fine[key] = value else: # If the key is already in a specific config, then it's been set with a # submodules specific value and we don't override if key not in kwargs_semantic: kwargs_semantic[key] = value if key not in kwargs_coarse: kwargs_coarse[key] = value if key not in kwargs_fine: kwargs_fine[key] = value # 1. Generate from the semantic model semantic_output = self.semantic.generate( input_ids, history_prompt=history_prompt, semantic_generation_config=semantic_generation_config, **kwargs_semantic, ) # 2. Generate from the coarse model coarse_output = self.coarse_acoustics.generate( semantic_output, history_prompt=history_prompt, semantic_generation_config=semantic_generation_config, coarse_generation_config=coarse_generation_config, codebook_size=self.generation_config.codebook_size, **kwargs_coarse, ) # 3. "generate" from the fine model output = self.fine_acoustics.generate( coarse_output, history_prompt=history_prompt, semantic_generation_config=semantic_generation_config, coarse_generation_config=coarse_generation_config, fine_generation_config=fine_generation_config, codebook_size=self.generation_config.codebook_size, **kwargs_fine, ) if getattr(self, "fine_acoustics_hook", None) is not None: # Manually offload fine_acoustics to CPU # and load codec_model to GPU # since bark doesn't use codec_model forward pass self.fine_acoustics_hook.offload() self.codec_model = self.codec_model.to(self.device) return output