# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union

from ..utils import is_torch_available
from .base import Pipeline


if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
    from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan

DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"


class TextToAudioPipeline(Pipeline):
    """
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio waveform from an input text and optional other conditional inputs.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    """

    def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
        super().__init__(*args, **kwargs)

        if self.framework == "tf":
            raise ValueError("The TextToAudioPipeline is only available in PyTorch.")

        # spectrogram models (e.g. SpeechT5) need a vocoder to turn the generated
        # spectrogram into a waveform; default to the SpeechT5 HiFi-GAN checkpoint
        self.vocoder = None
        if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
            self.vocoder = (
                SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
                if vocoder is None
                else vocoder
            )

        self.sampling_rate = sampling_rate
        if self.vocoder is not None:
            self.sampling_rate = self.vocoder.config.sampling_rate

        if self.sampling_rate is None:
            # get the sampling rate from the model config and the generation config,
            # with the generation config taking priority
            config = self.model.config
            gen_config = self.model.__dict__.get("generation_config", None)
            if gen_config is not None:
                config.update(gen_config.to_dict())

            for sampling_rate_name in ["sample_rate", "sampling_rate"]:
                sampling_rate = getattr(config, sampling_rate_name, None)
                if sampling_rate is not None:
                    self.sampling_rate = sampling_rate

    def preprocess(self, text, **kwargs):
        if isinstance(text, str):
            text = [text]

        if self.model.config.model_type == "bark":
            # the Bark tokenizer is called with the kwargs that BarkProcessor would use
            new_kwargs = {
                "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
                "add_special_tokens": False,
                "return_attention_mask": True,
                "return_token_type_ids": False,
                "padding": "max_length",
            }

            # priority is given to kwargs
            new_kwargs.update(kwargs)

            kwargs = new_kwargs

        output = self.tokenizer(text, **kwargs, return_tensors="pt")

        return output
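    # A minimal sketch of what the Bark branch of `preprocess` produces, assuming
    # the "suno/bark-small" checkpoint from the class docstring (the 256 comes from
    # that checkpoint's default `max_input_semantic_length`; others may differ):
    #
    #     pipe = pipeline("text-to-speech", model="suno/bark-small")
    #     enc = pipe.preprocess("Hello!")
    #     enc["input_ids"].shape       # torch.Size([1, 256]), padded to max_length
    #     enc["attention_mask"].shape  # torch.Size([1, 256]), 1 for tokens, 0 for padding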
    def _forward(self, model_inputs, **kwargs):
        # we expect some kwargs to be additional tensors which need to be on the right device
        kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)

        if self.model.can_generate():
            output = self.model.generate(**model_inputs, **kwargs)
        else:
            output = self.model(**model_inputs, **kwargs)[0]

        if self.vocoder is not None:
            # in that case, the output is a spectrogram that needs to be converted into a waveform
            output = self.vocoder(output)

        return output

    def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
        """
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `List[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        """
        return super().__call__(text_inputs, **forward_params)

    def _sanitize_parameters(
        self,
        preprocess_params=None,
        forward_params=None,
    ):
        if preprocess_params is None:
            preprocess_params = {}
        if forward_params is None:
            forward_params = {}
        # this pipeline takes no postprocess parameters
        postprocess_params = {}

        return preprocess_params, forward_params, postprocess_params

    def postprocess(self, waveform):
        # move the waveform to CPU and convert it to a float32 numpy array for downstream consumers
        output_dict = {}

        output_dict["audio"] = waveform.cpu().float().numpy()
        output_dict["sampling_rate"] = self.sampling_rate

        return output_dict
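
# A minimal end-to-end usage sketch (to be run from a separate script, since this
# module uses relative imports). It assumes the "suno/bark-small" checkpoint from
# the class docstring and that scipy is installed to write the waveform to disk:
#
#     import scipy.io.wavfile
#     from transformers import pipeline
#
#     pipe = pipeline("text-to-speech", model="suno/bark-small")
#     output = pipe("Hello from the text-to-audio pipeline!")
#
#     # `audio` has shape (nb_channels, audio_length); scipy expects
#     # (audio_length, nb_channels), hence the transpose
#     scipy.io.wavfile.write("speech.wav", rate=output["sampling_rate"], data=output["audio"].T)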