Spaces:
Build error
Build error
| import os | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import numpy as np | |
| from coqpit import Coqpit | |
| from encodec import EncodecModel | |
| from transformers import BertTokenizer | |
| from TTS.tts.layers.bark.inference_funcs import ( | |
| codec_decode, | |
| generate_coarse, | |
| generate_fine, | |
| generate_text_semantic, | |
| generate_voice, | |
| load_voice, | |
| ) | |
| from TTS.tts.layers.bark.load_model import load_model | |
| from TTS.tts.layers.bark.model import GPT | |
| from TTS.tts.layers.bark.model_fine import FineGPT | |
| from TTS.tts.models.base_tts import BaseTTS | |
| class BarkAudioConfig(Coqpit): | |
| sample_rate: int = 24000 | |
| output_sample_rate: int = 24000 | |
| class Bark(BaseTTS): | |
| def __init__( | |
| self, | |
| config: Coqpit, | |
| tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased"), | |
| ) -> None: | |
| super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None) | |
| self.config.num_chars = len(tokenizer) | |
| self.tokenizer = tokenizer | |
| self.semantic_model = GPT(config.semantic_config) | |
| self.coarse_model = GPT(config.coarse_config) | |
| self.fine_model = FineGPT(config.fine_config) | |
| self.encodec = EncodecModel.encodec_model_24khz() | |
| self.encodec.set_target_bandwidth(6.0) | |
| def device(self): | |
| return next(self.parameters()).device | |
| def load_bark_models(self): | |
| self.semantic_model, self.config = load_model( | |
| ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" | |
| ) | |
| self.coarse_model, self.config = load_model( | |
| ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"], | |
| device=self.device, | |
| config=self.config, | |
| model_type="coarse", | |
| ) | |
| self.fine_model, self.config = load_model( | |
| ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine" | |
| ) | |
| def train_step( | |
| self, | |
| ): | |
| pass | |
| def text_to_semantic( | |
| self, | |
| text: str, | |
| history_prompt: Optional[str] = None, | |
| temp: float = 0.7, | |
| base=None, | |
| allow_early_stop=True, | |
| **kwargs, | |
| ): | |
| """Generate semantic array from text. | |
| Args: | |
| text: text to be turned into audio | |
| history_prompt: history choice for audio cloning | |
| temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| Returns: | |
| numpy semantic array to be fed into `semantic_to_waveform` | |
| """ | |
| x_semantic = generate_text_semantic( | |
| text, | |
| self, | |
| history_prompt=history_prompt, | |
| temp=temp, | |
| base=base, | |
| allow_early_stop=allow_early_stop, | |
| **kwargs, | |
| ) | |
| return x_semantic | |
| def semantic_to_waveform( | |
| self, | |
| semantic_tokens: np.ndarray, | |
| history_prompt: Optional[str] = None, | |
| temp: float = 0.7, | |
| base=None, | |
| ): | |
| """Generate audio array from semantic input. | |
| Args: | |
| semantic_tokens: semantic token output from `text_to_semantic` | |
| history_prompt: history choice for audio cloning | |
| temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| Returns: | |
| numpy audio array at sample frequency 24khz | |
| """ | |
| x_coarse_gen = generate_coarse( | |
| semantic_tokens, | |
| self, | |
| history_prompt=history_prompt, | |
| temp=temp, | |
| base=base, | |
| ) | |
| x_fine_gen = generate_fine( | |
| x_coarse_gen, | |
| self, | |
| history_prompt=history_prompt, | |
| temp=0.5, | |
| base=base, | |
| ) | |
| audio_arr = codec_decode(x_fine_gen, self) | |
| return audio_arr, x_coarse_gen, x_fine_gen | |
| def generate_audio( | |
| self, | |
| text: str, | |
| history_prompt: Optional[str] = None, | |
| text_temp: float = 0.7, | |
| waveform_temp: float = 0.7, | |
| base=None, | |
| allow_early_stop=True, | |
| **kwargs, | |
| ): | |
| """Generate audio array from input text. | |
| Args: | |
| text: text to be turned into audio | |
| history_prompt: history choice for audio cloning | |
| text_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative) | |
| Returns: | |
| numpy audio array at sample frequency 24khz | |
| """ | |
| x_semantic = self.text_to_semantic( | |
| text, | |
| history_prompt=history_prompt, | |
| temp=text_temp, | |
| base=base, | |
| allow_early_stop=allow_early_stop, | |
| **kwargs, | |
| ) | |
| audio_arr, c, f = self.semantic_to_waveform( | |
| x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base | |
| ) | |
| return audio_arr, [x_semantic, c, f] | |
| def generate_voice(self, audio, speaker_id, voice_dir): | |
| """Generate a voice from the given audio and text. | |
| Args: | |
| audio (str): Path to the audio file. | |
| speaker_id (str): Speaker name. | |
| voice_dir (str): Path to the directory to save the generate voice. | |
| """ | |
| if voice_dir is not None: | |
| voice_dirs = [voice_dir] | |
| try: | |
| _ = load_voice(speaker_id, voice_dirs) | |
| except (KeyError, FileNotFoundError): | |
| output_path = os.path.join(voice_dir, speaker_id + ".npz") | |
| os.makedirs(voice_dir, exist_ok=True) | |
| generate_voice(audio, self, output_path) | |
| def _set_voice_dirs(self, voice_dirs): | |
| def_voice_dir = None | |
| if isinstance(self.config.DEF_SPEAKER_DIR, str): | |
| os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True) | |
| if os.path.isdir(self.config.DEF_SPEAKER_DIR): | |
| def_voice_dir = self.config.DEF_SPEAKER_DIR | |
| _voice_dirs = [def_voice_dir] if def_voice_dir is not None else [] | |
| if voice_dirs is not None: | |
| if isinstance(voice_dirs, str): | |
| voice_dirs = [voice_dirs] | |
| _voice_dirs = voice_dirs + _voice_dirs | |
| return _voice_dirs | |
| # TODO: remove config from synthesize | |
| def synthesize( | |
| self, text, config, speaker_id="random", voice_dirs=None, **kwargs | |
| ): # pylint: disable=unused-argument | |
| """Synthesize speech with the given input text. | |
| Args: | |
| text (str): Input text. | |
| config (BarkConfig): Config with inference parameters. | |
| speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker. | |
| speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in | |
| `voice_dirs` with the name `speaker_id`. Defaults to None. | |
| voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None. | |
| **kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic(). | |
| Returns: | |
| A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference, | |
| `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents` | |
| as latents used at inference. | |
| """ | |
| speaker_id = "random" if speaker_id is None else speaker_id | |
| voice_dirs = self._set_voice_dirs(voice_dirs) | |
| history_prompt = load_voice(self, speaker_id, voice_dirs) | |
| outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs) | |
| return_dict = { | |
| "wav": outputs[0], | |
| "text_inputs": text, | |
| } | |
| return return_dict | |
| def eval_step(self): | |
| ... | |
| def forward(self): | |
| ... | |
| def inference(self): | |
| ... | |
| def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument | |
| return Bark(config) | |
| # pylint: disable=unused-argument, redefined-builtin | |
| def load_checkpoint( | |
| self, | |
| config, | |
| checkpoint_dir, | |
| text_model_path=None, | |
| coarse_model_path=None, | |
| fine_model_path=None, | |
| hubert_model_path=None, | |
| hubert_tokenizer_path=None, | |
| eval=False, | |
| strict=True, | |
| **kwargs, | |
| ): | |
| """Load a model checkpoints from a directory. This model is with multiple checkpoint files and it | |
| expects to have all the files to be under the given `checkpoint_dir` with the rigth names. | |
| If eval is True, set the model to eval mode. | |
| Args: | |
| config (TortoiseConfig): The model config. | |
| checkpoint_dir (str): The directory where the checkpoints are stored. | |
| ar_checkpoint_path (str, optional): The path to the autoregressive checkpoint. Defaults to None. | |
| diff_checkpoint_path (str, optional): The path to the diffusion checkpoint. Defaults to None. | |
| clvp_checkpoint_path (str, optional): The path to the CLVP checkpoint. Defaults to None. | |
| vocoder_checkpoint_path (str, optional): The path to the vocoder checkpoint. Defaults to None. | |
| eval (bool, optional): Whether to set the model to eval mode. Defaults to False. | |
| strict (bool, optional): Whether to load the model strictly. Defaults to True. | |
| """ | |
| text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt") | |
| coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt") | |
| fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") | |
| hubert_model_path = hubert_model_path or os.path.join(checkpoint_dir, "hubert.pt") | |
| hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") | |
| self.config.LOCAL_MODEL_PATHS["text"] = text_model_path | |
| self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path | |
| self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path | |
| self.config.LOCAL_MODEL_PATHS["hubert"] = hubert_model_path | |
| self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path | |
| self.load_bark_models() | |
| if eval: | |
| self.eval() | |