import os
import sys
from random import randint
from typing import List, Optional, Set, Union

from tortoise.utils.audio import get_voices, load_audio, load_voices
from tortoise.utils.text import split_and_recombine_text


def get_all_voices(extra_voice_dirs_str: str = ""):
    extra_voice_dirs = extra_voice_dirs_str.split(",") if extra_voice_dirs_str else []
    return sorted(get_voices(extra_voice_dirs)), extra_voice_dirs


def parse_voice_str(voice_str: str, all_voices: List[str]):
    selected_voices = all_voices if voice_str == "all" else voice_str.split(",")
    # "&" combines several voices into one conditioning set, e.g. "tom&emma"
    selected_voices = [v.split("&") if "&" in v else [v] for v in selected_voices]
    for voices in selected_voices:
        for v in voices:
            if v != "random" and v not in all_voices:
                raise ValueError(
                    f"voice {v} not available, use --list-voices to see available voices."
                )
    return selected_voices


def voice_loader(selected_voices: list, extra_voice_dirs: List[str]):
    for voices in selected_voices:
        yield voices, *load_voices(voices, extra_voice_dirs)


def parse_multiarg_text(text: List[str]):
    # join CLI arguments into one string, or fall back to reading stdin
    return (" ".join(text) if text else "".join(line for line in sys.stdin)).strip()


def split_text(text: str, text_split: str):
    if text_split:
        desired_length, max_length = map(int, text_split.split(","))
        if desired_length > max_length:
            raise ValueError(
                f"--text-split: desired_length ({desired_length}) must be <= max_length ({max_length})"
            )
        texts = split_and_recombine_text(text, desired_length, max_length)
    else:
        texts = split_and_recombine_text(text)
    #
    if not texts:
        raise ValueError("no text provided")
    return texts


def validate_output_dir(output_dir: str, selected_voices: list, candidates: int):
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    else:
        if len(selected_voices) > 1:
            raise ValueError("cannot have multiple voices without --output-dir")
        if candidates > 1:
            raise ValueError("cannot have multiple candidates without --output-dir")
    return output_dir


def check_pydub(play: bool):
    if play:
        try:
            import pydub
            import pydub.playback

            return pydub
        except ImportError:
            raise RuntimeError(
                '--play requires pydub to be installed, which can be done with "pip install pydub"'
            )


def get_seed(seed: Optional[int]):
    return randint(0, 2**32 - 1) if seed is None else seed


from pathlib import Path
from typing import Any, Callable

import torch
import torchaudio


def run_and_save_tts(
    call_tts,
    text,
    output_dir: Path,
    return_deterministic_state,
    return_filepaths=False,
    voicefixer=True,
):
    output_dir.mkdir(exist_ok=True)
    if return_deterministic_state:
        gen, dbg = call_tts(text)
        torch.save(dbg, output_dir / "dbg.pt")
    else:
        gen = call_tts(text)
    #
    if not isinstance(gen, list):
        gen = [gen]
    gen = [g.squeeze(0).cpu() for g in gen]
    fps = []
    # write one file per candidate: 0.wav, 1.wav, ...
    for i, g in enumerate(gen):
        fps.append(output_dir / f"{i}.wav")
        save_gen_with_voicefix(g, fps[-1], squeeze=False, voicefixer=voicefixer)
        # torchaudio.save(output_dir/f'{i}.wav', g, 24000)
    return fps if return_filepaths else gen


def infer_on_texts(
    call_tts: Callable[[str], Any],
    texts: List[str],
    output_dir: Union[str, Path],
    return_deterministic_state: bool,
    lines_to_regen: Set[int],
    logger=print,
    return_filepaths=False,
    voicefixer=True,
):
    audio_chunks = []
    base_p = Path(output_dir)
    base_p.mkdir(exist_ok=True)

    for text_idx, text in enumerate(texts):
        line_p = base_p / f"{text_idx}"
        line_p.mkdir(exist_ok=True)
        #
        if text_idx not in lines_to_regen:
            # reuse cached fragments for this line if they already exist
            files = list(line_p.glob("*.wav"))
            if files:
                logger(f"loading existing audio fragments for [{text_idx}]")
                audio_chunks.append([load_audio(str(f), 24000) for f in files])
                continue
            else:
                logger(f"no existing audio fragment for [{text_idx}]")
        #
        logger(f"generating audio for text {text_idx}: {text}")
        audio_chunks.append(
            run_and_save_tts(
                call_tts,
                text,
                line_p,
                return_deterministic_state,
                voicefixer=voicefixer,
            )
        )

    fnames = []
    results = []
    for i in range(len(audio_chunks[0])):
        # concatenate the i-th candidate of every line into one waveform
        resultant = torch.cat([c[i] for c in audio_chunks], dim=-1)
        fnames.append(base_p / f"combined-{i}.wav")
        save_gen_with_voicefix(
            resultant, fnames[-1], squeeze=False, voicefixer=False
        )  # do not run fix on combined!!
        results.append(resultant)
        # torchaudio.save(base_p/'combined.wav', resultant, 24000)
    return fnames if return_filepaths else results


from voicefixer import VoiceFixer

vfixer = VoiceFixer()


def save_gen_with_voicefix(g, fpath, squeeze=True, voicefixer=True):
    # TorToiSe outputs 24 kHz audio
    torchaudio.save(fpath, g.squeeze(0).cpu() if squeeze else g, 24000, format="wav")
    if voicefixer:
        vfixer.restore(
            input=fpath,
            output=fpath,
            cuda=True,
            mode=0,
            # your_vocoder_func = convert_mel_to_wav
            # TODO test if integration with UnivNet improves things
        )