#!/usr/bin/env python3
# AGPL: a notification must be added stating that changes have been made to that file.
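"""Command-line interface for TorToiSe: synthesizes speech in multiple voices
with realistic prosody and intonation."""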
import os
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, Optional

import torch
import torchaudio
from simple_parsing import ArgumentParser, field

from tortoise.api import MODELS_DIR, TextToSpeech
from tortoise.models.vocoder import VocConf
from tortoise.utils.audio import load_audio
from tortoise.utils.diffusion import SAMPLERS
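
# CLI options are declared as simple_parsing dataclasses, one group per --help section.
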
@dataclass
class General:
    """General options"""

    text: str = field(positional=True, nargs="*", metavar="text")
    """Text to speak. If omitted, text is read from stdin."""

    voice: str = field(default="random", alias=["-v"])
    """Selects the voice to use for generation. Use the & character to join two voices together.
    Use a comma to perform inference on multiple voices. Set to "all" to use all available voices.
    Note that multiple voices require the --output-dir option to be set."""

    voices_dir: Optional[str] = field(default=None, alias=["-V"])
    """Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories."""

    preset: Literal["ultra_fast", "fast", "standard", "high_quality"] = field(
        default="fast", alias=["-p"]
    )
    """Which voice quality preset to use."""

    quiet: bool = field(default=False, alias=["-q"])
    """Suppress all output."""

    voicefixer: bool = field(default=True)
    """Enable/disable voicefixer."""


@dataclass
class Output:
    """Output options"""

    list_voices: bool = field(default=False, alias=["-l"])
    """List available voices and exit."""

    play: bool = field(default=False, alias=["-P"])
    """Play the audio (requires pydub)."""

    output: Optional[Path] = field(default=None, alias=["-o"])
    """Save the audio to a file."""

    output_dir: Path = field(default=Path("results/"), alias=["-O"])
    """Save the audio to a directory as individual segments."""


@dataclass
class MultiOutput:
    """Multi-output options"""

    candidates: int = 1
    """How many output candidates to produce per voice. Note that only the first candidate is used in the combined output."""

    regenerate: Optional[str] = None
    """Comma-separated list of clip numbers to re-generate."""

    skip_existing: bool = False
    """Set to skip re-generating existing clips."""


@dataclass
class Advanced:
    """Advanced options"""

    produce_debug_state: bool = False
    """Whether or not to produce debug_states in the current directory, which can aid in reproducing problems."""

    seed: Optional[int] = None
    """Random seed which can be used to reproduce results."""

    models_dir: str = MODELS_DIR
    """Where to find pretrained model checkpoints. Tortoise automatically downloads these to
    ~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints."""

    text_split: Optional[str] = None
    """How big chunks to split the text into, in the format <desired_length>,<max_length>."""

    disable_redaction: bool = False
    """Normally, text enclosed in brackets is automatically redacted from the spoken output
    (but is still rendered by the model); this can be used for prompt engineering.
    Set this flag to disable that behavior."""

    device: Optional[str] = None
    """Device to use for inference."""

    batch_size: Optional[int] = None
    """Batch size to use for inference. If omitted, the batch size is set based on available GPU memory."""

    vocoder: Literal["Univnet", "BigVGAN", "BigVGAN_Base"] = "BigVGAN_Base"
    """Pretrained vocoder to be used.
    Univnet - tortoise original
    BigVGAN - 112M model
    BigVGAN_Base - 14M model
    """

    ar_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the autoregressive model. If omitted, the default checkpoint is used."""

    clvp_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the CLVP model. If omitted, the default checkpoint is used."""

    diff_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the diffusion model. If omitted, the default checkpoint is used."""


@dataclass
class Tuning:
    """Tuning options (overrides preset settings)"""

    num_autoregressive_samples: Optional[int] = None
    """Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
    As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great"."""

    temperature: Optional[float] = None
    """The softmax temperature of the autoregressive model."""

    length_penalty: Optional[float] = None
    """A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs."""

    repetition_penalty: Optional[float] = None
    """A penalty that prevents the autoregressive decoder from repeating itself during decoding.
    Can be used to reduce the incidence of long silences or "uhhhhhhs", etc."""

    top_p: Optional[float] = None
    """P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs."""

    max_mel_tokens: Optional[int] = None
    """Restricts the output length. 1 to 600. Each unit is 1/20 of a second."""

    cvvp_amount: Optional[float] = None
    """How much the CVVP model should influence the output.
    Increasing this can in some cases reduce the likelihood of multiple speakers."""

    diffusion_iterations: Optional[int] = None
    """Number of diffusion steps to perform. More steps means the network has more chances to iteratively
    refine the output, which should theoretically mean a higher quality output.
    Generally a value above 250 is not noticeably better, however."""

    cond_free: Optional[bool] = None
    """Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
    each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
    of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
    dramatically improves realism."""

    cond_free_k: Optional[float] = None
    """Knob that determines how to balance the conditioning-free signal with the conditioning-present signal. [0, inf].
    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
    Formula is: output = cond_present_output * (cond_free_k + 1) - cond_absent_output * cond_free_k"""

    diffusion_temperature: Optional[float] = None
    """Controls the variance of the noise fed into the diffusion model. [0, 1]. Values at 0
    are the "mean" prediction of the diffusion network and will sound bland and smeared."""


@dataclass
class Speed:
    """New/speed options"""

    low_vram: bool = False
    """Re-enable the default offloading behaviour of tortoise."""

    half: bool = False
    """Enable autocast to half precision for the autoregressive model."""

    no_cache: bool = False
    """Disable kv_cache usage. This should really only be used if you are very low on VRAM."""

    sampler: Optional[str] = field(default=None, choices=SAMPLERS)
    """Override the sampler used for diffusion (default depends on --preset)."""

    original_tortoise: bool = False
    """Ensure results are identical to the original tortoise-tts repo."""


if __name__ == "__main__":
    parser = ArgumentParser(
        description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
        "in multiple voices with realistic prosody and intonation."
    )
    # bugs out for some reason
    # parser.add_argument(
    #     "--web",
    #     action="store_true",
    #     help="launch the webui (doesn't pass it the other arguments)",
    # )
    parser.add_arguments(General, "general")
    parser.add_arguments(Output, "output")
    parser.add_arguments(MultiOutput, "multi_output")
    parser.add_arguments(Advanced, "advanced")
    parser.add_arguments(Tuning, "tuning")
    parser.add_arguments(Speed, "speed")

    usage_examples = f"""
Examples:
    Read text using a random voice and write it to a file:
        {parser.prog} -o hello.wav "Hello, how are you?"

    Read text from stdin and play it using the tom voice:
        echo "Say it like you mean it!" | {parser.prog} -P -v tom

    Read a text file using multiple voices and save the audio clips to a directory:
        {parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
"""
    # argparse exits via SystemExit on --help (code 0); append the usage examples before exiting.
    try:
        args = parser.parse_args()
    except SystemExit as e:
        if e.code == 0:
            print(usage_examples)
        sys.exit(e.code)

    # bugs out for some reason
    # if args.web:
    #     from importlib import import_module
    #     app = import_module("app")
    #     sys.exit(app.main())

    from tortoise.inference import (
        check_pydub,
        get_all_voices,
        get_seed,
        parse_multiarg_text,
        parse_voice_str,
        split_text,
        validate_output_dir,
        voice_loader,
        save_gen_with_voicefix,
    )

    # get voices
    all_voices, extra_voice_dirs = get_all_voices(args.general.voices_dir)
    if args.output.list_voices:
        for v in all_voices:
            print(v)
        sys.exit(0)
    selected_voices = parse_voice_str(args.general.voice, all_voices)
    voice_generator = voice_loader(selected_voices, extra_voice_dirs)

    # parse text
    if not args.general.text:
        print("reading text from stdin!")
    text = parse_multiarg_text(args.general.text)
    texts = split_text(text, args.advanced.text_split)

    output_dir = validate_output_dir(
        args.output.output_dir, selected_voices, args.multi_output.candidates
    )

    # error out early if pydub isn't installed
    pydub = check_pydub(args.output.play)

    seed = get_seed(args.advanced.seed)
    verbose = not args.general.quiet
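
    # Resolve the vocoder name to the matching VocConf entry.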
    vocoder = getattr(VocConf, args.advanced.vocoder)

    if verbose:
        print("Loading tts...")
    tts = TextToSpeech(
        models_dir=args.advanced.models_dir,
        enable_redaction=not args.advanced.disable_redaction,
        device=args.advanced.device,
        autoregressive_batch_size=args.advanced.batch_size,
        high_vram=not args.speed.low_vram,
        kv_cache=not args.speed.no_cache,
        ar_checkpoint=args.advanced.ar_checkpoint,
        clvp_checkpoint=args.advanced.clvp_checkpoint,
        diff_checkpoint=args.advanced.diff_checkpoint,
        vocoder=vocoder,
    )
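
    # Base generation settings; preset defaults apply unless explicitly overridden below.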
    gen_settings = {
        "use_deterministic_seed": seed,
        "verbose": verbose,
        "k": args.multi_output.candidates,
        "preset": args.general.preset,
    }
    tuning_options = [
        "num_autoregressive_samples",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "top_p",
        "max_mel_tokens",
        "cvvp_amount",
        "diffusion_iterations",
        "cond_free",
        "cond_free_k",
        "diffusion_temperature",
    ]
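    # Forward only the tuning options the user actually set, so preset defaults apply otherwise.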
    for option in tuning_options:
        if getattr(args.tuning, option) is not None:
            gen_settings[option] = getattr(args.tuning, option)
    speed_options = [
        "sampler",
        "original_tortoise",
        "half",
    ]
    for option in speed_options:
        if getattr(args.speed, option) is not None:
            gen_settings[option] = getattr(args.speed, option)
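
    # Work out the total clip count and which clip indices were marked for regeneration.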
    total_clips = len(texts) * len(selected_voices)
    regenerate_clips = (
        [int(x) for x in args.multi_output.regenerate.split(",")]
        if args.multi_output.regenerate
        else None
    )
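
    # Main loop: render every text chunk for each selected voice.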
    for voice_idx, (voice, voice_samples, conditioning_latents) in enumerate(
        voice_generator
    ):
        audio_parts = []
        for text_idx, text in enumerate(texts):
            clip_name = f'{"-".join(voice)}_{text_idx:02d}'
            if args.output.output_dir:
                first_clip = os.path.join(args.output.output_dir, f"{clip_name}_00.wav")
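                # Reuse the existing first candidate when --skip-existing is set, or when
                # --regenerate is given and this clip is not on the regeneration list.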
                if (
                    args.multi_output.skip_existing
                    or (regenerate_clips and text_idx not in regenerate_clips)
                ) and os.path.exists(first_clip):
                    audio_parts.append(load_audio(first_clip, 24000))
                    if verbose:
                        print(f"Skipping {clip_name}")
                    continue
            if verbose:
                print(
                    f"Rendering {clip_name} ({voice_idx * len(texts) + text_idx + 1} of {total_clips})..."
                )
                print(" " + text)
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                **gen_settings,
            )
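            # tts_with_preset returns a single tensor when k == 1; wrap it so the
            # candidate loop below handles both shapes uniformly.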
            gen = gen if args.multi_output.candidates > 1 else [gen]
            for candidate_idx, audio in enumerate(gen):
                audio = audio.squeeze(0).cpu()
                if candidate_idx == 0:
                    audio_parts.append(audio)
                if args.output.output_dir:
                    filename = f"{clip_name}_{candidate_idx:02d}.wav"
                    save_gen_with_voicefix(
                        audio,
                        os.path.join(args.output.output_dir, filename),
                        squeeze=False,
                        voicefixer=args.general.voicefixer,
                    )
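
        # Stitch the first candidate of every chunk into one continuous clip for this voice.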
        audio = torch.cat(audio_parts, dim=-1)
        if args.output.output_dir:
            filename = f'{"-".join(voice)}_combined.wav'
            save_gen_with_voicefix(
                audio,
                os.path.join(args.output.output_dir, filename),
                squeeze=False,
                voicefixer=args.general.voicefixer,
            )
        elif args.output.output:
            save_gen_with_voicefix(
                audio,
                args.output.output,
                squeeze=False,
                voicefixer=args.general.voicefixer,
            )
        elif args.output.play:
            print("WARNING: cannot use voicefixer with --play")
            f = tempfile.NamedTemporaryFile(suffix=".wav", delete=True)
            torchaudio.save(f.name, audio, 24000)
            pydub.playback.play(pydub.AudioSegment.from_wav(f.name))
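
        # Optionally dump everything needed to reproduce this run for this voice.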
        if args.advanced.produce_debug_state:
            os.makedirs("debug_states", exist_ok=True)
            dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
            torch.save(
                dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
            )