# read.py — long-form text-to-speech script (tortoise-tts).
import argparse
import os
from time import time
import torch
import torchaudio
from api import TextToSpeech, MODELS_DIR
from utils.audio import load_audio, load_voices
from utils.text import split_and_recombine_text
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--textfile",
type=str,
help="A file containing the text to read.",
default="tortoise/data/riding_hood.txt",
)
parser.add_argument(
"--voice",
type=str,
help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) "
"Use the & character to join two voices together. Use a comma to perform inference on multiple voices.",
default="pat",
)
parser.add_argument(
"--output_path",
type=str,
help="Where to store outputs.",
default="results/longform/",
)
parser.add_argument(
"--preset", type=str, help="Which voice preset to use.", default="standard"
)
parser.add_argument(
"--regenerate",
type=str,
help="Comma-separated list of clip numbers to re-generate, or nothing.",
default=None,
)
parser.add_argument(
"--candidates",
type=int,
help="How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.",
default=1,
)
parser.add_argument(
"--model_dir",
type=str,
help="Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this"
"should only be specified if you have custom checkpoints.",
default=MODELS_DIR,
)
parser.add_argument(
"--seed",
type=int,
help="Random seed which can be used to reproduce results.",
default=None,
)
parser.add_argument(
"--produce_debug_state",
type=bool,
help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
default=True,
)
args = parser.parse_args()
tts = TextToSpeech(models_dir=args.model_dir)
outpath = args.output_path
selected_voices = args.voice.split(",")
regenerate = args.regenerate
if regenerate is not None:
regenerate = [int(e) for e in regenerate.split(",")]
# Process text
with open(args.textfile, "r", encoding="utf-8") as f:
text = " ".join([l for l in f.readlines()])
if "|" in text:
print(
"Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not"
"your intent, please remove all '|' characters from the input."
)
texts = text.split("|")
else:
texts = split_and_recombine_text(text)
seed = int(time()) if args.seed is None else args.seed
for selected_voice in selected_voices:
voice_outpath = os.path.join(outpath, selected_voice)
os.makedirs(voice_outpath, exist_ok=True)
if "&" in selected_voice:
voice_sel = selected_voice.split("&")
else:
voice_sel = [selected_voice]
voice_samples, conditioning_latents = load_voices(voice_sel)
all_parts = []
for j, text in enumerate(texts):
if regenerate is not None and j not in regenerate:
all_parts.append(
load_audio(os.path.join(voice_outpath, f"{j}.wav"), 24000)
)
continue
gen = tts.tts_with_preset(
text,
voice_samples=voice_samples,
conditioning_latents=conditioning_latents,
preset=args.preset,
k=args.candidates,
use_deterministic_seed=seed,
)
if args.candidates == 1:
gen = gen.squeeze(0).cpu()
torchaudio.save(os.path.join(voice_outpath, f"{j}.wav"), gen, 24000)
else:
candidate_dir = os.path.join(voice_outpath, str(j))
os.makedirs(candidate_dir, exist_ok=True)
for k, g in enumerate(gen):
torchaudio.save(
os.path.join(candidate_dir, f"{k}.wav"),
g.squeeze(0).cpu(),
24000,
)
gen = gen[0].squeeze(0).cpu()
all_parts.append(gen)
if args.candidates == 1:
full_audio = torch.cat(all_parts, dim=-1)
torchaudio.save(
os.path.join(voice_outpath, "combined.wav"), full_audio, 24000
)
if args.produce_debug_state:
os.makedirs("debug_states", exist_ok=True)
dbg_state = (seed, texts, voice_samples, conditioning_latents)
torch.save(dbg_state, f"debug_states/read_debug_{selected_voice}.pth")
# Combine each candidate's audio clips.
if args.candidates > 1:
audio_clips = []
for candidate in range(args.candidates):
for line in range(len(texts)):
wav_file = os.path.join(
voice_outpath, str(line), f"{candidate}.wav"
)
audio_clips.append(load_audio(wav_file, 24000))
audio_clips = torch.cat(audio_clips, dim=-1)
torchaudio.save(
os.path.join(voice_outpath, f"combined_{candidate:02d}.wav"),
audio_clips,
24000,
)
audio_clips = []