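# Long-form reading script for Tortoise TTS: splits a text file into chunks,
# synthesizes each chunk with the selected voice(s), and stitches the clips
# into a combined recording per voice.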
import argparse
import os
from time import time

import torch
import torchaudio

from api import TextToSpeech, MODELS_DIR
from utils.audio import load_audio, load_voices
from utils.text import split_and_recombine_text


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--textfile",
        type=str,
        help="A file containing the text to read.",
        default="tortoise/data/riding_hood.txt",
    )
    parser.add_argument(
        "--voice",
        type=str,
        help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) "
        "Use the & character to join two voices together. Use a comma to perform inference on multiple voices.",
        default="pat",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Where to store outputs.",
        default="results/longform/",
    )
    parser.add_argument(
        "--preset", type=str, help="Which voice preset to use.", default="standard"
    )
    parser.add_argument(
        "--regenerate",
        type=str,
        help="Comma-separated list of clip numbers to re-generate, or nothing.",
        default=None,
    )
    parser.add_argument(
        "--candidates",
        type=int,
        help="How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.",
        default=1,
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        help="Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this"
        "should only be specified if you have custom checkpoints.",
        default=MODELS_DIR,
    )
    parser.add_argument(
        "--seed",
        type=int,
        help="Random seed which can be used to reproduce results.",
        default=None,
    )
    parser.add_argument(
        "--produce_debug_state",
        # argparse's type=bool treats any non-empty string (including "False")
        # as True, so parse the value explicitly.
        type=lambda s: s.lower() in ("true", "1", "yes"),
        help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
        default=True,
    )

    args = parser.parse_args()
    tts = TextToSpeech(models_dir=args.model_dir)

    outpath = args.output_path
    selected_voices = args.voice.split(",")
    regenerate = args.regenerate
    if regenerate is not None:
        regenerate = [int(e) for e in regenerate.split(",")]

    # Process text
    with open(args.textfile, "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())
    if "|" in text:
        print(
            "Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not "
            "your intent, please remove all '|' characters from the input."
        )
        texts = text.split("|")
    else:
        texts = split_and_recombine_text(text)

    # Fix a single seed up front so every chunk (and any later --regenerate
    # pass) can reproduce the same output.
    seed = int(time()) if args.seed is None else args.seed
    for selected_voice in selected_voices:
        voice_outpath = os.path.join(outpath, selected_voice)
        os.makedirs(voice_outpath, exist_ok=True)

        if "&" in selected_voice:
            voice_sel = selected_voice.split("&")
        else:
            voice_sel = [selected_voice]

        voice_samples, conditioning_latents = load_voices(voice_sel)
        all_parts = []
        for j, text in enumerate(texts):
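            # With --regenerate, reuse the previously rendered clip for every
            # index not listed.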
            if regenerate is not None and j not in regenerate:
                all_parts.append(
                    load_audio(os.path.join(voice_outpath, f"{j}.wav"), 24000)
                )
                continue
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset=args.preset,
                k=args.candidates,
                use_deterministic_seed=seed,
            )
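            # tts_with_preset returns a single tensor when k == 1 and a list
            # of candidate tensors when k > 1, as the branches below assume.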
            if args.candidates == 1:
                gen = gen.squeeze(0).cpu()
                torchaudio.save(os.path.join(voice_outpath, f"{j}.wav"), gen, 24000)
            else:
                candidate_dir = os.path.join(voice_outpath, str(j))
                os.makedirs(candidate_dir, exist_ok=True)
                for k, g in enumerate(gen):
                    torchaudio.save(
                        os.path.join(candidate_dir, f"{k}.wav"),
                        g.squeeze(0).cpu(),
                        24000,
                    )
                gen = gen[0].squeeze(0).cpu()
            all_parts.append(gen)

        if args.candidates == 1:
            full_audio = torch.cat(all_parts, dim=-1)
            torchaudio.save(
                os.path.join(voice_outpath, "combined.wav"), full_audio, 24000
            )

        if args.produce_debug_state:
            os.makedirs("debug_states", exist_ok=True)
            dbg_state = (seed, texts, voice_samples, conditioning_latents)
            torch.save(dbg_state, f"debug_states/read_debug_{selected_voice}.pth")

        # Combine each candidate's audio clips.
        if args.candidates > 1:
            for candidate in range(args.candidates):
                # Gather this candidate's clip for every line, then
                # concatenate them into one recording.
                audio_clips = [
                    load_audio(
                        os.path.join(voice_outpath, str(line), f"{candidate}.wav"),
                        24000,
                    )
                    for line in range(len(texts))
                ]
                torchaudio.save(
                    os.path.join(voice_outpath, f"combined_{candidate:02d}.wav"),
                    torch.cat(audio_clips, dim=-1),
                    24000,
                )
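
# Example invocation (the script path assumes the upstream tortoise-tts
# layout; adjust for your checkout):
#   python tortoise/read.py --textfile tortoise/data/riding_hood.txt \
#       --voice pat --preset fast --candidates 3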