Manmay committed on
Commit
fca9b48
1 Parent(s): 9be7eab

Add application file

Browse files
Files changed (1) hide show
  1. app.py +201 -0
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ import torchaudio
5
+ import time
6
+ from datetime import datetime
7
+ from tortoise.api import TextToSpeech
8
+ from tortoise.utils.text import split_and_recombine_text
9
+ from tortoise.utils.audio import load_audio, load_voice, load_voices
10
+
11
# Named voices offered in the voice dropdowns, in display order.
_NAMED_VOICES = (
    "angie",
    "cond_latent_example",
    "deniro",
    "freeman",
    "halle",
    "lj",
    "myself",
    "pat2",
    "snakes",
    "tom",
    "train_daws",
    "train_dreams",
    "train_grace",
    "train_lescault",
    "weaver",
    "applejack",
    "daniel",
    "emma",
    "geralt",
    "jlaw",
    "mol",
    "pat",
    "rainbow",
    "tim_reynolds",
    "train_atkins",
    "train_dotrice",
    "train_empire",
    "train_kennard",
    "train_mouse",
    "william",
)

# Special entries appended after the named voices:
#   "random"   - special option for random voice
#   "disabled" - special option for disabled voice
_SPECIAL_VOICES = ("random", "disabled")

# Full list of choices shown by every voice dropdown.
VOICE_OPTIONS = [*_NAMED_VOICES, *_SPECIAL_VOICES]
45
+
46
+
47
def inference(
    text,
    script,
    name,
    voice,
    voice_b,
    voice_c,
    preset,
    seed,
    regenerate,
    split_by_newline,
):
    """Synthesize speech for a long text clip by clip and return the joined audio.

    The text is split into chunks; each chunk is synthesized with the
    module-level ``tts`` object and saved as ``longform/<name>/<index>.wav``
    (1-based index). The concatenated result is saved as
    ``outputs/<name>.wav`` and a summary line is appended to
    ``Tortoise_TTS_Runs_Scripts.log``.

    Args:
        text: Text to speak. If empty/None, the content of ``script`` is used.
        script: Uploaded file object exposing a ``.name`` path, or None.
        name: Name of the output file and of the per-clip folder. Must be a
            plain name without path separators.
        voice: Primary voice name.
        voice_b: Optional second voice, or ``"disabled"``.
        voice_c: Optional third voice, or ``"disabled"``.
        preset: Quality preset forwarded to ``tts.tts_with_preset``.
        seed: Seed forwarded as ``use_deterministic_seed``.
        regenerate: 1-based clip indices to re-synthesize, separated by commas
            and/or whitespace. Clips not listed are reloaded from
            ``longform/<name>/``. Empty means synthesize every clip.
        split_by_newline: ``"Yes"`` to split on newlines, anything else to let
            ``split_and_recombine_text`` choose the splits.

    Returns:
        A tuple ``((24000, samples), indexed_text)`` where ``samples`` is the
        combined audio as a NumPy array and ``indexed_text`` lists every chunk
        prefixed with its 1-based index.

    Raises:
        gr.Error: If the name is missing or not a plain name, no text is
            available, or the regenerate indices are not integers.
    """
    sample_rate = 24000

    if regenerate is None or regenerate.strip() == "":
        regenerate = None

    if name is None or name.strip() == "":
        raise gr.Error("No name provided")
    # `name` comes straight from the web form and is joined into filesystem
    # paths below, so refuse anything that could escape the output folders.
    if name.strip() in {".", ".."} or any(sep in name for sep in ("/", "\\")):
        raise gr.Error("Name must not contain path separators.")

    if text is None or text.strip() == "":
        # No inline text: fall back to the uploaded script, if there is one.
        if script is None:
            raise gr.Error("Please provide either text or script file with content.")
        with open(script.name) as f:
            text = f.read()
        if text.strip() == "":
            raise gr.Error("Please provide either text or script file with content.")

    if split_by_newline == "Yes":
        texts = [line for line in text.split("\n") if line.strip() != ""]
    else:
        texts = split_and_recombine_text(text)

    clip_dir = os.path.join("longform", name)
    os.makedirs(clip_dir, exist_ok=True)

    if regenerate is not None:
        # The form asks for comma-separated indices; accept commas and/or
        # whitespace so "1,2", "1, 2" and "1 2" all work.
        try:
            regenerate = {int(tok) for tok in regenerate.replace(",", " ").split()}
        except ValueError:
            raise gr.Error(
                "Clips to regenerate must be comma-separated integers."
            ) from None
        if not regenerate:
            raise gr.Error("Clips to regenerate must be comma-separated integers.")

    voices = [voice]
    voices.extend(v for v in (voice_b, voice_c) if v != "disabled")

    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    start_time = time.time()

    all_parts = []
    # Use a dedicated loop variable so the `text` parameter is not overwritten;
    # the run log below must record the full text, not just the last chunk.
    for j, chunk in enumerate(texts):
        clip_path = os.path.join(clip_dir, f"{j+1}.wav")
        if regenerate is not None and j + 1 not in regenerate:
            # Clip was not selected for regeneration: reuse the saved audio.
            all_parts.append(load_audio(clip_path, sample_rate))
            continue
        gen = tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            k=1,
            use_deterministic_seed=seed,
        )

        gen = gen.squeeze(0).cpu()
        torchaudio.save(clip_path, gen, sample_rate)

        all_parts.append(gen)

    full_audio = torch.cat(all_parts, dim=-1)

    os.makedirs("outputs", exist_ok=True)
    torchaudio.save(os.path.join("outputs", f"{name}.wav"), full_audio, sample_rate)

    with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
        )

    output_texts = [f"({j+1}) {chunk}" for j, chunk in enumerate(texts)]

    return ((sample_rate, full_audio.squeeze().cpu().numpy()), "\n".join(output_texts))
128
+
129
+
130
def main():
    """Assemble the Gradio form around ``inference`` and launch it with ``share=True``."""

    def _voice_picker(label, default):
        # All three voice dropdowns share the same choices and value type.
        return gr.Dropdown(VOICE_OPTIONS, value=default, label=label, type="value")

    # --- input widgets -----------------------------------------------------
    text_box = gr.Textbox(
        label="Text (Provide either text, or upload a newline separated text file below):",
        lines=4,
    )
    script_file = gr.File(label="Upload a text file")
    name_box = gr.Textbox(
        label="Name of the output file / folder to store intermediate results:",
        lines=1,
    )
    preset_radio = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        label="Preset mode (determines quality with tradeoff over speed):",
        value="fast",
        type="value",
    )
    primary_voice = _voice_picker("Select voice:", "angie")
    second_voice = _voice_picker("(Optional) Select second voice:", "disabled")
    third_voice = _voice_picker("(Optional) Select third voice:", "disabled")
    seed_number = gr.Number(label="Seed (for reproducibility):", value=0, precision=0)
    regenerate_box = gr.Textbox(
        label="Comma-separated indices of clips to regenerate [starting from 1]",
        lines=1,
    )
    newline_radio = gr.Radio(
        ["Yes", "No"],
        label="Split by newline (If [No], it will automatically try to find relevant splits):",
        value="No",
        type="value",
    )

    # --- output widgets ----------------------------------------------------
    combined_audio = gr.Audio(label="Combined audio:")
    indexed_text = gr.Textbox(label="Split texts with indices:", lines=10)

    # The order of `inputs` must match the positional parameters of inference().
    form_inputs = [
        text_box,
        script_file,
        name_box,
        primary_voice,
        second_voice,
        third_voice,
        preset_radio,
        seed_number,
        regenerate_box,
        newline_radio,
    ]
    gr.Interface(
        fn=inference,
        inputs=form_inputs,
        outputs=[combined_audio, indexed_text],
    ).launch(share=True)
191
+
192
+
193
if __name__ == "__main__":
    # `tts` is deliberately a module-level name: inference() looks it up
    # globally when a request is handled.
    tts = TextToSpeech(half=True, use_deepspeed=True, kv_cache=True)

    # Mark the start of this session in the shared run log before serving.
    session_banner = f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as log_file:
        log_file.write(session_banner)

    main()