Update app.py
app.py CHANGED
@@ -32,10 +32,12 @@ Use this space to generate long-form speech up to around ~2 minutes in length. T
 """.strip()
 
 # Create pipelines, downloading required files as necessary
+speech_enhancement = Task.get("speech-enhancement", model="deep-filter-net-v3", available_only=False)
+speech_enhancement.download_required_files(text_callback=print)
 hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
 hybrid_task.download_required_files(text_callback=print)
 hybrid_pipe = hybrid_task()
-hybrid_pipe.load()
+hybrid_pipe.load(allow_optional=True)
 
 transformer_task = Task.get(
     "speech-synthesis", model="zonos-transformer", available_only=False
@@ -45,7 +47,7 @@ transformer_pipe = transformer_task()
 
 if is_hf_spaces:
     # Must load all models on GPU when using ZERO
-    transformer_pipe.load()
+    transformer_pipe.load(allow_optional=True)
 
 # Global state
 pipelines = {
@@ -126,6 +128,12 @@ def generate_audio(
     skip_speaking_rate: bool,
     skip_emotion: bool,
     skip_speaker: bool,
+    speaker_pitch_shift: float,
+    speaker_equalize: bool,
+    speaker_enhance: bool,
+    prefix_equalize: bool,
+    prefix_enhance: bool,
+    enhance: bool,
     progress=gr.Progress(),
 ) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
     """
@@ -142,9 +150,15 @@ def generate_audio(
     try:
         wav_out = selected_pipeline(
             text=text,
+            enhance=enhance,
             language=language,
             reference_audio=speaker_audio,
+            reference_audio_pitch_shift=speaker_pitch_shift,
+            equalize_reference_audio=speaker_equalize,
+            enhance_reference_audio=speaker_enhance,
             prefix_audio=prefix_audio,
+            equalize_prefix_audio=prefix_equalize,
+            enhance_prefix_audio=prefix_enhance,
             seed=seed,
             max_chunk_length=max_chunk_length,
             cross_fade_duration=cross_fade_duration,
@@ -176,7 +190,13 @@ def generate_audio(
             output_format="float",
         )
 
-        return (
+        return (
+            (
+                48000 if enhance else 44100,
+                wav_out.squeeze().numpy()
+            ),
+            seed
+        )
     finally:
         selected_pipeline.off_progress()
 
@@ -186,6 +206,7 @@ if __name__ == "__main__":
         with gr.Row():
            with gr.Column(scale=3):
                 gr.Markdown(header_markdown)
+
                 gr.Image(
                     value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
                     container=False,
@@ -207,6 +228,10 @@ if __name__ == "__main__":
                     value="en-us",
                     label="Language",
                 )
+                enhanced_checkbox = gr.Checkbox(
+                    value=True,
+                    label="Enhance Output with DeepFilterNet"
+                )
 
         with gr.Row():
             if not is_hf_spaces:
@@ -260,6 +285,9 @@ if __name__ == "__main__":
                     label="Optional Prefix Audio (continue from this audio)",
                     type="filepath",
                 )
+                prefix_equalize_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
+                prefix_enhance_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)
+
             with gr.Column(scale=3):
                 cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                 min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
@@ -274,8 +302,31 @@ if __name__ == "__main__":
         with gr.Row(variant="panel", equal_height=True) as speaker_row:
             with gr.Column():
                 speaker_uncond = gr.Checkbox(label="Skip Speaker")
-                speaker_noised_checkbox = gr.Checkbox(
-
+                speaker_noised_checkbox = gr.Checkbox(
+                    label="Speaker Noised",
+                    value=False,
+                    interactive=False,
+                    info="'Speaker Noised' is a conditioning value that the model understands, not a processing step. Check this box if your input audio is noisy."
+                )
+                speaker_equalize_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
+                speaker_enhance_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)
+
+                def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
+                    update_dict = {"interactive": not use_enhance}
+                    if use_enhance:
+                        update_dict["value"] = False
+                    return gr.update(**update_dict)
+
+                speaker_enhance_checkbox.change(
+                    fn=on_enhanced_change,
+                    inputs=[speaker_enhance_checkbox],
+                    outputs=[speaker_noised_checkbox]
+                )
+                speaker_pitch_shift = gr.Slider(
+                    -1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
+                    info="A pitch shift to apply to speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce a more accurate voice cloning."
+                )
+
                 speaker_audio = gr.Audio(
                     label="Optional Speaker Audio (for cloning)",
                     type="filepath",
@@ -398,8 +449,14 @@ if __name__ == "__main__":
                 speaking_rate_uncond,
                 emotion_uncond,
                 speaker_uncond,
+                speaker_pitch_shift,
+                speaker_equalize_checkbox,
+                speaker_enhance_checkbox,
+                prefix_equalize_checkbox,
+                prefix_enhance_checkbox,
+                enhanced_checkbox,
             ],
             outputs=[output_audio, seed_number],
         )
-
+
     demo.launch()
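For reference, a minimal sketch of driving the updated pipeline outside the Gradio UI, assuming the Task API behaves exactly as the diff shows; the import path and the soundfile save step are assumptions (neither appears in the diff), while the keyword names are taken verbatim from the selected_pipeline(...) call. DeepFilterNet operates on 48 kHz audio, which is why the returned sample rate becomes 48000 when enhance=True and stays 44100 otherwise.

import soundfile as sf    # assumption: any WAV writer would do
from taproot import Task  # assumption: the diff does not show this import

# Build and load the hybrid pipeline the same way the app does
task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
task.download_required_files(text_callback=print)
pipe = task()
pipe.load(allow_optional=True)  # allow_optional also loads the optional enhancer

wav = pipe(
    text="Hello from Zonos.",
    language="en-us",
    reference_audio="speaker.wav",       # voice to clone
    reference_audio_pitch_shift=-44.99,  # in cents, matching the slider default
    equalize_reference_audio=True,
    enhance_reference_audio=True,        # denoise the reference before embedding
    enhance=True,                        # DeepFilterNet pass on the generated audio
    seed=12345,
    output_format="float",
)

# 48 kHz when enhanced, 44.1 kHz otherwise (see the return statement in the diff)
sf.write("output.wav", wav.squeeze().numpy(), 48000)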
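On units for the new pitch-shift slider: values are in cents, where 100 cents is one equal-tempered semitone and 1200 cents is one octave, so a shift of c cents scales frequency by 2**(c/1200). A quick sanity check of the -44.99 default:

def cents_to_ratio(cents: float) -> float:
    # 100 cents = 1 semitone; 1200 cents = 1 octave
    return 2.0 ** (cents / 1200.0)

print(cents_to_ratio(-44.99))  # ~0.9743, i.e. about a 2.6% downward shift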