benjamin-paine committed
Commit 104eece · verified · Parent: 3eda29a

Update app.py

Files changed (1): app.py (+63 -6)
app.py CHANGED
@@ -32,10 +32,12 @@ Use this space to generate long-form speech up to around ~2 minutes in length. T
 """.strip()
 
 # Create pipelines, downloading required files as necessary
+speech_enhancement = Task.get("speech-enhancement", model="deep-filter-net-v3", available_only=False)
+speech_enhancement.download_required_files(text_callback=print)
 hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
 hybrid_task.download_required_files(text_callback=print)
 hybrid_pipe = hybrid_task()
-hybrid_pipe.load()
+hybrid_pipe.load(allow_optional=True)
 
 transformer_task = Task.get(
     "speech-synthesis", model="zonos-transformer", available_only=False
@@ -45,7 +47,7 @@ transformer_pipe = transformer_task()
 
 if is_hf_spaces:
     # Must load all models on GPU when using ZERO
-    transformer_pipe.load()
+    transformer_pipe.load(allow_optional=True)
 
 # Global state
 pipelines = {
@@ -126,6 +128,12 @@ def generate_audio(
     skip_speaking_rate: bool,
     skip_emotion: bool,
     skip_speaker: bool,
+    speaker_pitch_shift: float,
+    speaker_equalize: bool,
+    speaker_enhance: bool,
+    prefix_equalize: bool,
+    prefix_enhance: bool,
+    enhance: bool,
     progress=gr.Progress(),
 ) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
     """
@@ -142,9 +150,15 @@ def generate_audio(
     try:
         wav_out = selected_pipeline(
             text=text,
+            enhance=enhance,
             language=language,
             reference_audio=speaker_audio,
+            reference_audio_pitch_shift=speaker_pitch_shift,
+            equalize_reference_audio=speaker_equalize,
+            enhance_reference_audio=speaker_enhance,
             prefix_audio=prefix_audio,
+            equalize_prefix_audio=prefix_equalize,
+            enhance_prefix_audio=prefix_enhance,
             seed=seed,
             max_chunk_length=max_chunk_length,
             cross_fade_duration=cross_fade_duration,
@@ -176,7 +190,13 @@ def generate_audio(
             output_format="float",
         )
 
-        return (44100, wav_out.squeeze().numpy()), seed
+        return (
+            (
+                48000 if enhance else 44100,
+                wav_out.squeeze().numpy()
+            ),
+            seed
+        )
     finally:
         selected_pipeline.off_progress()
 
@@ -186,6 +206,7 @@ if __name__ == "__main__":
     with gr.Row():
         with gr.Column(scale=3):
             gr.Markdown(header_markdown)
+
            gr.Image(
                value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
                container=False,
@@ -207,6 +228,10 @@ if __name__ == "__main__":
                    value="en-us",
                    label="Language",
                )
+                enhanced_checkbox = gr.Checkbox(
+                    value=True,
+                    label="Enhance Output with DeepFilterNet"
+                )
 
        with gr.Row():
            if not is_hf_spaces:
@@ -260,6 +285,9 @@ if __name__ == "__main__":
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
+                prefix_equalize_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
+                prefix_enhance_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)
+
            with gr.Column(scale=3):
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
@@ -274,8 +302,31 @@ if __name__ == "__main__":
            with gr.Row(variant="panel", equal_height=True) as speaker_row:
                with gr.Column():
                    speaker_uncond = gr.Checkbox(label="Skip Speaker")
-                    speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
-
+                    speaker_noised_checkbox = gr.Checkbox(
+                        label="Speaker Noised",
+                        value=False,
+                        interactive=False,
+                        info="'Speaker Noised' is a conditioning value that the model understands, not a processing step. Check this box if your input audio is noisy."
+                    )
+                    speaker_equalize_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
+                    speaker_enhance_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)
+
+                    def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
+                        update_dict = {"interactive": not use_enhance}
+                        if use_enhance:
+                            update_dict["value"] = False
+                        return gr.update(**update_dict)
+
+                    speaker_enhance_checkbox.change(
+                        fn=on_enhanced_change,
+                        inputs=[speaker_enhance_checkbox],
+                        outputs=[speaker_noised_checkbox]
+                    )
+                    speaker_pitch_shift = gr.Slider(
+                        -1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
+                        info="A pitch shift to apply to speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce a more accurate voice cloning."
+                    )
+
                    speaker_audio = gr.Audio(
                        label="Optional Speaker Audio (for cloning)",
                        type="filepath",
@@ -398,8 +449,14 @@ if __name__ == "__main__":
                speaking_rate_uncond,
                emotion_uncond,
                speaker_uncond,
+                speaker_pitch_shift,
+                speaker_equalize_checkbox,
+                speaker_enhance_checkbox,
+                prefix_equalize_checkbox,
+                prefix_enhance_checkbox,
+                enhanced_checkbox,
            ],
            outputs=[output_audio, seed_number],
        )
-
+
    demo.launch()
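Notes

The commit registers a DeepFilterNet v3 speech-enhancement task alongside the two Zonos synthesis pipelines and switches both load() calls to load(allow_optional=True), so the optional enhancement weights are resident before inference (the in-diff comment notes that ZERO Spaces must have all models on the GPU up front). A standalone sketch of the same Task API, reusing only the calls visible in the diff; the import path is an assumption, since app.py's import block is outside this change:

    from taproot import Task  # assumed import; not shown in this diff

    enhance_task = Task.get("speech-enhancement", model="deep-filter-net-v3", available_only=False)
    enhance_task.download_required_files(text_callback=print)  # fetch weights, printing progress
    enhance_pipe = enhance_task()
    enhance_pipe.load(allow_optional=True)  # include optional components, as the diff does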
 
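The new reference_audio_pitch_shift value is measured in cents (hundredths of a semitone): a shift of c cents scales frequency by 2^(c/1200), so the UI default of -44.99 cents lowers pitch by a factor of roughly 0.974. A rough equivalent with librosa (the pipeline's internal implementation may differ):

    import librosa

    def pitch_shift_cents(y, sr, cents):
        # librosa's n_steps is in semitones (bins_per_octave defaults to 12),
        # so convert at 100 cents per semitone.
        return librosa.effects.pitch_shift(y, sr=sr, n_steps=cents / 100.0)

    y, sr = librosa.load("speaker.wav", sr=None)  # hypothetical reference clip
    y_down = pitch_shift_cents(y, sr, -44.99)     # the slider's default down-shift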
 
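The return tuple changes because gr.Audio consumes (sample_rate, samples) and DeepFilterNet operates at 48 kHz, so enhanced output must be labeled 48000 rather than the synthesizer's native 44100. For comparison, offline enhancement with the upstream deepfilternet package follows its documented quick-start roughly as below; the Space routes enhancement through its own Task wrapper instead, so treat these calls as illustrative:

    from df.enhance import enhance, init_df, load_audio, save_audio

    model, df_state, _ = init_df()                        # default DeepFilterNet weights
    noisy, _ = load_audio("noisy.wav", sr=df_state.sr())  # resampled to 48 kHz
    clean = enhance(model, df_state, noisy)
    save_audio("clean.wav", clean, df_state.sr())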
 
 
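Unchanged context worth noting: long-form generation is chunked (max_chunk_length) and stitched together with cross_fade_duration seconds of overlap. A hypothetical numpy illustration of such a linear cross-fade between two chunks, not the pipeline's actual code:

    import numpy as np

    def cross_fade(a, b, sr, duration):
        # Blend the tail of chunk `a` into the head of chunk `b` with linear ramps.
        n = min(int(sr * duration), len(a), len(b))
        overlap = a[-n:] * np.linspace(1.0, 0.0, n) + b[:n] * np.linspace(0.0, 1.0, n)
        return np.concatenate([a[:-n], overlap, b[n:]])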
 
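The on_enhanced_change handler uses Gradio's standard dependent-input pattern: a .change() callback returns gr.update(...) to mutate another component's properties. Here, enabling enhancement disables and unchecks "Speaker Noised", since denoised reference audio contradicts telling the model the input is noisy. A self-contained reduction of the pattern (hypothetical, not part of app.py):

    import gradio as gr

    with gr.Blocks() as demo:
        enhance_box = gr.Checkbox(label="Enhance", value=True)
        noised_box = gr.Checkbox(label="Speaker Noised", value=False, interactive=False)

        def sync(use_enhance: bool):
            props = {"interactive": not use_enhance}
            if use_enhance:
                props["value"] = False  # enhanced audio should not claim to be noisy
            return gr.update(**props)

        enhance_box.change(fn=sync, inputs=[enhance_box], outputs=[noised_box])

    demo.launch()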
 
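Finally, the six new components are appended to the click handler's inputs list. Gradio passes component values to the callback positionally, so this list's order must match generate_audio's parameter order; the six additions line up one-to-one with the six parameters added in the signature hunk (progress=gr.Progress() is injected by Gradio and is never listed). Schematically, with a hypothetical button name, since the button itself is outside this diff:

    generate_button.click(
        fn=generate_audio,
        inputs=[
            # ... existing components ...
            speaker_pitch_shift,        # -> speaker_pitch_shift: float
            speaker_equalize_checkbox,  # -> speaker_equalize: bool
            speaker_enhance_checkbox,   # -> speaker_enhance: bool
            prefix_equalize_checkbox,   # -> prefix_equalize: bool
            prefix_enhance_checkbox,    # -> prefix_enhance: bool
            enhanced_checkbox,          # -> enhance: bool
        ],
        outputs=[output_audio, seed_number],
    )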