mrfakename committed on
Commit
9e5b3c3
1 Parent(s): f50e0ae

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (1) hide show
  1. app.py +47 -53
app.py CHANGED
@@ -120,6 +120,14 @@ def infer(
120
  speed=1,
121
  show_info=gr.Info,
122
  ):
 
 
 
 
 
 
 
 
123
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
124
 
125
  if model == "F5-TTS":
@@ -240,7 +248,7 @@ with gr.Blocks() as app_tts:
240
  nfe_step=nfe_slider,
241
  speed=speed_slider,
242
  )
243
- return audio_out, spectrogram_path, gr.update(value=ref_text_out)
244
 
245
  generate_btn.click(
246
  basic_tts,
@@ -320,7 +328,7 @@ with gr.Blocks() as app_multistyle:
320
  )
321
 
322
  # Regular speech type (mandatory)
323
- with gr.Row():
324
  with gr.Column():
325
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
326
  regular_insert = gr.Button("Insert Label", variant="secondary")
@@ -329,12 +337,12 @@ with gr.Blocks() as app_multistyle:
329
 
330
  # Regular speech type (max 100)
331
  max_speech_types = 100
332
- speech_type_rows = [] # 99
333
- speech_type_names = [regular_name] # 100
334
- speech_type_audios = [regular_audio] # 100
335
- speech_type_ref_texts = [regular_ref_text] # 100
336
- speech_type_delete_btns = [] # 99
337
- speech_type_insert_btns = [regular_insert] # 100
338
 
339
  # Additional speech types (99 more)
340
  for i in range(max_speech_types - 1):
@@ -355,51 +363,32 @@ with gr.Blocks() as app_multistyle:
355
  # Button to add speech type
356
  add_speech_type_btn = gr.Button("Add Speech Type")
357
 
358
- # Keep track of current number of speech types
359
- speech_type_count = gr.State(value=1)
360
 
361
  # Function to add a speech type
362
- def add_speech_type_fn(speech_type_count):
 
 
363
  if speech_type_count < max_speech_types:
 
364
  speech_type_count += 1
365
- # Prepare updates for the rows
366
- row_updates = []
367
- for i in range(1, max_speech_types):
368
- if i < speech_type_count:
369
- row_updates.append(gr.update(visible=True))
370
- else:
371
- row_updates.append(gr.update())
372
  else:
373
- # Optionally, show a warning
374
- row_updates = [gr.update() for _ in range(1, max_speech_types)]
375
- return [speech_type_count] + row_updates
376
 
377
- add_speech_type_btn.click(
378
- add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
379
- )
380
 
381
  # Function to delete a speech type
382
- def make_delete_speech_type_fn(index):
383
- def delete_speech_type_fn(speech_type_count):
384
- # Prepare updates
385
- row_updates = []
386
-
387
- for i in range(1, max_speech_types):
388
- if i == index:
389
- row_updates.append(gr.update(visible=False))
390
- else:
391
- row_updates.append(gr.update())
392
-
393
- speech_type_count = max(1, speech_type_count)
394
-
395
- return [speech_type_count] + row_updates
396
-
397
- return delete_speech_type_fn
398
 
399
  # Update delete button clicks
400
- for i, delete_btn in enumerate(speech_type_delete_btns):
401
- delete_fn = make_delete_speech_type_fn(i)
402
- delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
 
403
 
404
  # Text input for the prompt
405
  gen_text_input_multistyle = gr.Textbox(
@@ -413,7 +402,7 @@ with gr.Blocks() as app_multistyle:
413
  current_text = current_text or ""
414
  speech_type_name = speech_type_name or "None"
415
  updated_text = current_text + f"{{{speech_type_name}}} "
416
- return gr.update(value=updated_text)
417
 
418
  return insert_speech_type_fn
419
 
@@ -473,10 +462,14 @@ with gr.Blocks() as app_multistyle:
473
  if style in speech_types:
474
  current_style = style
475
  else:
476
- # If style not available, default to Regular
477
  current_style = "Regular"
478
 
479
- ref_audio = speech_types[current_style]["audio"]
 
 
 
 
480
  ref_text = speech_types[current_style].get("ref_text", "")
481
 
482
  # Generate speech for this segment
@@ -491,12 +484,10 @@ with gr.Blocks() as app_multistyle:
491
  # Concatenate all audio segments
492
  if generated_audio_segments:
493
  final_audio_data = np.concatenate(generated_audio_segments)
494
- return [(sr, final_audio_data)] + [
495
- gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
496
- ]
497
  else:
498
  gr.Warning("No audio generated.")
499
- return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
500
 
501
  generate_multistyle_btn.click(
502
  generate_multistyle_speech,
@@ -514,7 +505,7 @@ with gr.Blocks() as app_multistyle:
514
 
515
  # Validation function to disable Generate button if speech types are missing
516
  def validate_speech_types(gen_text, regular_name, *args):
517
- speech_type_names_list = args[:max_speech_types]
518
 
519
  # Collect the speech types names
520
  speech_types_available = set()
@@ -678,7 +669,7 @@ Have a conversation with an AI using your reference voice!
678
  speed=1.0,
679
  show_info=print, # show_info=print no pull to top when generating
680
  )
681
- return audio_result, gr.update(value=ref_text_out)
682
 
683
  def clear_conversation():
684
  """Reset the conversation"""
@@ -828,7 +819,10 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
828
  visible=False,
829
  )
830
  custom_model_cfg = gr.Dropdown(
831
- choices=[DEFAULT_TTS_MODEL_CFG[2]],
 
 
 
832
  value=load_last_used_custom()[2],
833
  allow_custom_value=True,
834
  label="Config: in a dictionary form",
 
120
  speed=1,
121
  show_info=gr.Info,
122
  ):
123
+ if not ref_audio_orig:
124
+ gr.Warning("Please provide reference audio.")
125
+ return gr.update(), gr.update(), ref_text
126
+
127
+ if not gen_text.strip():
128
+ gr.Warning("Please enter text to generate.")
129
+ return gr.update(), gr.update(), ref_text
130
+
131
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
132
 
133
  if model == "F5-TTS":
 
248
  nfe_step=nfe_slider,
249
  speed=speed_slider,
250
  )
251
+ return audio_out, spectrogram_path, ref_text_out
252
 
253
  generate_btn.click(
254
  basic_tts,
 
328
  )
329
 
330
  # Regular speech type (mandatory)
331
+ with gr.Row() as regular_row:
332
  with gr.Column():
333
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
334
  regular_insert = gr.Button("Insert Label", variant="secondary")
 
337
 
338
  # Regular speech type (max 100)
339
  max_speech_types = 100
340
+ speech_type_rows = [regular_row]
341
+ speech_type_names = [regular_name]
342
+ speech_type_audios = [regular_audio]
343
+ speech_type_ref_texts = [regular_ref_text]
344
+ speech_type_delete_btns = [None]
345
+ speech_type_insert_btns = [regular_insert]
346
 
347
  # Additional speech types (99 more)
348
  for i in range(max_speech_types - 1):
 
363
  # Button to add speech type
364
  add_speech_type_btn = gr.Button("Add Speech Type")
365
 
366
+ # Auto-incrementing count of speech type rows revealed so far; it is never decremented when a row is deleted
367
+ speech_type_count = 1
368
 
369
  # Function to add a speech type
370
+ def add_speech_type_fn():
371
+ row_updates = [gr.update() for _ in range(max_speech_types)]
372
+ global speech_type_count
373
  if speech_type_count < max_speech_types:
374
+ row_updates[speech_type_count] = gr.update(visible=True)
375
  speech_type_count += 1
 
 
 
 
 
 
 
376
  else:
377
+ gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
378
+ return row_updates
 
379
 
380
+ add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
 
 
381
 
382
  # Function to delete a speech type
383
+ def delete_speech_type_fn():
384
+ return gr.update(visible=False), None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
 
386
  # Update delete button clicks
387
+ for i in range(1, len(speech_type_delete_btns)):
388
+ speech_type_delete_btns[i].click(
389
+ delete_speech_type_fn,
390
+ outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
391
+ )
392
 
393
  # Text input for the prompt
394
  gen_text_input_multistyle = gr.Textbox(
 
402
  current_text = current_text or ""
403
  speech_type_name = speech_type_name or "None"
404
  updated_text = current_text + f"{{{speech_type_name}}} "
405
+ return updated_text
406
 
407
  return insert_speech_type_fn
408
 
 
462
  if style in speech_types:
463
  current_style = style
464
  else:
465
+ gr.Warning(f"Type {style} is not available, will use Regular as default.")
466
  current_style = "Regular"
467
 
468
+ try:
469
+ ref_audio = speech_types[current_style]["audio"]
470
+ except KeyError:
471
+ gr.Warning(f"Please provide reference audio for type {current_style}.")
472
+ return [None] + [speech_types[style]["ref_text"] for style in speech_types]
473
  ref_text = speech_types[current_style].get("ref_text", "")
474
 
475
  # Generate speech for this segment
 
484
  # Concatenate all audio segments
485
  if generated_audio_segments:
486
  final_audio_data = np.concatenate(generated_audio_segments)
487
+ return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
 
 
488
  else:
489
  gr.Warning("No audio generated.")
490
+ return [None] + [speech_types[style]["ref_text"] for style in speech_types]
491
 
492
  generate_multistyle_btn.click(
493
  generate_multistyle_speech,
 
505
 
506
  # Validation function to disable Generate button if speech types are missing
507
  def validate_speech_types(gen_text, regular_name, *args):
508
+ speech_type_names_list = args
509
 
510
  # Collect the speech types names
511
  speech_types_available = set()
 
669
  speed=1.0,
670
  show_info=print, # show_info=print no pull to top when generating
671
  )
672
+ return audio_result, ref_text_out
673
 
674
  def clear_conversation():
675
  """Reset the conversation"""
 
819
  visible=False,
820
  )
821
  custom_model_cfg = gr.Dropdown(
822
+ choices=[
823
+ DEFAULT_TTS_MODEL_CFG[2],
824
+ json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
825
+ ],
826
  value=load_last_used_custom()[2],
827
  allow_custom_value=True,
828
  label="Config: in a dictionary form",