Plachta committed on
Commit
88df3b8
1 Parent(s): c0d010f

Replaced Encodec with Vocos

Browse files
Files changed (1) hide show
  1. app.py +111 -122
app.py CHANGED
@@ -323,7 +323,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
323
  return message, (24000, samples.squeeze(0).cpu().numpy())
324
 
325
 
326
-
327
  @torch.no_grad()
328
  def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
329
  """
@@ -331,11 +331,9 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
331
  fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
332
  sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.
333
  """
334
- from utils.sentence_cutter import split_text_into_sentences
335
  if len(text) > 1000:
336
  return "Rejected, Text too long (should be less than 1000 characters)", None
337
  mode = 'fixed-prompt'
338
- global model, audio_tokenizer, text_tokenizer, text_collater
339
  if (prompt is None or prompt == "") and preset_prompt == "":
340
  mode = 'sliding-window' # If no prompt is given, use sliding-window mode
341
  sentences = split_text_into_sentences(text)
@@ -463,122 +461,113 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
463
  else:
464
  raise ValueError(f"No such mode {mode}")
465
 
466
-
467
- def main():
468
- app = gr.Blocks()
469
- with app:
470
- gr.Markdown(top_md)
471
- with gr.Tab("Infer from audio"):
472
- gr.Markdown(infer_from_audio_md)
473
- with gr.Row():
474
- with gr.Column():
475
-
476
- textbox = gr.TextArea(label="Text",
477
- placeholder="Type your sentence here",
478
- value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
479
- language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect', label='language')
480
- accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
481
- textbox_transcript = gr.TextArea(label="Transcript",
482
- placeholder="Write transcript here. (leave empty to use whisper)",
483
- value="", elem_id=f"prompt-name")
484
- upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
485
- record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
486
- with gr.Column():
487
- text_output = gr.Textbox(label="Message")
488
- audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
489
- btn = gr.Button("Generate!")
490
- btn.click(infer_from_audio,
491
- inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
492
- outputs=[text_output, audio_output])
493
- textbox_mp = gr.TextArea(label="Prompt name",
494
- placeholder="Name your prompt here",
495
- value="prompt_1", elem_id=f"prompt-name")
496
- btn_mp = gr.Button("Make prompt!")
497
- prompt_output = gr.File(interactive=False)
498
- btn_mp.click(make_npz_prompt,
499
- inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
500
- outputs=[text_output, prompt_output])
501
- gr.Examples(examples=infer_from_audio_examples,
502
- inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
503
- outputs=[text_output, audio_output],
504
- fn=infer_from_audio,
505
- cache_examples=False,)
506
- with gr.Tab("Make prompt"):
507
- gr.Markdown(make_prompt_md)
508
- with gr.Row():
509
- with gr.Column():
510
- textbox2 = gr.TextArea(label="Prompt name",
511
- placeholder="Name your prompt here",
512
- value="prompt_1", elem_id=f"prompt-name")
513
- # 添加选择语言和输入台本的地方
514
- textbox_transcript2 = gr.TextArea(label="Transcript",
515
- placeholder="Write transcript here. (leave empty to use whisper)",
516
- value="", elem_id=f"prompt-name")
517
- upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
518
- record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
519
- with gr.Column():
520
- text_output_2 = gr.Textbox(label="Message")
521
- prompt_output_2 = gr.File(interactive=False)
522
- btn_2 = gr.Button("Make!")
523
- btn_2.click(make_npz_prompt,
524
- inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
525
- outputs=[text_output_2, prompt_output_2])
526
- gr.Examples(examples=make_npz_prompt_examples,
527
- inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
528
- outputs=[text_output_2, prompt_output_2],
529
- fn=make_npz_prompt,
530
- cache_examples=False,)
531
- with gr.Tab("Infer from prompt"):
532
- gr.Markdown(infer_from_prompt_md)
533
- with gr.Row():
534
- with gr.Column():
535
- textbox_3 = gr.TextArea(label="Text",
536
- placeholder="Type your sentence here",
537
- value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
538
- language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
539
- label='language')
540
- accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
541
- label='accent')
542
- preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
543
- prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
544
- with gr.Column():
545
- text_output_3 = gr.Textbox(label="Message")
546
- audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
547
- btn_3 = gr.Button("Generate!")
548
- btn_3.click(infer_from_prompt,
549
- inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
550
- outputs=[text_output_3, audio_output_3])
551
- gr.Examples(examples=infer_from_prompt_examples,
552
- inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
553
- outputs=[text_output_3, audio_output_3],
554
- fn=infer_from_prompt,
555
- cache_examples=False,)
556
- with gr.Tab("Infer long text"):
557
- gr.Markdown(long_text_md)
558
- with gr.Row():
559
- with gr.Column():
560
- textbox_4 = gr.TextArea(label="Text",
561
- placeholder="Type your sentence here",
562
- value=long_text_example, elem_id=f"tts-input")
563
- language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
564
- label='language')
565
- accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
566
- label='accent')
567
- preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
568
- prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
569
- with gr.Column():
570
- text_output_4 = gr.TextArea(label="Message")
571
- audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
572
- btn_4 = gr.Button("Generate!")
573
- btn_4.click(infer_long_text,
574
- inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
575
- outputs=[text_output_4, audio_output_4])
576
-
577
- app.launch()
578
-
579
- if __name__ == "__main__":
580
- formatter = (
581
- "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
582
- )
583
- logging.basicConfig(format=formatter, level=logging.INFO)
584
- main()
 
323
  return message, (24000, samples.squeeze(0).cpu().numpy())
324
 
325
 
326
+ from utils.sentence_cutter import split_text_into_sentences
327
  @torch.no_grad()
328
  def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
329
  """
 
331
  fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
332
  sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.
333
  """
 
334
  if len(text) > 1000:
335
  return "Rejected, Text too long (should be less than 1000 characters)", None
336
  mode = 'fixed-prompt'
 
337
  if (prompt is None or prompt == "") and preset_prompt == "":
338
  mode = 'sliding-window' # If no prompt is given, use sliding-window mode
339
  sentences = split_text_into_sentences(text)
 
461
  else:
462
  raise ValueError(f"No such mode {mode}")
463
 
464
+ app = gr.Blocks()
465
+ with app:
466
+ gr.Markdown(top_md)
467
+ with gr.Tab("Infer from audio"):
468
+ gr.Markdown(infer_from_audio_md)
469
+ with gr.Row():
470
+ with gr.Column():
471
+
472
+ textbox = gr.TextArea(label="Text",
473
+ placeholder="Type your sentence here",
474
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
475
+ language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect', label='language')
476
+ accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
477
+ textbox_transcript = gr.TextArea(label="Transcript",
478
+ placeholder="Write transcript here. (leave empty to use whisper)",
479
+ value="", elem_id=f"prompt-name")
480
+ upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
481
+ record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
482
+ with gr.Column():
483
+ text_output = gr.Textbox(label="Message")
484
+ audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
485
+ btn = gr.Button("Generate!")
486
+ btn.click(infer_from_audio,
487
+ inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
488
+ outputs=[text_output, audio_output])
489
+ textbox_mp = gr.TextArea(label="Prompt name",
490
+ placeholder="Name your prompt here",
491
+ value="prompt_1", elem_id=f"prompt-name")
492
+ btn_mp = gr.Button("Make prompt!")
493
+ prompt_output = gr.File(interactive=False)
494
+ btn_mp.click(make_npz_prompt,
495
+ inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
496
+ outputs=[text_output, prompt_output])
497
+ gr.Examples(examples=infer_from_audio_examples,
498
+ inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
499
+ outputs=[text_output, audio_output],
500
+ fn=infer_from_audio,
501
+ cache_examples=False,)
502
+ with gr.Tab("Make prompt"):
503
+ gr.Markdown(make_prompt_md)
504
+ with gr.Row():
505
+ with gr.Column():
506
+ textbox2 = gr.TextArea(label="Prompt name",
507
+ placeholder="Name your prompt here",
508
+ value="prompt_1", elem_id=f"prompt-name")
509
+ # 添加选择语言和输入台本的地方
510
+ textbox_transcript2 = gr.TextArea(label="Transcript",
511
+ placeholder="Write transcript here. (leave empty to use whisper)",
512
+ value="", elem_id=f"prompt-name")
513
+ upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
514
+ record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
515
+ with gr.Column():
516
+ text_output_2 = gr.Textbox(label="Message")
517
+ prompt_output_2 = gr.File(interactive=False)
518
+ btn_2 = gr.Button("Make!")
519
+ btn_2.click(make_npz_prompt,
520
+ inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
521
+ outputs=[text_output_2, prompt_output_2])
522
+ gr.Examples(examples=make_npz_prompt_examples,
523
+ inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
524
+ outputs=[text_output_2, prompt_output_2],
525
+ fn=make_npz_prompt,
526
+ cache_examples=False,)
527
+ with gr.Tab("Infer from prompt"):
528
+ gr.Markdown(infer_from_prompt_md)
529
+ with gr.Row():
530
+ with gr.Column():
531
+ textbox_3 = gr.TextArea(label="Text",
532
+ placeholder="Type your sentence here",
533
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
534
+ language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
535
+ label='language')
536
+ accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
537
+ label='accent')
538
+ preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
539
+ prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
540
+ with gr.Column():
541
+ text_output_3 = gr.Textbox(label="Message")
542
+ audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
543
+ btn_3 = gr.Button("Generate!")
544
+ btn_3.click(infer_from_prompt,
545
+ inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
546
+ outputs=[text_output_3, audio_output_3])
547
+ gr.Examples(examples=infer_from_prompt_examples,
548
+ inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
549
+ outputs=[text_output_3, audio_output_3],
550
+ fn=infer_from_prompt,
551
+ cache_examples=False,)
552
+ with gr.Tab("Infer long text"):
553
+ gr.Markdown(long_text_md)
554
+ with gr.Row():
555
+ with gr.Column():
556
+ textbox_4 = gr.TextArea(label="Text",
557
+ placeholder="Type your sentence here",
558
+ value=long_text_example, elem_id=f"tts-input")
559
+ language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
560
+ label='language')
561
+ accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
562
+ label='accent')
563
+ preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
564
+ prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
565
+ with gr.Column():
566
+ text_output_4 = gr.TextArea(label="Message")
567
+ audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
568
+ btn_4 = gr.Button("Generate!")
569
+ btn_4.click(infer_long_text,
570
+ inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
571
+ outputs=[text_output_4, audio_output_4])
572
+
573
+ app.launch()