Add large file to gitignore
Files changed:
- .DS_Store (+0 -0)
- .gitignore (+3 -0)
- app.py (+35 -22)
- demo_audio/notes.txt (+33 -0)
.DS_Store (CHANGED)

Binary files a/.DS_Store and b/.DS_Store differ
.gitignore (CHANGED)

@@ -4,3 +4,6 @@
 # Python cache files
 __pycache__/
 *.pyc
+
+
+echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore

Note: the appended line is the literal shell command rather than the bare path, so as committed the new .gitignore entry will not actually match demo_audio/notebookllm_starhealth_demo.wav.
app.py (CHANGED)

@@ -383,17 +383,15 @@ def update_speed(new_speed):
     speed = new_speed
     return f"Speed set to: {speed}"
 
-
-
-
-
-* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
-* [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
-* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
-""")
+def process_audio(ref_audio_path):
+    return ref_audio_path
+
 with gr.Blocks(theme='gstaff/sketch') as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
+    download_button = gr.File(label="Download Your Recording")
+    ref_audio_input.change(process_audio, inputs=ref_audio_input, outputs=download_button)
+
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
     model_choice = gr.Radio(
         choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"

@@ -510,6 +508,9 @@ def parse_emotional_text(gen_text):
 
     return segments
 
+def get_audio_file(audio_path):
+    return audio_path
+
 with gr.Blocks() as app_emotional:
     # New section for emotional generation
     gr.Markdown(

@@ -520,7 +521,7 @@ with gr.Blocks() as app_emotional:
 
     **Example Input:**
 
-    (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what,
+    (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
     """
     )
 

@@ -531,6 +532,13 @@ with gr.Blocks() as app_emotional:
         regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
         regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
         regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
+        download_regular_audio = gr.File(label="Download Regular Reference Audio")
+
+        regular_audio.change(
+            get_audio_file,
+            inputs=regular_audio,
+            outputs=download_regular_audio
+        )
 
     # Additional speech types (up to 99 more)
     max_speech_types = 100

@@ -538,6 +546,7 @@ with gr.Blocks() as app_emotional:
     speech_type_audios = []
     speech_type_ref_texts = []
     speech_type_delete_btns = []
+    download_speech_type_audios = []
 
     for i in range(max_speech_types - 1):
         with gr.Row():

@@ -545,10 +554,18 @@ with gr.Blocks() as app_emotional:
             audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
             ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
             delete_btn = gr.Button("Delete", variant="secondary", visible=False)
+            download_audio_input = gr.File(label="Download Reference Audio", visible=False)
         speech_type_names.append(name_input)
         speech_type_audios.append(audio_input)
         speech_type_ref_texts.append(ref_text_input)
        speech_type_delete_btns.append(delete_btn)
+        download_speech_type_audios.append(download_audio_input)
+
+        audio_input.change(
+            get_audio_file,
+            inputs=audio_input,
+            outputs=download_audio_input
+        )
 
     # Button to add speech type
     add_speech_type_btn = gr.Button("Add Speech Type")

@@ -565,17 +582,20 @@ with gr.Blocks() as app_emotional:
         audio_updates = []
         ref_text_updates = []
         delete_btn_updates = []
+        download_btn_updates = []
         for i in range(max_speech_types - 1):
             if i < speech_type_count:
                 name_updates.append(gr.update(visible=True))
                 audio_updates.append(gr.update(visible=True))
                 ref_text_updates.append(gr.update(visible=True))
                 delete_btn_updates.append(gr.update(visible=True))
+                download_btn_updates.append(gr.update(visible=True))
             else:
                 name_updates.append(gr.update())
                 audio_updates.append(gr.update())
                 ref_text_updates.append(gr.update())
                 delete_btn_updates.append(gr.update())
+                download_btn_updates.append(gr.update())
     else:
         # Optionally, show a warning
         # gr.Warning("Maximum number of speech types reached.")

@@ -583,12 +603,13 @@ with gr.Blocks() as app_emotional:
         audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
         ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
         delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
-
+        download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
+        return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
 
     add_speech_type_btn.click(
         add_speech_type_fn,
         inputs=speech_type_count,
-        outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
+        outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
     )
 
     # Function to delete a speech type

@@ -749,21 +770,13 @@ with gr.Blocks() as app_emotional:
         inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
         outputs=generate_emotional_btn
     )
+
 with gr.Blocks() as app:
     gr.Markdown(
         """
-#
-
-This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
-
-* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
-* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
-
-The checkpoints support English and Chinese.
-
-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
+# TTS
 
-
+This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
     """
     )
     gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])
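For reference, the pattern the app.py changes introduce can be reduced to a small standalone Gradio script. The sketch below is illustrative rather than a copy of app.py: it assumes a recent gradio release, and names such as demo, add_speech_type, and MAX_SPEECH_TYPES are placeholders. It shows the two ideas the diff applies: returning a gr.Audio filepath from a .change() callback into a gr.File component so the uploaded or recorded reference audio can be downloaded again, and toggling per-speech-type download components with gr.update(visible=True) alongside the existing ones.

import gradio as gr


def get_audio_file(audio_path):
    # gr.Audio(type="filepath") passes the recording's path on disk; returning
    # it to a gr.File output exposes that same file as a download link.
    return audio_path


MAX_SPEECH_TYPES = 4  # kept small for the sketch; app.py uses 100


with gr.Blocks() as demo:
    # Basic pattern: reference audio in, downloadable copy out.
    ref_audio = gr.Audio(label="Reference Audio", type="filepath")
    download_btn = gr.File(label="Download Your Recording")
    ref_audio.change(get_audio_file, inputs=ref_audio, outputs=download_btn)

    # Hidden speech-type rows, each paired with its own download component.
    speech_type_count = gr.State(value=0)
    audios, downloads = [], []
    for _ in range(MAX_SPEECH_TYPES):
        with gr.Row():
            audio_in = gr.Audio(label="Reference Audio", type="filepath", visible=False)
            dl = gr.File(label="Download Reference Audio", visible=False)
        audio_in.change(get_audio_file, inputs=audio_in, outputs=dl)
        audios.append(audio_in)
        downloads.append(dl)

    def add_speech_type(count):
        # Reveal one more row (audio input plus its download link) per click.
        count = min(count + 1, MAX_SPEECH_TYPES)
        audio_updates = [gr.update(visible=True) if i < count else gr.update()
                         for i in range(MAX_SPEECH_TYPES)]
        dl_updates = [gr.update(visible=True) if i < count else gr.update()
                      for i in range(MAX_SPEECH_TYPES)]
        return [count] + audio_updates + dl_updates

    add_btn = gr.Button("Add Speech Type")
    add_btn.click(
        add_speech_type,
        inputs=speech_type_count,
        outputs=[speech_type_count] + audios + downloads,
    )


if __name__ == "__main__":
    demo.launch()

The commit applies the same wiring to the existing ref_audio_input, regular_audio, and per-row audio_input components through the new process_audio and get_audio_file helpers.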
demo_audio/notes.txt (ADDED)

@@ -0,0 +1,33 @@
+Bharat is a beautiful country filled with rich culture and history. From the colorful festivals to delicious food, there is so much to enjoy. People from different backgrounds live together, sharing their traditions and stories.
+
+
+Good evening. In a major development today, authorities have announced new measures to tackle the rising pollution levels in major cities across the country. The government plans to implement stricter regulations on industrial emissions and promote electric vehicles to improve air quality. Stay tuned for more updates on this developing story.
+
+
+
+/// Podcast script
+Shamik: Welcome back to Tech Talk! I’m your host, Shamik, and today we have a fascinating discussion lined up about the future of AI in our everyday lives. Joining me is my co-host, Ramesh. How are you today, Ramesh?
+
+Ramesh: I’m doing great, Shamik! Excited to dive into this topic. AI is transforming so many aspects of our lives, from how we work to how we interact with technology.
+
+Shamik: Absolutely! It’s incredible to see how AI has progressed. A few years ago, the idea of a smart assistant was just starting to gain traction. Now, we have AI integrated into everything, from our phones to our home appliances.
+
+Ramesh: Right! And it’s not just about convenience. AI is also enhancing productivity in various industries. For example, in healthcare, AI algorithms can analyze medical images faster than human doctors in some cases, helping to catch issues earlier.
+
+Shamik: That’s a great point! But do you think there are potential downsides to this rapid integration of AI?
+
+Ramesh: Definitely. While AI has the potential to improve efficiency, there are concerns about privacy and job displacement. We need to strike a balance between innovation and ethical considerations.
+
+Shamik: I completely agree. It’s crucial for companies to be transparent about how they use AI and to prioritize data privacy. What about the impact on education? AI tools are becoming more common in classrooms.
+
+Ramesh: That’s true. AI can personalize learning experiences, adapting to each student’s pace. However, there’s also the risk of over-reliance on technology, which could hinder critical thinking skills.
+
+Shamik: Great insight! As we move forward, it’s vital to keep discussing these challenges and benefits. Before we wrap up, any final thoughts on what the future holds for AI?
+
+Ramesh: I believe the future is bright! With responsible development and regulation, AI can be a powerful ally in solving some of our biggest challenges.
+
+Shamik: Well said, Ramesh! Thank you for sharing your thoughts today. And to our listeners, thank you for tuning in. We’ll catch you next time on Tech Talk!
+
+/// Podcast script
+
+