Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
c180847
1
Parent(s):
89a55a5
Fixed script speaker tags. Added a speaker-to-script mapping, allowing the correct speaker voices to be assigned automatically to the example scripts.
Browse files- app.py +80 -8
- text_examples/3p_military_meeting_natural.txt +1 -1
app.py
CHANGED
|
@@ -94,6 +94,22 @@ class VibeVoiceDemo:
|
|
| 94 |
name = os.path.splitext(wav_file)[0]
|
| 95 |
self.available_voices[name] = os.path.join(voices_dir, wav_file)
|
| 96 |
print(f"Voices loaded: {list(self.available_voices.keys())}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
|
| 99 |
try:
|
|
@@ -271,6 +287,18 @@ class VibeVoiceDemo:
|
|
| 271 |
"4p_product_meeting.txt"
|
| 272 |
]
|
| 273 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
for txt_file in original_files:
|
| 275 |
try:
|
| 276 |
with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
|
|
@@ -552,29 +580,73 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
|
|
| 552 |
import random
|
| 553 |
scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
|
| 554 |
if scripts_list:
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
random_example_btn.click(
|
| 560 |
fn=load_random_example,
|
| 561 |
inputs=[use_natural],
|
| 562 |
-
outputs=[num_speakers, script_input],
|
| 563 |
queue=False
|
| 564 |
)
|
| 565 |
|
| 566 |
def load_specific_example(idx, use_natural_checkbox):
|
|
|
|
| 567 |
scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
|
| 568 |
if idx < len(scripts_list):
|
| 569 |
num_speakers_value, script_value = scripts_list[idx]
|
| 570 |
-
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
|
| 573 |
for idx, btn in enumerate(example_buttons):
|
| 574 |
btn.click(
|
| 575 |
fn=lambda nat, i=idx: load_specific_example(i, nat),
|
| 576 |
inputs=[use_natural],
|
| 577 |
-
outputs=[num_speakers, script_input],
|
| 578 |
queue=False
|
| 579 |
)
|
| 580 |
|
|
@@ -658,7 +730,7 @@ def run_demo(
|
|
| 658 |
model_paths: dict = None,
|
| 659 |
device: str = "cuda",
|
| 660 |
inference_steps: int = 5,
|
| 661 |
-
share: bool =
|
| 662 |
):
|
| 663 |
"""
|
| 664 |
model_paths default includes two entries. Replace paths as needed.
|
|
|
|
| 94 |
name = os.path.splitext(wav_file)[0]
|
| 95 |
self.available_voices[name] = os.path.join(voices_dir, wav_file)
|
| 96 |
print(f"Voices loaded: {list(self.available_voices.keys())}")
|
| 97 |
+
|
| 98 |
+
# Organize voices by gender
|
| 99 |
+
self.male_voices = [
|
| 100 |
+
"en-Carter_man",
|
| 101 |
+
"en-Frank_man",
|
| 102 |
+
"en-Yasser_man",
|
| 103 |
+
"in-Samuel_man",
|
| 104 |
+
"zh-Anchen_man_bgm",
|
| 105 |
+
"zh-Bowen_man"
|
| 106 |
+
]
|
| 107 |
+
self.female_voices = [
|
| 108 |
+
"en-Alice_woman_bgm",
|
| 109 |
+
"en-Alice_woman",
|
| 110 |
+
"en-Maya_woman",
|
| 111 |
+
"zh-Xinran_woman"
|
| 112 |
+
]
|
| 113 |
|
| 114 |
def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
|
| 115 |
try:
|
|
|
|
| 287 |
"4p_product_meeting.txt"
|
| 288 |
]
|
| 289 |
|
| 290 |
+
# Gender mapping for each script's speakers
|
| 291 |
+
self.script_speaker_genders = [
|
| 292 |
+
["female"], # AI TED Talk - Dr. Rachel Thompson
|
| 293 |
+
["neutral"], # Political Speech - generic speaker
|
| 294 |
+
["male", "female"], # Finance IPO - James Harrison, Patricia Wells
|
| 295 |
+
["female", "male"], # Telehealth - Dr. Williams, Mr. Johnson
|
| 296 |
+
["male", "male", "male"], # Military - Colonel, Major, Commander
|
| 297 |
+
["male", "male", "male"], # Oil - Frank, Miguel, Sarah (keeping Sarah as is)
|
| 298 |
+
["male", "male", "female", "male"], # Game Creation - Alex, Jordan, Sam, Taylor
|
| 299 |
+
["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
|
| 300 |
+
]
|
| 301 |
+
|
| 302 |
for txt_file in original_files:
|
| 303 |
try:
|
| 304 |
with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
|
|
|
|
| 580 |
import random
|
| 581 |
scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
|
| 582 |
if scripts_list:
|
| 583 |
+
idx = random.randint(0, len(scripts_list) - 1)
|
| 584 |
+
num_speakers_value, script_value = scripts_list[idx]
|
| 585 |
+
|
| 586 |
+
# Get gender preferences for this script
|
| 587 |
+
genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
|
| 588 |
+
|
| 589 |
+
# Select appropriate voices based on gender
|
| 590 |
+
voice_selections = []
|
| 591 |
+
for i in range(4):
|
| 592 |
+
if i < len(genders):
|
| 593 |
+
gender = genders[i]
|
| 594 |
+
if gender == "male" and demo_instance.male_voices:
|
| 595 |
+
voice = random.choice(demo_instance.male_voices)
|
| 596 |
+
elif gender == "female" and demo_instance.female_voices:
|
| 597 |
+
voice = random.choice(demo_instance.female_voices)
|
| 598 |
+
else:
|
| 599 |
+
# neutral or fallback
|
| 600 |
+
all_voices = list(demo_instance.available_voices.keys())
|
| 601 |
+
voice = random.choice(all_voices) if all_voices else None
|
| 602 |
+
else:
|
| 603 |
+
voice = None
|
| 604 |
+
voice_selections.append(voice)
|
| 605 |
+
|
| 606 |
+
return [num_speakers_value, script_value] + voice_selections
|
| 607 |
+
return [2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"] + [None, None, None, None]
|
| 608 |
|
| 609 |
random_example_btn.click(
|
| 610 |
fn=load_random_example,
|
| 611 |
inputs=[use_natural],
|
| 612 |
+
outputs=[num_speakers, script_input] + speaker_selections,
|
| 613 |
queue=False
|
| 614 |
)
|
| 615 |
|
| 616 |
def load_specific_example(idx, use_natural_checkbox):
|
| 617 |
+
import random
|
| 618 |
scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
|
| 619 |
if idx < len(scripts_list):
|
| 620 |
num_speakers_value, script_value = scripts_list[idx]
|
| 621 |
+
# Get gender preferences for this script
|
| 622 |
+
genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
|
| 623 |
+
|
| 624 |
+
# Select appropriate voices based on gender
|
| 625 |
+
voice_selections = []
|
| 626 |
+
for i in range(4):
|
| 627 |
+
if i < len(genders):
|
| 628 |
+
gender = genders[i]
|
| 629 |
+
if gender == "male" and demo_instance.male_voices:
|
| 630 |
+
voice = random.choice(demo_instance.male_voices)
|
| 631 |
+
elif gender == "female" and demo_instance.female_voices:
|
| 632 |
+
voice = random.choice(demo_instance.female_voices)
|
| 633 |
+
else:
|
| 634 |
+
# neutral or fallback
|
| 635 |
+
all_voices = list(demo_instance.available_voices.keys())
|
| 636 |
+
voice = random.choice(all_voices) if all_voices else None
|
| 637 |
+
else:
|
| 638 |
+
voice = None
|
| 639 |
+
voice_selections.append(voice)
|
| 640 |
+
|
| 641 |
+
# Return values for all outputs
|
| 642 |
+
return [num_speakers_value, script_value] + voice_selections
|
| 643 |
+
return [2, ""] + [None, None, None, None]
|
| 644 |
|
| 645 |
for idx, btn in enumerate(example_buttons):
|
| 646 |
btn.click(
|
| 647 |
fn=lambda nat, i=idx: load_specific_example(i, nat),
|
| 648 |
inputs=[use_natural],
|
| 649 |
+
outputs=[num_speakers, script_input] + speaker_selections,
|
| 650 |
queue=False
|
| 651 |
)
|
| 652 |
|
|
|
|
| 730 |
model_paths: dict = None,
|
| 731 |
device: str = "cuda",
|
| 732 |
inference_steps: int = 5,
|
| 733 |
+
share: bool = False,
|
| 734 |
):
|
| 735 |
"""
|
| 736 |
model_paths default includes two entries. Replace paths as needed.
|
text_examples/3p_military_meeting_natural.txt
CHANGED
|
@@ -4,7 +4,7 @@ Speaker 2: Certainly, Colonel. The Falcon series represents a significant advanc
|
|
| 4 |
|
| 5 |
Speaker 1: That dual capability is exactly what makes this program so promising. Captain Rodriguez, from a humanitarian perspective, how do you see these systems being integrated into our, um, disaster response and civilian assistance protocols?
|
| 6 |
|
| 7 |
-
Speaker
|
| 8 |
|
| 9 |
Speaker 2: The intelligence gathering aspect is crucial for both mission planning and safety. Before any humanitarian drops, we can use the surveillance systems to ensure the area is secure and that civilians are actually present at the, um, target location. The thermal imaging is particularly useful for locating survivors in damaged buildings or identifying gathering points where people need, uh, assistance.
|
| 10 |
|
|
|
|
| 4 |
|
| 5 |
Speaker 1: That dual capability is exactly what makes this program so promising. Captain Rodriguez, from a humanitarian perspective, how do you see these systems being integrated into our, um, disaster response and civilian assistance protocols?
|
| 6 |
|
| 7 |
+
Speaker 3: Colonel, the potential is enormous. In conflict zones where traditional ground-based humanitarian convoys can't safely operate, these drones can provide, um, critical medical supplies to isolated populations. We can deliver emergency medications, blood products, and communication devices to civilians trapped in contested areas. The reconnaissance capability also allows us to assess humanitarian needs in real-time, identifying displaced persons, evaluating infrastructure damage, and, uh, locating civilians who need immediate assistance.
|
| 8 |
|
| 9 |
Speaker 2: The intelligence gathering aspect is crucial for both mission planning and safety. Before any humanitarian drops, we can use the surveillance systems to ensure the area is secure and that civilians are actually present at the, um, target location. The thermal imaging is particularly useful for locating survivors in damaged buildings or identifying gathering points where people need, uh, assistance.
|
| 10 |
|