ACloudCenter committed on
Commit
c180847
·
1 Parent(s): 89a55a5

Fixed script speaker tags. Added a speaker-to-script mapping, allowing automatic assignment of the correct speaker voices to the example scripts.

Browse files
app.py CHANGED
@@ -94,6 +94,22 @@ class VibeVoiceDemo:
94
  name = os.path.splitext(wav_file)[0]
95
  self.available_voices[name] = os.path.join(voices_dir, wav_file)
96
  print(f"Voices loaded: {list(self.available_voices.keys())}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
99
  try:
@@ -271,6 +287,18 @@ class VibeVoiceDemo:
271
  "4p_product_meeting.txt"
272
  ]
273
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  for txt_file in original_files:
275
  try:
276
  with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
@@ -552,29 +580,73 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
552
  import random
553
  scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
554
  if scripts_list:
555
- num_speakers_value, script_value = random.choice(scripts_list)
556
- return num_speakers_value, script_value
557
- return 2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
  random_example_btn.click(
560
  fn=load_random_example,
561
  inputs=[use_natural],
562
- outputs=[num_speakers, script_input],
563
  queue=False
564
  )
565
 
566
  def load_specific_example(idx, use_natural_checkbox):
 
567
  scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
568
  if idx < len(scripts_list):
569
  num_speakers_value, script_value = scripts_list[idx]
570
- return num_speakers_value, script_value
571
- return 2, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
 
573
  for idx, btn in enumerate(example_buttons):
574
  btn.click(
575
  fn=lambda nat, i=idx: load_specific_example(i, nat),
576
  inputs=[use_natural],
577
- outputs=[num_speakers, script_input],
578
  queue=False
579
  )
580
 
@@ -658,7 +730,7 @@ def run_demo(
658
  model_paths: dict = None,
659
  device: str = "cuda",
660
  inference_steps: int = 5,
661
- share: bool = True,
662
  ):
663
  """
664
  model_paths default includes two entries. Replace paths as needed.
 
94
  name = os.path.splitext(wav_file)[0]
95
  self.available_voices[name] = os.path.join(voices_dir, wav_file)
96
  print(f"Voices loaded: {list(self.available_voices.keys())}")
97
+
98
+ # Organize voices by gender
99
+ self.male_voices = [
100
+ "en-Carter_man",
101
+ "en-Frank_man",
102
+ "en-Yasser_man",
103
+ "in-Samuel_man",
104
+ "zh-Anchen_man_bgm",
105
+ "zh-Bowen_man"
106
+ ]
107
+ self.female_voices = [
108
+ "en-Alice_woman_bgm",
109
+ "en-Alice_woman",
110
+ "en-Maya_woman",
111
+ "zh-Xinran_woman"
112
+ ]
113
 
114
  def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
115
  try:
 
287
  "4p_product_meeting.txt"
288
  ]
289
 
290
+ # Gender mapping for each script's speakers
291
+ self.script_speaker_genders = [
292
+ ["female"], # AI TED Talk - Dr. Rachel Thompson
293
+ ["neutral"], # Political Speech - generic speaker
294
+ ["male", "female"], # Finance IPO - James Harrison, Patricia Wells
295
+ ["female", "male"], # Telehealth - Dr. Williams, Mr. Johnson
296
+ ["male", "male", "male"], # Military - Colonel, Major, Commander
297
+ ["male", "male", "male"], # Oil - Frank, Miguel, Sarah (keeping Sarah as is)
298
+ ["male", "male", "female", "male"], # Game Creation - Alex, Jordan, Sam, Taylor
299
+ ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
300
+ ]
301
+
302
  for txt_file in original_files:
303
  try:
304
  with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
 
580
  import random
581
  scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
582
  if scripts_list:
583
+ idx = random.randint(0, len(scripts_list) - 1)
584
+ num_speakers_value, script_value = scripts_list[idx]
585
+
586
+ # Get gender preferences for this script
587
+ genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
588
+
589
+ # Select appropriate voices based on gender
590
+ voice_selections = []
591
+ for i in range(4):
592
+ if i < len(genders):
593
+ gender = genders[i]
594
+ if gender == "male" and demo_instance.male_voices:
595
+ voice = random.choice(demo_instance.male_voices)
596
+ elif gender == "female" and demo_instance.female_voices:
597
+ voice = random.choice(demo_instance.female_voices)
598
+ else:
599
+ # neutral or fallback
600
+ all_voices = list(demo_instance.available_voices.keys())
601
+ voice = random.choice(all_voices) if all_voices else None
602
+ else:
603
+ voice = None
604
+ voice_selections.append(voice)
605
+
606
+ return [num_speakers_value, script_value] + voice_selections
607
+ return [2, "Speaker 0: Welcome to our AI conference demo!\nSpeaker 1: Thanks, excited to be here!"] + [None, None, None, None]
608
 
609
  random_example_btn.click(
610
  fn=load_random_example,
611
  inputs=[use_natural],
612
+ outputs=[num_speakers, script_input] + speaker_selections,
613
  queue=False
614
  )
615
 
616
  def load_specific_example(idx, use_natural_checkbox):
617
+ import random
618
  scripts_list = demo_instance.example_scripts_natural if use_natural_checkbox else demo_instance.example_scripts
619
  if idx < len(scripts_list):
620
  num_speakers_value, script_value = scripts_list[idx]
621
+ # Get gender preferences for this script
622
+ genders = demo_instance.script_speaker_genders[idx] if idx < len(demo_instance.script_speaker_genders) else []
623
+
624
+ # Select appropriate voices based on gender
625
+ voice_selections = []
626
+ for i in range(4):
627
+ if i < len(genders):
628
+ gender = genders[i]
629
+ if gender == "male" and demo_instance.male_voices:
630
+ voice = random.choice(demo_instance.male_voices)
631
+ elif gender == "female" and demo_instance.female_voices:
632
+ voice = random.choice(demo_instance.female_voices)
633
+ else:
634
+ # neutral or fallback
635
+ all_voices = list(demo_instance.available_voices.keys())
636
+ voice = random.choice(all_voices) if all_voices else None
637
+ else:
638
+ voice = None
639
+ voice_selections.append(voice)
640
+
641
+ # Return values for all outputs
642
+ return [num_speakers_value, script_value] + voice_selections
643
+ return [2, ""] + [None, None, None, None]
644
 
645
  for idx, btn in enumerate(example_buttons):
646
  btn.click(
647
  fn=lambda nat, i=idx: load_specific_example(i, nat),
648
  inputs=[use_natural],
649
+ outputs=[num_speakers, script_input] + speaker_selections,
650
  queue=False
651
  )
652
 
 
730
  model_paths: dict = None,
731
  device: str = "cuda",
732
  inference_steps: int = 5,
733
+ share: bool = False,
734
  ):
735
  """
736
  model_paths default includes two entries. Replace paths as needed.
text_examples/3p_military_meeting_natural.txt CHANGED
@@ -4,7 +4,7 @@ Speaker 2: Certainly, Colonel. The Falcon series represents a significant advanc
4
 
5
  Speaker 1: That dual capability is exactly what makes this program so promising. Captain Rodriguez, from a humanitarian perspective, how do you see these systems being integrated into our, um, disaster response and civilian assistance protocols?
6
 
7
- Speaker 2: Colonel, the potential is enormous. In conflict zones where traditional ground-based humanitarian convoys can't safely operate, these drones can provide, um, critical medical supplies to isolated populations. We can deliver emergency medications, blood products, and communication devices to civilians trapped in contested areas. The reconnaissance capability also allows us to assess humanitarian needs in real-time, identifying displaced persons, evaluating infrastructure damage, and, uh, locating civilians who need immediate assistance.
8
 
9
  Speaker 2: The intelligence gathering aspect is crucial for both mission planning and safety. Before any humanitarian drops, we can use the surveillance systems to ensure the area is secure and that civilians are actually present at the, um, target location. The thermal imaging is particularly useful for locating survivors in damaged buildings or identifying gathering points where people need, uh, assistance.
10
 
 
4
 
5
  Speaker 1: That dual capability is exactly what makes this program so promising. Captain Rodriguez, from a humanitarian perspective, how do you see these systems being integrated into our, um, disaster response and civilian assistance protocols?
6
 
7
+ Speaker 3: Colonel, the potential is enormous. In conflict zones where traditional ground-based humanitarian convoys can't safely operate, these drones can provide, um, critical medical supplies to isolated populations. We can deliver emergency medications, blood products, and communication devices to civilians trapped in contested areas. The reconnaissance capability also allows us to assess humanitarian needs in real-time, identifying displaced persons, evaluating infrastructure damage, and, uh, locating civilians who need immediate assistance.
8
 
9
  Speaker 2: The intelligence gathering aspect is crucial for both mission planning and safety. Before any humanitarian drops, we can use the surveillance systems to ensure the area is secure and that civilians are actually present at the, um, target location. The thermal imaging is particularly useful for locating survivors in damaged buildings or identifying gathering points where people need, uh, assistance.
10