NeuralFalcon committed (verified) · Commit 3c4a5a6 · Parent: 117f764

Update app.py

Files changed (1): app.py (+84, -208)

app.py CHANGED
@@ -21,8 +21,8 @@ import torch
 import os
 import traceback
 import shutil
-import re  # Added for timestamp feature
-import uuid  # Added for timestamp feature
 
 from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
 from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference, VibeVoiceGenerationOutput
@@ -41,7 +41,7 @@ def drive_save(file_copy):
         print("Running on Google Colab and auto-saving to Google Drive...")
         os.makedirs(save_folder, exist_ok=True)
         dest_path = os.path.join(save_folder, os.path.basename(file_copy))
-        shutil.copy2(file_copy, dest_path)  # preserves metadata
         print(f"File saved to: {dest_path}")
         return dest_path
     else:
@@ -52,12 +52,8 @@ import os, requests, urllib.request, urllib.error
 from tqdm.auto import tqdm
 
 def download_file(url, download_file_path, redownload=False):
-    """Download a single file with urllib + tqdm progress bar."""
-
     base_path = os.path.dirname(download_file_path)
     os.makedirs(base_path, exist_ok=True)
-
-    # skip logic
     if os.path.exists(download_file_path):
         if redownload:
             os.remove(download_file_path)
@@ -65,7 +61,6 @@ def download_file(url, download_file_path, redownload=False):
         elif os.path.getsize(download_file_path) > 0:
             tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
             return True
-
     try:
         request = urllib.request.urlopen(url)
         total = int(request.headers.get('Content-Length', 0))
@@ -73,7 +68,6 @@ def download_file(url, download_file_path, redownload=False):
         print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
         return False
-
     with tqdm(total=total, desc=os.path.basename(download_file_path), unit='B', unit_scale=True, unit_divisor=1024) as progress:
         try:
             urllib.request.urlretrieve(
@@ -85,77 +79,54 @@ def download_file(url, download_file_path, redownload=False):
             print(f"❌ Error: Failed to download {url}")
             print(f"Reason: {e.reason}")
             return False
-
     tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
     return True
 
-
 def download_model(repo_id, download_folder="./", redownload=False):
-    # normalize empty string as current dir
     if not download_folder.strip():
         download_folder = "."
     url = f"https://huggingface.co/api/models/{repo_id}"
     download_dir = os.path.abspath(f"{download_folder.rstrip('/')}/{repo_id.split('/')[-1]}")
     os.makedirs(download_dir, exist_ok=True)
-
     print(f"📂 Download directory: {download_dir}")
-
     response = requests.get(url)
     if response.status_code != 200:
         print("❌ Error:", response.status_code, response.text)
         return None
-
     data = response.json()
     siblings = data.get("siblings", [])
     files = [f["rfilename"] for f in siblings]
-
     print(f"📦 Found {len(files)} files in repo '{repo_id}'. Checking cache ...")
-
     for file in tqdm(files, desc="Processing files", unit="file"):
         file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
         file_path = os.path.join(download_dir, file)
         download_file(file_url, file_path, redownload=redownload)
-
     return download_dir
 
-
-
-
-
-
-
-# NEW FEATURE: Function to generate unique filenames for output
 def generate_file_name(text):
-    """Generates a unique, clean filename based on the script's first line."""
     output_dir = "./podcast_audio"
     os.makedirs(output_dir, exist_ok=True)
-    # Clean the text to get a base for the filename
     cleaned = re.sub(r"^\s*speaker\s*\d+\s*:\s*", "", text, flags=re.IGNORECASE)
     short = cleaned[:30].strip()
     short = re.sub(r'[^a-zA-Z0-9\s]', '', short)
     short = short.lower().strip().replace(" ", "_")
     if not short:
         short = "podcast_output"
-    # Add a unique identifier
     unique_name = f"{short}_{uuid.uuid4().hex[:6]}"
-
     return os.path.join(output_dir, unique_name)
 
-
 class VibeVoiceDemo:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
-        """Initialize the VibeVoice demo with model loading."""
         self.model_path = model_path
         self.device = device
         self.inference_steps = inference_steps
-        self.is_generating = False  # Track generation state
-        self.stop_generation = False  # Flag to stop generation
         self.load_model()
         self.setup_voice_presets()
-        self.load_example_scripts()  # Load example scripts
 
     def load_model(self):
-        """Load the VibeVoice model and processor."""
         print(f"Loading processor & model from {self.model_path}")
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
         if self.device == "cuda":
@@ -167,10 +138,9 @@ class VibeVoiceDemo:
         else:
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
-                torch_dtype=torch.float32,  # Use float32 for CPU
                 device_map="cpu",
             )
-
         self.model.eval()
         self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
             self.model.model.noise_scheduler.config,
@@ -182,81 +152,62 @@ class VibeVoiceDemo:
         print(f"Language model attention: {self.model.model.language_model.config._attn_implementation}")
 
     def setup_voice_presets(self):
-        """Setup voice presets by scanning the voices directory."""
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
         if not os.path.exists(voices_dir):
             print(f"Warning: Voices directory not found at {voices_dir}, creating it.")
             os.makedirs(voices_dir, exist_ok=True)
         self.voice_presets = {}
-        audio_files = [f for f in os.listdir(voices_dir)
-                       if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')) and os.path.isfile(os.path.join(voices_dir, f))]
         for audio_file in audio_files:
             name = os.path.splitext(audio_file)[0]
-            full_path = os.path.join(voices_dir, audio_file)
-            self.voice_presets[name] = full_path
         self.voice_presets = dict(sorted(self.voice_presets.items()))
         self.available_voices = {name: path for name, path in self.voice_presets.items() if os.path.exists(path)}
-        if not self.available_voices:
-            print("Warning: No voice presets found.")
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
 
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
-        """Read and preprocess audio file."""
         try:
             wav, sr = sf.read(audio_path)
-            if len(wav.shape) > 1:
-                wav = np.mean(wav, axis=1)
-            if sr != target_sr:
-                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
             return np.array([])
 
     def trim_silence_from_numpy(self, audio_np: np.ndarray, sample_rate: int, silence_thresh: int = -45, min_silence_len: int = 100, keep_silence: int = 50) -> np.ndarray:
-        """Removes silence from a NumPy audio array using pydub."""
         audio_int16 = (audio_np * 32767).astype(np.int16)
-        sound = AudioSegment(
-            data=audio_int16.tobytes(),
-            sample_width=audio_int16.dtype.itemsize,
-            frame_rate=sample_rate,
-            channels=1
-        )
-        audio_chunks = split_on_silence(
-            sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=keep_silence
-        )
-        if not audio_chunks:
-            return np.array([0.0], dtype=np.float32)
-
         combined = sum(audio_chunks)
         samples = np.array(combined.get_array_of_samples())
-        trimmed_audio_np = samples.astype(np.float32) / 32767.0
-        return trimmed_audio_np
 
     def generate_podcast_with_timestamps(self,
                                          num_speakers: int,
                                          script: str,
-                                         speaker_1: str = None,
-                                         speaker_2: str = None,
-                                         speaker_3: str = None,
-                                         speaker_4: str = None,
-                                         cfg_scale: float = 1.3,
-                                         remove_silence: bool = False,
                                          progress=gr.Progress()):
         try:
             self.stop_generation = False
             self.is_generating = True
-
-            # --- Input Validation and Setup ---
             if not script.strip(): raise gr.Error("Error: Please provide a script.")
             script = script.replace("’", "'")
             if not 1 <= num_speakers <= 4: raise gr.Error("Error: Number of speakers must be between 1 and 4.")
-
             selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, speaker in enumerate(selected_speakers):
                 if not speaker or speaker not in self.available_voices:
                     raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
-
             voice_samples = [self.read_audio(self.available_voices[name]) for name in selected_speakers]
             if any(len(vs) == 0 for vs in voice_samples): raise gr.Error("Error: Failed to load one or more audio files.")
 
@@ -269,82 +220,68 @@ class VibeVoiceDemo:
                     formatted_script_lines.append(line)
                 else:
                     speaker_id = len(formatted_script_lines) % num_speakers
-                    formatted_script_lines.append(f"Speaker {speaker_id}: {line}")
 
             if not formatted_script_lines: raise gr.Error("Error: Script is empty after formatting.")
 
-            # --- Prepare for Generation ---
             timestamps = {}
             current_time = 0.0
             sample_rate = 24000
-            total_lines = len(formatted_script_lines)
-
             base_filename = generate_file_name(formatted_script_lines[0])
             final_audio_path = base_filename + ".wav"
             final_json_path = base_filename + ".json"
 
-            # --- Open file and write chunks sequentially (MEMORY EFFICIENT) ---
             with sf.SoundFile(final_audio_path, 'w', samplerate=sample_rate, channels=1, subtype='PCM_16') as audio_file:
                 for i, line in enumerate(formatted_script_lines):
                     if self.stop_generation:
                         break
-
-                    progress(i / total_lines, desc=f"Generating line {i+1}/{total_lines}")
-
                     match = re.match(r'Speaker\s*(\d+):\s*(.*)', line, re.IGNORECASE)
                     if not match: continue
-
                     speaker_idx = int(match.group(1)) - 1
                     text_content = match.group(2).strip()
-
-                    if speaker_idx < 0 or speaker_idx >= len(voice_samples):
-                        continue
-
-                    inputs = self.processor(
-                        text=[line], voice_samples=[voice_samples], padding=True, return_tensors="pt"
-                    )
-
-                    output_waveform: VibeVoiceGenerationOutput = self.model.generate(
-                        **inputs, max_new_tokens=None, cfg_scale=cfg_scale, tokenizer=self.processor.tokenizer,
-                        generation_config={'do_sample': False}, verbose=False, refresh_negative=True
-                    )
-
                     audio_np = output_waveform.speech_outputs[0].cpu().float().numpy().squeeze()
-
-                    # NEW FEATURE: Remove silence if enabled
-                    if remove_silence:
-                        audio_np = self.trim_silence_from_numpy(audio_np, sample_rate)
-
                     duration = len(audio_np) / sample_rate
-                    audio_int16 = (audio_np * 32767).astype(np.int16)
-                    audio_file.write(audio_int16)
-
-                    timestamps[str(i + 1)] = {
-                        "text": text_content, "speaker_id": speaker_idx+1,
-                        "start": current_time, "end": current_time + duration
-                    }
                     current_time += duration
 
-            # --- Finalize and Save JSON ---
-            progress(1.0, desc="Saving timestamp file...")
-            with open(final_json_path, "w") as f:
-                json.dump(timestamps, f, indent=2)
             try:
-                drive_save(final_audio_path)
-                drive_save(final_json_path)
-            except Exception as e:
-                print(f"Error saving files to Google Drive: {e}")
 
-            print(f"\n✨ Generation successful!\n🎵 Audio: {final_audio_path}\n📄 Timestamps: {final_json_path}\n")
 
             self.is_generating = False
-
-            return final_audio_path, final_audio_path, final_json_path, gr.update(visible=True), gr.update(visible=False)
 
         except Exception as e:
             self.is_generating = False
             print(f"❌ An unexpected error occurred: {str(e)}")
             traceback.print_exc()
-            return None, None, None, gr.update(visible=True), gr.update(visible=False)
 
     def stop_audio_generation(self):
         if self.is_generating:
@@ -361,40 +298,26 @@ class VibeVoiceDemo:
                 with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
                     script = f.read().strip()
                 if script: self.example_scripts.append([self._get_num_speakers_from_script(script), script])
-            except Exception as e:
-                print(f"Error loading example {txt_file}: {e}")
 
     def _get_num_speakers_from_script(self, script: str) -> int:
         speakers = set(re.findall(r'^Speaker\s+(\d+)\s*:', script, re.MULTILINE | re.IGNORECASE))
         return max(int(s) for s in speakers) if speakers else 1
 
 def create_demo_interface(demo_instance: VibeVoiceDemo):
-    with gr.Blocks(
-        title="VibeVoice AI Podcast Generator"
-    ) as interface:
-
         gr.HTML("""
         <div style="text-align: center; margin: 20px auto; max-width: 800px;">
             <h1 style="font-size: 2.5em; margin-bottom: 10px;">🎙️ Vibe Podcasting</h1>
-            <p style="font-size: 1.2em; color: #555; margin-bottom: 15px;">
-                Generate Long-form Multi-speaker AI Podcasts with VibeVoice
-            </p>
-            <a href="https://colab.research.google.com/github/NeuralFalconYT/AI-Podcast-Generator/blob/main/VibeVoice_Colab.ipynb"
-               target="_blank"
-               style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white;
-                      border-radius: 6px; text-decoration: none; font-size: 1em;">
-                🥳 Run on Google Colab
-            </a>
         </div>
         """)
-
         with gr.Row():
-            # Left column - Settings
             with gr.Column(scale=1):
                 with gr.Group():
                     gr.Markdown("### 🎛️ Podcast Settings")
                     num_speakers = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="Number of Speakers")
-
                     gr.Markdown("### 🎭 Speaker Selection")
                     speaker_selections = []
                     available_voices = list(demo_instance.available_voices.keys())
@@ -403,42 +326,34 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
                         val = defaults[i] if i < len(defaults) and defaults[i] in available_voices else None
                         speaker = gr.Dropdown(choices=available_voices, value=val, label=f"Speaker {i+1}", visible=(i < 2))
                         speaker_selections.append(speaker)
-
                 with gr.Accordion("🎤 Upload Custom Voices", open=False):
                     upload_audio = gr.File(label="Upload Voice Samples", file_count="multiple", file_types=["audio"])
                     process_upload_btn = gr.Button("Add Uploaded Voices to Speaker Selection")
-
                 with gr.Accordion("⚙️ Advanced Settings", open=False):
                     cfg_scale = gr.Slider(minimum=1.0, maximum=2.0, value=1.3, step=0.05, label="CFG Scale")
-                    # NEW FEATURE: Silence removal checkbox
                     remove_silence_checkbox = gr.Checkbox(label="Trim Silence from Podcast", value=False,)
-
-            # Right column - Generation
             with gr.Column(scale=2):
                 with gr.Group():
                     gr.Markdown("### 📝 Script Input")
-                    script_input = gr.Textbox(label="Conversation Script", placeholder="Enter script here...", lines=10)
-
                     with gr.Row():
                         random_example_btn = gr.Button("🎲 Random Example", scale=1)
                         generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", scale=2)
-
                     stop_btn = gr.Button("🛑 Stop Generation", variant="stop", visible=False)
-
                     gr.Markdown("### 🎵 **Generated Output**")
                     audio_output = gr.Audio(label="Play Generated Podcast")
                     with gr.Accordion("📦 Download Files", open=False):
                         download_file = gr.File(label="Download Audio File (.wav)")
                         json_file_output = gr.File(label="Download Timestamps (.json)")
 
-        with gr.Accordion("💡 Usage Tips & Examples", open=True):
-            gr.Markdown("""
-            - **Upload Your Own Voices:** Create your own podcast with custom voice samples.
-            - **Timestamps:** Useful if you want to generate a video using Wan2.2 or other tools. The timestamps let you automatically separate each speaker (splitting the long podcast into smaller chunks), pass the audio clips to your video generation model, and then merge the generated video clips into a full podcast video (e.g., using FFmpeg + any video generation model such as image+audio → video).
-            """)
         gr.Examples(examples=demo_instance.example_scripts, inputs=[num_speakers, script_input], label="Try these example scripts:")
 
-        # --- Backend Functions ---
         def process_and_refresh_voices(uploaded_files):
             if not uploaded_files: return [gr.update() for _ in speaker_selections] + [None]
             voices_dir = os.path.join(os.path.dirname(__file__), "voices")
@@ -448,29 +363,24 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
             return [gr.update(choices=new_choices) for _ in speaker_selections] + [None]
 
         def update_speaker_visibility(num):
-            return [gr.update(visible=(i < num)) for i in range(4)]
-
-        def handle_generate_click():
-            return gr.update(visible=False), gr.update(visible=True)
 
         num_speakers.change(fn=update_speaker_visibility, inputs=num_speakers, outputs=speaker_selections)
         process_upload_btn.click(fn=process_and_refresh_voices, inputs=upload_audio, outputs=speaker_selections + [upload_audio])
 
-        gen_event = generate_btn.click(
-            fn=handle_generate_click,
-            outputs=[generate_btn, stop_btn]
-        ).then(
             fn=demo_instance.generate_podcast_with_timestamps,
             inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale, remove_silence_checkbox],
             outputs=[audio_output, download_file, json_file_output, generate_btn, stop_btn],
         )
-
-        stop_btn.click(fn=demo_instance.stop_audio_generation, cancels=[gen_event])
 
         def load_random_example():
            import random
             return random.choice(demo_instance.example_scripts) if demo_instance.example_scripts else (2, "Speaker 0: No examples loaded.")
-
         random_example_btn.click(fn=load_random_example, outputs=[num_speakers, script_input])
 
     return interface
@@ -478,7 +388,6 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
 
 
 
-
 def build_conversation_prompt(topic, *speaker_names):
     """
     Generates the final prompt. It takes the topic and a variable number of speaker names.
@@ -512,7 +421,6 @@ def build_conversation_prompt(topic, *speaker_names):
     prompt = f"""
 You are a professional podcast scriptwriter.
 Write a natural, engaging conversation between {num_speakers} speakers on the topic: "{topic}".
-
 {speaker_mapping_str}
 Formatting Rules:
 - You MUST always format dialogue with {', '.join(speaker_labels)} ONLY.
@@ -521,7 +429,6 @@ Formatting Rules:
 {introductions_str}
 - During the conversation, they may occasionally mention each other's names ({', '.join(names)}) naturally in the dialogue, but the labels must remain unchanged.
 - Do not add narration, descriptions, or any extra formatting.
-
 {example_str}
 """
     return prompt
@@ -600,57 +507,26 @@ def ui2():
     return demo
 
 
 import click
 @click.command()
-@click.option(
-    "--model_path",
-    default="microsoft/VibeVoice-1.5B",
-    help="Hugging Face Model Repo ID."
-)
-@click.option(
-    "--inference_steps",
-    default=10,
-    show_default=True,
-    type=int,
-    help="Number of inference steps for generation."
-)
-@click.option(
-    "--debug",
-    is_flag=True,
-    default=False,
-    help="Enable debug mode."
-)
-@click.option(
-    "--share",
-    is_flag=True,
-    default=False,
-    help="Enable sharing of the interface."
-)
 def main(model_path, inference_steps, debug, share):
-    # model_path = "microsoft/VibeVoice-1.5B"
     # model_folder = download_model(model_path, download_folder="./", redownload=False)
     model_folder=model_path
     device = "cuda" if torch.cuda.is_available() else "cpu"
     set_seed(42)
-    print("🎙️ Initializing VibeVoice Demo with Timestamp Support...")
-    demo_instance = VibeVoiceDemo(
-        model_path=model_folder,
-        device=device,
-        inference_steps=inference_steps
-    )
-
-    custom_css = """
-    .gradio-container {
-        font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif;
-    }"""
     demo1 = create_demo_interface(demo_instance)
-    demo2=ui2()
     demo = gr.TabbedInterface([demo1, demo2],["Vibe Podcasting","Generate Sample Podcast Script"],title="",theme=gr.themes.Soft(),css=custom_css)
-
     print("🚀 Launching Gradio Demo...")
     demo.queue().launch(debug=debug, share=share)
 
 if __name__ == "__main__":
-    main()
-
-# !python /content/VibeVoice/demo/colab.py --model_path microsoft/VibeVoice-1.5B --inference_steps 10 --debug --share
 
 import os
 import traceback
 import shutil
+import re
+import uuid
 
 from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
 from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference, VibeVoiceGenerationOutput
 
         print("Running on Google Colab and auto-saving to Google Drive...")
         os.makedirs(save_folder, exist_ok=True)
         dest_path = os.path.join(save_folder, os.path.basename(file_copy))
+        shutil.copy2(file_copy, dest_path)
         print(f"File saved to: {dest_path}")
         return dest_path
     else:
 
 from tqdm.auto import tqdm
 
 def download_file(url, download_file_path, redownload=False):
     base_path = os.path.dirname(download_file_path)
     os.makedirs(base_path, exist_ok=True)
     if os.path.exists(download_file_path):
         if redownload:
             os.remove(download_file_path)
         elif os.path.getsize(download_file_path) > 0:
             tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
             return True
     try:
         request = urllib.request.urlopen(url)
         total = int(request.headers.get('Content-Length', 0))
         print(f"❌ Error: Unable to open URL: {url}")
         print(f"Reason: {e.reason}")
         return False
     with tqdm(total=total, desc=os.path.basename(download_file_path), unit='B', unit_scale=True, unit_divisor=1024) as progress:
         try:
             urllib.request.urlretrieve(
             print(f"❌ Error: Failed to download {url}")
             print(f"Reason: {e.reason}")
             return False
     tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
     return True
 
 def download_model(repo_id, download_folder="./", redownload=False):
     if not download_folder.strip():
         download_folder = "."
     url = f"https://huggingface.co/api/models/{repo_id}"
     download_dir = os.path.abspath(f"{download_folder.rstrip('/')}/{repo_id.split('/')[-1]}")
     os.makedirs(download_dir, exist_ok=True)
     print(f"📂 Download directory: {download_dir}")
     response = requests.get(url)
     if response.status_code != 200:
         print("❌ Error:", response.status_code, response.text)
         return None
     data = response.json()
     siblings = data.get("siblings", [])
     files = [f["rfilename"] for f in siblings]
     print(f"📦 Found {len(files)} files in repo '{repo_id}'. Checking cache ...")
     for file in tqdm(files, desc="Processing files", unit="file"):
         file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
         file_path = os.path.join(download_dir, file)
         download_file(file_url, file_path, redownload=redownload)
     return download_dir
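
As a usage sketch (assuming the default repo id from the CLI options further down; the target folder is illustrative), the two helpers combine like so:

# Sketch: mirror a Hugging Face repo locally; cached files are skipped.
model_dir = download_model("microsoft/VibeVoice-1.5B", download_folder="./", redownload=False)
print(model_dir)  # e.g. /abs/path/VibeVoice-1.5B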

 def generate_file_name(text):
     output_dir = "./podcast_audio"
     os.makedirs(output_dir, exist_ok=True)
     cleaned = re.sub(r"^\s*speaker\s*\d+\s*:\s*", "", text, flags=re.IGNORECASE)
     short = cleaned[:30].strip()
     short = re.sub(r'[^a-zA-Z0-9\s]', '', short)
     short = short.lower().strip().replace(" ", "_")
     if not short:
         short = "podcast_output"
     unique_name = f"{short}_{uuid.uuid4().hex[:6]}"
     return os.path.join(output_dir, unique_name)
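
A quick illustration of the filenames this produces (the 6-character suffix comes from uuid4, so it varies):

# Illustration only; the suffix is random.
generate_file_name("Speaker 1: Hello World, and welcome back!")
# -> './podcast_audio/hello_world_and_welcome_back_a1b2c3'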

 class VibeVoiceDemo:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
         self.model_path = model_path
         self.device = device
         self.inference_steps = inference_steps
+        self.is_generating = False
+        self.stop_generation = False
         self.load_model()
         self.setup_voice_presets()
+        self.load_example_scripts()
 
     def load_model(self):
         print(f"Loading processor & model from {self.model_path}")
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
         if self.device == "cuda":
         else:
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
+                torch_dtype=torch.float32,
                 device_map="cpu",
             )
         self.model.eval()
         self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
             self.model.model.noise_scheduler.config,
         print(f"Language model attention: {self.model.model.language_model.config._attn_implementation}")
 
 
     def setup_voice_presets(self):
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
         if not os.path.exists(voices_dir):
             print(f"Warning: Voices directory not found at {voices_dir}, creating it.")
             os.makedirs(voices_dir, exist_ok=True)
         self.voice_presets = {}
+        audio_files = [f for f in os.listdir(voices_dir) if f.lower().endswith(('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')) and os.path.isfile(os.path.join(voices_dir, f))]
         for audio_file in audio_files:
             name = os.path.splitext(audio_file)[0]
+            self.voice_presets[name] = os.path.join(voices_dir, audio_file)
         self.voice_presets = dict(sorted(self.voice_presets.items()))
         self.available_voices = {name: path for name, path in self.voice_presets.items() if os.path.exists(path)}
+        if not self.available_voices: print("Warning: No voice presets found.")
         print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
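
Preset names are simply the file stems, so a voices/ folder laid out like this (filenames illustrative) yields dropdown entries "alice" and "bob":

voices/
    alice.wav
    bob.mp3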

     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         try:
             wav, sr = sf.read(audio_path)
+            if len(wav.shape) > 1: wav = np.mean(wav, axis=1)
+            if sr != target_sr: wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
             return np.array([])

     def trim_silence_from_numpy(self, audio_np: np.ndarray, sample_rate: int, silence_thresh: int = -45, min_silence_len: int = 100, keep_silence: int = 50) -> np.ndarray:
         audio_int16 = (audio_np * 32767).astype(np.int16)
+        sound = AudioSegment(data=audio_int16.tobytes(), sample_width=audio_int16.dtype.itemsize, frame_rate=sample_rate, channels=1)
+        audio_chunks = split_on_silence(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=keep_silence)
+        if not audio_chunks: return np.array([0.0], dtype=np.float32)
         combined = sum(audio_chunks)
         samples = np.array(combined.get_array_of_samples())
+        return samples.astype(np.float32) / 32767.0
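
A minimal sketch of exercising this helper (assumptions: a constructed VibeVoiceDemo named demo, and float32 samples in [-1, 1] at 24 kHz, as the model produces):

# Sketch: one second of silence followed by a 440 Hz tone; trimming drops most of the silence.
import numpy as np
t = np.arange(24000) / 24000
tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
clip = np.concatenate([np.zeros(24000, dtype=np.float32), tone])
trimmed = demo.trim_silence_from_numpy(clip, sample_rate=24000)
print(len(clip) / 24000, "->", len(trimmed) / 24000, "seconds")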
 

     def generate_podcast_with_timestamps(self,
                                          num_speakers: int,
                                          script: str,
+                                         speaker_1: str, speaker_2: str, speaker_3: str, speaker_4: str,
+                                         cfg_scale: float,
+                                         remove_silence: bool,
                                          progress=gr.Progress()):
+
+        # Initial UI state: clear previous results, show the stop button
+        yield None, None, None, gr.update(visible=False), gr.update(visible=True)
+
+        final_audio_path, final_json_path = None, None
         try:
             self.stop_generation = False
             self.is_generating = True
+
             if not script.strip(): raise gr.Error("Error: Please provide a script.")
             script = script.replace("’", "'")
             if not 1 <= num_speakers <= 4: raise gr.Error("Error: Number of speakers must be between 1 and 4.")
             selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, speaker in enumerate(selected_speakers):
                 if not speaker or speaker not in self.available_voices:
                     raise gr.Error(f"Error: Please select a valid speaker for Speaker {i+1}.")
             voice_samples = [self.read_audio(self.available_voices[name]) for name in selected_speakers]
             if any(len(vs) == 0 for vs in voice_samples): raise gr.Error("Error: Failed to load one or more audio files.")

                     formatted_script_lines.append(line)
                 else:
                     speaker_id = len(formatted_script_lines) % num_speakers
+                    formatted_script_lines.append(f"Speaker {speaker_id+1}: {line}")
 
             if not formatted_script_lines: raise gr.Error("Error: Script is empty after formatting.")
 
             timestamps = {}
             current_time = 0.0
             sample_rate = 24000
+
             base_filename = generate_file_name(formatted_script_lines[0])
             final_audio_path = base_filename + ".wav"
             final_json_path = base_filename + ".json"
 
             with sf.SoundFile(final_audio_path, 'w', samplerate=sample_rate, channels=1, subtype='PCM_16') as audio_file:
                 for i, line in enumerate(formatted_script_lines):
                     if self.stop_generation:
+                        print("\n🚫 Generation interrupted by user. Finalizing partial files...")
                         break
+                    progress(i / len(formatted_script_lines), desc=f"Generating line {i+1}/{len(formatted_script_lines)}")
                     match = re.match(r'Speaker\s*(\d+):\s*(.*)', line, re.IGNORECASE)
                     if not match: continue
                     speaker_idx = int(match.group(1)) - 1
                     text_content = match.group(2).strip()
+                    if not (0 <= speaker_idx < len(voice_samples)): continue
+
+                    inputs = self.processor(text=[line], voice_samples=[voice_samples[speaker_idx]], padding=True, return_tensors="pt")
+                    output_waveform = self.model.generate(**inputs, max_new_tokens=None, cfg_scale=cfg_scale, tokenizer=self.processor.tokenizer, generation_config={'do_sample': False}, verbose=False, refresh_negative=True)
                     audio_np = output_waveform.speech_outputs[0].cpu().float().numpy().squeeze()
+
+                    if remove_silence: audio_np = self.trim_silence_from_numpy(audio_np, sample_rate)
                     duration = len(audio_np) / sample_rate
+                    audio_file.write((audio_np * 32767).astype(np.int16))
+                    timestamps[str(i + 1)] = {"text": text_content, "speaker_id": speaker_idx + 1, "start": current_time, "end": current_time + duration}
                     current_time += duration

+            if not timestamps:
+                self.is_generating = False
+                if os.path.exists(final_audio_path): os.remove(final_audio_path)
+                yield None, None, None, gr.update(visible=True), gr.update(visible=False)
+                return
+
+            progress(1.0, desc="Saving generated files...")
+            with open(final_json_path, "w") as f: json.dump(timestamps, f, indent=2)
             try:
+                drive_save(final_audio_path)
+                drive_save(final_json_path)
+            except Exception as e: print(f"Error saving files to Google Drive: {e}")
 
+            message = "Partial" if self.stop_generation else "Full"
+            print(f"\n✨ {message} generation successful!\n🎵 Audio: {final_audio_path}\n📄 Timestamps: {final_json_path}\n")
+
             self.is_generating = False
+            yield final_audio_path, final_audio_path, final_json_path, gr.update(visible=True), gr.update(visible=False)
 
         except Exception as e:
             self.is_generating = False
             print(f"❌ An unexpected error occurred: {str(e)}")
             traceback.print_exc()
+            try:
+                if final_audio_path and os.path.exists(final_audio_path): os.remove(final_audio_path)
+                if final_json_path and os.path.exists(final_json_path): os.remove(final_json_path)
+            except Exception as cleanup_e: print(f"Error during cleanup after exception: {cleanup_e}")
+            yield None, None, None, gr.update(visible=True), gr.update(visible=False)
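
For orientation, the timestamps JSON written above holds one entry per generated line, keyed by line number; the values here are illustrative:

{
  "1": {"text": "Hi everyone, welcome back.", "speaker_id": 1, "start": 0.0, "end": 3.52},
  "2": {"text": "Thanks for tuning in.", "speaker_id": 2, "start": 3.52, "end": 5.87}
}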

     def stop_audio_generation(self):
         if self.is_generating:
 
                 with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
                     script = f.read().strip()
                 if script: self.example_scripts.append([self._get_num_speakers_from_script(script), script])
+            except Exception as e: print(f"Error loading example {txt_file}: {e}")
 
     def _get_num_speakers_from_script(self, script: str) -> int:
         speakers = set(re.findall(r'^Speaker\s+(\d+)\s*:', script, re.MULTILINE | re.IGNORECASE))
         return max(int(s) for s in speakers) if speakers else 1
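
Note that the count is the highest speaker number found, not the number of distinct labels:

# Illustration: "Speaker 1: hi\nSpeaker 3: yo" -> 3, although only two labels appear.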

 def create_demo_interface(demo_instance: VibeVoiceDemo):
+    with gr.Blocks(title="VibeVoice AI Podcast Generator") as interface:
         gr.HTML("""
         <div style="text-align: center; margin: 20px auto; max-width: 800px;">
             <h1 style="font-size: 2.5em; margin-bottom: 10px;">🎙️ Vibe Podcasting</h1>
+            <p style="font-size: 1.2em; color: #555; margin-bottom: 15px;">Generate Long-form Multi-speaker AI Podcasts with VibeVoice</p>
+            <a href="https://colab.research.google.com/github/NeuralFalconYT/AI-Podcast-Generator/blob/main/VibeVoice_Colab.ipynb" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">🥳 Run on Google Colab</a>
         </div>
         """)
 
         with gr.Row():
             with gr.Column(scale=1):
                 with gr.Group():
                     gr.Markdown("### 🎛️ Podcast Settings")
                     num_speakers = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="Number of Speakers")
                     gr.Markdown("### 🎭 Speaker Selection")
                     speaker_selections = []
                     available_voices = list(demo_instance.available_voices.keys())
                         val = defaults[i] if i < len(defaults) and defaults[i] in available_voices else None
                         speaker = gr.Dropdown(choices=available_voices, value=val, label=f"Speaker {i+1}", visible=(i < 2))
                         speaker_selections.append(speaker)
                 with gr.Accordion("🎤 Upload Custom Voices", open=False):
                     upload_audio = gr.File(label="Upload Voice Samples", file_count="multiple", file_types=["audio"])
                     process_upload_btn = gr.Button("Add Uploaded Voices to Speaker Selection")
                 with gr.Accordion("⚙️ Advanced Settings", open=False):
                     cfg_scale = gr.Slider(minimum=1.0, maximum=2.0, value=1.3, step=0.05, label="CFG Scale")
                     remove_silence_checkbox = gr.Checkbox(label="Trim Silence from Podcast", value=False,)
             with gr.Column(scale=2):
                 with gr.Group():
                     gr.Markdown("### 📝 Script Input")
+                    script_input = gr.Textbox(
+                        label="Conversation Script",
+                        placeholder="Speaker 1: Hi everyone, I'm Alex, and welcome back.\nSpeaker 2: And I'm Lisa. Thanks for tuning in.",
+                        lines=10
+                    )
                     with gr.Row():
                         random_example_btn = gr.Button("🎲 Random Example", scale=1)
                         generate_btn = gr.Button("🚀 Generate Podcast", variant="primary", scale=2)
                     stop_btn = gr.Button("🛑 Stop Generation", variant="stop", visible=False)
                     gr.Markdown("### 🎵 **Generated Output**")
                     audio_output = gr.Audio(label="Play Generated Podcast")
                     with gr.Accordion("📦 Download Files", open=False):
                         download_file = gr.File(label="Download Audio File (.wav)")
                         json_file_output = gr.File(label="Download Timestamps (.json)")
 
+        with gr.Accordion("💡 Usage Tips & Examples", open=False):
+            gr.Markdown("""- **Upload Your Own Voices:** Create your own podcast with custom voice samples. \n- **Timestamps:** Useful if you want to generate a video using Wan2.2 or other tools. The timestamps let you automatically separate each speaker (splitting the long podcast into smaller chunks), pass the audio clips to your video generation model, and then merge the generated video clips into a full podcast video (e.g., using FFmpeg + any video generation model such as image+audio → video).""")
         gr.Examples(examples=demo_instance.example_scripts, inputs=[num_speakers, script_input], label="Try these example scripts:")
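
As a concrete follow-on to the timestamps tip above, a hedged sketch (assumes pydub and the JSON layout shown earlier; paths are illustrative) that cuts the finished podcast into per-line clips:

# Sketch: split the final .wav into per-line clips using the timestamps JSON.
import json
from pydub import AudioSegment

audio = AudioSegment.from_wav("podcast_audio/my_show_ab12cd.wav")
with open("podcast_audio/my_show_ab12cd.json") as f:
    stamps = json.load(f)
for key, seg in stamps.items():
    clip = audio[int(seg["start"] * 1000):int(seg["end"] * 1000)]  # pydub slices in milliseconds
    clip.export(f"line_{key}_speaker{seg['speaker_id']}.wav", format="wav")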

         def process_and_refresh_voices(uploaded_files):
             if not uploaded_files: return [gr.update() for _ in speaker_selections] + [None]
             voices_dir = os.path.join(os.path.dirname(__file__), "voices")
             return [gr.update(choices=new_choices) for _ in speaker_selections] + [None]
 
         def update_speaker_visibility(num):
+            return [gr.update(visible=(i < int(num))) for i in range(4)]
 
         num_speakers.change(fn=update_speaker_visibility, inputs=num_speakers, outputs=speaker_selections)
         process_upload_btn.click(fn=process_and_refresh_voices, inputs=upload_audio, outputs=speaker_selections + [upload_audio])
 
+        generate_btn.click(
             fn=demo_instance.generate_podcast_with_timestamps,
             inputs=[num_speakers, script_input] + speaker_selections + [cfg_scale, remove_silence_checkbox],
             outputs=[audio_output, download_file, json_file_output, generate_btn, stop_btn],
         )
+
+        stop_btn.click(
+            fn=demo_instance.stop_audio_generation
+        )
 
         def load_random_example():
             import random
             return random.choice(demo_instance.example_scripts) if demo_instance.example_scripts else (2, "Speaker 0: No examples loaded.")
         random_example_btn.click(fn=load_random_example, outputs=[num_speakers, script_input])
 
     return interface
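
Because generate_podcast_with_timestamps is a generator, Gradio streams each yield to the five outputs wired above, which is how the Generate/Stop buttons swap at the start and end of a run. A stripped-down sketch of the same pattern (hypothetical handler, not the app's actual components):

# Sketch of the generator-handler pattern; gr is gradio.
def handler():
    yield gr.update(visible=False), gr.update(visible=True)   # hide Generate, show Stop
    # ... long-running generation work ...
    yield gr.update(visible=True), gr.update(visible=False)   # restore buttons when done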
 



 def build_conversation_prompt(topic, *speaker_names):
     """
     Generates the final prompt. It takes the topic and a variable number of speaker names.
 
     prompt = f"""
 You are a professional podcast scriptwriter.
 Write a natural, engaging conversation between {num_speakers} speakers on the topic: "{topic}".
 {speaker_mapping_str}
 Formatting Rules:
 - You MUST always format dialogue with {', '.join(speaker_labels)} ONLY.
 {introductions_str}
 - During the conversation, they may occasionally mention each other's names ({', '.join(names)}) naturally in the dialogue, but the labels must remain unchanged.
 - Do not add narration, descriptions, or any extra formatting.
 {example_str}
 """
     return prompt
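
A hedged example of calling this helper (topic and names illustrative; it accepts any number of names via *speaker_names):

# Illustrative call; returns the prompt string passed to a script-writing LLM.
prompt = build_conversation_prompt("The future of open-source AI", "Alex", "Lisa")
print(prompt[:120])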
 
     return demo
 
 
+
 import click
 @click.command()
+@click.option("--model_path", default="microsoft/VibeVoice-1.5B", help="Hugging Face Model Repo ID.")
+@click.option("--inference_steps", default=10, show_default=True, type=int, help="Number of inference steps for generation.")
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
 def main(model_path, inference_steps, debug, share):
     # model_folder = download_model(model_path, download_folder="./", redownload=False)
     model_folder=model_path
     device = "cuda" if torch.cuda.is_available() else "cpu"
     set_seed(42)
+    print("🎙️ Initializing VibeVoice ...")
+    demo_instance = VibeVoiceDemo(model_path=model_folder, device=device, inference_steps=inference_steps)
+    custom_css = """.gradio-container { font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif; }"""
     demo1 = create_demo_interface(demo_instance)
+    demo2 = ui2()
     demo = gr.TabbedInterface([demo1, demo2],["Vibe Podcasting","Generate Sample Podcast Script"],title="",theme=gr.themes.Soft(),css=custom_css)
 
     print("🚀 Launching Gradio Demo...")
     demo.queue().launch(debug=debug, share=share)
 
 if __name__ == "__main__":
+    main()
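
For completeness, the launch command removed from the end of the old file still documents how the flags map to the click options above:

# !python /content/VibeVoice/demo/colab.py --model_path microsoft/VibeVoice-1.5B --inference_steps 10 --debug --share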