ACloudCenter committed
Commit c7cf6c1
Parent: 9ae9cad

Modify feedback components and checks.

Files changed (2):
  1. app.py +122 -22
  2. backend_modal/modal_runner.py +42 -9
app.py CHANGED
@@ -110,6 +110,42 @@ theme = gr.themes.Ocean(
 ).set(
     button_large_radius='*radius_sm'
 )
+
+AUDIO_LABEL_DEFAULT = "Complete Conference (Download)"
+PRIMARY_STAGE_MESSAGES = {
+    "connecting": ("🚀 Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
+    "queued": ("🚦 Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
+    "loading_model": ("📦 Loading Model", "Streaming VibeVoice weights to the GPU."),
+    "loading_voices": ("🎙️ Loading Voices", None),
+    "preparing_inputs": ("📝 Preparing Script", "Formatting the conversation for the model."),
+    "generating_audio": ("🎧 Generating Audio", "Synthesizing speech — this is the longest step."),
+    "processing_audio": ("✨ Finalizing Audio", "Converting tensors into a playable waveform."),
+    "complete": ("✅ Ready", "Press play below or download your conference."),
+    "error": ("❌ Error", "Check the log for details."),
+}
+AUDIO_STAGE_LABELS = {
+    "connecting": "Complete Conference (requesting GPU...)",
+    "queued": "Complete Conference (GPU warming up...)",
+    "loading_model": "Complete Conference (loading model...)",
+    "loading_voices": "Complete Conference (loading voices...)",
+    "preparing_inputs": "Complete Conference (preparing inputs...)",
+    "generating_audio": "Complete Conference (generating audio...)",
+    "processing_audio": "Complete Conference (finalizing audio...)",
+    "error": "Complete Conference (error)",
+}
+READY_PRIMARY_STATUS = "### Ready\nPress **Generate** to run VibeVoice."
+
+
+def build_primary_status(stage: str, status_line: str) -> str:
+    title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("⚙️ Working", "Processing..."))
+    desc_parts = []
+    if default_desc:
+        desc_parts.append(default_desc)
+    if status_line and status_line not in desc_parts:
+        desc_parts.append(status_line)
+    desc = "\n\n".join(desc_parts) if desc_parts else status_line
+    return f"### {title}\n{desc}"
+

 def create_demo_interface():
     with gr.Blocks(
@@ -128,8 +164,12 @@ def create_demo_interface():
         with gr.Tabs():
             with gr.Tab("Generate"):
                 gr.Markdown("### Generated Conference")
+                primary_status = gr.Markdown(
+                    value=READY_PRIMARY_STATUS,
+                    elem_id="primary-status",
+                )
                 complete_audio_output = gr.Audio(
-                    label="Complete Conference (Download)",
+                    label=AUDIO_LABEL_DEFAULT,
                     type="numpy",
                     autoplay=False,
                     show_download_button=True,
@@ -223,7 +263,7 @@
                 )
                 with gr.Row():
                     status_display = gr.Markdown(
-                        value="Status: idle.",
+                        value="**Idle**\nPress generate to get started.",
                         elem_id="status-display",
                     )
                     progress_slider = gr.Slider(
@@ -317,23 +357,37 @@
         def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
             if remote_generate_function is None:
                 error_message = "ERROR: Modal function not deployed. Please contact the space owner."
-                yield None, error_message, "Status: error.", gr.update(value=0)
+                primary_error = build_primary_status("error", "Modal backend is offline.")
+                yield (
+                    gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
+                    error_message,
+                    "**Error**\nModal backend unavailable.",
+                    gr.update(value=0),
+                    primary_error,
+                )
                 return

-            # Show a message that we are calling the remote function
+            connecting_status_line = "Provisioning GPU resources... cold starts can take up to a minute."
+            primary_connecting = build_primary_status("connecting", connecting_status_line)
+            status_detail = "**Connecting**\nRequesting GPU resources…"
+
             yield (
-                None,
+                gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
                 "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
-                "**Connecting**\nRequesting GPU resources…",
-                gr.update(value=0),
+                status_detail,
+                gr.update(value=1),
+                primary_connecting,
             )

             try:
                 speakers = speakers_and_params[:4]
                 cfg_scale_val = speakers_and_params[4]
                 current_log = ""
-                last_pct = 0
-                last_status = "**Connecting**\nRequesting GPU resources…"
+                last_pct = 1
+                last_status = status_detail
+                last_primary = primary_connecting
+                last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
+                last_stage = "connecting"

                 # Stream updates from the Modal function
                 for update in remote_generate_function.remote_gen(
@@ -352,49 +406,95 @@
                     if isinstance(update, dict):
                         audio_payload = update.get("audio")
                         progress_pct = update.get("pct", last_pct)
-                        stage_label = update.get("stage", "").replace("_", " ").title() or "Status"
-                        status_line = update.get("status") or "Processing"
+                        stage_key = update.get("stage", last_stage) or last_stage
+                        status_line = update.get("status") or "Processing..."
                         current_log = update.get("log", current_log)

+                        stage_label = stage_key.replace("_", " ").title() if stage_key else "Status"
                         status_formatted = f"**{stage_label}**\n{status_line}"
-                        audio_output = audio_payload if audio_payload is not None else gr.update()
+                        progress_value = max(0, min(100, int(round(progress_pct))))

-                        last_pct = progress_pct
-                        last_status = status_formatted
+                        audio_label = AUDIO_STAGE_LABELS.get(stage_key)
+                        if not audio_label:
+                            audio_label = f"Complete Conference ({stage_label.lower()})" if stage_label else AUDIO_LABEL_DEFAULT
+                        if stage_key == "complete":
+                            audio_label = AUDIO_LABEL_DEFAULT
+                        if stage_key == "error":
+                            progress_value = 0
+
+                        primary_value = build_primary_status(stage_key, status_line)
+
+                        audio_update = gr.update(label=audio_label)
+                        if audio_payload is not None:
+                            audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)

                         yield (
-                            audio_output,
+                            audio_update,
                             current_log,
                             status_formatted,
-                            gr.update(value=progress_pct),
+                            gr.update(value=progress_value),
+                            primary_value,
                         )
+
+                        last_pct = progress_value
+                        last_status = status_formatted
+                        last_primary = primary_value
+                        last_audio_label = audio_label
+                        last_stage = stage_key
                     else:
                         # Backwards compatibility: older backend returns (audio, log)
                         audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
+                        status_line = None
                         if log_text:
                             current_log = log_text
-                        audio_output = audio_payload if audio_payload is not None else gr.update()
+                            status_line = log_text.splitlines()[-1]
+                        if not status_line:
+                            status_line = "Processing..."
+
+                        if audio_payload is not None:
+                            progress_value = 100
+                            audio_label = AUDIO_LABEL_DEFAULT
+                            primary_value = build_primary_status("complete", "Conference ready to download.")
+                            status_formatted = "**Complete**\nConference ready to download."
+                        else:
+                            progress_value = max(last_pct, 70)
+                            audio_label = AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)
+                            primary_value = build_primary_status("generating_audio", status_line)
+                            status_formatted = f"**Streaming**\n{status_line}"
+
+                        audio_update = gr.update(label=audio_label)
+                        if audio_payload is not None:
+                            audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
+
+                        last_pct = progress_value
+                        last_status = status_formatted
+                        last_primary = primary_value
+                        last_audio_label = audio_label
+
                         yield (
-                            audio_output,
+                            audio_update,
                             current_log,
-                            last_status,
-                            gr.update(value=last_pct),
+                            status_formatted,
+                            gr.update(value=progress_value),
+                            primary_value,
                         )
             except Exception as e:
                 tb = traceback.format_exc()
                 print(f"Error calling Modal: {e}")
                 error_log = f"❌ An error occurred: {e}\n\n{tb}"
+                primary_error = build_primary_status("error", "Inference failed.")
                 yield (
-                    None,
+                    gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                     error_log,
                     "**Error**\nInference failed.",
                     gr.update(value=0),
+                    primary_error,
                 )

         generate_btn.click(
             fn=generate_podcast_wrapper,
             inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-            outputs=[complete_audio_output, log_output, status_display, progress_slider]
+            outputs=[complete_audio_output, log_output, status_display, progress_slider, primary_status]
         )

         with gr.Tab("Architecture"):
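
The app.py change above turns the wrapper into a five-output stream (audio, log, detailed status, progress, primary status) and leans on gr.update(...) so the Audio component can be relabeled per stage without discarding whatever value it already holds. As a rough illustration of that pattern, here is a minimal, self-contained sketch; the component names and the fake progress loop are hypothetical, not the app's real layout or Modal call, and it assumes Gradio's generator-based event handlers and gr.update partial updates behave as in recent Gradio releases.

import time
import gradio as gr

STAGES = ("connecting", "loading_model", "generating_audio", "complete")

def fake_stream():
    # One yield per progress update; each tuple element maps onto outputs=[...] below.
    for pct, stage in zip((5, 30, 80, 100), STAGES):
        pretty = stage.replace("_", " ")
        yield (
            gr.update(label=f"Complete Conference ({pretty}...)"),  # relabel audio, keep its value
            f"log: {stage}",
            f"**{pretty.title()}**\nworking...",
            gr.update(value=pct),
        )
        time.sleep(1)

with gr.Blocks() as demo:
    audio = gr.Audio(label="Complete Conference (Download)", type="numpy")
    log = gr.Textbox(label="Log")
    status = gr.Markdown("**Idle**")
    progress = gr.Slider(0, 100, value=0, label="Progress")
    gr.Button("Generate").click(fake_stream, inputs=None, outputs=[audio, log, status, progress])

if __name__ == "__main__":
    demo.launch()

Each yielded tuple element maps positionally onto the handler's outputs=[...] list, which is why the commit also appends primary_status to the outputs of generate_btn.click.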
backend_modal/modal_runner.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import time
+import threading
 import numpy as np
 import librosa
 import soundfile as sf
@@ -407,17 +408,49 @@ class VibeVoiceModel:
             status="Running VibeVoice diffusion (this may take 1-2 minutes)…",
             log_text=log_text,
         )
+
         start_time = time.time()
-
-        with torch.inference_mode():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=processor.tokenizer,
-                generation_config={'do_sample': False},
-                verbose=False,
+        result_container = {}
+        exception_container = {}
+
+        def _run_generation():
+            try:
+                with torch.inference_mode():
+                    result_container['outputs'] = model.generate(
+                        **inputs,
+                        max_new_tokens=None,
+                        cfg_scale=cfg_scale,
+                        tokenizer=processor.tokenizer,
+                        generation_config={'do_sample': False},
+                        verbose=False,
+                    )
+            except Exception as gen_err:
+                exception_container['error'] = gen_err
+
+        generation_thread = threading.Thread(target=_run_generation, daemon=True)
+        generation_thread.start()
+
+        # Emit keep-alive progress while the heavy generation is running
+        while generation_thread.is_alive():
+            elapsed = time.time() - start_time
+            status_msg = f"Running VibeVoice diffusion… {int(elapsed)}s elapsed"
+            pct_hint = min(88, 70 + int(elapsed // 5))
+            yield self._emit_progress(
+                stage="generating_audio",
+                pct=pct_hint,
+                status=status_msg,
+                log_text=log_text,
             )
+            time.sleep(5)
+
+        generation_thread.join()
+        if 'error' in exception_container:
+            raise exception_container['error']
+
+        outputs = result_container.get('outputs')
+        if outputs is None:
+            raise RuntimeError("Generation thread finished without producing outputs.")
+
         generation_time = time.time() - start_time

         log_lines.append(f"Generation completed in {generation_time:.2f} seconds")
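
The modal_runner.py change moves the blocking model.generate(...) call onto a worker thread so the surrounding generator can keep yielding _emit_progress heartbeats during the minutes-long diffusion step instead of going silent. Below is a minimal, self-contained sketch of that keep-alive pattern using only the standard library; slow_task and the dictionary payloads are hypothetical stand-ins for the real model call and progress schema.

import threading
import time

def slow_task(seconds: float = 12.0) -> str:
    time.sleep(seconds)  # stand-in for the blocking model.generate(...) call
    return "done"

def generate_with_heartbeat():
    result, error = {}, {}

    def _worker():
        # Threads cannot return values directly, so stash the outcome in shared dicts.
        try:
            result["value"] = slow_task()
        except Exception as exc:
            error["value"] = exc

    thread = threading.Thread(target=_worker, daemon=True)
    thread.start()
    started = time.time()
    while thread.is_alive():
        # Keep-alive update with a capped, time-based progress hint, like pct_hint above.
        elapsed = int(time.time() - started)
        yield {"stage": "generating_audio", "pct": min(88, 70 + elapsed // 5), "status": f"{elapsed}s elapsed"}
        time.sleep(1)
    thread.join()
    if "value" in error:
        raise error["value"]  # re-raise the worker's exception in the caller's thread
    yield {"stage": "complete", "pct": 100, "result": result["value"]}

if __name__ == "__main__":
    for update in generate_with_heartbeat():
        print(update)

Passing results and exceptions back through shared dicts and re-raising after join() mirrors the commit's result_container/exception_container approach: the worker thread cannot return a value or propagate an exception directly, so the main generator surfaces both, keeping error handling in the caller's existing try/except.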