Commit · c7cf6c1
Parent(s): 9ae9cad
Modify feedback comps and checks.
- app.py +122 -22
- backend_modal/modal_runner.py +42 -9
app.py
CHANGED
@@ -110,6 +110,42 @@ theme = gr.themes.Ocean(
).set(
    button_large_radius='*radius_sm'
)
+
+AUDIO_LABEL_DEFAULT = "Complete Conference (Download)"
+PRIMARY_STAGE_MESSAGES = {
+    "connecting": ("🚀 Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
+    "queued": ("🚦 Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
+    "loading_model": ("📦 Loading Model", "Streaming VibeVoice weights to the GPU."),
+    "loading_voices": ("🎙️ Loading Voices", None),
+    "preparing_inputs": ("📝 Preparing Script", "Formatting the conversation for the model."),
+    "generating_audio": ("🎧 Generating Audio", "Synthesizing speech — this is the longest step."),
+    "processing_audio": ("✨ Finalizing Audio", "Converting tensors into a playable waveform."),
+    "complete": ("✅ Ready", "Press play below or download your conference."),
+    "error": ("❌ Error", "Check the log for details."),
+}
+AUDIO_STAGE_LABELS = {
+    "connecting": "Complete Conference (requesting GPU...)",
+    "queued": "Complete Conference (GPU warming up...)",
+    "loading_model": "Complete Conference (loading model...)",
+    "loading_voices": "Complete Conference (loading voices...)",
+    "preparing_inputs": "Complete Conference (preparing inputs...)",
+    "generating_audio": "Complete Conference (generating audio...)",
+    "processing_audio": "Complete Conference (finalizing audio...)",
+    "error": "Complete Conference (error)",
+}
+READY_PRIMARY_STATUS = "### Ready\nPress **Generate** to run VibeVoice."
+
+
+def build_primary_status(stage: str, status_line: str) -> str:
+    title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("⚙️ Working", "Processing..."))
+    desc_parts = []
+    if default_desc:
+        desc_parts.append(default_desc)
+    if status_line and status_line not in desc_parts:
+        desc_parts.append(status_line)
+    desc = "\n\n".join(desc_parts) if desc_parts else status_line
+    return f"### {title}\n{desc}"
+

def create_demo_interface():
    with gr.Blocks(
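The hunk above adds the stage lookup tables and the `build_primary_status` helper. As a quick, illustrative check of what the helper returns (the stage keys come from PRIMARY_STAGE_MESSAGES above; the status-line arguments here are invented):

```python
# Illustrative usage only; assumes the definitions from the hunk above are in scope.
print(build_primary_status("loading_model", "Weights cached on volume."))
# ### 📦 Loading Model
# Streaming VibeVoice weights to the GPU.
#
# Weights cached on volume.

print(build_primary_status("not_a_real_stage", "Still working"))
# Unknown stages fall back to the ("⚙️ Working", "Processing...") pair:
# ### ⚙️ Working
# Processing...
#
# Still working
```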
@@ -128,8 +164,12 @@ def create_demo_interface():
        with gr.Tabs():
            with gr.Tab("Generate"):
                gr.Markdown("### Generated Conference")
+                primary_status = gr.Markdown(
+                    value=READY_PRIMARY_STATUS,
+                    elem_id="primary-status",
+                )
                complete_audio_output = gr.Audio(
-                    label=
+                    label=AUDIO_LABEL_DEFAULT,
                    type="numpy",
                    autoplay=False,
                    show_download_button=True,
@@ -223,7 +263,7 @@ def create_demo_interface():
            )
            with gr.Row():
                status_display = gr.Markdown(
-                    value="
+                    value="**Idle**\nPress generate to get started.",
                    elem_id="status-display",
                )
                progress_slider = gr.Slider(
@@ -317,23 +357,37 @@ def create_demo_interface():
        def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
            if remote_generate_function is None:
                error_message = "ERROR: Modal function not deployed. Please contact the space owner."
+                primary_error = build_primary_status("error", "Modal backend is offline.")
+                yield (
+                    gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
+                    error_message,
+                    "**Error**\nModal backend unavailable.",
+                    gr.update(value=0),
+                    primary_error,
+                )
                return

+            connecting_status_line = "Provisioning GPU resources... cold starts can take up to a minute."
+            primary_connecting = build_primary_status("connecting", connecting_status_line)
+            status_detail = "**Connecting**\nRequesting GPU resources…"
+
            yield (
+                gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
                "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
-                gr.update(value=
+                status_detail,
+                gr.update(value=1),
+                primary_connecting,
            )

            try:
                speakers = speakers_and_params[:4]
                cfg_scale_val = speakers_and_params[4]
                current_log = ""
-                last_pct =
-                last_status =
+                last_pct = 1
+                last_status = status_detail
+                last_primary = primary_connecting
+                last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
+                last_stage = "connecting"

                # Stream updates from the Modal function
                for update in remote_generate_function.remote_gen(
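The hunk above widens the wrapper's yield from four slots to five. A minimal sketch of that contract, using the component names from this diff with placeholder values:

```python
import gradio as gr

def example_status_yield():
    # Slot order must match the click() wiring shown in the last hunk below:
    # [complete_audio_output, log_output, status_display, progress_slider, primary_status]
    yield (
        gr.update(label="Complete Conference (requesting GPU...)"),  # audio: relabel only, no value yet
        "raw log text",                                              # log_output
        "**Connecting**\nRequesting GPU resources…",                 # status_display
        gr.update(value=1),                                          # progress_slider
        "### 🚀 Request Submitted\nProvisioning GPU resources...",   # primary_status
    )
```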
@@ -352,49 +406,95 @@ def create_demo_interface():
                    if isinstance(update, dict):
                        audio_payload = update.get("audio")
                        progress_pct = update.get("pct", last_pct)
+                        stage_key = update.get("stage", last_stage) or last_stage
-                        status_line = update.get("status") or "Processing
+                        status_line = update.get("status") or "Processing..."
                        current_log = update.get("log", current_log)

+                        stage_label = stage_key.replace("_", " ").title() if stage_key else "Status"
                        status_formatted = f"**{stage_label}**\n{status_line}"
+                        progress_value = max(0, min(100, int(round(progress_pct))))

+                        audio_label = AUDIO_STAGE_LABELS.get(stage_key)
+                        if not audio_label:
+                            audio_label = f"Complete Conference ({stage_label.lower()})" if stage_label else AUDIO_LABEL_DEFAULT
+                        if stage_key == "complete":
+                            audio_label = AUDIO_LABEL_DEFAULT
+                        if stage_key == "error":
+                            progress_value = 0
+
+                        primary_value = build_primary_status(stage_key, status_line)
+
+                        audio_update = gr.update(label=audio_label)
+                        if audio_payload is not None:
+                            audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)

                        yield (
+                            audio_update,
                            current_log,
                            status_formatted,
-                            gr.update(value=
+                            gr.update(value=progress_value),
+                            primary_value,
                        )
+
+                        last_pct = progress_value
+                        last_status = status_formatted
+                        last_primary = primary_value
+                        last_audio_label = audio_label
+                        last_stage = stage_key
                    else:
                        # Backwards compatibility: older backend returns (audio, log)
                        audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
+                        status_line = None
                        if log_text:
                            current_log = log_text
+                            status_line = log_text.splitlines()[-1]
+                        if not status_line:
+                            status_line = "Processing..."
+
+                        if audio_payload is not None:
+                            progress_value = 100
+                            audio_label = AUDIO_LABEL_DEFAULT
+                            primary_value = build_primary_status("complete", "Conference ready to download.")
+                            status_formatted = "**Complete**\nConference ready to download."
+                        else:
+                            progress_value = max(last_pct, 70)
+                            audio_label = AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)
+                            primary_value = build_primary_status("generating_audio", status_line)
+                            status_formatted = f"**Streaming**\n{status_line}"
+
+                        audio_update = gr.update(label=audio_label)
+                        if audio_payload is not None:
+                            audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
+
+                        last_pct = progress_value
+                        last_status = status_formatted
+                        last_primary = primary_value
+                        last_audio_label = audio_label
+
                        yield (
+                            audio_update,
                            current_log,
-                            gr.update(value=
+                            status_formatted,
+                            gr.update(value=progress_value),
+                            primary_value,
                        )
            except Exception as e:
                tb = traceback.format_exc()
                print(f"Error calling Modal: {e}")
                error_log = f"❌ An error occurred: {e}\n\n{tb}"
+                primary_error = build_primary_status("error", "Inference failed.")
                yield (
+                    gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                    error_log,
                    "**Error**\nInference failed.",
                    gr.update(value=0),
+                    primary_error,
                )

        generate_btn.click(
            fn=generate_podcast_wrapper,
            inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-            outputs=[complete_audio_output, log_output, status_display, progress_slider]
+            outputs=[complete_audio_output, log_output, status_display, progress_slider, primary_status]
        )

        with gr.Tab("Architecture"):
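The dict branch above consumes progress payloads keyed `stage`/`pct`/`status`/`log`/`audio`. A hypothetical stub backend for exercising the UI without Modal could emit the same shape (all values here are invented; the 24 kHz placeholder is an assumption):

```python
import time
import numpy as np

def fake_remote_gen():
    """Hypothetical stand-in for remote_generate_function.remote_gen(...).
    Yields dicts with the keys the wrapper reads: stage, pct, status, log, audio."""
    log = ""
    for stage, pct, status in [
        ("connecting", 1, "Requesting GPU"),
        ("loading_model", 30, "Loading VibeVoice weights"),
        ("generating_audio", 70, "Synthesizing speech"),
        ("processing_audio", 90, "Converting tensors to a waveform"),
    ]:
        log += status + "\n"
        yield {"stage": stage, "pct": pct, "status": status, "log": log, "audio": None}
        time.sleep(0.1)
    # Final payload carries audio as (sample_rate, samples), the shape gr.Audio(type="numpy") accepts
    silence = np.zeros(24000, dtype=np.float32)  # one second of placeholder silence
    yield {"stage": "complete", "pct": 100, "status": "Done", "log": log, "audio": (24000, silence)}
```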
backend_modal/modal_runner.py
CHANGED
@@ -1,5 +1,6 @@
import os
import time
+import threading
import numpy as np
import librosa
import soundfile as sf
@@ -407,17 +408,49 @@ class VibeVoiceModel:
            status="Running VibeVoice diffusion (this may take 1-2 minutes)…",
            log_text=log_text,
        )
+
        start_time = time.time()
+        result_container = {}
+        exception_container = {}
+
+        def _run_generation():
+            try:
+                with torch.inference_mode():
+                    result_container['outputs'] = model.generate(
+                        **inputs,
+                        max_new_tokens=None,
+                        cfg_scale=cfg_scale,
+                        tokenizer=processor.tokenizer,
+                        generation_config={'do_sample': False},
+                        verbose=False,
+                    )
+            except Exception as gen_err:
+                exception_container['error'] = gen_err
+
+        generation_thread = threading.Thread(target=_run_generation, daemon=True)
+        generation_thread.start()
+
+        # Emit keep-alive progress while the heavy generation is running
+        while generation_thread.is_alive():
+            elapsed = time.time() - start_time
+            status_msg = f"Running VibeVoice diffusion… {int(elapsed)}s elapsed"
+            pct_hint = min(88, 70 + int(elapsed // 5))
+            yield self._emit_progress(
+                stage="generating_audio",
+                pct=pct_hint,
+                status=status_msg,
+                log_text=log_text,
            )
+            time.sleep(5)
+
+        generation_thread.join()
+        if 'error' in exception_container:
+            raise exception_container['error']
+
+        outputs = result_container.get('outputs')
+        if outputs is None:
+            raise RuntimeError("Generation thread finished without producing outputs.")
+
        generation_time = time.time() - start_time

        log_lines.append(f"Generation completed in {generation_time:.2f} seconds")
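The modal_runner change above moves the blocking model.generate call onto a daemon thread so the generator can keep streaming progress while inference runs. The same pattern in isolation, as a minimal sketch (run_with_heartbeats and the dummy task are invented names, not part of this repo):

```python
import threading
import time

def run_with_heartbeats(blocking_fn, interval=5.0):
    """Run blocking_fn on a daemon thread; yield heartbeat strings until it
    finishes, then re-raise any exception captured in the worker."""
    result, error = {}, {}

    def _worker():
        try:
            result["value"] = blocking_fn()
        except Exception as exc:  # captured here, re-raised on the caller's thread
            error["value"] = exc

    thread = threading.Thread(target=_worker, daemon=True)
    thread.start()
    start = time.time()
    while thread.is_alive():
        yield f"still running... {int(time.time() - start)}s elapsed"
        time.sleep(interval)
    thread.join()
    if "value" in error:
        raise error["value"]
    yield f"done in {time.time() - start:.2f}s; result={result['value']!r}"

# Example: a dummy 6-second task standing in for model.generate
for beat in run_with_heartbeats(lambda: time.sleep(6) or "audio-tensor", interval=2.0):
    print(beat)
```

Capturing the worker's exception in a dict and re-raising it after join() keeps the original traceback on the consuming side, which is what lets the app.py wrapper surface backend failures in its error yield.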