Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors Claude Sonnet 4.6 committed 4 days ago

Commit

b60f330

1 Parent(s): 12556c0

feat: cross-model segment regen — regenerate any slot with TARO, MMAudio, or HunyuanFoley

Popup now shows three model buttons (TARO / MMAudio / Hunyuan) instead of
one generic Regenerate. Clicking a different model's button fires one of
three new shared xregen_* Gradio endpoints (xregen_taro, xregen_mmaudio,
xregen_hunyuan) that accept slot_id as a plain string so results are
spliced back into the correct slot regardless of which tab generated it.

New _resample_to_slot_sr() helper resamples the incoming wav to match
the slot's original SR (TARO=16kHz, MMAudio=44.1kHz, Hunyuan=48kHz)
before _splice_and_save stitches it in, so any model can replace any
segment without sample-rate mismatch.

All new Gradio components/buttons use render=False to avoid adding to
the SSR component tree and triggering the 'Too many arguments' warning.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +237 -61

app.py CHANGED Viewed

@@ -1313,6 +1313,107 @@ MODEL_CONFIGS["mmaudio"]["regen_fn"] = regen_mmaudio_segment
 MODEL_CONFIGS["hunyuan"]["regen_fn"] = regen_hunyuan_segment
 # ================================================================== #
 #                        SHARED UI HELPERS                            #
 # ================================================================== #
@@ -1868,18 +1969,16 @@ _GLOBAL_JS = """
   // Fire regen for a given slot and segment by posting directly to the
   // Gradio queue API — bypasses Svelte binding entirely.
-  function fireRegen(slot_id, seg_idx) {
-    // Determine tab prefix from slot_id (e.g. "taro_0" -> "taro")
-    const prefix = slot_id.split('_')[0];
     const slotNum = parseInt(slot_id.split('_')[1], 10);
-    // Build api_name for this slot's regen handler
-    const apiName = 'regen_' + prefix + '_' + slotNum;
-    const fnIndex = getFnIndex(apiName);
-    if (fnIndex === undefined) {
-      console.warn('[fireRegen] fn_index not found for api_name:', apiName, 'cache:', _fnIndexCache);
-      return;
-    }
     // Read state_json from the waveform container data-state attribute
     const container = document.getElementById('wf_container_' + slot_id);
@@ -1889,48 +1988,58 @@ _GLOBAL_JS = """
       return;
     }
-    // Read current input values from DOM by elem_id
-    let data;
-    if (prefix === 'taro') {
-      const video = null;  // video is a file component — pass null, server uses its own state
-      data = [
-        seg_idx,
-        stateJson,
-        video,
-        readComponentValue('taro_seed'),
-        readComponentValue('taro_cfg'),
-        readComponentValue('taro_steps'),
-        readComponentValue('taro_mode'),
-        readComponentValue('taro_cf_dur'),
-        readComponentValue('taro_cf_db')
-      ];
-    } else if (prefix === 'mma') {
-      data = [
-        seg_idx,
-        stateJson,
-        null,  // video
-        readComponentValue('mma_prompt'),
-        readComponentValue('mma_neg'),
-        readComponentValue('mma_seed'),
-        readComponentValue('mma_cfg'),
-        readComponentValue('mma_steps'),
-        readComponentValue('mma_cf_dur'),
-        readComponentValue('mma_cf_db')
-      ];
     } else {
-      data = [
-        seg_idx,
-        stateJson,
-        null,  // video
-        readComponentValue('hf_prompt'),
-        readComponentValue('hf_neg'),
-        readComponentValue('hf_seed'),
-        readComponentValue('hf_guidance'),
-        readComponentValue('hf_steps'),
-        readComponentValue('hf_size'),
-        readComponentValue('hf_cf_dur'),
-        readComponentValue('hf_cf_db')
-      ];
     }
     console.log('[fireRegen] calling api', apiName, 'fn_index', fnIndex, 'seg', seg_idx);
@@ -2068,19 +2177,24 @@ _GLOBAL_JS = """
     _popup.style.cssText = 'display:none;position:fixed;z-index:99999;' +
       'background:#2a2a2a;border:1px solid #555;border-radius:6px;' +
       'padding:8px 12px;box-shadow:0 4px 16px rgba(0,0,0,.5);font-family:sans-serif;';
     _popup.innerHTML =
       '<div id="_wf_popup_lbl" style="color:#ccc;font-size:11px;margin-bottom:6px;white-space:nowrap;"></div>' +
-      '<button id="_wf_popup_btn" style="background:#1d6fa5;color:#fff;border:none;' +
-        'border-radius:4px;padding:5px 14px;font-size:12px;cursor:pointer;width:100%;">&#10227; Regenerate</button>';
     document.body.appendChild(_popup);
-    document.getElementById('_wf_popup_btn').onclick = function(e) {
-      e.stopPropagation();  // prevents the document bubble-phase listener below from firing
-      var slot = _pendingSlot, idx = _pendingIdx;  // capture before hidePopup clears them
-      hidePopup();
-      if (slot !== null && idx !== null) {
-        fireRegen(slot, idx);
-      }
-    };
     // Use bubble phase (false) so stopPropagation() on the button click prevents this from firing
     document.addEventListener('click', function() { hidePopup(); }, false);
     return _popup;
@@ -2308,5 +2422,67 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
     mma_video.change(fn=_sync,  inputs=[mma_video],  outputs=[taro_video, hf_video])
     hf_video.change(fn=_sync,   inputs=[hf_video],   outputs=[taro_video, mma_video])
 print("[startup] app.py fully loaded — regen handlers registered, SSR disabled")
 demo.queue(max_size=10).launch(ssr_mode=False, height=900, allowed_paths=["/tmp"])

 MODEL_CONFIGS["hunyuan"]["regen_fn"] = regen_hunyuan_segment
+# ================================================================== #
+#                  CROSS-MODEL REGEN WRAPPERS                        #
+# ================================================================== #
+# Three shared endpoints — one per model — that can be called from   #
+# *any* slot tab.  slot_id is passed as plain string data so the     #
+# result is applied back to the correct slot by the JS listener.     #
+# The new segment is resampled to match the slot's existing SR before #
+# being handed to _splice_and_save, so TARO (16 kHz) / MMAudio      #
+# (44.1 kHz) / Hunyuan (48 kHz) outputs can all be mixed freely.    #
+# ================================================================== #
+def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
+    """Convert *wav* from ``src_sr`` to ``dst_sr`` with torchaudio.
+
+    Accepts mono ``(T,)`` and stereo ``(C, T)`` numpy arrays.  When both
+    rates are equal the input object is handed back untouched; otherwise
+    the samples are cast to float32 for resampling and a new array is
+    returned.
+    """
+    if src_sr == dst_sr:
+        return wav
+    # Only a 2-D array is resampled as-is; anything else gets a temporary
+    # leading axis that is stripped again before returning.
+    needs_channel_axis = wav.ndim != 2
+    tensor = torch.from_numpy(np.ascontiguousarray(wav)).float()
+    if needs_channel_axis:
+        tensor = tensor.unsqueeze(0)    # (1, T)
+    resampled = torchaudio.functional.resample(tensor, src_sr, dst_sr)
+    if needs_channel_axis:
+        resampled = resampled.squeeze(0)    # (T,)
+    return resampled.numpy()
+def xregen_taro(seg_idx, state_json, slot_id,
+                seed_val, cfg_scale, num_steps, mode,
+                crossfade_s, crossfade_db):
+    """Cross-model regen: run TARO inference and splice into *slot_id*.
+
+    Args:
+        seg_idx: Index of the segment to regenerate; coerced with ``int()``.
+        state_json: JSON string holding the slot state; its ``"sr"`` entry
+            is read as the slot's sample rate.
+        slot_id: Identifier of the slot that receives the new audio.
+        seed_val, cfg_scale, num_steps, mode, crossfade_s, crossfade_db:
+            Forwarded unchanged to ``_regen_taro_gpu``.
+
+    Returns:
+        Two ``gr.update`` objects: the new video path and the waveform HTML.
+    """
+    meta    = json.loads(state_json)
+    # Coerce once up front so inference and splicing receive the same
+    # integer index, as xregen_mmaudio / xregen_hunyuan already do.
+    seg_idx = int(seg_idx)
+    slot_sr = int(meta["sr"])
+    # First argument is None, as in the other cross-model wrappers
+    # (presumably the video input, which is not available here -- confirm).
+    new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
+                                  seed_val, cfg_scale, num_steps, mode,
+                                  crossfade_s, crossfade_db, slot_id)
+    # The TARO result is treated as TARO_SR audio and converted to the
+    # slot's rate before it is stitched in.
+    new_wav = _resample_to_slot_sr(new_wav_raw, TARO_SR, slot_sr)
+    video_path, _audio_path, _updated_meta, waveform_html = _splice_and_save(
+        new_wav, seg_idx, meta, slot_id
+    )
+    return gr.update(value=video_path), gr.update(value=waveform_html)
+def xregen_mmaudio(seg_idx, state_json, slot_id,
+                   prompt, negative_prompt, seed_val,
+                   cfg_strength, num_steps, crossfade_s, crossfade_db):
+    """Cross-model regen: run MMAudio inference and splice into *slot_id*.
+
+    Cuts the requested segment out of the slot's silent video into a
+    scratch directory, runs ``_regen_mmaudio_gpu`` on it, resamples the
+    result to the slot's sample rate and splices it in.  The scratch
+    directory is removed once inference finishes or fails.
+
+    Args:
+        seg_idx: Index into ``meta["segments"]``; coerced with ``int()``.
+        state_json: JSON string holding the slot state; the keys
+            ``"segments"``, ``"sr"`` and ``"silent_video"`` are read.
+        slot_id: Identifier of the slot that receives the new audio.
+        prompt, negative_prompt, seed_val, cfg_strength, num_steps,
+        crossfade_s, crossfade_db: Forwarded unchanged to
+            ``_regen_mmaudio_gpu``.
+
+    Returns:
+        Two ``gr.update`` objects: the new video path and the waveform HTML.
+    """
+    import shutil  # local import: only needed for scratch-dir cleanup
+
+    meta    = json.loads(state_json)
+    seg_idx = int(seg_idx)
+    seg_start, seg_end = meta["segments"][seg_idx]
+    seg_dur = seg_end - seg_start
+    slot_sr = int(meta["sr"])
+    silent_video = meta["silent_video"]
+    tmp_dir = tempfile.mkdtemp()
+    try:
+        seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
+        # Stream-copy just this segment's video (no audio track).
+        ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
+            seg_path, vcodec="copy", an=None
+        ).run(overwrite_output=True, quiet=True)
+        # The clip path is handed over through a function attribute.
+        _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
+        new_wav_raw, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
+                                                 prompt, negative_prompt, seed_val,
+                                                 cfg_strength, num_steps,
+                                                 crossfade_s, crossfade_db, slot_id)
+    finally:
+        # Previously the mkdtemp() directory was never deleted and piled up
+        # with every regen.
+        # NOTE(review): assumes the clip is only read during the
+        # _regen_mmaudio_gpu call above -- confirm nothing reads
+        # _cpu_ctx["seg_path"] afterwards.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+    new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr)
+    video_path, _audio_path, _updated_meta, waveform_html = _splice_and_save(
+        new_wav, seg_idx, meta, slot_id
+    )
+    return gr.update(value=video_path), gr.update(value=waveform_html)
+def xregen_hunyuan(seg_idx, state_json, slot_id,
+                   prompt, negative_prompt, seed_val,
+                   guidance_scale, num_steps, model_size,
+                   crossfade_s, crossfade_db):
+    """Cross-model regen: run HunyuanFoley inference and splice into *slot_id*.
+
+    Cuts the requested segment out of the slot's silent video into a
+    scratch directory, runs ``_regen_hunyuan_gpu`` on it, resamples the
+    result to the slot's sample rate and splices it in.  The scratch
+    directory is removed once inference finishes or fails.
+
+    Args:
+        seg_idx: Index into ``meta["segments"]``; coerced with ``int()``.
+        state_json: JSON string holding the slot state; the keys
+            ``"segments"``, ``"sr"`` and ``"silent_video"`` are read.
+        slot_id: Identifier of the slot that receives the new audio.
+        prompt, negative_prompt, seed_val, guidance_scale, num_steps,
+        model_size, crossfade_s, crossfade_db: Forwarded unchanged to
+            ``_regen_hunyuan_gpu``.
+
+    Returns:
+        Two ``gr.update`` objects: the new video path and the waveform HTML.
+    """
+    import shutil  # local import: only needed for scratch-dir cleanup
+
+    meta    = json.loads(state_json)
+    seg_idx = int(seg_idx)
+    seg_start, seg_end = meta["segments"][seg_idx]
+    seg_dur = seg_end - seg_start
+    slot_sr = int(meta["sr"])
+    silent_video = meta["silent_video"]
+    tmp_dir = tempfile.mkdtemp()
+    try:
+        seg_path = os.path.join(tmp_dir, "xregen_seg.mp4")
+        # Stream-copy just this segment's video (no audio track).
+        ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
+            seg_path, vcodec="copy", an=None
+        ).run(overwrite_output=True, quiet=True)
+        # The clip path is handed over through a function attribute.
+        _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
+        new_wav_raw, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
+                                                 prompt, negative_prompt, seed_val,
+                                                 guidance_scale, num_steps, model_size,
+                                                 crossfade_s, crossfade_db, slot_id)
+    finally:
+        # Previously the mkdtemp() directory was never deleted and piled up
+        # with every regen.
+        # NOTE(review): assumes the clip is only read during the
+        # _regen_hunyuan_gpu call above -- confirm nothing reads
+        # _cpu_ctx["seg_path"] afterwards.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+    new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr)
+    video_path, _audio_path, _updated_meta, waveform_html = _splice_and_save(
+        new_wav, seg_idx, meta, slot_id
+    )
+    return gr.update(value=video_path), gr.update(value=waveform_html)
 # ================================================================== #
 #                        SHARED UI HELPERS                            #
 # ================================================================== #
   // Fire regen for a given slot and segment by posting directly to the
   // Gradio queue API — bypasses Svelte binding entirely.
+  // targetModel: 'taro' | 'mma' | 'hf'  (which model to use for inference)
+  // If targetModel matches the slot's own prefix, uses the per-slot regen_* endpoint.
+  // Otherwise uses the shared xregen_* cross-model endpoint.
+  function fireRegen(slot_id, seg_idx, targetModel) {
+    const prefix  = slot_id.split('_')[0];   // owning tab: 'taro'|'mma'|'hf'
     const slotNum = parseInt(slot_id.split('_')[1], 10);
+    // Decide which endpoint to call
+    const crossModel = (targetModel !== prefix);
+    let apiName, data;
     // Read state_json from the waveform container data-state attribute
     const container = document.getElementById('wf_container_' + slot_id);
       return;
     }
+    if (!crossModel) {
+      // ── Same-model regen: per-slot endpoint, video passed as null ──
+      apiName = 'regen_' + prefix + '_' + slotNum;
+      if (prefix === 'taro') {
+        data = [seg_idx, stateJson, null,
+          readComponentValue('taro_seed'), readComponentValue('taro_cfg'),
+          readComponentValue('taro_steps'), readComponentValue('taro_mode'),
+          readComponentValue('taro_cf_dur'), readComponentValue('taro_cf_db')];
+      } else if (prefix === 'mma') {
+        data = [seg_idx, stateJson, null,
+          readComponentValue('mma_prompt'), readComponentValue('mma_neg'),
+          readComponentValue('mma_seed'), readComponentValue('mma_cfg'),
+          readComponentValue('mma_steps'),
+          readComponentValue('mma_cf_dur'), readComponentValue('mma_cf_db')];
+      } else {
+        data = [seg_idx, stateJson, null,
+          readComponentValue('hf_prompt'), readComponentValue('hf_neg'),
+          readComponentValue('hf_seed'), readComponentValue('hf_guidance'),
+          readComponentValue('hf_steps'), readComponentValue('hf_size'),
+          readComponentValue('hf_cf_dur'), readComponentValue('hf_cf_db')];
+      }
     } else {
+      // ── Cross-model regen: shared xregen_* endpoint ──
+      // slot_id is passed so the server knows which slot's state to splice into.
+      // UI params are read from the target model's tab inputs.
+      if (targetModel === 'taro') {
+        apiName = 'xregen_taro';
+        data = [seg_idx, stateJson, slot_id,
+          readComponentValue('taro_seed'), readComponentValue('taro_cfg'),
+          readComponentValue('taro_steps'), readComponentValue('taro_mode'),
+          readComponentValue('taro_cf_dur'), readComponentValue('taro_cf_db')];
+      } else if (targetModel === 'mma') {
+        apiName = 'xregen_mmaudio';
+        data = [seg_idx, stateJson, slot_id,
+          readComponentValue('mma_prompt'), readComponentValue('mma_neg'),
+          readComponentValue('mma_seed'), readComponentValue('mma_cfg'),
+          readComponentValue('mma_steps'),
+          readComponentValue('mma_cf_dur'), readComponentValue('mma_cf_db')];
+      } else {
+        apiName = 'xregen_hunyuan';
+        data = [seg_idx, stateJson, slot_id,
+          readComponentValue('hf_prompt'), readComponentValue('hf_neg'),
+          readComponentValue('hf_seed'), readComponentValue('hf_guidance'),
+          readComponentValue('hf_steps'), readComponentValue('hf_size'),
+          readComponentValue('hf_cf_dur'), readComponentValue('hf_cf_db')];
+      }
+    }
+    const fnIndex = getFnIndex(apiName);
+    if (fnIndex === undefined) {
+      console.warn('[fireRegen] fn_index not found for api_name:', apiName, 'cache:', _fnIndexCache);
+      return;
     }
     console.log('[fireRegen] calling api', apiName, 'fn_index', fnIndex, 'seg', seg_idx);
     _popup.style.cssText = 'display:none;position:fixed;z-index:99999;' +
       'background:#2a2a2a;border:1px solid #555;border-radius:6px;' +
       'padding:8px 12px;box-shadow:0 4px 16px rgba(0,0,0,.5);font-family:sans-serif;';
+    var btnStyle = 'color:#fff;border:none;border-radius:4px;padding:5px 10px;' +
+                   'font-size:11px;cursor:pointer;flex:1;';
     _popup.innerHTML =
       '<div id="_wf_popup_lbl" style="color:#ccc;font-size:11px;margin-bottom:6px;white-space:nowrap;"></div>' +
+      '<div style="display:flex;gap:5px;">' +
+        '<button id="_wf_popup_taro" style="background:#1d6fa5;' + btnStyle + '">&#10227; TARO</button>' +
+        '<button id="_wf_popup_mma"  style="background:#2d7a4a;' + btnStyle + '">&#10227; MMAudio</button>' +
+        '<button id="_wf_popup_hf"   style="background:#7a3d8c;' + btnStyle + '">&#10227; Hunyuan</button>' +
+      '</div>';
     document.body.appendChild(_popup);
+    ['taro','mma','hf'].forEach(function(model) {
+      document.getElementById('_wf_popup_' + model).onclick = function(e) {
+        e.stopPropagation();
+        var slot = _pendingSlot, idx = _pendingIdx;
+        hidePopup();
+        if (slot !== null && idx !== null) fireRegen(slot, idx, model);
+      };
+    });
     // Use bubble phase (false) so stopPropagation() on the button click prevents this from firing
     document.addEventListener('click', function() { hidePopup(); }, false);
     return _popup;
     mma_video.change(fn=_sync,  inputs=[mma_video],  outputs=[taro_video, hf_video])
     hf_video.change(fn=_sync,   inputs=[hf_video],   outputs=[taro_video, mma_video])
+    # ---- Cross-model regen endpoints ----
+    # render=False inputs/outputs: no DOM elements created, no SSR validation impact.
+    # JS calls these via /gradio_api/queue/join using the api_name and applies
+    # the returned video+waveform directly to the target slot's DOM elements.
+    # NOTE(review): every input below is a Textbox, so values submitted through
+    # these components would presumably arrive as strings -- confirm that the
+    # handlers / GPU functions coerce the numeric parameters.
+    # The first three inputs and the two outputs are shared by all three events.
+    _xr_seg       = gr.Textbox(value="0",  render=False)
+    _xr_state     = gr.Textbox(value="",   render=False)
+    _xr_slot_id   = gr.Textbox(value="",   render=False)
+    _xr_vid_out   = gr.Video(render=False)
+    _xr_wave_out  = gr.HTML(render=False)
+    # TARO cross-model regen inputs: seg_idx, state_json, slot_id, seed, cfg, steps, mode, cf_dur, cf_db
+    # (same order as xregen_taro's parameters and the data array built by fireRegen)
+    _xr_taro_seed  = gr.Textbox(value="-1",  render=False)
+    _xr_taro_cfg   = gr.Textbox(value="7.5", render=False)
+    _xr_taro_steps = gr.Textbox(value="25",  render=False)
+    _xr_taro_mode  = gr.Textbox(value="sde", render=False)
+    _xr_taro_cfd   = gr.Textbox(value="2",   render=False)
+    _xr_taro_cfdb  = gr.Textbox(value="3",   render=False)
+    # The unrendered Button exists only to register the event; api_name must
+    # stay "xregen_taro" because fireRegen looks the endpoint up by that name.
+    gr.Button(render=False).click(
+        fn=xregen_taro,
+        inputs=[_xr_seg, _xr_state, _xr_slot_id,
+                _xr_taro_seed, _xr_taro_cfg, _xr_taro_steps,
+                _xr_taro_mode, _xr_taro_cfd, _xr_taro_cfdb],
+        outputs=[_xr_vid_out, _xr_wave_out],
+        api_name="xregen_taro",
+    )
+    # MMAudio cross-model regen inputs: seg_idx, state_json, slot_id, prompt, neg, seed, cfg, steps, cf_dur, cf_db
+    # (same order as xregen_mmaudio's parameters and the data array built by fireRegen)
+    _xr_mma_prompt = gr.Textbox(value="",    render=False)
+    _xr_mma_neg    = gr.Textbox(value="",    render=False)
+    _xr_mma_seed   = gr.Textbox(value="-1",  render=False)
+    _xr_mma_cfg    = gr.Textbox(value="4.5", render=False)
+    _xr_mma_steps  = gr.Textbox(value="25",  render=False)
+    _xr_mma_cfd    = gr.Textbox(value="2",   render=False)
+    _xr_mma_cfdb   = gr.Textbox(value="3",   render=False)
+    # api_name must stay "xregen_mmaudio" -- fireRegen resolves it by name.
+    gr.Button(render=False).click(
+        fn=xregen_mmaudio,
+        inputs=[_xr_seg, _xr_state, _xr_slot_id,
+                _xr_mma_prompt, _xr_mma_neg, _xr_mma_seed,
+                _xr_mma_cfg, _xr_mma_steps, _xr_mma_cfd, _xr_mma_cfdb],
+        outputs=[_xr_vid_out, _xr_wave_out],
+        api_name="xregen_mmaudio",
+    )
+    # HunyuanFoley cross-model regen inputs: seg_idx, state_json, slot_id, prompt, neg, seed, guidance, steps, size, cf_dur, cf_db
+    # (same order as xregen_hunyuan's parameters and the data array built by fireRegen)
+    _xr_hf_prompt = gr.Textbox(value="",    render=False)
+    _xr_hf_neg    = gr.Textbox(value="",    render=False)
+    _xr_hf_seed   = gr.Textbox(value="-1",  render=False)
+    _xr_hf_guide  = gr.Textbox(value="4.5", render=False)
+    _xr_hf_steps  = gr.Textbox(value="50",  render=False)
+    _xr_hf_size   = gr.Textbox(value="xxl", render=False)
+    _xr_hf_cfd    = gr.Textbox(value="2",   render=False)
+    _xr_hf_cfdb   = gr.Textbox(value="3",   render=False)
+    # api_name must stay "xregen_hunyuan" -- fireRegen resolves it by name.
+    gr.Button(render=False).click(
+        fn=xregen_hunyuan,
+        inputs=[_xr_seg, _xr_state, _xr_slot_id,
+                _xr_hf_prompt, _xr_hf_neg, _xr_hf_seed,
+                _xr_hf_guide, _xr_hf_steps, _xr_hf_size,
+                _xr_hf_cfd, _xr_hf_cfdb],
+        outputs=[_xr_vid_out, _xr_wave_out],
+        api_name="xregen_hunyuan",
+    )
 print("[startup] app.py fully loaded — regen handlers registered, SSR disabled")
 demo.queue(max_size=10).launch(ssr_mode=False, height=900, allowed_paths=["/tmp"])