BoxOfColors Claude Sonnet 4.6 committed on
Commit
e7175d4
·
1 Parent(s): ac67bf3

fix: remove ctx_key from all function signatures — use fn-name-keyed global dict

Browse files

ctx_key as a function argument exposed it to Gradio's API endpoint discovery,
causing 'Too many arguments provided for the endpoint' errors and GPU task aborts.

Fix: remove ctx_key from all @spaces.GPU function signatures and their duration
callables. Store/retrieve context using _ctx_store(fn_name, data) /
_ctx_load(fn_name) — a global dict keyed by function name. This is safe because
ZeroGPU is synchronous (wrapper blocks until GPU fn returns), so only one call
per GPU function is in-flight at a time within a single process.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +50 -57
app.py CHANGED
@@ -124,35 +124,30 @@ print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
- # CPU → GPU context passing via UUID-keyed global store.
128
  #
129
- # ZeroGPU dispatches @spaces.GPU functions on its own worker thread, so
130
- # threading.local() doesn't work. Passing context as a function argument
131
- # is the right idea, but ZeroGPU validates args against the *duration*
132
- # callable's signature — any extra param not present in the duration fn
133
- # gets dropped or set to None before the GPU fn runs.
134
  #
135
- # Solution: add ctx_key="" to BOTH the duration fn AND the GPU fn.
136
- # The wrapper stores the context dict in _GPU_CTX[uuid] and passes the
137
- # uuid string as ctx_key. The GPU fn does _GPU_CTX.pop(ctx_key).
138
- # Since the dict is global (not thread-local), the GPU worker thread can
139
- # read it regardless of which thread wrote it. The uuid ensures
140
- # concurrent requests don't collide.
141
- import uuid as _uuid_mod
142
- _GPU_CTX: dict = {}
143
- _GPU_CTX_LOCK = threading.Lock()
144
-
145
- def _ctx_store(data: dict) -> str:
146
- """Store *data* in the global context dict; return the UUID key."""
147
- key = _uuid_mod.uuid4().hex
148
  with _GPU_CTX_LOCK:
149
- _GPU_CTX[key] = data
150
- return key
151
 
152
- def _ctx_load(key: str) -> dict:
153
- """Pop and return the context dict for *key*."""
154
  with _GPU_CTX_LOCK:
155
- return _GPU_CTX.pop(key, {})
156
 
157
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
158
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
@@ -577,7 +572,7 @@ def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: floa
577
 
578
 
579
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
580
- crossfade_s, crossfade_db, num_samples, ctx_key=""):
581
  """Pre-GPU callable — must match _taro_gpu_infer's input order exactly."""
582
  return _estimate_gpu_duration("taro", int(num_samples), int(num_steps),
583
  video_file=video_file, crossfade_s=crossfade_s)
@@ -794,7 +789,7 @@ def _cpu_preprocess(video_file: str, model_dur: float,
794
 
795
  @spaces.GPU(duration=_taro_duration)
796
  def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
797
- crossfade_s, crossfade_db, num_samples, ctx_key=""):
798
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
799
  Returns list of (wavs_list, onset_feats) per sample."""
800
  seed_val = int(seed_val)
@@ -810,7 +805,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
810
  from TARO.onset_util import extract_onset
811
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
812
 
813
- ctx = _ctx_load(ctx_key)
814
  tmp_dir = ctx["tmp_dir"]
815
  silent_video = ctx["silent_video"]
816
  segments = ctx["segments"]
@@ -882,14 +877,14 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
882
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
883
  video_file, TARO_MODEL_DUR, crossfade_s)
884
 
885
- ctx_key = _ctx_store({
886
  "tmp_dir": tmp_dir, "silent_video": silent_video,
887
  "segments": segments, "total_dur_s": total_dur_s,
888
  })
889
 
890
  # ── GPU inference only ──
891
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
892
- crossfade_s, crossfade_db, num_samples, ctx_key)
893
 
894
  # ── CPU post-processing (no GPU needed) ──
895
  # Upsample 16kHz → 48kHz and normalise result tuples to (seg_wavs, ...)
@@ -938,8 +933,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
938
 
939
 
940
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
941
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
942
- ctx_key=""):
943
  """Pre-GPU callable — must match _mmaudio_gpu_infer's input order exactly."""
944
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
945
  video_file=video_file, crossfade_s=crossfade_s)
@@ -947,8 +941,7 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
947
 
948
  @spaces.GPU(duration=_mmaudio_duration)
949
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
950
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples,
951
- ctx_key=""):
952
  """GPU-only MMAudio inference — model loading + flow-matching generation.
953
  Returns list of (seg_audios, sr) per sample."""
954
  _ensure_syspath("MMAudio")
@@ -963,7 +956,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
963
 
964
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
965
 
966
- ctx = _ctx_load(ctx_key)
967
  segments = ctx["segments"]
968
  seg_clip_paths = ctx["seg_clip_paths"]
969
 
@@ -1042,12 +1035,12 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1042
  for i, (s, e) in enumerate(segments)
1043
  ]
1044
 
1045
- ctx_key = _ctx_store({"segments": segments, "seg_clip_paths": seg_clip_paths})
1046
 
1047
  # ── GPU inference only ──
1048
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1049
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1050
- num_samples, ctx_key)
1051
 
1052
  # ── CPU post-processing ──
1053
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
@@ -1085,7 +1078,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1085
 
1086
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1087
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1088
- num_samples, ctx_key=""):
1089
  """Pre-GPU callable — must match _hunyuan_gpu_infer's input order exactly."""
1090
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1091
  video_file=video_file, crossfade_s=crossfade_s)
@@ -1094,7 +1087,7 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1094
  @spaces.GPU(duration=_hunyuan_duration)
1095
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1096
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1097
- num_samples, ctx_key=""):
1098
  """GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
1099
  Returns list of (seg_wavs, sr, text_feats) per sample."""
1100
  _ensure_syspath("HunyuanVideo-Foley")
@@ -1113,7 +1106,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1113
 
1114
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1115
 
1116
- ctx = _ctx_load(ctx_key)
1117
  segments = ctx["segments"]
1118
  total_dur_s = ctx["total_dur_s"]
1119
  dummy_seg_path = ctx["dummy_seg_path"]
@@ -1197,7 +1190,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1197
  for i, (s, e) in enumerate(segments)
1198
  ]
1199
 
1200
- ctx_key = _ctx_store({
1201
  "segments": segments, "total_dur_s": total_dur_s,
1202
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1203
  })
@@ -1205,7 +1198,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1205
  # ── GPU inference only ──
1206
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1207
  guidance_scale, num_steps, model_size,
1208
- crossfade_s, crossfade_db, num_samples, ctx_key)
1209
 
1210
  # ── CPU post-processing (no GPU needed) ──
1211
  def _hunyuan_extras(sample_idx, result, td):
@@ -1285,7 +1278,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1285
 
1286
  def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1287
  seed_val, cfg_scale, num_steps, mode,
1288
- crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1289
  # If cached CAVP/onset features exist, skip ~10s feature-extractor overhead
1290
  try:
1291
  meta = json.loads(seg_meta_json)
@@ -1305,7 +1298,7 @@ def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1305
  @spaces.GPU(duration=_taro_regen_duration)
1306
  def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1307
  seed_val, cfg_scale, num_steps, mode,
1308
- crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1309
  """GPU-only TARO regen — returns new_wav for a single segment."""
1310
  meta = json.loads(seg_meta_json)
1311
  seg_idx = int(seg_idx)
@@ -1372,7 +1365,7 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1372
  def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1373
  prompt, negative_prompt, seed_val,
1374
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1375
- slot_id=None, ctx_key=""):
1376
  return _estimate_regen_duration("mmaudio", int(num_steps))
1377
 
1378
 
@@ -1380,7 +1373,7 @@ def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1380
  def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1381
  prompt, negative_prompt, seed_val,
1382
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1383
- slot_id=None, ctx_key=""):
1384
  """GPU-only MMAudio regen — returns (new_wav, sr) for a single segment."""
1385
  meta = json.loads(seg_meta_json)
1386
  seg_idx = int(seg_idx)
@@ -1396,7 +1389,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1396
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1397
  sr = seq_cfg.sampling_rate
1398
 
1399
- seg_path = _ctx_load(ctx_key).get("seg_path")
1400
  assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
1401
 
1402
  rng = torch.Generator(device=device)
@@ -1438,13 +1431,13 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1438
  meta["silent_video"], seg_start, seg_dur,
1439
  os.path.join(tmp_dir, "regen_seg.mp4"),
1440
  )
1441
- ctx_key = _ctx_store({"seg_path": seg_path})
1442
 
1443
  # GPU: inference only
1444
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1445
  prompt, negative_prompt, seed_val,
1446
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1447
- slot_id, ctx_key)
1448
 
1449
  # Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
1450
  if sr != TARGET_SR:
@@ -1463,7 +1456,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1463
  def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1464
  prompt, negative_prompt, seed_val,
1465
  guidance_scale, num_steps, model_size,
1466
- crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1467
  return _estimate_regen_duration("hunyuan", int(num_steps))
1468
 
1469
 
@@ -1471,7 +1464,7 @@ def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1471
  def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1472
  prompt, negative_prompt, seed_val,
1473
  guidance_scale, num_steps, model_size,
1474
- crossfade_s, crossfade_db, slot_id=None, ctx_key=""):
1475
  """GPU-only HunyuanFoley regen — returns (new_wav, sr) for a single segment."""
1476
  meta = json.loads(seg_meta_json)
1477
  seg_idx = int(seg_idx)
@@ -1488,7 +1481,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1488
 
1489
  set_global_seed(random.randint(0, 2**32 - 1))
1490
 
1491
- ctx = _ctx_load(ctx_key)
1492
  seg_path = ctx.get("seg_path")
1493
  assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
1494
 
@@ -1532,7 +1525,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1532
  meta["silent_video"], seg_start, seg_dur,
1533
  os.path.join(tmp_dir, "regen_seg.mp4"),
1534
  )
1535
- ctx_key = _ctx_store({
1536
  "seg_path": seg_path,
1537
  "text_feats_path": meta.get("text_feats_path", ""),
1538
  })
@@ -1541,7 +1534,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1541
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1542
  prompt, negative_prompt, seed_val,
1543
  guidance_scale, num_steps, model_size,
1544
- crossfade_s, crossfade_db, slot_id, ctx_key)
1545
 
1546
  meta["sr"] = sr
1547
 
@@ -1658,11 +1651,11 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1658
  meta["silent_video"], seg_start, seg_end - seg_start,
1659
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1660
  )
1661
- ctx_key = _ctx_store({"seg_path": seg_path})
1662
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1663
  prompt, negative_prompt, seed_val,
1664
  cfg_strength, num_steps,
1665
- crossfade_s, crossfade_db, slot_id, ctx_key)
1666
  return wav, src_sr
1667
 
1668
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
@@ -1683,14 +1676,14 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1683
  meta["silent_video"], seg_start, seg_end - seg_start,
1684
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1685
  )
1686
- ctx_key = _ctx_store({
1687
  "seg_path": seg_path,
1688
  "text_feats_path": meta.get("text_feats_path", ""),
1689
  })
1690
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1691
  prompt, negative_prompt, seed_val,
1692
  guidance_scale, num_steps, model_size,
1693
- crossfade_s, crossfade_db, slot_id, ctx_key)
1694
  return wav, src_sr
1695
 
1696
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
 
124
  # SHARED CONSTANTS / HELPERS #
125
  # ================================================================== #
126
 
127
+ # CPU → GPU context passing via function-name-keyed global store.
128
  #
129
+ # Problem: ZeroGPU runs @spaces.GPU functions on its own worker thread, so
130
+ # threading.local() is invisible to the GPU worker. Passing ctx as a
131
+ # function argument exposes it to Gradio's API endpoint, causing
132
+ # "Too many arguments" errors.
 
133
  #
134
+ # Solution: store context in a plain global dict keyed by function name.
135
+ # A per-key Lock serialises concurrent callers for the same function
136
+ # (ZeroGPU is already synchronous — the wrapper blocks until the GPU fn
137
+ # returns — so in practice only one call per GPU fn is in-flight at a time).
138
+ # The global dict is readable from any thread.
139
+ _GPU_CTX: dict = {}
140
+ _GPU_CTX_LOCK = threading.Lock()
141
+
142
+ def _ctx_store(fn_name: str, data: dict) -> None:
143
+ """Store *data* under *fn_name* key (overwrites previous)."""
 
 
 
144
  with _GPU_CTX_LOCK:
145
+ _GPU_CTX[fn_name] = data
 
146
 
147
+ def _ctx_load(fn_name: str) -> dict:
148
+ """Pop and return the context dict stored under *fn_name*."""
149
  with _GPU_CTX_LOCK:
150
+ return _GPU_CTX.pop(fn_name, {})
151
 
152
  MAX_SLOTS = 8 # max parallel generation slots shown in UI
153
  MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
 
572
 
573
 
574
  def _taro_duration(video_file, seed_val, cfg_scale, num_steps, mode,
575
+ crossfade_s, crossfade_db, num_samples):
576
  """Pre-GPU callable — must match _taro_gpu_infer's input order exactly."""
577
  return _estimate_gpu_duration("taro", int(num_samples), int(num_steps),
578
  video_file=video_file, crossfade_s=crossfade_s)
 
789
 
790
  @spaces.GPU(duration=_taro_duration)
791
  def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
792
+ crossfade_s, crossfade_db, num_samples):
793
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
794
  Returns list of (wavs_list, onset_feats) per sample."""
795
  seed_val = int(seed_val)
 
805
  from TARO.onset_util import extract_onset
806
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
807
 
808
+ ctx = _ctx_load("taro_gpu_infer")
809
  tmp_dir = ctx["tmp_dir"]
810
  silent_video = ctx["silent_video"]
811
  segments = ctx["segments"]
 
877
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
878
  video_file, TARO_MODEL_DUR, crossfade_s)
879
 
880
+ _ctx_store("taro_gpu_infer", {
881
  "tmp_dir": tmp_dir, "silent_video": silent_video,
882
  "segments": segments, "total_dur_s": total_dur_s,
883
  })
884
 
885
  # ── GPU inference only ──
886
  results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
887
+ crossfade_s, crossfade_db, num_samples)
888
 
889
  # ── CPU post-processing (no GPU needed) ──
890
  # Upsample 16kHz → 48kHz and normalise result tuples to (seg_wavs, ...)
 
933
 
934
 
935
  def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
936
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
937
  """Pre-GPU callable — must match _mmaudio_gpu_infer's input order exactly."""
938
  return _estimate_gpu_duration("mmaudio", int(num_samples), int(num_steps),
939
  video_file=video_file, crossfade_s=crossfade_s)
 
941
 
942
  @spaces.GPU(duration=_mmaudio_duration)
943
  def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
944
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
 
945
  """GPU-only MMAudio inference — model loading + flow-matching generation.
946
  Returns list of (seg_audios, sr) per sample."""
947
  _ensure_syspath("MMAudio")
 
956
 
957
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
958
 
959
+ ctx = _ctx_load("mmaudio_gpu_infer")
960
  segments = ctx["segments"]
961
  seg_clip_paths = ctx["seg_clip_paths"]
962
 
 
1035
  for i, (s, e) in enumerate(segments)
1036
  ]
1037
 
1038
+ _ctx_store("mmaudio_gpu_infer", {"segments": segments, "seg_clip_paths": seg_clip_paths})
1039
 
1040
  # ── GPU inference only ──
1041
  results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1042
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1043
+ num_samples)
1044
 
1045
  # ── CPU post-processing ──
1046
  # Resample 44100 → 48000 and normalise tuples to (seg_wavs, ...)
 
1078
 
1079
  def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1080
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1081
+ num_samples):
1082
  """Pre-GPU callable — must match _hunyuan_gpu_infer's input order exactly."""
1083
  return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1084
  video_file=video_file, crossfade_s=crossfade_s)
 
1087
  @spaces.GPU(duration=_hunyuan_duration)
1088
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1089
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1090
+ num_samples):
1091
  """GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
1092
  Returns list of (seg_wavs, sr, text_feats) per sample."""
1093
  _ensure_syspath("HunyuanVideo-Foley")
 
1106
 
1107
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1108
 
1109
+ ctx = _ctx_load("hunyuan_gpu_infer")
1110
  segments = ctx["segments"]
1111
  total_dur_s = ctx["total_dur_s"]
1112
  dummy_seg_path = ctx["dummy_seg_path"]
 
1190
  for i, (s, e) in enumerate(segments)
1191
  ]
1192
 
1193
+ _ctx_store("hunyuan_gpu_infer", {
1194
  "segments": segments, "total_dur_s": total_dur_s,
1195
  "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1196
  })
 
1198
  # ── GPU inference only ──
1199
  results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1200
  guidance_scale, num_steps, model_size,
1201
+ crossfade_s, crossfade_db, num_samples)
1202
 
1203
  # ── CPU post-processing (no GPU needed) ──
1204
  def _hunyuan_extras(sample_idx, result, td):
 
1278
 
1279
  def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
1280
  seed_val, cfg_scale, num_steps, mode,
1281
+ crossfade_s, crossfade_db, slot_id=None):
1282
  # If cached CAVP/onset features exist, skip ~10s feature-extractor overhead
1283
  try:
1284
  meta = json.loads(seg_meta_json)
 
1298
  @spaces.GPU(duration=_taro_regen_duration)
1299
  def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1300
  seed_val, cfg_scale, num_steps, mode,
1301
+ crossfade_s, crossfade_db, slot_id=None):
1302
  """GPU-only TARO regen — returns new_wav for a single segment."""
1303
  meta = json.loads(seg_meta_json)
1304
  seg_idx = int(seg_idx)
 
1365
  def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
1366
  prompt, negative_prompt, seed_val,
1367
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1368
+ slot_id=None):
1369
  return _estimate_regen_duration("mmaudio", int(num_steps))
1370
 
1371
 
 
1373
  def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1374
  prompt, negative_prompt, seed_val,
1375
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1376
+ slot_id=None):
1377
  """GPU-only MMAudio regen — returns (new_wav, sr) for a single segment."""
1378
  meta = json.loads(seg_meta_json)
1379
  seg_idx = int(seg_idx)
 
1389
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1390
  sr = seq_cfg.sampling_rate
1391
 
1392
+ seg_path = _ctx_load("regen_mmaudio_gpu").get("seg_path")
1393
  assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
1394
 
1395
  rng = torch.Generator(device=device)
 
1431
  meta["silent_video"], seg_start, seg_dur,
1432
  os.path.join(tmp_dir, "regen_seg.mp4"),
1433
  )
1434
+ _ctx_store("regen_mmaudio_gpu", {"seg_path": seg_path})
1435
 
1436
  # GPU: inference only
1437
  new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1438
  prompt, negative_prompt, seed_val,
1439
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1440
+ slot_id)
1441
 
1442
  # Resample to 48kHz if needed (MMAudio outputs at 44100 Hz)
1443
  if sr != TARGET_SR:
 
1456
  def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1457
  prompt, negative_prompt, seed_val,
1458
  guidance_scale, num_steps, model_size,
1459
+ crossfade_s, crossfade_db, slot_id=None):
1460
  return _estimate_regen_duration("hunyuan", int(num_steps))
1461
 
1462
 
 
1464
  def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1465
  prompt, negative_prompt, seed_val,
1466
  guidance_scale, num_steps, model_size,
1467
+ crossfade_s, crossfade_db, slot_id=None):
1468
  """GPU-only HunyuanFoley regen — returns (new_wav, sr) for a single segment."""
1469
  meta = json.loads(seg_meta_json)
1470
  seg_idx = int(seg_idx)
 
1481
 
1482
  set_global_seed(random.randint(0, 2**32 - 1))
1483
 
1484
+ ctx = _ctx_load("regen_hunyuan_gpu")
1485
  seg_path = ctx.get("seg_path")
1486
  assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
1487
 
 
1525
  meta["silent_video"], seg_start, seg_dur,
1526
  os.path.join(tmp_dir, "regen_seg.mp4"),
1527
  )
1528
+ _ctx_store("regen_hunyuan_gpu", {
1529
  "seg_path": seg_path,
1530
  "text_feats_path": meta.get("text_feats_path", ""),
1531
  })
 
1534
  new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1535
  prompt, negative_prompt, seed_val,
1536
  guidance_scale, num_steps, model_size,
1537
+ crossfade_s, crossfade_db, slot_id)
1538
 
1539
  meta["sr"] = sr
1540
 
 
1651
  meta["silent_video"], seg_start, seg_end - seg_start,
1652
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1653
  )
1654
+ _ctx_store("regen_mmaudio_gpu", {"seg_path": seg_path})
1655
  wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1656
  prompt, negative_prompt, seed_val,
1657
  cfg_strength, num_steps,
1658
+ crossfade_s, crossfade_db, slot_id)
1659
  return wav, src_sr
1660
 
1661
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
 
1676
  meta["silent_video"], seg_start, seg_end - seg_start,
1677
  os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
1678
  )
1679
+ _ctx_store("regen_hunyuan_gpu", {
1680
  "seg_path": seg_path,
1681
  "text_feats_path": meta.get("text_feats_path", ""),
1682
  })
1683
  wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1684
  prompt, negative_prompt, seed_val,
1685
  guidance_scale, num_steps, model_size,
1686
+ crossfade_s, crossfade_db, slot_id)
1687
  return wav, src_sr
1688
 
1689
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)