Revert to single multi-seg GPU call; bump hunyuan load_overhead to 90s
- Raise HunyuanFoley load_overhead 55→90 s to account for cold-disk 10 GB
weight load (~73 s measured) plus aux model init (~8 s)
- Raise _clamp_duration floor 60→120 s — Pro ZeroGPU users get 300 s/call,
so 120 s floor safely covers worst-case cold-start without wasting budget
- Replace per-segment GPU call architecture (_hunyuan_gpu_infer_one_seg ×N)
with a single multi-segment @spaces.GPU call (_hunyuan_gpu_infer) that
loads the model once and loops over all segments — avoids reloading the
10 GB weights N times which would exceed the Pro time budget entirely
- Duration estimate now uses _estimate_gpu_duration("hunyuan", num_samples,
num_steps) which scales with actual work (segments × steps) + 90 s overhead
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -553,7 +553,7 @@ MODEL_CONFIGS = {
|
|
| 553 |
"window_s": 15.0, # HunyuanFoley max video duration
|
| 554 |
"sr": 48000,
|
| 555 |
"secs_per_step": 0.35, # measured 0.328 s/step on H200
|
| 556 |
-
"load_overhead":
|
| 557 |
"tab_prefix": "hf",
|
| 558 |
"label": "HunyuanFoley",
|
| 559 |
"regen_fn": None,
|
|
@@ -569,8 +569,10 @@ HUNYUAN_SECS_PER_STEP = MODEL_CONFIGS["hunyuan"]["secs_per_step"]
|
|
| 569 |
|
| 570 |
|
| 571 |
def _clamp_duration(secs: float, label: str) -> int:
|
| 572 |
-
"""Clamp a raw GPU-seconds estimate to [
|
| 573 |
-
|
|
|
|
|
|
|
| 574 |
print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
|
| 575 |
return result
|
| 576 |
|
|
@@ -1215,94 +1217,46 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1215 |
|
| 1216 |
|
| 1217 |
|
| 1218 |
-
def
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
return _clamp_duration(secs, "HunyuanFoley 1seg")
|
| 1229 |
-
|
| 1230 |
-
|
| 1231 |
-
@spaces.GPU(duration=_hunyuan_seg_duration)
|
| 1232 |
-
def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
|
| 1233 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1234 |
-
num_samples, silent_video, seg_clip_path,
|
| 1235 |
-
dummy_seg_path, text_feats_path,
|
| 1236 |
-
clip_start_s=0.0, clip_dur_s=None):
|
| 1237 |
-
"""GPU-only HunyuanFoley inference for ONE segment.
|
| 1238 |
-
|
| 1239 |
-
text_feats_path: path to pre-saved text_feats .pt file, or empty string to
|
| 1240 |
-
extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
|
| 1241 |
-
"""
|
| 1242 |
-
import traceback as _tb
|
| 1243 |
-
print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
|
| 1244 |
-
f"text_feats_path={text_feats_path!r}")
|
| 1245 |
-
try:
|
| 1246 |
-
_ensure_syspath("HunyuanVideo-Foley")
|
| 1247 |
-
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1248 |
-
from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
|
| 1249 |
-
|
| 1250 |
-
device, _ = _get_device_and_dtype()
|
| 1251 |
-
model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
|
| 1252 |
-
|
| 1253 |
-
# Load or extract text features
|
| 1254 |
-
if text_feats_path and os.path.exists(text_feats_path):
|
| 1255 |
-
print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
|
| 1256 |
-
text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
|
| 1257 |
-
visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
|
| 1258 |
-
else:
|
| 1259 |
-
print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
|
| 1260 |
-
visual_feats, text_feats, seg_audio_len = feature_process(
|
| 1261 |
-
seg_clip_path,
|
| 1262 |
-
prompt if prompt else "",
|
| 1263 |
-
model_dict, cfg,
|
| 1264 |
-
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1265 |
-
)
|
| 1266 |
-
|
| 1267 |
-
print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
|
| 1268 |
-
audio_batch, sr = denoise_process(
|
| 1269 |
-
visual_feats, text_feats, seg_audio_len, model_dict, cfg,
|
| 1270 |
-
guidance_scale=float(guidance_scale),
|
| 1271 |
-
num_inference_steps=int(num_steps),
|
| 1272 |
-
batch_size=1,
|
| 1273 |
-
)
|
| 1274 |
-
wav = audio_batch[0].float().cpu().numpy()
|
| 1275 |
-
|
| 1276 |
-
# Save text_feats to disk so next segment's GPU call can reuse it without
|
| 1277 |
-
# re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
|
| 1278 |
-
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1279 |
-
out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
|
| 1280 |
-
torch.save(text_feats, out_text_feats_path)
|
| 1281 |
-
print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
|
| 1282 |
-
|
| 1283 |
-
return wav, sr, out_text_feats_path
|
| 1284 |
|
| 1285 |
-
except Exception as _e:
|
| 1286 |
-
print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
|
| 1287 |
-
_tb.print_exc()
|
| 1288 |
-
raise
|
| 1289 |
|
| 1290 |
-
|
| 1291 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1292 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1293 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1294 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1295 |
-
"""
|
|
|
|
|
|
|
| 1296 |
import traceback as _tb
|
| 1297 |
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
|
| 1298 |
f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
|
| 1299 |
try:
|
| 1300 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1301 |
-
from hunyuanvideo_foley.utils.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1302 |
|
| 1303 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1304 |
_sv = silent_video
|
| 1305 |
-
_total =
|
| 1306 |
if clip_dur_s is not None:
|
| 1307 |
clip_path = _extract_segment_clip(
|
| 1308 |
silent_video, float(clip_start_s), float(clip_dur_s),
|
|
@@ -1312,33 +1266,54 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1312 |
_total = float(clip_dur_s)
|
| 1313 |
|
| 1314 |
segments = json.loads(segments_json)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
seg_clip_paths = [
|
| 1316 |
_extract_segment_clip(_sv, s, e - s,
|
| 1317 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1318 |
for i, (s, e) in enumerate(segments)
|
| 1319 |
]
|
| 1320 |
|
| 1321 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1322 |
results = []
|
| 1323 |
-
for sample_idx in range(
|
| 1324 |
seg_wavs = []
|
| 1325 |
sr = 48000
|
| 1326 |
-
text_feats_path = ""
|
| 1327 |
_t0 = time.perf_counter()
|
| 1328 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1329 |
-
|
| 1330 |
-
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
|
| 1334 |
-
|
| 1335 |
-
|
| 1336 |
-
|
|
|
|
| 1337 |
)
|
| 1338 |
-
seg_wavs.append(
|
|
|
|
| 1339 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1340 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1341 |
results.append((seg_wavs, sr, text_feats_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1342 |
return results
|
| 1343 |
|
| 1344 |
except Exception as _e:
|
|
@@ -1350,7 +1325,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1350 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1351 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1352 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1353 |
-
|
| 1354 |
num_samples = int(num_samples)
|
| 1355 |
crossfade_s = float(crossfade_s)
|
| 1356 |
crossfade_db = float(crossfade_db)
|
|
@@ -1360,37 +1335,16 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1360 |
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1361 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 1362 |
|
| 1363 |
-
|
| 1364 |
-
|
| 1365 |
-
|
| 1366 |
-
|
| 1367 |
-
|
| 1368 |
-
|
| 1369 |
-
# ββ One GPU call per segment ββ
|
| 1370 |
-
results = []
|
| 1371 |
-
for sample_idx in range(num_samples):
|
| 1372 |
-
seg_wavs = []
|
| 1373 |
-
sr = 48000
|
| 1374 |
-
text_feats_path = ""
|
| 1375 |
-
_t0 = time.perf_counter()
|
| 1376 |
-
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1377 |
-
print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
|
| 1378 |
-
f"{seg_start:.1f}–{seg_end:.1f}s")
|
| 1379 |
-
wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
|
| 1380 |
-
video_file, prompt, negative_prompt, seed_val,
|
| 1381 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1382 |
-
num_samples, silent_video, seg_clip_paths[seg_i],
|
| 1383 |
-
seg_clip_paths[0], text_feats_path,
|
| 1384 |
-
)
|
| 1385 |
-
seg_wavs.append(wav)
|
| 1386 |
-
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1387 |
-
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1388 |
-
results.append((seg_wavs, sr, text_feats_path))
|
| 1389 |
|
| 1390 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1391 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1392 |
-
_, _sr,
|
| 1393 |
-
return {"text_feats_path":
|
| 1394 |
|
| 1395 |
outputs = _post_process_samples(
|
| 1396 |
results, model="hunyuan", tmp_dir=tmp_dir,
|
|
|
|
| 553 |
"window_s": 15.0, # HunyuanFoley max video duration
|
| 554 |
"sr": 48000,
|
| 555 |
"secs_per_step": 0.35, # measured 0.328 s/step on H200
|
| 556 |
+
"load_overhead": 90, # cold disk: ~73s for 10 GB weights + ~8s aux models
|
| 557 |
"tab_prefix": "hf",
|
| 558 |
"label": "HunyuanFoley",
|
| 559 |
"regen_fn": None,
|
|
|
|
| 569 |
|
| 570 |
|
| 571 |
def _clamp_duration(secs: float, label: str) -> int:
|
| 572 |
+
"""Clamp a raw GPU-seconds estimate to [120, GPU_DURATION_CAP] and log it.
|
| 573 |
+
ZeroGPU Pro users get up to 300 s per call; 120 s floor covers cold-disk
|
| 574 |
+
model loads (e.g. HunyuanFoley XXL ~73 s on first access)."""
|
| 575 |
+
result = min(GPU_DURATION_CAP, max(120, int(secs)))
|
| 576 |
print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
|
| 577 |
return result
|
| 578 |
|
|
|
|
| 1217 |
|
| 1218 |
|
| 1219 |
|
| 1220 |
+
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
| 1221 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1222 |
+
num_samples, silent_video=None, segments_json=None, total_dur_s=None,
|
| 1223 |
+
clip_start_s=0.0, clip_dur_s=None, **_kwargs):
|
| 1224 |
+
"""Pre-GPU callable β must match _hunyuan_gpu_infer's input signature exactly.
|
| 1225 |
+
silent_video, segments_json, total_dur_s, clip_start_s, clip_dur_s are extra
|
| 1226 |
+
positional args that xregen passes; they must appear here so ZeroGPU doesn't
|
| 1227 |
+
raise TypeError when forwarding all args to this duration fn."""
|
| 1228 |
+
return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
|
| 1229 |
+
video_file=video_file, crossfade_s=crossfade_s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1231 |
|
| 1232 |
+
@spaces.GPU(duration=_hunyuan_duration)
|
| 1233 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1234 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1235 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1236 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1237 |
+
"""GPU-only HunyuanFoley inference β model loading + feature extraction + denoising.
|
| 1238 |
+
All segments processed in a single GPU call (Pro ZeroGPU allows up to 300 s).
|
| 1239 |
+
"""
|
| 1240 |
import traceback as _tb
|
| 1241 |
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
|
| 1242 |
f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
|
| 1243 |
try:
|
| 1244 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1245 |
+
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1246 |
+
from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
|
| 1247 |
+
|
| 1248 |
+
seed_val = _resolve_seed(seed_val)
|
| 1249 |
+
num_samples = int(num_samples)
|
| 1250 |
+
crossfade_s = float(crossfade_s)
|
| 1251 |
+
total_dur_s = float(total_dur_s)
|
| 1252 |
+
set_global_seed(seed_val)
|
| 1253 |
+
|
| 1254 |
+
device, _ = _get_device_and_dtype()
|
| 1255 |
+
model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
|
| 1256 |
|
| 1257 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1258 |
_sv = silent_video
|
| 1259 |
+
_total = total_dur_s
|
| 1260 |
if clip_dur_s is not None:
|
| 1261 |
clip_path = _extract_segment_clip(
|
| 1262 |
silent_video, float(clip_start_s), float(clip_dur_s),
|
|
|
|
| 1266 |
_total = float(clip_dur_s)
|
| 1267 |
|
| 1268 |
segments = json.loads(segments_json)
|
| 1269 |
+
dummy_seg_path = _extract_segment_clip(
|
| 1270 |
+
_sv, 0, min(_total, HUNYUAN_MAX_DUR),
|
| 1271 |
+
os.path.join(tmp_dir, "_seg_dummy.mp4"),
|
| 1272 |
+
)
|
| 1273 |
seg_clip_paths = [
|
| 1274 |
_extract_segment_clip(_sv, s, e - s,
|
| 1275 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1276 |
for i, (s, e) in enumerate(segments)
|
| 1277 |
]
|
| 1278 |
|
| 1279 |
+
# Extract text features once for all segments
|
| 1280 |
+
_, text_feats, _ = feature_process(
|
| 1281 |
+
dummy_seg_path,
|
| 1282 |
+
prompt if prompt else "",
|
| 1283 |
+
model_dict, cfg,
|
| 1284 |
+
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1285 |
+
)
|
| 1286 |
+
|
| 1287 |
results = []
|
| 1288 |
+
for sample_idx in range(num_samples):
|
| 1289 |
seg_wavs = []
|
| 1290 |
sr = 48000
|
|
|
|
| 1291 |
_t0 = time.perf_counter()
|
| 1292 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1293 |
+
visual_feats, seg_audio_len = encode_video_features(
|
| 1294 |
+
seg_clip_paths[seg_i], model_dict)
|
| 1295 |
+
print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 1296 |
+
f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
|
| 1297 |
+
audio_batch, sr = denoise_process(
|
| 1298 |
+
visual_feats, text_feats, seg_audio_len, model_dict, cfg,
|
| 1299 |
+
guidance_scale=float(guidance_scale),
|
| 1300 |
+
num_inference_steps=int(num_steps),
|
| 1301 |
+
batch_size=1,
|
| 1302 |
)
|
| 1303 |
+
seg_wavs.append(audio_batch[0].float().cpu().numpy())
|
| 1304 |
+
|
| 1305 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1306 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1307 |
+
|
| 1308 |
+
# Save text_feats inside the GPU worker β never return CUDA tensors
|
| 1309 |
+
text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
|
| 1310 |
+
torch.save(text_feats, text_feats_path)
|
| 1311 |
+
print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
|
| 1312 |
results.append((seg_wavs, sr, text_feats_path))
|
| 1313 |
+
|
| 1314 |
+
if torch.cuda.is_available():
|
| 1315 |
+
torch.cuda.empty_cache()
|
| 1316 |
+
|
| 1317 |
return results
|
| 1318 |
|
| 1319 |
except Exception as _e:
|
|
|
|
| 1325 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1326 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1327 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1328 |
+
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
| 1329 |
num_samples = int(num_samples)
|
| 1330 |
crossfade_s = float(crossfade_s)
|
| 1331 |
crossfade_db = float(crossfade_db)
|
|
|
|
| 1335 |
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1336 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 1337 |
|
| 1338 |
+
# ββ GPU inference (all segments in one call) ββ
|
| 1339 |
+
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1340 |
+
guidance_scale, num_steps, model_size,
|
| 1341 |
+
crossfade_s, crossfade_db, num_samples,
|
| 1342 |
+
silent_video, json.dumps(segments), total_dur_s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1343 |
|
| 1344 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1345 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1346 |
+
_, _sr, text_feats_path = result
|
| 1347 |
+
return {"text_feats_path": text_feats_path}
|
| 1348 |
|
| 1349 |
outputs = _post_process_samples(
|
| 1350 |
results, model="hunyuan", tmp_dir=tmp_dir,
|