BoxOfColors Claude Sonnet 4.6 committed on
Commit
a6fff03
·
1 Parent(s): 1dcac2d

Fix HunyuanFoley initial gen: one GPU call per segment

Browse files

ZeroGPU free tier caps GPU windows at 60s regardless of what is requested.
With 2 segments, model load (~19s) + seg1 (~13s) + seg2 (~13s) = ~45s, but
the Transformers cache migration on first call pushes it over 60s, causing
the worker to be silently killed mid-segment-2 with no error message.

Fix: replace the single multi-segment GPU call with one GPU call per segment.
Each call: model load + 1 segment inference = ~32s, comfortably under 60s.
Text features are saved to disk after the first segment and reloaded by
subsequent segments to avoid re-running CLAP/SigLIP extraction each time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +140 -123
app.py CHANGED
@@ -1215,165 +1215,182 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1215
 
1216
 
1217
 
1218
- def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1219
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1220
- num_samples, silent_video=None, segments_json=None, total_dur_s=None,
1221
- clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1222
- """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input signature exactly.
1223
- silent_video, segments_json, total_dur_s, clip_start_s, clip_dur_s are extra
1224
- positional args that xregen passes; they must appear here so ZeroGPU doesn't
1225
- raise TypeError when forwarding all args to this duration fn."""
1226
- return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1227
- video_file=video_file, crossfade_s=crossfade_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1229
 
1230
- @spaces.GPU(duration=_hunyuan_duration)
1231
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1232
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1233
  num_samples, silent_video, segments_json, total_dur_s,
1234
  clip_start_s=0.0, clip_dur_s=None):
1235
- """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1236
- Returns list of (seg_wavs, sr, text_feats) per sample.
1237
-
1238
- All paths passed explicitly as positional args to survive ZeroGPU isolation.
1239
- When *clip_dur_s* is set, the clip is extracted inside the GPU window.
1240
- """
1241
  import traceback as _tb
1242
- print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
 
1243
  try:
1244
- return _hunyuan_gpu_infer_impl(
1245
- video_file, prompt, negative_prompt, seed_val,
1246
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1247
- num_samples, silent_video, segments_json, total_dur_s,
1248
- clip_start_s, clip_dur_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  except Exception as _e:
1250
  print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
1251
  _tb.print_exc()
1252
  raise
1253
 
1254
- def _hunyuan_gpu_infer_impl(video_file, prompt, negative_prompt, seed_val,
1255
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1256
- num_samples, silent_video, segments_json, total_dur_s,
1257
- clip_start_s=0.0, clip_dur_s=None):
1258
- _ensure_syspath("HunyuanVideo-Foley")
1259
- from hunyuanvideo_foley.utils.model_utils import denoise_process
1260
- from hunyuanvideo_foley.utils.feature_utils import feature_process
1261
 
1262
- seed_val = _resolve_seed(seed_val)
 
 
 
1263
  num_samples = int(num_samples)
1264
  crossfade_s = float(crossfade_s)
1265
- total_dur_s = float(total_dur_s)
1266
- set_global_seed(seed_val)
1267
-
1268
- device, _ = _get_device_and_dtype()
1269
- model_size = model_size.lower()
1270
-
1271
- model_dict, cfg = _load_hunyuan_model(device, model_size)
1272
 
1273
- # Extract xregen clip inside GPU fn if needed (tmp files from caller invisible here).
1274
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1275
- if clip_dur_s is not None:
1276
- clip_dur_s = float(clip_dur_s)
1277
- clip_path = _extract_segment_clip(
1278
- silent_video, float(clip_start_s), clip_dur_s,
1279
- os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1280
- )
1281
- silent_video = clip_path
1282
- total_dur_s = clip_dur_s
1283
 
1284
- segments = json.loads(segments_json)
1285
- dummy_seg_path = _extract_segment_clip(
1286
- silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1287
- os.path.join(tmp_dir, "_seg_dummy.mp4"),
1288
- )
1289
  seg_clip_paths = [
1290
  _extract_segment_clip(silent_video, s, e - s,
1291
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1292
  for i, (s, e) in enumerate(segments)
1293
  ]
1294
 
1295
- # Text feature extraction (GPU β€” runs once for all segments)
1296
- _, text_feats, _ = feature_process(
1297
- dummy_seg_path,
1298
- prompt if prompt else "",
1299
- model_dict,
1300
- cfg,
1301
- neg_prompt=negative_prompt if negative_prompt else None,
1302
- )
1303
-
1304
- # Import visual-only feature extractor to avoid redundant text extraction
1305
- # per segment (text_feats already computed once above for the whole batch).
1306
- from hunyuanvideo_foley.utils.feature_utils import encode_video_features
1307
-
1308
  results = []
1309
  for sample_idx in range(num_samples):
1310
  seg_wavs = []
1311
  sr = 48000
1312
- _t_hny_start = time.perf_counter()
 
1313
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1314
- seg_dur = seg_end - seg_start
1315
- seg_path = seg_clip_paths[seg_i]
1316
-
1317
- # Extract only visual features β€” reuse text_feats from above
1318
- visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
1319
- print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
1320
- f"{seg_start:.1f}–{seg_end:.1f}s β†’ {seg_audio_len:.2f}s audio")
1321
-
1322
- audio_batch, sr = denoise_process(
1323
- visual_feats,
1324
- text_feats,
1325
- seg_audio_len,
1326
- model_dict,
1327
- cfg,
1328
- guidance_scale=float(guidance_scale),
1329
- num_inference_steps=int(num_steps),
1330
- batch_size=1,
1331
  )
1332
- wav = audio_batch[0].float().cpu().numpy() # full window
1333
  seg_wavs.append(wav)
1334
-
1335
- _log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
1336
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1337
-
1338
- # Save text_feats to disk inside the GPU worker so we never pickle a CUDA
1339
- # tensor back to the main process (ZeroGPU forbids CUDA init in main process).
1340
- text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
1341
- torch.save(text_feats, text_feats_path)
1342
- print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
1343
  results.append((seg_wavs, sr, text_feats_path))
1344
 
1345
- # Free GPU memory between samples to prevent VRAM fragmentation
1346
- if torch.cuda.is_available():
1347
- torch.cuda.empty_cache()
1348
-
1349
- return results
1350
-
1351
-
1352
- def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1353
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1354
- """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1355
- CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
1356
- num_samples = int(num_samples)
1357
- crossfade_s = float(crossfade_s)
1358
- crossfade_db = float(crossfade_db)
1359
-
1360
- # ── CPU pre-processing (no GPU needed) ──
1361
- tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
1362
- video_file, HUNYUAN_MAX_DUR, crossfade_s)
1363
- print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1364
-
1365
- # ── GPU inference only ──
1366
- results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1367
- guidance_scale, num_steps, model_size,
1368
- crossfade_s, crossfade_db, num_samples,
1369
- silent_video, json.dumps(segments), total_dur_s)
1370
-
1371
  # ── CPU post-processing (no GPU needed) ──
1372
  def _hunyuan_extras(sample_idx, result, td):
1373
- # text_feats was saved to disk inside the GPU worker (to avoid pickling CUDA
1374
- # tensors across the ZeroGPU process boundary); result[2] is the file path.
1375
- _, _sr, text_feats_path = result
1376
- return {"text_feats_path": text_feats_path}
1377
 
1378
  outputs = _post_process_samples(
1379
  results, model="hunyuan", tmp_dir=tmp_dir,
 
1215
 
1216
 
1217
 
1218
+ def _hunyuan_seg_duration(video_file, prompt, negative_prompt, seed_val,
1219
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1220
+ num_samples, silent_video=None, seg_clip_path=None,
1221
+ dummy_seg_path=None, text_feats_path=None,
1222
+ clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1223
+ """Duration estimate for a single-segment HunyuanFoley GPU call.
1224
+ One segment Γ— num_steps + model load overhead β€” always fits in 60 s."""
1225
+ cfg = MODEL_CONFIGS["hunyuan"]
1226
+ secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
1227
+ print(f"[duration] HunyuanFoley 1seg: 1Γ—{int(num_steps)}steps β†’ {secs:.0f}s β†’ capped ", end="")
1228
+ return _clamp_duration(secs, "HunyuanFoley 1seg")
1229
+
1230
+
1231
+ @spaces.GPU(duration=_hunyuan_seg_duration)
1232
+ def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
1233
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1234
+ num_samples, silent_video, seg_clip_path,
1235
+ dummy_seg_path, text_feats_path,
1236
+ clip_start_s=0.0, clip_dur_s=None):
1237
+ """GPU-only HunyuanFoley inference for ONE segment.
1238
+
1239
+ text_feats_path: path to pre-saved text_feats .pt file, or empty string to
1240
+ extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
1241
+ """
1242
+ import traceback as _tb
1243
+ print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
1244
+ f"text_feats_path={text_feats_path!r}")
1245
+ try:
1246
+ _ensure_syspath("HunyuanVideo-Foley")
1247
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
1248
+ from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
1249
+
1250
+ device, _ = _get_device_and_dtype()
1251
+ model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
1252
 
1253
+ # Load or extract text features
1254
+ if text_feats_path and os.path.exists(text_feats_path):
1255
+ print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
1256
+ text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
1257
+ visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
1258
+ else:
1259
+ print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
1260
+ visual_feats, text_feats, seg_audio_len = feature_process(
1261
+ seg_clip_path,
1262
+ prompt if prompt else "",
1263
+ model_dict, cfg,
1264
+ neg_prompt=negative_prompt if negative_prompt else None,
1265
+ )
1266
+
1267
+ print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
1268
+ audio_batch, sr = denoise_process(
1269
+ visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1270
+ guidance_scale=float(guidance_scale),
1271
+ num_inference_steps=int(num_steps),
1272
+ batch_size=1,
1273
+ )
1274
+ wav = audio_batch[0].float().cpu().numpy()
1275
+
1276
+ # Save text_feats to disk so next segment's GPU call can reuse it without
1277
+ # re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
1278
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1279
+ out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
1280
+ torch.save(text_feats, out_text_feats_path)
1281
+ print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
1282
+
1283
+ return wav, sr, out_text_feats_path
1284
+
1285
+ except Exception as _e:
1286
+ print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
1287
+ _tb.print_exc()
1288
+ raise
1289
 
1290
+ # Keep old name as alias for the xregen path which calls it directly
1291
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1292
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1293
  num_samples, silent_video, segments_json, total_dur_s,
1294
  clip_start_s=0.0, clip_dur_s=None):
1295
+ """Wrapper used by xregen β€” single-segment call via _hunyuan_gpu_infer_one_seg."""
 
 
 
 
 
1296
  import traceback as _tb
1297
+ print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
1298
+ f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
1299
  try:
1300
+ _ensure_syspath("HunyuanVideo-Foley")
1301
+ from hunyuanvideo_foley.utils.feature_utils import feature_process
1302
+
1303
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1304
+ _sv = silent_video
1305
+ _total = float(total_dur_s)
1306
+ if clip_dur_s is not None:
1307
+ clip_path = _extract_segment_clip(
1308
+ silent_video, float(clip_start_s), float(clip_dur_s),
1309
+ os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1310
+ )
1311
+ _sv = clip_path
1312
+ _total = float(clip_dur_s)
1313
+
1314
+ segments = json.loads(segments_json)
1315
+ seg_clip_paths = [
1316
+ _extract_segment_clip(_sv, s, e - s,
1317
+ os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1318
+ for i, (s, e) in enumerate(segments)
1319
+ ]
1320
+
1321
+ # One GPU call per segment β€” each fits in the 60 s ZeroGPU free-tier cap
1322
+ results = []
1323
+ for sample_idx in range(int(num_samples)):
1324
+ seg_wavs = []
1325
+ sr = 48000
1326
+ text_feats_path = ""
1327
+ _t0 = time.perf_counter()
1328
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
1329
+ print(f"[_hunyuan_gpu_infer] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1330
+ f"{seg_start:.1f}–{seg_end:.1f}s")
1331
+ wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1332
+ video_file, prompt, negative_prompt, seed_val,
1333
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1334
+ num_samples, _sv, seg_clip_paths[seg_i],
1335
+ seg_clip_paths[0], text_feats_path,
1336
+ clip_start_s, None,
1337
+ )
1338
+ seg_wavs.append(wav)
1339
+ _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
1340
+ len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1341
+ results.append((seg_wavs, sr, text_feats_path))
1342
+ return results
1343
+
1344
  except Exception as _e:
1345
  print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
1346
  _tb.print_exc()
1347
  raise
1348
 
 
 
 
 
 
 
 
1349
 
1350
+ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1351
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1352
+ """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1353
+ One GPU call per segment to stay within ZeroGPU's 60 s free-tier cap."""
1354
  num_samples = int(num_samples)
1355
  crossfade_s = float(crossfade_s)
1356
+ crossfade_db = float(crossfade_db)
 
 
 
 
 
 
1357
 
1358
+ # ── CPU pre-processing (no GPU needed) ──
1359
+ tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
1360
+ video_file, HUNYUAN_MAX_DUR, crossfade_s)
1361
+ print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
 
 
 
 
 
 
1362
 
 
 
 
 
 
1363
  seg_clip_paths = [
1364
  _extract_segment_clip(silent_video, s, e - s,
1365
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1366
  for i, (s, e) in enumerate(segments)
1367
  ]
1368
 
1369
+ # ── One GPU call per segment ──
 
 
 
 
 
 
 
 
 
 
 
 
1370
  results = []
1371
  for sample_idx in range(num_samples):
1372
  seg_wavs = []
1373
  sr = 48000
1374
+ text_feats_path = ""
1375
+ _t0 = time.perf_counter()
1376
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1377
+ print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1378
+ f"{seg_start:.1f}–{seg_end:.1f}s")
1379
+ wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1380
+ video_file, prompt, negative_prompt, seed_val,
1381
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1382
+ num_samples, silent_video, seg_clip_paths[seg_i],
1383
+ seg_clip_paths[0], text_feats_path,
 
 
 
 
 
 
 
 
 
 
1384
  )
 
1385
  seg_wavs.append(wav)
1386
+ _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
 
1387
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
 
 
 
 
 
 
1388
  results.append((seg_wavs, sr, text_feats_path))
1389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1390
  # ── CPU post-processing (no GPU needed) ──
1391
  def _hunyuan_extras(sample_idx, result, td):
1392
+ _, _sr, tfp = result
1393
+ return {"text_feats_path": tfp}
 
 
1394
 
1395
  outputs = _post_process_samples(
1396
  results, model="hunyuan", tmp_dir=tmp_dir,