BoxOfColors committed on
Commit
d9fa683
·
1 Parent(s): 47fd0ad

Refactor: reduce technical debt across app.py

Browse files

- Remove unused `html` import; move `import math` to top-level
- Add _resolve_seed() helper to unify inconsistent seed handling across
TARO/MMAudio/HunyuanFoley (was: 3 different patterns; now: one call)
- Remove redundant torch.device() wrapping in _hunyuan_gpu_infer and
_regen_hunyuan_gpu (device is already a string from _get_device_and_dtype)
- Fix all unregistered mkdtemp() calls in regen GPU functions so temp dirs
are tracked and cleaned up (prevents /tmp accumulation on long-running Spaces)
- Fix misleading comments in _xregen_splice offset alignment logic

Files changed (1) hide show
  1. app.py +20 -22
app.py CHANGED
@@ -8,7 +8,7 @@ Supported models
8
  HunyuanFoley – text-guided foley via SigLIP2 + Synchformer + CLAP (48 kHz, up to 15 s)
9
  """
10
 
11
- import html as _html
12
  import os
13
  import sys
14
  import json
@@ -210,6 +210,12 @@ def set_global_seed(seed: int) -> None:
210
  def get_random_seed() -> int:
211
  return random.randint(0, 2**32 - 1)
212
 
 
 
 
 
 
 
213
  def get_video_duration(video_path: str) -> float:
214
  """Return video duration in seconds (CPU only)."""
215
  probe = ffmpeg.probe(video_path)
@@ -432,7 +438,6 @@ def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) ->
432
  crossfade_s = min(crossfade_s, window_s * 0.5)
433
  if total_dur_s <= window_s:
434
  return [(0.0, total_dur_s)]
435
- import math
436
  step_min = window_s - crossfade_s # minimum step to honour crossfade
437
  n = math.ceil((total_dur_s - crossfade_s) / step_min)
438
  n = max(n, 2)
@@ -849,11 +854,9 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
849
  crossfade_s, crossfade_db, num_samples):
850
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
851
  Returns list of (wavs_list, onset_feats) per sample."""
852
- seed_val = int(seed_val)
853
  crossfade_s = float(crossfade_s)
854
  num_samples = int(num_samples)
855
- if seed_val < 0:
856
- seed_val = random.randint(0, 2**32 - 1)
857
 
858
  torch.set_grad_enabled(False)
859
  device, weight_dtype = _get_device_and_dtype()
@@ -1005,7 +1008,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1005
  from mmaudio.eval_utils import generate, load_video
1006
  from mmaudio.model.flow_matching import FlowMatching
1007
 
1008
- seed_val = int(seed_val)
1009
  num_samples = int(num_samples)
1010
  crossfade_s = float(crossfade_s)
1011
 
@@ -1022,10 +1025,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1022
  results = []
1023
  for sample_idx in range(num_samples):
1024
  rng = torch.Generator(device=device)
1025
- if seed_val >= 0:
1026
- rng.manual_seed(seed_val + sample_idx)
1027
- else:
1028
- rng.seed()
1029
 
1030
  seg_audios = []
1031
  _t_mma_start = time.perf_counter()
@@ -1149,14 +1149,12 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1149
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1150
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1151
 
1152
- seed_val = int(seed_val)
1153
  num_samples = int(num_samples)
1154
  crossfade_s = float(crossfade_s)
1155
- if seed_val >= 0:
1156
- set_global_seed(seed_val)
1157
 
1158
  device, _ = _get_device_and_dtype()
1159
- device = torch.device(device)
1160
  model_size = model_size.lower()
1161
 
1162
  model_dict, cfg = _load_hunyuan_model(device, model_size)
@@ -1375,7 +1373,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1375
  from TARO.onset_util import extract_onset
1376
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
1377
  silent_video = meta["silent_video"]
1378
- tmp_dir = tempfile.mkdtemp()
1379
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1380
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
1381
  del extract_cavp, onset_model
@@ -1446,7 +1444,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1446
  # This avoids any cross-process context passing that fails under ZeroGPU isolation.
1447
  seg_path = _extract_segment_clip(
1448
  meta["silent_video"], seg_start, seg_dur,
1449
- os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1450
  )
1451
 
1452
  rng = torch.Generator(device=device)
@@ -1521,7 +1519,6 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1521
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1522
 
1523
  device, _ = _get_device_and_dtype()
1524
- device = torch.device(device)
1525
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1526
 
1527
  set_global_seed(random.randint(0, 2**32 - 1))
@@ -1529,7 +1526,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1529
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1530
  seg_path = _extract_segment_clip(
1531
  meta["silent_video"], seg_start, seg_dur,
1532
- os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
1533
  )
1534
 
1535
  text_feats_path = meta.get("text_feats_path", "")
@@ -1661,13 +1658,14 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
1661
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1662
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1663
 
1664
- # If the clip started before the original segment start, prepend silence
1665
- # so that sample index 0 of new_wav corresponds to seg_start in video time.
 
 
1666
  if clip_start_s is not None:
1667
  seg_start = meta["segments"][seg_idx][0]
1668
- offset_s = seg_start - clip_start_s # positive = seg starts after clip start
1669
  if offset_s < 0:
1670
- # clip started after seg_start — prepend silence to align
1671
  pad_samples = int(round(abs(offset_s) * slot_sr))
1672
  silence = np.zeros(
1673
  (new_wav.shape[0], pad_samples) if new_wav.ndim == 2 else pad_samples,
 
8
  HunyuanFoley – text-guided foley via SigLIP2 + Synchformer + CLAP (48 kHz, up to 15 s)
9
  """
10
 
11
+ import math
12
  import os
13
  import sys
14
  import json
 
210
  def get_random_seed() -> int:
211
  return random.randint(0, 2**32 - 1)
212
 
213
+ def _resolve_seed(seed_val) -> int:
214
+ """Normalise seed_val to a non-negative int.
215
+ Negative values (UI default 'random') produce a fresh random seed."""
216
+ seed_val = int(seed_val)
217
+ return seed_val if seed_val >= 0 else get_random_seed()
218
+
219
  def get_video_duration(video_path: str) -> float:
220
  """Return video duration in seconds (CPU only)."""
221
  probe = ffmpeg.probe(video_path)
 
438
  crossfade_s = min(crossfade_s, window_s * 0.5)
439
  if total_dur_s <= window_s:
440
  return [(0.0, total_dur_s)]
 
441
  step_min = window_s - crossfade_s # minimum step to honour crossfade
442
  n = math.ceil((total_dur_s - crossfade_s) / step_min)
443
  n = max(n, 2)
 
854
  crossfade_s, crossfade_db, num_samples):
855
  """GPU-only TARO inference — model loading + feature extraction + diffusion.
856
  Returns list of (wavs_list, onset_feats) per sample."""
857
+ seed_val = _resolve_seed(seed_val)
858
  crossfade_s = float(crossfade_s)
859
  num_samples = int(num_samples)
 
 
860
 
861
  torch.set_grad_enabled(False)
862
  device, weight_dtype = _get_device_and_dtype()
 
1008
  from mmaudio.eval_utils import generate, load_video
1009
  from mmaudio.model.flow_matching import FlowMatching
1010
 
1011
+ seed_val = _resolve_seed(seed_val)
1012
  num_samples = int(num_samples)
1013
  crossfade_s = float(crossfade_s)
1014
 
 
1025
  results = []
1026
  for sample_idx in range(num_samples):
1027
  rng = torch.Generator(device=device)
1028
+ rng.manual_seed(seed_val + sample_idx)
 
 
 
1029
 
1030
  seg_audios = []
1031
  _t_mma_start = time.perf_counter()
 
1149
  from hunyuanvideo_foley.utils.model_utils import denoise_process
1150
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1151
 
1152
+ seed_val = _resolve_seed(seed_val)
1153
  num_samples = int(num_samples)
1154
  crossfade_s = float(crossfade_s)
1155
+ set_global_seed(seed_val)
 
1156
 
1157
  device, _ = _get_device_and_dtype()
 
1158
  model_size = model_size.lower()
1159
 
1160
  model_dict, cfg = _load_hunyuan_model(device, model_size)
 
1373
  from TARO.onset_util import extract_onset
1374
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
1375
  silent_video = meta["silent_video"]
1376
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1377
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1378
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
1379
  del extract_cavp, onset_model
 
1444
  # This avoids any cross-process context passing that fails under ZeroGPU isolation.
1445
  seg_path = _extract_segment_clip(
1446
  meta["silent_video"], seg_start, seg_dur,
1447
+ os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1448
  )
1449
 
1450
  rng = torch.Generator(device=device)
 
1519
  from hunyuanvideo_foley.utils.feature_utils import feature_process
1520
 
1521
  device, _ = _get_device_and_dtype()
 
1522
  model_dict, cfg = _load_hunyuan_model(device, model_size)
1523
 
1524
  set_global_seed(random.randint(0, 2**32 - 1))
 
1526
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1527
  seg_path = _extract_segment_clip(
1528
  meta["silent_video"], seg_start, seg_dur,
1529
+ os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1530
  )
1531
 
1532
  text_feats_path = meta.get("text_feats_path", "")
 
1658
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1659
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1660
 
1661
+ # Align new_wav so sample index 0 corresponds to seg_start in video time.
1662
+ # _stitch_wavs trims using seg_start as the time origin, so if the clip
1663
+ # started AFTER seg_start (clip_start_s > seg_start), we prepend silence
1664
+ # equal to (clip_start_s - seg_start) to shift the audio back to seg_start.
1665
  if clip_start_s is not None:
1666
  seg_start = meta["segments"][seg_idx][0]
1667
+ offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
1668
  if offset_s < 0:
 
1669
  pad_samples = int(round(abs(offset_s) * slot_sr))
1670
  silence = np.zeros(
1671
  (new_wav.shape[0], pad_samples) if new_wav.ndim == 2 else pad_samples,