Spaces:
Running on Zero
Running on Zero
Commit ·
2b2a599
1
Parent(s): f8b59b5
HunyuanFoley: sliding-window segmentation for videos longer than 15 s
Browse files
The model is hard-limited to 15 s per pass (MAX_VIDEO_DURATION_SECONDS=15
in constants.py, enforced in get_frames_av). For longer videos, slice the
input with ffmpeg into overlapping <=15 s segments, run feature_process +
denoise_process on each, then crossfade-stitch all segment wavs into a
single full-length audio track — same strategy as TARO. Text features are
encoded once from the first segment and reused across all segments.
app.py
CHANGED
|
@@ -507,33 +507,84 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 507 |
tmp_dir = tempfile.mkdtemp()
|
| 508 |
outputs = []
|
| 509 |
|
| 510 |
-
#
|
| 511 |
-
#
|
| 512 |
-
|
| 513 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
prompt if prompt else "",
|
| 515 |
model_dict,
|
| 516 |
cfg,
|
| 517 |
neg_prompt=negative_prompt if negative_prompt else None,
|
| 518 |
)
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
# denoise_process() runs the flow-matching diffusion loop and decodes with DAC-VAE
|
| 522 |
-
# batch_size=num_samples generates all samples in one pass
|
| 523 |
-
audio, sample_rate = denoise_process(
|
| 524 |
-
visual_feats,
|
| 525 |
-
text_feats,
|
| 526 |
-
audio_len_in_s,
|
| 527 |
-
model_dict,
|
| 528 |
-
cfg,
|
| 529 |
-
guidance_scale=float(guidance_scale),
|
| 530 |
-
num_inference_steps=int(num_steps),
|
| 531 |
-
batch_size=num_samples,
|
| 532 |
-
)
|
| 533 |
-
# audio shape: (batch, channels, samples)
|
| 534 |
for sample_idx in range(num_samples):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
|
| 536 |
-
torchaudio.save(audio_path,
|
| 537 |
video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
|
| 538 |
merge_audio_video(audio_path, video_file, video_path)
|
| 539 |
outputs.append((video_path, audio_path))
|
|
|
|
| 507 |
tmp_dir = tempfile.mkdtemp()
|
| 508 |
outputs = []
|
| 509 |
|
| 510 |
+
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 511 |
+
# input into overlapping segments, generate audio for each, then crossfade-
|
| 512 |
+
# stitch the results into a single full-length audio track.
|
| 513 |
+
total_dur_s = get_video_duration(video_file)
|
| 514 |
+
CF_S = 2.0 # crossfade seconds between segments
|
| 515 |
+
CF_DB = 3.0 # crossfade boost in dB
|
| 516 |
+
segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
|
| 517 |
+
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 518 |
+
|
| 519 |
+
# Pre-encode text features once (same for every segment)
|
| 520 |
+
_dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
|
| 521 |
+
ffmpeg.input(video_file, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
|
| 522 |
+
_dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 523 |
+
).run(overwrite_output=True, quiet=True)
|
| 524 |
+
_, text_feats, _ = feature_process(
|
| 525 |
+
_dummy_seg_path,
|
| 526 |
prompt if prompt else "",
|
| 527 |
model_dict,
|
| 528 |
cfg,
|
| 529 |
neg_prompt=negative_prompt if negative_prompt else None,
|
| 530 |
)
|
| 531 |
+
|
| 532 |
+
# Generate audio per segment, then stitch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
for sample_idx in range(num_samples):
|
| 534 |
+
seg_wavs = []
|
| 535 |
+
sr = 48000 # HunyuanFoley always outputs 48 kHz
|
| 536 |
+
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 537 |
+
seg_dur = seg_end - seg_start
|
| 538 |
+
seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
|
| 539 |
+
ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
|
| 540 |
+
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 541 |
+
).run(overwrite_output=True, quiet=True)
|
| 542 |
+
|
| 543 |
+
visual_feats, _, seg_audio_len = feature_process(
|
| 544 |
+
seg_path,
|
| 545 |
+
prompt if prompt else "",
|
| 546 |
+
model_dict,
|
| 547 |
+
cfg,
|
| 548 |
+
neg_prompt=negative_prompt if negative_prompt else None,
|
| 549 |
+
)
|
| 550 |
+
print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 551 |
+
f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
|
| 552 |
+
|
| 553 |
+
audio_batch, sr = denoise_process(
|
| 554 |
+
visual_feats,
|
| 555 |
+
text_feats,
|
| 556 |
+
seg_audio_len,
|
| 557 |
+
model_dict,
|
| 558 |
+
cfg,
|
| 559 |
+
guidance_scale=float(guidance_scale),
|
| 560 |
+
num_inference_steps=int(num_steps),
|
| 561 |
+
batch_size=1,
|
| 562 |
+
)
|
| 563 |
+
# audio_batch shape: (1, channels, samples) — take first (and only) sample
|
| 564 |
+
wav = audio_batch[0].float().cpu().numpy() # (channels, samples)
|
| 565 |
+
# Trim to exact segment length in samples
|
| 566 |
+
seg_samples = int(round(seg_dur * sr))
|
| 567 |
+
wav = wav[:, :seg_samples]
|
| 568 |
+
seg_wavs.append(wav)
|
| 569 |
+
|
| 570 |
+
# Stitch segments with crossfade (operates on (channels, samples) arrays)
|
| 571 |
+
def _cf_join_stereo(a, b, cf_s, db):
|
| 572 |
+
cf = int(round(cf_s * sr))
|
| 573 |
+
cf = min(cf, a.shape[1], b.shape[1])
|
| 574 |
+
if cf <= 0:
|
| 575 |
+
return np.concatenate([a, b], axis=1)
|
| 576 |
+
gain = 10 ** (db / 20.0)
|
| 577 |
+
overlap = a[:, -cf:] * gain + b[:, :cf] * gain
|
| 578 |
+
return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
|
| 579 |
+
|
| 580 |
+
full_wav = seg_wavs[0]
|
| 581 |
+
for nw in seg_wavs[1:]:
|
| 582 |
+
full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB)
|
| 583 |
+
# Trim to exact video duration
|
| 584 |
+
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 585 |
+
|
| 586 |
audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
|
| 587 |
+
torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
|
| 588 |
video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
|
| 589 |
merge_audio_video(audio_path, video_file, video_path)
|
| 590 |
outputs.append((video_path, audio_path))
|