Revert to single multi-seg GPU call; bump hunyuan load_overhead to 90s
- Raise HunyuanFoley load_overhead 55→90 s to account for cold-disk 10 GB
weight load (~73 s measured) plus aux model init (~8 s)
- Raise _clamp_duration floor 60→120 s — Pro ZeroGPU users get 300 s/call,
so 120 s floor safely covers worst-case cold-start without wasting budget
- Replace per-segment GPU call architecture (_hunyuan_gpu_infer_one_seg ×N)
with a single multi-segment @spaces.GPU call (_hunyuan_gpu_infer) that
loads the model once and loops over all segments — avoids reloading the
10 GB weights N times which would exceed the Pro time budget entirely
- Duration estimate now uses _estimate_gpu_duration("hunyuan", num_samples,
num_steps) which scales with actual work (segments × steps) + 90 s overhead
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -553,7 +553,7 @@ MODEL_CONFIGS = {
|
|
| 553 |
"window_s": 15.0, # HunyuanFoley max video duration
|
| 554 |
"sr": 48000,
|
| 555 |
"secs_per_step": 0.35, # measured 0.328 s/step on H200
|
| 556 |
-
"load_overhead":
|
| 557 |
"tab_prefix": "hf",
|
| 558 |
"label": "HunyuanFoley",
|
| 559 |
"regen_fn": None,
|
|
@@ -569,8 +569,10 @@ HUNYUAN_SECS_PER_STEP = MODEL_CONFIGS["hunyuan"]["secs_per_step"]
|
|
| 569 |
|
| 570 |
|
| 571 |
def _clamp_duration(secs: float, label: str) -> int:
|
| 572 |
-
"""Clamp a raw GPU-seconds estimate to [
|
| 573 |
-
|
|
|
|
|
|
|
| 574 |
print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
|
| 575 |
return result
|
| 576 |
|
|
@@ -1215,94 +1217,46 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1215 |
|
| 1216 |
|
| 1217 |
|
| 1218 |
-
def
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
return _clamp_duration(secs, "HunyuanFoley 1seg")
|
| 1229 |
-
|
| 1230 |
-
|
| 1231 |
-
@spaces.GPU(duration=_hunyuan_seg_duration)
|
| 1232 |
-
def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
|
| 1233 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1234 |
-
num_samples, silent_video, seg_clip_path,
|
| 1235 |
-
dummy_seg_path, text_feats_path,
|
| 1236 |
-
clip_start_s=0.0, clip_dur_s=None):
|
| 1237 |
-
"""GPU-only HunyuanFoley inference for ONE segment.
|
| 1238 |
-
|
| 1239 |
-
text_feats_path: path to pre-saved text_feats .pt file, or empty string to
|
| 1240 |
-
extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
|
| 1241 |
-
"""
|
| 1242 |
-
import traceback as _tb
|
| 1243 |
-
print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
|
| 1244 |
-
f"text_feats_path={text_feats_path!r}")
|
| 1245 |
-
try:
|
| 1246 |
-
_ensure_syspath("HunyuanVideo-Foley")
|
| 1247 |
-
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1248 |
-
from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
|
| 1249 |
-
|
| 1250 |
-
device, _ = _get_device_and_dtype()
|
| 1251 |
-
model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
|
| 1252 |
-
|
| 1253 |
-
# Load or extract text features
|
| 1254 |
-
if text_feats_path and os.path.exists(text_feats_path):
|
| 1255 |
-
print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
|
| 1256 |
-
text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
|
| 1257 |
-
visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
|
| 1258 |
-
else:
|
| 1259 |
-
print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
|
| 1260 |
-
visual_feats, text_feats, seg_audio_len = feature_process(
|
| 1261 |
-
seg_clip_path,
|
| 1262 |
-
prompt if prompt else "",
|
| 1263 |
-
model_dict, cfg,
|
| 1264 |
-
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1265 |
-
)
|
| 1266 |
-
|
| 1267 |
-
print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
|
| 1268 |
-
audio_batch, sr = denoise_process(
|
| 1269 |
-
visual_feats, text_feats, seg_audio_len, model_dict, cfg,
|
| 1270 |
-
guidance_scale=float(guidance_scale),
|
| 1271 |
-
num_inference_steps=int(num_steps),
|
| 1272 |
-
batch_size=1,
|
| 1273 |
-
)
|
| 1274 |
-
wav = audio_batch[0].float().cpu().numpy()
|
| 1275 |
-
|
| 1276 |
-
# Save text_feats to disk so next segment's GPU call can reuse it without
|
| 1277 |
-
# re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
|
| 1278 |
-
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1279 |
-
out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
|
| 1280 |
-
torch.save(text_feats, out_text_feats_path)
|
| 1281 |
-
print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
|
| 1282 |
-
|
| 1283 |
-
return wav, sr, out_text_feats_path
|
| 1284 |
|
| 1285 |
-
except Exception as _e:
|
| 1286 |
-
print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
|
| 1287 |
-
_tb.print_exc()
|
| 1288 |
-
raise
|
| 1289 |
|
| 1290 |
-
|
| 1291 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1292 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1293 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1294 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1295 |
-
"""
|
|
|
|
|
|
|
| 1296 |
import traceback as _tb
|
| 1297 |
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
|
| 1298 |
f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
|
| 1299 |
try:
|
| 1300 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1301 |
-
from hunyuanvideo_foley.utils.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1302 |
|
| 1303 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1304 |
_sv = silent_video
|
| 1305 |
-
_total =
|
| 1306 |
if clip_dur_s is not None:
|
| 1307 |
clip_path = _extract_segment_clip(
|
| 1308 |
silent_video, float(clip_start_s), float(clip_dur_s),
|
|
@@ -1312,33 +1266,54 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1312 |
_total = float(clip_dur_s)
|
| 1313 |
|
| 1314 |
segments = json.loads(segments_json)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
seg_clip_paths = [
|
| 1316 |
_extract_segment_clip(_sv, s, e - s,
|
| 1317 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1318 |
for i, (s, e) in enumerate(segments)
|
| 1319 |
]
|
| 1320 |
|
| 1321 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1322 |
results = []
|
| 1323 |
-
for sample_idx in range(
|
| 1324 |
seg_wavs = []
|
| 1325 |
sr = 48000
|
| 1326 |
-
text_feats_path = ""
|
| 1327 |
_t0 = time.perf_counter()
|
| 1328 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1329 |
-
|
| 1330 |
-
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
|
| 1334 |
-
|
| 1335 |
-
|
| 1336 |
-
|
|
|
|
| 1337 |
)
|
| 1338 |
-
seg_wavs.append(
|
|
|
|
| 1339 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1340 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1341 |
results.append((seg_wavs, sr, text_feats_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1342 |
return results
|
| 1343 |
|
| 1344 |
except Exception as _e:
|
|
@@ -1350,7 +1325,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1350 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1351 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1352 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1353 |
-
|
| 1354 |
num_samples = int(num_samples)
|
| 1355 |
crossfade_s = float(crossfade_s)
|
| 1356 |
crossfade_db = float(crossfade_db)
|
|
@@ -1360,37 +1335,16 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1360 |
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1361 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 1362 |
|
| 1363 |
-
|
| 1364 |
-
|
| 1365 |
-
|
| 1366 |
-
|
| 1367 |
-
|
| 1368 |
-
|
| 1369 |
-
# ββ One GPU call per segment ββ
|
| 1370 |
-
results = []
|
| 1371 |
-
for sample_idx in range(num_samples):
|
| 1372 |
-
seg_wavs = []
|
| 1373 |
-
sr = 48000
|
| 1374 |
-
text_feats_path = ""
|
| 1375 |
-
_t0 = time.perf_counter()
|
| 1376 |
-
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1377 |
-
print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
|
| 1378 |
-
f"{seg_start:.1f}–{seg_end:.1f}s")
|
| 1379 |
-
wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
|
| 1380 |
-
video_file, prompt, negative_prompt, seed_val,
|
| 1381 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1382 |
-
num_samples, silent_video, seg_clip_paths[seg_i],
|
| 1383 |
-
seg_clip_paths[0], text_feats_path,
|
| 1384 |
-
)
|
| 1385 |
-
seg_wavs.append(wav)
|
| 1386 |
-
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1387 |
-
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1388 |
-
results.append((seg_wavs, sr, text_feats_path))
|
| 1389 |
|
| 1390 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1391 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1392 |
-
_, _sr,
|
| 1393 |
-
return {"text_feats_path":
|
| 1394 |
|
| 1395 |
outputs = _post_process_samples(
|
| 1396 |
results, model="hunyuan", tmp_dir=tmp_dir,
|
|
|
|
| 553 |
"window_s": 15.0, # HunyuanFoley max video duration
|
| 554 |
"sr": 48000,
|
| 555 |
"secs_per_step": 0.35, # measured 0.328 s/step on H200
|
| 556 |
+
"load_overhead": 90, # cold disk: ~73s for 10 GB weights + ~8s aux models
|
| 557 |
"tab_prefix": "hf",
|
| 558 |
"label": "HunyuanFoley",
|
| 559 |
"regen_fn": None,
|
|
|
|
| 569 |
|
| 570 |
|
| 571 |
def _clamp_duration(secs: float, label: str) -> int:
|
| 572 |
+
"""Clamp a raw GPU-seconds estimate to [120, GPU_DURATION_CAP] and log it.
|
| 573 |
+
ZeroGPU Pro users get up to 300 s per call; 120 s floor covers cold-disk
|
| 574 |
+
model loads (e.g. HunyuanFoley XXL ~73 s on first access)."""
|
| 575 |
+
result = min(GPU_DURATION_CAP, max(120, int(secs)))
|
| 576 |
print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
|
| 577 |
return result
|
| 578 |
|
|
|
|
| 1217 |
|
| 1218 |
|
| 1219 |
|
| 1220 |
+
def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
| 1221 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1222 |
+
num_samples, silent_video=None, segments_json=None, total_dur_s=None,
|
| 1223 |
+
clip_start_s=0.0, clip_dur_s=None, **_kwargs):
|
| 1224 |
+
"""Pre-GPU callable β must match _hunyuan_gpu_infer's input signature exactly.
|
| 1225 |
+
silent_video, segments_json, total_dur_s, clip_start_s, clip_dur_s are extra
|
| 1226 |
+
positional args that xregen passes; they must appear here so ZeroGPU doesn't
|
| 1227 |
+
raise TypeError when forwarding all args to this duration fn."""
|
| 1228 |
+
return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
|
| 1229 |
+
video_file=video_file, crossfade_s=crossfade_s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1231 |
|
| 1232 |
+
@spaces.GPU(duration=_hunyuan_duration)
|
| 1233 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1234 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1235 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1236 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1237 |
+
"""GPU-only HunyuanFoley inference β model loading + feature extraction + denoising.
|
| 1238 |
+
All segments processed in a single GPU call (Pro ZeroGPU allows up to 300 s).
|
| 1239 |
+
"""
|
| 1240 |
import traceback as _tb
|
| 1241 |
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
|
| 1242 |
f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
|
| 1243 |
try:
|
| 1244 |
_ensure_syspath("HunyuanVideo-Foley")
|
| 1245 |
+
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1246 |
+
from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
|
| 1247 |
+
|
| 1248 |
+
seed_val = _resolve_seed(seed_val)
|
| 1249 |
+
num_samples = int(num_samples)
|
| 1250 |
+
crossfade_s = float(crossfade_s)
|
| 1251 |
+
total_dur_s = float(total_dur_s)
|
| 1252 |
+
set_global_seed(seed_val)
|
| 1253 |
+
|
| 1254 |
+
device, _ = _get_device_and_dtype()
|
| 1255 |
+
model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
|
| 1256 |
|
| 1257 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1258 |
_sv = silent_video
|
| 1259 |
+
_total = total_dur_s
|
| 1260 |
if clip_dur_s is not None:
|
| 1261 |
clip_path = _extract_segment_clip(
|
| 1262 |
silent_video, float(clip_start_s), float(clip_dur_s),
|
|
|
|
| 1266 |
_total = float(clip_dur_s)
|
| 1267 |
|
| 1268 |
segments = json.loads(segments_json)
|
| 1269 |
+
dummy_seg_path = _extract_segment_clip(
|
| 1270 |
+
_sv, 0, min(_total, HUNYUAN_MAX_DUR),
|
| 1271 |
+
os.path.join(tmp_dir, "_seg_dummy.mp4"),
|
| 1272 |
+
)
|
| 1273 |
seg_clip_paths = [
|
| 1274 |
_extract_segment_clip(_sv, s, e - s,
|
| 1275 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1276 |
for i, (s, e) in enumerate(segments)
|
| 1277 |
]
|
| 1278 |
|
| 1279 |
+
# Extract text features once for all segments
|
| 1280 |
+
_, text_feats, _ = feature_process(
|
| 1281 |
+
dummy_seg_path,
|
| 1282 |
+
prompt if prompt else "",
|
| 1283 |
+
model_dict, cfg,
|
| 1284 |
+
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1285 |
+
)
|
| 1286 |
+
|
| 1287 |
results = []
|
| 1288 |
+
for sample_idx in range(num_samples):
|
| 1289 |
seg_wavs = []
|
| 1290 |
sr = 48000
|
|
|
|
| 1291 |
_t0 = time.perf_counter()
|
| 1292 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1293 |
+
visual_feats, seg_audio_len = encode_video_features(
|
| 1294 |
+
seg_clip_paths[seg_i], model_dict)
|
| 1295 |
+
print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 1296 |
+
f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
|
| 1297 |
+
audio_batch, sr = denoise_process(
|
| 1298 |
+
visual_feats, text_feats, seg_audio_len, model_dict, cfg,
|
| 1299 |
+
guidance_scale=float(guidance_scale),
|
| 1300 |
+
num_inference_steps=int(num_steps),
|
| 1301 |
+
batch_size=1,
|
| 1302 |
)
|
| 1303 |
+
seg_wavs.append(audio_batch[0].float().cpu().numpy())
|
| 1304 |
+
|
| 1305 |
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1306 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1307 |
+
|
| 1308 |
+
# Save text_feats inside the GPU worker β never return CUDA tensors
|
| 1309 |
+
text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
|
| 1310 |
+
torch.save(text_feats, text_feats_path)
|
| 1311 |
+
print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
|
| 1312 |
results.append((seg_wavs, sr, text_feats_path))
|
| 1313 |
+
|
| 1314 |
+
if torch.cuda.is_available():
|
| 1315 |
+
torch.cuda.empty_cache()
|
| 1316 |
+
|
| 1317 |
return results
|
| 1318 |
|
| 1319 |
except Exception as _e:
|
|
|
|
| 1325 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1326 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1327 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1328 |
+
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
| 1329 |
num_samples = int(num_samples)
|
| 1330 |
crossfade_s = float(crossfade_s)
|
| 1331 |
crossfade_db = float(crossfade_db)
|
|
|
|
| 1335 |
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1336 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 1337 |
|
| 1338 |
+
# ββ GPU inference (all segments in one call) ββ
|
| 1339 |
+
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1340 |
+
guidance_scale, num_steps, model_size,
|
| 1341 |
+
crossfade_s, crossfade_db, num_samples,
|
| 1342 |
+
silent_video, json.dumps(segments), total_dur_s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1343 |
|
| 1344 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1345 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1346 |
+
_, _sr, text_feats_path = result
|
| 1347 |
+
return {"text_feats_path": text_feats_path}
|
| 1348 |
|
| 1349 |
outputs = _post_process_samples(
|
| 1350 |
results, model="hunyuan", tmp_dir=tmp_dir,
|