BoxOfColors Claude Sonnet 4.6 committed on
Commit
b94c46b
·
1 Parent(s): a6fff03

Revert to single multi-seg GPU call; bump hunyuan load_overhead to 90s

Browse files

- Raise HunyuanFoley load_overhead 55→90 s to account for cold-disk 10 GB
weight load (~73 s measured) plus aux model init (~8 s)
- Raise _clamp_duration floor 60→120 s — Pro ZeroGPU users get 300 s/call,
so 120 s floor safely covers worst-case cold-start without wasting budget
- Replace per-segment GPU call architecture (_hunyuan_gpu_infer_one_seg ×N)
with a single multi-segment @spaces.GPU call (_hunyuan_gpu_infer) that
loads the model once and loops over all segments — avoids reloading the
10 GB weights N times which would exceed the Pro time budget entirely
- Duration estimate now uses _estimate_gpu_duration("hunyuan", num_samples,
num_steps) which scales with actual work (segments × steps) + 90 s overhead

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +72 -118
app.py CHANGED
@@ -553,7 +553,7 @@ MODEL_CONFIGS = {
553
  "window_s": 15.0, # HunyuanFoley max video duration
554
  "sr": 48000,
555
  "secs_per_step": 0.35, # measured 0.328 s/step on H200
556
- "load_overhead": 55, # ~55s to load the 10 GB XXL weights
557
  "tab_prefix": "hf",
558
  "label": "HunyuanFoley",
559
  "regen_fn": None,
@@ -569,8 +569,10 @@ HUNYUAN_SECS_PER_STEP = MODEL_CONFIGS["hunyuan"]["secs_per_step"]
569
 
570
 
571
  def _clamp_duration(secs: float, label: str) -> int:
572
- """Clamp a raw GPU-seconds estimate to [60, GPU_DURATION_CAP] and log it."""
573
- result = min(GPU_DURATION_CAP, max(60, int(secs)))
 
 
574
  print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
575
  return result
576
 
@@ -1215,94 +1217,46 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1215
 
1216
 
1217
 
1218
- def _hunyuan_seg_duration(video_file, prompt, negative_prompt, seed_val,
1219
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1220
- num_samples, silent_video=None, seg_clip_path=None,
1221
- dummy_seg_path=None, text_feats_path=None,
1222
- clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1223
- """Duration estimate for a single-segment HunyuanFoley GPU call.
1224
- One segment × num_steps + model load overhead — always fits in 60 s."""
1225
- cfg = MODEL_CONFIGS["hunyuan"]
1226
- secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
1227
- print(f"[duration] HunyuanFoley 1seg: 1×{int(num_steps)}steps → {secs:.0f}s → capped ", end="")
1228
- return _clamp_duration(secs, "HunyuanFoley 1seg")
1229
-
1230
-
1231
- @spaces.GPU(duration=_hunyuan_seg_duration)
1232
- def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
1233
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1234
- num_samples, silent_video, seg_clip_path,
1235
- dummy_seg_path, text_feats_path,
1236
- clip_start_s=0.0, clip_dur_s=None):
1237
- """GPU-only HunyuanFoley inference for ONE segment.
1238
-
1239
- text_feats_path: path to pre-saved text_feats .pt file, or empty string to
1240
- extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
1241
- """
1242
- import traceback as _tb
1243
- print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
1244
- f"text_feats_path={text_feats_path!r}")
1245
- try:
1246
- _ensure_syspath("HunyuanVideo-Foley")
1247
- from hunyuanvideo_foley.utils.model_utils import denoise_process
1248
- from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
1249
-
1250
- device, _ = _get_device_and_dtype()
1251
- model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
1252
-
1253
- # Load or extract text features
1254
- if text_feats_path and os.path.exists(text_feats_path):
1255
- print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
1256
- text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
1257
- visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
1258
- else:
1259
- print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
1260
- visual_feats, text_feats, seg_audio_len = feature_process(
1261
- seg_clip_path,
1262
- prompt if prompt else "",
1263
- model_dict, cfg,
1264
- neg_prompt=negative_prompt if negative_prompt else None,
1265
- )
1266
-
1267
- print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
1268
- audio_batch, sr = denoise_process(
1269
- visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1270
- guidance_scale=float(guidance_scale),
1271
- num_inference_steps=int(num_steps),
1272
- batch_size=1,
1273
- )
1274
- wav = audio_batch[0].float().cpu().numpy()
1275
-
1276
- # Save text_feats to disk so next segment's GPU call can reuse it without
1277
- # re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
1278
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1279
- out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
1280
- torch.save(text_feats, out_text_feats_path)
1281
- print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
1282
-
1283
- return wav, sr, out_text_feats_path
1284
 
1285
- except Exception as _e:
1286
- print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
1287
- _tb.print_exc()
1288
- raise
1289
 
1290
- # Keep old name as alias for the xregen path which calls it directly
1291
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1292
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1293
  num_samples, silent_video, segments_json, total_dur_s,
1294
  clip_start_s=0.0, clip_dur_s=None):
1295
- """Wrapper used by xregen — single-segment call via _hunyuan_gpu_infer_one_seg."""
 
 
1296
  import traceback as _tb
1297
  print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
1298
  f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
1299
  try:
1300
  _ensure_syspath("HunyuanVideo-Foley")
1301
- from hunyuanvideo_foley.utils.feature_utils import feature_process
 
 
 
 
 
 
 
 
 
 
1302
 
1303
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1304
  _sv = silent_video
1305
- _total = float(total_dur_s)
1306
  if clip_dur_s is not None:
1307
  clip_path = _extract_segment_clip(
1308
  silent_video, float(clip_start_s), float(clip_dur_s),
@@ -1312,33 +1266,54 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1312
  _total = float(clip_dur_s)
1313
 
1314
  segments = json.loads(segments_json)
 
 
 
 
1315
  seg_clip_paths = [
1316
  _extract_segment_clip(_sv, s, e - s,
1317
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1318
  for i, (s, e) in enumerate(segments)
1319
  ]
1320
 
1321
- # One GPU call per segment — each fits in the 60 s ZeroGPU free-tier cap
 
 
 
 
 
 
 
1322
  results = []
1323
- for sample_idx in range(int(num_samples)):
1324
  seg_wavs = []
1325
  sr = 48000
1326
- text_feats_path = ""
1327
  _t0 = time.perf_counter()
1328
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1329
- print(f"[_hunyuan_gpu_infer] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1330
- f"{seg_start:.1f}–{seg_end:.1f}s")
1331
- wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1332
- video_file, prompt, negative_prompt, seed_val,
1333
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1334
- num_samples, _sv, seg_clip_paths[seg_i],
1335
- seg_clip_paths[0], text_feats_path,
1336
- clip_start_s, None,
 
1337
  )
1338
- seg_wavs.append(wav)
 
1339
  _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
1340
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
 
 
 
 
 
1341
  results.append((seg_wavs, sr, text_feats_path))
 
 
 
 
1342
  return results
1343
 
1344
  except Exception as _e:
@@ -1350,7 +1325,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1350
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1351
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1352
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1353
- One GPU call per segment to stay within ZeroGPU's 60 s free-tier cap."""
1354
  num_samples = int(num_samples)
1355
  crossfade_s = float(crossfade_s)
1356
  crossfade_db = float(crossfade_db)
@@ -1360,37 +1335,16 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1360
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1361
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
1362
 
1363
- seg_clip_paths = [
1364
- _extract_segment_clip(silent_video, s, e - s,
1365
- os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1366
- for i, (s, e) in enumerate(segments)
1367
- ]
1368
-
1369
- # ── One GPU call per segment ──
1370
- results = []
1371
- for sample_idx in range(num_samples):
1372
- seg_wavs = []
1373
- sr = 48000
1374
- text_feats_path = ""
1375
- _t0 = time.perf_counter()
1376
- for seg_i, (seg_start, seg_end) in enumerate(segments):
1377
- print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1378
- f"{seg_start:.1f}–{seg_end:.1f}s")
1379
- wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1380
- video_file, prompt, negative_prompt, seed_val,
1381
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1382
- num_samples, silent_video, seg_clip_paths[seg_i],
1383
- seg_clip_paths[0], text_feats_path,
1384
- )
1385
- seg_wavs.append(wav)
1386
- _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
1387
- len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1388
- results.append((seg_wavs, sr, text_feats_path))
1389
 
1390
  # ── CPU post-processing (no GPU needed) ──
1391
  def _hunyuan_extras(sample_idx, result, td):
1392
- _, _sr, tfp = result
1393
- return {"text_feats_path": tfp}
1394
 
1395
  outputs = _post_process_samples(
1396
  results, model="hunyuan", tmp_dir=tmp_dir,
 
553
  "window_s": 15.0, # HunyuanFoley max video duration
554
  "sr": 48000,
555
  "secs_per_step": 0.35, # measured 0.328 s/step on H200
556
+ "load_overhead": 90, # cold disk: ~73s for 10 GB weights + ~8s aux models
557
  "tab_prefix": "hf",
558
  "label": "HunyuanFoley",
559
  "regen_fn": None,
 
569
 
570
 
571
  def _clamp_duration(secs: float, label: str) -> int:
572
+ """Clamp a raw GPU-seconds estimate to [120, GPU_DURATION_CAP] and log it.
573
+ ZeroGPU Pro users get up to 300 s per call; 120 s floor covers cold-disk
574
+ model loads (e.g. HunyuanFoley XXL ~73 s on first access)."""
575
+ result = min(GPU_DURATION_CAP, max(120, int(secs)))
576
  print(f"[duration] {label}: {secs:.0f}s raw → {result}s reserved")
577
  return result
578
 
 
1217
 
1218
 
1219
 
1220
+ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1221
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1222
+ num_samples, silent_video=None, segments_json=None, total_dur_s=None,
1223
+ clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1224
+ """Pre-GPU callable — must match _hunyuan_gpu_infer's input signature exactly.
1225
+ silent_video, segments_json, total_dur_s, clip_start_s, clip_dur_s are extra
1226
+ positional args that xregen passes; they must appear here so ZeroGPU doesn't
1227
+ raise TypeError when forwarding all args to this duration fn."""
1228
+ return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1229
+ video_file=video_file, crossfade_s=crossfade_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1230
 
 
 
 
 
1231
 
1232
+ @spaces.GPU(duration=_hunyuan_duration)
1233
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1234
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1235
  num_samples, silent_video, segments_json, total_dur_s,
1236
  clip_start_s=0.0, clip_dur_s=None):
1237
+ """GPU-only HunyuanFoley inference — model loading + feature extraction + denoising.
1238
+ All segments processed in a single GPU call (Pro ZeroGPU allows up to 300 s).
1239
+ """
1240
  import traceback as _tb
1241
  print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
1242
  f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
1243
  try:
1244
  _ensure_syspath("HunyuanVideo-Foley")
1245
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
1246
+ from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
1247
+
1248
+ seed_val = _resolve_seed(seed_val)
1249
+ num_samples = int(num_samples)
1250
+ crossfade_s = float(crossfade_s)
1251
+ total_dur_s = float(total_dur_s)
1252
+ set_global_seed(seed_val)
1253
+
1254
+ device, _ = _get_device_and_dtype()
1255
+ model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
1256
 
1257
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1258
  _sv = silent_video
1259
+ _total = total_dur_s
1260
  if clip_dur_s is not None:
1261
  clip_path = _extract_segment_clip(
1262
  silent_video, float(clip_start_s), float(clip_dur_s),
 
1266
  _total = float(clip_dur_s)
1267
 
1268
  segments = json.loads(segments_json)
1269
+ dummy_seg_path = _extract_segment_clip(
1270
+ _sv, 0, min(_total, HUNYUAN_MAX_DUR),
1271
+ os.path.join(tmp_dir, "_seg_dummy.mp4"),
1272
+ )
1273
  seg_clip_paths = [
1274
  _extract_segment_clip(_sv, s, e - s,
1275
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1276
  for i, (s, e) in enumerate(segments)
1277
  ]
1278
 
1279
+ # Extract text features once for all segments
1280
+ _, text_feats, _ = feature_process(
1281
+ dummy_seg_path,
1282
+ prompt if prompt else "",
1283
+ model_dict, cfg,
1284
+ neg_prompt=negative_prompt if negative_prompt else None,
1285
+ )
1286
+
1287
  results = []
1288
+ for sample_idx in range(num_samples):
1289
  seg_wavs = []
1290
  sr = 48000
 
1291
  _t0 = time.perf_counter()
1292
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1293
+ visual_feats, seg_audio_len = encode_video_features(
1294
+ seg_clip_paths[seg_i], model_dict)
1295
+ print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
1296
+ f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
1297
+ audio_batch, sr = denoise_process(
1298
+ visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1299
+ guidance_scale=float(guidance_scale),
1300
+ num_inference_steps=int(num_steps),
1301
+ batch_size=1,
1302
  )
1303
+ seg_wavs.append(audio_batch[0].float().cpu().numpy())
1304
+
1305
  _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
1306
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1307
+
1308
+ # Save text_feats inside the GPU worker — never return CUDA tensors
1309
+ text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
1310
+ torch.save(text_feats, text_feats_path)
1311
+ print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
1312
  results.append((seg_wavs, sr, text_feats_path))
1313
+
1314
+ if torch.cuda.is_available():
1315
+ torch.cuda.empty_cache()
1316
+
1317
  return results
1318
 
1319
  except Exception as _e:
 
1325
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1326
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1327
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1328
+ CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
1329
  num_samples = int(num_samples)
1330
  crossfade_s = float(crossfade_s)
1331
  crossfade_db = float(crossfade_db)
 
1335
  video_file, HUNYUAN_MAX_DUR, crossfade_s)
1336
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
1337
 
1338
+ # ── GPU inference (all segments in one call) ──
1339
+ results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1340
+ guidance_scale, num_steps, model_size,
1341
+ crossfade_s, crossfade_db, num_samples,
1342
+ silent_video, json.dumps(segments), total_dur_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1343
 
1344
  # ── CPU post-processing (no GPU needed) ──
1345
  def _hunyuan_extras(sample_idx, result, td):
1346
+ _, _sr, text_feats_path = result
1347
+ return {"text_feats_path": text_feats_path}
1348
 
1349
  outputs = _post_process_samples(
1350
  results, model="hunyuan", tmp_dir=tmp_dir,