BoxOfColors Claude Sonnet 4.6 committed on
Commit
a6fff03
·
1 Parent(s): 1dcac2d

Fix HunyuanFoley initial gen: one GPU call per segment

Browse files

ZeroGPU free tier caps GPU windows at 60s regardless of what is requested.
With 2 segments, model load (~19s) + seg1 (~13s) + seg2 (~13s) = ~45s, but
the Transformers cache migration on first call pushes it over 60s, causing
the worker to be silently killed mid-segment-2 with no error message.

Fix: replace the single multi-segment GPU call with one GPU call per segment.
Each call: model load + 1 segment inference = ~32s, comfortably under 60s.
Text features are saved to disk after the first segment and reloaded by
subsequent segments to avoid re-running CLAP/SigLIP extraction each time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +140 -123
app.py CHANGED
@@ -1215,165 +1215,182 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1215
 
1216
 
1217
 
1218
- def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
1219
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1220
- num_samples, silent_video=None, segments_json=None, total_dur_s=None,
1221
- clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1222
- """Pre-GPU callable β€” must match _hunyuan_gpu_infer's input signature exactly.
1223
- silent_video, segments_json, total_dur_s, clip_start_s, clip_dur_s are extra
1224
- positional args that xregen passes; they must appear here so ZeroGPU doesn't
1225
- raise TypeError when forwarding all args to this duration fn."""
1226
- return _estimate_gpu_duration("hunyuan", int(num_samples), int(num_steps),
1227
- video_file=video_file, crossfade_s=crossfade_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1229
 
1230
- @spaces.GPU(duration=_hunyuan_duration)
1231
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1232
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1233
  num_samples, silent_video, segments_json, total_dur_s,
1234
  clip_start_s=0.0, clip_dur_s=None):
1235
- """GPU-only HunyuanFoley inference β€” model loading + feature extraction + denoising.
1236
- Returns list of (seg_wavs, sr, text_feats) per sample.
1237
-
1238
- All paths passed explicitly as positional args to survive ZeroGPU isolation.
1239
- When *clip_dur_s* is set, the clip is extracted inside the GPU window.
1240
- """
1241
  import traceback as _tb
1242
- print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
 
1243
  try:
1244
- return _hunyuan_gpu_infer_impl(
1245
- video_file, prompt, negative_prompt, seed_val,
1246
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1247
- num_samples, silent_video, segments_json, total_dur_s,
1248
- clip_start_s, clip_dur_s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  except Exception as _e:
1250
  print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
1251
  _tb.print_exc()
1252
  raise
1253
 
1254
- def _hunyuan_gpu_infer_impl(video_file, prompt, negative_prompt, seed_val,
1255
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1256
- num_samples, silent_video, segments_json, total_dur_s,
1257
- clip_start_s=0.0, clip_dur_s=None):
1258
- _ensure_syspath("HunyuanVideo-Foley")
1259
- from hunyuanvideo_foley.utils.model_utils import denoise_process
1260
- from hunyuanvideo_foley.utils.feature_utils import feature_process
1261
 
1262
- seed_val = _resolve_seed(seed_val)
 
 
 
1263
  num_samples = int(num_samples)
1264
  crossfade_s = float(crossfade_s)
1265
- total_dur_s = float(total_dur_s)
1266
- set_global_seed(seed_val)
1267
-
1268
- device, _ = _get_device_and_dtype()
1269
- model_size = model_size.lower()
1270
-
1271
- model_dict, cfg = _load_hunyuan_model(device, model_size)
1272
 
1273
- # Extract xregen clip inside GPU fn if needed (tmp files from caller invisible here).
1274
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1275
- if clip_dur_s is not None:
1276
- clip_dur_s = float(clip_dur_s)
1277
- clip_path = _extract_segment_clip(
1278
- silent_video, float(clip_start_s), clip_dur_s,
1279
- os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1280
- )
1281
- silent_video = clip_path
1282
- total_dur_s = clip_dur_s
1283
 
1284
- segments = json.loads(segments_json)
1285
- dummy_seg_path = _extract_segment_clip(
1286
- silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
1287
- os.path.join(tmp_dir, "_seg_dummy.mp4"),
1288
- )
1289
  seg_clip_paths = [
1290
  _extract_segment_clip(silent_video, s, e - s,
1291
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1292
  for i, (s, e) in enumerate(segments)
1293
  ]
1294
 
1295
- # Text feature extraction (GPU β€” runs once for all segments)
1296
- _, text_feats, _ = feature_process(
1297
- dummy_seg_path,
1298
- prompt if prompt else "",
1299
- model_dict,
1300
- cfg,
1301
- neg_prompt=negative_prompt if negative_prompt else None,
1302
- )
1303
-
1304
- # Import visual-only feature extractor to avoid redundant text extraction
1305
- # per segment (text_feats already computed once above for the whole batch).
1306
- from hunyuanvideo_foley.utils.feature_utils import encode_video_features
1307
-
1308
  results = []
1309
  for sample_idx in range(num_samples):
1310
  seg_wavs = []
1311
  sr = 48000
1312
- _t_hny_start = time.perf_counter()
 
1313
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1314
- seg_dur = seg_end - seg_start
1315
- seg_path = seg_clip_paths[seg_i]
1316
-
1317
- # Extract only visual features β€” reuse text_feats from above
1318
- visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
1319
- print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
1320
- f"{seg_start:.1f}–{seg_end:.1f}s β†’ {seg_audio_len:.2f}s audio")
1321
-
1322
- audio_batch, sr = denoise_process(
1323
- visual_feats,
1324
- text_feats,
1325
- seg_audio_len,
1326
- model_dict,
1327
- cfg,
1328
- guidance_scale=float(guidance_scale),
1329
- num_inference_steps=int(num_steps),
1330
- batch_size=1,
1331
  )
1332
- wav = audio_batch[0].float().cpu().numpy() # full window
1333
  seg_wavs.append(wav)
1334
-
1335
- _log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
1336
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1337
-
1338
- # Save text_feats to disk inside the GPU worker so we never pickle a CUDA
1339
- # tensor back to the main process (ZeroGPU forbids CUDA init in main process).
1340
- text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
1341
- torch.save(text_feats, text_feats_path)
1342
- print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
1343
  results.append((seg_wavs, sr, text_feats_path))
1344
 
1345
- # Free GPU memory between samples to prevent VRAM fragmentation
1346
- if torch.cuda.is_available():
1347
- torch.cuda.empty_cache()
1348
-
1349
- return results
1350
-
1351
-
1352
- def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1353
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1354
- """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1355
- CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
1356
- num_samples = int(num_samples)
1357
- crossfade_s = float(crossfade_s)
1358
- crossfade_db = float(crossfade_db)
1359
-
1360
- # ── CPU pre-processing (no GPU needed) ──
1361
- tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
1362
- video_file, HUNYUAN_MAX_DUR, crossfade_s)
1363
- print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
1364
-
1365
- # ── GPU inference only ──
1366
- results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1367
- guidance_scale, num_steps, model_size,
1368
- crossfade_s, crossfade_db, num_samples,
1369
- silent_video, json.dumps(segments), total_dur_s)
1370
-
1371
  # ── CPU post-processing (no GPU needed) ──
1372
  def _hunyuan_extras(sample_idx, result, td):
1373
- # text_feats was saved to disk inside the GPU worker (to avoid pickling CUDA
1374
- # tensors across the ZeroGPU process boundary); result[2] is the file path.
1375
- _, _sr, text_feats_path = result
1376
- return {"text_feats_path": text_feats_path}
1377
 
1378
  outputs = _post_process_samples(
1379
  results, model="hunyuan", tmp_dir=tmp_dir,
 
1215
 
1216
 
1217
 
1218
+ def _hunyuan_seg_duration(video_file, prompt, negative_prompt, seed_val,
1219
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1220
+ num_samples, silent_video=None, seg_clip_path=None,
1221
+ dummy_seg_path=None, text_feats_path=None,
1222
+ clip_start_s=0.0, clip_dur_s=None, **_kwargs):
1223
+ """Duration estimate for a single-segment HunyuanFoley GPU call.
1224
+ One segment Γ— num_steps + model load overhead β€” always fits in 60 s."""
1225
+ cfg = MODEL_CONFIGS["hunyuan"]
1226
+ secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
1227
+ print(f"[duration] HunyuanFoley 1seg: 1Γ—{int(num_steps)}steps β†’ {secs:.0f}s β†’ capped ", end="")
1228
+ return _clamp_duration(secs, "HunyuanFoley 1seg")
1229
+
1230
+
1231
+ @spaces.GPU(duration=_hunyuan_seg_duration)
1232
+ def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
1233
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1234
+ num_samples, silent_video, seg_clip_path,
1235
+ dummy_seg_path, text_feats_path,
1236
+ clip_start_s=0.0, clip_dur_s=None):
1237
+ """GPU-only HunyuanFoley inference for ONE segment.
1238
+
1239
+ text_feats_path: path to pre-saved text_feats .pt file, or empty string to
1240
+ extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
1241
+ """
1242
+ import traceback as _tb
1243
+ print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
1244
+ f"text_feats_path={text_feats_path!r}")
1245
+ try:
1246
+ _ensure_syspath("HunyuanVideo-Foley")
1247
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
1248
+ from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
1249
+
1250
+ device, _ = _get_device_and_dtype()
1251
+ model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
1252
 
1253
+ # Load or extract text features
1254
+ if text_feats_path and os.path.exists(text_feats_path):
1255
+ print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
1256
+ text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
1257
+ visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
1258
+ else:
1259
+ print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
1260
+ visual_feats, text_feats, seg_audio_len = feature_process(
1261
+ seg_clip_path,
1262
+ prompt if prompt else "",
1263
+ model_dict, cfg,
1264
+ neg_prompt=negative_prompt if negative_prompt else None,
1265
+ )
1266
+
1267
+ print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
1268
+ audio_batch, sr = denoise_process(
1269
+ visual_feats, text_feats, seg_audio_len, model_dict, cfg,
1270
+ guidance_scale=float(guidance_scale),
1271
+ num_inference_steps=int(num_steps),
1272
+ batch_size=1,
1273
+ )
1274
+ wav = audio_batch[0].float().cpu().numpy()
1275
+
1276
+ # Save text_feats to disk so next segment's GPU call can reuse it without
1277
+ # re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
1278
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1279
+ out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
1280
+ torch.save(text_feats, out_text_feats_path)
1281
+ print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
1282
+
1283
+ return wav, sr, out_text_feats_path
1284
+
1285
+ except Exception as _e:
1286
+ print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
1287
+ _tb.print_exc()
1288
+ raise
1289
 
1290
+ # Keep old name as alias for the xregen path which calls it directly
1291
  def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
1292
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1293
  num_samples, silent_video, segments_json, total_dur_s,
1294
  clip_start_s=0.0, clip_dur_s=None):
1295
+ """Wrapper used by xregen β€” single-segment call via _hunyuan_gpu_infer_one_seg."""
 
 
 
 
 
1296
  import traceback as _tb
1297
+ print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
1298
+ f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
1299
  try:
1300
+ _ensure_syspath("HunyuanVideo-Foley")
1301
+ from hunyuanvideo_foley.utils.feature_utils import feature_process
1302
+
1303
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1304
+ _sv = silent_video
1305
+ _total = float(total_dur_s)
1306
+ if clip_dur_s is not None:
1307
+ clip_path = _extract_segment_clip(
1308
+ silent_video, float(clip_start_s), float(clip_dur_s),
1309
+ os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
1310
+ )
1311
+ _sv = clip_path
1312
+ _total = float(clip_dur_s)
1313
+
1314
+ segments = json.loads(segments_json)
1315
+ seg_clip_paths = [
1316
+ _extract_segment_clip(_sv, s, e - s,
1317
+ os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1318
+ for i, (s, e) in enumerate(segments)
1319
+ ]
1320
+
1321
+ # One GPU call per segment β€” each fits in the 60 s ZeroGPU free-tier cap
1322
+ results = []
1323
+ for sample_idx in range(int(num_samples)):
1324
+ seg_wavs = []
1325
+ sr = 48000
1326
+ text_feats_path = ""
1327
+ _t0 = time.perf_counter()
1328
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
1329
+ print(f"[_hunyuan_gpu_infer] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1330
+ f"{seg_start:.1f}–{seg_end:.1f}s")
1331
+ wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1332
+ video_file, prompt, negative_prompt, seed_val,
1333
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1334
+ num_samples, _sv, seg_clip_paths[seg_i],
1335
+ seg_clip_paths[0], text_feats_path,
1336
+ clip_start_s, None,
1337
+ )
1338
+ seg_wavs.append(wav)
1339
+ _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
1340
+ len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
1341
+ results.append((seg_wavs, sr, text_feats_path))
1342
+ return results
1343
+
1344
  except Exception as _e:
1345
  print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
1346
  _tb.print_exc()
1347
  raise
1348
 
 
 
 
 
 
 
 
1349
 
1350
+ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1351
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
1352
+ """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
1353
+ One GPU call per segment to stay within ZeroGPU's 60 s free-tier cap."""
1354
  num_samples = int(num_samples)
1355
  crossfade_s = float(crossfade_s)
1356
+ crossfade_db = float(crossfade_db)
 
 
 
 
 
 
1357
 
1358
+ # ── CPU pre-processing (no GPU needed) ──
1359
+ tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
1360
+ video_file, HUNYUAN_MAX_DUR, crossfade_s)
1361
+ print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ— ≀15 s")
 
 
 
 
 
 
1362
 
 
 
 
 
 
1363
  seg_clip_paths = [
1364
  _extract_segment_clip(silent_video, s, e - s,
1365
  os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
1366
  for i, (s, e) in enumerate(segments)
1367
  ]
1368
 
1369
+ # ── One GPU call per segment ──
 
 
 
 
 
 
 
 
 
 
 
 
1370
  results = []
1371
  for sample_idx in range(num_samples):
1372
  seg_wavs = []
1373
  sr = 48000
1374
+ text_feats_path = ""
1375
+ _t0 = time.perf_counter()
1376
  for seg_i, (seg_start, seg_end) in enumerate(segments):
1377
+ print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
1378
+ f"{seg_start:.1f}–{seg_end:.1f}s")
1379
+ wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
1380
+ video_file, prompt, negative_prompt, seed_val,
1381
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
1382
+ num_samples, silent_video, seg_clip_paths[seg_i],
1383
+ seg_clip_paths[0], text_feats_path,
 
 
 
 
 
 
 
 
 
 
1384
  )
 
1385
  seg_wavs.append(wav)
1386
+ _log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
 
1387
  len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
 
 
 
 
 
 
1388
  results.append((seg_wavs, sr, text_feats_path))
1389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1390
  # ── CPU post-processing (no GPU needed) ──
1391
  def _hunyuan_extras(sample_idx, result, td):
1392
+ _, _sr, tfp = result
1393
+ return {"text_feats_path": tfp}
 
 
1394
 
1395
  outputs = _post_process_samples(
1396
  results, model="hunyuan", tmp_dir=tmp_dir,