Commit ·
a6fff03
1
Parent(s): 1dcac2d
Fix HunyuanFoley initial gen: one GPU call per segment
Browse files
ZeroGPU free tier caps GPU windows at 60s regardless of what is requested.
With 2 segments, model load (~19s) + seg1 (~13s) + seg2 (~13s) = ~45s, but
the Transformers cache migration on first call pushes it over 60s, causing
the worker to be silently killed mid-segment-2 with no error message.
Fix: replace the single multi-segment GPU call with one GPU call per segment.
Each call: model load + 1 segment inference = ~32s, comfortably under 60s.
Text features are saved to disk after the first segment and reloaded by
subsequent segments to avoid re-running CLAP/SigLIP extraction each time.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -1215,165 +1215,182 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1215 |
|
| 1216 |
|
| 1217 |
|
| 1218 |
-
def
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1229 |
|
| 1230 |
-
|
| 1231 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1232 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1233 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1234 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1235 |
-
"""
|
| 1236 |
-
Returns list of (seg_wavs, sr, text_feats) per sample.
|
| 1237 |
-
|
| 1238 |
-
All paths passed explicitly as positional args to survive ZeroGPU isolation.
|
| 1239 |
-
When *clip_dur_s* is set, the clip is extracted inside the GPU window.
|
| 1240 |
-
"""
|
| 1241 |
import traceback as _tb
|
| 1242 |
-
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r}
|
|
|
|
| 1243 |
try:
|
| 1244 |
-
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1249 |
except Exception as _e:
|
| 1250 |
print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
|
| 1251 |
_tb.print_exc()
|
| 1252 |
raise
|
| 1253 |
|
| 1254 |
-
def _hunyuan_gpu_infer_impl(video_file, prompt, negative_prompt, seed_val,
|
| 1255 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1256 |
-
num_samples, silent_video, segments_json, total_dur_s,
|
| 1257 |
-
clip_start_s=0.0, clip_dur_s=None):
|
| 1258 |
-
_ensure_syspath("HunyuanVideo-Foley")
|
| 1259 |
-
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1260 |
-
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
| 1261 |
|
| 1262 |
-
|
|
|
|
|
|
|
|
|
|
| 1263 |
num_samples = int(num_samples)
|
| 1264 |
crossfade_s = float(crossfade_s)
|
| 1265 |
-
|
| 1266 |
-
set_global_seed(seed_val)
|
| 1267 |
-
|
| 1268 |
-
device, _ = _get_device_and_dtype()
|
| 1269 |
-
model_size = model_size.lower()
|
| 1270 |
-
|
| 1271 |
-
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1272 |
|
| 1273 |
-
#
|
| 1274 |
-
tmp_dir =
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
clip_path = _extract_segment_clip(
|
| 1278 |
-
silent_video, float(clip_start_s), clip_dur_s,
|
| 1279 |
-
os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
|
| 1280 |
-
)
|
| 1281 |
-
silent_video = clip_path
|
| 1282 |
-
total_dur_s = clip_dur_s
|
| 1283 |
|
| 1284 |
-
segments = json.loads(segments_json)
|
| 1285 |
-
dummy_seg_path = _extract_segment_clip(
|
| 1286 |
-
silent_video, 0, min(total_dur_s, HUNYUAN_MAX_DUR),
|
| 1287 |
-
os.path.join(tmp_dir, "_seg_dummy.mp4"),
|
| 1288 |
-
)
|
| 1289 |
seg_clip_paths = [
|
| 1290 |
_extract_segment_clip(silent_video, s, e - s,
|
| 1291 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1292 |
for i, (s, e) in enumerate(segments)
|
| 1293 |
]
|
| 1294 |
|
| 1295 |
-
#
|
| 1296 |
-
_, text_feats, _ = feature_process(
|
| 1297 |
-
dummy_seg_path,
|
| 1298 |
-
prompt if prompt else "",
|
| 1299 |
-
model_dict,
|
| 1300 |
-
cfg,
|
| 1301 |
-
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1302 |
-
)
|
| 1303 |
-
|
| 1304 |
-
# Import visual-only feature extractor to avoid redundant text extraction
|
| 1305 |
-
# per segment (text_feats already computed once above for the whole batch).
|
| 1306 |
-
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 1307 |
-
|
| 1308 |
results = []
|
| 1309 |
for sample_idx in range(num_samples):
|
| 1310 |
seg_wavs = []
|
| 1311 |
sr = 48000
|
| 1312 |
-
|
|
|
|
| 1313 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
audio_batch, sr = denoise_process(
|
| 1323 |
-
visual_feats,
|
| 1324 |
-
text_feats,
|
| 1325 |
-
seg_audio_len,
|
| 1326 |
-
model_dict,
|
| 1327 |
-
cfg,
|
| 1328 |
-
guidance_scale=float(guidance_scale),
|
| 1329 |
-
num_inference_steps=int(num_steps),
|
| 1330 |
-
batch_size=1,
|
| 1331 |
)
|
| 1332 |
-
wav = audio_batch[0].float().cpu().numpy() # full window
|
| 1333 |
seg_wavs.append(wav)
|
| 1334 |
-
|
| 1335 |
-
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t_hny_start,
|
| 1336 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1337 |
-
|
| 1338 |
-
# Save text_feats to disk inside the GPU worker so we never pickle a CUDA
|
| 1339 |
-
# tensor back to the main process (ZeroGPU forbids CUDA init in main process).
|
| 1340 |
-
text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
|
| 1341 |
-
torch.save(text_feats, text_feats_path)
|
| 1342 |
-
print(f"[HunyuanFoley] text_feats saved to {text_feats_path}")
|
| 1343 |
results.append((seg_wavs, sr, text_feats_path))
|
| 1344 |
|
| 1345 |
-
# Free GPU memory between samples to prevent VRAM fragmentation
|
| 1346 |
-
if torch.cuda.is_available():
|
| 1347 |
-
torch.cuda.empty_cache()
|
| 1348 |
-
|
| 1349 |
-
return results
|
| 1350 |
-
|
| 1351 |
-
|
| 1352 |
-
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1353 |
-
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1354 |
-
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1355 |
-
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
| 1356 |
-
num_samples = int(num_samples)
|
| 1357 |
-
crossfade_s = float(crossfade_s)
|
| 1358 |
-
crossfade_db = float(crossfade_db)
|
| 1359 |
-
|
| 1360 |
-
# ββ CPU pre-processing (no GPU needed) ββ
|
| 1361 |
-
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 1362 |
-
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1363 |
-
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ β€15 s")
|
| 1364 |
-
|
| 1365 |
-
# ββ GPU inference only ββ
|
| 1366 |
-
results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1367 |
-
guidance_scale, num_steps, model_size,
|
| 1368 |
-
crossfade_s, crossfade_db, num_samples,
|
| 1369 |
-
silent_video, json.dumps(segments), total_dur_s)
|
| 1370 |
-
|
| 1371 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1372 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1373 |
-
|
| 1374 |
-
|
| 1375 |
-
_, _sr, text_feats_path = result
|
| 1376 |
-
return {"text_feats_path": text_feats_path}
|
| 1377 |
|
| 1378 |
outputs = _post_process_samples(
|
| 1379 |
results, model="hunyuan", tmp_dir=tmp_dir,
|
|
|
|
| 1215 |
|
| 1216 |
|
| 1217 |
|
| 1218 |
+
def _hunyuan_seg_duration(video_file, prompt, negative_prompt, seed_val,
|
| 1219 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1220 |
+
num_samples, silent_video=None, seg_clip_path=None,
|
| 1221 |
+
dummy_seg_path=None, text_feats_path=None,
|
| 1222 |
+
clip_start_s=0.0, clip_dur_s=None, **_kwargs):
|
| 1223 |
+
"""Duration estimate for a single-segment HunyuanFoley GPU call.
|
| 1224 |
+
One segment Γ num_steps + model load overhead β always fits in 60 s."""
|
| 1225 |
+
cfg = MODEL_CONFIGS["hunyuan"]
|
| 1226 |
+
secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
| 1227 |
+
print(f"[duration] HunyuanFoley 1seg: 1Γ{int(num_steps)}steps β {secs:.0f}s β capped ", end="")
|
| 1228 |
+
return _clamp_duration(secs, "HunyuanFoley 1seg")
|
| 1229 |
+
|
| 1230 |
+
|
| 1231 |
+
@spaces.GPU(duration=_hunyuan_seg_duration)
|
| 1232 |
+
def _hunyuan_gpu_infer_one_seg(video_file, prompt, negative_prompt, seed_val,
|
| 1233 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1234 |
+
num_samples, silent_video, seg_clip_path,
|
| 1235 |
+
dummy_seg_path, text_feats_path,
|
| 1236 |
+
clip_start_s=0.0, clip_dur_s=None):
|
| 1237 |
+
"""GPU-only HunyuanFoley inference for ONE segment.
|
| 1238 |
+
|
| 1239 |
+
text_feats_path: path to pre-saved text_feats .pt file, or empty string to
|
| 1240 |
+
extract fresh (first segment). Returns (wav_numpy, sr, text_feats_path).
|
| 1241 |
+
"""
|
| 1242 |
+
import traceback as _tb
|
| 1243 |
+
print(f"[_hunyuan_gpu_infer_one_seg] START seg_clip={seg_clip_path!r} "
|
| 1244 |
+
f"text_feats_path={text_feats_path!r}")
|
| 1245 |
+
try:
|
| 1246 |
+
_ensure_syspath("HunyuanVideo-Foley")
|
| 1247 |
+
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 1248 |
+
from hunyuanvideo_foley.utils.feature_utils import feature_process, encode_video_features
|
| 1249 |
+
|
| 1250 |
+
device, _ = _get_device_and_dtype()
|
| 1251 |
+
model_dict, cfg = _load_hunyuan_model(device, model_size.lower())
|
| 1252 |
|
| 1253 |
+
# Load or extract text features
|
| 1254 |
+
if text_feats_path and os.path.exists(text_feats_path):
|
| 1255 |
+
print(f"[_hunyuan_gpu_infer_one_seg] loading cached text_feats from {text_feats_path}")
|
| 1256 |
+
text_feats = torch.load(text_feats_path, map_location=device, weights_only=False)
|
| 1257 |
+
visual_feats, seg_audio_len = encode_video_features(seg_clip_path, model_dict)
|
| 1258 |
+
else:
|
| 1259 |
+
print(f"[_hunyuan_gpu_infer_one_seg] extracting text+visual features")
|
| 1260 |
+
visual_feats, text_feats, seg_audio_len = feature_process(
|
| 1261 |
+
seg_clip_path,
|
| 1262 |
+
prompt if prompt else "",
|
| 1263 |
+
model_dict, cfg,
|
| 1264 |
+
neg_prompt=negative_prompt if negative_prompt else None,
|
| 1265 |
+
)
|
| 1266 |
+
|
| 1267 |
+
print(f"[_hunyuan_gpu_infer_one_seg] denoising {seg_audio_len:.2f}s audio")
|
| 1268 |
+
audio_batch, sr = denoise_process(
|
| 1269 |
+
visual_feats, text_feats, seg_audio_len, model_dict, cfg,
|
| 1270 |
+
guidance_scale=float(guidance_scale),
|
| 1271 |
+
num_inference_steps=int(num_steps),
|
| 1272 |
+
batch_size=1,
|
| 1273 |
+
)
|
| 1274 |
+
wav = audio_batch[0].float().cpu().numpy()
|
| 1275 |
+
|
| 1276 |
+
# Save text_feats to disk so next segment's GPU call can reuse it without
|
| 1277 |
+
# re-running CLAP/SigLIP, and so we never return a CUDA tensor to main process.
|
| 1278 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1279 |
+
out_text_feats_path = os.path.join(tmp_dir, "hunyuan_text_feats.pt")
|
| 1280 |
+
torch.save(text_feats, out_text_feats_path)
|
| 1281 |
+
print(f"[_hunyuan_gpu_infer_one_seg] text_feats saved to {out_text_feats_path}")
|
| 1282 |
+
|
| 1283 |
+
return wav, sr, out_text_feats_path
|
| 1284 |
+
|
| 1285 |
+
except Exception as _e:
|
| 1286 |
+
print(f"[_hunyuan_gpu_infer_one_seg] EXCEPTION: {_e}")
|
| 1287 |
+
_tb.print_exc()
|
| 1288 |
+
raise
|
| 1289 |
|
| 1290 |
+
# Keep old name as alias for the xregen path which calls it directly
|
| 1291 |
def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
| 1292 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1293 |
num_samples, silent_video, segments_json, total_dur_s,
|
| 1294 |
clip_start_s=0.0, clip_dur_s=None):
|
| 1295 |
+
"""Wrapper used by xregen β single-segment call via _hunyuan_gpu_infer_one_seg."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1296 |
import traceback as _tb
|
| 1297 |
+
print(f"[_hunyuan_gpu_infer] START video={video_file!r} model_size={model_size!r} "
|
| 1298 |
+
f"num_steps={num_steps!r} clip_start={clip_start_s} clip_dur={clip_dur_s}")
|
| 1299 |
try:
|
| 1300 |
+
_ensure_syspath("HunyuanVideo-Foley")
|
| 1301 |
+
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
| 1302 |
+
|
| 1303 |
+
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1304 |
+
_sv = silent_video
|
| 1305 |
+
_total = float(total_dur_s)
|
| 1306 |
+
if clip_dur_s is not None:
|
| 1307 |
+
clip_path = _extract_segment_clip(
|
| 1308 |
+
silent_video, float(clip_start_s), float(clip_dur_s),
|
| 1309 |
+
os.path.join(tmp_dir, "hny_xregen_clip.mp4"),
|
| 1310 |
+
)
|
| 1311 |
+
_sv = clip_path
|
| 1312 |
+
_total = float(clip_dur_s)
|
| 1313 |
+
|
| 1314 |
+
segments = json.loads(segments_json)
|
| 1315 |
+
seg_clip_paths = [
|
| 1316 |
+
_extract_segment_clip(_sv, s, e - s,
|
| 1317 |
+
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1318 |
+
for i, (s, e) in enumerate(segments)
|
| 1319 |
+
]
|
| 1320 |
+
|
| 1321 |
+
# One GPU call per segment β each fits in the 60 s ZeroGPU free-tier cap
|
| 1322 |
+
results = []
|
| 1323 |
+
for sample_idx in range(int(num_samples)):
|
| 1324 |
+
seg_wavs = []
|
| 1325 |
+
sr = 48000
|
| 1326 |
+
text_feats_path = ""
|
| 1327 |
+
_t0 = time.perf_counter()
|
| 1328 |
+
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1329 |
+
print(f"[_hunyuan_gpu_infer] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
|
| 1330 |
+
f"{seg_start:.1f}β{seg_end:.1f}s")
|
| 1331 |
+
wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
|
| 1332 |
+
video_file, prompt, negative_prompt, seed_val,
|
| 1333 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1334 |
+
num_samples, _sv, seg_clip_paths[seg_i],
|
| 1335 |
+
seg_clip_paths[0], text_feats_path,
|
| 1336 |
+
clip_start_s, None,
|
| 1337 |
+
)
|
| 1338 |
+
seg_wavs.append(wav)
|
| 1339 |
+
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
| 1340 |
+
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
| 1341 |
+
results.append((seg_wavs, sr, text_feats_path))
|
| 1342 |
+
return results
|
| 1343 |
+
|
| 1344 |
except Exception as _e:
|
| 1345 |
print(f"[_hunyuan_gpu_infer] EXCEPTION: {_e}")
|
| 1346 |
_tb.print_exc()
|
| 1347 |
raise
|
| 1348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1349 |
|
| 1350 |
+
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1351 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 1352 |
+
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
|
| 1353 |
+
One GPU call per segment to stay within ZeroGPU's 60 s free-tier cap."""
|
| 1354 |
num_samples = int(num_samples)
|
| 1355 |
crossfade_s = float(crossfade_s)
|
| 1356 |
+
crossfade_db = float(crossfade_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1357 |
|
| 1358 |
+
# ββ CPU pre-processing (no GPU needed) ββ
|
| 1359 |
+
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 1360 |
+
video_file, HUNYUAN_MAX_DUR, crossfade_s)
|
| 1361 |
+
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) Γ β€15 s")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1363 |
seg_clip_paths = [
|
| 1364 |
_extract_segment_clip(silent_video, s, e - s,
|
| 1365 |
os.path.join(tmp_dir, f"hny_seg_{i}.mp4"))
|
| 1366 |
for i, (s, e) in enumerate(segments)
|
| 1367 |
]
|
| 1368 |
|
| 1369 |
+
# ββ One GPU call per segment ββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1370 |
results = []
|
| 1371 |
for sample_idx in range(num_samples):
|
| 1372 |
seg_wavs = []
|
| 1373 |
sr = 48000
|
| 1374 |
+
text_feats_path = ""
|
| 1375 |
+
_t0 = time.perf_counter()
|
| 1376 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 1377 |
+
print(f"[HunyuanFoley] sample {sample_idx+1} seg {seg_i+1}/{len(segments)} "
|
| 1378 |
+
f"{seg_start:.1f}β{seg_end:.1f}s")
|
| 1379 |
+
wav, sr, text_feats_path = _hunyuan_gpu_infer_one_seg(
|
| 1380 |
+
video_file, prompt, negative_prompt, seed_val,
|
| 1381 |
+
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db,
|
| 1382 |
+
num_samples, silent_video, seg_clip_paths[seg_i],
|
| 1383 |
+
seg_clip_paths[0], text_feats_path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1384 |
)
|
|
|
|
| 1385 |
seg_wavs.append(wav)
|
| 1386 |
+
_log_inference_timing("HunyuanFoley", time.perf_counter() - _t0,
|
|
|
|
| 1387 |
len(segments), int(num_steps), HUNYUAN_SECS_PER_STEP)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1388 |
results.append((seg_wavs, sr, text_feats_path))
|
| 1389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1390 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1391 |
def _hunyuan_extras(sample_idx, result, td):
|
| 1392 |
+
_, _sr, tfp = result
|
| 1393 |
+
return {"text_feats_path": tfp}
|
|
|
|
|
|
|
| 1394 |
|
| 1395 |
outputs = _post_process_samples(
|
| 1396 |
results, model="hunyuan", tmp_dir=tmp_dir,
|