Bils committed on
Commit e7621f8 · verified · 1 Parent(s): 22d96d3

Update app.py

Files changed (1): app.py +249 -226
app.py CHANGED
@@ -1,54 +1,51 @@
  import os
- import io
  import sys
  import json
  import shutil
  import random
  import tempfile
- import base64
- from datetime import datetime
  from typing import List, Optional, Tuple, Dict

- import gradio as gr
  import numpy as np
  import torch
  import torchaudio
  from loguru import logger
  from huggingface_hub import snapshot_download
-
- # --- Tencent repo imports (pulled at startup) ---
- # These are available after we git clone the repo in prepare_once()
- # Do not move these imports above the clone step in __main__.
- # from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
- # from hunyuanvideo_foley.utils.feature_utils import feature_process
- # from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-
- # HF Spaces GPU decorator
- import spaces

  # -------------------------
  # Constants & configuration
  # -------------------------
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
  SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
- GALLERY_DIR = os.environ.get("OUTPUTS_DIR", "outputs")
- WEIGHTS_DIR = os.environ.get("HIFI_FOLEY_MODEL_PATH", "/home/user/app/weights")
- REPO_DIR = "/home/user/app/HunyuanVideo-Foley"
- CONFIG_PATH = os.environ.get(
-     "HIFI_FOLEY_CONFIG",
-     f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml"
- )
- # keep <=120s for ZeroGPU
- GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))

- os.makedirs(GALLERY_DIR, exist_ok=True)
- os.makedirs(WEIGHTS_DIR, exist_ok=True)

- # Globals populated after model load
  _model_dict = None
  _cfg = None
  _device: Optional[torch.device] = None

  # ------------
  # Small helpers
  # ------------
@@ -67,61 +64,32 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
      return d


- def _save_video_result(video_file: str, audio_tensor: torch.Tensor, sr: int, idx: int) -> str:
-     """Save audio to wav, merge with original video, and save mp4 into gallery."""
-     from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-
-     temp_dir = tempfile.mkdtemp()
-     audio_path = os.path.join(temp_dir, f"gen_{idx}.wav")
-
-     # torchaudio expects shape [channels, samples]
-     if audio_tensor.ndim == 1:
-         audio_tensor = audio_tensor.unsqueeze(0)
-     torchaudio.save(audio_path, audio_tensor.cpu(), sr)
-
-     timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
-     out_name = f"shortifoley_{timestamp}_{idx}.mp4"
-     out_path = os.path.join(GALLERY_DIR, out_name)
-     merge_audio_video(audio_path, video_file, out_path)
-     return out_path
-
-
- def _list_gallery(limit: int = 100) -> List[str]:
-     files = []
-     for fn in sorted(os.listdir(GALLERY_DIR), reverse=True):
-         if fn.lower().endswith((".mp4", ".webm", ".mov", ".mkv")):
-             files.append(os.path.join(GALLERY_DIR, fn))
-         if len(files) >= limit:
-             break
-     return files
-
-
  def _ensure_repo() -> None:
-     """Shallow clone the Tencent repo with LFS smudge disabled to avoid quota issues."""
-     if os.path.exists(REPO_DIR) and os.path.isdir(REPO_DIR):
          return
      cmd = (
-         f"GIT_LFS_SKIP_SMUDGE=1 git -c filter.lfs.smudge= "
-         f"-c filter.lfs.required=false clone --depth 1 "
-         f"https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
      )
      logger.info(f">> {cmd}")
      os.system(cmd)


  def _download_weights_if_needed() -> None:
-     """Pull big .pth files (and small assets) from HF model repo snapshot."""
-     # The official weights are hosted on the HF model page, so we snapshot into WEIGHTS_DIR
      snapshot_download(
          repo_id="tencent/HunyuanVideo-Foley",
-         local_dir=WEIGHTS_DIR,
          resume_download=True,
          allow_patterns=[
              "hunyuanvideo_foley.pth",
              "synchformer_state_dict.pth",
              "vae_128d_48k.pth",
              "assets/*",
-             "config.yaml",  # not used directly here, but harmless
          ],
      )
@@ -137,15 +105,13 @@ def prepare_once() -> None:
  def auto_load_models() -> str:
      """
      Load HunyuanVideo-Foley + encoders on the chosen device.
-     Uses safetensors where possible; falls back to HF/torch internal loaders.
      """
      global _model_dict, _cfg, _device

      if _model_dict is not None and _cfg is not None:
          return "Model already loaded."

-     # Late imports (repo becomes available after clone).
-     sys.path.append(REPO_DIR)
      from hunyuanvideo_foley.utils.model_utils import load_model

      _device = _setup_device("auto", 0)
@@ -154,13 +120,79 @@ def auto_load_models() -> str:
      logger.info(f"CONFIG_PATH: {CONFIG_PATH}")

      try:
-         _model_dict, _cfg = load_model(WEIGHTS_DIR, CONFIG_PATH, _device)
          return "✅ Model loaded."
      except Exception as e:
          logger.error(e)
          return f"❌ Failed to load model: {e}"


  @spaces.GPU(duration=GPU_DURATION)
  @torch.inference_mode()
  def infer_single_video(
@@ -172,22 +204,15 @@ def infer_single_video(
  ) -> Tuple[List[str], str]:
      """
      Generate Foley audio for an uploaded video (1–6 variants).
-     Args:
-         video_file: Path to a local video file on the Space.
-         text_prompt: Optional text prompt to steer the audio.
-         guidance_scale: CFG scale.
-         num_inference_steps: Denoising steps.
-         sample_nums: Number of audio variants to produce (1–6).
-     Returns:
-         (video_paths, status_message)
      """
      if _model_dict is None or _cfg is None:
-         return [], "❌ Load the model first."

      if not video_file:
          return [], "❌ Please provide a video."

-     sys.path.append(REPO_DIR)
      from hunyuanvideo_foley.utils.feature_utils import feature_process
      from hunyuanvideo_foley.utils.model_utils import denoise_process
@@ -197,40 +222,39 @@
      )

      # generate batch
-     sample_nums = int(max(1, min(6, sample_nums)))
      audio, sr = denoise_process(
          visual_feats,
          text_feats,
          audio_len_s,
          _model_dict,
          _cfg,
-         guidance_scale=guidance_scale,
          num_inference_steps=int(num_inference_steps),
-         batch_size=sample_nums,
      )

      # save results
-     out_videos = []
-     for i in range(sample_nums):
-         out_videos.append(_save_video_result(video_file, audio[i], sr, i + 1))

-     return out_videos, f"✅ Generated {len(out_videos)} result(s). Saved to {GALLERY_DIR}/"


  # ---------------
- # MCP-only API(s)
  # ---------------
  def _download_to_tmp(url: str) -> str:
-     """Download a remote file to a temp path. Lightweight helper for MCP."""
      try:
-         import requests  # optional dependency
      except Exception:
-         raise RuntimeError("The server is missing 'requests'. Add it to requirements.txt to use URL inputs.")

      r = requests.get(url, timeout=30)
      r.raise_for_status()
-     suffix = ".mp4"
-     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
      tmp.write(r.content)
      tmp.flush()
      tmp.close()
@@ -238,10 +262,9 @@


  def _maybe_from_base64(data_url_or_b64: str) -> str:
-     """Accept data: URLs or raw base64 for MCP convenience; returns temp file path."""
      b64 = data_url_or_b64
      if data_url_or_b64.startswith("data:"):
-         # data:video/mp4;base64,XXXX
          b64 = data_url_or_b64.split(",", 1)[-1]
      raw = base64.b64decode(b64)
      tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
@@ -252,36 +275,16 @@


  def _normalize_video_input(video_url_or_b64: str) -> str:
-     """Return a local filename from url or base64. Raises on error."""
      v = (video_url_or_b64 or "").strip()
      if v.startswith("http://") or v.startswith("https://"):
          return _download_to_tmp(v)
-     # assume base64
      return _maybe_from_base64(v)


- def _api_generate_from_local(
-     local_video_path: str,
-     text_prompt: str = "",
-     guidance_scale: float = 4.5,
-     num_inference_steps: int = 50,
-     sample_nums: int = 1,
- ) -> Dict[str, List[str]]:
-     outs, msg = infer_single_video(
-         video_file=local_video_path,
-         text_prompt=text_prompt or "",
-         guidance_scale=float(guidance_scale),
-         num_inference_steps=int(num_inference_steps),
-         sample_nums=int(sample_nums),
-     )
-     return {"videos": outs, "message": msg}


- # Expose a **pure API** endpoint that becomes an MCP tool but does not show a UI.
  with gr.Blocks() as mcp_only_endpoints:
      gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)

-     @gr.api  # becomes an MCP tool and a REST API endpoint automatically
      def api_generate_from_url(
          video_url_or_b64: str,
          text_prompt: str = "",
@@ -291,46 +294,76 @@ with gr.Blocks() as mcp_only_endpoints:
      ) -> Dict[str, List[str]]:
          """
          Generate Foley from a remote video URL or base64-encoded video.
-         Args:
-             video_url_or_b64: http(s) URL or data/base64 string of a short video (mp4).
-             text_prompt: Optional audio description (English).
-             guidance_scale: CFG scale (1.0–10.0).
-             num_inference_steps: Denoising steps (10–100).
-             sample_nums: Number of variants to return (1–6).
-         Returns:
-             dict with { "videos": [paths], "message": str }
          """
          if _model_dict is None or _cfg is None:
-             raise RuntimeError("Model not loaded. Call /load_model tool or use the UI once.")

-         local_path = _normalize_video_input(video_url_or_b64)
-         return _api_generate_from_local(local_path, text_prompt, guidance_scale, num_inference_steps, sample_nums)

-     # Tiny status resource & prompt to help MCP clients
      @gr.mcp.resource("shortifoley://status")
      def shortifoley_status() -> str:
          """Return a simple readiness string for MCP clients."""
          ready = _model_dict is not None and _cfg is not None
          dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-         return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={GALLERY_DIR}"

      @gr.mcp.prompt()
      def foley_prompt(name: str = "default") -> str:
-         """A reusable prompt template for generating Foley."""
          return (
              "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
              "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
          )


- # -----------------
- # Gradio UI (Blocks)
- # -----------------
  def create_ui() -> gr.Blocks:
      with gr.Blocks(
          title="ShortiFoley — HunyuanVideo-Foley",
          css="""
-         .main-header{ text-align:center; padding:1.5rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
          .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
          .generate-btn button{ font-weight:700; }
          """
@@ -338,91 +371,82 @@ def create_ui() -> gr.Blocks:

      gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")

-     with gr.Row():
-         with gr.Column(scale=1, elem_classes=["card"]):
-             gr.Markdown("### 📹 Input")
-             video_input = gr.Video(label="Upload Video", height=300)
-             text_input = gr.Textbox(
-                 label="🎯 Audio Description (optional, English)",
-                 placeholder="e.g., Quick rubber-soled footsteps on tile; echoey hallway."
-             )
              with gr.Row():
-                 guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
-                 steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
-                 samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
-
-             generate = gr.Button("🎵 Generate Audio", variant="primary", elem_classes=["generate-btn"])
-
-         with gr.Column(scale=1, elem_classes=["card"]):
-             gr.Markdown("### 🎥 Result(s)")
-             v1 = gr.Video(label="Sample 1", height=260, visible=True)
-             v2 = gr.Video(label="Sample 2", height=160, visible=False)
-             v3 = gr.Video(label="Sample 3", height=160, visible=False)
-             v4 = gr.Video(label="Sample 4", height=160, visible=False)
-             v5 = gr.Video(label="Sample 5", height=160, visible=False)
-             v6 = gr.Video(label="Sample 6", height=160, visible=False)
-             status = gr.Textbox(label="Status", interactive=False)
-
-     with gr.Tab("📁 Gallery"):
-         gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
-         gallery = gr.Gallery(
-             value=_list_gallery(),
-             columns=3,
-             preview=True,
-             label="Saved Results"
-         )
-         refresh = gr.Button("🔄 Refresh Gallery")
-
-     # Event handlers
-     def _process(
-         video_file, text_prompt, cfg, nsteps, nsamples
-     ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], str]:
-         outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-         # set visibilities based on how many were generated
-         vis = [gr.update(visible=i < len(outs), value=(outs[i] if i < len(outs) else None)) for i in range(6)]
-         # update gallery (prepend newest)
-         return (
-             *[v.value if isinstance(v, gr.Video) else None for v in []],  # filler not used; kept for clarity
-         )
-
-     def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
-         outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-         updates = []
-         # six video slots
-         for i in range(6):
-             if i < len(outs):
-                 updates.append(gr.update(visible=True, value=outs[i]))
-             else:
-                 updates.append(gr.update(visible=False, value=None))
-         # status
-         updates.append(msg)
-         # refresh gallery implicitly
-         gallery_items = _list_gallery()
-         return (*updates, gr.update(value=gallery_items))
-
-     generate.click(
-         fn=_process_and_update,
-         inputs=[video_input, text_input, guidance_scale, steps, samples],
-         outputs=[v1, v2, v3, v4, v5, v6, status, gallery],
-         api_name="/infer",
-         api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
-     )

-     # Visibility toggling from samples slider
-     def _toggle_vis(n):
-         n = int(n)
-         return [
-             gr.update(visible=True),
-             gr.update(visible=n >= 2),
-             gr.update(visible=n >= 3),
-             gr.update(visible=n >= 4),
-             gr.update(visible=n >= 5),
-             gr.update(visible=n >= 6),
-         ]

-     samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])

-     refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])

      return demo
@@ -437,20 +461,22 @@ def set_seeds(s: int = 1):
  # App bootstrap
  # -------------
  if __name__ == "__main__":
-     # clean logger -> print to stdout
      logger.remove()
      logger.add(lambda m: print(m, end=""), level="INFO")
-
      set_seeds(1)

      logger.info("===== Application Startup =====\n")
      prepare_once()

-     # Late import after repo present
-     sys.path.append(REPO_DIR)
-     from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
-     from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
-     from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401

      msg = auto_load_models()
      if not msg.startswith("✅"):
@@ -459,16 +485,13 @@ if __name__ == "__main__":
      logger.info(msg)

      ui = create_ui()
-
-     # Mount MCP-only endpoints alongside the UI (optional but handy)
      ui.blocks.append(mcp_only_endpoints)

-     # IMPORTANT: enable MCP server (tools/resources/prompts). This is all you need.
-     # See: https://www.gradio.app/guides/building-mcp-server-with-gradio
      ui.launch(
          server_name="0.0.0.0",
          share=False,
          show_error=True,
-         mcp_server=True,  # <— MCP enabled
-         # ssr_mode=True (default in 5.x)
      )
 
+ # app.py — ShortiFoley (Video -> Foley)
+ # Created by bilsimaging.com
+
  import os
  import sys
+ import io
  import json
+ import uuid
+ import time
  import shutil
+ import base64
  import random
  import tempfile
+ import datetime
+ from pathlib import Path
  from typing import List, Optional, Tuple, Dict

  import numpy as np
  import torch
  import torchaudio
+ import gradio as gr
  from loguru import logger
  from huggingface_hub import snapshot_download
+ import spaces  # HF Spaces ZeroGPU & MCP integration

  # -------------------------
  # Constants & configuration
  # -------------------------
+ ROOT = Path(__file__).parent.resolve()
+ REPO_DIR = ROOT / "HunyuanVideo-Foley"
+ WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
+ CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
+ OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
+ OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
+
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
  SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
+ WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"

+ # Keep GPU <= 120s for ZeroGPU (default 110)
+ GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))

+ # Globals
  _model_dict = None
  _cfg = None
  _device: Optional[torch.device] = None

+
  # ------------
  # Small helpers
  # ------------
 
      return d


  def _ensure_repo() -> None:
+     """Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
+     if REPO_DIR.exists():
          return
      cmd = (
+         "GIT_LFS_SKIP_SMUDGE=1 "
+         "git -c filter.lfs.smudge= -c filter.lfs.required=false "
+         f"clone --depth 1 https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
      )
      logger.info(f">> {cmd}")
      os.system(cmd)


  def _download_weights_if_needed() -> None:
+     """Snapshot only needed files from HF weights/model hub."""
+     WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
      snapshot_download(
          repo_id="tencent/HunyuanVideo-Foley",
+         local_dir=str(WEIGHTS_DIR),
          resume_download=True,
          allow_patterns=[
              "hunyuanvideo_foley.pth",
              "synchformer_state_dict.pth",
              "vae_128d_48k.pth",
              "assets/*",
+             "config.yaml",  # harmless
          ],
      )
  def auto_load_models() -> str:
      """
      Load HunyuanVideo-Foley + encoders on the chosen device.
      """
      global _model_dict, _cfg, _device

      if _model_dict is not None and _cfg is not None:
          return "Model already loaded."

+     sys.path.append(str(REPO_DIR))
      from hunyuanvideo_foley.utils.model_utils import load_model

      _device = _setup_device("auto", 0)

      logger.info(f"CONFIG_PATH: {CONFIG_PATH}")

      try:
+         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
          return "✅ Model loaded."
      except Exception as e:
          logger.error(e)
          return f"❌ Failed to load model: {e}"
+ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
+     """Use the project's helper (preferred) with a fallback to plain ffmpeg via subprocess."""
+     sys.path.append(str(REPO_DIR))
+     try:
+         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
+         merge_audio_video(audio_path, video_path, out_path)
+     except Exception as e:
+         # Fallback: plain ffmpeg merge (copy the video stream, encode audio as AAC,
+         # and stop at the shorter of the two streams)
+         logger.warning(f"merge_audio_video failed, falling back to ffmpeg: {e}")
+         import subprocess
+         cmd = [
+             "ffmpeg", "-y",
+             "-i", video_path,
+             "-i", audio_path,
+             "-c:v", "copy",
+             "-c:a", "aac",
+             "-shortest",
+             out_path,
+         ]
+         subprocess.run(cmd, check=True)
+
+
+ def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
+                   prompt: str) -> str:
+     """Render the WAV to a temp dir, merge it into an MP4 under outputs/, and write a
+     JSON metadata sidecar (the watermark note lives in the metadata only)."""
+     # torchaudio expects [C, N]
+     if audio_tensor.ndim == 1:
+         audio_tensor = audio_tensor.unsqueeze(0)
+
+     tmpdir = Path(tempfile.mkdtemp())
+     wav_path = tmpdir / f"gen_{idx}.wav"
+     torchaudio.save(str(wav_path), audio_tensor.cpu(), sr)
+
+     ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
+     base = f"shortifoley_{ts}_{idx}"
+     out_mp4 = OUTPUTS_DIR / f"{base}.mp4"
+
+     _merge_audio_video(str(wav_path), video_src, str(out_mp4))
+
+     # Save JSON sidecar
+     meta = {
+         "id": base,
+         "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
+         "source_video": Path(video_src).name,
+         "output_video": Path(out_mp4).name,
+         "prompt": prompt or "",
+         "watermark": WATERMARK_NOTE,
+         "tool": "ShortiFoley (HunyuanVideo-Foley)",
+     }
+     (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
+
+     return str(out_mp4)
+
+
+ def _list_gallery(limit: int = 100) -> List[str]:
+     vids = []
+     for p in sorted(OUTPUTS_DIR.glob("*.mp4"), key=lambda x: x.stat().st_mtime, reverse=True):
+         vids.append(str(p))
+         if len(vids) >= limit:
+             break
+     return vids
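
The watermark above lives only in the JSON sidecar. For the visible overlay mentioned in the About tab, a minimal sketch of an extra ffmpeg step might look like the following; the `_overlay_watermark` helper is hypothetical (not part of this commit), `drawtext` re-encodes the video stream, and it needs a font available on the host:

```python
import subprocess

def _overlay_watermark(in_path: str, out_path: str, text: str = "bilsimaging.com") -> None:
    """Burn a small caption into the bottom-left corner (hypothetical extra step)."""
    vf = f"drawtext=text='{text}':x=10:y=h-th-10:fontsize=18:fontcolor=white@0.8"
    subprocess.run(
        ["ffmpeg", "-y", "-i", in_path, "-vf", vf, "-c:a", "copy", out_path],
        check=True,
    )
```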
+
+
+ # ================
+ # Inference kernel
+ # ================
  @spaces.GPU(duration=GPU_DURATION)
  @torch.inference_mode()
  def infer_single_video(

  ) -> Tuple[List[str], str]:
      """
      Generate Foley audio for an uploaded video (1–6 variants).
+     Returns: (list of output video paths, status message)
      """
      if _model_dict is None or _cfg is None:
+         return [], "❌ Load the model first (open the app once)."

      if not video_file:
          return [], "❌ Please provide a video."

+     sys.path.append(str(REPO_DIR))
      from hunyuanvideo_foley.utils.feature_utils import feature_process
      from hunyuanvideo_foley.utils.model_utils import denoise_process

      )

      # generate batch
+     n = int(max(1, min(6, sample_nums)))
      audio, sr = denoise_process(
          visual_feats,
          text_feats,
          audio_len_s,
          _model_dict,
          _cfg,
+         guidance_scale=float(guidance_scale),
          num_inference_steps=int(num_inference_steps),
+         batch_size=n,
      )

      # save results
+     outs = []
+     for i in range(n):
+         outs.append(_save_outputs(video_file, audio[i], sr, i + 1, text_prompt or ""))

+     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
 

  # ---------------
+ # MCP-only APIs
  # ---------------
  def _download_to_tmp(url: str) -> str:
+     """Download a remote file to temp."""
      try:
+         import requests
      except Exception:
+         raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")

      r = requests.get(url, timeout=30)
      r.raise_for_status()
+     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
      tmp.write(r.content)
      tmp.flush()
      tmp.close()


  def _maybe_from_base64(data_url_or_b64: str) -> str:
+     """Accept data: URLs or raw base64; returns temp file path."""
      b64 = data_url_or_b64
      if data_url_or_b64.startswith("data:"):
          b64 = data_url_or_b64.split(",", 1)[-1]
      raw = base64.b64decode(b64)
      tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")


  def _normalize_video_input(video_url_or_b64: str) -> str:
      v = (video_url_or_b64 or "").strip()
      if v.startswith("http://") or v.startswith("https://"):
          return _download_to_tmp(v)
      return _maybe_from_base64(v)

  with gr.Blocks() as mcp_only_endpoints:
      gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)

+     @gr.api
      def api_generate_from_url(
          video_url_or_b64: str,
          text_prompt: str = "",

      ) -> Dict[str, List[str]]:
          """
          Generate Foley from a remote video URL or base64-encoded video.
+         Returns: {"videos": [paths], "message": str}
          """
          if _model_dict is None or _cfg is None:
+             raise RuntimeError("Model not loaded. Open the UI once or call the load_model_tool endpoint.")
+         local = _normalize_video_input(video_url_or_b64)
+         outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+         return {"videos": outs, "message": msg}

+     @gr.api
+     def load_model_tool() -> str:
+         """Ensure the model is loaded on the server (MCP convenience)."""
+         return auto_load_models()

      @gr.mcp.resource("shortifoley://status")
      def shortifoley_status() -> str:
          """Return a simple readiness string for MCP clients."""
          ready = _model_dict is not None and _cfg is not None
          dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+         return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"

      @gr.mcp.prompt()
      def foley_prompt(name: str = "default") -> str:
+         """Reusable guidance for describing sound ambience."""
          return (
              "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
              "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
          )
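
For reference, a client-side call to the endpoint above could look roughly like this; a sketch assuming `gradio_client` is installed, using a hypothetical Space id (`Bils/shortifoley`) — the exact `api_name` is derived by Gradio from the function name and should be confirmed in the footer's "View API" page:

```python
import base64
from gradio_client import Client

def to_data_url(path: str) -> str:
    # Matches the data:video/mp4;base64,... form that _normalize_video_input accepts
    with open(path, "rb") as f:
        return "data:video/mp4;base64," + base64.b64encode(f.read()).decode("ascii")

client = Client("Bils/shortifoley")  # hypothetical Space id
result = client.predict(
    to_data_url("clip.mp4"),              # video_url_or_b64
    "metal door slam, echoey stairwell",  # text_prompt
    4.5,                                  # guidance_scale
    50,                                   # num_inference_steps
    1,                                    # sample_nums
    api_name="/api_generate_from_url",
)
print(result)  # {"videos": [...], "message": "..."}
```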
 

+ # -------------
+ # Gradio UI
+ # -------------
+ def _about_html() -> str:
+     return f"""
+     <div style="line-height:1.6">
+       <h2>About ShortiFoley</h2>
+       <p><b>ShortiFoley</b> automatically generates realistic Foley soundtracks for short videos using
+       Tencent’s HunyuanVideo-Foley with CLAP & SigLIP2 encoders. It includes autosave and an MCP server so
+       you can call it from agents or workflows (e.g., n8n).</p>
+       <p><b>Created by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a></b></p>
+
+       <h3>How to use</h3>
+       <ol>
+         <li>Upload a video (ideally &lt; 120 seconds).</li>
+         <li>Optionally enter a text description of the sound (English).</li>
+         <li>Adjust CFG scale, steps, and number of variants.</li>
+         <li>Click <b>Generate</b>. Results appear on the right and are stored in the Gallery.</li>
+       </ol>
+
+       <h3>Tips</h3>
+       <ul>
+         <li>Trim clips to the key action (5–30s) for faster, crisper results.</li>
+         <li>Include material cues (“wood”, “metal”, “concrete”), action cues (“splash”, “glass shatter”), and ambience (“roomy”, “echoey”).</li>
+         <li>Generate multiple variants and pick the most natural.</li>
+       </ul>
+
+       <h3>MCP / Automation</h3>
+       <p>This app runs as an <b>MCP server</b>. Open the footer “View API → MCP” to copy a ready-made config. You can also use the REST endpoints listed there. Perfect for n8n integrations.</p>
+
+       <h3>Watermark</h3>
+       <p>Each output’s metadata includes: <i>{WATERMARK_NOTE}</i>. A <b>visible video overlay</b> can be added with an extra ffmpeg step on request.</p>
+     </div>
+     """
+
+
  def create_ui() -> gr.Blocks:
      with gr.Blocks(
          title="ShortiFoley — HunyuanVideo-Foley",
          css="""
+         .main-header{ text-align:center; padding:1.2rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
          .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
          .generate-btn button{ font-weight:700; }
          """
 
      gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")

+     with gr.Tabs():
+         with gr.Tab("Run"):
              with gr.Row():
+                 with gr.Column(scale=1, elem_classes=["card"]):
+                     gr.Markdown("### 📹 Input")
+                     video_input = gr.Video(label="Upload Video", height=300)
+                     text_input = gr.Textbox(
+                         label="🎯 Audio Description (optional, English)",
+                         placeholder="e.g., Rubber soles on wet tile, distant chatter.",
+                         lines=3
+                     )
+                     with gr.Row():
+                         guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
+                         steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
+                         samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
+                     generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
+
+                 with gr.Column(scale=1, elem_classes=["card"]):
+                     gr.Markdown("### 🎥 Result(s)")
+                     v1 = gr.Video(label="Sample 1", height=260, visible=True)
+                     v2 = gr.Video(label="Sample 2", height=160, visible=False)
+                     v3 = gr.Video(label="Sample 3", height=160, visible=False)
+                     v4 = gr.Video(label="Sample 4", height=160, visible=False)
+                     v5 = gr.Video(label="Sample 5", height=160, visible=False)
+                     v6 = gr.Video(label="Sample 6", height=160, visible=False)
+                     status = gr.Textbox(label="Status", interactive=False)
+
+             # Generate handler: one update per result slot, then the status string.
+             # (The Gallery tab is refreshed by a separate listener below, so the
+             # return arity matches the outputs list exactly.)
+             def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
+                 outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+                 vis_updates = []
+                 for i in range(6):
+                     if i < len(outs):
+                         vis_updates.append(gr.update(visible=True, value=outs[i]))
+                     else:
+                         vis_updates.append(gr.update(visible=False, value=None))
+                 return (*vis_updates, msg)
+
+             generate.click(
+                 fn=_process_and_update,
+                 inputs=[video_input, text_input, guidance_scale, steps, samples],
+                 outputs=[v1, v2, v3, v4, v5, v6, status],
+                 api_name="/infer",
+                 api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
+             )

+             # Toggle visibility when # of samples changes
+             def _toggle_vis(n):
+                 n = int(n)
+                 return [
+                     gr.update(visible=True),
+                     gr.update(visible=n >= 2),
+                     gr.update(visible=n >= 3),
+                     gr.update(visible=n >= 4),
+                     gr.update(visible=n >= 5),
+                     gr.update(visible=n >= 6),
+                 ]
+             samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
+
+         with gr.Tab("📁 Gallery"):
+             gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
+             gallery = gr.Gallery(
+                 value=_list_gallery(),
+                 columns=3,
+                 preview=True,
+                 label="Saved Results"
+             )
+             refresh = gr.Button("🔄 Refresh Gallery")
+             refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])

+         with gr.Tab("ℹ️ About"):
+             gr.HTML(_about_html())

+     # Also refresh the gallery after each generate (gallery exists by this point)
+     generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])

      return demo
 
 
  # App bootstrap
  # -------------
  if __name__ == "__main__":
      logger.remove()
      logger.add(lambda m: print(m, end=""), level="INFO")
      set_seeds(1)

      logger.info("===== Application Startup =====\n")
      prepare_once()

+     # Ensure import paths after repo is present
+     sys.path.append(str(REPO_DIR))
+     try:
+         # Probe key modules early (better error surfacing)
+         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
+         from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
+         from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
+     except Exception as e:
+         logger.warning(f"Repo imports not ready yet: {e}")

      msg = auto_load_models()
      if not msg.startswith("✅"):

      logger.info(msg)

      ui = create_ui()
+     # Mount MCP-only endpoints alongside the UI
      ui.blocks.append(mcp_only_endpoints)

+     # Enable MCP server so tools/resources/prompts are discoverable
      ui.launch(
          server_name="0.0.0.0",
          share=False,
          show_error=True,
+         mcp_server=True,  # MCP on
      )
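
With `mcp_server=True`, Gradio exposes the tools, resources, and prompts above over MCP. Per Gradio's MCP guide, a client configuration pointing at the Space would look roughly like the sketch below (shown as a Python dict; the Space URL is hypothetical, and the exact endpoint path should be copied from the footer's "View API → MCP" page):

```python
# Hedged sketch of an MCP client configuration for this server.
mcp_config = {
    "mcpServers": {
        "shortifoley": {
            # SSE endpoint Gradio serves when launched with mcp_server=True
            "url": "https://bils-shortifoley.hf.space/gradio_api/mcp/sse"
        }
    }
}
```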