Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Files Community

Bils committed on Aug 31

Commit

e7621f8

verified ·

1 Parent(s): 22d96d3

Update app.py

Browse files

Files changed (1) hide show

app.py +249 -226

app.py CHANGED Viewed

@@ -1,54 +1,51 @@
 import os
-import io
 import sys
 import json
 import shutil
 import random
 import tempfile
-import base64
-from datetime import datetime
 from typing import List, Optional, Tuple, Dict
-import gradio as gr
 import numpy as np
 import torch
 import torchaudio
 from loguru import logger
 from huggingface_hub import snapshot_download
-# --- Tencent repo imports (pulled at startup) ---
-# These are available after we git clone the repo in prepare_once()
-# Do not move these imports above the clone step in __main__.
-# from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
-# from hunyuanvideo_foley.utils.feature_utils import feature_process
-# from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-# HF Spaces GPU decorator
-import spaces
 # -------------------------
 # Constants & configuration
 # -------------------------
 SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
 SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
-GALLERY_DIR = os.environ.get("OUTPUTS_DIR", "outputs")
-WEIGHTS_DIR = os.environ.get("HIFI_FOLEY_MODEL_PATH", "/home/user/app/weights")
-REPO_DIR = "/home/user/app/HunyuanVideo-Foley"
-CONFIG_PATH = os.environ.get(
-    "HIFI_FOLEY_CONFIG",
-    f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml"
-)
-# keep <=120s for ZeroGPU
-GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
-os.makedirs(GALLERY_DIR, exist_ok=True)
-os.makedirs(WEIGHTS_DIR, exist_ok=True)
-# Globals populated after model load
 _model_dict = None
 _cfg = None
 _device: Optional[torch.device] = None
 # ------------
 # Small helpers
 # ------------
@@ -67,61 +64,32 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
     return d
-def _save_video_result(video_file: str, audio_tensor: torch.Tensor, sr: int, idx: int) -> str:
-    """Save audio to wav, merge with original video, and save mp4 into gallery."""
-    from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-    temp_dir = tempfile.mkdtemp()
-    audio_path = os.path.join(temp_dir, f"gen_{idx}.wav")
-    # torchaudio expects shape [channels, samples]
-    if audio_tensor.ndim == 1:
-        audio_tensor = audio_tensor.unsqueeze(0)
-    torchaudio.save(audio_path, audio_tensor.cpu(), sr)
-    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
-    out_name = f"shortifoley_{timestamp}_{idx}.mp4"
-    out_path = os.path.join(GALLERY_DIR, out_name)
-    merge_audio_video(audio_path, video_file, out_path)
-    return out_path
-def _list_gallery(limit: int = 100) -> List[str]:
-    files = []
-    for fn in sorted(os.listdir(GALLERY_DIR), reverse=True):
-        if fn.lower().endswith((".mp4", ".webm", ".mov", ".mkv")):
-            files.append(os.path.join(GALLERY_DIR, fn))
-        if len(files) >= limit:
-            break
-    return files
 def _ensure_repo() -> None:
-    """Shallow clone the Tencent repo with LFS smudge disabled to avoid quota issues."""
-    if os.path.exists(REPO_DIR) and os.path.isdir(REPO_DIR):
         return
     cmd = (
-        f"GIT_LFS_SKIP_SMUDGE=1 git -c filter.lfs.smudge= "
-        f"-c filter.lfs.required=false clone --depth 1 "
-        f"https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
     )
     logger.info(f">> {cmd}")
     os.system(cmd)
 def _download_weights_if_needed() -> None:
-    """Pull big .pth files (and small assets) from HF model repo snapshot."""
-    # The official weights are hosted on the HF model page, so we snapshot into WEIGHTS_DIR
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
-        local_dir=WEIGHTS_DIR,
         resume_download=True,
         allow_patterns=[
             "hunyuanvideo_foley.pth",
             "synchformer_state_dict.pth",
             "vae_128d_48k.pth",
             "assets/*",
-            "config.yaml",  # not used directly here, but harmless
         ],
     )
@@ -137,15 +105,13 @@ def prepare_once() -> None:
 def auto_load_models() -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
-    Uses safetensors where possible; falls back to HF/torch internal loaders.
     """
     global _model_dict, _cfg, _device
     if _model_dict is not None and _cfg is not None:
         return "Model already loaded."
-    # Late imports (repo becomes available after clone).
-    sys.path.append(REPO_DIR)
     from hunyuanvideo_foley.utils.model_utils import load_model
     _device = _setup_device("auto", 0)
@@ -154,13 +120,79 @@ def auto_load_models() -> str:
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
     try:
-        _model_dict, _cfg = load_model(WEIGHTS_DIR, CONFIG_PATH, _device)
         return "✅ Model loaded."
     except Exception as e:
         logger.error(e)
         return f"❌ Failed to load model: {e}"
 @spaces.GPU(duration=GPU_DURATION)
 @torch.inference_mode()
 def infer_single_video(
@@ -172,22 +204,15 @@ def infer_single_video(
 ) -> Tuple[List[str], str]:
     """
     Generate Foley audio for an uploaded video (1–6 variants).
-    Args:
-        video_file: Path to a local video file on the Space.
-        text_prompt: Optional text prompt to steer the audio.
-        guidance_scale: CFG scale.
-        num_inference_steps: Denoising steps.
-        sample_nums: Number of audio variants to produce (1–6).
-    Returns:
-        (video_paths, status_message)
     """
     if _model_dict is None or _cfg is None:
-        return [], "❌ Load the model first."
     if not video_file:
         return [], "❌ Please provide a video."
-    sys.path.append(REPO_DIR)
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
@@ -197,40 +222,39 @@ def infer_single_video(
     )
     # generate batch
-    sample_nums = int(max(1, min(6, sample_nums)))
     audio, sr = denoise_process(
         visual_feats,
         text_feats,
         audio_len_s,
         _model_dict,
         _cfg,
-        guidance_scale=guidance_scale,
         num_inference_steps=int(num_inference_steps),
-        batch_size=sample_nums,
     )
     # save results
-    out_videos = []
-    for i in range(sample_nums):
-        out_videos.append(_save_video_result(video_file, audio[i], sr, i + 1))
-    return out_videos, f"✅ Generated {len(out_videos)} result(s). Saved to {GALLERY_DIR}/"
 # ---------------
-# MCP-only API(s)
 # ---------------
 def _download_to_tmp(url: str) -> str:
-    """Download a remote file to a temp path. Lightweight helper for MCP."""
     try:
-        import requests  # optional dependency
     except Exception:
-        raise RuntimeError("The server is missing 'requests'. Add it to requirements.txt to use URL inputs.")
     r = requests.get(url, timeout=30)
     r.raise_for_status()
-    suffix = ".mp4"
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
     tmp.write(r.content)
     tmp.flush()
     tmp.close()
@@ -238,10 +262,9 @@ def _download_to_tmp(url: str) -> str:
 def _maybe_from_base64(data_url_or_b64: str) -> str:
-    """Accept data: URLs or raw base64 for MCP convenience; returns temp file path."""
     b64 = data_url_or_b64
     if data_url_or_b64.startswith("data:"):
-        # data:video/mp4;base64,XXXX
         b64 = data_url_or_b64.split(",", 1)[-1]
     raw = base64.b64decode(b64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
@@ -252,36 +275,16 @@ def _maybe_from_base64(data_url_or_b64: str) -> str:
 def _normalize_video_input(video_url_or_b64: str) -> str:
-    """Return a local filename from url or base64. Raises on error."""
     v = (video_url_or_b64 or "").strip()
     if v.startswith("http://") or v.startswith("https://"):
         return _download_to_tmp(v)
-    # assume base64
     return _maybe_from_base64(v)
-def _api_generate_from_local(
-    local_video_path: str,
-    text_prompt: str = "",
-    guidance_scale: float = 4.5,
-    num_inference_steps: int = 50,
-    sample_nums: int = 1,
-) -> Dict[str, List[str]]:
-    outs, msg = infer_single_video(
-        video_file=local_video_path,
-        text_prompt=text_prompt or "",
-        guidance_scale=float(guidance_scale),
-        num_inference_steps=int(num_inference_steps),
-        sample_nums=int(sample_nums),
-    )
-    return {"videos": outs, "message": msg}
-# Expose a **pure API** endpoint that becomes an MCP tool but does not show a UI.
 with gr.Blocks() as mcp_only_endpoints:
     gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
-    @gr.api  # becomes an MCP tool and a REST API endpoint automatically
     def api_generate_from_url(
         video_url_or_b64: str,
         text_prompt: str = "",
@@ -291,46 +294,76 @@ with gr.Blocks() as mcp_only_endpoints:
     ) -> Dict[str, List[str]]:
         """
         Generate Foley from a remote video URL or base64-encoded video.
-        Args:
-            video_url_or_b64: http(s) URL or data/base64 string of a short video (mp4).
-            text_prompt: Optional audio description (English).
-            guidance_scale: CFG scale (1.0–10.0).
-            num_inference_steps: Denoising steps (10–100).
-            sample_nums: Number of variants to return (1–6).
-        Returns:
-            dict with { "videos": [paths], "message": str }
         """
         if _model_dict is None or _cfg is None:
-            raise RuntimeError("Model not loaded. Call /load_model tool or use the UI once.")
-        local_path = _normalize_video_input(video_url_or_b64)
-        return _api_generate_from_local(local_path, text_prompt, guidance_scale, num_inference_steps, sample_nums)
-    # Tiny status resource & prompt to help MCP clients
     @gr.mcp.resource("shortifoley://status")
     def shortifoley_status() -> str:
         """Return a simple readiness string for MCP clients."""
         ready = _model_dict is not None and _cfg is not None
         dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={GALLERY_DIR}"
     @gr.mcp.prompt()
     def foley_prompt(name: str = "default") -> str:
-        """A reusable prompt template for generating Foley."""
         return (
             "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
             "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
         )
-# -----------------
-# Gradio UI (Blocks)
-# -----------------
 def create_ui() -> gr.Blocks:
     with gr.Blocks(
         title="ShortiFoley — HunyuanVideo-Foley",
         css="""
-        .main-header{ text-align:center; padding:1.5rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
         .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
         .generate-btn button{ font-weight:700; }
         """
@@ -338,91 +371,82 @@ def create_ui() -> gr.Blocks:
         gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
-        with gr.Row():
-            with gr.Column(scale=1, elem_classes=["card"]):
-                gr.Markdown("### 📹 Input")
-                video_input = gr.Video(label="Upload Video", height=300)
-                text_input = gr.Textbox(
-                    label="🎯 Audio Description (optional, English)",
-                    placeholder="e.g., Quick rubber-soled footsteps on tile; echoey hallway."
-                )
                 with gr.Row():
-                    guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
-                    steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
-                    samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
-                generate = gr.Button("🎵 Generate Audio", variant="primary", elem_classes=["generate-btn"])
-            with gr.Column(scale=1, elem_classes=["card"]):
-                gr.Markdown("### 🎥 Result(s)")
-                v1 = gr.Video(label="Sample 1", height=260, visible=True)
-                v2 = gr.Video(label="Sample 2", height=160, visible=False)
-                v3 = gr.Video(label="Sample 3", height=160, visible=False)
-                v4 = gr.Video(label="Sample 4", height=160, visible=False)
-                v5 = gr.Video(label="Sample 5", height=160, visible=False)
-                v6 = gr.Video(label="Sample 6", height=160, visible=False)
-                status = gr.Textbox(label="Status", interactive=False)
-        with gr.Tab("📁 Gallery"):
-            gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
-            gallery = gr.Gallery(
-                value=_list_gallery(),
-                columns=3,
-                preview=True,
-                label="Saved Results"
-            )
-            refresh = gr.Button("🔄 Refresh Gallery")
-        # Event handlers
-        def _process(
-            video_file, text_prompt, cfg, nsteps, nsamples
-        ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], str]:
-            outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-            # set visibilities based on how many were generated
-            vis = [gr.update(visible=i < len(outs), value=(outs[i] if i < len(outs) else None)) for i in range(6)]
-            # update gallery (prepend newest)
-            return (
-                *[v.value if isinstance(v, gr.Video) else None for v in []],  # filler not used; kept for clarity
-            )
-        def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
-            outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-            updates = []
-            # six video slots
-            for i in range(6):
-                if i < len(outs):
-                    updates.append(gr.update(visible=True, value=outs[i]))
-                else:
-                    updates.append(gr.update(visible=False, value=None))
-            # status
-            updates.append(msg)
-            # refresh gallery implicitly
-            gallery_items = _list_gallery()
-            return (*updates, gr.update(value=gallery_items))
-        generate.click(
-            fn=_process_and_update,
-            inputs=[video_input, text_input, guidance_scale, steps, samples],
-            outputs=[v1, v2, v3, v4, v5, v6, status, gallery],
-            api_name="/infer",
-            api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
-        )
-        # Visibility toggling from samples slider
-        def _toggle_vis(n):
-            n = int(n)
-            return [
-                gr.update(visible=True),
-                gr.update(visible=n >= 2),
-                gr.update(visible=n >= 3),
-                gr.update(visible=n >= 4),
-                gr.update(visible=n >= 5),
-                gr.update(visible=n >= 6),
-            ]
-        samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
-        refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
     return demo
@@ -437,20 +461,22 @@ def set_seeds(s: int = 1):
 # App bootstrap
 # -------------
 if __name__ == "__main__":
-    # clean logger -> print to stdout
     logger.remove()
     logger.add(lambda m: print(m, end=""), level="INFO")
     set_seeds(1)
     logger.info("===== Application Startup =====\n")
     prepare_once()
-    # Late import after repo present
-    sys.path.append(REPO_DIR)
-    from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
-    from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
-    from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
     msg = auto_load_models()
     if not msg.startswith("✅"):
@@ -459,16 +485,13 @@ if __name__ == "__main__":
         logger.info(msg)
     ui = create_ui()
-    # Mount MCP-only endpoints alongside the UI (optional but handy)
     ui.blocks.append(mcp_only_endpoints)
-    # IMPORTANT: enable MCP server (tools/resources/prompts). This is all you need.
-    # See: https://www.gradio.app/guides/building-mcp-server-with-gradio
     ui.launch(
         server_name="0.0.0.0",
         share=False,
         show_error=True,
-        mcp_server=True,        # <— MCP enabled
-        # ssr_mode=True (default in 5.x)
     )

+# app.py — ShortiFoley (Video -> Foley)
+# Created by bilsimaging.com
 import os
 import sys
+import io
 import json
+import uuid
+import time
 import shutil
+import base64
 import random
 import tempfile
+import datetime
+from pathlib import Path
 from typing import List, Optional, Tuple, Dict
 import numpy as np
 import torch
 import torchaudio
+import gradio as gr
 from loguru import logger
 from huggingface_hub import snapshot_download
+import spaces  # HF Spaces ZeroGPU & MCP integration
 # -------------------------
 # Constants & configuration
 # -------------------------
+# Directory containing this file; the default paths below are built from it.
+ROOT = Path(__file__).parent.resolve()
+# Destination of the git clone performed by _ensure_repo(); also appended to sys.path before repo imports.
+REPO_DIR = ROOT / "HunyuanVideo-Foley"
+# Each of the next three paths can be overridden through the named environment variable.
+WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
+CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
+OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
+# Import-time side effect: the outputs directory is created as soon as the module loads.
+OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
 SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
+# Written into each JSON sidecar by _save_outputs() and interpolated into the About tab HTML.
+WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
+# Keep GPU <= 120s for ZeroGPU (default 110)
+# Passed as `duration` to the @spaces.GPU decorator on infer_single_video.
+GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
+# Globals
+# _model_dict and _cfg are assigned by auto_load_models(); None means "not loaded yet".
 _model_dict = None
 _cfg = None
+# Device chosen by _setup_device() inside auto_load_models().
 _device: Optional[torch.device] = None
 # ------------
 # Small helpers
 # ------------
     return d
 def _ensure_repo() -> None:
+    """Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
+    if REPO_DIR.exists():
         return
     cmd = (
+        "GIT_LFS_SKIP_SMUDGE=1 "
+        "git -c filter.lfs.smudge= -c filter.lfs.required=false "
+        f"clone --depth 1 https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
     )
     logger.info(f">> {cmd}")
     os.system(cmd)
 def _download_weights_if_needed() -> None:
+    """Snapshot only needed files from HF weights/model hub."""
+    WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
+        local_dir=str(WEIGHTS_DIR),
         resume_download=True,
         allow_patterns=[
             "hunyuanvideo_foley.pth",
             "synchformer_state_dict.pth",
             "vae_128d_48k.pth",
             "assets/*",
+            "config.yaml",  # harmless
         ],
     )
 def auto_load_models() -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
     """
     global _model_dict, _cfg, _device
     if _model_dict is not None and _cfg is not None:
         return "Model already loaded."
+    sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
     _device = _setup_device("auto", 0)
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
     try:
+        _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
         return "✅ Model loaded."
     except Exception as e:
         logger.error(e)
         return f"❌ Failed to load model: {e}"
+def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
+    """Use project's helper (preferred) with a fallback to ffmpeg via subprocess."""
+    sys.path.append(str(REPO_DIR))
+    try:
+        from hunyuanvideo_foley.utils.media_utils import merge_audio_video
+        merge_audio_video(audio_path, video_path, out_path)
+    except Exception as e:
+        # Fallback: plain ffmpeg merge (assumes same duration or lets ffmpeg handle)
+        logger.warning(f"merge_audio_video failed, falling back to ffmpeg: {e}")
+        import subprocess
+        cmd = [
+            "ffmpeg", "-y",
+            "-i", video_path,
+            "-i", audio_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-shortest",
+            out_path
+        ]
+        subprocess.run(cmd, check=True)
+def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
+                  prompt: str) -> str:
+    """Save WAV + MP4 in outputs/, add metadata and a small watermark note (metadata only)."""
+    # torchaudio expects [C, N]
+    if audio_tensor.ndim == 1:
+        audio_tensor = audio_tensor.unsqueeze(0)
+    tmpdir = Path(tempfile.mkdtemp())
+    wav_path = tmpdir / f"gen_{idx}.wav"
+    torchaudio.save(str(wav_path), audio_tensor.cpu(), sr)
+    ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
+    base = f"shortifoley_{ts}_{idx}"
+    out_mp4 = OUTPUTS_DIR / f"{base}.mp4"
+    _merge_audio_video(str(wav_path), video_src, str(out_mp4))
+    # Save JSON sidecar
+    meta = {
+        "id": base,
+        "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
+        "source_video": Path(video_src).name,
+        "output_video": Path(out_mp4).name,
+        "prompt": prompt or "",
+        "watermark": WATERMARK_NOTE,
+        "tool": "ShortiFoley (HunyuanVideo-Foley)"
+    }
+    (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
+    return str(out_mp4)
+def _list_gallery(limit: int = 100) -> List[str]:
+    vids = []
+    for p in sorted(OUTPUTS_DIR.glob("*.mp4"), key=lambda x: x.stat().st_mtime, reverse=True):
+        vids.append(str(p))
+        if len(vids) >= limit:
+            break
+    return vids
+# ================
+# Inference kernel
+# ================
 @spaces.GPU(duration=GPU_DURATION)
 @torch.inference_mode()
 def infer_single_video(
 ) -> Tuple[List[str], str]:
     """
     Generate Foley audio for an uploaded video (1–6 variants).
+    Returns: (list of output video paths, status message)
     """
     if _model_dict is None or _cfg is None:
+        return [], "❌ Load the model first (open the app once)."
     if not video_file:
         return [], "❌ Please provide a video."
+    sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
     )
     # generate batch
+    n = int(max(1, min(6, sample_nums)))
     audio, sr = denoise_process(
         visual_feats,
         text_feats,
         audio_len_s,
         _model_dict,
         _cfg,
+        guidance_scale=float(guidance_scale),
         num_inference_steps=int(num_inference_steps),
+        batch_size=n,
     )
     # save results
+    outs = []
+    for i in range(n):
+        outs.append(_save_outputs(video_file, audio[i], sr, i + 1, text_prompt or ""))
+    return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
 # ---------------
+# MCP-only APIs
 # ---------------
 def _download_to_tmp(url: str) -> str:
+    """Download a remote file to temp."""
     try:
+        import requests
     except Exception:
+        raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
     r = requests.get(url, timeout=30)
     r.raise_for_status()
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
     tmp.write(r.content)
     tmp.flush()
     tmp.close()
 def _maybe_from_base64(data_url_or_b64: str) -> str:
+    """Accept data: URLs or raw base64; returns temp file path."""
     b64 = data_url_or_b64
     if data_url_or_b64.startswith("data:"):
         b64 = data_url_or_b64.split(",", 1)[-1]
     raw = base64.b64decode(b64)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
 def _normalize_video_input(video_url_or_b64: str) -> str:
     v = (video_url_or_b64 or "").strip()
     if v.startswith("http://") or v.startswith("https://"):
         return _download_to_tmp(v)
     return _maybe_from_base64(v)
 with gr.Blocks() as mcp_only_endpoints:
     gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
+    @gr.api
     def api_generate_from_url(
         video_url_or_b64: str,
         text_prompt: str = "",
     ) -> Dict[str, List[str]]:
         """
         Generate Foley from a remote video URL or base64-encoded video.
+        Returns: {"videos": [paths], "message": str}
         """
         if _model_dict is None or _cfg is None:
+            raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
+        local = _normalize_video_input(video_url_or_b64)
+        outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+        return {"videos": outs, "message": msg}
+    @gr.api
+    def load_model_tool() -> str:
+        """Ensure model is loaded on server (MCP convenience)."""
+        # Thin wrapper: returns the status string from auto_load_models(), which
+        # short-circuits with "Model already loaded." once the globals are set.
+        return auto_load_models()
     @gr.mcp.resource("shortifoley://status")
     def shortifoley_status() -> str:
         """Return a simple readiness string for MCP clients."""
         ready = _model_dict is not None and _cfg is not None
         dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
     @gr.mcp.prompt()
     def foley_prompt(name: str = "default") -> str:
+        """Reusable guidance for describing sound ambience."""
+        # `name` is accepted but never read: the same two-line template is
+        # returned for every value.
         return (
             "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
             "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
         )
+# -------------
+# Gradio UI
+# -------------
+def _about_html() -> str:
+    """Return the HTML fragment rendered in the About tab.
+
+    The markup is static apart from WATERMARK_NOTE, which is interpolated
+    into the "Watermark" paragraph.
+    """
+    return f"""
+    <div style="line-height:1.6">
+      <h2>About ShortiFoley</h2>
+      <p><b>ShortiFoley</b> automatically generates realistic Foley soundtracks for short videos using
+      Tencent’s HunyuanVideo-Foley with CLAP & SigLIP2 encoders. It includes autosave and an MCP server so
+      you can call it from agents or workflows (e.g., n8n).</p>
+      <p><b>Created by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a></b></p>
+      <h3>How to use</h3>
+      <ol>
+        <li>Upload a video (ideally &lt; 120 seconds).</li>
+        <li>Optionally enter a text description of the sound (English).</li>
+        <li>Adjust CFG scale, steps, and number of variants.</li>
+        <li>Click <b>Generate</b>. Results appear on the right and are stored in the Gallery.</li>
+      </ol>
+      <h3>Tips</h3>
+      <ul>
+        <li>Trim clips to the key action (5–30s) for faster, crisper results.</li>
+        <li>Include material cues (“wood”, “metal”, “concrete”), action cues (“splash”, “glass shatter”), and ambience (“roomy”, “echoey”).</li>
+        <li>Generate multiple variants and pick the most natural.</li>
+      </ul>
+      <h3>MCP / Automation</h3>
+      <p>This app runs as an <b>MCP server</b>. Open the footer “View API → MCP” to copy a ready config. You can also use the REST endpoints listed there. Perfect for n8n integrations.</p>
+      <h3>Watermark</h3>
+      <p>Each output’s metadata includes: <i>{WATERMARK_NOTE}</i>. If you want a <b>visible video overlay</b>, I can add an ffmpeg overlay step on request.</p>
+    </div>
+    """
 def create_ui() -> gr.Blocks:
     with gr.Blocks(
         title="ShortiFoley — HunyuanVideo-Foley",
         css="""
+        .main-header{ text-align:center; padding:1.2rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
         .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
         .generate-btn button{ font-weight:700; }
         """
         gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
+        with gr.Tabs():
+            with gr.Tab("Run"):
                 with gr.Row():
+                    with gr.Column(scale=1, elem_classes=["card"]):
+                        gr.Markdown("### 📹 Input")
+                        video_input = gr.Video(label="Upload Video", height=300)
+                        text_input = gr.Textbox(
+                            label="🎯 Audio Description (optional, English)",
+                            placeholder="e.g., Rubber soles on wet tile, distant chatter.",
+                            lines=3
+                        )
+                        with gr.Row():
+                            guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
+                            steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
+                            samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
+                        generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
+                    with gr.Column(scale=1, elem_classes=["card"]):
+                        gr.Markdown("### 🎥 Result(s)")
+                        v1 = gr.Video(label="Sample 1", height=260, visible=True)
+                        v2 = gr.Video(label="Sample 2", height=160, visible=False)
+                        v3 = gr.Video(label="Sample 3", height=160, visible=False)
+                        v4 = gr.Video(label="Sample 4", height=160, visible=False)
+                        v5 = gr.Video(label="Sample 5", height=160, visible=False)
+                        v6 = gr.Video(label="Sample 6", height=160, visible=False)
+                        status = gr.Textbox(label="Status", interactive=False)
+                # Generate handler
+                def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
+                    outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+                    vis_updates = []
+                    for i in range(6):
+                        if i < len(outs):
+                            vis_updates.append(gr.update(visible=True, value=outs[i]))
+                        else:
+                            vis_updates.append(gr.update(visible=False, value=None))
+                    gal_items = _list_gallery()
+                    return (*vis_updates, msg, gr.update(value=gal_items))
+                generate.click(
+                    fn=_process_and_update,
+                    inputs=[video_input, text_input, guidance_scale, steps, samples],
+                    outputs=[v1, v2, v3, v4, v5, v6, status, ],
+                    api_name="/infer",
+                    api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
+                )
+                # Toggle visibility when # of samples changes
+                def _toggle_vis(n):
+                    n = int(n)
+                    return [
+                        gr.update(visible=True),
+                        gr.update(visible=n >= 2),
+                        gr.update(visible=n >= 3),
+                        gr.update(visible=n >= 4),
+                        gr.update(visible=n >= 5),
+                        gr.update(visible=n >= 6),
+                    ]
+                samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
+            with gr.Tab("📁 Gallery"):
+                gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
+                gallery = gr.Gallery(
+                    value=_list_gallery(),
+                    columns=3,
+                    preview=True,
+                    label="Saved Results"
+                )
+                refresh = gr.Button("🔄 Refresh Gallery")
+                refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
+            with gr.Tab("ℹ️ About"):
+                gr.HTML(_about_html())
+        # Also expose gallery update after generate
+        generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
     return demo
 # App bootstrap
 # -------------
 if __name__ == "__main__":
     logger.remove()
     logger.add(lambda m: print(m, end=""), level="INFO")
     set_seeds(1)
     logger.info("===== Application Startup =====\n")
     prepare_once()
+    # Ensure import paths after repo is present
+    sys.path.append(str(REPO_DIR))
+    try:
+        # Probe key modules early (better error surfacing)
+        from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
+        from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
+        from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
+    except Exception as e:
+        logger.warning(f"Repo imports not ready yet: {e}")
     msg = auto_load_models()
     if not msg.startswith("✅"):
         logger.info(msg)
     ui = create_ui()
+    # Mount MCP-only endpoints alongside the UI
     ui.blocks.append(mcp_only_endpoints)
+    # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
         server_name="0.0.0.0",
         share=False,
         show_error=True,
+        mcp_server=True,   # MCP on
     )