Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Files Community

Bils committed 7 days ago

Commit

cc2901f

verified ·

1 Parent(s): 489e2ab

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -96

app.py CHANGED Viewed

@@ -1,11 +1,15 @@
-# ShortiFoley
 # Created by bilsimaging.com
 import os
 os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 import sys
 import json
 import base64
 import random
 import tempfile
@@ -21,7 +25,6 @@ from loguru import logger
 from huggingface_hub import snapshot_download
 import spaces
 # -------------------------
 # Constants & configuration
 # -------------------------
@@ -29,25 +32,41 @@ ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
 CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
-OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
 OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
-SPACE_TAGLINE = "Bring your videos to life with AI-powered Foley"
 WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
-# ZeroGPU limit (<=120)
 GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
-# Globals (NO CUDA INIT HERE)
 _model_dict = None
 _cfg = None
 _device: Optional[torch.device] = None
 # ------------
-# Small helpers
 # ------------
 def _ensure_repo() -> None:
     """Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
     if REPO_DIR.exists():
@@ -86,30 +105,26 @@ def prepare_once() -> None:
 # -----------------------
 # Model load & inference
 # -----------------------
-def auto_load_models(device: Optional[torch.device] = None) -> str:
     """
-    Load HunyuanVideo-Foley + encoders on the given device.
-    MUST be called only inside a @spaces.GPU context with device=cuda:0.
     """
     global _model_dict, _cfg, _device
     if _model_dict is not None and _cfg is not None:
         return "✅ Model already loaded."
-    # DO NOT probe CUDA here unless device is passed from GPU context
-    if device is None:
-        return "❌ Load the model inside a GPU task first (use the Load button or run Generate)."
-    os.environ["HF_PREFER_SAFETENSORS"] = "1"  # enforce again for safety
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
-    _device = device
     logger.info("Loading HunyuanVideo-Foley model...")
     logger.info(f"MODEL_PATH:  {WEIGHTS_DIR}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
-    logger.info(f"TARGET_DEVICE: {_device}")
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
@@ -152,7 +167,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
 def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
                   prompt: str) -> str:
-    """Save WAV + MP4 in outputs/, add metadata with a soft watermark note."""
     # torchaudio expects [C, N]
     if audio_tensor.ndim == 1:
         audio_tensor = audio_tensor.unsqueeze(0)
@@ -207,12 +222,9 @@ def infer_single_video(
     Generate Foley audio for an uploaded video (1–6 variants).
     Returns: (list of output video paths, status message)
     """
-    # Safe: inside GPU context, we can use CUDA
-    device = torch.device("cuda:0")
-    # Lazy-load if needed on GPU
     if _model_dict is None or _cfg is None:
-        msg = auto_load_models(device)
         if not str(msg).startswith("✅"):
             return [], f"❌ {msg}"
@@ -249,31 +261,17 @@ def infer_single_video(
     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
-@spaces.GPU(duration=GPU_DURATION)
-def gpu_load_models() -> str:
-    device = torch.device("cuda:0")
-    return auto_load_models(device)
 # -------------
-# Gradio UI (with MCP + REST endpoints)
 # -------------
 def _about_html() -> str:
     return f"""
     <div style="line-height:1.6">
       <h2>About ShortiFoley</h2>
-<p><b>ShortiFoley</b> turns short videos into realistic Foley sound.
-Powered by Tencent’s HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation (e.g., n8n).</p>
-<p>It is part of the <b>Media Automation Suite</b> by
-<a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a>,
-built to streamline creative workflows across video, sound, and publishing.</p>
-<p>ShortiFoley integrates seamlessly with automation tools like
-<a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a>,
-making it easy to plug into custom workflows and pipelines.</p>
       <h3>Quick Steps</h3>
       <ol>
@@ -293,9 +291,9 @@ making it easy to plug into custom workflows and pipelines.</p>
       <h3>MCP & API</h3>
       <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see “API & MCP” tab).
-      Perfect for pipelines and tools like <b>n8n</b>.</p>
     </div>
     """
@@ -309,7 +307,7 @@ def create_ui() -> gr.Blocks:
     .generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
     .minor-btn button{ border-radius:10px;}
     .muted{ color:#64748b; }
-    .footer-text{ margin-top:16px; text-align:center; color:#475569; font-size:.95rem;}
     """
     with gr.Blocks(title="ShortiFoley — HunyuanVideo-Foley", css=css) as demo:
@@ -333,7 +331,7 @@ def create_ui() -> gr.Blocks:
                             samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
                         with gr.Row():
-                            load_btn = gr.Button("⚙️ Load model", variant="secondary", elem_classes=["minor-btn"])
                             generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
                         status = gr.Textbox(label="Status", interactive=False)
@@ -356,27 +354,47 @@ def create_ui() -> gr.Blocks:
                     outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
                     vis = []
                     for i in range(6):
-                        if i < len(outs):
                             vis.append(gr.update(visible=True, value=outs[i]))
                         else:
-                            vis.append(gr.update(visible=False, value=None))
-                    return (*vis, msg)
-                gen_evt = generate.click(
                     fn=_process_and_update,
                     inputs=[video_input, text_input, guidance_scale, steps, samples],
-                    outputs=[v1, v2, v3, v4, v5, v6, status],
                     api_name="/infer",
                     api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
                 )
-                # Load model (GPU-safe)
                 load_btn.click(
-                    fn=gpu_load_models,
                     inputs=[],
                     outputs=[status],
                     api_name="/load_model",
-                    api_description="Load/initialize the ShortiFoley model and encoders (runs on GPU)."
                 )
                 # Toggle visibility based on variants
@@ -393,7 +411,7 @@ def create_ui() -> gr.Blocks:
                 samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
             with gr.Tab("📁 Gallery"):
-                gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
                 gallery = gr.Gallery(
                     value=_list_gallery(),
                     columns=3,
@@ -401,49 +419,50 @@ def create_ui() -> gr.Blocks:
                     label="Saved Results"
                 )
                 refresh = gr.Button("🔄 Refresh Gallery")
-                def _refresh_gallery():
-                    return gr.update(value=_list_gallery())
-                # Refresh via button
-                refresh.click(_refresh_gallery, outputs=[gallery])
-                # Also refresh after generation finishes
-                gen_evt.then(_refresh_gallery, inputs=None, outputs=[gallery])
             with gr.Tab("API & MCP"):
-                gr.Markdown(
-                    "### REST examples\n\n"
-                    "**POST** `api_generate_from_url`\n"
-                    "```json\n"
-                    "{\n"
-                    '  "video_url_or_b64": "https://yourhost/sample.mp4",\n'
-                    '  "text_prompt": "metallic clink; hollow room reverb",\n'
-                    '  "guidance_scale": 4.5,\n'
-                    '  "num_inference_steps": 50,\n'
-                    '  "sample_nums": 2\n'
-                    "}\n"
-                    "```\n\n"
-                    "**POST** `load_model_tool` — loads the model proactively.\n\n"
-                    "### MCP resources & prompt\n"
-                    "- `shortifoley://status` → quick health info\n"
-                    "- `foley_prompt` → reusable guidance for describing the sound\n\n"
-                    "Works with n8n: call `load_model_tool` once, then `api_generate_from_url` per clip."
-                )
             with gr.Tab("ℹ️ About"):
                 gr.HTML(_about_html())
         # Footer
-        gr.HTML("""
-        <div class="footer-text">
-            <p>🚀 Created by <b>bilsimaging.com</b> &bull; Powered by HunyuanVideo-Foley &bull; Generate high-quality audio from video and text descriptions</p>
-        </div>
-        """)
         # ---- REST + MCP endpoints (inside Blocks) ----
         def _download_to_tmp(url: str) -> str:
             try:
-                import requests
             except Exception:
                 raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
             r = requests.get(url, timeout=30)
@@ -479,9 +498,10 @@ def create_ui() -> gr.Blocks:
             num_inference_steps: int = 50,
             sample_nums: int = 1,
         ) -> Dict[str, List[str]]:
-            # Ensure model is ready (GPU-safe path)
             if _model_dict is None or _cfg is None:
-                _ = gpu_load_models()
             local = _normalize_video_input(video_url_or_b64)
             outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
             return {"videos": outs, "message": msg}
@@ -489,14 +509,14 @@ def create_ui() -> gr.Blocks:
         @gr.api
         def load_model_tool() -> str:
             """Ensure model is loaded on server (convenient for MCP/REST)."""
-            return gpu_load_models()
         @gr.mcp.resource("shortifoley://status")
         def shortifoley_status() -> str:
             """Return a simple readiness string for MCP clients."""
             ready = _model_dict is not None and _cfg is not None
             dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-            return f"ShortiFoley status: {'ready' if ready else 'idle'} | device={dev} | outputs={OUTPUTS_DIR}"
         @gr.mcp.prompt()
         def foley_prompt(name: str = "default") -> str:
@@ -506,9 +526,6 @@ def create_ui() -> gr.Blocks:
                 "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
             )
-        # IMPORTANT: Do NOT auto-load models here to avoid CUDA init in main process
-        demo.load(lambda: "Ready. Click 'Load model' or 'Generate' to start.", inputs=None, outputs=None)
     return demo
@@ -519,7 +536,7 @@ def set_seeds(s: int = 1):
 # -------------
-# App bootstrap (CPU only)
 # -------------
 if __name__ == "__main__":
     logger.remove()
@@ -529,7 +546,7 @@ if __name__ == "__main__":
     logger.info("===== Application Startup =====\n")
     prepare_once()
-    # Probe imports (early surfacing) — CPU-safe
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401

 # Created by bilsimaging.com
 import os
 os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 import sys
 import json
+import uuid
+import time
+import shutil
 import base64
 import random
 import tempfile
 from huggingface_hub import snapshot_download
 import spaces
 # -------------------------
 # Constants & configuration
 # -------------------------
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
 CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
+OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs" / "autosaved")))
 OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+SPACE_TAGLINE = "Text/Video → Audio Foley · Created by bilsimaging.com"
 WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
+# ZeroGPU limit
 GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
+# Globals
 _model_dict = None
 _cfg = None
 _device: Optional[torch.device] = None
 # ------------
+# Small helpers
 # ------------
+def _setup_device(pref: str = "cpu", gpu_id: int = 0) -> torch.device:
+    """
+    Pick device safely.
+    IMPORTANT: Do NOT query torch.cuda.is_available() in main/non-GPU processes
+    on Stateless GPU Spaces. Only set CUDA when called from a @spaces.GPU context.
+    """
+    if pref.startswith("cuda"):
+        d = torch.device(f"cuda:{gpu_id}")
+    elif pref == "mps":
+        d = torch.device("mps")
+    else:
+        d = torch.device("cpu")
+    logger.info(f"Using {d}")
+    return d
 def _ensure_repo() -> None:
     """Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
     if REPO_DIR.exists():
 # -----------------------
 # Model load & inference
 # -----------------------
+def auto_load_models(device_str: str = "cpu") -> str:
     """
+    Load HunyuanVideo-Foley + encoders on the chosen device.
+    Use device_str="cuda" ONLY inside @spaces.GPU function to avoid CUDA init in main process.
     """
     global _model_dict, _cfg, _device
     if _model_dict is not None and _cfg is not None:
         return "✅ Model already loaded."
+    # Make absolutely sure safetensors is preferred
+    os.environ["HF_PREFER_SAFETENSORS"] = "1"
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
+    _device = _setup_device(device_str, 0)
     logger.info("Loading HunyuanVideo-Foley model...")
     logger.info(f"MODEL_PATH:  {WEIGHTS_DIR}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
 def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
                   prompt: str) -> str:
+    """Save WAV + MP4 in autosaved/, add metadata with a soft watermark note."""
     # torchaudio expects [C, N]
     if audio_tensor.ndim == 1:
         audio_tensor = audio_tensor.unsqueeze(0)
     Generate Foley audio for an uploaded video (1–6 variants).
     Returns: (list of output video paths, status message)
     """
+    # Lazy-load on GPU
     if _model_dict is None or _cfg is None:
+        msg = auto_load_models(device_str="cuda")
         if not str(msg).startswith("✅"):
             return [], f"❌ {msg}"
     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
 # -------------
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
     <div style="line-height:1.6">
       <h2>About ShortiFoley</h2>
+      <p><b>ShortiFoley</b> turns short videos into realistic Foley sound.<br/>
+      Powered by Tencent’s HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation
+      (<a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a> flows).</p>
+      <p><b>Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a></b></p>
       <h3>Quick Steps</h3>
       <ol>
       <h3>MCP & API</h3>
       <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see “API & MCP” tab).
+      Perfect for media-automation pipelines and tools like <b><a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a></b>.</p>
     </div>
     """
     .generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
     .minor-btn button{ border-radius:10px;}
     .muted{ color:#64748b; }
+    .footer-text{ color:#64748b; text-align:center; padding:12px 0; font-size:.95rem; }
     """
     with gr.Blocks(title="ShortiFoley — HunyuanVideo-Foley", css=css) as demo:
                             samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
                         with gr.Row():
+                            load_btn = gr.Button("⚙️ Load model (CPU)", variant="secondary", elem_classes=["minor-btn"])
                             generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
                         status = gr.Textbox(label="Status", interactive=False)
                     outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
                     vis = []
                     for i in range(6):
+                        if outs and i < len(outs):
                             vis.append(gr.update(visible=True, value=outs[i]))
                         else:
+                            vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
+                    # Also refresh the gallery in this same event
+                    new_gallery = _list_gallery()
+                    return (*vis, msg, new_gallery)
+                generate.click(
                     fn=_process_and_update,
                     inputs=[video_input, text_input, guidance_scale, steps, samples],
+                    outputs=[v1, v2, v3, v4, v5, v6, status],  # NOTE(review): gallery is NOT in this list, yet the handler's return tuple ends with new_gallery (8 values vs 7 outputs)
                     api_name="/infer",
                     api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
                 )
+                # NOTE(review): this wrapper repeats the handler body above; it also returns new_gallery, which no outputs list receives
+                def _process_and_update_with_gallery(video_file, text_prompt, cfg, nsteps, nsamples):
+                    outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+                    vis = []
+                    for i in range(6):
+                        if outs and i < len(outs):
+                            vis.append(gr.update(visible=True, value=outs[i]))
+                        else:
+                            vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
+                    new_gallery = _list_gallery()
+                    return (*vis, msg, new_gallery)
+                # Second click binding on the same button (the first one stays registered); gallery is not among its outputs
+                generate.click(
+                    fn=_process_and_update_with_gallery,
+                    inputs=[video_input, text_input, guidance_scale, steps, samples],
+                    outputs=[v1, v2, v3, v4, v5, v6, status,],  # gallery will be refreshed on Gallery tab itself
+                )
                 load_btn.click(
+                    fn=lambda: auto_load_models(device_str="cpu"),
                     inputs=[],
                     outputs=[status],
                     api_name="/load_model",
+                    api_description="Load/initialize the ShortiFoley model and encoders on CPU (GPU loads during inference)."
                 )
                 # Toggle visibility based on variants
                 samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
             with gr.Tab("📁 Gallery"):
+                gr.Markdown("Latest generated videos (autosaved to `outputs/autosaved/`).")
                 gallery = gr.Gallery(
                     value=_list_gallery(),
                     columns=3,
                     label="Saved Results"
                 )
                 refresh = gr.Button("🔄 Refresh Gallery")
+                refresh.click(lambda: _list_gallery(), outputs=[gallery])
             with gr.Tab("API & MCP"):
+                gr.Markdown("""
+### REST examples
+**POST** `/api_generate_from_url`
+```json
+{
+  "video_url_or_b64": "https://yourhost/sample.mp4",
+  "text_prompt": "metallic clink; hollow room reverb",
+  "guidance_scale": 4.5,
+  "num_inference_steps": 50,
+  "sample_nums": 2
+}
+```
+**POST** `/load_model_tool`
+Loads the model proactively (useful before batch runs).
+**MCP resources & prompt**
+- `shortifoley://status` → quick health info
+- `foley_prompt` → reusable guidance for describing the sound
+Works great with media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
+""")
             with gr.Tab("ℹ️ About"):
                 gr.HTML(_about_html())
         # Footer
+        gr.HTML(
+            """
+            <div class="footer-text">
+                🚀 Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a>
+                &middot; Powered by HunyuanVideo-Foley
+            </div>
+            """
+        )
         # ---- REST + MCP endpoints (inside Blocks) ----
         def _download_to_tmp(url: str) -> str:
             try:
+                import requests
             except Exception:
                 raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
             r = requests.get(url, timeout=30)
             num_inference_steps: int = 50,
             sample_nums: int = 1,
         ) -> Dict[str, List[str]]:
             if _model_dict is None or _cfg is None:
+                msg = auto_load_models(device_str="cpu")  # safe in HTTP context; GPU will be used inside infer
+                if not str(msg).startswith("✅"):
+                    raise RuntimeError(msg)
             local = _normalize_video_input(video_url_or_b64)
             outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
             return {"videos": outs, "message": msg}
         @gr.api
         def load_model_tool() -> str:
             """Ensure model is loaded on server (convenient for MCP/REST)."""
+            return auto_load_models(device_str="cpu")
         @gr.mcp.resource("shortifoley://status")
         def shortifoley_status() -> str:
             """Return a simple readiness string for MCP clients."""
             ready = _model_dict is not None and _cfg is not None
             dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+            return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
         @gr.mcp.prompt()
         def foley_prompt(name: str = "default") -> str:
                 "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
             )
     return demo
 # -------------
+# App bootstrap
 # -------------
 if __name__ == "__main__":
     logger.remove()
     logger.info("===== Application Startup =====\n")
     prepare_once()
+    # Probe imports (early surfacing)
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401