Spaces:

Hug0endob
/

Joycaption-basic

Build error

App Files Files Community

Hug0endob committed on Dec 14, 2025

Commit

851e8b5

verified ·

1 Parent(s): b6a2d55

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -25

app.py CHANGED Viewed

@@ -1,57 +1,139 @@
 import torch
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 import gradio as gr
-from PIL import Image
-import requests
 # -------------------------------------------------
-# Model identifier (replace if you fork or use a different checkpoint)
 # -------------------------------------------------
-MODEL_NAME = "fpgaminer/joycaption-llama3.1-8b"   # 8‑B checkpoint fits comfortably on CPU
-# -------------------------------------------------
-# Load processor and model (CPU only)
-# -------------------------------------------------
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-# `device_map="cpu"` forces everything onto the CPU
 llava_model = LlavaForConditionalGeneration.from_pretrained(
     MODEL_NAME,
     device_map="cpu",
-    torch_dtype=torch.bfloat16,   # native dtype for this model
 )
 llava_model.eval()
 # -------------------------------------------------
-# Inference function used by Gradio
 # -------------------------------------------------
-def generate_caption(image: Image.Image, prompt: str = "Describe the image.") -> str:
-    # Prepare inputs for the model
-    inputs = processor(images=image, text=prompt, return_tensors="pt")
     inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
-    # Generate up to 64 new tokens (adjust if you want longer captions)
     with torch.no_grad():
-        output_ids = llava_model.generate(**inputs, max_new_tokens=64)
-    # Decode to plain text
-    caption = processor.decode(output_ids[0], skip_special_tokens=True)
     return caption
 # -------------------------------------------------
 # Gradio UI
 # -------------------------------------------------
 iface = gr.Interface(
-    fn=generate_caption,
     inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(label="Prompt (optional)", value="Describe the image.")
     ],
     outputs=gr.Textbox(label="Generated caption"),
-    title="JoyCaption (CPU‑only) Demo",
-    description="Upload an image and let the JoyCaption model generate a caption. Runs entirely on the free CPU tier.",
-    allow_flagging="never"
 )
 if __name__ == "__main__":

+import os
 import torch
+import requests
+from io import BytesIO
+from PIL import Image, ImageSequence
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 import gradio as gr
 # -------------------------------------------------
+# Model configuration (CPU‑only)
 # -------------------------------------------------
+MODEL_NAME = "fpgaminer/joycaption-llama3.1-8b"  # checkpoint id shared by the processor and model loads below
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 llava_model = LlavaForConditionalGeneration.from_pretrained(
     MODEL_NAME,
     device_map="cpu",
+    torch_dtype=torch.bfloat16,  # weights are loaded as bfloat16
 )
 llava_model.eval()
 # -------------------------------------------------
+# Helper: download a file from a URL
 # -------------------------------------------------
+def download_bytes(url: str) -> bytes:
+    resp = requests.get(url, stream=True, timeout=30)
+    resp.raise_for_status()
+    return resp.content
+# -------------------------------------------------
+# Helper: convert MP4 → GIF using ezgif.com (public API)
+# -------------------------------------------------
+def mp4_to_gif(mp4_bytes: bytes) -> bytes:
+    """
+    Sends the MP4 bytes to ezgif.com and returns the resulting GIF bytes.
+    The API is undocumented but works via a simple multipart POST.
+    """
+    files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
+    # ezgif.com endpoint for MP4 → GIF conversion
+    resp = requests.post(
+        "https://s.ezgif.com/video-to-gif",
+        files=files,
+        data={"file": "video.mp4"},
+        timeout=60,
+    )
+    resp.raise_for_status()
+    # The response HTML contains a link to the generated GIF.
+    # We extract the first <img src="..."> that ends with .gif
+    import re
+    match = re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text)
+    if not match:
+        raise RuntimeError("Failed to extract GIF URL from ezgif response")
+    gif_url = match.group(1)
+    # ezgif serves the GIF from a relative path; make it absolute
+    if gif_url.startswith("//"):
+        gif_url = "https:" + gif_url
+    elif gif_url.startswith("/"):
+        gif_url = "https://s.ezgif.com" + gif_url
+    gif_resp = requests.get(gif_url, timeout=30)
+    gif_resp.raise_for_status()
+    return gif_resp.content
+# -------------------------------------------------
+# Main inference function
+# -------------------------------------------------
+def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
+    """
+    1. Download the resource.
+    2. If it is an MP4 → convert to GIF.
+    3. Load the first frame of the image/GIF.
+    4. Run JoyCaption and return the caption.
+    """
+    # -----------------------------------------------------------------
+    # 1️⃣ Download raw bytes
+    # -----------------------------------------------------------------
+    raw = download_bytes(url)
+    # -----------------------------------------------------------------
+    # 2️⃣ Determine type & possibly convert MP4 → GIF
+    # -----------------------------------------------------------------
+    lower_url = url.lower()
+    if lower_url.endswith(".mp4"):
+        # Convert video to GIF
+        raw = mp4_to_gif(raw)
+        # After conversion we treat it as a GIF
+        lower_url = ".gif"
+    # -----------------------------------------------------------------
+    # 3️⃣ Load image (first frame for GIFs)
+    # -----------------------------------------------------------------
+    img = Image.open(BytesIO(raw))
+    # If the file is a multi‑frame GIF, pick the first frame
+    if getattr(img, "is_animated", False):
+        img = next(ImageSequence.Iterator(img))
+    # Ensure RGB (JoyCaption expects 3‑channel images)
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    # -----------------------------------------------------------------
+    # 4️⃣ Run the model
+    # -----------------------------------------------------------------
+    inputs = processor(images=img, text=prompt, return_tensors="pt")
     inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
     with torch.no_grad():
+        out_ids = llava_model.generate(**inputs, max_new_tokens=64)
+    caption = processor.decode(out_ids[0], skip_special_tokens=True)
     return caption
 # -------------------------------------------------
 # Gradio UI
 # -------------------------------------------------
 iface = gr.Interface(
+    fn=generate_caption_from_url,  # receives the two textbox values below as (url, prompt)
     inputs=[
+        gr.Textbox(
+            label="Image / GIF / MP4 URL",
+            placeholder="https://example.com/photo.jpg or https://example.com/clip.mp4",
+        ),
+        gr.Textbox(label="Prompt (optional)", value="Describe the image."),  # pre-filled with the function's default prompt
     ],
     outputs=gr.Textbox(label="Generated caption"),
+    title="JoyCaption – URL input (supports GIF & MP4)",
+    description=(
+        "Enter a direct URL to an image, an animated GIF, or an MP4 video. "
+        "MP4 files are automatically converted to GIF via ezgif.com, "
+        "and the first frame of the GIF is captioned."
+    ),
+    allow_flagging="never",  # NOTE(review): newer Gradio releases use `flagging_mode` instead; confirm against the installed version
 )
 if __name__ == "__main__":