Hug0endob commited on
Commit
d125cdc
·
verified ·
1 Parent(s): d94b3e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -20
app.py CHANGED
@@ -1,26 +1,25 @@
1
  import os
2
  import io
3
- import sys
4
  import time
 
5
  import requests
6
  from PIL import Image, ImageSequence
7
  import gradio as gr
8
 
9
- # Try to import llama-cpp-python
10
  try:
11
  from llama_cpp import Llama
12
  except Exception as e:
13
- raise RuntimeError("llama-cpp-python import failed; ensure requirements installed and wheel built: " + str(e))
14
 
15
- MODEL_PATH = os.path.join("model", "model.gguf") # start.sh places GGUF here
16
  if not os.path.exists(MODEL_PATH):
17
- raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Set correct GGUF in start.sh and redeploy.")
18
 
19
- # Helper: load first frame and convert to JPEG bytes
20
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* over HTTP and return the raw response body as bytes.

    Raises requests.HTTPError for non-2xx status codes and
    requests.Timeout if the request exceeds *timeout* seconds.
    """
    resp = requests.get(url, stream=True, timeout=timeout)
    try:
        resp.raise_for_status()
        return resp.content
    finally:
        # Equivalent to the context-manager form: always release the connection.
        resp.close()
24
 
25
  def load_first_frame_from_bytes(raw: bytes):
26
  img = Image.open(io.BytesIO(raw))
@@ -30,15 +29,12 @@ def load_first_frame_from_bytes(raw: bytes):
30
  img = img.convert("RGB")
31
  return img
32
 
33
- # Minimal image caption prompt template — adjust for your model's expected prompt
34
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the prompt string for a llama.cpp multimodal GGUF.

    Many llama.cpp-based multimodal models accept an inline image token of
    the form "<img>{path}</img>" followed by a chat-style turn; this returns
    that pattern with a trailing "Assistant:" cue for generation.
    """
    parts = [f"<img>{image_path}</img>", f"User: {user_prompt}", "Assistant:"]
    return "\n".join(parts)
38
 
39
- # Start model (llama-cpp-python will mmap model and run inference)
40
- # Use low-memory opts: n_ctx small, use_mlock=0, n_gpu_layers=0
41
- print("Loading model (this may take a minute)...", file=sys.stderr)
42
  llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
43
 
44
  def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
@@ -53,7 +49,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
53
  except Exception as e:
54
  return f"Image processing error: {e}"
55
 
56
- # Save a temporary JPEG locally so the gguf image token loader can access it
57
  tmp_dir = "/tmp/joycap"
58
  os.makedirs(tmp_dir, exist_ok=True)
59
  ts = int(time.time() * 1000)
@@ -65,7 +60,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
65
 
66
  prompt_full = make_prompt_for_image(tmp_path, prompt)
67
  try:
68
- # llama-cpp-python generate call
69
  resp = llm.create(
70
  prompt=prompt_full,
71
  max_tokens=256,
@@ -86,12 +80,12 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
86
  iface = gr.Interface(
87
  fn=generate_caption_from_url,
88
  inputs=[
89
- gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
90
  gr.Textbox(label="Prompt (optional)", value="Describe the image."),
91
  ],
92
  outputs=gr.Textbox(label="Generated caption"),
93
- title="JoyCaption - local GGUF (Q4)",
94
- description="Runs a quantized GGUF model locally via llama.cpp (no external APIs). Ensure the GGUF in start.sh is a multimodal model that supports <img> tags.",
95
  )
96
 
97
  if __name__ == "__main__":
 
1
  import os
2
  import io
 
3
  import time
4
+ import sys
5
  import requests
6
  from PIL import Image, ImageSequence
7
  import gradio as gr
8
 
9
+ # llama-cpp-python import
10
  try:
11
  from llama_cpp import Llama
12
  except Exception as e:
13
+ raise RuntimeError("llama-cpp-python import failed: " + str(e))
14
 
15
+ MODEL_PATH = os.path.join("model", "llama-joycaption-q4_k_m.gguf")
16
  if not os.path.exists(MODEL_PATH):
17
+ raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Ensure start.sh downloaded the GGUF.")
18
 
 
19
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Download *url* and return the full response payload as bytes.

    Raises requests.HTTPError on non-2xx responses; the connection is
    released when the context manager exits.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        payload = response.content
    return payload
23
 
24
  def load_first_frame_from_bytes(raw: bytes):
25
  img = Image.open(io.BytesIO(raw))
 
29
  img = img.convert("RGB")
30
  return img
31
 
 
32
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Return the <img>-tag prompt expected by JoyCaption-style GGUFs."""
    # JoyCaption-style multimodal GGUFs accept <img>{path}</img> image tokens.
    image_token = "<img>" + image_path + "</img>"
    return image_token + "\nUser: " + user_prompt + "\nAssistant:"
35
 
36
+ # Initialize model (low-resource options)
37
+ print("Loading GGUF model (this can take 30–120s)...", file=sys.stderr)
 
38
  llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
39
 
40
  def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
 
49
  except Exception as e:
50
  return f"Image processing error: {e}"
51
 
 
52
  tmp_dir = "/tmp/joycap"
53
  os.makedirs(tmp_dir, exist_ok=True)
54
  ts = int(time.time() * 1000)
 
60
 
61
  prompt_full = make_prompt_for_image(tmp_path, prompt)
62
  try:
 
63
  resp = llm.create(
64
  prompt=prompt_full,
65
  max_tokens=256,
 
80
  iface = gr.Interface(
81
  fn=generate_caption_from_url,
82
  inputs=[
83
+ gr.Textbox(label="Image URL", placeholder="https://example.com/photo.jpg"),
84
  gr.Textbox(label="Prompt (optional)", value="Describe the image."),
85
  ],
86
  outputs=gr.Textbox(label="Generated caption"),
87
+ title="JoyCaption GGUF (Q4_K_M)",
88
+ description="Runs a quantized JoyCaption GGUF locally via llama.cpp (no external API).",
89
  )
90
 
91
  if __name__ == "__main__":