Hug0endob commited on
Commit
d125cdc
·
verified ·
1 Parent(s): d94b3e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -20
app.py CHANGED
@@ -1,26 +1,25 @@
1
  import os
2
  import io
3
- import sys
4
  import time
 
5
  import requests
6
  from PIL import Image, ImageSequence
7
  import gradio as gr
8
 
9
- # Try to import llama-cpp-python
10
  try:
11
  from llama_cpp import Llama
12
  except Exception as e:
13
- raise RuntimeError("llama-cpp-python import failed; ensure requirements installed and wheel built: " + str(e))
14
 
15
- MODEL_PATH = os.path.join("model", "model.gguf") # start.sh places GGUF here
16
  if not os.path.exists(MODEL_PATH):
17
- raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Set correct GGUF in start.sh and redeploy.")
18
 
19
- # Helper: load first frame and convert to JPEG bytes
20
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* over HTTP and return the raw response body as bytes.

    Raises requests.HTTPError for non-2xx status codes and
    requests.Timeout if the request exceeds *timeout* seconds.
    """
    resp = requests.get(url, stream=True, timeout=timeout)
    try:
        resp.raise_for_status()
        return resp.content
    finally:
        # Equivalent to the context-manager form: always release the connection.
        resp.close()
24
 
25
  def load_first_frame_from_bytes(raw: bytes):
26
  img = Image.open(io.BytesIO(raw))
@@ -30,15 +29,12 @@ def load_first_frame_from_bytes(raw: bytes):
30
  img = img.convert("RGB")
31
  return img
32
 
33
- # Minimal image caption prompt template — adjust for your model's expected prompt
34
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the prompt string for a llama.cpp multimodal GGUF.

    Many llama.cpp-based multimodal models accept an inline image token of
    the form "<img>{path}</img>" followed by a chat-style turn; this returns
    that pattern with a trailing "Assistant:" cue for generation.
    """
    parts = [f"<img>{image_path}</img>", f"User: {user_prompt}", "Assistant:"]
    return "\n".join(parts)
38
 
39
- # Start model (llama-cpp-python will mmap model and run inference)
40
- # Use low-memory opts: n_ctx small, use_mlock=0, n_gpu_layers=0
41
- print("Loading model (this may take a minute)...", file=sys.stderr)
42
  llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
43
 
44
  def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
@@ -53,7 +49,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
53
  except Exception as e:
54
  return f"Image processing error: {e}"
55
 
56
- # Save a temporary JPEG locally so the gguf image token loader can access it
57
  tmp_dir = "/tmp/joycap"
58
  os.makedirs(tmp_dir, exist_ok=True)
59
  ts = int(time.time() * 1000)
@@ -65,7 +60,6 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
65
 
66
  prompt_full = make_prompt_for_image(tmp_path, prompt)
67
  try:
68
- # llama-cpp-python generate call
69
  resp = llm.create(
70
  prompt=prompt_full,
71
  max_tokens=256,
@@ -86,12 +80,12 @@ def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
86
  iface = gr.Interface(
87
  fn=generate_caption_from_url,
88
  inputs=[
89
- gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
90
  gr.Textbox(label="Prompt (optional)", value="Describe the image."),
91
  ],
92
  outputs=gr.Textbox(label="Generated caption"),
93
- title="JoyCaption - local GGUF (Q4)",
94
- description="Runs a quantized GGUF model locally via llama.cpp (no external APIs). Ensure the GGUF in start.sh is a multimodal model that supports <img> tags.",
95
  )
96
 
97
  if __name__ == "__main__":
 
1
  import os
2
  import io
 
3
  import time
4
+ import sys
5
  import requests
6
  from PIL import Image, ImageSequence
7
  import gradio as gr
8
 
9
+ # llama-cpp-python import
10
  try:
11
  from llama_cpp import Llama
12
  except Exception as e:
13
+ raise RuntimeError("llama-cpp-python import failed: " + str(e))
14
 
15
+ MODEL_PATH = os.path.join("model", "llama-joycaption-q4_k_m.gguf")
16
  if not os.path.exists(MODEL_PATH):
17
+ raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Ensure start.sh downloaded the GGUF.")
18
 
 
19
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Download *url* and return the full response payload as bytes.

    Raises requests.HTTPError on non-2xx responses; the connection is
    released when the context manager exits.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        payload = response.content
    return payload
23
 
24
  def load_first_frame_from_bytes(raw: bytes):
25
  img = Image.open(io.BytesIO(raw))
 
29
  img = img.convert("RGB")
30
  return img
31
 
 
32
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Return the <img>-tag prompt expected by JoyCaption-style GGUFs."""
    # JoyCaption-style multimodal GGUFs accept <img>{path}</img> image tokens.
    image_token = "<img>" + image_path + "</img>"
    return image_token + "\nUser: " + user_prompt + "\nAssistant:"
35
 
36
+ # Initialize model (low-resource options)
37
+ print("Loading GGUF model (this can take 30–120s)...", file=sys.stderr)
 
38
  llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)
39
 
40
  def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
 
49
  except Exception as e:
50
  return f"Image processing error: {e}"
51
 
 
52
  tmp_dir = "/tmp/joycap"
53
  os.makedirs(tmp_dir, exist_ok=True)
54
  ts = int(time.time() * 1000)
 
60
 
61
  prompt_full = make_prompt_for_image(tmp_path, prompt)
62
  try:
 
63
  resp = llm.create(
64
  prompt=prompt_full,
65
  max_tokens=256,
 
80
  iface = gr.Interface(
81
  fn=generate_caption_from_url,
82
  inputs=[
83
+ gr.Textbox(label="Image URL", placeholder="https://example.com/photo.jpg"),
84
  gr.Textbox(label="Prompt (optional)", value="Describe the image."),
85
  ],
86
  outputs=gr.Textbox(label="Generated caption"),
87
+ title="JoyCaption GGUF (Q4_K_M)",
88
+ description="Runs a quantized JoyCaption GGUF locally via llama.cpp (no external API).",
89
  )
90
 
91
  if __name__ == "__main__":