# app.py — CPU-only Gradio app for vikhyatk/moondream2 with resilient fallbacks + selectable SmolVLM
from packaging import version
import transformers
import torch
import gradio as gr
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
MIN_TF = "4.51.0"  # newer Transformers releases are friendlier to custom multimodal configs
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for Moondream2. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )
# --- Models ---
MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
# Pin to a stable snapshot to avoid “new version downloaded” surprises.
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"
DEVICE = "cpu"
DTYPE = torch.float32
# ---- Moondream bootstrap strategy -------------------------------------------
# 1) Try image-text-to-text pipeline (preferred for Q&A)
# 2) If it rejects the custom config, try visual-question-answering pipeline
# 3) If that fails, load the model with trust_remote_code and call its remote methods
PIPE = None
MODE = None # "itt" | "vqa" | "remote"
MODEL = None
TOKENIZER = None
INIT_ERR = None
def _try_itt():
    global PIPE, MODE
    PIPE = pipeline(
        "image-text-to-text",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        dtype=DTYPE,
        trust_remote_code=True,
        use_fast=True,
    )
    MODE = "itt"
def _try_vqa():
    global PIPE, MODE
    PIPE = pipeline(
        "visual-question-answering",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        trust_remote_code=True,
    )
    MODE = "vqa"
def _try_remote():
    # Some Moondream2 snapshots expose custom methods via remote code.
    global MODEL, TOKENIZER, MODE
    TOKENIZER = AutoTokenizer.from_pretrained(
        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
    )
    MODEL = AutoModelForCausalLM.from_pretrained(
        MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map=None,
    ).to(DEVICE)
    MODE = "remote"
def _boot():
    global INIT_ERR
    try:
        _try_itt()
        return
    except Exception as e_itt:
        try:
            _try_vqa()
            return
        except Exception as e_vqa:
            try:
                _try_remote()
                return
            except Exception as e_remote:
                INIT_ERR = (
                    "Moondream2 initialization failed.\n\n"
                    f"ITT error: {e_itt}\n\n"
                    f"VQA error: {e_vqa}\n\n"
                    f"Remote error: {e_remote}"
                )
_boot()
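# Note: _boot() runs at import time, so the model download and the fallback chain
# happen once at Space startup; any failure is kept in INIT_ERR and shown in the UI.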
# ---- SmolVLM (CPU) pipeline --------------------------------------------------
SMOL_PIPE = None
SMOL_INIT_ERR = None
try:
    SMOL_PIPE = pipeline(
        "image-text-to-text",
        model=SMOL_MODEL_ID,
        device=DEVICE,
        dtype=DTYPE,
        use_fast=True,
        trust_remote_code=True,  # harmless if not needed
    )
except Exception as e:
    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"
# ---- Shared helpers ----------------------------------------------------------
def _normalize(out):
"""Normalize pipeline outputs to a plain string (assistant text only)."""
if out is None:
return ""
if isinstance(out, str):
return out
if isinstance(out, dict):
gen = out.get("generated_text")
if isinstance(gen, str):
return gen
if isinstance(gen, (list, tuple)) and gen:
for turn in reversed(gen):
if isinstance(turn, dict) and turn.get("role") == "assistant":
c = turn.get("content")
return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
return _normalize(gen[0])
if isinstance(out.get("text"), str):
return out["text"]
return str(out)
if isinstance(out, (list, tuple)) and out:
first = out[0]
if isinstance(first, dict):
if "generated_text" in first and isinstance(first["generated_text"], str):
return first["generated_text"]
if "answer" in first and isinstance(first["answer"], str):
return first["answer"]
return _normalize(first)
return str(out)
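# Illustrative shapes _normalize is meant to flatten (examples only; actual pipeline
# output formats vary across transformers versions and tasks):
#   _normalize([{"generated_text": "A dog."}])  -> "A dog."
#   _normalize([{"answer": "yes"}])             -> "yes"
#   _normalize({"generated_text": [{"role": "user", "content": "Q?"},
#                                  {"role": "assistant", "content": "A."}]})  -> "A."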
def _infer_remote(image: Image.Image, question: str) -> str:
"""Moondream2 last-resort path via remote-code helpers."""
if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
with torch.no_grad():
img_emb = MODEL.encode_image(image.convert("RGB"))
ans = MODEL.answer_question(img_emb, question)
return str(ans).strip()
prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
with torch.no_grad():
inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
out_ids = MODEL.generate(
**inputs,
max_new_tokens=128,
pad_token_id=TOKENIZER.eos_token_id,
)
out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
return out_text.strip()
# ---- Inference (now with model selection) ------------------------------------
def infer(image: Image.Image, question: str, model_choice: str) -> str:
    if model_choice == SMOL_MODEL_ID:
        if SMOL_INIT_ERR:
            return f"⚠️ {SMOL_INIT_ERR}"
        if image is None:
            return "Please upload an image."
        q = (question or "").strip()
        if not q:
            return "Please enter a question."
        try:
            out = SMOL_PIPE(
                text=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": q},
                    ],
                }],
                max_new_tokens=128,
            )
        except Exception:
            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
        return _normalize(out).strip() or "(empty response)"

    # Default path: Moondream2 (unchanged logic)
    if INIT_ERR:
        return f"⚠️ Init error:\n{INIT_ERR}"
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    try:
        if MODE == "itt":
            try:
                out = PIPE(
                    text=[{
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": q},
                        ],
                    }],
                    max_new_tokens=128,
                )
            except Exception:
                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
            return _normalize(out).strip() or "(empty response)"
        if MODE == "vqa":
            out = PIPE(image=image, question=q)
            return _normalize(out).strip() or "(empty response)"
        if MODE == "remote":
            return _infer_remote(image, q) or "(empty response)"
        return "Unknown mode."
    except Exception as e:
        return f"⚠️ Inference error: {e}"
# ---- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="CPU Vision Q&A") as demo:
gr.Markdown("## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
"Upload an image, ask a question, and pick your model.")
# Show Moondream init status (kept from your original app)
if INIT_ERR:
gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
if SMOL_INIT_ERR:
gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")
with gr.Row():
img = gr.Image(type="pil", label="Upload an image")
with gr.Column():
# NEW: model selector (default = Moondream2) — minimal surface change
model_choice = gr.Dropdown(
choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
value=MOONDREAM_MODEL_ID,
label="Model",
)
prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
btn = gr.Button("Ask")
ans = gr.TextArea(label="Answer", lines=6)
# Wire the new dropdown into the call; everything else is unchanged
btn.click(infer, [img, prompt, model_choice], ans)
prompt.submit(infer, [img, prompt, model_choice], ans)
if __name__ == "__main__":
    demo.queue().launch(debug=True)
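# Suggested requirements.txt for this Space (an assumption based on the imports above;
# pin versions as needed):
#   gradio
#   torch
#   transformers>=4.51,<5
#   pillow
#   packaging
# Some Moondream2 snapshots pull extra dependencies (e.g. einops) via trust_remote_code;
# add them if the startup logs complain about missing modules.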