"""CPU-only Gradio demo: visual question answering with Moondream2 and SmolVLM.

Moondream2 is loaded through a three-stage fallback (image-text-to-text
pipeline, then the legacy VQA pipeline, then raw remote code); SmolVLM has a
single pipeline path. Startup errors are surfaced in the UI instead of
crashing the app.
"""

import gradio as gr
import torch
import transformers
from packaging import version
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
|
MIN_TF = "4.51.0"
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for Moondream2. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )
|
MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
# Pin the remote-code revision so Hub updates cannot silently change behavior.
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

DEVICE = "cpu"
DTYPE = torch.float32
|
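# Moondream2 loader state: PIPE/MODE serve the pipeline paths, MODEL/TOKENIZER
# the raw remote-code fallback, and INIT_ERR collects a combined failure report.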
PIPE = None
MODE = None
MODEL = None
TOKENIZER = None
INIT_ERR = None
|
|
def _try_itt():
    """Preferred path: the modern image-text-to-text pipeline."""
    global PIPE, MODE
    PIPE = pipeline(
        "image-text-to-text",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        torch_dtype=DTYPE,  # torch_dtype is accepted across the pinned 4.x range
        trust_remote_code=True,
        use_fast=True,
    )
    MODE = "itt"
|
|
def _try_vqa():
    """Fallback: the legacy visual-question-answering pipeline."""
    global PIPE, MODE
    PIPE = pipeline(
        "visual-question-answering",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        trust_remote_code=True,
    )
    MODE = "vqa"
|
|
def _try_remote():
    """Last resort: load the raw model and tokenizer via remote code."""
    global MODEL, TOKENIZER, MODE
    TOKENIZER = AutoTokenizer.from_pretrained(
        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
    )
    MODEL = AutoModelForCausalLM.from_pretrained(
        MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map=None,
    ).to(DEVICE)
    MODE = "remote"
|
|
def _boot():
    """Try each Moondream2 loader in turn; record all errors if none works."""
    global INIT_ERR
    try:
        _try_itt()
        return
    except Exception as e_itt:
        try:
            _try_vqa()
            return
        except Exception as e_vqa:
            try:
                _try_remote()
                return
            except Exception as e_remote:
                INIT_ERR = (
                    "Moondream2 initialization failed.\n\n"
                    f"ITT error: {e_itt}\n\n"
                    f"VQA error: {e_vqa}\n\n"
                    f"Remote error: {e_remote}"
                )
|
|
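# Populate the Moondream2 globals at import time so any startup error can be
# shown in the UI below.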
_boot() |
|
|
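# SmolVLM loads through the same image-text-to-text pipeline; a failure here
# only disables that dropdown choice rather than the whole app.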
SMOL_PIPE = None
SMOL_INIT_ERR = None
try:
    SMOL_PIPE = pipeline(
        "image-text-to-text",
        model=SMOL_MODEL_ID,
        device=DEVICE,
        torch_dtype=DTYPE,
        use_fast=True,
        trust_remote_code=True,
    )
except Exception as e:
    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"
|
|
def _normalize(out):
    """Normalize pipeline outputs to a plain string (assistant text only)."""
    if out is None:
        return ""
    if isinstance(out, str):
        return out

    if isinstance(out, dict):
        gen = out.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Chat-style output: prefer the last assistant turn.
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    c = turn.get("content")
                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
            return _normalize(gen[0])
        if isinstance(out.get("text"), str):
            return out["text"]
        return str(out)

    if isinstance(out, (list, tuple)) and out:
        first = out[0]
        if isinstance(first, dict):
            if "generated_text" in first and isinstance(first["generated_text"], str):
                return first["generated_text"]
            if "answer" in first and isinstance(first["answer"], str):
                return first["answer"]
        return _normalize(first)

    return str(out)
|
|
def _infer_remote(image: Image.Image, question: str) -> str:
    """Moondream2 last-resort path via remote-code helpers."""
    if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
        with torch.no_grad():
            img_emb = MODEL.encode_image(image.convert("RGB"))
            ans = MODEL.answer_question(img_emb, question)
            return str(ans).strip()

    # Best-effort text-only generation when the helper methods are absent.
    prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
    with torch.no_grad():
        inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
        out_ids = MODEL.generate(
            **inputs,
            max_new_tokens=128,
            pad_token_id=TOKENIZER.eos_token_id,
        )
    out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
    return out_text.strip()
|
|
def infer(image: Image.Image, question: str, model_choice: str) -> str:
    """Dispatch the question to the selected model and normalize the answer."""
    if model_choice == SMOL_MODEL_ID:
        if SMOL_INIT_ERR:
            return f"⚠️ {SMOL_INIT_ERR}"
        if image is None:
            return "Please upload an image."
        q = (question or "").strip()
        if not q:
            return "Please enter a question."
        try:
            # Preferred chat-style input for image-text-to-text pipelines.
            out = SMOL_PIPE(
                text=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": q},
                    ],
                }],
                max_new_tokens=128,
            )
        except Exception:
            # Older pipeline releases accept a flat dict instead.
            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
        return _normalize(out).strip() or "(empty response)"

    # Moondream2 path.
    if INIT_ERR:
        return f"⚠️ Init error:\n{INIT_ERR}"
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    try:
        if MODE == "itt":
            try:
                out = PIPE(
                    text=[{
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": q},
                        ],
                    }],
                    max_new_tokens=128,
                )
            except Exception:
                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "vqa":
            out = PIPE(image=image, question=q)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "remote":
            return _infer_remote(image, q) or "(empty response)"

        return "Unknown mode."
    except Exception as e:
        return f"⚠️ Inference error: {e}"
|
|
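# Gradio front end: model picker, image + question inputs, answer output.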
with gr.Blocks(title="CPU Vision Q&A") as demo:
    gr.Markdown(
        "## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
        "Upload an image, ask a question, and pick your model."
    )

    if INIT_ERR:
        gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
    if SMOL_INIT_ERR:
        gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")

    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            model_choice = gr.Dropdown(
                choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
                value=MOONDREAM_MODEL_ID,
                label="Model",
            )
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
            btn = gr.Button("Ask")
            ans = gr.TextArea(label="Answer", lines=6)

    btn.click(infer, [img, prompt, model_choice], ans)
    prompt.submit(infer, [img, prompt, model_choice], ans)
|
|
if __name__ == "__main__":
    demo.queue().launch(debug=True)