"""CPU-only Gradio demo: visual question answering with Moondream2 and SmolVLM.

Moondream2 is loaded through a three-stage fallback (image-text-to-text
pipeline, then the legacy VQA pipeline, then raw remote code); SmolVLM has a
single pipeline path. Startup errors are surfaced in the UI instead of
crashing the app.
"""

import gradio as gr
import torch
import transformers
from packaging import version
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
|
MIN_TF = "4.51.0"
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for Moondream2. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )
|
MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
# Pin the remote-code revision so Hub updates cannot silently change behavior.
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

DEVICE = "cpu"
DTYPE = torch.float32
|
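# Moondream2 loader state: PIPE/MODE serve the pipeline paths, MODEL/TOKENIZER
# the raw remote-code fallback, and INIT_ERR collects a combined failure report.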
PIPE = None
MODE = None
MODEL = None
TOKENIZER = None
INIT_ERR = None
|
|
def _try_itt():
    """Preferred path: the modern image-text-to-text pipeline."""
    global PIPE, MODE
    PIPE = pipeline(
        "image-text-to-text",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        torch_dtype=DTYPE,  # torch_dtype is accepted across the pinned 4.x range
        trust_remote_code=True,
        use_fast=True,
    )
    MODE = "itt"
|
|
def _try_vqa():
    """Fallback: the legacy visual-question-answering pipeline."""
    global PIPE, MODE
    PIPE = pipeline(
        "visual-question-answering",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        trust_remote_code=True,
    )
    MODE = "vqa"
|
|
def _try_remote():
    """Last resort: load the raw model and tokenizer via remote code."""
    global MODEL, TOKENIZER, MODE
    TOKENIZER = AutoTokenizer.from_pretrained(
        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
    )
    MODEL = AutoModelForCausalLM.from_pretrained(
        MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map=None,
    ).to(DEVICE)
    MODE = "remote"
|
|
def _boot():
    """Try each Moondream2 loader in turn; record all errors if none works."""
    global INIT_ERR
    try:
        _try_itt()
        return
    except Exception as e_itt:
        try:
            _try_vqa()
            return
        except Exception as e_vqa:
            try:
                _try_remote()
                return
            except Exception as e_remote:
                INIT_ERR = (
                    "Moondream2 initialization failed.\n\n"
                    f"ITT error: {e_itt}\n\n"
                    f"VQA error: {e_vqa}\n\n"
                    f"Remote error: {e_remote}"
                )
|
|
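# Populate the Moondream2 globals at import time so any startup error can be
# shown in the UI below.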
_boot() |
|
|
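# SmolVLM loads through the same image-text-to-text pipeline; a failure here
# only disables that dropdown choice rather than the whole app.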
SMOL_PIPE = None
SMOL_INIT_ERR = None
try:
    SMOL_PIPE = pipeline(
        "image-text-to-text",
        model=SMOL_MODEL_ID,
        device=DEVICE,
        torch_dtype=DTYPE,
        use_fast=True,
        trust_remote_code=True,
    )
except Exception as e:
    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"
|
|
def _normalize(out):
    """Normalize pipeline outputs to a plain string (assistant text only)."""
    if out is None:
        return ""
    if isinstance(out, str):
        return out

    if isinstance(out, dict):
        gen = out.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Chat-style output: prefer the last assistant turn.
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    c = turn.get("content")
                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
            return _normalize(gen[0])
        if isinstance(out.get("text"), str):
            return out["text"]
        return str(out)

    if isinstance(out, (list, tuple)) and out:
        first = out[0]
        if isinstance(first, dict):
            if "generated_text" in first and isinstance(first["generated_text"], str):
                return first["generated_text"]
            if "answer" in first and isinstance(first["answer"], str):
                return first["answer"]
        return _normalize(first)

    return str(out)
|
|
def _infer_remote(image: Image.Image, question: str) -> str:
    """Moondream2 last-resort path via remote-code helpers."""
    if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
        with torch.no_grad():
            img_emb = MODEL.encode_image(image.convert("RGB"))
            ans = MODEL.answer_question(img_emb, question)
            return str(ans).strip()

    # Best-effort text-only generation when the helper methods are absent.
    prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
    with torch.no_grad():
        inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
        out_ids = MODEL.generate(
            **inputs,
            max_new_tokens=128,
            pad_token_id=TOKENIZER.eos_token_id,
        )
    out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
    return out_text.strip()
|
|
def infer(image: Image.Image, question: str, model_choice: str) -> str:
    """Dispatch the question to the selected model and normalize the answer."""
    if model_choice == SMOL_MODEL_ID:
        if SMOL_INIT_ERR:
            return f"⚠️ {SMOL_INIT_ERR}"
        if image is None:
            return "Please upload an image."
        q = (question or "").strip()
        if not q:
            return "Please enter a question."
        try:
            # Preferred chat-style input for image-text-to-text pipelines.
            out = SMOL_PIPE(
                text=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": q},
                    ],
                }],
                max_new_tokens=128,
            )
        except Exception:
            # Older pipeline releases accept a flat dict instead.
            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
        return _normalize(out).strip() or "(empty response)"

    # Moondream2 path.
    if INIT_ERR:
        return f"⚠️ Init error:\n{INIT_ERR}"
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    try:
        if MODE == "itt":
            try:
                out = PIPE(
                    text=[{
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": q},
                        ],
                    }],
                    max_new_tokens=128,
                )
            except Exception:
                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "vqa":
            out = PIPE(image=image, question=q)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "remote":
            return _infer_remote(image, q) or "(empty response)"

        return "Unknown mode."
    except Exception as e:
        return f"⚠️ Inference error: {e}"
|
|
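# Gradio front end: model picker, image + question inputs, answer output.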
with gr.Blocks(title="CPU Vision Q&A") as demo:
    gr.Markdown(
        "## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
        "Upload an image, ask a question, and pick your model."
    )

    if INIT_ERR:
        gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
    if SMOL_INIT_ERR:
        gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")

    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            model_choice = gr.Dropdown(
                choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
                value=MOONDREAM_MODEL_ID,
                label="Model",
            )
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
            btn = gr.Button("Ask")
            ans = gr.TextArea(label="Answer", lines=6)

    btn.click(infer, [img, prompt, model_choice], ans)
    prompt.submit(infer, [img, prompt, model_choice], ans)
|
|
if __name__ == "__main__":
    demo.queue().launch(debug=True)