# gui/app.py — Hugging Face Space by dpv007 (revision 7d00133)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
# =========================
# Load model (CPU optimized)
# =========================
# Hub id of the checkpoint; AutoProcessor/AutoModel resolve it remotely.
model_id = "microsoft/GUI-Actor-Verifier-2B"
# Processor bundles image preprocessing + chat-template tokenization.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True  # checkpoint ships custom modeling code on the Hub
)
# NOTE: this downloads/loads the full 2B-parameter model at import time,
# so app startup is slow on a fresh container.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32, # CPU needs float32
    device_map="cpu", # force CPU
    low_cpu_mem_usage=True  # stream weights to reduce peak RAM during load
)
# Inference only: switch off dropout / training-mode layers.
model.eval()
# =========================
# Inference
# =========================
def run_model(image, prompt):
    """Run the vision-language model on an uploaded image and a question.

    Args:
        image: PIL image from the Gradio component, or ``None`` when the
            user submitted without uploading one.
        prompt: User question; empty/whitespace-only input falls back to a
            generic "Describe this image." request.

    Returns:
        The model's decoded text answer, or a "❌ ..."-prefixed message on
        missing input or failure (Gradio renders it in the output textbox).
    """
    try:
        if image is None:
            return "❌ Please upload an image."
        # Treat None, "" and whitespace-only strings as "no question asked".
        if not prompt or not prompt.strip():
            prompt = "Describe this image."
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Move only tensor entries to CPU: some processors also return
        # non-tensor metadata, which has no .to() method.
        inputs = {
            k: v.to("cpu") if torch.is_tensor(v) else v
            for k, v in inputs.items()
        }
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # IMPORTANT: keep small for CPU
                do_sample=False,    # greedy decoding for reproducible output
            )
        # Decode only the newly generated tokens, skipping the echoed prompt.
        result = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        # Generated text often starts with stray whitespace/newlines.
        return result.strip()
    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing.
        return f"❌ Error: {str(e)}"
# =========================
# UI
# =========================
# Wire the inference function into a simple image + question → text UI.
image_input = gr.Image(type="pil", label="Upload Image")
question_box = gr.Textbox(label="Your Question")
answer_box = gr.Textbox(label="Model Output")

demo = gr.Interface(
    fn=run_model,
    inputs=[image_input, question_box],
    outputs=answer_box,
    title="GUI Actor Verifier (CPU Mode)",
    description="⚠️ Running on CPU — responses may be slow.",
)
demo.launch()