|
|
import torch |
|
|
import spaces |
|
|
import gradio as gr |
|
|
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
# Hub checkpoint shared by the model weights and the matching processor.
_CHECKPOINT = "google/pix2struct-screen2words-large"

# Load the weights in bfloat16, move them to the GPU and switch the module
# to evaluation mode (this script only runs generation, never training).
model = (
    Pix2StructForConditionalGeneration.from_pretrained(
        _CHECKPOINT,
        dtype=torch.bfloat16,
    )
    .to("cuda")
    .eval()
)

# Processor that prepares the image/text inputs and decodes generated ids.
processor = Pix2StructProcessor.from_pretrained(_CHECKPOINT)
|
|
|
|
|
|
|
|
@spaces.GPU
def describe_ui(image, text):
    """Generate a textual description of a UI screenshot with Pix2Struct.

    Args:
        image: The screenshot to describe, as delivered by the
            ``gr.Image(type="pil")`` input. ``None`` when the user submitted
            the form without uploading an image.
        text: Optional prompt from the textbox. ``None``, an empty string or
            a whitespace-only string means "no prompt".

    Returns:
        The decoded model output as a string, with tokenizer special tokens
        stripped.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque
        # processor failure.
        raise gr.Error("Please upload a UI screenshot first.")

    processor_kwargs = {"images": image, "return_tensors": "pt"}

    prompt = (text or "").strip()
    if prompt:
        # Only condition the generation when there is an actual prompt.
        # The Pix2Struct conditional-generation example in the transformers
        # docs passes add_special_tokens=False for the text prompt, so the
        # same is done here.
        processor_kwargs["text"] = prompt
        processor_kwargs["add_special_tokens"] = False

    # Match the model's device and dtype (bfloat16 on CUDA).
    inputs = processor(**processor_kwargs).to(
        dtype=torch.bfloat16, device="cuda"
    )
    predictions = model.generate(**inputs)

    # Strip special tokens so the textbox shows only the description rather
    # than raw tokenizer markers.
    return processor.decode(predictions[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
# Input widgets, created in the order they are passed to describe_ui.
screenshot_input = gr.Image(type="pil", label="Upload UI Screenshot")
prompt_input = gr.Textbox(
    label="Optional prompt / instruction",
    placeholder="e.g. Describe layout and buttons",
)

# Single text output showing whatever describe_ui returns.
description_output = gr.Textbox(label="Model Output")

# Assemble the web UI around describe_ui and start serving it.
demo = gr.Interface(
    fn=describe_ui,
    inputs=[screenshot_input, prompt_input],
    outputs=description_output,
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and optionally enter a text prompt. The model (Google Pix2Struct) will generate a detailed description.",
)
demo.launch()
|
|
|