"""Gradio app exposing zero-shot object detection over a base64-encoded image."""

from base64 import b64decode
from io import BytesIO

import gradio as gr
import spaces
from PIL import Image
from transformers import pipeline

# Built once at import time so every request reuses the same pipeline object.
model = pipeline(
    task="zero-shot-object-detection",
    model="google/owlvit-large-patch14",
)


@spaces.GPU
def predict(base64: str, texts: str):
    """Run zero-shot object detection on a base64-encoded image.

    Args:
        base64: The image file contents, base64-encoded. (The name shadows
            nothing in this module because only ``b64decode`` is imported.)
        texts: Comma-separated text queries, e.g.
            ``"A photo of a dog,A photo of a cat"``. Whitespace around each
            query is stripped and blank entries are dropped.

    Returns:
        Whatever the detection pipeline returns for the image and queries,
        or an empty list when ``texts`` contains no non-blank query.
    """
    # Split on commas, trim each piece, and discard blanks so that inputs
    # such as "dog,,cat" or "dog," never send an empty query to the model.
    queries = [term.strip() for term in texts.split(",")]
    queries = [query for query in queries if query]
    if not queries:
        # Nothing to look for: skip decoding and inference entirely.
        return []

    # Decode the payload into an in-memory stream that PIL can read.
    image_stream = BytesIO(b64decode(base64))
    img = Image.open(image_stream)

    return model(img, text_queries=queries)


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Image (B64)"),
        gr.Text(label="Queries", placeholder="A photo of a dog,A photo of a cat"),
    ],
    outputs=gr.JSON(label="Predictions"),
)

demo.launch()