|
from transformers import AutoProcessor, AutoModelForCausalLM |
|
from PIL import Image |
|
import gradio as gr |
|
|
|
# Hugging Face Hub identifier of the checkpoint this app serves.
model_id = 'microsoft/Florence-2-large'

# Load the model weights. The checkpoint is loaded with remote code enabled,
# dtype selection left to "auto", and downloads kept under ./cache.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    cache_dir="./cache",
)
# Keep the object returned by .eval() bound to `model` for inference use.
model = model.eval()

# Load the matching processor with the same loading options as the model.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    cache_dir="./cache",
)
|
|
|
def run_example(task_prompt, image, text_input=None):
    """Generate and parse a model answer for ``task_prompt`` applied to ``image``.

    Args:
        task_prompt: Task string that starts the prompt; it is also passed as
            ``task`` to ``processor.post_process_generation``.
        image: Image object exposing ``width`` and ``height`` attributes
            (the Gradio image input in this file is configured with
            ``type="pil"``).
        text_input: Optional extra text appended directly after
            ``task_prompt``. ``None`` means the task prompt is used alone.

    Returns:
        The value returned by ``processor.post_process_generation`` for the
        first generated sequence.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # The model is loaded with torch_dtype="auto", so its parameters are not
    # guaranteed to be float32 or to live on the same device as the freshly
    # built inputs. Align the tensors with the model before generating to
    # avoid dtype/device mismatch errors. Token ids stay integer-typed, so
    # only their device is changed.
    input_ids = inputs["input_ids"].to(model.device)
    pixel_values = inputs["pixel_values"].to(
        device=model.device, dtype=model.dtype
    )

    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text, which is then handed to
    # the processor's post-processing step below.
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]

    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
|
|
|
def inference(image, task_prompt, text_input):
    """Adapt the UI argument order (image first) to ``run_example``.

    Returns whatever ``run_example`` returns for the given inputs.
    """
    return run_example(
        task_prompt=task_prompt,
        image=image,
        text_input=text_input,
    )
|
|
|
# Gradio UI: one image input and two text boxes feed `inference`; the result
# is shown as text.
interface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Task Prompt", placeholder="Enter task prompt here"),
        # The former `optional=True` keyword is intentionally not passed:
        # it is not an accepted Textbox argument in current Gradio releases
        # and makes construction fail. The field can simply be left empty.
        gr.Textbox(
            label="Additional Text Input",
            placeholder="Enter additional text input here (optional)",
        ),
    ],
    outputs="text",
    title="Hugging Face Model Inference",
    description="Generate text based on an image and a prompt using a Hugging Face model",
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|
|