import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

model_id = "microsoft/Florence-2-large"

# Florence-2 ships custom modeling code, so trust_remote_code=True is required.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    # device_map="auto",
    cache_dir="./cache",
    # attn_implementation="flash_attention_2",
).eval()

# Note: torch_dtype / device_map / attn_implementation are model-loading
# options and have no effect on the processor, so they are omitted here.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    cache_dir="./cache",
)


def run_example(task_prompt, image, text_input=None):
    # Some tasks (e.g. <CAPTION_TO_PHRASE_GROUNDING>) take extra text after
    # the task token; plain tasks use the token alone. Gradio passes an empty
    # string rather than None for a blank textbox, so test for falsiness.
    if not text_input:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    # Move inputs to the model's device; floating-point tensors (pixel_values)
    # are also cast to the model's dtype.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        model.device, model.dtype
    )
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Keep special tokens: post_process_generation parses the task token
    # out of the raw decoded string.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer


def inference(image, task_prompt, text_input):
    return run_example(task_prompt, image, text_input)


interface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(
            label="Task Prompt",
            placeholder="Enter a Florence-2 task token, e.g. <CAPTION> or <OD>",
        ),
        # gr.Textbox has no `optional` parameter; a blank value simply comes
        # through as an empty string, which run_example treats as "no text".
        gr.Textbox(
            label="Additional Text Input (optional)",
            placeholder="Enter additional text input here",
        ),
    ],
    outputs="text",
    title="Florence-2 Inference",
    description="Generate text from an image and a task prompt using microsoft/Florence-2-large",
)

if __name__ == "__main__":
    interface.launch()
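
# A quick smoke test without the Gradio UI (a minimal sketch): "example.jpg"
# is a hypothetical local file, and the task tokens shown are standard
# Florence-2 task prompts. Uncomment to run directly.
#
# img = Image.open("example.jpg").convert("RGB")
# print(run_example("<CAPTION>", img))                               # plain captioning
# print(run_example("<OD>", img))                                    # object detection
# print(run_example("<CAPTION_TO_PHRASE_GROUNDING>", img, "a dog"))  # phrase grounding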