"""Gradio app that answers a question about an uploaded image with Kosmos-2."""

import asyncio

import gradio as gr
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Loaded once at import time so every request reuses the same weights.
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

# NOTE: The original Kosmos-2 demo saves the image first and then reloads it.
# For some images this gives a slightly different image input and changes the
# generation output. Here the uploaded image is passed to the processor as-is.


def describe_image(image_path, question: str) -> str:
    """Generate a text answer about an image using the Kosmos-2 model.

    Args:
        image_path: The image to describe. Despite the name, the Gradio
            interface below is configured with ``type='pil'``, so this is the
            uploaded image object (or ``None`` when nothing was uploaded),
            not a filesystem path.
        question: Prompt text, forwarded to the processor unchanged.

    Returns:
        The post-processed generated text (first element of the tuple
        returned by ``processor.post_process_generation``).

    Raises:
        gr.Error: If no image was provided.
    """
    if image_path is None:
        # Fail with a readable message instead of an opaque processor error.
        raise gr.Error("Please upload an image before asking a question.")

    inputs = processor(text=question, images=image_path, return_tensors="pt")

    # `generate` is a regular synchronous call; it must not be awaited.
    generated_ids = model.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        use_cache=True,
        max_new_tokens=128,
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # Pass `cleanup_and_extract=False` to `post_process_generation` in order
    # to see the raw model generation instead of the cleaned-up text.
    processed_text, _entities = processor.post_process_generation(generated_text)
    return processed_text


gr_app = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(label="Upload an image for description", type='pil'),
        gr.Textbox(label="Ask a question about the image"),
    ],
    outputs=[gr.Textbox(label="Image description")],
    title="App for image description",
)

if __name__ == "__main__":
    gr_app.launch(show_error=True)