import requests
import asyncio

from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq


# Hugging Face Hub checkpoint shared by the model weights and its processor.
_CHECKPOINT = "microsoft/kosmos-2-patch14-224"

# Both objects are created once at import time and reused by `describe_image`.
model = AutoModelForVision2Seq.from_pretrained(_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# The original Kosmos-2 demo saves the image first and then reloads it. For some images, this gives a slightly different image input and changes the generation output.

#prompt = "{question}"

def describe_image(image_path, question: str):
  """Answer a question about an image using the module-level Kosmos-2 model.

  Args:
    image_path: The image to describe. Despite the name, the Gradio
      interface in this file is configured with ``type='pil'``, so this is
      an image object rather than a filesystem path. It is forwarded
      unchanged to the processor's ``images`` argument.
    question: Text prompt passed to the processor together with the image.

  Returns:
    The generated text after the processor's default post-processing.
    The entities returned alongside it are discarded.

  Raises:
    ValueError: If no image was supplied.
  """
  # Gradio passes None for an empty image input; fail with a clear message
  # instead of an opaque error raised from inside the processor.
  if image_path is None:
    raise ValueError("Please upload an image before asking a question.")

  inputs = processor(text=question, images=image_path, return_tensors="pt")

  # `generate` is an ordinary blocking call, so it must not be awaited.
  generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    use_cache=True,
    max_new_tokens=128,
  )
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

  # Pass `cleanup_and_extract=False` here instead to inspect the raw model
  # generation; the default call returns the text plus extracted entities.
  processed_text, _entities = processor.post_process_generation(generated_text)

  return processed_text

import gradio as gr

# Input widgets, listed in the order `describe_image` takes its arguments.
image_input = gr.Image(label="Upload an image for description", type='pil')
question_input = gr.Textbox(label="Ask a question about the image")
description_output = gr.Textbox(label="Image description")

# Web UI that routes the uploaded image and the question to `describe_image`.
gr_app = gr.Interface(
  fn=describe_image,
  inputs=[image_input, question_input],
  outputs=[description_output],
  title="App for image description",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
  gr_app.launch(show_error=True)