from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import gradio as gr

# Load the pre-trained Kosmos-2 model and processor once, at import time,
# so they are not reloaded on every request.
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")


def generate_caption(image: Image.Image) -> str:
    """Generate a descriptive caption for an uploaded image."""
    prompt = "An image of"

    # Ensure the image is in RGB mode before handing it to the processor
    img = image.convert("RGB")

    inputs = processor(text=prompt, images=img, return_tensors="pt")

    # Generate caption token ids and decode them to text
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Strip the grounding/location markup from the raw output, keeping only the caption
    processed_text, _ = processor.post_process_generation(generated_text)
    return processed_text


title = "Image Caption Generator"
description = "Generate descriptive captions for images."
examples = [["https://example.com/image1.jpg"]]
article = "This tool generates descriptive captions for given images."

interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
    title=title,
    description=description,
    examples=examples,
    article=article,
)

interface.launch()
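# A minimal sketch of exercising generate_caption directly, without the Gradio UI.
# "sample.jpg" is a placeholder path; substitute any local image file.
# (Left commented out so it does not run when the app is launched.)
#
#     img = Image.open("sample.jpg")
#     print(generate_caption(img))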