"""Streamlit app that generates a natural-language caption for an image URL.

Fixes over the previous revision:
* ``AutoModelForImageCaptioning`` does not exist in ``transformers`` — the
  import crashed on startup.  A real captioning architecture
  (``VisionEncoderDecoderModel``) is used instead.
* ``microsoft/beit-base-patch16-224-in21k`` is a vision-only classification
  checkpoint with no tokenizer and no text decoder; it cannot caption.  The
  ``nlpconnect/vit-gpt2-image-captioning`` checkpoint is trained for this task.
* ``generate_caption`` previously tokenized the *URL string* and fed that to
  ``model.generate`` — the downloaded image was never used by the model.  The
  image is now preprocessed with the model's image processor and passed as
  ``pixel_values``.
"""

import requests
import streamlit as st
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Vision-encoder/decoder checkpoint trained specifically for image captioning.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

# Initialize the tokenizer, image processor and model once at module load.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
image_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)


def generate_caption(image_url):
    """Download the image at *image_url* and return a generated caption.

    Parameters
    ----------
    image_url : str
        URL pointing to an image resource.

    Returns
    -------
    str
        The first generated caption, stripped of surrounding whitespace.

    Raises
    ------
    requests.RequestException
        On network failure or a non-2xx HTTP response.
    PIL.UnidentifiedImageError
        When the response body is not a decodable image.
    """
    # Fail fast on HTTP errors instead of handing an error page to PIL.
    response = requests.get(image_url, stream=True, timeout=30)
    response.raise_for_status()

    # Force 3-channel RGB: grayscale or RGBA inputs would otherwise break
    # the model's expected input shape.
    image = Image.open(response.raw).convert("RGB")

    # The image processor handles resizing and normalization; the resulting
    # pixel_values tensor feeds the vision encoder.
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

    # Generate the caption token ids and decode them back to text.
    output_ids = model.generate(pixel_values, max_length=20)
    captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return captions[0].strip()


def main():
    """Render the sidebar UI and display a caption for the entered URL."""
    st.sidebar.header("Image Caption Generator")
    image_url = st.sidebar.text_input("Enter the URL of an image:")

    # Generate the caption only when the user clicks the button.
    if st.sidebar.button("Generate Caption"):
        if image_url != "":
            try:
                caption = generate_caption(image_url)
            except Exception as exc:
                # Surface download/decode failures to the user instead of
                # crashing the Streamlit script run.
                st.error(f"Could not caption that image: {exc}")
            else:
                st.success(f"Caption: {caption}")
        else:
            st.error("Please enter a valid image URL.")


# Run the main function
if __name__ == "__main__":
    main()