import gradio as gr
from transformers import pipeline
from diffusers import StableDiffusionPipeline
import torch
import wget

# Pick the compute device: "cuda" when torch reports an available GPU, otherwise "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the models
# Captioning pipeline: turns an uploaded image into a text caption
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Stable Diffusion pipeline that generates new images from a text prompt;
# loaded first, then moved onto the selected device
sd_pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
sd_pipeline = sd_pipeline.to(device)

# Translation pipeline (used for English to Arabic), loaded with bfloat16 weights
translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    torch_dtype=torch.bfloat16,
    device=device,
)

# Function to generate images based on the image's caption
def generate_image_and_translate(image, num_images=1):
    """Caption an image, translate the caption, and generate images from it.

    The English caption produced by ``caption_image`` is translated to Arabic
    with ``translator`` and is also used as the prompt for ``sd_pipeline``.

    Args:
        image: The uploaded image to caption, or ``None`` when nothing was
            uploaded.
        num_images: How many images to generate from the caption. Coerced to
            ``int`` because a UI slider may deliver the value as a float.

    Returns:
        A tuple ``(generated_images, caption_en, caption_ar)``: the list of
        generated images, the English caption, and its Arabic translation.

    Raises:
        gr.Error: If ``image`` is ``None``, so the UI shows a readable
            message instead of an opaque pipeline failure.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    # range() rejects floats, so normalise the requested count up front
    num_images = int(num_images)

    # Generate caption in English from the uploaded image
    caption_en = caption_image(image)[0]['generated_text']

    # Translate the English caption to Arabic
    caption_ar = translator(
        caption_en, src_lang="eng_Latn", tgt_lang="arb_Arab"
    )[0]['translation_text']

    # One pipeline call per requested image, each prompted with the English
    # caption; only the first image of each call is kept
    generated_images = [
        sd_pipeline(prompt=caption_en).images[0] for _ in range(num_images)
    ]

    # Return the generated images along with both captions
    return generated_images, caption_en, caption_ar

# Input components: the image to caption and how many images to generate
input_components = [
    gr.Image(type="pil", label="📤 Upload Image"),
    gr.Slider(minimum=1, maximum=10, label="🔢 Number of Images", value=1, step=1),
]

# Output components, in the order generate_image_and_translate returns its values:
# generated images, English caption, Arabic caption
output_components = [
    gr.Gallery(label="🖼️ Generated Images"),
    gr.Textbox(label="📝 Generated Caption (English)", interactive=False),
    gr.Textbox(label="🌍 Translated Caption (Arabic)", interactive=False),
]

# Assemble the Gradio interface around the processing function
interface = gr.Interface(
    fn=generate_image_and_translate,
    inputs=input_components,
    outputs=output_components,
    title="Image Generation and Captioning",
    description="Upload an image to extract a caption and display it in both Arabic and English. Then, a new image will be generated based on that caption.",
    theme='freddyaboulton/dracula_revamped',
)

# Start the Gradio application
interface.launch()