import os

import numpy as np
import torch
from PIL import Image
from diffusers import AutoPipelineForImage2Image
import gradio as gr
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Access the API key from the environment and fail fast if it is missing
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("Missing GOOGLE_API_KEY environment variable. Please set it in your .env file.")

# Configure the genai library
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize the Gemini models: model2 describes the input image, model1 rewrites the prompt
model1 = genai.GenerativeModel('gemini-1.0-pro-latest')
model2 = genai.GenerativeModel('gemini-1.5-flash-latest')

# Load the fine-tuned SDXL checkpoint as an image-to-image pipeline
# (the generation call below passes `image=` and `strength=`, which the
# plain text-to-image pipeline does not accept)
model_path = "GiantAnalytics/sdxl_fine_tuned_model_aditya_2"
pipe = AutoPipelineForImage2Image.from_pretrained(model_path, torch_dtype=torch.float16)

# Set the device based on CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe.to(device)


def enhance_prompt_and_generate_images(image, prompt):
    # Gradio delivers the uploaded image as a NumPy array; convert it to a PIL image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image.astype('uint8'), 'RGB')

    try:
        # Step 1: Get an enhanced prompt using the Gemini API.
        # First ask Gemini to describe the texture and design of the input textile image.
        description_prompt = (
            "Describe the texture and the design of the input textile image in detail. "
            "Structure the answer as: Texture Details:, Design Details:, "
            "followed by an overall description of the image."
        )
        response1 = model2.generate_content([description_prompt, image], stream=False)
        response1.resolve()
        initial_description = response1.text

        if initial_description:
            # Then ask Gemini to merge the user's prompt with the image description
            enhanced_prompt = f'''First, identify the user's specifications provided in the prompt: {prompt}.
Understand the image details: {initial_description}.
Now, generate a detailed prompt that combines the user inputs with the image details in a suitable way.
This new prompt will help generate a new image with the SDXL model.
The prompt should be concise and less than 100 tokens; curate it carefully.
Focus on maintaining the theme and the overall feel of the design, incorporating subtle changes that enhance its uniqueness and visual appeal.'''
            response2 = model1.generate_content([enhanced_prompt], stream=False)
            response2.resolve()
            final_prompt = response2.text if response2.text else prompt
        else:
            final_prompt = prompt  # Use the original prompt if no description is available

        print(final_prompt)
    except Exception as e:
        print(f"Failed to enhance prompt via Gemini API: {e}")
        final_prompt = prompt  # Use the original prompt on any error

    # Step 2: Generate three image variations with different guidance_scale/strength settings
    image_variations = []
    settings = [(7.5, 0.5), (8.0, 0.6), (6.0, 0.4)]
    for i, (guidance, strength) in enumerate(settings):
        generator = torch.Generator(device=device).manual_seed(i * 100)
        output = pipe(
            prompt=final_prompt,
            image=image,
            guidance_scale=guidance,
            strength=strength,
            generator=generator,
        ).images[0]
        image_variations.append(output)

    return image_variations


# Path to your local logo image
logo_path = '/content/RCD-Final Logosmall size.jpg'  # Replace with your image path

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown(
                """

# Text Guided Image-to-Image Generation

Enter a text prompt with the required parameters to transform the input image using the fine-tuned SDXL model.

                """,
                elem_id="logo-container",
            )
        with gr.Column(scale=1, elem_id="logo-column"):
            logo = gr.Image(value=logo_path, elem_id="logo", height=128, width=128)

    with gr.Row():
        img_input = gr.Image(label="Upload Image")
        prompt_input = gr.Textbox(label="Enter your prompt")

    submit_btn = gr.Button("Generate")

    with gr.Row():
        output_image1 = gr.Image(label="Variation 1")
        output_image2 = gr.Image(label="Variation 2")
        output_image3 = gr.Image(label="Variation 3")

    submit_btn.click(
        enhance_prompt_and_generate_images,
        inputs=[img_input, prompt_input],
        outputs=[output_image1, output_image2, output_image3],
    )

if __name__ == "__main__":
    demo.launch(debug=True)  # inline=False