"""Gradio demo: caption an uploaded pigeon photo, then render a styled avatar.

The uploaded image is captioned with an image-to-text model, and the caption
is embedded in a text prompt for a Stable Diffusion pipeline that draws the
avatar in ``image_style``.
"""

import functools

import gradio as gr
import torch
from diffusers import DiffusionPipeline
from diffusers import StableDiffusion3Pipeline
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/stable-diffusion-3.5-medium"
image_style = "pixel art"

# bfloat16 is only selected when CUDA is available; otherwise use float32.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32


@functools.lru_cache(maxsize=1)
def _load_captioner():
    """Build the image-captioning pipeline once and reuse it on later calls."""
    return pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
    )


@functools.lru_cache(maxsize=1)
def _load_diffusion_pipeline():
    """Load the diffusion pipeline onto ``device`` once and reuse it."""
    pipe = DiffusionPipeline.from_pretrained(
        model_repo_id,
        torch_dtype=torch_dtype,
    )
    return pipe.to(device)


def generate_description(image):
    """Return a generated caption for ``image``.

    Args:
        image: The image to caption. The UI below passes the uploaded file's
            path, because the input component uses ``type="filepath"``.

    Returns:
        The ``generated_text`` field of the first captioning result.
    """
    captioner = _load_captioner()
    return captioner(image)[0]["generated_text"]


def generate_image_by_description(description):
    """Generate an avatar image from a textual pigeon description.

    Args:
        description: Text describing the pigeon; interpolated into the prompt.

    Returns:
        The first image produced by the diffusion pipeline.
    """
    pipe = _load_diffusion_pipeline()
    prompt = (
        f"Generate a high-quality, detailed image of a {image_style} of a pigeon. "
        f"The description of the pigeon is: {description}. "
        "Make it visually appealing with clear textures and distinct colors."
    )
    result = pipe(
        prompt,
        num_inference_steps=40,
        guidance_scale=4.5,
    )
    return result.images[0]


def process_and_generate(image):
    """Chain captioning and image generation without displaying the caption.

    Args:
        image: Value of the upload component, or ``None`` if nothing was
            uploaded.

    Returns:
        The generated avatar image.

    Raises:
        gr.Error: If no image was uploaded.
    """
    if image is None:
        # Report the problem in the UI instead of failing inside the
        # captioning pipeline with an opaque traceback.
        raise gr.Error("Please upload an image of the pigeon first.")
    description = generate_description(image)
    return generate_image_by_description(description)


with gr.Blocks() as demo:
    selected_image = gr.Image(
        type="filepath",
        label="Upload an Image of the Pigeon",
    )
    generate_button = gr.Button("Generate Avatar", variant="primary")
    generated_image = gr.Image(type="numpy", label="Generated Avatar")

    generate_button.click(
        process_and_generate,
        inputs=selected_image,
        outputs=generated_image,
    )

if __name__ == "__main__":
    demo.launch()