"""Gradio front-end for diffusion style transfer.

Combines Stable Diffusion img2img with VGG16 feature (content) guidance,
Gram-matrix (style) guidance, and latent-space matching, all implemented in
the local ``model`` module. This file only builds the UI and wires it to
``process_images``.
"""

import gradio as gr
from PIL import Image
import torch
import os
import gc
from pathlib import Path
from model import (
    process_images,
    DEVICE,
    DIMENSION,
    MODEL_ID
)
import logging

# Silence warning-level noise emitted at import/startup by the diffusion stack.
logging.disable(logging.WARNING)

# (label, default) pairs for the five Gram-matrix style-guidance sliders.
# Order here defines both the on-screen order and the positional order of the
# style arguments passed to process_images — keep the two in sync by
# construction (see create_interface).
_STYLE_LAYERS = [
    ("Texture Fundamentals", 1.0),
    ("Pattern Assembly", 1.0),
    ("Style Motifs", 1.0),
    ("Compositional Grammar", 1.0),
    ("Style Signature", 1.0),
]

# (label, default) pairs for the five VGG16 feature/content-guidance sliders.
_CONTENT_LAYERS = [
    ("Edge Detector", 1.0),
    ("Shape Assembler", 1.0),
    ("Part Recognizer", 0.0),
    ("Object Former", 0.0),
    ("Scene Integrator", 0.0),
]


def _row_slider(label, value, maximum, step):
    """Create one guidance slider in its own full-width row.

    Args:
        label: Caption shown next to the slider.
        value: Initial slider position.
        maximum: Upper bound of the range (the minimum is always 0.0).
        step: Increment between selectable values.

    Returns:
        The created ``gr.Slider`` component.
    """
    with gr.Row():
        return gr.Slider(
            minimum=0.0,
            maximum=maximum,
            value=value,
            step=step,
            label=label,
        )


def create_interface():
    """Build and return the Gradio Blocks UI for the style-transfer tool."""
    with gr.Blocks() as demo:
        gr.Markdown("""
        # Diffusion Style Transfer with Feature, Style, and Latent Loss

        This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images.
        """)

        with gr.Row():
            # Left column: the two source images.
            with gr.Column(scale=1):
                init_image = gr.Image(
                    label="Initial Image",
                    type="pil",
                    height=384
                )
                style_image = gr.Image(
                    label="Style Image",
                    type="pil",
                    height=384
                )

            # Middle column: prompts and every guidance control.
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    lines=2
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="extra details, jpeg artifacts, chromatic aberration"
                )

                with gr.Row():
                    inference_steps = gr.Slider(
                        minimum=20,
                        maximum=100,
                        value=60,
                        step=1,
                        label="Inference Steps"
                    )
                    strength = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.05,
                        label="Initial Image Strength"
                    )

                gr.Markdown("### Gram Matrix Based Style Guidance Scales")
                style_sliders = [
                    _row_slider(label, default, maximum=2.0, step=0.1)
                    for label, default in _STYLE_LAYERS
                ]

                gr.Markdown("### Feature/Perceptual Based Content Guidance Scales")
                content_sliders = [
                    _row_slider(label, default, maximum=20, step=0.5)
                    for label, default in _CONTENT_LAYERS
                ]

                gr.Markdown("### Latent Space Guidance")
                latent_guidance = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=24,
                    step=1,
                    label="Latent Space Closeness",
                    info="Controls how closely the output matches the style image in latent space"
                )

                run_button = gr.Button("Generate")

            # Right column: result image and usage notes.
            with gr.Column(scale=1):
                output = gr.Image(
                    label="Output",
                    height=384
                )
                gr.Markdown("""
                The process works in three main steps:

                1. **Initial Image Processing**
                   - Your input image is partially noised based on the 'Initial Image Strength'
                   - Higher strength means more noise, allowing more creative freedom but less preservation of original content

                2. **Diffusion Denoising**
                   - The image is gradually denoised using Stable Diffusion
                   - The process is guided by your prompt and negative prompt
                   - Takes place over the specified number of inference steps

                3. **Multi-Level Style and Content Control**
                   - During denoising, three types of guidance shape the output:

                   **Style Features** (VGG16 layers):
                   * Texture Fundamentals: Basic textures and edges
                   * Pattern Assembly: Repeated elements and patterns
                   * Style Motifs: Distinctive style elements
                   * Compositional Grammar: Arrangement of elements
                   * Style Signature: Overall artistic style

                   **Content Features** (VGG16 layers):
                   * Edge Detector: Basic lines and boundaries
                   * Shape Assembler: Simple geometric forms
                   * Part Recognizer: Complex shapes and parts
                   * Object Former: Complete object representations
                   * Scene Integrator: Overall composition

                   **Latent Space Matching**:
                   * Controls how closely the output matches the style image in Stable Diffusion's latent space
                   * Higher values enforce stronger similarity to the style image's overall structure
                   * Lower values allow more freedom for the other guidance systems

                ### Tips for Best Results
                - Start with lower strength (0.2-0.4) to preserve more of your initial image
                - Use higher style guidance for stronger artistic effect
                - Focus content guidance on middle layers for best object preservation
                - Balance latent loss weight:
                  * Higher (30-50) for closer style image matching
                  * Lower (10-20) for more creative interpretations

                [Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/)

                [Github repo](https://github.com/chris-thomas/steering-stable-diffusion)
                """)

        # Positional argument order expected by process_images:
        # images, prompts, steps/strength, 5 style scales, 5 content scales,
        # then the latent weight. Built from the same slider lists that
        # created the UI, so the two cannot drift apart.
        run_button.click(
            fn=process_images,
            inputs=[
                init_image,
                style_image,
                prompt,
                negative_prompt,
                inference_steps,
                strength,
                *style_sliders,
                *content_sliders,
                latent_guidance,
            ],
            outputs=output
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()