Spaces:

chris-thomas
/

steering-diffusion-with-features-and-style-loss

Running on Zero

steering-diffusion-with-features-and-style-loss

File size: 7,457 Bytes

#import spaces
import gradio as gr
from PIL import Image
import torch
import os
import gc
from pathlib import Path

from model import (
    process_images,
    DEVICE,
    DIMENSION,
    MODEL_ID
)

import logging
# Process-wide switch: drop every log record of severity WARNING or lower
# (from all loggers, including third-party libraries); only ERROR and
# CRITICAL records are still emitted.
logging.disable(logging.WARNING)

# Header text shown at the top of the page. The literal (including its
# indentation and blank lines) is kept exactly as it was when written inline.
_INTRO_MARKDOWN = """

        # Diffusion Style Transfer with Feature, Style, and Latent Loss



        This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images. 

        """

# Long-form explanation of the pipeline and usage tips. Kept verbatim.
_HOW_IT_WORKS_MARKDOWN = """

        The process works in three main steps:



        1. **Initial Image Processing**

        - Your input image is partially noised based on the 'Initial Image Strength'

        - Higher strength means more noise, allowing more creative freedom but less preservation of original content



        2. **Diffusion Denoising**

        - The image is gradually denoised using Stable Diffusion

        - The process is guided by your prompt and negative prompt

        - Takes place over the specified number of inference steps



        3. **Multi-Level Style and Content Control**

        - During denoising, three types of guidance shape the output:



        **Style Features** (VGG16 layers):

        * Texture Fundamentals: Basic textures and edges

        * Pattern Assembly: Repeated elements and patterns

        * Style Motifs: Distinctive style elements

        * Compositional Grammar: Arrangement of elements

        * Style Signature: Overall artistic style

        

        **Content Features** (VGG16 layers):

        * Edge Detector: Basic lines and boundaries

        * Shape Assembler: Simple geometric forms

        * Part Recognizer: Complex shapes and parts

        * Object Former: Complete object representations

        * Scene Integrator: Overall composition



        **Latent Space Matching**:

        * Controls how closely the output matches the style image in Stable Diffusion's latent space

        * Higher values enforce stronger similarity to the style image's overall structure

        * Lower values allow more freedom for the other guidance systems



        ### Tips for Best Results

        - Start with lower strength (0.2-0.4) to preserve more of your initial image

        - Use higher style guidance for stronger artistic effect

        - Focus content guidance on middle layers for best object preservation

        - Balance latent loss weight:

        * Higher (30-50) for closer style image matching

        * Lower (10-20) for more creative interpretations

                        

        [Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/)   

        [Github repo](https://github.com/chris-thomas/steering-stable-diffusion)                                  

        """

# (label, default value) for each style-guidance slider, in the order the
# values are handed to process_images.
_STYLE_SLIDER_SPECS = (
    ("Texture Fundamentals", 1.),
    ("Pattern Assembly", 1.),
    ("Style Motifs", 1.),
    ("Compositional Grammar", 1.),
    ("Style Signature", 1.),
)

# (label, default value) for each content-guidance slider, same ordering rule.
_CONTENT_SLIDER_SPECS = (
    ("Edge Detector", 1.0),
    ("Shape Assembler", 1.0),
    ("Part Recognizer", 0.0),
    ("Object Former", 0.0),
    ("Scene Integrator", 0.0),
)


def _build_slider_rows(specs, maximum, step):
    """Create one slider per ``(label, default)`` spec, each inside its own row.

    Must be called while a Gradio layout context is open; the sliders are
    attached to whatever container is current. All sliders start at 0.0 and
    share the given ``maximum`` and ``step``.

    Returns the sliders as a list, in the same order as ``specs``.
    """
    sliders = []
    for label, default in specs:
        with gr.Row():
            sliders.append(
                gr.Slider(minimum=0.0, maximum=maximum, value=default, step=step, label=label)
            )
    return sliders


def create_interface():
    """Assemble the style-transfer UI and return it without launching.

    The page is a single row of three columns (input images, generation
    controls, output image) followed by an explanatory Markdown block.
    Clicking "Generate" calls ``process_images`` with seventeen inputs in
    this order: initial image, style image, prompt, negative prompt,
    inference steps, strength, the five style scales, the five content
    scales, and the latent guidance value; its result is shown in the
    output image.

    Returns:
        The configured ``gr.Blocks`` instance.
    """
    with gr.Blocks() as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            # Column 1: the two source images, delivered to the callback as PIL images.
            with gr.Column(scale=1):
                init_image = gr.Image(label="Initial Image", type="pil", height=384)
                style_image = gr.Image(label="Style Image", type="pil", height=384)

            # Column 2: prompts and every tunable scale.
            with gr.Column(scale=1):
                prompt = gr.Textbox(label="Prompt", lines=2)
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="extra details, jpeg artifacts, chromatic aberration",
                )
                with gr.Row():
                    inference_steps = gr.Slider(
                        minimum=20, maximum=100, value=60, step=1, label="Inference Steps"
                    )
                    strength = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.2, step=0.05, label="Initial Image Strength"
                    )

                gr.Markdown("### Gram Matrix Based Style Guidance Scales")
                style_scales = _build_slider_rows(_STYLE_SLIDER_SPECS, maximum=2.0, step=0.1)

                gr.Markdown("### Feature/Perceptual Based Content Guidance Scales")
                content_scales = _build_slider_rows(_CONTENT_SLIDER_SPECS, maximum=20, step=0.5)

                gr.Markdown("### Latent Space Guidance")
                latent_guidance = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=24,
                    step=1,
                    label="Latent Space Closeness",
                    info="Controls how closely the output matches the style image in latent space",
                )

                run_button = gr.Button("Generate")

            # Column 3: the generated result.
            with gr.Column(scale=1):
                output = gr.Image(label="Output", height=384)

            # NOTE(review): this Markdown is created inside the Row, as a sibling
            # of the three columns, rather than below the Row. Kept as-is to
            # preserve the current layout; confirm whether that is intended.
            gr.Markdown(_HOW_IT_WORKS_MARKDOWN)

        # The order of this list is the positional argument order of the callback.
        callback_inputs = [
            init_image,
            style_image,
            prompt,
            negative_prompt,
            inference_steps,
            strength,
            *style_scales,
            *content_scales,
            latent_guidance,
        ]
        run_button.click(fn=process_images, inputs=callback_inputs, outputs=output)

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server. Nothing is
    # launched when this module is merely imported.
    demo = create_interface()
    demo.launch()