"""Gradio front end that wires a set of UI controls to ``model.process_images``."""

# NOTE(review): gc, os, Path, torch, Image, DEVICE, DIMENSION and MODEL_ID are
# not referenced in the visible part of this file. They are kept because other
# code (or import side effects) may rely on them - confirm before removing.
import gc
import logging
import os
from pathlib import Path

import gradio as gr
from PIL import Image
import torch

from model import (
    DEVICE,
    DIMENSION,
    MODEL_ID,
    process_images,
)

# Suppress every logging call of severity WARNING and below, process-wide.
# This runs after the imports above, so it only affects records emitted from
# this point on (anything logged while importing `model` is not suppressed).
logging.disable(logging.WARNING)


# Page header shown at the top of the app. Kept at column 0 so the Markdown
# never picks up source-code indentation.
_INTRO_MARKDOWN = """
# Diffusion Style Transfer with Feature, Style, and Latent Loss

This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images.
"""

# Explanatory text shown below the controls.
# NOTE(review): the nesting of the sub-lists was reconstructed (the source this
# was recovered from had lost its indentation) - confirm the rendered output.
_GUIDE_MARKDOWN = """
The process works in three main steps:

1. **Initial Image Processing**
   - Your input image is partially noised based on the 'Initial Image Strength'
   - Higher strength means more noise, allowing more creative freedom but less preservation of original content

2. **Diffusion Denoising**
   - The image is gradually denoised using Stable Diffusion
   - The process is guided by your prompt and negative prompt
   - Takes place over the specified number of inference steps

3. **Multi-Level Style and Content Control**
   - During denoising, three types of guidance shape the output:

   **Style Features** (VGG16 layers):
   * Texture Fundamentals: Basic textures and edges
   * Pattern Assembly: Repeated elements and patterns
   * Style Motifs: Distinctive style elements
   * Compositional Grammar: Arrangement of elements
   * Style Signature: Overall artistic style

   **Content Features** (VGG16 layers):
   * Edge Detector: Basic lines and boundaries
   * Shape Assembler: Simple geometric forms
   * Part Recognizer: Complex shapes and parts
   * Object Former: Complete object representations
   * Scene Integrator: Overall composition

   **Latent Space Matching**:
   * Controls how closely the output matches the style image in Stable Diffusion's latent space
   * Higher values enforce stronger similarity to the style image's overall structure
   * Lower values allow more freedom for the other guidance systems

### Tips for Best Results
- Start with lower strength (0.2-0.4) to preserve more of your initial image
- Use higher style guidance for stronger artistic effect
- Focus content guidance on middle layers for best object preservation
- Balance latent loss weight:
  * Higher (30-50) for closer style image matching
  * Lower (10-20) for more creative interpretations

[Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/)
[Github repo](https://github.com/chris-thomas/steering-stable-diffusion)
"""

# Labels of the five style-guidance sliders, in the order they are passed on.
_STYLE_SLIDER_LABELS = (
    "Texture Fundamentals",
    "Pattern Assembly",
    "Style Motifs",
    "Compositional Grammar",
    "Style Signature",
)

# (label, initial value) of the five content-guidance sliders, in call order.
_CONTENT_SLIDER_SPECS = (
    ("Edge Detector", 1.0),
    ("Shape Assembler", 1.0),
    ("Part Recognizer", 0.0),
    ("Object Former", 0.0),
    ("Scene Integrator", 0.0),
)


def create_interface():
    """Build the Gradio ``Blocks`` UI and return it without launching it.

    The "Generate" button is bound to ``process_images``. Its inputs are, in
    order: initial image, style image, prompt, negative prompt, inference
    steps, strength, the five style-guidance scales, the five content-guidance
    scales and the latent-guidance scale (17 values). The callback's return
    value is shown in the "Output" image component.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    # NOTE(review): the row/column nesting below was reconstructed from a copy
    # of this file whose indentation had been lost - confirm the page layout.
    with gr.Blocks() as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            # Column 1: the two input images (delivered to the callback as PIL).
            with gr.Column(scale=1):
                init_image = gr.Image(
                    label="Initial Image",
                    type="pil",
                    height=384,
                )
                style_image = gr.Image(
                    label="Style Image",
                    type="pil",
                    height=384,
                )

            # Column 2: prompts, sampling settings and guidance scales.
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    lines=2,
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="extra details, jpeg artifacts, chromatic aberration",
                )
                with gr.Row():
                    inference_steps = gr.Slider(
                        minimum=20,
                        maximum=100,
                        value=60,
                        step=1,
                        label="Inference Steps",
                    )
                    strength = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.05,
                        label="Initial Image Strength",
                    )

                gr.Markdown("### Gram Matrix Based Style Guidance Scales")
                style_guidance = []
                for label in _STYLE_SLIDER_LABELS:
                    # One row per slider, as in the hand-written version.
                    with gr.Row():
                        style_guidance.append(
                            gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                value=1.0,
                                step=0.1,
                                label=label,
                            )
                        )

                gr.Markdown("### Feature/Perceptual Based Content Guidance Scales")
                content_guidance = []
                for label, initial in _CONTENT_SLIDER_SPECS:
                    with gr.Row():
                        content_guidance.append(
                            gr.Slider(
                                minimum=0.0,
                                maximum=20,
                                value=initial,
                                step=0.5,
                                label=label,
                            )
                        )

                gr.Markdown("### Latent Space Guidance")
                latent_guidance = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=24,
                    step=1,
                    label="Latent Space Closeness",
                    info="Controls how closely the output matches the style image in latent space",
                )

                run_button = gr.Button("Generate")

            # Column 3: the generated image.
            with gr.Column(scale=1):
                output = gr.Image(
                    label="Output",
                    height=384,
                )

        gr.Markdown(_GUIDE_MARKDOWN)

        # The order of `inputs` is the positional argument order that
        # `process_images` receives; keep it in sync with that signature.
        run_button.click(
            fn=process_images,
            inputs=[
                init_image,
                style_image,
                prompt,
                negative_prompt,
                inference_steps,
                strength,
                *style_guidance,
                *content_guidance,
                latent_guidance,
            ],
            outputs=output,
        )

    return demo


if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script;
    # importing this module has no such side effect.
    demo = create_interface()
    demo.launch()