#import spaces
import gradio as gr
from PIL import Image
import torch
import os
import gc
from pathlib import Path
from model import (
    process_images,
    DEVICE,
    DIMENSION,
    MODEL_ID
)

import logging
logging.disable(logging.WARNING)
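
# process_images (defined in model.py, not shown here) is assumed to take the
# two PIL images, the prompts, and the guidance scales in the order they are
# wired up in run_button.click below, and to return the generated image.
# DEVICE, DIMENSION and MODEL_ID are imported but not referenced in this file.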


def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("""
        # Diffusion Style Transfer with Feature, Style, and Latent Loss

        This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                init_image = gr.Image(
                    label="Initial Image",
                    type="pil",
                    height=384
                )
                style_image = gr.Image(
                    label="Style Image",
                    type="pil",
                    height=384
                )

            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    lines=2
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="extra details, jpeg artifacts, chromatic aberration"
                )

                with gr.Row():
                    inference_steps = gr.Slider(
                        minimum=20,
                        maximum=100,
                        value=60,
                        step=1,
                        label="Inference Steps"
                    )
                    strength = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.05,
                        label="Initial Image Strength"
                    )
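
                # The five sliders below weight the Gram-matrix style loss at
                # progressively deeper VGG16 feature levels, from basic textures
                # up to the overall artistic style (see the notes under the
                # output image).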
gr.Markdown("### Gram Matrix Based Style Guidance Scales") | |
with gr.Row(): | |
style_guidance_1 = gr.Slider(minimum=0.0, maximum=2.0, value=1., step=0.1, label="Texture Fundamentals") | |
with gr.Row(): | |
style_guidance_2 = gr.Slider(minimum=0.0, maximum=2.0, value=1., step=0.1, label="Pattern Assembly") | |
with gr.Row(): | |
style_guidance_3 = gr.Slider(minimum=0.0, maximum=2.0, value=1., step=0.1, label="Style Motifs") | |
with gr.Row(): | |
style_guidance_4 = gr.Slider(minimum=0.0, maximum=2.0, value=1., step=0.1, label="Compositional Grammar") | |
with gr.Row(): | |
style_guidance_5 = gr.Slider(minimum=0.0, maximum=2.0, value=1., step=0.1, label="Style Signature") | |
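
                # The five sliders below weight the perceptual (feature) loss on
                # the same VGG16 hierarchy; by default only the two shallowest
                # levels are active, which favours edge and shape preservation.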
gr.Markdown("### Feature/Perceptual Based Content Guidance Scales") | |
with gr.Row(): | |
content_guidance_1 = gr.Slider(minimum=0.0, maximum=20, value=1.0, step=0.5, label="Edge Detector") | |
with gr.Row(): | |
content_guidance_2 = gr.Slider(minimum=0.0, maximum=20, value=1.0, step=0.5, label="Shape Assembler") | |
with gr.Row(): | |
content_guidance_3 = gr.Slider(minimum=0.0, maximum=20, value=0.0, step=0.5, label="Part Recognizer") | |
with gr.Row(): | |
content_guidance_4 = gr.Slider(minimum=0.0, maximum=20, value=0.0, step=0.5, label="Object Former") | |
with gr.Row(): | |
content_guidance_5 = gr.Slider(minimum=0.0, maximum=20, value=0.0, step=0.5, label="Scene Integrator") | |
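
                # Latent guidance pulls the denoising latents towards the style
                # image's latents (an MAE term in Stable Diffusion's latent
                # space, per the linked blog post).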
gr.Markdown("### Latent Space Guidance") | |
latent_guidance = gr.Slider( | |
minimum=0, | |
maximum=1000, | |
value=24, | |
step=1, | |
label="Latent Space Closeness", | |
info="Controls how closely the output matches the style image in latent space" | |
) | |
run_button = gr.Button("Generate") | |
with gr.Column(scale=1): | |
output = gr.Image( | |
label="Output", | |
height=384 | |
) | |
gr.Markdown(""" | |
The process works in three main steps: | |
1. **Initial Image Processing** | |
- Your input image is partially noised based on the 'Initial Image Strength' | |
- Higher strength means more noise, allowing more creative freedom but less preservation of original content | |
2. **Diffusion Denoising** | |
- The image is gradually denoised using Stable Diffusion | |
- The process is guided by your prompt and negative prompt | |
- Takes place over the specified number of inference steps | |
3. **Multi-Level Style and Content Control** | |
- During denoising, three types of guidance shape the output: | |
**Style Features** (VGG16 layers): | |
* Texture Fundamentals: Basic textures and edges | |
* Pattern Assembly: Repeated elements and patterns | |
* Style Motifs: Distinctive style elements | |
* Compositional Grammar: Arrangement of elements | |
* Style Signature: Overall artistic style | |
**Content Features** (VGG16 layers): | |
* Edge Detector: Basic lines and boundaries | |
* Shape Assembler: Simple geometric forms | |
* Part Recognizer: Complex shapes and parts | |
* Object Former: Complete object representations | |
* Scene Integrator: Overall composition | |
**Latent Space Matching**: | |
* Controls how closely the output matches the style image in Stable Diffusion's latent space | |
* Higher values enforce stronger similarity to the style image's overall structure | |
* Lower values allow more freedom for the other guidance systems | |
### Tips for Best Results | |
- Start with lower strength (0.2-0.4) to preserve more of your initial image | |
- Use higher style guidance for stronger artistic effect | |
- Focus content guidance on middle layers for best object preservation | |
- Balance latent loss weight: | |
* Higher (30-50) for closer style image matching | |
* Lower (10-20) for more creative interpretations | |
[Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/) | |
[Github repo](https://github.com/chris-thomas/steering-stable-diffusion) | |
""") | |
        run_button.click(
            fn=process_images,
            inputs=[
                init_image,
                style_image,
                prompt,
                negative_prompt,
                inference_steps,
                strength,
                style_guidance_1,
                style_guidance_2,
                style_guidance_3,
                style_guidance_4,
                style_guidance_5,
                content_guidance_1,
                content_guidance_2,
                content_guidance_3,
                content_guidance_4,
                content_guidance_5,
                latent_guidance
            ],
            outputs=output
        )

    return demo
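

# ---------------------------------------------------------------------------
# Illustrative sketch only, not the actual implementation. The guidance lives
# in model.py, which is not shown here. Based on the interface notes above and
# the linked blog post, the three guidance terms are assumed to look roughly
# like this: a Gram-matrix style loss and a perceptual (feature) loss over
# VGG16 activations, plus a mean absolute error between latents. The function
# names and signatures below are hypothetical.
# ---------------------------------------------------------------------------
import torch.nn.functional as F  # torch itself is imported at the top of the file


def gram_matrix(features: torch.Tensor) -> torch.Tensor:
    """Channel correlation statistics of a feature map of shape (B, C, H, W)."""
    b, c, h, w = features.shape
    flat = features.reshape(b, c, h * w)
    return flat @ flat.transpose(1, 2) / (c * h * w)


def style_guidance_loss(gen_feats, style_feats, scales):
    """Weighted Gram-matrix distance across the five VGG16 feature levels."""
    return sum(
        scale * F.mse_loss(gram_matrix(g), gram_matrix(s))
        for g, s, scale in zip(gen_feats, style_feats, scales)
    )


def content_guidance_loss(gen_feats, content_feats, scales):
    """Weighted perceptual (feature) distance across the same five levels."""
    return sum(
        scale * F.mse_loss(g, c)
        for g, c, scale in zip(gen_feats, content_feats, scales)
    )


def latent_guidance_loss(gen_latents, style_latents, weight):
    """Mean absolute error between the current latents and the style image's latents."""
    return weight * F.l1_loss(gen_latents, style_latents)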


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()