# Hugging Face Space by chris-thomas
# Uploaded via huggingface_hub (commit 4f8f689)
#import spaces
import gradio as gr
from PIL import Image
import torch
import os
import gc
from pathlib import Path
from model import (
process_images,
DEVICE,
DIMENSION,
MODEL_ID
)
import logging
logging.disable(logging.WARNING)
def create_interface():
    """Build and return the Gradio Blocks UI for the style-transfer demo.

    Lays out three columns — input images, generation controls, and the
    output — and wires the "Generate" button to ``process_images`` from
    ``model``. Returns the assembled ``gr.Blocks`` object (not launched).
    """

    def _style_slider(label):
        # Gram-matrix style guidance: every VGG style layer shares the
        # same 0–2 range with a default weight of 1.0.
        return gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.1, label=label)

    def _content_slider(label, value):
        # VGG feature (perceptual) content guidance: 0–20 range; deeper
        # layers default to 0.0 (off), shallow layers to 1.0.
        return gr.Slider(minimum=0.0, maximum=20, value=value, step=0.5, label=label)

    with gr.Blocks() as demo:
        gr.Markdown("""
# Diffusion Style Transfer with Feature, Style, and Latent Loss
This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images.
""")
        with gr.Row():
            # Column 1: the two source images.
            with gr.Column(scale=1):
                init_image = gr.Image(
                    label="Initial Image",
                    type="pil",
                    height=384
                )
                style_image = gr.Image(
                    label="Style Image",
                    type="pil",
                    height=384
                )
            # Column 2: prompts and all guidance controls.
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    lines=2
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="extra details, jpeg artifacts, chromatic aberration"
                )
                with gr.Row():
                    inference_steps = gr.Slider(
                        minimum=20,
                        maximum=100,
                        value=60,
                        step=1,
                        label="Inference Steps"
                    )
                    strength = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.05,
                        label="Initial Image Strength"
                    )
                gr.Markdown("### Gram Matrix Based Style Guidance Scales")
                with gr.Row():
                    style_guidance_1 = _style_slider("Texture Fundamentals")
                with gr.Row():
                    style_guidance_2 = _style_slider("Pattern Assembly")
                with gr.Row():
                    style_guidance_3 = _style_slider("Style Motifs")
                with gr.Row():
                    style_guidance_4 = _style_slider("Compositional Grammar")
                with gr.Row():
                    style_guidance_5 = _style_slider("Style Signature")
                gr.Markdown("### Feature/Perceptual Based Content Guidance Scales")
                with gr.Row():
                    content_guidance_1 = _content_slider("Edge Detector", 1.0)
                with gr.Row():
                    content_guidance_2 = _content_slider("Shape Assembler", 1.0)
                with gr.Row():
                    content_guidance_3 = _content_slider("Part Recognizer", 0.0)
                with gr.Row():
                    content_guidance_4 = _content_slider("Object Former", 0.0)
                with gr.Row():
                    content_guidance_5 = _content_slider("Scene Integrator", 0.0)
                gr.Markdown("### Latent Space Guidance")
                latent_guidance = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=24,
                    step=1,
                    label="Latent Space Closeness",
                    info="Controls how closely the output matches the style image in latent space"
                )
                run_button = gr.Button("Generate")
            # Column 3: generated image plus the explanatory write-up.
            with gr.Column(scale=1):
                output = gr.Image(
                    label="Output",
                    height=384
                )
                gr.Markdown("""
The process works in three main steps:
1. **Initial Image Processing**
- Your input image is partially noised based on the 'Initial Image Strength'
- Higher strength means more noise, allowing more creative freedom but less preservation of original content
2. **Diffusion Denoising**
- The image is gradually denoised using Stable Diffusion
- The process is guided by your prompt and negative prompt
- Takes place over the specified number of inference steps
3. **Multi-Level Style and Content Control**
- During denoising, three types of guidance shape the output:
**Style Features** (VGG16 layers):
* Texture Fundamentals: Basic textures and edges
* Pattern Assembly: Repeated elements and patterns
* Style Motifs: Distinctive style elements
* Compositional Grammar: Arrangement of elements
* Style Signature: Overall artistic style
**Content Features** (VGG16 layers):
* Edge Detector: Basic lines and boundaries
* Shape Assembler: Simple geometric forms
* Part Recognizer: Complex shapes and parts
* Object Former: Complete object representations
* Scene Integrator: Overall composition
**Latent Space Matching**:
* Controls how closely the output matches the style image in Stable Diffusion's latent space
* Higher values enforce stronger similarity to the style image's overall structure
* Lower values allow more freedom for the other guidance systems
### Tips for Best Results
- Start with lower strength (0.2-0.4) to preserve more of your initial image
- Use higher style guidance for stronger artistic effect
- Focus content guidance on middle layers for best object preservation
- Balance latent loss weight:
* Higher (30-50) for closer style image matching
* Lower (10-20) for more creative interpretations
[Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/)
[Github repo](https://github.com/chris-thomas/steering-stable-diffusion)
""")
        # Input order must match the parameter order of process_images.
        run_button.click(
            fn=process_images,
            inputs=[
                init_image,
                style_image,
                prompt,
                negative_prompt,
                inference_steps,
                strength,
                style_guidance_1,
                style_guidance_2,
                style_guidance_3,
                style_guidance_4,
                style_guidance_5,
                content_guidance_1,
                content_guidance_2,
                content_guidance_3,
                content_guidance_4,
                content_guidance_5,
                latent_guidance
            ],
            outputs=output
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    create_interface().launch()