Spaces:

chris-thomas
/

steering-diffusion-with-features-and-style-loss

Sleeping

App Files Files Community

steering-diffusion-with-features-and-style-loss / app.py

chris-thomas

Upload folder using huggingface_hub

4f8f689 verified 5 months ago

raw

history blame contribute delete

7.46 kB

	#import spaces
	import gradio as gr
	from PIL import Image
	import torch
	import os
	import gc
	from pathlib import Path

	from model import (
	process_images,
	DEVICE,
	DIMENSION,
	MODEL_ID
	)

	import logging
	# Process-wide override: drop every log record of severity WARNING or lower,
	# regardless of individual logger levels (ERROR and CRITICAL still pass).
	logging.disable(level=logging.WARNING)

	def create_interface():
	    """Build the Gradio Blocks UI for the diffusion style-transfer demo.

	    The page consists of an introductory Markdown header, a row of three
	    columns (the two input images; the prompts, sliders and "Generate"
	    button; the output image) and a longer Markdown explanation. Clicking
	    "Generate" calls ``process_images`` with the 17 input components, in
	    the order given by ``click_inputs`` below, and routes its result to
	    the output image component.

	    Returns:
	        The assembled ``gr.Blocks`` app. It is not launched here; the
	        caller decides when to call ``launch()``.
	    """
	    # Page texts are kept in locals so the layout code below stays compact.
	    intro_md = """
	# Diffusion Style Transfer with Feature, Style, and Latent Loss

	This tool combines Stable Diffusion with VGG feature control and latent matching to create styled images.
	"""
	    guide_md = """
	The process works in three main steps:

	1. Initial Image Processing
	- Your input image is partially noised based on the 'Initial Image Strength'
	- Higher strength means more noise, allowing more creative freedom but less preservation of original content

	2. Diffusion Denoising
	- The image is gradually denoised using Stable Diffusion
	- The process is guided by your prompt and negative prompt
	- Takes place over the specified number of inference steps

	3. Multi-Level Style and Content Control
	- During denoising, three types of guidance shape the output:

	Style Features (VGG16 layers):
	* Texture Fundamentals: Basic textures and edges
	* Pattern Assembly: Repeated elements and patterns
	* Style Motifs: Distinctive style elements
	* Compositional Grammar: Arrangement of elements
	* Style Signature: Overall artistic style

	Content Features (VGG16 layers):
	* Edge Detector: Basic lines and boundaries
	* Shape Assembler: Simple geometric forms
	* Part Recognizer: Complex shapes and parts
	* Object Former: Complete object representations
	* Scene Integrator: Overall composition

	Latent Space Matching:
	* Controls how closely the output matches the style image in Stable Diffusion's latent space
	* Higher values enforce stronger similarity to the style image's overall structure
	* Lower values allow more freedom for the other guidance systems

	### Tips for Best Results
	- Start with lower strength (0.2-0.4) to preserve more of your initial image
	- Use higher style guidance for stronger artistic effect
	- Focus content guidance on middle layers for best object preservation
	- Balance latent loss weight:
	* Higher (30-50) for closer style image matching
	* Lower (10-20) for more creative interpretations

	[Read more here in my blog post](http://christhomas.co.uk/blog/2025/02/17/how-to-guide-stable-diffusion-with-vgg-features-style-loss-and-latent-mae/)
	[Github repo](https://github.com/chris-thomas/steering-stable-diffusion)
	"""

	    # Labels of the five style sliders, in the order they are passed on click.
	    style_labels = (
	        "Texture Fundamentals",
	        "Pattern Assembly",
	        "Style Motifs",
	        "Compositional Grammar",
	        "Style Signature",
	    )
	    # (label, default value) of the five content sliders, in click order.
	    content_specs = (
	        ("Edge Detector", 1.0),
	        ("Shape Assembler", 1.0),
	        ("Part Recognizer", 0.0),
	        ("Object Former", 0.0),
	        ("Scene Integrator", 0.0),
	    )

	    with gr.Blocks() as demo:
	        gr.Markdown(intro_md)

	        # NOTE(review): the container nesting below was reconstructed from a
	        # copy of this file that had lost its indentation -- confirm the
	        # three-column layout against the running app.
	        with gr.Row():
	            # Left column: the two input images.
	            with gr.Column(scale=1):
	                init_image = gr.Image(label="Initial Image", type="pil", height=384)
	                style_image = gr.Image(label="Style Image", type="pil", height=384)

	            # Middle column: prompts, all guidance controls and the trigger.
	            with gr.Column(scale=1):
	                prompt = gr.Textbox(label="Prompt", lines=2)
	                negative_prompt = gr.Textbox(
	                    label="Negative Prompt",
	                    value="extra details, jpeg artifacts, chromatic aberration",
	                )
	                with gr.Row():
	                    inference_steps = gr.Slider(
	                        label="Inference Steps",
	                        minimum=20,
	                        maximum=100,
	                        value=60,
	                        step=1,
	                    )
	                    strength = gr.Slider(
	                        label="Initial Image Strength",
	                        minimum=0.0,
	                        maximum=1.0,
	                        value=0.2,
	                        step=0.05,
	                    )

	                gr.Markdown("### Gram Matrix Based Style Guidance Scales")
	                style_sliders = []
	                for style_label in style_labels:
	                    # Each slider sits in its own row.
	                    with gr.Row():
	                        style_sliders.append(
	                            gr.Slider(
	                                label=style_label,
	                                minimum=0.0,
	                                maximum=2.0,
	                                value=1.0,
	                                step=0.1,
	                            )
	                        )

	                gr.Markdown("### Feature/Perceptual Based Content Guidance Scales")
	                content_sliders = []
	                for content_label, content_default in content_specs:
	                    with gr.Row():
	                        content_sliders.append(
	                            gr.Slider(
	                                label=content_label,
	                                minimum=0.0,
	                                maximum=20,
	                                value=content_default,
	                                step=0.5,
	                            )
	                        )

	                gr.Markdown("### Latent Space Guidance")
	                latent_guidance = gr.Slider(
	                    label="Latent Space Closeness",
	                    info="Controls how closely the output matches the style image in latent space",
	                    minimum=0,
	                    maximum=1000,
	                    value=24,
	                    step=1,
	                )

	                run_button = gr.Button("Generate")

	            # Right column: the generated image.
	            with gr.Column(scale=1):
	                output = gr.Image(label="Output", height=384)

	        gr.Markdown(guide_md)

	        # Order matters: the components are handed to process_images
	        # positionally (images, prompts, steps, strength, 5 style scales,
	        # 5 content scales, latent scale).
	        click_inputs = [
	            init_image,
	            style_image,
	            prompt,
	            negative_prompt,
	            inference_steps,
	            strength,
	            *style_sliders,
	            *content_sliders,
	            latent_guidance,
	        ]
	        run_button.click(fn=process_images, inputs=click_inputs, outputs=output)

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the Blocks app, then start serving it with
	    # Gradio's default launch settings.
	    demo = create_interface()
	    demo.launch()