<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LEDITS++: Limitless Image Editing using Text-to-Image Models</title>
  <meta name="description"
        content="LEDITS++ Limitless Image Editing using Text-to-Image Models">
  <meta name="keywords"
        content="LEDITS++, DPM solver++ inversion, LEDITS, semantic guidance, SEGA, real image editing">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/painting-mascot.svg">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <style>
    /* Keep the publication link buttons legible on their dark background. */
    .publication-links a {
      color: white;
    }

    /* Narrow screens: let wide column content scroll instead of overflowing. */
    @media only screen and (max-width: 900px) {
      .columns {
        overflow-y: scroll;
      }
    }

    /* Wide screens: cap centered images at 80% of the viewport width. */
    @media only screen and (min-width: 901px) {
      .is-centered img {
        width: 80vw;
      }
    }
  </style>
</head>
<body>
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <!-- Hamburger menu toggle (no menu items are defined on this page). -->
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
</nav>
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">LEDITS++: Limitless Image Editing using Text-to-Image Models</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://scholar.google.com/citations?user=kJ9Abf8AAAAJ&amp;hl=en">Manuel Brack</a>¹²⁺,
            </span>
            <span class="author-block">
              <a href="https://www.ml.informatik.tu-darmstadt.de/people/ffriedrich/index.html">Felix Friedrich</a>²³⁺,
            </span>
            <span class="author-block">
              <a href="https://huggingface.co/KatharinaK">Katharina Kornmeier</a>²⁺,
            </span>
            <span class="author-block">
              <a href="https://twitter.com/linoy_tsaban">Linoy Tsaban</a>⁴,
            </span>
            <span class="author-block">
              <a href="https://scholar.google.com/citations?user=GD481RkAAAAJ&amp;hl=de">Patrick Schramowski</a>¹²³⁶,
            </span>
            <span class="author-block">
              <a href="https://ml-research.github.io/people/kkersting/">Kristian Kersting</a>¹²³⁵,
            </span>
            <span class="author-block">
              <a href="https://twitter.com/multimodalart">Apolinário Passos</a>⁴
            </span>
            <div class="is-size-5 publication-authors">
              <span class="author-block">¹ German Research Center for Artificial Intelligence (DFKI),</span>
              <span class="author-block">² Computer Science Department, TU Darmstadt,</span>
              <span class="author-block">³ Hessian.AI,</span>
              <span class="author-block">⁴ Hugging Face 🤗,</span>
              <span class="author-block">⁵ Centre for Cognitive Science, TU Darmstadt,</span>
              <span class="author-block">⁶ LAION,</span>
              <span class="author-block">⁺ equal contribution</span>
            </div>
            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- arXiv link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2311.16711"
                     target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
                <!-- Demo link -->
                <span class="link-block">
                  <a href="https://huggingface.co/spaces/editing-images/ledtisplusplus"
                     target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span>🤗 Demo</span>
                  </a>
                </span>
                <!-- Code link -->
                <span class="link-block">
                  <a href="https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ledits_pp"
                     target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fa fa-code"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- Muted, looping showreel of face edits; decorative, so no controls/captions. -->
      <video autoplay muted loop playsinline height="100%">
        <source src="static/videos/faces.mp4" type="video/mp4">
      </video>
    </div>
  </div>
</section>
<section class="section">
  <div class="container is-max-desktop">
    <!-- Awards -->
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-3 has-text-centered">Awards</h2>
        <div class="content">
          <p>
            We were awarded a Meta Quest 3 for taking second place at the <a href="https://gamgc.github.io/">GenAI Media Generation Challenge Workshop @ CVPR</a>!
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract -->
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-3 has-text-centered">Abstract</h2>
        <div class="content">
          <p>
            Text-to-image diffusion models have recently received a lot of interest for their
            astonishing ability to produce high-fidelity images from text only. Subsequent
            research efforts are aiming to exploit the capabilities of these models and leverage
            them for intuitive, textual image editing. However, existing methods often require
            time-consuming fine-tuning and lack native support for performing multiple edits
            simultaneously. To address these issues, we introduce LEDITS++, an efficient yet
            versatile technique for image editing using text-to-image models. LEDITS++
            requires no tuning nor optimization, runs in a few diffusion steps, natively supports
            multiple simultaneous edits, inherently limits changes to relevant image regions,
            and is architecture agnostic.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <img src="static/images/teaser.png"
           class="interpolation-image"
           style="max-height:700px; max-width:1200px"
           alt="Teaser examples of images edited with LEDITS++">
    </div>
  </div>
</section>
<section class="section">
  <div class="container is-max-desktop">
    <!-- Introduction -->
    <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">LEDITS++: Efficient and Versatile Textual Image Editing</h2>
    </div>
    <div class="content">
      <p>
        To ease textual image editing, we present LEDITS++, a novel method for efficient and versatile image
        editing using text-to-image diffusion models. Firstly, LEDITS++ sets itself apart as a parameter-free
        solution requiring no fine-tuning nor any optimization. We derive characteristics of an edit-friendly
        noise space with a perfect input reconstruction, which were previously proposed for the DDPM
        sampling scheme, for a significantly faster multistep stochastic differential-equation (SDE)
        solver. This novel invertibility of the DPM-solver++ facilitates editing with LEDITS++ in as
        little as 20 total diffusion steps for inversion and inference combined.
        Moreover, LEDITS++ places a strong emphasis on semantic grounding to enhance the visual and
        contextual coherence of the edits. This ensures that changes are limited to the relevant regions in the
        image, preserving the original image’s fidelity as much as possible. LEDITS++ also provides users
        with the flexibility to combine multiple edits seamlessly, opening up new creative possibilities for
        intricate image manipulations. Finally, the approach is architecture-agnostic and compatible with any
        diffusion model, whether latent or pixel-based.
      </p>
      <section class="section">
        <div class="container is-max-desktop">
          <div class="columns is-centered has-text-centered">
            <img src="static/images/variations.png"
                 class="interpolation-image"
                 style="max-height:800px; max-width:800px"
                 alt="Example edit variations produced by LEDITS++">
          </div>
          <div class="columns is-centered has-text-centered" style="margin-top: 2.5em;">
            <img src="static/images/smile_progression.png"
                 class="interpolation-image"
                 style="max-height:800px; max-width:800px"
                 alt="Progression of a smile edit at increasing strengths">
          </div>
          <div class="columns is-centered has-text-centered" style="margin-top: 2.5em;">
            <img src="static/images/qualitative_car.png"
                 class="interpolation-image"
                 style="max-height:800px; max-width:800px"
                 alt="Qualitative car editing examples">
          </div>
        </div>
      </section>
      <div class="columns is-centered has-text-centered">
        <h2 class="title is-3">Methodology</h2>
      </div>
      <p>
        The methodology of LEDITS++ can be broken down into three components: (1) efficient image
        inversion, (2) versatile textual editing, and (3) semantic grounding of image changes.
      </p>
    </div>
  </div>
<div class="container is-max-desktop"> | |
<div class="column"> | |
<div class="columns is-centered"> | |
<!-- Editing workflows --> | |
<div class="column"> | |
<div class="content"> | |
<h2 class="title is-4">Component 1: Perfect Inversion</h2> | |
<p> | |
Utilizing T2I models for editing real images is usually done by inverting the sampling | |
process to identify a noisy xT that will be denoised to the input image x0. | |
We draw characteristics from <a href="https://inbarhub.github.io/DDPM_inversion/" | |
target="_blank">edit friendly DDPM inversion</a> and propose | |
an efficient | |
inversion method that greatly reduces the required number | |
of steps while maintaining no reconstruction error. | |
DDPM can be viewed as a first-order | |
SDE solver when formulating the reverse diffusion process as an SDE. This | |
SDE can be solved more efficiently—in fewer steps— | |
using a higher-order differential equation solver, hence we derive a new, faster | |
technique - <b>dpm-solver++ Inversion</b>. | |
</p> | |
<img src="static/images/inversion.png"/> | |
</div> | |
<div class="content"> | |
<h2 class="title is-4">Component 2: Textual Editing</h2> | |
<p> | |
After creating our re-construction sequence, we can edit the image by manipulating | |
the noise estimate εθ based on a set of edit instructions. We devise a dedicated | |
guidance term for each concept based on conditioned and unconditioned estimate. We | |
define LEDITS++ guidance such that it both reflects the direction of the edit (if we | |
want | |
to push away from/towards the edit concept) and maximizes fine-grained control over | |
the effect of the desired edit. | |
</p> | |
<img src="static/images/textual_editing.png"/> | |
</div> | |
<div class="content"> | |
<h2 class="title is-4">Component 3: Semantic Grounding</h2> | |
<p> | |
In our defined LEDITS++ guidance, we include a masking term composed of the | |
intersection between the mask generated from | |
the U-Net’s cross-attention layers and a mask derived from | |
the noise estimate - yielding a mask both focused on relevant image | |
regions and of fine granularity. | |
We empirically demonstrate that these maps can also capture regions | |
of an image relevant to an editing concept that is not already present. | |
Specifically for multiple edits, calculating a | |
dedicated mask for each edit prompt ensures that the corresponding | |
guidance terms remain largely isolated, limiting | |
interference between them. | |
</p> | |
</div> | |
</div> | |
</div> | |
<div class="columns is-centered"> | |
<img | |
style="max-height:800px; max-width:800px" | |
src="static/images/semantic_grounding.png" | |
/> | |
</div> | |
</div> | |
</div> | |
</section> | |
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">Properties of LEDITS++</h2>
    </div>
    <div class="columns is-centered">
      <div class="column">
        <p>
          <b>Efficiency.</b>
          As a parameter-free approach, LEDITS++ does not require any fine-tuning or optimization.
          In addition, we use a recent, fast scheduler, altogether making LEDITS++ six times faster than
          recent DDPM inversion.
        </p>
      </div>
      <div class="column">
        <p>
          <b>Versatility.</b>
          LEDITS++ facilitates fine-grained edits and holistic changes such as style transfer.
          To the best of our knowledge, LEDITS++ is the only diffusion-based image editing method
          inherently supporting multiple edits in isolation.
        </p>
      </div>
      <div class="column">
        <p>
          <b>Precision.</b>
          LEDITS++’s methodology keeps edits concise and avoids unnecessary deviations
          from the input image through perfect inversion and use of implicit masking
          (specifically important for editing multiple concepts simultaneously).
        </p>
      </div>
    </div>
    <video autoplay muted loop playsinline height="100%">
      <source src="static/videos/objects_styles.mp4" type="video/mp4">
    </video>
  </div>
</section>
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">Interactive Demo</h2>
    </div>
    <!-- Gradio web component embedding the live Hugging Face Space. -->
    <script type="module"
            src="https://gradio.s3-us-west-2.amazonaws.com/4.7.1/gradio.js"></script>
    <gradio-app src="https://editing-images-leditsplusplus.hf.space"></gradio-app>
  </div>
</section>
<!-- portraits video -->
<!--<section class="hero teaser">-->
<!--  <div class="container is-max-desktop">-->
<!--    <div class="hero-body">-->
<!--      <video id="portraits" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/videos/portraits.mp4"-->
<!--                type="video/mp4">-->
<!--      </video>-->
<!--      <h2 class="subtitle has-text-centered">-->
<!--        *Gif/image description*-->
<!--      </h2>-->
<!--    </div>-->
<!--  </div>-->
<!--</section>-->
<!-- 3 key observations -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@inproceedings{brack2024ledits,
  year = { 2024 },
  booktitle = { Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) },
  author = { Manuel Brack and Felix Friedrich and Katharina Kornmeier and Linoy Tsaban and Patrick Schramowski and Kristian Kersting and Apolinário Passos },
  title = { LEDITS++: Limitless Image Editing using Text-to-Image Models }
}</code></pre>
  </div>
</section>
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is licensed under a <a rel="license"
            href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
          <p>
            This page was built using the source code of:
            <a href="https://github.com/nerfies/nerfies.github.io">nerfies.github.io</a>
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>
</body>
</html>