<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <meta name="description"
          content="LEDITS++ Limitless Image Editing using Text-to-Image Models">
    <meta name="keywords"
          content="LEDITS++, DPM solver++ inversion, LEDITS, semantic guidance, SEGA, real image editing">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title> LEDITS++: Limitless Image Editing using Text-to-Image Models </title>

    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
          rel="stylesheet">

    <link rel="stylesheet" href="./static/css/bulma.min.css">
    <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
    <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
    <link rel="stylesheet"
          href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
    <link rel="stylesheet" href="./static/css/index.css">
    <link rel="icon" href="./static/images/painting-mascot.svg">

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
    <script src="./static/js/bulma-carousel.min.js"></script>
    <script src="./static/js/bulma-slider.min.js"></script>
    <script src="./static/js/index.js"></script>
    <style>
        .publication-links a {
            color: white !important
        }

        @media only screen and (max-width: 900px) {
            .columns {
                overflow-y: scroll;
            }
        }

        @media only screen and (min-width: 901px) {
            .is-centered img {
                width: 80vw !important
            }
        }
    </style>
</head>
<body>

<nav class="navbar" role="navigation" aria-label="main navigation">
    <div class="navbar-brand">
        <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
            <span aria-hidden="true"></span>
            <span aria-hidden="true"></span>
            <span aria-hidden="true"></span>
        </a>
    </div>
</nav>


<section class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">
                    <h1 class="title is-1 publication-title">LEDITS++: Limitless Image Editing using Text-to-Image
                        Models</h1>
                    <div class="is-size-5 publication-authors">
                       <span class="author-block">
                                  <a href="https://scholar.google.com/citations?user=kJ9Abf8AAAAJ&hl=en">Manuel Brack</a>¹²⁺,
                        </span>
                        <span class="author-block">
                                  <a href="https://www.ml.informatik.tu-darmstadt.de/people/ffriedrich/index.html">Felix Friedrich</a>²³⁺,
                        </span>
                        <span class="author-block">
                                  <a href="https://huggingface.co/KatharinaK">Katharina Kornmeier</a>²⁺,
                        </span>
                        <span class="author-block">
                                  <a href="https://twitter.com/linoy_tsaban">Linoy Tsaban</a>⁴,
                        </span>
                        <span class="author-block">
                                  <a href="https://scholar.google.com/citations?user=GD481RkAAAAJ&hl=de">Patrick Schramowski</a>¹²³⁶,
                        </span>
                        <span class="author-block">
                                  <a href="https://ml-research.github.io/people/kkersting/">Kristian Kersting</a>¹²³⁵,
                        </span>
                        <span class="author-block">
                                  <a href="https://twitter.com/multimodalart">Apolinário Passos</a></span>
                        <p></p>

                        <div class="is-size-5 publication-authors">
                            <span class="author-block">¹ German Research Center for Artificial Intelligence (DFKI),</span>
                            <span class="author-block">² Computer Science Department, TU Darmstadt,</span>
                            <span class="author-block">³ Hessian.AI,</span>
                            <span class="author-block">⁴ Hugging Face &#129303;,</span>
                            <span class="author-block">⁵ Centre for Cognitive Science, TU Darmstadt,</span>
                            <span class="author-block">⁶ LAION,</span>
                            <span class="author-block">⁺ equal contribution</span>
                        </div>
                        <div class="column has-text-centered">
                            <div class="publication-links">
                                <!-- arxiv Link. -->
                                <span class="link-block">
                     <a href="https://arxiv.org/abs/2311.16711"
                        class="external-link button is-normal is-rounded is-dark">
                     <span class="icon">
                     <i class="ai ai-arxiv"></i>
                     </span>
                     <span>arXiv</span>
                     </a>
                     </span>
                                <!-- Demo Link. -->
                                <span class="link-block">
                     <a href="https://huggingface.co/spaces/editing-images/ledtisplusplus"
                        target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
                     <span>&#129303; Demo</span>
                     </a>
                     </span>
                                <!-- Code Link. -->
                                <span class="link-block">
                     <a href="https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ledits_pp"
                        target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
                     <span class="icon">
                     <i class="fa fa-code"></i>
                     </span>
                     <span>Code</span>
                     </a>
                     </span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</section>

<section class="hero teaser">
    <div class="container is-max-desktop">
        <div class="hero-body">

            <video autoplay muted loop playsinline height="100%">
                <source src="static/videos/faces.mp4"
                        type="video/mp4">
            </video>


            <h2 class="subtitle has-text-centered">
            </h2>

        </div>
    </div>
</section>

<section class="section">
    <div class="container is-max-desktop">
        <!-- Abstract. -->
        <div class="columns is-centered">
            <div class="column">
                <h3 class="title is-3 has-text-centered">Awards</h3>
                <div class="content">
                    <p>
                      We were awarded a Meta Quest 3 for placing second at the <a href="https://gamgc.github.io/">GenAI Media Generation Challenge Workshop @ CVPR</a>!
                    </p>
                </div>
            </div>
        </div>

    </div>
</section>
  
<section class="section">
    <div class="container is-max-desktop">
        <!-- Abstract. -->
        <div class="columns is-centered">
            <div class="column">
                <h2 class="title is-3 has-text-centered">Abstract</h2>
                <div class="content">
                    <p>
                        Text-to-image diffusion models have recently received a lot of interest for their
                        astonishing ability to produce high-fidelity images from text only. Subsequent
                        research efforts are aiming to exploit the capabilities of these models and leverage
                        them for intuitive, textual image editing. However, existing methods often require
                        time-consuming fine-tuning and lack native support for performing multiple edits
                        simultaneously. To address these issues, we introduce LEDITS++ , an efficient yet
                        versatile technique for image editing using text-to-image models. LEDITS++ re-
                        quires no tuning nor optimization, runs in a few diffusion steps, natively supports
                        multiple simultaneous edits, inherently limits changes to relevant image regions,
                        and is architecture agnostic.
                    </p>

                </div>
            </div>
        </div>

    </div>
</section>

<section class="section">
    <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered">
            <img src="static/images/teaser.png"
                 class="interpolation-image"
                 style="max-height:700px; max-width:1200px"
                 alt="ledits++ teaser"/>
        </div>
    </div>
</section>


<section class="section">
    <div class="container is-max-desktop">
        <!-- Introduction -->
        <div class="columns is-centered has-text-centered">
            <h2 class="title is-3">LEDITS++: Efficient and Versatile Textual Image Editing</h2>
        </div>
        <div class="content">
            <p>
                To ease textual image editing, we present LEDITS++, a novel method for efficient and versatile image
                editing using text-to-image diffusion models. First, LEDITS++ sets itself apart as a parameter-free
                solution requiring neither fine-tuning nor optimization. We transfer the edit-friendly
                noise space with perfect input reconstruction, previously proposed for the DDPM
                sampling scheme, to a significantly faster multistep stochastic differential-equation (SDE)
                solver. This novel invertibility of DPM-Solver++ enables editing with LEDITS++ in as
                few as 20 total diffusion steps for inversion and inference combined.
                Moreover, LEDITS++ places a strong emphasis on semantic grounding to enhance the visual and
                contextual coherence of the edits. This ensures that changes are limited to the relevant regions in the
                image, preserving the original image’s fidelity as much as possible. LEDITS++ also provides users
                with the flexibility to combine multiple edits seamlessly, opening up new creative possibilities for
                intricate image manipulations. Finally, the approach is architecture-agnostic and compatible with any
                diffusion model, whether latent or pixel-based.
            </p>
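            <p>
                The approach is implemented as a pipeline in the Hugging Face diffusers library (see the Code link
                above). As a rough orientation, a minimal usage sketch is shown below; the checkpoint, image URL, and
                parameter values are illustrative placeholders, and exact argument names may differ between diffusers
                versions.
            </p>
            <pre><code>import torch
import PIL.Image
import requests
from diffusers import LEditsPPPipelineStableDiffusion

# Load the LEDITS++ pipeline on top of a standard Stable Diffusion checkpoint.
pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Any RGB input image works; the URL is a placeholder.
image = PIL.Image.open(
    requests.get("https://example.com/input.jpg", stream=True).raw
).convert("RGB")

# Step 1: tuning-free inversion of the input image in few steps.
pipe.invert(image=image, num_inversion_steps=50, skip=0.1)

# Step 2: apply multiple simultaneous edits via textual instructions.
edited = pipe(
    editing_prompt=["glasses", "smile"],
    reverse_editing_direction=[False, False],
    edit_guidance_scale=[5.0, 5.0],
    edit_threshold=[0.9, 0.9],
).images[0]
edited.save("edited.png")</code></pre>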
            <section class="section">
                <div class="container is-max-desktop">

                    <div class="columns is-centered has-text-centered">
                        <img src="static/images/variations.png"
                             class="interpolation-image"
                             style="max-height:800px; max-width:800px"
                             alt="examples"/>
                    </div>
                    <div class="columns is-centered has-text-centered" style="margin-top: 2.5em;">
                        <img src="static/images/smile_progression.png"
                             class="interpolation-image"
                             style="max-height:800px; max-width:800px"
                             alt="examples"/>
                    </div>
                    <div class="columns is-centered has-text-centered" style="margin-top: 2.5em;">
                        <img src="static/images/qualitative_car.png"
                             class="interpolation-image"
                             style="max-height:800px; max-width:800px"
                             alt="examples"/>
                    </div>
                </div>
            </section>
            <div class="columns is-centered has-text-centered">
                <h2 class="title is-3">Methodology
                </h2>
            </div>
            <p>
                The methodology of LEDITS++ can be broken down into three components: (1) efficient image
                inversion, (2) versatile textual editing, and (3) semantic grounding of image changes.
            </p>

            <!--<div class="columns is-centered has-text-centered">
                <img src="static/images/ledits_teaser.jpg"
                     style="max-height:620px; max-width:1000px"
                     alt="diagram"/>
            </div>-->

        </div>


    </div>
    <div class="container is-max-desktop">
        <div class="column">


            <div class="columns is-centered">
                <!-- Editing workflows -->
                <div class="column">
                    <div class="content">
                        <h2 class="title is-4">Component 1: Perfect Inversion</h2>
                        <p>
                            Utilizing T2I models for editing real images usually requires inverting the sampling
                            process to identify a noisy x<sub>T</sub> that will be denoised to the input image x<sub>0</sub>.
                            We draw on the characteristics of <a href="https://inbarhub.github.io/DDPM_inversion/"
                                                            target="_blank">edit-friendly DDPM inversion</a> and propose
                            an efficient inversion method that greatly reduces the required number
                            of steps while introducing no reconstruction error.
                            DDPM can be viewed as a first-order SDE solver when formulating the reverse
                            diffusion process as a stochastic differential equation (SDE). This SDE can be
                            solved more efficiently, in fewer steps, with a higher-order solver, from which
                            we derive a new, faster technique: <b>DPM-Solver++ inversion</b>.

                        </p>
                        <img src="static/images/inversion.png"/>
                    </div>
                    <div class="content">
                        <h2 class="title is-4">Component 2: Textual Editing</h2>
                        <p>
                            After creating our reconstruction sequence, we can edit the image by manipulating
                            the noise estimate ε<sub>θ</sub> based on a set of edit instructions. We devise a dedicated
                            guidance term for each concept based on the conditioned and unconditioned noise
                            estimates. We define the LEDITS++ guidance such that it both reflects the direction
                            of the edit (whether to push towards or away from the edit concept) and maximizes
                            fine-grained control over the effect of the desired edit.

                        </p>
                        <img src="static/images/textual_editing.png"/>
                    </div>
                    <div class="content">
                        <h2 class="title is-4">Component 3: Semantic Grounding</h2>
                        <p>
                            The LEDITS++ guidance includes a masking term composed of the
                            intersection between the mask generated from
                            the U-Net’s cross-attention layers and a mask derived from
                            the noise estimate, yielding a mask that is both focused on relevant image
                            regions and of fine granularity.
                            We empirically demonstrate that these maps can also capture regions
                            of an image relevant to an editing concept that is not yet present in the image.
                            For multiple edits in particular, calculating a
                            dedicated mask for each edit prompt ensures that the corresponding
                            guidance terms remain largely isolated, limiting
                            interference between them.

                        </p>
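                        <p>
                            A rough sketch of such a mask intersection is shown below; the upsampling and the
                            quantile-based thresholds are illustrative choices, not the exact values used in the
                            paper.
                        </p>
                        <pre><code>import torch
import torch.nn.functional as F

def grounding_mask(cross_attn, eps_edit, quantile=0.9):
    """Intersect a coarse cross-attention mask with a fine noise-estimate mask.

    cross_attn: (B, 1, h, w) attention for the edit prompt, averaged over heads/layers
    eps_edit:   (B, C, H, W) concept-specific noise-estimate (guidance) term
    """
    B, C, H, W = eps_edit.shape
    # Coarse mask: upsample the attention map and keep its most-attended fraction.
    attn = F.interpolate(cross_attn, size=(H, W), mode="bilinear", align_corners=False)
    attn_thr = torch.quantile(attn.flatten(1), quantile, dim=1).view(B, 1, 1, 1)
    m_attn = (attn >= attn_thr).float()
    # Fine mask: largest-magnitude entries of the noise-estimate term.
    mag = eps_edit.abs().mean(dim=1, keepdim=True)
    mag_thr = torch.quantile(mag.flatten(1), quantile, dim=1).view(B, 1, 1, 1)
    m_eps = (mag >= mag_thr).float()
    # Intersection: focused on the relevant region and of fine granularity.
    return m_attn * m_eps</code></pre>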

                    </div>

                </div>
            </div>
            <div class="columns is-centered">
                <img
                        style="max-height:800px; max-width:800px"
                        src="static/images/semantic_grounding.png"
                />
            </div>
        </div>
    </div>
</section>
<section class="section">
    <div class="container is-max-desktop">


        <div class="columns is-centered has-text-centered">
            <h2 class="title is-3">Properties of LEDITS++
            </h2>
        </div>
        <div class="columns is-centered">
            <div class="column">
                <p>
                    <b>Efficiency.</b>
                    As a parameter-free approach, LEDITS++ does not require any fine-tuning or optimization.
                    In addition, we use a recent, fast scheduler, which altogether makes LEDITS++ six times
                    faster than recent DDPM inversion.
                </p>
            </div>
            <div class="column">
                <p>
                    <b>Versatility.</b>
                    LEDITS++ facilitates fine-grained edits and holistic changes such as style transfer.
                    To the best of our knowledge, LEDITS++ is the only diffusion-based image editing method
                    inherently supporting multiple edits in isolation.
                </p>
            </div>
            <div class="column">
                <p>
                    <b>Precision.</b>
                    LEDITS++’s methodology keeps edits concise and avoids unnecessary deviations
                    from the input image through perfect inversion and the use of implicit masking
                    (especially important for editing multiple concepts simultaneously).

                </p>
            </div>
        </div>
      <video autoplay muted loop playsinline height="100%">
        <source src="static/videos/objects_styles.mp4"
                type="video/mp4">
    </video>

    </div>
    


</section>
<section class="section">
    <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered">
            <h2 class="title is-3">Interactive Demo</h2>
        </div>
        <script
        	type="module"
        	src="https://gradio.s3-us-west-2.amazonaws.com/4.7.1/gradio.js"
        ></script>
        
        <gradio-app src="https://editing-images-leditsplusplus.hf.space"></gradio-app>

    </div>
</section>


<!-- portraits video -->
<!--<section class="hero teaser">-->
<!--    <div class="container is-max-desktop">-->
<!--        <div class="hero-body">-->
<!--            <video id="portraits" autoplay muted loop playsinline height="100%">-->
<!--                <source src="./static/videos/portraits.mp4"-->
<!--                        type="video/mp4">-->
<!--            </video>-->
<!--            <h2 class="subtitle has-text-centered">-->
<!--               *Gif/image description*-->
<!--            </h2>-->

<!--        </div>-->
<!--    </div>-->
<!--</section>-->

<!-- 3 key observations -->


<section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
        <h2 class="title">BibTeX</h2>
        <pre><code>@inproceedings{brack2024ledits,
          year = { 2024 }, 
          booktitle = { Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) }, 
          author = { Manuel Brack and Felix Friedrich and Katharina Kornmeier and Linoy Tsaban and Patrick Schramowski and Kristian Kersting and Apolinário Passos }, 
          title = { LEDITS++: Limitless Image Editing using Text-to-Image Models }
}</code></pre>
    </div>
</section>


<footer class="footer">
    <div class="container">
        <div class="columns is-centered">
            <div class="column is-8">
                <div class="content">
                    <p>
                        This website is licensed under a <a rel="license"
                                                            href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
                        Commons Attribution-ShareAlike 4.0 International License</a>.
                    </p>
                    <p>
                        This page was built using the source code of:
                        <a rel="nerfies.github.io"
                           href="https://github.com/nerfies/nerfies.github.io">nerfies.github.io</a>
                    </p>
                </div>
            </div>
        </div>
    </div>
</footer>

</body>
</html>