	first commit
 - README.md +3 -3
 - app.py +948 -0
 - examples/prompt_background.txt +8 -0
 - examples/prompt_background_advanced.txt +0 -0
 - examples/prompt_boy.txt +15 -0
 - examples/prompt_girl.txt +16 -0
 - examples/prompt_props.txt +43 -0
 - model.py +1095 -0
 - prompt_util.py +154 -0
 - requirements.txt +16 -0
 - share_btn.py +70 -0
 - util.py +315 -0
 
    	
README.md CHANGED
@@ -1,12 +1,12 @@
 ---
-title:
-emoji:
+title: Semantic Palette with Stable Diffusion 3
+emoji: 🧠🎨3️
 colorFrom: red
 colorTo: yellow
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
-pinned:
+pinned: true
 license: mit
 ---
    	
app.py ADDED
@@ -0,0 +1,948 @@
# Copyright (c) 2024 Jaerin Lee

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import sys

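# NOTE: '../../src' presumably points at the local StreamMultiDiffusion source tree;
# appending a non-existent directory to sys.path is harmless.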
sys.path.append('../../src')

import argparse
import random
import time
import json
import os
import glob
import pathlib
from functools import partial
from pprint import pprint

import numpy as np
from PIL import Image
import torch

import gradio as gr
from huggingface_hub import snapshot_download

from model import StableMultiDiffusion3Pipeline
from util import seed_everything
from prompt_util import preprocess_prompts, _quality_dict, _style_dict


### Utils




def log_state(state):
    pprint(vars(opt))
    if isinstance(state, gr.State):
        state = state.value
    pprint(vars(state))


def is_empty_image(im: Image.Image) -> bool:
    if im is None:
        return True
    im = np.array(im)
    has_alpha = (im.shape[2] == 4)
    if not has_alpha:
        return False
    elif im.sum() == 0:
        return True
    else:
        return False


### Argument passing

parser = argparse.ArgumentParser(description='Semantic Palette demo powered by StreamMultiDiffusion with SD3 support.')
parser.add_argument('-H', '--height', type=int, default=1024)
parser.add_argument('-W', '--width', type=int, default=2560)
parser.add_argument('--model', type=str, default=None, help='Hugging face model repository or local path for a SD1.5 model checkpoint to run.')
parser.add_argument('--bootstrap_steps', type=int, default=2)
parser.add_argument('--seed', type=int, default=-1)
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--port', type=int, default=8000)
opt = parser.parse_args()


### Global variables and data structures

device = f'cuda:{opt.device}' if opt.device >= 0 else 'cpu'


if opt.model is None:
    model_dict = {
        'Stable Diffusion 3': 'stabilityai/stable-diffusion-3-medium-diffusers',
    }
else:
    if opt.model.endswith('.safetensors'):
        opt.model = os.path.abspath(os.path.join('checkpoints', opt.model))
    model_dict = {os.path.splitext(os.path.basename(opt.model))[0]: opt.model}

dtype = torch.float32 if device == 'cpu' else torch.float16
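# One pipeline instance is built per entry in model_dict and kept resident on `device`
# (fp16 on GPU, fp32 on CPU).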
models = {
    k: StableMultiDiffusion3Pipeline(device, dtype=dtype, hf_key=v, has_i2t=False)
    for k, v in model_dict.items()
}


prompt_suggestions = [
    '1girl, souryuu asuka langley, neon genesis evangelion, solo, upper body, v, smile, looking at viewer',
    '1boy, solo, portrait, looking at viewer, white t-shirt, brown hair',
    '1girl, arima kana, oshi no ko, solo, upper body, from behind',
]

opt.max_palettes = 4
opt.default_prompt_strength = 1.0
opt.default_mask_strength = 1.0
opt.default_mask_std = 0.0
opt.default_negative_prompt = (
    'nsfw, worst quality, bad quality, normal quality, cropped, framed'
)
opt.verbose = True
opt.colors = [
    '#000000',
    '#2692F3',
    '#F89E12',
    '#16C232',
    '#F92F6C',
    # '#AC6AEB',
    # '#92C62C',
    # '#92C6EC',
    # '#FECAC0',
]


### Event handlers

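# Each handler below receives the per-session `state` (a simple namespace) holding the
# palette prompts, names, negative prompts, per-palette mask/prompt strengths and stds,
# the selected model/style/quality, the current/active palette indices, and the seed.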
def add_palette(state):
    old_actives = state.active_palettes
    state.active_palettes = min(state.active_palettes + 1, opt.max_palettes)

    if opt.verbose:
        log_state(state)

    if state.active_palettes != old_actives:
        return [state] + [
            gr.update() if state.active_palettes != opt.max_palettes else gr.update(visible=False)
        ] + [
            gr.update() if i != state.active_palettes - 1 else gr.update(value=state.prompt_names[i + 1], visible=True)
            for i in range(opt.max_palettes)
        ]
    else:
        return [state] + [gr.update() for i in range(opt.max_palettes + 1)]


def select_palette(state, button, idx):
    if idx < 0 or idx > opt.max_palettes:
        idx = 0
    old_idx = state.current_palette
    if old_idx == idx:
        return [state] + [gr.update() for _ in range(opt.max_palettes + 7)]

    state.current_palette = idx

    if opt.verbose:
        log_state(state)

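    # Returned updates: the state, one update per palette button (re-styling the old and
    # newly selected buttons), then the rename box, prompt, negative prompt, and the
    # three per-palette sliders.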
    updates = [state] + [
        gr.update() if i not in (idx, old_idx) else
        gr.update(variant='secondary') if i == old_idx else gr.update(variant='primary')
        for i in range(opt.max_palettes + 1)
    ]
    label = 'Background' if idx == 0 else f'Palette {idx}'
    updates.extend([
        gr.update(value=button, interactive=(idx > 0)),
        gr.update(value=state.prompts[idx], label=f'Edit Prompt for {label}'),
        gr.update(value=state.neg_prompts[idx], label=f'Edit Negative Prompt for {label}'),
        (
            gr.update(value=state.mask_strengths[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_mask_strength, interactive=False)
        ),
        (
            gr.update(value=state.prompt_strengths[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_prompt_strength, interactive=False)
        ),
        (
            gr.update(value=state.mask_stds[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_mask_std, interactive=False)
        ),
    ])
    return updates


def change_prompt_strength(state, strength):
    if state.current_palette == 0:
        return state

    state.prompt_strengths[state.current_palette - 1] = strength
    if opt.verbose:
        log_state(state)

    return state


def change_std(state, std):
    if state.current_palette == 0:
        return state

    state.mask_stds[state.current_palette - 1] = std
    if opt.verbose:
        log_state(state)

    return state


def change_mask_strength(state, strength):
    if state.current_palette == 0:
        return state

    state.mask_strengths[state.current_palette - 1] = strength
    if opt.verbose:
        log_state(state)

    return state


def reset_seed(state, seed):
    state.seed = seed
    if opt.verbose:
        log_state(state)

    return state

def rename_prompt(state, name):
    state.prompt_names[state.current_palette] = name
    if opt.verbose:
        log_state(state)

    return [state] + [
        gr.update() if i != state.current_palette else gr.update(value=name)
        for i in range(opt.max_palettes + 1)
    ]


def change_prompt(state, prompt):
    state.prompts[state.current_palette] = prompt
    if opt.verbose:
        log_state(state)

    return state


def change_neg_prompt(state, neg_prompt):
    state.neg_prompts[state.current_palette] = neg_prompt
    if opt.verbose:
        log_state(state)

    return state


def select_model(state, model_id):
    state.model_id = model_id
    if opt.verbose:
        log_state(state)

    return state


def select_style(state, style_name):
    state.style_name = style_name
    if opt.verbose:
        log_state(state)

    return state


def select_quality(state, quality_name):
    state.quality_name = quality_name
    if opt.verbose:
        log_state(state)

    return state


def import_state(state, json_text):
    current_palette = state.current_palette
    # active_palettes = state.active_palettes
    state = argparse.Namespace(**json.loads(json_text))
    state.active_palettes = opt.max_palettes
    return [state] + [
        gr.update(value=v, visible=True) for v in state.prompt_names
    ] + [
        # state.model_id,
        # state.style_name,
        # state.quality_name,
        state.prompts[current_palette],
        state.prompt_names[current_palette],
        state.neg_prompts[current_palette],
        state.prompt_strengths[current_palette - 1],
        state.mask_strengths[current_palette - 1],
        state.mask_stds[current_palette - 1],
        state.seed,
    ]


### Main worker

def generate(state, *args, **kwargs):
    return models[state.model_id](*args, **kwargs)



def run(state, drawpad):
    seed_everything(state.seed if state.seed >=0 else np.random.randint(2147483647))
    print('Generate!')

    background = drawpad['background'].convert('RGBA')
    inpainting_mode = np.asarray(background).sum() != 0
    print('Inpainting mode: ', inpainting_mode)

    user_input = np.asarray(drawpad['layers'][0]) # (H, W, 4)
    foreground_mask = torch.tensor(user_input[..., -1])[None, None] # (1, 1, H, W)
    user_input = torch.tensor(user_input[..., :-1]) # (H, W, 3)

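    # Match drawn pixels exactly against the palette colors (hex -> RGB) to build one
    # binary mask per palette; `has_masks` keeps only the palettes that were actually painted.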
    palette = torch.tensor([
        tuple(int(s[i+1:i+3], 16) for i in (0, 2, 4))
        for s in opt.colors[1:]
    ]) # (N, 3)
    masks = (palette[:, None, None, :] == user_input[None]).all(dim=-1)[:, None, ...] # (N, 1, H, W)
    has_masks = [i for i, m in enumerate(masks.sum(dim=(1, 2, 3)) == 0) if not m]
    print('Has mask: ', has_masks)
    masks = masks * foreground_mask
    masks = masks[has_masks]

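    # A non-empty background image switches to inpainting; otherwise an all-ones mask for
    # the background prompt is prepended so the whole canvas is generated.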
    if inpainting_mode:
        prompts = [state.prompts[v + 1] for v in has_masks]
        negative_prompts = [state.neg_prompts[v + 1] for v in has_masks]
        mask_strengths = [state.mask_strengths[v] for v in has_masks]
        mask_stds = [state.mask_stds[v] for v in has_masks]
        prompt_strengths = [state.prompt_strengths[v] for v in has_masks]
    else:
        masks = torch.cat([torch.ones_like(foreground_mask), masks], dim=0)
        prompts = [state.prompts[0]] + [state.prompts[v + 1] for v in has_masks]
        negative_prompts = [state.neg_prompts[0]] + [state.neg_prompts[v + 1] for v in has_masks]
        mask_strengths = [1] + [state.mask_strengths[v] for v in has_masks]
        mask_stds = [0] + [state.mask_stds[v] for v in has_masks]
        prompt_strengths = [1] + [state.prompt_strengths[v] for v in has_masks]

    prompts, negative_prompts = preprocess_prompts(
        prompts, negative_prompts, style_name=state.style_name, quality_name=state.quality_name)

    return generate(
        state,
        prompts,
        negative_prompts,
        masks=masks,
        mask_strengths=mask_strengths,
        mask_stds=mask_stds,
        prompt_strengths=prompt_strengths,
        background=background.convert('RGB'),
        background_prompt=state.prompts[0],
        background_negative_prompt=state.neg_prompts[0],
        height=opt.height,
        width=opt.width,
        bootstrap_steps=2,
        guidance_scale=0,
    )



### Load examples


root = pathlib.Path(__file__).parent
print(root)
example_root = os.path.join(root, 'examples')
example_images = glob.glob(os.path.join(example_root, '*.webp'))
example_images = [Image.open(i) for i in example_images]

with open(os.path.join(example_root, 'prompt_background_advanced.txt')) as f:
    prompts_background = [l.strip() for l in f.readlines() if l.strip() != '']

with open(os.path.join(example_root, 'prompt_girl.txt')) as f:
    prompts_girl = [l.strip() for l in f.readlines() if l.strip() != '']

with open(os.path.join(example_root, 'prompt_boy.txt')) as f:
    prompts_boy = [l.strip() for l in f.readlines() if l.strip() != '']

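# prompt_props.txt: each non-empty line is '<name>, <prompt>'; split on the first comma
# into a name -> prompt mapping.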
with open(os.path.join(example_root, 'prompt_props.txt')) as f:
    prompts_props = [l.strip() for l in f.readlines() if l.strip() != '']
    prompts_props = {l.split(',')[0].strip(): ','.join(l.split(',')[1:]).strip() for l in prompts_props}

prompt_background = lambda: random.choice(prompts_background)
prompt_girl = lambda: random.choice(prompts_girl)
prompt_boy = lambda: random.choice(prompts_boy)
prompt_props = lambda: np.random.choice(list(prompts_props.keys()), size=(opt.max_palettes - 2), replace=False).tolist()


### Main application

css = f"""
#run-button {{
    font-size: 30pt;
    background-image: linear-gradient(to right, #4338ca 0%, #26a0da 51%, #4338ca 100%);
    margin: 0;
    padding: 15px 45px;
    text-align: center;
    text-transform: uppercase;
    transition: 0.5s;
    background-size: 200% auto;
    color: white;
    box-shadow: 0 0 20px #eee;
    border-radius: 10px;
    display: block;
    background-position: right center;
}}

#run-button:hover {{
    background-position: left center;
    color: #fff;
    text-decoration: none;
}}

#semantic-palette {{
    border-style: solid;
    border-width: 0.2em;
    border-color: #eee;
}}

#semantic-palette:hover {{
    box-shadow: 0 0 20px #eee;
}}

#output-screen {{
    width: 100%;
    aspect-ratio: {opt.width} / {opt.height};
}}

.layer-wrap {{
    display: none;
}}

.rainbow {{
    text-align: center;
    text-decoration: underline;
    font-size: 32px;
    font-family: monospace;
    letter-spacing: 5px;
}}
.rainbow_text_animated {{
    background: linear-gradient(to right, #6666ff, #0099ff , #00ff00, #ff3399, #6666ff);
    -webkit-background-clip: text;
    background-clip: text;
    color: transparent;
    animation: rainbow_animation 6s ease-in-out infinite;
    background-size: 400% 100%;
}}

@keyframes rainbow_animation {{
    0%,100% {{
        background-position: 0 0;
    }}

    50% {{
        background-position: 100% 0;
    }}
}}

.gallery {{
  --z: 16px;  /* control the zig-zag  */
         
     | 
| 468 | 
         
            +
              --s: 144px; /* control the size */
         
     | 
| 469 | 
         
            +
              --g: 4px;   /* control the gap */
         
     | 
| 470 | 
         
            +
              
         
     | 
| 471 | 
         
            +
              display: grid;
         
     | 
| 472 | 
         
            +
              gap: var(--g);
         
     | 
| 473 | 
         
            +
              width: calc(2*var(--s) + var(--g));
         
     | 
| 474 | 
         
            +
              grid-auto-flow: column;
         
     | 
| 475 | 
         
            +
            }}
         
     | 
| 476 | 
         
            +
            .gallery > a {{
         
     | 
| 477 | 
         
            +
              width: 0;
         
     | 
| 478 | 
         
            +
              min-width: calc(100% + var(--z)/2);
         
     | 
| 479 | 
         
            +
              height: var(--s);
         
     | 
| 480 | 
         
            +
              object-fit: cover;
         
     | 
| 481 | 
         
            +
              -webkit-mask: var(--mask);
         
     | 
| 482 | 
         
            +
                      mask: var(--mask);
         
     | 
| 483 | 
         
            +
              cursor: pointer;
         
     | 
| 484 | 
         
            +
              transition: .5s;
         
     | 
| 485 | 
         
            +
            }}
         
     | 
| 486 | 
         
            +
            .gallery > a:hover {{
         
     | 
| 487 | 
         
            +
              width: calc(var(--s)/2);
         
     | 
| 488 | 
         
            +
            }}
         
     | 
| 489 | 
         
            +
            .gallery > a:first-child {{
         
     | 
| 490 | 
         
            +
              place-self: start;
         
     | 
| 491 | 
         
            +
              clip-path: polygon(calc(2*var(--z)) 0,100% 0,100% 100%,0 100%);
         
     | 
| 492 | 
         
            +
              --mask: 
         
     | 
| 493 | 
         
            +
                conic-gradient(from -135deg at right,#0000,#000 1deg 89deg,#0000 90deg) 
         
     | 
| 494 | 
         
            +
                  50%/100% calc(2*var(--z)) repeat-y;
         
     | 
| 495 | 
         
            +
            }}
         
     | 
| 496 | 
         
            +
            .gallery > a:last-child {{
         
     | 
| 497 | 
         
            +
              place-self: end;
         
     | 
| 498 | 
         
            +
              clip-path: polygon(0 0,100% 0,calc(100% - 2*var(--z)) 100%,0 100%);
         
     | 
| 499 | 
         
            +
              --mask: 
         
     | 
| 500 | 
         
            +
                conic-gradient(from   45deg at left ,#0000,#000 1deg 89deg,#0000 90deg) 
         
     | 
| 501 | 
         
            +
                  50% calc(50% - var(--z))/100% calc(2*var(--z)) repeat-y;
         
     | 
| 502 | 
         
            +
            }}
         
     | 
| 503 | 
         
            +
            """
         
     | 
| 504 | 
         
            +
             
     | 
| 505 | 
         
            +
            for i in range(opt.max_palettes + 1):
         
     | 
| 506 | 
         
            +
                css = css + f"""
         
     | 
| 507 | 
         
            +
            .secondary#semantic-palette-{i} {{
         
     | 
| 508 | 
         
            +
                background-image: linear-gradient(to right, #374151 0%, #374151 71%, {opt.colors[i]} 100%);
         
     | 
| 509 | 
         
            +
                color: white;
         
     | 
| 510 | 
         
            +
            }}
         
     | 
| 511 | 
         
            +
             
     | 
| 512 | 
         
            +
            .primary#semantic-palette-{i} {{
         
     | 
| 513 | 
         
            +
                background-image: linear-gradient(to right, #4338ca 0%, #4338ca 71%, {opt.colors[i]} 100%);
         
     | 
| 514 | 
         
            +
                color: white;
         
     | 
| 515 | 
         
            +
            }}
         
     | 
| 516 | 
         
            +
            """
         
     | 
| 517 | 
         
            +
             
     | 
| 518 | 
         
            +
             
     | 
| 519 | 
         
            +
            with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
         
     | 
| 520 | 
         
            +
             
     | 
| 521 | 
         
            +
                iface = argparse.Namespace()
         
     | 
| 522 | 
         
            +
             
     | 
| 523 | 
         
            +
                def _define_state():
         
     | 
| 524 | 
         
            +
                    state = argparse.Namespace()
         
     | 
| 525 | 
         
            +
             
     | 
| 526 | 
         
            +
                    # Cursor.
         
     | 
| 527 | 
         
            +
                    state.current_palette = 0 # 0: Background; 1,2,3,...: Layers
         
     | 
| 528 | 
         
            +
                    state.model_id = list(model_dict.keys())[0]
         
     | 
| 529 | 
         
            +
                    state.style_name = '(None)'
         
     | 
| 530 | 
         
            +
                    state.quality_name = '(None)' # 'Standard v3.1'
         
     | 
| 531 | 
         
            +
             
     | 
| 532 | 
         
            +
                    # State variables (one-hot).
         
     | 
| 533 | 
         
            +
                    state.active_palettes = 1
         
     | 
| 534 | 
         
            +
             
     | 
| 535 | 
         
            +
                    # Front-end initialized to the default values.
         
     | 
| 536 | 
         
            +
                    prompt_props_ = prompt_props()
         
     | 
| 537 | 
         
            +
                    state.prompt_names = [
         
     | 
| 538 | 
         
            +
                        '🌄 Background',
         
     | 
| 539 | 
         
            +
                        '👧 Girl',
         
     | 
| 540 | 
         
            +
                        '👦 Boy',
         
     | 
| 541 | 
         
            +
                    ] + prompt_props_ + ['🎨 New Palette' for _ in range(opt.max_palettes - 5)]
         
     | 
| 542 | 
         
            +
                    state.prompts = [
         
     | 
| 543 | 
         
            +
                        prompt_background(),
         
     | 
| 544 | 
         
            +
                        prompt_girl(),
         
     | 
| 545 | 
         
            +
                        prompt_boy(),
         
     | 
| 546 | 
         
            +
                    ] + [prompts_props[k] for k in prompt_props_] + ['' for _ in range(opt.max_palettes - 5)]
         
     | 
| 547 | 
         
            +
                    state.neg_prompts = [
         
     | 
| 548 | 
         
            +
                        opt.default_negative_prompt
         
     | 
| 549 | 
         
            +
                        + (', humans, humans, humans' if i == 0 else '')
         
     | 
| 550 | 
         
            +
                        for i in range(opt.max_palettes + 1)
         
     | 
| 551 | 
         
            +
                    ]
         
     | 
| 552 | 
         
            +
                    state.prompt_strengths = [opt.default_prompt_strength for _ in range(opt.max_palettes)]
         
     | 
| 553 | 
         
            +
                    state.mask_strengths = [opt.default_mask_strength for _ in range(opt.max_palettes)]
         
     | 
| 554 | 
         
            +
                    state.mask_stds = [opt.default_mask_std for _ in range(opt.max_palettes)]
         
     | 
| 555 | 
         
            +
                    state.seed = opt.seed
         
     | 
| 556 | 
         
            +
                    return state
         
     | 
| 557 | 
         
            +
             
     | 
| 558 | 
         
            +
                state = gr.State(value=_define_state)
         
     | 
| 559 | 
         
            +
             
     | 
| 560 | 
         
            +
             
     | 
| 561 | 
         
            +
                ### Demo user interface
         
     | 
| 562 | 
         
            +
             
     | 
| 563 | 
         
            +
                gr.HTML(
         
     | 
| 564 | 
         
            +
                    """
         
     | 
| 565 | 
         
            +
            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
         
     | 
| 566 | 
         
            +
                <div>
         
     | 
| 567 | 
         
            +
                    <h1>🧠 Semantic Palette with <font class="rainbow rainbow_text_animated">Stable Diffusion 3</font> 🎨</h1>
         
     | 
| 568 | 
         
            +
                    <h5 style="margin: 0;">powered by</h5>
         
     | 
| 569 | 
         
            +
                    <h3>StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based Semantic Control</h3>
         
     | 
| 570 | 
         
            +
                    <h5 style="margin: 0;">If you ❤️ our project, please visit our Github and give us a 🌟!</h5>
         
     | 
| 571 | 
         
            +
                    </br>
         
     | 
| 572 | 
         
            +
                    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
         
     | 
| 573 | 
         
            +
                        <a href='https://jaerinlee.com/research/StreamMultiDiffusion'>
         
     | 
| 574 | 
         
            +
                            <img src='https://img.shields.io/badge/Project-Page-green' alt='Project Page'>
         
     | 
| 575 | 
         
            +
                        </a>
         
     | 
| 576 | 
         
            +
                         
         
     | 
| 577 | 
         
            +
                        <a href='https://arxiv.org/abs/2403.09055'>
         
     | 
| 578 | 
         
            +
                            <img src="https://img.shields.io/badge/arXiv-2403.09055-red">
         
     | 
| 579 | 
         
            +
                        </a>
         
     | 
| 580 | 
         
            +
                         
         
     | 
| 581 | 
         
            +
                        <a href='https://github.com/ironjr/StreamMultiDiffusion'>
         
     | 
| 582 | 
         
            +
                            <img src='https://img.shields.io/github/stars/ironjr/StreamMultiDiffusion?label=Github&color=blue'>
         
     | 
| 583 | 
         
            +
                        </a>
         
     | 
| 584 | 
         
            +
                         
         
     | 
| 585 | 
         
            +
                        <a href='https://twitter.com/_ironjr_'>
         
     | 
| 586 | 
         
            +
                            <img src='https://img.shields.io/twitter/url?label=_ironjr_&url=https%3A%2F%2Ftwitter.com%2F_ironjr_'>
         
     | 
| 587 | 
         
            +
                        </a>
         
     | 
| 588 | 
         
            +
                         
         
     | 
| 589 | 
         
            +
                        <a href='https://github.com/ironjr/StreamMultiDiffusion/blob/main/LICENSE'>
         
     | 
| 590 | 
         
            +
                            <img src='https://img.shields.io/badge/license-MIT-lightgrey'>
         
     | 
| 591 | 
         
            +
                        </a>
         
     | 
| 592 | 
         
            +
                         
         
     | 
| 593 | 
         
            +
                        <a href='https://huggingface.co/spaces/ironjr/StreamMultiDiffusion'>
         
     | 
| 594 | 
         
            +
                            <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Demo-StreamMultiDiffusion-yellow'>
         
     | 
| 595 | 
         
            +
                        </a>
         
     | 
| 596 | 
         
            +
                         
         
     | 
| 597 | 
         
            +
                        <a href='https://huggingface.co/spaces/ironjr/SemanticPalette'>
         
     | 
| 598 | 
         
            +
                            <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Demo-SD1.5-yellow'>
         
     | 
| 599 | 
         
            +
                        </a>
         
     | 
| 600 | 
         
            +
                         
         
     | 
| 601 | 
         
            +
                        <a href='https://huggingface.co/spaces/ironjr/SemanticPaletteXL'>
         
     | 
| 602 | 
         
            +
                            <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Demo-SDXL-yellow'>
         
     | 
| 603 | 
         
            +
                        </a>
         
     | 
| 604 | 
         
            +
                         
         
     | 
| 605 | 
         
            +
                        <a href='https://huggingface.co/spaces/ironjr/SemanticPalette3'>
         
     | 
| 606 | 
         
            +
                            <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Demo-SD3-yellow'>
         
     | 
| 607 | 
         
            +
                        </a>
         
     | 
| 608 | 
         
            +
                    </div>
         
     | 
| 609 | 
         
            +
                </div>
         
     | 
| 610 | 
         
            +
            </div>
         
     | 
| 611 | 
         
            +
            <div>
         
     | 
| 612 | 
         
            +
                </br>
         
     | 
| 613 | 
         
            +
            </div>
         
     | 
| 614 | 
         
            +
                    """
         
     | 
| 615 | 
         
            +
                )
         
     | 
| 616 | 
         
            +
             
     | 
| 617 | 
         
            +
                with gr.Row():
         
     | 
| 618 | 
         
            +
             
     | 
| 619 | 
         
            +
                    iface.image_slot = gr.Image(
         
     | 
| 620 | 
         
            +
                        interactive=False,
         
     | 
| 621 | 
         
            +
                        show_label=False,
         
     | 
| 622 | 
         
            +
                        show_download_button=True,
         
     | 
| 623 | 
         
            +
                        type='pil',
         
     | 
| 624 | 
         
            +
                        label='Generated Result',
         
     | 
| 625 | 
         
            +
                        elem_id='output-screen',
         
     | 
| 626 | 
         
            +
                        value=lambda: random.choice(example_images),
         
     | 
| 627 | 
         
            +
                    )
         
     | 
| 628 | 
         
            +
             
     | 
| 629 | 
         
            +
                with gr.Row():
         
     | 
| 630 | 
         
            +
             
     | 
| 631 | 
         
            +
                    with gr.Column(scale=1):
         
     | 
| 632 | 
         
            +
             
     | 
| 633 | 
         
            +
                        with gr.Group(elem_id='semantic-palette'):
         
     | 
| 634 | 
         
            +
             
     | 
| 635 | 
         
            +
                            gr.HTML(
         
     | 
| 636 | 
         
            +
                                """
         
     | 
| 637 | 
         
            +
            <div style="justify-content: center; align-items: center;">
         
     | 
| 638 | 
         
            +
                <br/>
         
     | 
| 639 | 
         
            +
                <h3 style="margin: 0; text-align: center;"><b>🧠 Semantic Palette 🎨</b></h3>
         
     | 
| 640 | 
         
            +
                <br/>
         
     | 
| 641 | 
         
            +
            </div>
         
     | 
| 642 | 
         
            +
                                """
         
     | 
| 643 | 
         
            +
                            )
         
     | 
| 644 | 
         
            +
             
     | 
| 645 | 
         
            +
                            iface.btn_semantics = [gr.Button(
         
     | 
| 646 | 
         
            +
                                value=state.value.prompt_names[0],
         
     | 
| 647 | 
         
            +
                                variant='primary',
         
     | 
| 648 | 
         
            +
                                elem_id='semantic-palette-0',
         
     | 
| 649 | 
         
            +
                            )]
         
     | 
| 650 | 
         
            +
                            for i in range(opt.max_palettes):
         
     | 
| 651 | 
         
            +
                                iface.btn_semantics.append(gr.Button(
         
     | 
| 652 | 
         
            +
                                    value=state.value.prompt_names[i + 1],
         
     | 
| 653 | 
         
            +
                                    variant='secondary',
         
     | 
| 654 | 
         
            +
                                    visible=(i < state.value.active_palettes),
         
     | 
| 655 | 
         
            +
                                    elem_id=f'semantic-palette-{i + 1}'
         
     | 
| 656 | 
         
            +
                                ))
         
     | 
| 657 | 
         
            +
             
     | 
| 658 | 
         
            +
                            iface.btn_add_palette = gr.Button(
         
     | 
| 659 | 
         
            +
                                value='Create New Semantic Brush',
         
     | 
| 660 | 
         
            +
                                variant='primary',
         
     | 
| 661 | 
         
            +
                            )
         
     | 
| 662 | 
         
            +
             
     | 
| 663 | 
         
            +
                        with gr.Accordion(label='Import/Export Semantic Palette', open=False):
         
     | 
| 664 | 
         
            +
                            iface.tbox_state_import = gr.Textbox(label='Put Palette JSON Here To Import')
         
     | 
| 665 | 
         
            +
                            iface.json_state_export = gr.JSON(label='Exported Palette')
         
     | 
| 666 | 
         
            +
                            iface.btn_export_state = gr.Button("Export Palette ➡️ JSON", variant='primary')
         
     | 
| 667 | 
         
            +
                            iface.btn_import_state = gr.Button("Import JSON ➡️ Palette", variant='secondary')
         
     | 
| 668 | 
         
            +
             
     | 
| 669 | 
         
            +
                        gr.HTML(
         
     | 
| 670 | 
         
            +
                            """
         
     | 
| 671 | 
         
            +
            <div>
         
     | 
| 672 | 
         
            +
            </br>
         
     | 
| 673 | 
         
            +
            </div>
         
     | 
| 674 | 
         
            +
            <div style="justify-content: center; align-items: center;">
         
     | 
| 675 | 
         
            +
            <h3 style="margin: 0; text-align: center;"><b>❓Usage❓</b></h3>
         
     | 
| 676 | 
         
            +
            </br>
         
     | 
| 677 | 
         
            +
            <div style="justify-content: center; align-items: left; text-align: left;">
         
     | 
| 678 | 
         
            +
                <p>1-1. Type in the background prompt. Background is not required if you paint the whole drawpad.</p>
         
     | 
| 679 | 
         
            +
                <p>1-2. (Optional: <em><b>Inpainting mode</b></em>) Uploading a background image will make the app into inpainting mode. Removing the image returns to the creation mode. In the inpainting mode, increasing the <em>Mask Blur STD</em> > 8 for every colored palette is recommended for smooth boundaries.</p>
         
     | 
| 680 | 
         
            +
                <p>2. Select a semantic brush by clicking onto one in the <b>Semantic Palette</b> above. Edit prompt for the semantic brush.</p>
         
     | 
| 681 | 
         
            +
                <p>2-1. If you are willing to draw more diverse images, try <b>Create New Semantic Brush</b>.</p>
         
     | 
| 682 | 
         
            +
                <p>3. Start drawing in the <b>Semantic Drawpad</b> tab. The brush color is directly linked to the semantic brushes.</p>
         
     | 
| 683 | 
         
            +
                <p>4. Click [<b>GENERATE!</b>] button to create your (large-scale) artwork!</p>
         
     | 
| 684 | 
         
            +
            </div>
         
     | 
| 685 | 
         
            +
            </div>
         
     | 
| 686 | 
         
            +
                            """
         
     | 
| 687 | 
         
            +
                        )
         
     | 
| 688 | 
         
            +
             
     | 
| 689 | 
         
            +
                        gr.HTML(
         
     | 
| 690 | 
         
            +
                            """
         
     | 
| 691 | 
         
            +
            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
         
     | 
| 692 | 
         
            +
            <h5 style="margin: 0;"><b>... or run in your own 🤗 space!</b></h5>
         
     | 
| 693 | 
         
            +
            </div>
         
     | 
| 694 | 
         
            +
                            """
         
     | 
| 695 | 
         
            +
                        )
         
     | 
| 696 | 
         
            +
             
     | 
| 697 | 
         
            +
                        gr.DuplicateButton()
         
     | 
| 698 | 
         
            +
             
     | 
| 699 | 
         
            +
                    with gr.Column(scale=4):
         
     | 
| 700 | 
         
            +
             
     | 
| 701 | 
         
            +
                        with gr.Row():
         
     | 
| 702 | 
         
            +
             
     | 
| 703 | 
         
            +
                            with gr.Column(scale=3):
         
     | 
| 704 | 
         
            +
             
     | 
| 705 | 
         
            +
                                iface.ctrl_semantic = gr.ImageEditor(
         
     | 
| 706 | 
         
            +
                                    image_mode='RGBA',
         
     | 
| 707 | 
         
            +
                                    sources=['upload', 'clipboard', 'webcam'],
         
     | 
| 708 | 
         
            +
                                    transforms=['crop'],
         
     | 
| 709 | 
         
            +
                                    crop_size=(opt.width, opt.height),
         
     | 
| 710 | 
         
            +
                                    brush=gr.Brush(
         
     | 
| 711 | 
         
            +
                                        colors=opt.colors[1:],
         
     | 
| 712 | 
         
            +
                                        color_mode="fixed",
         
     | 
| 713 | 
         
            +
                                    ),
         
     | 
| 714 | 
         
            +
                                    layers=False,
         
     | 
| 715 | 
         
            +
                                    canvas_size=(opt.width, opt.height),
         
     | 
| 716 | 
         
            +
                                    type='pil',
         
     | 
| 717 | 
         
            +
                                    label='Semantic Drawpad',
         
     | 
| 718 | 
         
            +
                                    elem_id='drawpad',
         
     | 
| 719 | 
         
            +
                                )
         
     | 
| 720 | 
         
            +
             
     | 
| 721 | 
         
            +
                            with gr.Column(scale=1):
         
     | 
| 722 | 
         
            +
             
     | 
| 723 | 
         
            +
                                iface.btn_generate = gr.Button(
         
     | 
| 724 | 
         
            +
                                    value='Generate!',
         
     | 
| 725 | 
         
            +
                                    variant='primary',
         
     | 
| 726 | 
         
            +
                                    # scale=1,
         
     | 
| 727 | 
         
            +
                                    elem_id='run-button'
         
     | 
| 728 | 
         
            +
                                )
         
     | 
| 729 | 
         
            +
             
     | 
| 730 | 
         
            +
             
     | 
| 731 | 
         
            +
                                gr.HTML(
         
     | 
| 732 | 
         
            +
                                    """
         
     | 
| 733 | 
         
            +
            <h3 style="text-align: center;">Try other demos in HF 🤗 Space!</h3>
         
     | 
| 734 | 
         
            +
            <div style="display: flex; justify-content: center; text-align: center;">
         
     | 
| 735 | 
         
            +
                <div><b style="color: #2692F3">Semantic Palette<br>Animagine XL 3.1</b></div>
         
     | 
| 736 | 
         
            +
                <div style="margin-left: 10px; margin-right: 10px; margin-top: 8px">or</div>
         
     | 
| 737 | 
         
            +
                <div><b style="color: #F89E12">Official Demo of<br>StreamMultiDiffusion</b></div>
         
     | 
| 738 | 
         
            +
            </div>
         
     | 
| 739 | 
         
            +
            <div style="display: inline-block; margin-top: 10px">
         
     | 
| 740 | 
         
            +
                <div class="gallery">
         
     | 
| 741 | 
         
            +
                    <a href="https://huggingface.co/spaces/ironjr/SemanticPaletteXL" target="_blank">
         
     | 
| 742 | 
         
            +
                        <img alt="AnimagineXL3.1 Demo" src="https://github.com/ironjr/StreamMultiDiffusion/blob/main/demo/semantic_palette_sd3/examples/icons/sdxl.webp?raw=true">
         
     | 
| 743 | 
         
            +
                    </a>
         
     | 
| 744 | 
         
            +
                    <a href="https://huggingface.co/spaces/ironjr/StreamMultiDiffusion" target="_blank">
         
     | 
| 745 | 
         
            +
                        <img alt="StreamMultiDiffusion Demo" src="https://github.com/ironjr/StreamMultiDiffusion/blob/main/demo/semantic_palette_sd3/examples/icons/smd.gif?raw=true">
         
     | 
| 746 | 
         
            +
                    </a>
         
     | 
| 747 | 
         
            +
                </div>
         
     | 
| 748 | 
         
            +
            </div>
         
     | 
| 749 | 
         
            +
                                    """
         
     | 
| 750 | 
         
            +
                                )
         
     | 
| 751 | 
         
            +
             
     | 
| 752 | 
         
            +
                                # iface.model_select = gr.Radio(
         
     | 
| 753 | 
         
            +
                                #     list(model_dict.keys()),
         
     | 
| 754 | 
         
            +
                                #     label='Stable Diffusion Checkpoint',
         
     | 
| 755 | 
         
            +
                                #     info='Choose your favorite style.',
         
     | 
| 756 | 
         
            +
                                #     value=state.value.model_id,
         
     | 
| 757 | 
         
            +
                                # )
         
     | 
| 758 | 
         
            +
             
     | 
| 759 | 
         
            +
                                # with gr.Accordion(label='Prompt Engineering', open=True):
         
     | 
| 760 | 
         
            +
                                #     iface.quality_select = gr.Dropdown(
         
     | 
| 761 | 
         
            +
                                #         label='Quality Presets',
         
     | 
| 762 | 
         
            +
                                #         interactive=True,
         
     | 
| 763 | 
         
            +
                                #         choices=list(_quality_dict.keys()),
         
     | 
| 764 | 
         
            +
                                #         value='Standard v3.1',
         
     | 
| 765 | 
         
            +
                                #     )
         
     | 
| 766 | 
         
            +
                                #     iface.style_select = gr.Radio(
         
     | 
| 767 | 
         
            +
                                #         label='Style Preset',
         
     | 
| 768 | 
         
            +
                                #         container=True,
         
     | 
| 769 | 
         
            +
                                #         interactive=True,
         
     | 
| 770 | 
         
            +
                                #         choices=list(_style_dict.keys()),
         
     | 
| 771 | 
         
            +
                                #         value='(None)',
         
     | 
| 772 | 
         
            +
                                #     )
         
     | 
| 773 | 
         
            +
             
     | 
| 774 | 
         
            +
                        with gr.Group(elem_id='control-panel'):
         
     | 
| 775 | 
         
            +
             
     | 
| 776 | 
         
            +
                            with gr.Row():
         
     | 
| 777 | 
         
            +
                                iface.tbox_prompt = gr.Textbox(
         
     | 
| 778 | 
         
            +
                                    label='Edit Prompt for Background',
         
     | 
| 779 | 
         
            +
                                    info='What do you want to draw?',
         
     | 
| 780 | 
         
            +
                                    value=state.value.prompts[0],
         
     | 
| 781 | 
         
            +
                                    placeholder=lambda: random.choice(prompt_suggestions),
         
     | 
| 782 | 
         
            +
                                    scale=2,
         
     | 
| 783 | 
         
            +
                                )
         
     | 
| 784 | 
         
            +
             
     | 
| 785 | 
         
            +
                                iface.tbox_name = gr.Textbox(
         
     | 
| 786 | 
         
            +
                                    label='Edit Brush Name',
         
     | 
| 787 | 
         
            +
                                    info='Just for your convenience.',
         
     | 
| 788 | 
         
            +
                                    value=state.value.prompt_names[0],
         
     | 
| 789 | 
         
            +
                                    placeholder='🌄 Background',
         
     | 
| 790 | 
         
            +
                                    scale=1,
         
     | 
| 791 | 
         
            +
                                )
         
     | 
| 792 | 
         
            +
             
     | 
| 793 | 
         
            +
                            with gr.Row():
         
     | 
| 794 | 
         
            +
                                iface.tbox_neg_prompt = gr.Textbox(
         
     | 
| 795 | 
         
            +
                                    label='Edit Negative Prompt for Background',
         
     | 
| 796 | 
         
            +
                                    info='Add unwanted objects for this semantic brush.',
         
     | 
| 797 | 
         
            +
                                    value=opt.default_negative_prompt,
         
     | 
| 798 | 
         
            +
                                    scale=2,
         
     | 
| 799 | 
         
            +
                                )
         
     | 
| 800 | 
         
            +
             
     | 
| 801 | 
         
            +
                                iface.slider_strength = gr.Slider(
         
     | 
| 802 | 
         
            +
                                    label='Prompt Strength',
         
     | 
| 803 | 
         
            +
                                    info='Blends fg & bg in the prompt level, >0.8 Preferred.',
         
     | 
| 804 | 
         
            +
                                    minimum=0.5,
         
     | 
| 805 | 
         
            +
                                    maximum=1.0,
         
     | 
| 806 | 
         
            +
                                    value=opt.default_prompt_strength,
         
     | 
| 807 | 
         
            +
                                    scale=1,
         
     | 
| 808 | 
         
            +
                                )
         
     | 
| 809 | 
         
            +
             
     | 
| 810 | 
         
            +
                            with gr.Row():
         
     | 
| 811 | 
         
            +
                                iface.slider_alpha = gr.Slider(
         
     | 
| 812 | 
         
            +
                                    label='Mask Alpha',
         
     | 
| 813 | 
         
            +
                                    info='Factor multiplied to the mask before quantization. Extremely sensitive, >0.98 Preferred.',
         
     | 
| 814 | 
         
            +
                                    minimum=0.5,
         
     | 
| 815 | 
         
            +
                                    maximum=1.0,
         
     | 
| 816 | 
         
            +
                                    value=opt.default_mask_strength,
         
     | 
| 817 | 
         
            +
                                )
         
     | 
| 818 | 
         
            +
             
     | 
| 819 | 
         
            +
                                iface.slider_std = gr.Slider(
         
     | 
| 820 | 
         
            +
                                    label='Mask Blur STD',
         
     | 
| 821 | 
         
            +
                                    info='Blends fg & bg in the latent level, 0 for generation, 8-32 for inpainting.',
         
     | 
| 822 | 
         
            +
                                    minimum=0.0001,
         
     | 
| 823 | 
         
            +
                                    maximum=100.0,
         
     | 
| 824 | 
         
            +
                                    value=opt.default_mask_std,
         
     | 
| 825 | 
         
            +
                                )
         
     | 
| 826 | 
         
            +
             
     | 
| 827 | 
         
            +
                                iface.slider_seed = gr.Slider(
         
     | 
| 828 | 
         
            +
                                    label='Seed',
         
     | 
| 829 | 
         
            +
                                    info='The global seed.',
         
     | 
| 830 | 
         
            +
                                    minimum=-1,
         
     | 
| 831 | 
         
            +
                                    maximum=2147483647,
         
     | 
| 832 | 
         
            +
                                    step=1,
         
     | 
| 833 | 
         
            +
                                    value=opt.seed,
         
     | 
| 834 | 
         
            +
                                )
         
     | 
| 835 | 
         
            +
             
     | 
| 836 | 
         
            +
                ### Attach event handlers
         
     | 
| 837 | 
         
            +
             
     | 
| 838 | 
         
            +
                for idx, btn in enumerate(iface.btn_semantics):
         
     | 
| 839 | 
         
            +
                    btn.click(
         
     | 
| 840 | 
         
            +
                        fn=partial(select_palette, idx=idx),
         
     | 
| 841 | 
         
            +
                        inputs=[state, btn],
         
     | 
| 842 | 
         
            +
                        outputs=[state] + iface.btn_semantics + [
         
     | 
| 843 | 
         
            +
                            iface.tbox_name,
         
     | 
| 844 | 
         
            +
                            iface.tbox_prompt,
         
     | 
| 845 | 
         
            +
                            iface.tbox_neg_prompt,
         
     | 
| 846 | 
         
            +
                            iface.slider_alpha,
         
     | 
| 847 | 
         
            +
                            iface.slider_strength,
         
     | 
| 848 | 
         
            +
                            iface.slider_std,
         
     | 
| 849 | 
         
            +
                        ],
         
     | 
| 850 | 
         
            +
                        api_name=f'select_palette_{idx}',
         
     | 
| 851 | 
         
            +
                    )
         
     | 
| 852 | 
         
            +
             
     | 
| 853 | 
         
            +
                iface.btn_add_palette.click(
         
     | 
| 854 | 
         
            +
                    fn=add_palette,
         
     | 
| 855 | 
         
            +
                    inputs=state,
         
     | 
| 856 | 
         
            +
                    outputs=[state, iface.btn_add_palette] + iface.btn_semantics[1:],
         
     | 
| 857 | 
         
            +
                    api_name='create_new',
         
     | 
| 858 | 
         
            +
                )
         
     | 
| 859 | 
         
            +
             
     | 
| 860 | 
         
            +
                iface.btn_generate.click(
         
     | 
| 861 | 
         
            +
                    fn=run,
         
     | 
| 862 | 
         
            +
                    inputs=[state, iface.ctrl_semantic],
         
     | 
| 863 | 
         
            +
                    outputs=iface.image_slot,
         
     | 
| 864 | 
         
            +
                    api_name='run',
         
     | 
| 865 | 
         
            +
                )
         
     | 
| 866 | 
         
            +
             
     | 
| 867 | 
         
            +
                iface.slider_alpha.input(
         
     | 
| 868 | 
         
            +
                    fn=change_mask_strength,
         
     | 
| 869 | 
         
            +
                    inputs=[state, iface.slider_alpha],
         
     | 
| 870 | 
         
            +
                    outputs=state,
         
     | 
| 871 | 
         
            +
                    api_name='change_alpha',
         
     | 
| 872 | 
         
            +
                )
         
     | 
| 873 | 
         
            +
                iface.slider_std.input(
         
     | 
| 874 | 
         
            +
                    fn=change_std,
         
     | 
| 875 | 
         
            +
                    inputs=[state, iface.slider_std],
         
     | 
| 876 | 
         
            +
                    outputs=state,
         
     | 
| 877 | 
         
            +
                    api_name='change_std',
         
     | 
| 878 | 
         
            +
                )
         
     | 
| 879 | 
         
            +
                iface.slider_strength.input(
         
     | 
| 880 | 
         
            +
                    fn=change_prompt_strength,
         
     | 
| 881 | 
         
            +
                    inputs=[state, iface.slider_strength],
         
     | 
| 882 | 
         
            +
                    outputs=state,
         
     | 
| 883 | 
         
            +
                    api_name='change_strength',
         
     | 
| 884 | 
         
            +
                )
         
     | 
| 885 | 
         
            +
                iface.slider_seed.input(
         
     | 
| 886 | 
         
            +
                    fn=reset_seed,
         
     | 
| 887 | 
         
            +
                    inputs=[state, iface.slider_seed],
         
     | 
| 888 | 
         
            +
                    outputs=state,
         
     | 
| 889 | 
         
            +
                    api_name='reset_seed',
         
     | 
| 890 | 
         
            +
                )
         
     | 
| 891 | 
         
            +
             
     | 
| 892 | 
         
            +
                iface.tbox_name.input(
         
     | 
| 893 | 
         
            +
                    fn=rename_prompt,
         
     | 
| 894 | 
         
            +
                    inputs=[state, iface.tbox_name],
         
     | 
| 895 | 
         
            +
                    outputs=[state] + iface.btn_semantics,
         
     | 
| 896 | 
         
            +
                    api_name='prompt_rename',
         
     | 
| 897 | 
         
            +
                )
         
     | 
| 898 | 
         
            +
                iface.tbox_prompt.input(
         
     | 
| 899 | 
         
            +
                    fn=change_prompt,
         
     | 
| 900 | 
         
            +
                    inputs=[state, iface.tbox_prompt],
         
     | 
| 901 | 
         
            +
                    outputs=state,
         
     | 
| 902 | 
         
            +
                    api_name='prompt_edit',
         
     | 
| 903 | 
         
            +
                )
         
     | 
| 904 | 
         
            +
                iface.tbox_neg_prompt.input(
         
     | 
| 905 | 
         
            +
                    fn=change_neg_prompt,
         
     | 
| 906 | 
         
            +
                    inputs=[state, iface.tbox_neg_prompt],
         
     | 
| 907 | 
         
            +
                    outputs=state,
         
     | 
| 908 | 
         
            +
                    api_name='neg_prompt_edit',
         
     | 
| 909 | 
         
            +
                )
         
     | 
| 910 | 
         
            +
             
     | 
| 911 | 
         
            +
                # iface.model_select.change(
         
     | 
| 912 | 
         
            +
                #     fn=select_model,
         
     | 
| 913 | 
         
            +
                #     inputs=[state, iface.model_select],
         
     | 
| 914 | 
         
            +
                #     outputs=state,
         
     | 
| 915 | 
         
            +
                #     api_name='model_select',
         
     | 
| 916 | 
         
            +
                # )
         
     | 
| 917 | 
         
            +
                # iface.style_select.change(
         
     | 
| 918 | 
         
            +
                #     fn=select_style,
         
     | 
| 919 | 
         
            +
                #     inputs=[state, iface.style_select],
         
     | 
| 920 | 
         
            +
                #     outputs=state,
         
     | 
| 921 | 
         
            +
                #     api_name='style_select',
         
     | 
| 922 | 
         
            +
                # )
         
     | 
| 923 | 
         
            +
                # iface.quality_select.change(
         
     | 
| 924 | 
         
            +
                #     fn=select_quality,
         
     | 
| 925 | 
         
            +
                #     inputs=[state, iface.quality_select],
         
     | 
| 926 | 
         
            +
                #     outputs=state,
         
     | 
| 927 | 
         
            +
                #     api_name='quality_select',
         
     | 
| 928 | 
         
            +
                # )
         
     | 
| 929 | 
         
            +
             
     | 
| 930 | 
         
            +
                iface.btn_export_state.click(lambda x: vars(x), state, iface.json_state_export)
         
     | 
| 931 | 
         
            +
                iface.btn_import_state.click(import_state, [state, iface.tbox_state_import], [
         
     | 
| 932 | 
         
            +
                    state,
         
     | 
| 933 | 
         
            +
                    *iface.btn_semantics,
         
     | 
| 934 | 
         
            +
                    # iface.model_select,
         
     | 
| 935 | 
         
            +
                    # iface.style_select,
         
     | 
| 936 | 
         
            +
                    # iface.quality_select,
         
     | 
| 937 | 
         
            +
                    iface.tbox_prompt,
         
     | 
| 938 | 
         
            +
                    iface.tbox_name,
         
     | 
| 939 | 
         
            +
                    iface.tbox_neg_prompt,
         
     | 
| 940 | 
         
            +
                    iface.slider_strength,
         
     | 
| 941 | 
         
            +
                    iface.slider_alpha,
         
     | 
| 942 | 
         
            +
                    iface.slider_std,
         
     | 
| 943 | 
         
            +
                    iface.slider_seed,
         
     | 
| 944 | 
         
            +
                ])
         
     | 
| 945 | 
         
            +
             
     | 
| 946 | 
         
            +
             
     | 
| 947 | 
         
            +
            if __name__ == '__main__':
         
     | 
| 948 | 
         
            +
                demo.launch(server_port=opt.port)
         
     | 
    	
examples/prompt_background.txt ADDED
@@ -0,0 +1,8 @@
Maximalism, best quality, high quality, no humans, background, clear sky, black sky, starry universe, planets
Maximalism, best quality, high quality, no humans, background, clear sky, blue sky
Maximalism, best quality, high quality, no humans, background, universe, void, black, galaxy, galaxy, stars, stars, stars
Maximalism, best quality, high quality, no humans, background, galaxy
Maximalism, best quality, high quality, no humans, background, sky, daylight
Maximalism, best quality, high quality, no humans, background, skyscrapers, rooftop, city of light, helicopters, bright night, sky
Maximalism, best quality, high quality, flowers, flowers, flowers, flower garden, no humans, background
Maximalism, best quality, high quality, flowers, flowers, flowers, flower garden
examples/prompt_background_advanced.txt ADDED
The diff for this file is too large to render. See the raw diff.
examples/prompt_boy.txt ADDED
@@ -0,0 +1,15 @@
1boy, looking at viewer, brown hair, blue shirt
1boy, looking at viewer, brown hair, red shirt
1boy, looking at viewer, brown hair, purple shirt
1boy, looking at viewer, brown hair, orange shirt
1boy, looking at viewer, brown hair, yellow shirt
1boy, looking at viewer, brown hair, green shirt
1boy, looking back, side shaved hair, cyberpunk cloths, robotic suit, large body
1boy, looking back, short hair, renaissance cloths, noble boy
1boy, looking back, long hair, ponytail, leather jacket, heavy metal boy
1boy, looking at viewer, a king, kingly grace, majestic cloths, crown
1boy, looking at viewer, an astronaut, brown hair, faint smile, engineer
1boy, looking at viewer, a medieval knight, helmet, swordman, plate armour
1boy, looking at viewer, black haired, old eastern cloth
1boy, looking back, messy hair, suit, short beard, noir
1boy, looking at viewer, cute face, light smile, starry eyes, jeans
examples/prompt_girl.txt ADDED
@@ -0,0 +1,16 @@
1girl, looking at viewer, pretty face, light smile, haughty smile, proud, long wavy hair, charcoal dark eyes, chinese cloths
1girl, looking at viewer, princess, pretty face, light smile, haughty smile, proud, long wavy hair, charcoal dark eyes, majestic gown
1girl, looking at viewer, astronaut girl, long red hair, space suit, black starry eyes, happy face, pretty face
1girl, looking at viewer, fantasy adventurer, backpack
1girl, looking at viewer, astronaut girl, spacesuit, eva, happy face
1girl, looking at viewer, soldier, rusty cloths, backpack, pretty face, sad smile, tears
1girl, looking at viewer, majestic cloths, long hair, glittering eye, pretty face
1girl, looking at viewer, from behind, majestic cloths, long hair, glittering eye
1girl, looking at viewer, evil smile, very short hair, suit, evil genius
1girl, looking at viewer, elven queen, green hair, haughty face, eyes wide open, crazy smile, brown jacket, leaves
1girl, looking at viewer, purple hair, happy face, black leather jacket
1girl, looking at viewer, pink hair, happy face, blue jeans, black leather jacket
1girl, looking at viewer, knight, medium length hair, red hair, plate armour, blue eyes, sad, pretty face, determined face
1girl, looking at viewer, pretty face, light smile, orange hair, casual cloths
1girl, looking at viewer, pretty face, large smile, open mouth, uniform, mcdonald employee, short wavy hair
1girl, looking at viewer, brown hair, ponytail, happy face, bright smile, blue jeans and white shirt
examples/prompt_props.txt ADDED
@@ -0,0 +1,43 @@
+🏯 Palace, Gyeongbokgung palace
+🌳 Garden, Chinese garden
+🏛️ Rome, Ancient city of Rome
+🧱 Wall, Castle wall
+🔴 Mars, Martian desert, Red rocky desert
+🌻 Grassland, Grasslands
+🏡 Village, A fantasy village
+🐉 Dragon, a flying chinese dragon
+🌏 Earth, Earth seen from ISS
+🚀 Space Station, the international space station
+🪻 Grassland, Rusty grassland with flowers
+🖼️ Tapestry, majestic tapestry, glittering effect, glowing in light, mural painting with mountain
+🏙️ City Ruin, city, ruins, ruins, ruins, deserted
+🏙️ Renaissance City, renaissance city, renaissance city, renaissance city
+🌷 Flowers, Flower garden
+🌼 Flowers, Flower garden, spring garden
+🌹 Flowers, Flowers flowers, flowers
+⛰️ Dolomites Mountains, Dolomites
+⛰️ Himalayas Mountains, Himalayas
+⛰️ Alps Mountains, Alps
+⛰️ Mountains, Mountains
+❄️⛰️ Mountains, Winter mountains
+🌷⛰️ Mountains, Spring mountains
+🌞⛰️ Mountains, Summer mountains
+🌵 Desert, A sandy desert, dunes
+🪨🌵 Desert, A rocky desert
+💦 Waterfall, A giant waterfall
+🌊 Ocean, Ocean
+⛱️ Seashore, Seashore
+🌅 Sea Horizon, Sea horizon
+🌊 Lake, Clear blue lake
+💻 Computer, A giant supercomputer
+🌳 Tree, A giant tree
+🌳 Forest, A forest
+🌳🌳 Forest, A dense forest
+🌲 Forest, Winter forest
+🌴 Forest, Summer forest, tropical forest
+👒 Hat, A hat
+🐶 Dog, Doggy body parts
+😻 Cat, A cat
+🦉 Owl, A small sitting owl
+🦅 Eagle, A small sitting eagle
+🚀 Rocket, A flying rocket
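Each line of `prompt_props.txt` packs two fields into one string: an emoji-prefixed display label for the semantic palette UI, then the actual prompt text after the first comma (the boy/girl files are plain one-line prompts). Below is a minimal parsing sketch under that assumption; the helper name `load_prop_examples` is illustrative and not part of this commit.

```py
from pathlib import Path

def load_prop_examples(path: str) -> list[tuple[str, str]]:
    """Parse 'emoji Label, prompt text' lines from the props example file."""
    entries = []
    for line in Path(path).read_text(encoding='utf-8').splitlines():
        line = line.strip()
        if not line:
            continue
        label, _, prompt = line.partition(', ')  # split on the first comma only
        entries.append((label, prompt or label))
    return entries

# load_prop_examples('examples/prompt_props.txt')[0]
# -> ('🏯 Palace', 'Gyeongbokgung palace')
```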
    	
model.py ADDED
@@ -0,0 +1,1095 @@
+# Copyright (c) 2024 Jaerin Lee
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import inspect
+from typing import Any, Callable, Dict, List, Literal, Tuple, Optional, Union
+from tqdm import tqdm
+from PIL import Image
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as T
+from einops import rearrange
+
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.transformers import SD3Transformer2DModel
+from diffusers.pipelines.stable_diffusion_3 import StableDiffusion3PipelineOutput
+from diffusers.schedulers import (
+    FlowMatchEulerDiscreteScheduler,
+    FlashFlowMatchEulerDiscreteScheduler,
+)
+from diffusers.utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers import (
+    DiffusionPipeline,
+    StableDiffusion3Pipeline,
+)
+
+from peft import PeftModel
+
+from util import load_model, gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusion3Pipeline
+
+        >>> pipe = StableDiffusion3Pipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> image = pipe(prompt).images[0]
+        >>> image.save("sd3.png")
+        ```
+"""
+
+
+class StableMultiDiffusion3Pipeline(nn.Module):
+    def __init__(
+        self,
+        device: torch.device,
+        dtype: torch.dtype = torch.float16,
+        hf_key: Optional[str] = None,
+        lora_key: Optional[str] = None,
+        load_from_local: bool = False, # Turn on if you have already downloaded LoRA & Hugging Face hub is down.
+        default_mask_std: float = 1.0, # 8.0
+        default_mask_strength: float = 1.0,
+        default_prompt_strength: float = 1.0, # 8.0
+        default_bootstrap_steps: int = 1,
+        default_boostrap_mix_steps: float = 1.0,
+        default_bootstrap_leak_sensitivity: float = 0.2,
+        default_preprocess_mask_cover_alpha: float = 0.3,
+        t_index_list: List[int] = [0, 4, 12, 25, 37], # [0, 5, 16, 18, 20, 37], # # [0, 12, 25, 37], # Magic number.
+        mask_type: Literal['discrete', 'semi-continuous', 'continuous'] = 'discrete',
+        has_i2t: bool = True,
+        lora_weight: float = 1.0,
+    ) -> None:
+        r"""Stabilized MultiDiffusion for fast sampling.
+
+        Accelerated region-based text-to-image synthesis with Latent Consistency
+        Model while preserving mask fidelity and quality.
+
+        Args:
+            device (torch.device): Specify CUDA device.
+            hf_key (Optional[str]): Custom StableDiffusion checkpoint for
+                stylized generation.
+            lora_key (Optional[str]): Custom Lightning LoRA for acceleration.
+            load_from_local (bool): Turn on if you have already downloaded LoRA
+                & Hugging Face hub is down.
+            default_mask_std (float): Preprocess mask with Gaussian blur with
+                specified standard deviation.
+            default_mask_strength (float): Preprocess mask by multiplying it
+                globally with the specified variable. Caution: extremely
+                sensitive. Recommended range: 0.98-1.
+            default_prompt_strength (float): Preprocess foreground prompts
+                globally by linearly interpolating its embedding with the
+                background prompt embedding with specified mix ratio. Useful
+                control handle for foreground blending. Recommended range:
+                0.5-1.
+            default_bootstrap_steps (int): Bootstrapping stage steps to
+                encourage region separation. Recommended range: 1-3.
+            default_boostrap_mix_steps (float): Bootstrapping background is a
+                linear interpolation between background latent and the white
+                image latent. This handle controls the mix ratio. Available
+                range: 0-(number of bootstrapping inference steps). For
+                example, 2.3 means that for the first two steps, white image
+                is used as a bootstrapping background and in the third step,
+                mixture of white (0.3) and registered background (0.7) is used
+                as a bootstrapping background.
+            default_bootstrap_leak_sensitivity (float): Postprocessing at each
+                inference step by masking away the remaining bootstrap
+                backgrounds. Recommended range: 0-1.
+            default_preprocess_mask_cover_alpha (float): Optional preprocessing
+                where each mask covered by other masks is reduced in its alpha
+                value by this specified factor.
+            t_index_list (List[int]): The default scheduling for the scheduler.
+            mask_type (Literal['discrete', 'semi-continuous', 'continuous']):
+                Defines the mask quantization modes. Details in the code of
+                `self.process_mask`. Basically, this (subtly) controls the
+                smoothness of foreground-background blending. More continuous
+                means more blending, but a smaller generated patch depending on
+                the mask standard deviation.
+            has_i2t (bool): Automatic background image to text prompt con-
+                version with the BLIP-2 model. May not be necessary for the non-
+                streaming application.
+            lora_weight (float): Adjusts weight of the LCM/Lightning LoRA.
+                Heavily affects the overall quality!
+        """
+        super().__init__()
+
+        self.device = device
+        self.dtype = dtype
+
+        self.default_mask_std = default_mask_std
+        self.default_mask_strength = default_mask_strength
+        self.default_prompt_strength = default_prompt_strength
+        self.default_t_list = t_index_list
+        self.default_bootstrap_steps = default_bootstrap_steps
+        self.default_boostrap_mix_steps = default_boostrap_mix_steps
+        self.default_bootstrap_leak_sensitivity = default_bootstrap_leak_sensitivity
+        self.default_preprocess_mask_cover_alpha = default_preprocess_mask_cover_alpha
+        self.mask_type = mask_type
+
+        # Create model.
+        print(f'[INFO] Loading Stable Diffusion...')
+        if hf_key is not None:
+            print(f'[INFO] Using Hugging Face custom model key: {hf_key}')
+        else:
+            hf_key = "stabilityai/stable-diffusion-3-medium-diffusers"
+
+        transformer = SD3Transformer2DModel.from_pretrained(
+            hf_key,
+            subfolder="transformer",
+            torch_dtype=torch.float16,
+        ).to(self.device)
+
+        transformer = PeftModel.from_pretrained(transformer, "jasperai/flash-sd3").to(self.device)
+
+        self.pipe = StableDiffusion3Pipeline.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers",
+            transformer=transformer,
+            torch_dtype=torch.float16,
+            text_encoder_3=None,
+            tokenizer_3=None
+        ).to(self.device)
+
+        # Create the BLIP-2 image-to-text captioning model.
+        if has_i2t:
+            self.i2t_processor = Blip2Processor.from_pretrained('Salesforce/blip2-opt-2.7b')
+            self.i2t_model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')
+
+        # Use the Flash-SD3 flow matching scheduler by default.
+        self.pipe.scheduler = FlashFlowMatchEulerDiscreteScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-3-medium-diffusers", subfolder="scheduler")
+        self.pipe = self.pipe.to(self.device)
+
+        self.scheduler = self.pipe.scheduler
+        self.default_num_inference_steps = 4
+        self.default_guidance_scale = 0.0
+
+        if t_index_list is None:
+            self.prepare_flashflowmatch_schedule(
+                list(range(self.default_num_inference_steps)),
+                self.default_num_inference_steps,
+            )
+        else:
+            self.prepare_flashflowmatch_schedule(t_index_list, 50)
+
+        self.vae = self.pipe.vae
+        self.tokenizer = self.pipe.tokenizer
+        self.tokenizer_2 = self.pipe.tokenizer_2
+        self.tokenizer_3 = self.pipe.tokenizer_3
+        self.text_encoder = self.pipe.text_encoder
+        self.text_encoder_2 = self.pipe.text_encoder_2
+        self.text_encoder_3 = self.pipe.text_encoder_3
+        self.transformer = self.pipe.transformer
+        self.vae_scale_factor = self.pipe.vae_scale_factor
+
+        # Prepare white background for bootstrapping.
+        self.get_white_background(1024, 1024)
+
+        print(f'[INFO] Model is loaded!')
+
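The constructor above patches the SD3 transformer with the `jasperai/flash-sd3` LoRA, rebuilds a `StableDiffusion3Pipeline` around it without the T5 text encoder, and swaps in the flash flow-matching scheduler with a 4-step default. A minimal instantiation sketch, assuming a CUDA device and using only arguments documented in the signature above, would look like this (variable names are illustrative):

```py
import torch
from model import StableMultiDiffusion3Pipeline

# Defaults mirror the signature: fp16, 1-step bootstrapping, and the
# five-index schedule tuned for that bootstrapping depth.
smd = StableMultiDiffusion3Pipeline(
    device=torch.device('cuda'),
    dtype=torch.float16,
    t_index_list=[0, 4, 12, 25, 37],  # indices over the 50-step reference schedule
    has_i2t=False,                    # skip BLIP-2 when prompts are always provided
)
```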
+    def prepare_flashflowmatch_schedule(
+        self,
+        t_index_list: Optional[List[int]] = None,
+        num_inference_steps: Optional[int] = None,
+    ) -> None:
+        r"""Set up a different inference schedule for the diffusion model.
+
+        You do not have to run this explicitly if you want to use the default
+        setting, but if you want other time schedules, run this function
+        between the module initialization and the main call.
+
+        Note:
+          - Recommended t_index_lists for LCMs:
+              - [0, 12, 25, 37]: Default schedule for 4 steps. Best for
+                  panorama. Not recommended if you want to use bootstrapping.
+                  Because the bootstrapping stage affects the initial structuring
+                  of the generated image, and in this four-step LCM it is done
+                  only at the first step, the structure may be distorted.
+              - [0, 4, 12, 25, 37]: Recommended if you would use 1-step boot-
+                  strapping. Default initialization in this implementation.
+              - [0, 5, 16, 18, 20, 37]: Recommended if you would use 2-step
+                  bootstrapping.
+          - Due to the characteristic of the SD1.5 LCM LoRA, setting
+            `num_inference_steps` larger than 20 may result in overly blurry
+            and unrealistic images. Beware!
+
+        Args:
+            t_index_list (Optional[List[int]]): The specified scheduling steps
+                relative to the maximum timestep `num_inference_steps`, which
+                is 50 by default. That means that
+                `t_index_list=[0, 12, 25, 37]` gives relative time indices based
+                on the full scale of 50. If None, reinitialize the module with
+                the default value.
+            num_inference_steps (Optional[int]): The maximum timestep of the
+                sampler. Defines the relative scale of the `t_index_list`. Rarely
+                used in practice. If None, reinitialize the module with the
+                default value.
+        """
+        if t_index_list is None:
+            t_index_list = self.default_t_list
+        if num_inference_steps is None:
+            num_inference_steps = self.default_num_inference_steps
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.timesteps = self.scheduler.timesteps[torch.tensor(t_index_list)].to(self.device)
+
+        # FlashFlowMatchEulerDiscreteScheduler
+        # https://github.com/initml/diffusers/blob/clement/feature/flash_sd3/src/diffusers/schedulers/scheduling_flash_flow_match_euler_discrete.py
+
+        self.sigmas = self.scheduler.sigmas[torch.tensor(t_index_list)].to(self.device)
+        self.sigmas_next = torch.cat([self.sigmas, self.sigmas.new_zeros(1)])[1:].to(self.device)
+
+        noise_lvs = self.sigmas * (self.sigmas**2 + 1)**(-0.5)
+        self.noise_lvs = noise_lvs[None, :, None, None, None]
+        self.next_noise_lvs = torch.cat([noise_lvs[1:], noise_lvs.new_zeros(1)])[None, :, None, None, None]
+
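The schedule preparation above first picks a subset of the scheduler's 50 sigmas via `t_index_list`, then converts each selected sigma into a per-step noise level with `sigma / sqrt(sigma**2 + 1)`, broadcast to (batch, step, channel, height, width) for later blending. A standalone sketch of that index-then-convert logic, assuming the stock diffusers flow-matching scheduler config (the Flash variant keeps the same sigma bookkeeping), is:

```py
import torch
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", subfolder="scheduler")
scheduler.set_timesteps(50)                        # full 50-step reference schedule

t_index_list = [0, 4, 12, 25, 37]                  # default indices from __init__
sigmas = scheduler.sigmas[torch.tensor(t_index_list)]
noise_lvs = sigmas * (sigmas**2 + 1) ** (-0.5)     # map each sigma into (0, 1)
print(noise_lvs.shape)                             # torch.Size([5])
```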
| 301 | 
         
            +
                @torch.no_grad()
         
     | 
| 302 | 
         
            +
                def get_text_prompts(self, image: Image.Image) -> str:
         
     | 
| 303 | 
         
            +
                    r"""A convenient method to extract text prompt from an image.
         
     | 
| 304 | 
         
            +
             
     | 
| 305 | 
         
            +
                    This is called if the user does not provide background prompt but only
         
     | 
| 306 | 
         
            +
                    the background image. We use BLIP-2 to automatically generate prompts.
         
     | 
| 307 | 
         
            +
             
     | 
| 308 | 
         
            +
                    Args:
         
     | 
| 309 | 
         
            +
                        image (Image.Image): A PIL image.
         
     | 
| 310 | 
         
            +
             
     | 
| 311 | 
         
            +
                    Returns:
         
     | 
| 312 | 
         
            +
                        A single string of text prompt.
         
     | 
| 313 | 
         
            +
                    """
         
     | 
| 314 | 
         
            +
                    if hasattr(self, 'i2t_model'):
         
     | 
| 315 | 
         
            +
                        question = 'Question: What are in the image? Answer:'
         
     | 
| 316 | 
         
            +
                        inputs = self.i2t_processor(image, question, return_tensors='pt')
         
     | 
| 317 | 
         
            +
                        out = self.i2t_model.generate(**inputs, max_new_tokens=77)
         
     | 
| 318 | 
         
            +
                        prompt = self.i2t_processor.decode(out[0], skip_special_tokens=True).strip()
         
     | 
| 319 | 
         
            +
                        return prompt
         
     | 
| 320 | 
         
            +
                    else:
         
     | 
| 321 | 
         
            +
                        return ''
         
     | 
| 322 | 
         
            +
             
     | 
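                 # Illustrative sketch (assumption, not from this file): `i2t_processor` and
                 # `i2t_model` are expected to be a BLIP-2 captioner from Hugging Face
                 # transformers, e.g.:
                 #
                 # >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration
                 # >>> i2t_processor = Blip2Processor.from_pretrained('Salesforce/blip2-opt-2.7b')
                 # >>> i2t_model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')
                 # >>> prompt = smd.get_text_prompts(Image.open('background.png'))  # hypothetical file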
| 323 | 
         
            +
                @torch.no_grad()
         
     | 
| 324 | 
         
            +
                def encode_imgs(
         
     | 
| 325 | 
         
            +
                    self,
         
     | 
| 326 | 
         
            +
                    imgs: torch.Tensor,
         
     | 
| 327 | 
         
            +
                    generator: Optional[torch.Generator] = None,
         
     | 
| 328 | 
         
            +
                    vae: Optional[nn.Module] = None,
         
     | 
| 329 | 
         
            +
                ) -> torch.Tensor:
         
     | 
| 330 | 
         
            +
                    r"""A wrapper function for VAE encoder of the latent diffusion model.
         
     | 
| 331 | 
         
            +
             
     | 
| 332 | 
         
            +
                    Args:
         
     | 
| 333 | 
         
            +
                        imgs (torch.Tensor): An image to get StableDiffusion latents.
         
     | 
| 334 | 
         
            +
                            Expected shape: (B, 3, H, W). Expected pixel scale: [0, 1].
         
     | 
| 335 | 
         
            +
                        generator (Optional[torch.Generator]): Seed for KL-Autoencoder.
         
     | 
| 336 | 
         
            +
                        vae (Optional[nn.Module]): Explicitly specify VAE (used for
         
     | 
| 337 | 
         
            +
                            the demo application with TinyVAE).
         
     | 
| 338 | 
         
            +
             
     | 
| 339 | 
         
            +
                    Returns:
         
     | 
| 340 | 
         
            +
                        An image latent embedding with 1/8 size (depending on the auto-
         
     | 
| 341 | 
         
            +
                        encoder). Shape: (B, 4, H//8, W//8).
         
     | 
| 342 | 
         
            +
                    """
         
     | 
| 343 | 
         
            +
                    def _retrieve_latents(
         
     | 
| 344 | 
         
            +
                        encoder_output: torch.Tensor,
         
     | 
| 345 | 
         
            +
                        generator: Optional[torch.Generator] = None,
         
     | 
| 346 | 
         
            +
                        sample_mode: str = 'sample',
         
     | 
| 347 | 
         
            +
                    ):
         
     | 
| 348 | 
         
            +
                        if hasattr(encoder_output, 'latent_dist') and sample_mode == 'sample':
         
     | 
| 349 | 
         
            +
                            return encoder_output.latent_dist.sample(generator)
         
     | 
| 350 | 
         
            +
                        elif hasattr(encoder_output, 'latent_dist') and sample_mode == 'argmax':
         
     | 
| 351 | 
         
            +
                            return encoder_output.latent_dist.mode()
         
     | 
| 352 | 
         
            +
                        elif hasattr(encoder_output, 'latents'):
         
     | 
| 353 | 
         
            +
                            return encoder_output.latents
         
     | 
| 354 | 
         
            +
                        else:
         
     | 
| 355 | 
         
            +
                            raise AttributeError('Could not access latents of provided encoder_output')
         
     | 
| 356 | 
         
            +
             
     | 
| 357 | 
         
            +
                    vae = self.vae if vae is None else vae
         
     | 
| 358 | 
         
            +
                    imgs = 2 * imgs - 1
         
     | 
| 359 | 
         
            +
                    latents = vae.config.scaling_factor * _retrieve_latents(vae.encode(imgs), generator=generator)
         
     | 
| 360 | 
         
            +
                    return latents
         
     | 
| 361 | 
         
            +
             
     | 
| 362 | 
         
            +
                @torch.no_grad()
         
     | 
| 363 | 
         
            +
                def decode_latents(self, latents: torch.Tensor, vae: Optional[nn.Module] = None) -> torch.Tensor:
         
     | 
| 364 | 
         
            +
                    r"""A wrapper function for VAE decoder of the latent diffusion model.
         
     | 
| 365 | 
         
            +
             
     | 
| 366 | 
         
            +
                    Args:
         
     | 
| 367 | 
         
            +
                        latents (torch.Tensor): An image latent to get associated images.
         
     | 
| 368 | 
         
            +
                            Expected shape: (B, 4, H//8, W//8).
         
     | 
| 369 | 
         
            +
                        vae (Optional[nn.Module]): Explicitly specify VAE (used for
         
     | 
| 370 | 
         
            +
                            the demo application with TinyVAE).
         
     | 
| 371 | 
         
            +
             
     | 
| 372 | 
         
            +
                    Returns:
         
     | 
| 373 | 
         
            +
                        An image tensor decoded from the given latent, with pixel values
         
     | 
| 374 | 
         
            +
                        clipped to [0, 1]. Shape: (B, 3, H, W).
         
     | 
| 375 | 
         
            +
                    """
         
     | 
| 376 | 
         
            +
                    vae = self.vae if vae is None else vae
         
     | 
| 377 | 
         
            +
                    latents = 1 / vae.config.scaling_factor * latents
         
     | 
| 378 | 
         
            +
                    imgs = vae.decode(latents).sample
         
     | 
| 379 | 
         
            +
                    imgs = (imgs / 2 + 0.5).clip_(0, 1)
         
     | 
| 380 | 
         
            +
                    return imgs
         
     | 
| 381 | 
         
            +
             
     | 
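                 # Illustrative round trip, assuming a loaded pipeline instance `smd` and a
                 # hypothetical input file: images in [0, 1] are encoded to scaled latents
                 # and decoded back to pixel space.
                 #
                 # >>> img = T.ToTensor()(Image.open('photo.png'))[None]      # (1, 3, H, W), values in [0, 1]
                 # >>> lat = smd.encode_imgs(img.to(smd.device, smd.dtype))   # (1, C, H//8, W//8)
                 # >>> rec = smd.decode_latents(lat)                          # (1, 3, H, W), values in [0, 1]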
| 382 | 
         
            +
                @torch.no_grad()
         
     | 
| 383 | 
         
            +
                def get_white_background(self, height: int, width: int) -> torch.Tensor:
         
     | 
| 384 | 
         
            +
                    r"""White background image latent for bootstrapping or in case of
         
     | 
| 385 | 
         
            +
                    absent background.
         
     | 
| 386 | 
         
            +
             
     | 
| 387 | 
         
            +
                    Additionally stores the maximally-sized white latent for fast retrieval
         
     | 
| 388 | 
         
            +
                    in the future. By default, we initially call this with 1024x1024 sized
         
     | 
| 389 | 
         
            +
                    white image, so the function is rarely visited twice.
         
     | 
| 390 | 
         
            +
             
     | 
| 391 | 
         
            +
                    Args:
         
     | 
| 392 | 
         
            +
                        height (int): The height of the white *image*, not its latent.
         
     | 
| 393 | 
         
            +
                        width (int): The width of the white *image*, not its latent.
         
     | 
| 394 | 
         
            +
             
     | 
| 395 | 
         
            +
                    Returns:
         
     | 
| 396 | 
         
            +
                        A white image latent of size (1, 4, height//8, width//8). A cropped
         
     | 
| 397 | 
         
            +
                        version of the stored white latent is returned if the requested
         
     | 
| 398 | 
         
            +
                        size is smaller than what we already have created.
         
     | 
| 399 | 
         
            +
                    """
         
     | 
| 400 | 
         
            +
                    if not hasattr(self, 'white') or self.white.shape[-2] < height or self.white.shape[-1] < width:
         
     | 
| 401 | 
         
            +
                        white = torch.ones(1, 3, height, width, dtype=self.dtype, device=self.device)
         
     | 
| 402 | 
         
            +
                        self.white = self.encode_imgs(white)
         
     | 
| 403 | 
         
            +
                        return self.white
         
     | 
| 404 | 
         
            +
                    return self.white[..., :(height // self.vae_scale_factor), :(width // self.vae_scale_factor)]
         
     | 
| 405 | 
         
            +
             
     | 
| 406 | 
         
            +
                @torch.no_grad()
         
     | 
| 407 | 
         
            +
                def process_mask(
         
     | 
| 408 | 
         
            +
                    self,
         
     | 
| 409 | 
         
            +
                    masks: Union[torch.Tensor, Image.Image, List[Image.Image]],
         
     | 
| 410 | 
         
            +
                    strength: Optional[Union[torch.Tensor, float]] = None,
         
     | 
| 411 | 
         
            +
                    std: Optional[Union[torch.Tensor, float]] = None,
         
     | 
| 412 | 
         
            +
                    height: int = 1024,
         
     | 
| 413 | 
         
            +
                    width: int = 1024,
         
     | 
| 414 | 
         
            +
                    use_boolean_mask: bool = True,
         
     | 
| 415 | 
         
            +
                    timesteps: Optional[torch.Tensor] = None,
         
     | 
| 416 | 
         
            +
                    preprocess_mask_cover_alpha: Optional[float] = None,
         
     | 
| 417 | 
         
            +
                ) -> Tuple[torch.Tensor]:
         
     | 
| 418 | 
         
            +
                    r"""Fast preprocess of masks for region-based generation with fine-
         
     | 
| 419 | 
         
            +
                    grained controls.
         
     | 
| 420 | 
         
            +
             
     | 
| 421 | 
         
            +
                    Mask preprocessing is done in four steps:
         
     | 
| 422 | 
         
            +
                     1. Resizing: Resize the masks into the specified width and height by
         
     | 
| 423 | 
         
            +
                        nearest neighbor interpolation.
         
     | 
| 424 | 
         
            +
                     2. (Optional) Ordering: Masks with higher indices are considered to
         
     | 
| 425 | 
         
            +
                        cover the masks with smaller indices. Covered masks are decayed
         
     | 
| 426 | 
         
            +
                        in their alpha values by the specified factor of
         
     | 
| 427 | 
         
            +
                        `preprocess_mask_cover_alpha`.
         
     | 
| 428 | 
         
            +
                     3. Blurring: Gaussian blur is applied to the mask with the specified
         
     | 
| 429 | 
         
            +
                        standard deviation (isotropic). This results in gradual increase of
         
     | 
| 430 | 
         
            +
                        masked region as the timesteps evolve, naturally blending fore-
         
     | 
| 431 | 
         
            +
                        ground and the predesignated background. Not strictly required if
         
     | 
| 432 | 
         
            +
                        you want to produce images from scratch without a background.
         
     | 
| 433 | 
         
            +
                     4. Quantization: Split the real-numbered masks of value between [0, 1]
         
     | 
| 434 | 
         
            +
                        into predefined noise levels for each quantized scheduling step of
         
     | 
| 435 | 
         
            +
                        the diffusion sampler. For example, if the diffusion model sampler
         
     | 
| 436 | 
         
            +
                        has noise level of [0.9977, 0.9912, 0.9735, 0.8499, 0.5840], which
         
     | 
| 437 | 
         
            +
                        is the default noise level of this module with schedule [0, 4, 12,
         
     | 
| 438 | 
         
            +
                        25, 37], the masks are split into binary masks whose values are
         
     | 
| 439 | 
         
            +
                        greater than these levels. This results in a gradual increase of the mask
         
     | 
| 440 | 
         
            +
                        region as the timesteps increase. Details are described in our
         
     | 
| 441 | 
         
            +
                        paper at https://arxiv.org/pdf/2403.09055.pdf.
         
     | 
| 442 | 
         
            +
             
     | 
| 443 | 
         
            +
                    On the Three Modes of `mask_type`:
         
     | 
| 444 | 
         
            +
                        `self.mask_type` is predefined at the initialization stage of this
         
     | 
| 445 | 
         
            +
                        pipeline. Three possible modes are available: 'discrete', 'semi-
         
     | 
| 446 | 
         
            +
                        continuous', and 'continuous'. These define the mask quantization
         
     | 
| 447 | 
         
            +
                        modes we use. Basically, this (subtly) controls the smoothness of
         
     | 
| 448 | 
         
            +
                        foreground-background blending. The continuous mode produces nonbinary
         
     | 
| 449 | 
         
            +
                        masks to further blend foreground and background latents by linear-
         
     | 
| 450 | 
         
            +
                        ly interpolating between them. The semi-continuous mode only applies a
         
     | 
| 451 | 
         
            +
                        continuous mask at the last step of the LCM sampler. Due to the
         
     | 
| 452 | 
         
            +
                        large step size of the LCM scheduler, we find that our continuous
         
     | 
| 453 | 
         
            +
                        blending helps generating seamless inpainting and editing results.
         
     | 
| 454 | 
         
            +
             
     | 
| 455 | 
         
            +
                    Args:
         
     | 
| 456 | 
         
            +
                        masks (Union[torch.Tensor, Image.Image, List[Image.Image]]): Masks.
         
     | 
| 457 | 
         
            +
                        strength (Optional[Union[torch.Tensor, float]]): Mask strength that
         
     | 
| 458 | 
         
            +
                            overrides the default value. A globally multiplied factor to
         
     | 
| 459 | 
         
            +
                            the mask at the initial stage of processing. Can be applied
         
     | 
| 460 | 
         
            +
                            separately for each mask.
         
     | 
| 461 | 
         
            +
                        std (Optional[Union[torch.Tensor, float]]): Mask blurring Gaussian
         
     | 
| 462 | 
         
            +
                            kernel's standard deviation. Overrides the default value. Can
         
     | 
| 463 | 
         
            +
                            be applied separately for each mask.
         
     | 
| 464 | 
         
            +
                        height (int): The height of the expected generation. Mask is
         
     | 
| 465 | 
         
            +
                            resized to (height//8, width//8) with nearest neighbor inter-
         
     | 
| 466 | 
         
            +
                            polation.
         
     | 
| 467 | 
         
            +
                        width (int): The width of the expected generation. Mask is resized
         
     | 
| 468 | 
         
            +
                            to (height//8, width//8) with nearest neighbor interpolation.
         
     | 
| 469 | 
         
            +
                        use_boolean_mask (bool): Specify this to treat the mask image as
         
     | 
| 470 | 
         
            +
                            a boolean tensor. The region darker than 0.5 of
         
     | 
| 471 | 
         
            +
                            the maximal pixel value (that is, 127.5) is considered as the
         
     | 
| 472 | 
         
            +
                            designated mask.
         
     | 
| 473 | 
         
            +
                        timesteps (Optional[torch.Tensor]): Defines the scheduler noise
         
     | 
| 474 | 
         
            +
                            levels that act as bins for mask quantization.
         
     | 
| 475 | 
         
            +
                        preprocess_mask_cover_alpha (Optional[float]): Optional pre-
         
     | 
| 476 | 
         
            +
                            processing where each mask covered by other masks is reduced in
         
     | 
| 477 | 
         
            +
                            its alpha value by this specified factor. Overrides the default
         
     | 
| 478 | 
         
            +
                            value.
         
     | 
| 479 | 
         
            +
             
     | 
| 480 | 
         
            +
                    Returns: A tuple of tensors.
         
     | 
| 481 | 
         
            +
                      - masks: Preprocessed (ordered, blurred, and quantized) binary/non-
         
     | 
| 482 | 
         
            +
                            binary masks (see the explanation on `mask_type` above) for
         
     | 
| 483 | 
         
            +
                            region-based image synthesis.
         
     | 
| 484 | 
         
            +
                      - masks_blurred: Gaussian blurred masks. Used for optionally
         
     | 
| 485 | 
         
            +
                            specified foreground-background blending after image
         
     | 
| 486 | 
         
            +
                            generation.
         
     | 
| 487 | 
         
            +
                      - std: Mask blur standard deviation. Used for optionally specified
         
     | 
| 488 | 
         
            +
                            foreground-background blending after image generation.
         
     | 
| 489 | 
         
            +
                    """
         
     | 
| 490 | 
         
            +
                    if isinstance(masks, Image.Image):
         
     | 
| 491 | 
         
            +
                        masks = [masks]
         
     | 
| 492 | 
         
            +
                    if isinstance(masks, (tuple, list)):
         
     | 
| 493 | 
         
            +
                        # Assumes white background for Image.Image;
         
     | 
| 494 | 
         
            +
                        # inverted boolean masks with shape (1, 1, H, W) for torch.Tensor.
         
     | 
| 495 | 
         
            +
                        if use_boolean_mask:
         
     | 
| 496 | 
         
            +
                            proc = lambda m: T.ToTensor()(m)[None, -1:] < 0.5
         
     | 
| 497 | 
         
            +
                        else:
         
     | 
| 498 | 
         
            +
                            proc = lambda m: 1.0 - T.ToTensor()(m)[None, -1:]
         
     | 
| 499 | 
         
            +
                        masks = torch.cat([proc(mask) for mask in masks], dim=0).float().clip_(0, 1)
         
     | 
| 500 | 
         
            +
                    masks = F.interpolate(masks.float(), size=(height, width), mode='bilinear', align_corners=False)
         
     | 
| 501 | 
         
            +
                    masks = masks.to(self.device)
         
     | 
| 502 | 
         
            +
             
     | 
| 503 | 
         
            +
                    # A mask's alpha is decayed by the specified factor where foreground masks with higher indices cover it.
         
     | 
| 504 | 
         
            +
                    if preprocess_mask_cover_alpha is None:
         
     | 
| 505 | 
         
            +
                        preprocess_mask_cover_alpha = self.default_preprocess_mask_cover_alpha
         
     | 
| 506 | 
         
            +
                    if preprocess_mask_cover_alpha > 0:
         
     | 
| 507 | 
         
            +
                        masks = torch.stack([
         
     | 
| 508 | 
         
            +
                            torch.where(
         
     | 
| 509 | 
         
            +
                                masks[i + 1:].sum(dim=0) > 0,
         
     | 
| 510 | 
         
            +
                                mask * preprocess_mask_cover_alpha,
         
     | 
| 511 | 
         
            +
                                mask,
         
     | 
| 512 | 
         
            +
                            ) if i < len(masks) - 1 else mask
         
     | 
| 513 | 
         
            +
                            for i, mask in enumerate(masks)
         
     | 
| 514 | 
         
            +
                        ], dim=0)
         
     | 
| 515 | 
         
            +
             
     | 
| 516 | 
         
            +
                    # Scheduler noise levels for mask quantization.
         
     | 
| 517 | 
         
            +
                    if timesteps is None:
         
     | 
| 518 | 
         
            +
                        noise_lvs = self.noise_lvs
         
     | 
| 519 | 
         
            +
                        next_noise_lvs = self.next_noise_lvs
         
     | 
| 520 | 
         
            +
                    else:
         
     | 
| 521 | 
         
            +
                        noise_lvs_ = self.sigmas * (self.sigmas**2 + 1)**(-0.5)
         
     | 
| 522 | 
         
            +
                        # noise_lvs_ = (1 - self.scheduler.alphas_cumprod[timesteps].to(self.device)) ** 0.5
         
     | 
| 523 | 
         
            +
                        noise_lvs = noise_lvs_[None, :, None, None, None].to(masks.device)
         
     | 
| 524 | 
         
            +
                        next_noise_lvs = torch.cat([noise_lvs_[1:], noise_lvs_.new_zeros(1)])[None, :, None, None, None]
         
     | 
| 525 | 
         
            +
             
     | 
| 526 | 
         
            +
                    # Mask preprocessing parameters are fetched from the default settings.
         
     | 
| 527 | 
         
            +
                    if std is None:
         
     | 
| 528 | 
         
            +
                        std = self.default_mask_std
         
     | 
| 529 | 
         
            +
                    if isinstance(std, (int, float)):
         
     | 
| 530 | 
         
            +
                        std = [std] * len(masks)
         
     | 
| 531 | 
         
            +
                    if isinstance(std, (list, tuple)):
         
     | 
| 532 | 
         
            +
                        std = torch.as_tensor(std, dtype=torch.float, device=self.device)
         
     | 
| 533 | 
         
            +
             
     | 
| 534 | 
         
            +
                    if strength is None:
         
     | 
| 535 | 
         
            +
                        strength = self.default_mask_strength
         
     | 
| 536 | 
         
            +
                    if isinstance(strength, (int, float)):
         
     | 
| 537 | 
         
            +
                        strength = [strength] * len(masks)
         
     | 
| 538 | 
         
            +
                    if isinstance(strength, (list, tuple)):
         
     | 
| 539 | 
         
            +
                        strength = torch.as_tensor(strength, dtype=torch.float, device=self.device)
         
     | 
| 540 | 
         
            +
             
     | 
| 541 | 
         
            +
                    if (std > 0).any():
         
     | 
| 542 | 
         
            +
                        std = torch.where(std > 0, std, 1e-5)
         
     | 
| 543 | 
         
            +
                        masks = gaussian_lowpass(masks, std)
         
     | 
| 544 | 
         
            +
                    masks_blurred = masks
         
     | 
| 545 | 
         
            +
             
     | 
| 546 | 
         
            +
                    # NOTE: This `strength` aligns with `denoising strength`. However, with LCM, using strength < 0.96
         
     | 
| 547 | 
         
            +
                    #       gives unpleasant results.
         
     | 
| 548 | 
         
            +
                    masks = masks * strength[:, None, None, None]
         
     | 
| 549 | 
         
            +
                    masks = masks.unsqueeze(1).repeat(1, noise_lvs.shape[1], 1, 1, 1)
         
     | 
| 550 | 
         
            +
             
     | 
| 551 | 
         
            +
                    # Mask is quantized according to the current noise levels specified by the scheduler.
         
     | 
| 552 | 
         
            +
                    if self.mask_type == 'discrete':
         
     | 
| 553 | 
         
            +
                        # Discrete mode.
         
     | 
| 554 | 
         
            +
                        masks = masks > noise_lvs
         
     | 
| 555 | 
         
            +
                    elif self.mask_type == 'semi-continuous':
         
     | 
| 556 | 
         
            +
                        # Semi-continuous mode (continuous at the last step only).
         
     | 
| 557 | 
         
            +
                        masks = torch.cat((
         
     | 
| 558 | 
         
            +
                            masks[:, :-1] > noise_lvs[:, :-1],
         
     | 
| 559 | 
         
            +
                            (
         
     | 
| 560 | 
         
            +
                                (masks[:, -1:] - next_noise_lvs[:, -1:]) / (noise_lvs[:, -1:] - next_noise_lvs[:, -1:])
         
     | 
| 561 | 
         
            +
                            ).clip_(0, 1),
         
     | 
| 562 | 
         
            +
                        ), dim=1)
         
     | 
| 563 | 
         
            +
                    elif self.mask_type == 'continuous':
         
     | 
| 564 | 
         
            +
                        # Continuous mode: Have the exact same `1` coverage with discrete mode, but the mask gradually
         
     | 
| 565 | 
         
            +
                        #                  decreases continuously after the discrete mode boundary to become `0` at the
         
     | 
| 566 | 
         
            +
                        #                  next lower threshold.
         
     | 
| 567 | 
         
            +
                        masks = ((masks - next_noise_lvs) / (noise_lvs - next_noise_lvs)).clip_(0, 1)
         
     | 
| 568 | 
         
            +
             
     | 
| 569 | 
         
            +
                    # NOTE: Post processing mask strength does not align with conventional 'denoising_strength'. However,
         
     | 
| 570 | 
         
            +
                    #       fine-grained mask alpha channel tuning is available with this form.
         
     | 
| 571 | 
         
            +
                    # masks = masks * strength[None, :, None, None, None]
         
     | 
| 572 | 
         
            +
             
     | 
| 573 | 
         
            +
                    h = height // self.vae_scale_factor
         
     | 
| 574 | 
         
            +
                    w = width // self.vae_scale_factor
         
     | 
| 575 | 
         
            +
                    masks = rearrange(masks.float(), 'p t () h w -> (p t) () h w')
         
     | 
| 576 | 
         
            +
                    masks = F.interpolate(masks, size=(h, w), mode='nearest')
         
     | 
| 577 | 
         
            +
                    masks = rearrange(masks.to(self.dtype), '(p t) () h w -> p t () h w', p=len(std))
         
     | 
| 578 | 
         
            +
                    return masks, masks_blurred, std
         
     | 
| 579 | 
         
            +
             
     | 
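                 # Illustrative sketch of the quantization above, not part of the original
                 # file: in 'discrete' mode, a blurred mask value in [0, 1] is compared
                 # against the per-step noise levels, so softer (smaller) values only turn
                 # the region on at later, less noisy steps.
                 #
                 # >>> noise_lvs = torch.tensor([0.98, 0.85, 0.58, 0.30])  # hypothetical schedule
                 # >>> m = torch.tensor(0.7)                                # one blurred mask pixel
                 # >>> m > noise_lvs
                 # tensor([False, False,  True,  True])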
| 580 | 
         
            +
                def scheduler_step(
         
     | 
| 581 | 
         
            +
                    self,
         
     | 
| 582 | 
         
            +
                    noise_pred: torch.Tensor,
         
     | 
| 583 | 
         
            +
                    idx: int,
         
     | 
| 584 | 
         
            +
                    latent: torch.Tensor,
         
     | 
| 585 | 
         
            +
                ) -> torch.Tensor:
         
     | 
| 586 | 
         
            +
                    r"""Denoise-only step for reverse diffusion scheduler.
         
     | 
| 587 | 
         
            +
                    
         
     | 
| 588 | 
         
            +
                    Designed to match the interface of the original `pipe.scheduler.step`,
         
     | 
| 589 | 
         
            +
                    which is a combination of this method and the following
         
     | 
| 590 | 
         
            +
                    `scheduler_add_noise`.
         
     | 
| 591 | 
         
            +
             
     | 
| 592 | 
         
            +
                    Args:
         
     | 
| 593 | 
         
            +
                        noise_pred (torch.Tensor): Noise prediction results from the U-Net.
         
     | 
| 594 | 
         
            +
                        idx (int): Instead of timesteps (in [0, 1000]-scale) use indices
         
     | 
| 595 | 
         
            +
                            for the timesteps tensor (ranged in [0, len(timesteps)-1]).
         
     | 
| 596 | 
         
            +
                        latent (torch.Tensor): Noisy latent.
         
     | 
| 597 | 
         
            +
             
     | 
| 598 | 
         
            +
                    Returns:
         
     | 
| 599 | 
         
            +
                        A denoised tensor with the same size as latent.
         
     | 
| 600 | 
         
            +
                    """
         
     | 
| 601 | 
         
            +
                    # Upcast to avoid precision issues when computing prev_sample.
         
     | 
| 602 | 
         
            +
                    latent = latent.to(torch.float32)
         
     | 
| 603 | 
         
            +
                    prev_sample = latent - noise_pred * self.sigmas[idx]
         
     | 
| 604 | 
         
            +
                    return prev_sample.to(self.dtype)
         
     | 
| 605 | 
         
            +
             
     | 
| 606 | 
         
            +
                def scheduler_add_noise(
         
     | 
| 607 | 
         
            +
                    self,
         
     | 
| 608 | 
         
            +
                    latent: torch.Tensor,
         
     | 
| 609 | 
         
            +
                    noise: Optional[torch.Tensor],
         
     | 
| 610 | 
         
            +
                    idx: int,
         
     | 
| 611 | 
         
            +
                ) -> torch.Tensor:
         
     | 
| 612 | 
         
            +
                    r"""Separated noise-add step for the reverse diffusion scheduler.
         
     | 
| 613 | 
         
            +
                    
         
     | 
| 614 | 
         
            +
                    Designed to match the interface of the original
         
     | 
| 615 | 
         
            +
                    `pipe.scheduler.add_noise`.
         
     | 
| 616 | 
         
            +
             
     | 
| 617 | 
         
            +
                    Args:
         
     | 
| 618 | 
         
            +
                        latent (torch.Tensor): Denoised latent.
         
     | 
| 619 | 
         
            +
                        noise (torch.Tensor): Added noise. Can be None. If None, a random
         
     | 
| 620 | 
         
            +
                            noise is newly sampled for addition.
         
     | 
| 621 | 
         
            +
                        idx (int): Instead of timesteps (in [0, 1000]-scale) use indices
         
     | 
| 622 | 
         
            +
                            for the timesteps tensor (ranged in [0, len(timesteps)-1]).
         
     | 
| 623 | 
         
            +
             
     | 
| 624 | 
         
            +
                    Returns:
         
     | 
| 625 | 
         
            +
                        A noisy tensor with the same size as latent.
         
     | 
| 626 | 
         
            +
                    """
         
     | 
| 627 | 
         
            +
                    if idx < len(self.sigmas) and idx >= 0:
         
     | 
| 628 | 
         
            +
                        noise = torch.randn_like(latent) if noise is None else noise
         
     | 
| 629 | 
         
            +
                        return (1.0 - self.sigmas[idx]) * latent + self.sigmas[idx] * noise
         
     | 
| 630 | 
         
            +
                    else:
         
     | 
| 631 | 
         
            +
                        return latent
         
     | 
| 632 | 
         
            +
             
     | 
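                 # Illustrative sketch, not part of the original file: under flow matching,
                 # x_t = (1 - sigma) * x0 + sigma * eps and the model predicts v ~ eps - x0.
                 # `scheduler_step` therefore recovers x0 = x_t - sigma * v, and
                 # `scheduler_add_noise` re-noises it to the next level. With the same eps,
                 # the two calls compose into the usual Euler update:
                 #
                 # >>> sigma, sigma_next = 0.8, 0.5
                 # >>> x0, eps = torch.randn(2, 4), torch.randn(2, 4)
                 # >>> x_t, v = (1 - sigma) * x0 + sigma * eps, eps - x0
                 # >>> torch.allclose(x_t + (sigma_next - sigma) * v,
                 # ...                (1 - sigma_next) * x0 + sigma_next * eps)
                 # True
                 #
                 # Sampling a fresh eps instead is what lets the pipeline blend per-region
                 # latents before re-noising them for the next step.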
| 633 | 
         
            +
                @torch.no_grad()
         
     | 
| 634 | 
         
            +
                def __call__(
         
     | 
| 635 | 
         
            +
                    self,
         
     | 
| 636 | 
         
            +
                    prompts: Optional[Union[str, List[str]]] = None,
         
     | 
| 637 | 
         
            +
                    negative_prompts: Union[str, List[str]] = '',
         
     | 
| 638 | 
         
            +
                    suffix: Optional[str] = None, #', background is ',
         
     | 
| 639 | 
         
            +
                    background: Optional[Union[torch.Tensor, Image.Image]] = None,
         
     | 
| 640 | 
         
            +
                    background_prompt: Optional[str] = None,
         
     | 
| 641 | 
         
            +
                    background_negative_prompt: str = '',
         
     | 
| 642 | 
         
            +
                    height: int = 1024,
         
     | 
| 643 | 
         
            +
                    width: int = 1024,
         
     | 
| 644 | 
         
            +
                    num_inference_steps: Optional[int] = None,
         
     | 
| 645 | 
         
            +
                    guidance_scale: Optional[float] = None,
         
     | 
| 646 | 
         
            +
                    prompt_strengths: Optional[Union[torch.Tensor, float, List[float]]] = None,
         
     | 
| 647 | 
         
            +
                    masks: Optional[Union[Image.Image, List[Image.Image]]] = None,
         
     | 
| 648 | 
         
            +
                    mask_strengths: Optional[Union[torch.Tensor, float, List[float]]] = None,
         
     | 
| 649 | 
         
            +
                    mask_stds: Optional[Union[torch.Tensor, float, List[float]]] = None,
         
     | 
| 650 | 
         
            +
                    use_boolean_mask: bool = True,
         
     | 
| 651 | 
         
            +
                    do_blend: bool = True,
         
     | 
| 652 | 
         
            +
                    tile_size: int = 1024,
         
     | 
| 653 | 
         
            +
                    bootstrap_steps: Optional[int] = None,
         
     | 
| 654 | 
         
            +
                    boostrap_mix_steps: Optional[float] = None,
         
     | 
| 655 | 
         
            +
                    bootstrap_leak_sensitivity: Optional[float] = None,
         
     | 
| 656 | 
         
            +
                    preprocess_mask_cover_alpha: Optional[float] = None,
         
     | 
| 657 | 
         
            +
                    # SDXL Pipeline setting.
         
     | 
| 658 | 
         
            +
                    guidance_rescale: float = 0.7,
         
     | 
| 659 | 
         
            +
                    output_type = 'pil',
         
     | 
| 660 | 
         
            +
                    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         
     | 
| 661 | 
         
            +
                    clip_skip: Optional[int] = None,
         
     | 
| 662 | 
         
            +
                ) -> Image.Image:
         
     | 
| 663 | 
         
            +
                    r"""Arbitrary-size image generation from multiple pairs of (regional)
         
     | 
| 664 | 
         
            +
                    text prompt-mask pairs.
         
     | 
| 665 | 
         
            +
             
     | 
| 666 | 
         
            +
                    This is the main routine of this pipeline.
         
     | 
| 667 | 
         
            +
             
     | 
| 668 | 
         
            +
                    Example:
         
     | 
| 669 | 
         
            +
                        >>> device = torch.device('cuda:0')
         
     | 
| 670 | 
         
            +
                        >>> smd = StableMultiDiffusionPipeline(device)
         
     | 
| 671 | 
         
            +
                        >>> prompts = {... specify prompts}
         
     | 
| 672 | 
         
            +
                        >>> masks = {... specify mask tensors}
         
     | 
| 673 | 
         
            +
                        >>> height, width = masks.shape[-2:]
         
     | 
| 674 | 
         
            +
                        >>> image = smd(
         
     | 
| 675 | 
         
            +
                        >>>     prompts, masks=masks.float(), height=height, width=width)
         
     | 
| 676 | 
         
            +
                        >>> image.save('my_beautiful_creation.png')
         
     | 
| 677 | 
         
            +
             
     | 
| 678 | 
         
            +
                    Args:
         
     | 
| 679 | 
         
            +
                        prompts (Union[str, List[str]]): A text prompt.
         
     | 
| 680 | 
         
            +
                        negative_prompts (Union[str, List[str]]): A negative text prompt.
         
     | 
| 681 | 
         
            +
                        suffix (Optional[str]): One option for blending foreground prompts
         
     | 
| 682 | 
         
            +
                            with background prompts by simply appending background prompt
         
     | 
| 683 | 
         
            +
                            to the end of each foreground prompt with this `middle word` in
         
     | 
| 684 | 
         
            +
                            between. For example, if you set this as `, background is`,
         
     | 
| 685 | 
         
            +
                            then the foreground prompt will be changed into
         
     | 
| 686 | 
         
            +
                            `(fg), background is (bg)` before conditional generation.
         
     | 
| 687 | 
         
            +
                        background (Optional[Union[torch.Tensor, Image.Image]]): a
         
     | 
| 688 | 
         
            +
                            background image, if the user wants to draw in front of the
         
     | 
| 689 | 
         
            +
                            specified image. The background prompt will be automatically generated
         
     | 
| 690 | 
         
            +
                            with a BLIP-2 model.
         
     | 
| 691 | 
         
            +
                        background_prompt (Optional[str]): The background prompt is used
         
     | 
| 692 | 
         
            +
                            for preprocessing foreground prompt embeddings to blend
         
     | 
| 693 | 
         
            +
                            foreground and background.
         
     | 
| 694 | 
         
            +
                        background_negative_prompt (Optional[str]): The negative background
         
     | 
| 695 | 
         
            +
                            prompt.
         
     | 
| 696 | 
         
            +
                        height (int): Height of a generated image. It is tiled if larger
         
     | 
| 697 | 
         
            +
                            than `tile_size`.
         
     | 
| 698 | 
         
            +
                        width (int): Width of a generated image. It is tiled if larger
         
     | 
| 699 | 
         
            +
                            than `tile_size`.
         
     | 
| 700 | 
         
            +
                        num_inference_steps (Optional[int]): Number of inference steps.
         
     | 
| 701 | 
         
            +
                            Default inference scheduling is used if none is specified.
         
     | 
| 702 | 
         
            +
                        guidance_scale (Optional[float]): Classifier guidance scale.
         
     | 
| 703 | 
         
            +
                            Default value is used if none is specified.
         
     | 
| 704 | 
         
            +
                        prompt_strengths (float): Overrides the default value. Preprocesses
         
     | 
| 705 | 
         
            +
                            foreground prompts globally by linearly interpolating its
         
     | 
| 706 | 
         
            +
                            embedding with the background prompt embedding at the specified
         
     | 
| 707 | 
         
            +
                            mix ratio. Useful control handle for foreground blending.
         
     | 
| 708 | 
         
            +
                            Recommended range: 0.5-1.
         
     | 
| 709 | 
         
            +
                        masks (Optional[Union[Image.Image, List[Image.Image]]]): a list of
         
     | 
| 710 | 
         
            +
                            mask images. Each mask associates with each of the text prompts
         
     | 
| 711 | 
         
            +
                            and each of the negative prompts. If specified as an image, it
         
     | 
| 712 | 
         
            +
                            regards the image as a boolean mask. Also accepts torch.Tensor
         
     | 
| 713 | 
         
            +
                            masks, which can have nonbinary values for fine-grained
         
     | 
| 714 | 
         
            +
                            controls in mixing regional generations.
         
     | 
| 715 | 
         
            +
                        mask_strengths (Optional[Union[torch.Tensor, float, List[float]]]):
         
     | 
| 716 | 
         
            +
                            Overrides the default value. Can be assigned for each mask
         
     | 
| 717 | 
         
            +
                            separately. Preprocess mask by multiplying it globally with the
         
     | 
| 718 | 
         
            +
                            specified variable. Caution: extremely sensitive. Recommended
         
     | 
| 719 | 
         
            +
                            range: 0.98-1.
         
     | 
| 720 | 
         
            +
                        mask_stds (Optional[Union[torch.Tensor, float, List[float]]]):
         
     | 
| 721 | 
         
            +
                            Overrides the default value. Can be assigned for each mask
         
     | 
| 722 | 
         
            +
                            separately. Preprocess mask with Gaussian blur with specified
         
     | 
| 723 | 
         
            +
                            standard deviation. Recommended range: 0-64.
         
     | 
| 724 | 
         
            +
                        use_boolean_mask (bool): Turn this off if you want to treat the
         
     | 
| 725 | 
         
            +
                            mask image as a nonbinary one. The module will use the last
         
     | 
| 726 | 
         
            +
                            channel of the given image in `masks` as the mask value.
         
     | 
| 727 | 
         
            +
                        do_blend (bool): Blend the generated foreground and the optionally
         
     | 
| 728 | 
         
            +
                            predefined background by smooth boundary obtained from Gaussian
         
     | 
| 729 | 
         
            +
                            blurs of the foreground `masks` with the given `mask_stds`.
         
     | 
| 730 | 
         
            +
                        tile_size (Optional[int]): Tile size of the panorama generation.
         
     | 
| 731 | 
         
            +
                            Works best with the default training size of the Stable-
         
     | 
| 732 | 
         
            +
                            Diffusion model, i.e., 1024 for SD3.
         
     | 
| 733 | 
         
            +
                        bootstrap_steps (int): Overrides the default value. Bootstrapping
         
     | 
| 734 | 
         
            +
                            stage steps to encourage region separation. Recommended range:
         
     | 
| 735 | 
         
            +
                            1-3.
         
     | 
| 736 | 
         
            +
                        boostrap_mix_steps (float): Overrides the default value.
         
     | 
| 737 | 
         
            +
                            Bootstrapping background is a linear interpolation between
         
     | 
| 738 | 
         
            +
                            background latent and the white image latent. This handle
         
     | 
| 739 | 
         
            +
                            controls the mix ratio. Available range: 0-(number of
         
     | 
| 740 | 
         
            +
                            bootstrapping inference steps). For example, 2.3 means that for
         
     | 
| 741 | 
         
            +
                            the first two steps, white image is used as a bootstrapping
         
     | 
| 742 | 
         
            +
                            background and in the third step, mixture of white (0.3) and
         
     | 
| 743 | 
         
            +
                            registered background (0.7) is used as a bootstrapping
         
     | 
| 744 | 
         
            +
                            background.
         
     | 
| 745 | 
         
            +
                        bootstrap_leak_sensitivity (float): Overrides the default value.
         
     | 
| 746 | 
         
            +
                            Postprocessing at each inference step by masking away the
         
     | 
| 747 | 
         
            +
                            remaining bootstrap backgrounds. Recommended range: 0-1.
         
     | 
| 748 | 
         
            +
                        preprocess_mask_cover_alpha (float): Overrides the default value.
         
     | 
| 749 | 
         
            +
                            Optional preprocessing where each mask covered by other masks
         
     | 
| 750 | 
         
            +
                            is reduced in its alpha value by this specified factor.
         
     | 
| 751 | 
         
            +
             
     | 
| 752 | 
         
            +
                    Returns: A PIL.Image of the generated panorama (large-size) image.
         
     | 
| 753 | 
         
            +
                    """
         
     | 
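                     # Illustrative usage sketch with hypothetical file names, not part of
                     # the original file: draw two regions on top of a user-provided
                     # background image.
                     #
                     # >>> background = Image.open('beach.png')
                     # >>> masks = [Image.open('mask_castle.png'), Image.open('mask_dog.png')]
                     # >>> image = smd(
                     # >>>     prompts=['a sandcastle', 'a corgi wearing sunglasses'],
                     # >>>     negative_prompts='blurry, low quality',
                     # >>>     background=background,
                     # >>>     masks=masks,
                     # >>>     height=1024, width=1024,
                     # >>> )
                     # >>> image.save('beach_scene.png')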
| 754 | 
         
            +
             
     | 
| 755 | 
         
            +
                    ### Simplest cases
         
     | 
| 756 | 
         
            +
             
     | 
| 757 | 
         
            +
                    # prompts is None: return background.
         
     | 
| 758 | 
         
            +
                    # masks is None but prompts is not None: return prompts
         
     | 
| 759 | 
         
            +
                    # masks is not None and prompts is not None: Do StableMultiDiffusion.
         
     | 
| 760 | 
         
            +
             
     | 
| 761 | 
         
            +
                    if prompts is None or (isinstance(prompts, (list, tuple, str)) and len(prompts) == 0):
         
     | 
| 762 | 
         
            +
                        if background is None and background_prompt is not None:
         
     | 
| 763 | 
         
            +
                            return self.sample(background_prompt, background_negative_prompt, height, width, num_inference_steps, guidance_scale)
         
     | 
| 764 | 
         
            +
                        return background
         
     | 
| 765 | 
         
            +
                    elif masks is None or (isinstance(masks, (list, tuple)) and len(masks) == 0):
         
     | 
| 766 | 
         
            +
                        return self.sample(prompts, negative_prompts, height, width, num_inference_steps, guidance_scale)
         
     | 
| 767 | 
         
            +
             
     | 
| 768 | 
         
            +
             
     | 
| 769 | 
         
            +
                    ### Prepare generation
         
     | 
| 770 | 
         
            +
             
     | 
| 771 | 
         
            +
                    if num_inference_steps is not None:
         
     | 
| 772 | 
         
            +
                        self.prepare_flashflowmatch_schedule(list(range(num_inference_steps)), num_inference_steps)
         
     | 
| 773 | 
         
            +
             
     | 
| 774 | 
         
            +
                    if guidance_scale is None:
         
     | 
| 775 | 
         
            +
                        guidance_scale = self.default_guidance_scale
         
     | 
| 776 | 
         
            +
                    self.pipe._guidance_scale = guidance_scale
         
     | 
| 777 | 
         
            +
        self.pipe._clip_skip = clip_skip
        self.pipe._joint_attention_kwargs = joint_attention_kwargs
        self.pipe._interrupt = False
        do_classifier_free_guidance = guidance_scale > 1.0


        ### Prompts & Masks

        # asserts #m > 0 and #p > 0.
        # #m == #p == #n > 0: We happily generate according to the prompts & masks.
        # #m != #p: #p should be 1 and we will broadcast text embeds of p through m masks.
        # #p != #n: #n should be 1 and we will broadcast negative embeds n through p prompts.

        if isinstance(masks, Image.Image):
            masks = [masks]
        if isinstance(prompts, str):
            prompts = [prompts]
        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]
        num_masks = len(masks)
        num_prompts = len(prompts)
        num_nprompts = len(negative_prompts)
        assert num_prompts in (num_masks, 1), \
            f'The number of prompts {num_prompts} should match the number of masks {num_masks}!'
        assert num_nprompts in (num_prompts, 1), \
            f'The number of negative prompts {num_nprompts} should match the number of prompts {num_prompts}!'

        fg_masks, masks_g, std = self.process_mask(
            masks,
            mask_strengths,
            mask_stds,
            height=height,
            width=width,
            use_boolean_mask=use_boolean_mask,
            timesteps=self.timesteps,
            preprocess_mask_cover_alpha=preprocess_mask_cover_alpha,
        )  # (p, t, 1, H, W)
        bg_masks = (1 - fg_masks.sum(dim=0)).clip_(0, 1)  # (T, 1, h, w)
        has_background = bg_masks.sum() > 0

        h = (height + self.vae_scale_factor - 1) // self.vae_scale_factor
        w = (width + self.vae_scale_factor - 1) // self.vae_scale_factor


        ### Background

        # background == None && background_prompt == None: Initialize with white background.
        # background == None && background_prompt != None: Generate background *along with other prompts*.
        # background != None && background_prompt == None: Retrieve text prompt using BLIP.
        # background != None && background_prompt != None: Use the given arguments.

        # not has_background: no effect of prompt_strength (the mix ratio between fg prompt & bg prompt)
        # has_background && prompt_strength != 1: mix only for this case.

        bg_latent = None
        if has_background:
            if background is None and background_prompt is not None:
                fg_masks = torch.cat((bg_masks[None], fg_masks), dim=0)
                if suffix is not None:
                    prompts = [p + suffix + background_prompt for p in prompts]
                prompts = [background_prompt] + prompts
                negative_prompts = [background_negative_prompt] + negative_prompts
                has_background = False  # Regard that background does not exist.
            else:
                if background is None and background_prompt is None:
                    background = torch.ones(1, 3, height, width, dtype=self.dtype, device=self.device)
                    background_prompt = 'simple white background image'
                elif background is not None and background_prompt is None:
                    background_prompt = self.get_text_prompts(background)
                if suffix is not None:
                    prompts = [p + suffix + background_prompt for p in prompts]
                prompts = [background_prompt] + prompts
                negative_prompts = [background_negative_prompt] + negative_prompts
                if isinstance(background, Image.Image):
                    background = T.ToTensor()(background).to(dtype=self.dtype, device=self.device)[None]
                background = F.interpolate(background, size=(height, width), mode='bicubic', align_corners=False)
                bg_latent = self.encode_imgs(background)

        # Bootstrapping stage preparation.

        if bootstrap_steps is None:
            bootstrap_steps = self.default_bootstrap_steps
        if boostrap_mix_steps is None:
            boostrap_mix_steps = self.default_boostrap_mix_steps
        if bootstrap_leak_sensitivity is None:
            bootstrap_leak_sensitivity = self.default_bootstrap_leak_sensitivity
        if bootstrap_steps > 0:
            height_ = min(height, tile_size)
            width_ = min(width, tile_size)
            white = self.get_white_background(height, width)  # (1, 4, h, w)


        ### Prepare text embeddings (optimized for the minimal encoder batch size)

        # SD3 pipeline settings.
        batch_size = 1
        num_images_per_prompt = 1

        original_size = (height, width)
        target_size = (height, width)
        crops_coords_top_left = (0, 0)
        negative_original_size = None
        negative_target_size = None
        negative_crops_coords_top_left = (0, 0)

        prompt_2 = None
        prompt_3 = None
        negative_prompt_2 = None
        negative_prompt_3 = None
        prompt_embeds = None
        negative_prompt_embeds = None
        pooled_prompt_embeds = None
        negative_pooled_prompt_embeds = None
        text_encoder_lora_scale = None

        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.pipe.encode_prompt(
            prompt=prompts,
            prompt_2=prompt_2,
            prompt_3=prompt_3,
            negative_prompt=negative_prompts,
            negative_prompt_2=negative_prompt_2,
            negative_prompt_3=negative_prompt_3,
            do_classifier_free_guidance=do_classifier_free_guidance,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            device=self.device,
            clip_skip=self.pipe.clip_skip,
            num_images_per_prompt=num_images_per_prompt,
        )

        if has_background:
            # First channel is background prompt text embeds. Background prompt itself is not used for generation.
            s = prompt_strengths
            if prompt_strengths is None:
                s = self.default_prompt_strength
            if isinstance(s, (int, float)):
                s = [s] * num_prompts
            if isinstance(s, (list, tuple)):
                assert len(s) == num_prompts, \
                    f'The number of prompt strengths {len(s)} should match the number of prompts {num_prompts}!'
                s = torch.as_tensor(s, dtype=self.dtype, device=self.device)
            s = s[:, None, None]

            be = prompt_embeds[:1]
            fe = prompt_embeds[1:]
            prompt_embeds = torch.lerp(be, fe, s)  # (p, 77, 1024)

            if negative_prompt_embeds is not None:
                bu = negative_prompt_embeds[:1]
                fu = negative_prompt_embeds[1:]
                if num_prompts > num_nprompts:
                    # #negative prompts == 1; #prompts > 1.
                    assert fu.shape[0] == 1 and fe.shape[0] == num_prompts
                    fu = fu.repeat(num_prompts, 1, 1)
                negative_prompt_embeds = torch.lerp(bu, fu, s)  # (n, 77, 1024)

            be = pooled_prompt_embeds[:1]
            fe = pooled_prompt_embeds[1:]
            pooled_prompt_embeds = torch.lerp(be, fe, s[..., 0])  # (p, 1280)

            if negative_pooled_prompt_embeds is not None:
                bu = negative_pooled_prompt_embeds[:1]
                fu = negative_pooled_prompt_embeds[1:]
                if num_prompts > num_nprompts:
                    # #negative prompts == 1; #prompts > 1.
                    assert fu.shape[0] == 1 and fe.shape[0] == num_prompts
                    fu = fu.repeat(num_prompts, 1)
                negative_pooled_prompt_embeds = torch.lerp(bu, fu, s[..., 0])  # (n, 1280)
        elif negative_prompt_embeds is not None and num_prompts > num_nprompts:
            # #negative prompts == 1; #prompts > 1.
            assert negative_prompt_embeds.shape[0] == 1 and prompt_embeds.shape[0] == num_prompts
            negative_prompt_embeds = negative_prompt_embeds.repeat(num_prompts, 1, 1)

            assert negative_pooled_prompt_embeds.shape[0] == 1 and pooled_prompt_embeds.shape[0] == num_prompts
            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(num_prompts, 1)
        # assert negative_prompt_embeds.shape[0] == prompt_embeds.shape[0] == num_prompts
        if num_masks > num_prompts:
            assert masks.shape[0] == num_masks and num_prompts == 1
            prompt_embeds = prompt_embeds.repeat(num_masks, 1, 1)
            if negative_prompt_embeds is not None:
                negative_prompt_embeds = negative_prompt_embeds.repeat(num_masks, 1, 1)

            pooled_prompt_embeds = pooled_prompt_embeds.repeat(num_masks, 1)
            if negative_pooled_prompt_embeds is not None:
                negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(num_masks, 1)

        # SD3 pipeline settings.
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
        del negative_prompt_embeds, negative_pooled_prompt_embeds

        prompt_embeds = prompt_embeds.to(self.device)
        pooled_prompt_embeds = pooled_prompt_embeds.to(self.device)


        ### Run

        # Latent initialization.
        num_channels_latents = self.transformer.config.in_channels
        noise = torch.randn((1, num_channels_latents, h, w), dtype=self.dtype, device=self.device)
        if self.timesteps[0] < 999 and has_background:
            latent = self.scheduler_add_noise(bg_latent, noise, 0)
        else:
            noise = torch.randn((1, num_channels_latents, h, w), dtype=self.dtype, device=self.device)
            latent = noise

        if has_background:
            noise_bg_latents = [
                self.scheduler_add_noise(bg_latent, noise, i) for i in range(len(self.timesteps))
            ] + [bg_latent]

        # Tiling (if needed).
        if height > tile_size or width > tile_size:
            t = (tile_size + self.vae_scale_factor - 1) // self.vae_scale_factor
            views, tile_masks = get_panorama_views(h, w, t)
            tile_masks = tile_masks.to(self.device)
        else:
            views = [(0, h, 0, w)]
            tile_masks = latent.new_ones((1, 1, h, w))
        value = torch.zeros_like(latent)
        count_all = torch.zeros_like(latent)

        with torch.autocast('cuda'):
            for i, t in enumerate(tqdm(self.timesteps)):
                if self.pipe.interrupt:
                    continue

                fg_mask = fg_masks[:, i]
                bg_mask = bg_masks[i:i + 1]

                value.zero_()
                count_all.zero_()
                for j, (h_start, h_end, w_start, w_end) in enumerate(views):
                    fg_mask_ = fg_mask[..., h_start:h_end, w_start:w_end]
                    latent_ = latent[..., h_start:h_end, w_start:w_end].repeat(num_masks, 1, 1, 1)

                    # Bootstrap for tight background.
                    if i < bootstrap_steps:
                        mix_ratio = min(1, max(0, boostrap_mix_steps - i))
                        # Treat the first foreground latent as the background latent if one does not exist.
                        bg_latent_ = noise_bg_latents[i][..., h_start:h_end, w_start:w_end] if has_background else latent_[:1]
                        white_ = white[..., h_start:h_end, w_start:w_end]
                        white_ = self.scheduler_add_noise(white_, noise[..., h_start:h_end, w_start:w_end], i)
                        bg_latent_ = mix_ratio * white_ + (1.0 - mix_ratio) * bg_latent_
                        latent_ = (1.0 - fg_mask_) * bg_latent_ + fg_mask_ * latent_

                        # Centering.
                        latent_ = shift_to_mask_bbox_center(latent_, fg_mask_, reverse=True)

                    # Expand the latents if we are doing classifier-free guidance.
                    latent_model_input = torch.cat([latent_] * 2) if do_classifier_free_guidance else latent_
                    # Broadcast to batch dimension in a way that's compatible with ONNX/Core ML.
                    timestep = t.expand(latent_model_input.shape[0])

                    # Perform one step of the reverse diffusion.
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        timestep=timestep,
                        encoder_hidden_states=prompt_embeds,
                        pooled_projections=pooled_prompt_embeds,
                        joint_attention_kwargs=joint_attention_kwargs,
                        return_dict=False,
                    )[0]

                    if do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    if do_classifier_free_guidance and guidance_rescale > 0.0:
                        # Based on Section 3.4 of https://arxiv.org/pdf/2305.08891.pdf
                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_cond, guidance_rescale=guidance_rescale)

                    latent_ = self.scheduler_step(noise_pred, i, latent_)

                    if i < bootstrap_steps:
                        # Uncentering.
                        latent_ = shift_to_mask_bbox_center(latent_, fg_mask_)

                        # Remove leakage (optional).
                        leak = (latent_ - bg_latent_).pow(2).mean(dim=1, keepdim=True)
                        leak_sigmoid = torch.sigmoid(leak / bootstrap_leak_sensitivity) * 2 - 1
                        fg_mask_ = fg_mask_ * leak_sigmoid

                    # Mix the latents.
                    fg_mask_ = fg_mask_ * tile_masks[:, j:j + 1, h_start:h_end, w_start:w_end]
                    value[..., h_start:h_end, w_start:w_end] += (fg_mask_ * latent_).sum(dim=0, keepdim=True)
                    count_all[..., h_start:h_end, w_start:w_end] += fg_mask_.sum(dim=0, keepdim=True)

                latent = torch.where(count_all > 0, value / count_all, value)
                bg_mask = (1 - count_all).clip_(0, 1)  # (T, 1, h, w)
                if has_background:
                    latent = (1 - bg_mask) * latent + bg_mask * noise_bg_latents[i + 1]  # bg_latent

                # Noise is added after mixing.
                if i < len(self.timesteps) - 1:
                    latent = self.scheduler_add_noise(latent, None, i + 1)

        if not output_type == "latent":
            latent = (latent / self.vae.config.scaling_factor) + self.vae.config.shift_factor
            image = self.vae.decode(latent, return_dict=False)[0]
        else:
            image = latent

        # Return PIL Image.
        image = image[0].clip_(-1, 1) * 0.5 + 0.5
        if has_background and do_blend:
            fg_mask = torch.sum(masks_g, dim=0).clip_(0, 1)
            image = blend(image, background[0], fg_mask)
        else:
            image = T.ToPILImage()(image)
        return image
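For orientation, the call tail above composes one latent per prompt under its mask at every denoising step and merges them into a single canvas. The snippet below is only a minimal usage sketch: the instance name `model`, the example mask files, and the numeric settings are assumptions for illustration, while the keyword arguments (prompts, negative_prompts, masks, background_prompt, height, width, guidance_scale, bootstrap_steps) are the ones consumed by the code shown in this hunk.

from PIL import Image

# `model` is assumed to be an instance of the pipeline wrapper defined earlier
# in model.py (its constructor is outside this hunk).
mask_sky = Image.open('examples/mask_sky.png')        # hypothetical mask files
mask_castle = Image.open('examples/mask_castle.png')

image = model(
    prompts=['clear blue sky', 'a grand medieval castle'],
    negative_prompts='worst quality, low quality',     # single negative, broadcast over both prompts
    masks=[mask_sky, mask_castle],                      # one mask per prompt
    background_prompt='fantasy landscape, matte painting',
    height=1024,
    width=1024,
    guidance_scale=7.0,                                 # > 1.0 enables classifier-free guidance
    bootstrap_steps=2,                                  # tight-background bootstrapping steps
)
image.save('semantic_palette_result.png')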
    	
prompt_util.py
ADDED

@@ -0,0 +1,154 @@

from typing import Dict, List, Tuple, Union


quality_prompt_list = [
    {
        "name": "(None)",
        "prompt": "{prompt}",
        "negative_prompt": "nsfw, lowres",
    },
    {
        "name": "Standard v3.0",
        "prompt": "{prompt}, masterpiece, best quality",
        "negative_prompt": "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name",
    },
    {
        "name": "Standard v3.1",
        "prompt": "{prompt}, masterpiece, best quality, very aesthetic, absurdres",
        "negative_prompt": "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
    },
    {
        "name": "Light v3.1",
        "prompt": "{prompt}, (masterpiece), best quality, very aesthetic, perfect face",
        "negative_prompt": "nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
    },
    {
        "name": "Heavy v3.1",
        "prompt": "{prompt}, (masterpiece), (best quality), (ultra-detailed), very aesthetic, illustration, disheveled hair, perfect composition, moist skin, intricate details",
        "negative_prompt": "nsfw, longbody, lowres, bad anatomy, bad hands, missing fingers, pubic hair, extra digit, fewer digits, cropped, worst quality, low quality, very displeasing",
    },
]

style_list = [
    {
        "name": "(None)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "{prompt}, cinematic still, emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "nsfw, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "{prompt}, cinematic photo, 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "nsfw, drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "{prompt}, anime artwork, anime style, key visual, vibrant, studio anime, highly detailed",
        "negative_prompt": "nsfw, photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "{prompt}, manga style, vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "nsfw, ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "{prompt}, concept art, digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "nsfw, photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "{prompt}, pixel-art, low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "nsfw, sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "{prompt}, ethereal fantasy concept art, magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "nsfw, photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "{prompt}, neonpunk style, cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
        "negative_prompt": "nsfw, painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "{prompt}, professional 3d model, octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "nsfw, ugly, deformed, noisy, low poly, blurry, painting",
    },
]


_style_dict = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
_quality_dict = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in quality_prompt_list}


def preprocess_prompt(
    positive: str,
    negative: str = "",
    style_dict: Dict[str, Tuple[str, str]] = _quality_dict,
    style_name: str = "Standard v3.1",  # "Heavy v3.1"
    add_style: bool = True,
) -> Tuple[str, str]:
    p, n = style_dict.get(style_name, style_dict["(None)"])

    if add_style and positive.strip():
        formatted_positive = p.format(prompt=positive)
    else:
        formatted_positive = positive

    combined_negative = n
    if negative.strip():
        if combined_negative:
            combined_negative += ", " + negative
        else:
            combined_negative = negative

    return formatted_positive, combined_negative


def preprocess_prompts(
    positives: List[str],
    negatives: List[str] = None,
    style_dict = _style_dict,
    style_name: str = "Manga",  # "(None)"
    quality_dict = _quality_dict,
    quality_name: str = "Standard v3.1",  # "Heavy v3.1"
    add_style: bool = True,
    add_quality_tags = True,
) -> Tuple[List[str], List[str]]:
    if negatives is None:
        negatives = ['' for _ in positives]

    positives_ = []
    negatives_ = []
    for pos, neg in zip(positives, negatives):
        pos, neg = preprocess_prompt(pos, neg, quality_dict, quality_name, add_quality_tags)
        pos, neg = preprocess_prompt(pos, neg, style_dict, style_name, add_style)
        positives_.append(pos)
        negatives_.append(neg)
    return positives_, negatives_


def print_prompts(
    positives: Union[str, List[str]],
    negatives: Union[str, List[str]],
    has_background: bool = False,
) -> None:
    if isinstance(positives, str):
        positives = [positives]
    if isinstance(negatives, str):
        negatives = [negatives]

    for i, prompt in enumerate(positives):
        prefix = ((f'Prompt{i}' if i > 0 else 'Background Prompt')
                  if has_background else f'Prompt{i + 1}')
        print(prefix + ': ' + prompt)
    for i, prompt in enumerate(negatives):
        prefix = ((f'Negative Prompt{i}' if i > 0 else 'Background Negative Prompt')
                  if has_background else f'Negative Prompt{i + 1}')
        print(prefix + ': ' + prompt)
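A short usage sketch of the helpers above; the prompt strings are illustrative, and the style and quality names come from the lists defined in this file.

from prompt_util import preprocess_prompts, print_prompts

positives = ['1girl looking at viewer, cherry blossoms', 'night sky with stars']
negatives = ['bad hands', '']

# Each prompt is first wrapped with the "Standard v3.1" quality template and then
# the "Manga" style template; per-prompt negatives are appended to the template negatives.
pos, neg = preprocess_prompts(
    positives,
    negatives,
    style_name='Manga',
    quality_name='Standard v3.1',
)
print_prompts(pos, neg, has_background=False)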
    	
requirements.txt
ADDED

@@ -0,0 +1,16 @@

torch==2.0.1
torchvision
xformers==0.0.22
einops
diffusers @ git+https://github.com/initml/diffusers.git@clement/feature/flash_sd3
transformers
huggingface_hub[torch]
gradio==4.39.0
Pillow
emoji
numpy
tqdm
jupyterlab
peft>=0.10.0
sentencepiece
protobuf
    	
        share_btn.py
    ADDED
    
    | 
         @@ -0,0 +1,70 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
share_js = """async () => {
    async function uploadFile(file) {
        const UPLOAD_URL = 'https://huggingface.co/uploads';
        const response = await fetch(UPLOAD_URL, {
            method: 'POST',
            headers: {
                'Content-Type': file.type,
                'X-Requested-With': 'XMLHttpRequest',
            },
            body: file, /// <- File inherits from Blob
        });
        const url = await response.text();
        return url;
    }
    async function getBase64(file) {
        var reader = new FileReader();
        reader.readAsDataURL(file);
        reader.onload = function () {
            console.log(reader.result);
        };
        reader.onerror = function (error) {
            console.log('Error: ', error);
        };
    }
    const toDataURL = url => fetch(url)
        .then(response => response.blob())
        .then(blob => new Promise((resolve, reject) => {
            const reader = new FileReader()
            reader.onloadend = () => resolve(reader.result)
            reader.onerror = reject
            reader.readAsDataURL(blob)
        }));
    async function dataURLtoFile(dataurl, filename) {
        var arr = dataurl.split(','), mime = arr[0].match(/:(.*?);/)[1],
        bstr = atob(arr[1]), n = bstr.length, u8arr = new Uint8Array(n);
        while (n--) {
            u8arr[n] = bstr.charCodeAt(n);
        }
        return new File([u8arr], filename, {type:mime});
    };

    const gradioEl = document.querySelector('body > gradio-app');
    const imgEls = gradioEl.querySelectorAll('#output-screen img');
    if(!imgEls.length){
        return;
    };

    const urls = await Promise.all([...imgEls].map((imgEl) => {
        const origURL = imgEl.src;
        const imgId = Date.now() % 200;
        const fileName = 'semantic-palette-xl-' + imgId + '.png';
        return toDataURL(origURL)
            .then(dataUrl => {
                return dataURLtoFile(dataUrl, fileName);
            })
        })).then(fileData => {return Promise.all([...fileData].map((file) => {
            return uploadFile(file);
        }))});

    const htmlImgs = urls.map(url => `<img src='${url}' width='2560' height='1024'>`);
    const descriptionMd = `<div style='display: flex; flex-wrap: wrap; column-gap: 0.75rem;'>
${htmlImgs.join(`\n`)}
</div>`;
    const params = new URLSearchParams({
        title: `My creation`,
        description: descriptionMd,
    });
    const paramsStr = params.toString();
    window.open(`https://huggingface.co/spaces/ironjr/SemanticPaletteXL/discussions/new?${paramsStr}`, '_blank');
}"""
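For reference, a minimal sketch of how a snippet like share_js is typically attached to a Gradio button. This is illustrative only: the component names and layout below are assumptions, not taken from app.py. The one real constraint is that the output component must carry elem_id='output-screen', because the JS above queries document.querySelectorAll('#output-screen img').

    # Hypothetical wiring sketch (not part of this commit's app.py).
    import gradio as gr
    from share_btn import share_js

    with gr.Blocks() as demo:
        # The result viewer must use elem_id='output-screen' for share_js to find it.
        output = gr.Image(elem_id='output-screen')
        share_button = gr.Button('Share to community')
        # fn=None: nothing runs on the server; Gradio executes the JS in the browser.
        share_button.click(fn=None, inputs=None, outputs=None, js=share_js)

    demo.launch()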
    	
util.py
ADDED
@@ -0,0 +1,315 @@
# Copyright (c) 2024 Jaerin Lee

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import concurrent.futures
import time
from typing import Any, Callable, List, Literal, Tuple, Union

from PIL import Image
import numpy as np

import torch
import torch.nn.functional as F
import torch.cuda.amp as amp
import torchvision.transforms as T
import torchvision.transforms.functional as TF

from diffusers import (
    DiffusionPipeline,
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
)

def seed_everything(seed: int) -> None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


def load_model(
    model_key: str,
    sd_version: Literal['1.5', 'xl'],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.nn.Module:
    if model_key.endswith('.safetensors'):
        if sd_version == '1.5':
            pipeline = StableDiffusionPipeline
        elif sd_version == 'xl':
            pipeline = StableDiffusionXLPipeline
        else:
            raise ValueError(f'Stable Diffusion version {sd_version} not supported.')
        return pipeline.from_single_file(model_key, torch_dtype=dtype).to(device)
    try:
        return DiffusionPipeline.from_pretrained(model_key, variant='fp16', torch_dtype=dtype).to(device)
    except:
        return DiffusionPipeline.from_pretrained(model_key, variant=None, torch_dtype=dtype).to(device)

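A minimal usage sketch for seed_everything and load_model (illustrative, not part of util.py; the checkpoint id below is just an example, not necessarily what app.py loads):

    import torch
    from util import seed_everything, load_model

    seed_everything(2024)
    # Hub repository ids go through DiffusionPipeline.from_pretrained; a local
    # '.safetensors' path would instead dispatch on sd_version ('1.5' or 'xl').
    pipe = load_model(
        'stabilityai/stable-diffusion-xl-base-1.0',  # example model key
        sd_version='xl',
        device=torch.device('cuda'),
        dtype=torch.float16,
    )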
def get_cutoff(cutoff: float = None, scale: float = None) -> float:
    if cutoff is not None:
        return cutoff

    if scale is not None and cutoff is None:
        return 0.5 / scale

    raise ValueError('Either one of `cutoff`, or `scale` should be specified.')


def get_scale(cutoff: float = None, scale: float = None) -> float:
    if scale is not None:
        return scale

    if cutoff is not None and scale is None:
        return 0.5 / cutoff

    raise ValueError('Either one of `cutoff`, or `scale` should be specified.')

def filter_2d_by_kernel_1d(x: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    assert len(k.shape) in (1,), 'Kernel size should be one of (1,).'
    #  assert len(k.shape) in (1, 2), 'Kernel size should be one of (1, 2).'

    b, c, h, w = x.shape
    ks = k.shape[-1]
    k = k.view(1, 1, -1).repeat(c, 1, 1)

    x = x.permute(0, 2, 1, 3)
    x = x.reshape(b * h, c, w)
    x = F.pad(x, (ks // 2, (ks - 1) // 2), mode='replicate')
    x = F.conv1d(x, k, groups=c)
    x = x.reshape(b, h, c, w).permute(0, 3, 2, 1).reshape(b * w, c, h)
    x = F.pad(x, (ks // 2, (ks - 1) // 2), mode='replicate')
    x = F.conv1d(x, k, groups=c)
    x = x.reshape(b, w, c, h).permute(0, 2, 3, 1)
    return x


def filter_2d_by_kernel_2d(x: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    assert len(k.shape) in (2, 3), 'Kernel size should be one of (2, 3).'

    x = F.pad(x, (
        k.shape[-2] // 2, (k.shape[-2] - 1) // 2,
        k.shape[-1] // 2, (k.shape[-1] - 1) // 2,
    ), mode='replicate')

    b, c, _, _ = x.shape
    if len(k.shape) == 2 or (len(k.shape) == 3 and k.shape[0] == 1):
        k = k.view(1, 1, *k.shape[-2:]).repeat(c, 1, 1, 1)
        x = F.conv2d(x, k, groups=c)
    elif len(k.shape) == 3:
        assert k.shape[0] == b, \
            'The number of kernels should match the batch size.'

        k = k.unsqueeze(1)
        x = F.conv2d(x.permute(1, 0, 2, 3), k, groups=b).permute(1, 0, 2, 3)
    return x


@amp.autocast(False)
def filter_by_kernel(
    x: torch.Tensor,
    k: torch.Tensor,
    is_batch: bool = False,
) -> torch.Tensor:
    k_dim = len(k.shape)
    if k_dim == 1 or k_dim == 2 and is_batch:
        return filter_2d_by_kernel_1d(x, k)
    elif k_dim == 2 or k_dim == 3 and is_batch:
        return filter_2d_by_kernel_2d(x, k)
    else:
        raise ValueError('Kernel size should be one of (1, 2, 3).')

def gen_gauss_lowpass_filter_2d(
    std: torch.Tensor,
    window_size: int = None,
) -> torch.Tensor:
    # Gaussian kernel size is odd in order to preserve the center.
    if window_size is None:
        window_size = (
            2 * int(np.ceil(3 * std.max().detach().cpu().numpy())) + 1)

    y = torch.arange(
        window_size, dtype=std.dtype, device=std.device
    ).view(-1, 1).repeat(1, window_size)
    grid = torch.stack((y.t(), y), dim=-1)
    grid -= 0.5 * (window_size - 1) # (W, W)
    var = (std * std).unsqueeze(-1).unsqueeze(-1)
    distsq = (grid * grid).sum(dim=-1).unsqueeze(0).repeat(*std.shape, 1, 1)
    k = torch.exp(-0.5 * distsq / var)
    k /= k.sum(dim=(-2, -1), keepdim=True)
    return k


def gaussian_lowpass(
    x: torch.Tensor,
    std: Union[float, Tuple[float], torch.Tensor] = None,
    cutoff: Union[float, torch.Tensor] = None,
    scale: Union[float, torch.Tensor] = None,
) -> torch.Tensor:
    if std is None:
        cutoff = get_cutoff(cutoff, scale)
        std = 0.5 / (np.pi * cutoff)
    if isinstance(std, (float, int)):
        std = (std, std)
    if isinstance(std, torch.Tensor):
        """Using nn.functional.conv2d with Gaussian kernels built in runtime is
        80% faster than transforms.functional.gaussian_blur for individual
        items.

        (in GPU); However, in CPU, the result is exactly opposite. But you
        won't gonna run this on CPU, right?
        """
        if len(list(s for s in std.shape if s != 1)) >= 2:
            raise NotImplementedError(
                'Anisotropic Gaussian filter is not currently available.')

        # k.shape == (B, W, W).
        k = gen_gauss_lowpass_filter_2d(std=std.view(-1))
        if k.shape[0] == 1:
            return filter_by_kernel(x, k[0], False)
        else:
            return filter_by_kernel(x, k, True)
    else:
        # Gaussian kernel size is odd in order to preserve the center.
        window_size = tuple(2 * int(np.ceil(3 * s)) + 1 for s in std)
        return TF.gaussian_blur(x, window_size, std)

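A minimal sketch of the two gaussian_lowpass code paths (illustrative, not part of util.py): a scalar std falls back to torchvision's gaussian_blur with the same sigma for the whole batch, while a per-sample std tensor builds one Gaussian kernel per batch item and filters with grouped convolutions:

    import torch
    from util import gaussian_lowpass

    x = torch.rand(2, 3, 64, 64, device='cuda')
    # Scalar std: same blur for every item (torchvision path).
    blurred = gaussian_lowpass(x, std=2.0)
    # Tensor std: one runtime-built kernel per batch item (conv2d path).
    blurred_per_item = gaussian_lowpass(x, std=torch.tensor([1.0, 4.0], device='cuda'))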
def blend(
    fg: Union[torch.Tensor, Image.Image],
    bg: Union[torch.Tensor, Image.Image],
    mask: Union[torch.Tensor, Image.Image],
    std: float = 0.0,
) -> Image.Image:
    if not isinstance(fg, torch.Tensor):
        fg = T.ToTensor()(fg)
    if not isinstance(bg, torch.Tensor):
        bg = T.ToTensor()(bg)
    if not isinstance(mask, torch.Tensor):
        mask = (T.ToTensor()(mask) < 0.5).float()[:1]
    if std > 0:
        mask = gaussian_lowpass(mask[None], std)[0].clip_(0, 1)
    return T.ToPILImage()(fg * mask + bg * (1 - mask))

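A minimal usage sketch for blend (illustrative; the file names are placeholders, and all three images are assumed to share the same size). When a PIL mask is given, dark pixels (< 0.5) select the foreground, and std > 0 feathers the seam with the Gaussian low-pass above:

    from PIL import Image
    from util import blend

    fg = Image.open('foreground.png').convert('RGB')   # placeholder path
    bg = Image.open('background.png').convert('RGB')   # placeholder path
    mask = Image.open('mask.png').convert('L')         # placeholder path
    composite = blend(fg, bg, mask, std=2.0)
    composite.save('blended.png')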
def get_panorama_views(
    panorama_height: int,
    panorama_width: int,
    window_size: int = 64,
) -> tuple[List[Tuple[int]], torch.Tensor]:
    stride = window_size // 2
    is_horizontal = panorama_width > panorama_height
    num_blocks_height = (panorama_height - window_size + stride - 1) // stride + 1
    num_blocks_width = (panorama_width - window_size + stride - 1) // stride + 1
    total_num_blocks = num_blocks_height * num_blocks_width

    half_fwd = torch.linspace(0, 1, (window_size + 1) // 2)
    half_rev = half_fwd.flip(0)
    if window_size % 2 == 1:
        half_rev = half_rev[1:]
    c = torch.cat((half_fwd, half_rev))
    one = torch.ones_like(c)
    f = c.clone()
    f[:window_size // 2] = 1
    b = c.clone()
    b[-(window_size // 2):] = 1

    h = [one] if num_blocks_height == 1 else [f] + [c] * (num_blocks_height - 2) + [b]
    w = [one] if num_blocks_width == 1 else [f] + [c] * (num_blocks_width - 2) + [b]

    views = []
    masks = torch.zeros(total_num_blocks, panorama_height, panorama_width) # (n, h, w)
    for i in range(total_num_blocks):
        hi, wi = i // num_blocks_width, i % num_blocks_width
        h_start = hi * stride
        h_end = min(h_start + window_size, panorama_height)
        w_start = wi * stride
        w_end = min(w_start + window_size, panorama_width)
        views.append((h_start, h_end, w_start, w_end))

        h_width = h_end - h_start
        w_width = w_end - w_start
        masks[i, h_start:h_end, w_start:w_end] = h[hi][:h_width, None] * w[wi][None, :w_width]

    # Sum of the mask weights at each pixel `masks.sum(dim=1)` must be unity.
    return views, masks[None] # (1, n, h, w)

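A minimal sketch for get_panorama_views (illustrative, not part of util.py): a wide canvas is tiled into overlapping windows, and the returned masks taper linearly in the overlaps so the per-window weights sum to one at every position. For example, a 64x256 latent grid (a 512x2048 image at 8x downsampling) with 64x64 windows:

    from util import get_panorama_views

    views, masks = get_panorama_views(64, 256, window_size=64)
    # views: list of (h_start, h_end, w_start, w_end) latent-space crops
    # masks: (1, num_windows, 64, 256) blending weights; masks.sum(dim=1) == 1
    print(len(views), masks.shape)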
def shift_to_mask_bbox_center(im: torch.Tensor, mask: torch.Tensor, reverse: bool = False) -> List[int]:
    h, w = mask.shape[-2:]
    device = mask.device
    mask = mask.reshape(-1, h, w)
    # assert mask.shape[0] == im.shape[0]
    h_occupied = mask.sum(dim=-2) > 0
    w_occupied = mask.sum(dim=-1) > 0
    l = torch.argmax(h_occupied * torch.arange(w, 0, -1).to(device), 1, keepdim=True).cpu()
    r = torch.argmax(h_occupied * torch.arange(w).to(device), 1, keepdim=True).cpu()
    t = torch.argmax(w_occupied * torch.arange(h, 0, -1).to(device), 1, keepdim=True).cpu()
    b = torch.argmax(w_occupied * torch.arange(h).to(device), 1, keepdim=True).cpu()
    tb = (t + b + 1) // 2
    lr = (l + r + 1) // 2
    shifts = (tb - (h // 2), lr - (w // 2))
    shifts = torch.cat(shifts, dim=1) # (p, 2)
    if reverse:
        shifts = shifts * -1
    return torch.stack([i.roll(shifts=s.tolist(), dims=(-2, -1)) for i, s in zip(im, shifts)], dim=0)

class Streamer:
    def __init__(self, fn: Callable, ema_alpha: float = 0.9) -> None:
        self.fn = fn
        self.ema_alpha = ema_alpha

        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        self.future = self.executor.submit(fn)
        self.image = None

        self.prev_exec_time = 0
        self.ema_exec_time = 0

    @property
    def throughput(self) -> float:
        return 1.0 / self.ema_exec_time if self.ema_exec_time else float('inf')

    def timed_fn(self) -> Any:
        start = time.time()
        res = self.fn()
        end = time.time()
        self.prev_exec_time = end - start
        self.ema_exec_time = self.ema_exec_time * self.ema_alpha + self.prev_exec_time * (1 - self.ema_alpha)
        return res

    def __call__(self) -> Any:
        if self.future.done() or self.image is None:
            # get the result (the new image) and start a new task
            image = self.future.result()
            self.future = self.executor.submit(self.timed_fn)
            self.image = image
            return image
        else:
            # if self.fn() is not ready yet, use the previous image
            # NOTE: This assumes that we have access to a previously generated image here.
            # If there's no previous image (i.e., this is the first invocation), you could fall
            # back to some default image or handle it differently based on your requirements.
            return self.image
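A minimal sketch for Streamer (illustrative, not part of util.py): it keeps a single background worker running the generation function and always returns the most recent finished result, so a polling UI loop never blocks on a slow step. Note that the very first call does wait for the first result, since there is no previous image to fall back on:

    import time
    from util import Streamer

    def render_frame():
        time.sleep(0.5)  # stand-in for one (slow) generation step
        return 'frame'

    streamer = Streamer(render_frame)
    for _ in range(5):
        frame = streamer()  # new frame if ready, otherwise the previous one
        print(frame, f'{streamer.throughput:.2f} it/s')
        time.sleep(0.1)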