Commit 2cf789d
Parent(s): 7193dd9
Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
- .gitattributes +2 -0
- .gitignore +1 -0
- README.md +14 -0
- __pycache__/util.cpython-310.pyc +0 -0
- __pycache__/util.cpython-311.pyc +0 -0
- app.py +245 -0
- checkpoints/AEs/AE_inpainting_2.safetensors +3 -0
- checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt +3 -0
- checkpoints/st-step=100000+la-step=100000-v1.ckpt +3 -0
- configs/demo.yaml +29 -0
- configs/test/textdesign_sd_2.yaml +137 -0
- demo/examples/DIRTY_0_0.png +0 -0
- demo/examples/ENGINE_0_0.png +0 -0
- demo/examples/FAVOURITE_0_0.jpeg +0 -0
- demo/examples/FRONTIER_0_0.png +0 -0
- demo/examples/Peaceful_0_0.jpeg +0 -0
- demo/examples/Scamps_0_0.png +0 -0
- demo/examples/TREE_0_0.png +0 -0
- demo/examples/better_0_0.jpg +0 -0
- demo/examples/tested_0_0.png +0 -0
- demo/teaser.png +3 -0
- requirements.txt +28 -0
- sgm/__init__.py +2 -0
- sgm/__pycache__/__init__.cpython-310.pyc +0 -0
- sgm/__pycache__/__init__.cpython-311.pyc +0 -0
- sgm/__pycache__/lr_scheduler.cpython-311.pyc +0 -0
- sgm/__pycache__/util.cpython-310.pyc +0 -0
- sgm/__pycache__/util.cpython-311.pyc +0 -0
- sgm/lr_scheduler.py +135 -0
- sgm/models/__init__.py +2 -0
- sgm/models/__pycache__/__init__.cpython-310.pyc +0 -0
- sgm/models/__pycache__/__init__.cpython-311.pyc +0 -0
- sgm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
- sgm/models/__pycache__/autoencoder.cpython-311.pyc +0 -0
- sgm/models/__pycache__/diffusion.cpython-310.pyc +0 -0
- sgm/models/__pycache__/diffusion.cpython-311.pyc +0 -0
- sgm/models/autoencoder.py +335 -0
- sgm/models/diffusion.py +328 -0
- sgm/modules/__init__.py +6 -0
- sgm/modules/__pycache__/__init__.cpython-310.pyc +0 -0
- sgm/modules/__pycache__/__init__.cpython-311.pyc +0 -0
- sgm/modules/__pycache__/attention.cpython-310.pyc +0 -0
- sgm/modules/__pycache__/attention.cpython-311.pyc +0 -0
- sgm/modules/__pycache__/ema.cpython-310.pyc +0 -0
- sgm/modules/__pycache__/ema.cpython-311.pyc +0 -0
- sgm/modules/attention.py +976 -0
- sgm/modules/autoencoding/__init__.py +0 -0
- sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc +0 -0
- sgm/modules/autoencoding/__pycache__/__init__.cpython-311.pyc +0 -0
- sgm/modules/autoencoding/losses/__init__.py +246 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+/demo/**/* filter=lfs diff=lfs merge=lfs -text
+checkpoints/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache__
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: UDiffText
emoji: 😋
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: 3.41.0
python_version: 3.11.4
app_file: app.py
pinned: true
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/util.cpython-310.pyc
ADDED
Binary file (1.77 kB).
__pycache__/util.cpython-311.pyc
ADDED
Binary file (3.01 kB).
app.py
ADDED
@@ -0,0 +1,245 @@
# -- coding: utf-8 --**
import cv2
import torch
import os, glob
import numpy as np
import gradio as gr
from PIL import Image
from omegaconf import OmegaConf
from contextlib import nullcontext
from pytorch_lightning import seed_everything
from os.path import join as ospj
from random import randint
from torchvision.utils import save_image
from torchvision.transforms import Resize

from util import *


def process(image, mask):

    img_h, img_w = image.shape[:2]

    mask = mask[...,:1]//255
    contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) != 1: raise gr.Error("One masked area only!")

    m_x, m_y, m_w, m_h = cv2.boundingRect(contours[0])
    c_x, c_y = m_x + m_w//2, m_y + m_h//2

    if img_w > img_h:
        if m_w > img_h: raise gr.Error("Illegal mask area!")
        if c_x < img_w - c_x:
            c_l = max(0, c_x - img_h//2)
            c_r = c_l + img_h
        else:
            c_r = min(img_w, c_x + img_h//2)
            c_l = c_r - img_h
        image = image[:,c_l:c_r,:]
        mask = mask[:,c_l:c_r,:]
    else:
        if m_h > img_w: raise gr.Error("Illegal mask area!")
        if c_y < img_h - c_y:
            c_t = max(0, c_y - img_w//2)
            c_b = c_t + img_w
        else:
            c_b = min(img_h, c_y + img_w//2)
            c_t = c_b - img_w
        image = image[c_t:c_b,:,:]
        mask = mask[c_t:c_b,:,:]

    image = torch.from_numpy(image.transpose(2,0,1)).to(dtype=torch.float32) / 127.5 - 1.0
    mask = torch.from_numpy(mask.transpose(2,0,1)).to(dtype=torch.float32)

    image = resize(image[None])[0]
    mask = resize(mask[None])[0]
    masked = image * (1 - mask)

    return image, mask, masked


def predict(cfgs, model, sampler, batch):

    context = nullcontext if cfgs.aae_enabled else torch.no_grad

    with context():

        batch, batch_uc_1 = prepare_batch(cfgs, batch)

        c, uc_1 = model.conditioner.get_unconditional_conditioning(
            batch,
            batch_uc=batch_uc_1,
            force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
        )

        x = sampler.get_init_noise(cfgs, model, cond=c, batch=batch, uc=uc_1)
        samples_z = sampler(model, x, cond=c, batch=batch, uc=uc_1, init_step=0,
                            aae_enabled = cfgs.aae_enabled, detailed = cfgs.detailed)

        samples_x = model.decode_first_stage(samples_z)
        samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)

        return samples, samples_z


def demo_predict(input_blk, text, num_samples, steps, scale, seed, show_detail):

    global cfgs, global_index

    if len(text) < cfgs.txt_len[0] or len(text) > cfgs.txt_len[1]:
        raise gr.Error("Illegal text length!")

    global_index += 1

    if num_samples > 1: cfgs.noise_iters = 0

    cfgs.batch_size = num_samples
    cfgs.steps = steps
    cfgs.scale[0] = scale
    cfgs.detailed = show_detail
    seed_everything(seed)

    sampler.num_steps = steps
    sampler.guider.scale_value = scale

    image = input_blk["image"]
    mask = input_blk["mask"]

    image, mask, masked = process(image, mask)

    seg_mask = torch.cat((torch.ones(len(text)), torch.zeros(cfgs.seq_len-len(text))))

    # additional cond
    txt = f"\"{text}\""
    original_size_as_tuple = torch.tensor((cfgs.H, cfgs.W))
    crop_coords_top_left = torch.tensor((0, 0))
    target_size_as_tuple = torch.tensor((cfgs.H, cfgs.W))

    image = torch.tile(image[None], (num_samples, 1, 1, 1))
    mask = torch.tile(mask[None], (num_samples, 1, 1, 1))
    masked = torch.tile(masked[None], (num_samples, 1, 1, 1))
    seg_mask = torch.tile(seg_mask[None], (num_samples, 1))
    original_size_as_tuple = torch.tile(original_size_as_tuple[None], (num_samples, 1))
    crop_coords_top_left = torch.tile(crop_coords_top_left[None], (num_samples, 1))
    target_size_as_tuple = torch.tile(target_size_as_tuple[None], (num_samples, 1))

    text = [text for i in range(num_samples)]
    txt = [txt for i in range(num_samples)]
    name = [str(global_index) for i in range(num_samples)]

    batch = {
        "image": image,
        "mask": mask,
        "masked": masked,
        "seg_mask": seg_mask,
        "label": text,
        "txt": txt,
        "original_size_as_tuple": original_size_as_tuple,
        "crop_coords_top_left": crop_coords_top_left,
        "target_size_as_tuple": target_size_as_tuple,
        "name": name
    }

    samples, samples_z = predict(cfgs, model, sampler, batch)
    samples = samples.cpu().numpy().transpose(0, 2, 3, 1) * 255
    results = [Image.fromarray(sample.astype(np.uint8)) for sample in samples]

    if cfgs.detailed:
        sections = []
        attn_map = Image.open(f"./temp/attn_map/attn_map_{global_index}.png")
        seg_maps = np.load(f"./temp/seg_map/seg_{global_index}.npy")
        for i, seg_map in enumerate(seg_maps):
            seg_map = cv2.resize(seg_map, (cfgs.W, cfgs.H))
            sections.append((seg_map, text[0][i]))
        seg = (results[0], sections)
    else:
        attn_map = None
        seg = None

    return results, attn_map, seg


if __name__ == "__main__":

    os.makedirs("./temp", exist_ok=True)
    os.makedirs("./temp/attn_map", exist_ok=True)
    os.makedirs("./temp/seg_map", exist_ok=True)

    cfgs = OmegaConf.load("./configs/demo.yaml")

    model = init_model(cfgs)
    sampler = init_sampling(cfgs)
    global_index = 0
    resize = Resize((cfgs.H, cfgs.W))

    block = gr.Blocks().queue()
    with block:

        with gr.Row():

            gr.HTML(
                """
                <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
                <h1 style="font-weight: 600; font-size: 2rem; margin: 0.5rem;">
                    UDiffText: A Unified Framework for High-quality Text Synthesis in Arbitrary Images via Character-aware Diffusion Models
                </h1>
                <ul style="text-align: center; margin: 0.5rem;">
                    <li style="display: inline-block; margin:auto;"><a href='https://arxiv.org/abs/2312.04884'><img src='https://img.shields.io/badge/Arxiv-2312.04884-DF826C'></a></li>
                    <li style="display: inline-block; margin:auto;"><a href='https://github.com/ZYM-PKU/UDiffText'><img src='https://img.shields.io/badge/Code-UDiffText-D0F288'></a></li>
                    <li style="display: inline-block; margin:auto;"><a href='https://udifftext.github.io'><img src='https://img.shields.io/badge/Project-UDiffText-8ADAB2'></a></li>
                </ul>
                <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin: 0.5rem;">
                    Our proposed UDiffText is capable of synthesizing accurate and harmonious text in either synthetic or real-world images, and thus can be applied to tasks like scene text editing (a), arbitrary text generation (b) and accurate T2I generation (c)
                </h2>
                <div align=center><img src="file/demo/teaser.png" alt="UDiffText" width="80%"></div>
                </div>
                """
            )

        with gr.Row():

            with gr.Column():

                input_blk = gr.Image(source='upload', tool='sketch', type="numpy", label="Input", height=512)
                gr.Markdown("Notice: please draw horizontally to indicate only **one** masked area.")
                text = gr.Textbox(label="Text to render: (1~12 characters)", info="the text you want to render at the masked region")
                run_button = gr.Button(variant="primary")

                with gr.Accordion("Advanced options", open=False):

                    num_samples = gr.Slider(label="Images", info="number of generated images, locked as 1", minimum=1, maximum=1, value=1, step=1)
                    steps = gr.Slider(label="Steps", info ="denoising sampling steps", minimum=1, maximum=200, value=50, step=1)
                    scale = gr.Slider(label="Guidance Scale", info="the scale of classifier-free guidance (CFG)", minimum=0.0, maximum=10.0, value=5.0, step=0.1)
                    seed = gr.Slider(label="Seed", info="random seed for noise initialization", minimum=0, maximum=2147483647, step=1, randomize=True)
                    show_detail = gr.Checkbox(label="Show Detail", info="show the additional visualization results", value=False)

            with gr.Column():

                gallery = gr.Gallery(label="Output", height=512, preview=True)

                with gr.Accordion("Visualization results", open=True):

                    with gr.Tab(label="Attention Maps"):
                        gr.Markdown("### Attention maps for each character (extracted from middle blocks at intermediate sampling step):")
                        attn_map = gr.Image(show_label=False, show_download_button=False)
                    with gr.Tab(label="Segmentation Maps"):
                        gr.Markdown("### Character-level segmentation maps (using upscaled attention maps):")
                        seg_map = gr.AnnotatedImage(height=384, show_label=False)

        # examples
        examples = []
        example_paths = sorted(glob.glob(ospj("./demo/examples", "*")))
        for example_path in example_paths:
            label = example_path.split(os.sep)[-1].split(".")[0].split("_")[0]
            examples.append([example_path, label])

        gr.Markdown("## Examples:")
        gr.Examples(
            examples=examples,
            inputs=[input_blk, text]
        )

        run_button.click(fn=demo_predict, inputs=[input_blk, text, num_samples, steps, scale, seed, show_detail], outputs=[gallery, attn_map, seg_map])

    block.launch()
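The crop logic in process() above centers a square window on the single masked region before resizing to the model resolution. Below is a minimal standalone sketch of that same idea; the dummy image size, mask box, and assertions are illustrative assumptions, not part of the repository.

import numpy as np
import cv2

# Illustrative sizes (assumption): a landscape 512x768 input with one masked box.
img_h, img_w = 512, 768
image = np.zeros((img_h, img_w, 3), dtype=np.uint8)
mask = np.zeros((img_h, img_w), dtype=np.uint8)
mask[200:260, 500:650] = 255  # exactly one masked area, as the demo requires

contours, _ = cv2.findContours(mask // 255, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
m_x, m_y, m_w, m_h = cv2.boundingRect(contours[0])
c_x = m_x + m_w // 2

# Same horizontal-centering branch process() takes when img_w > img_h.
if c_x < img_w - c_x:
    c_l = max(0, c_x - img_h // 2)
    c_r = c_l + img_h
else:
    c_r = min(img_w, c_x + img_h // 2)
    c_l = c_r - img_h

crop = image[:, c_l:c_r, :]
assert crop.shape[0] == crop.shape[1] == img_h   # square crop at the image height
assert c_l <= m_x and m_x + m_w <= c_r           # masked box stays inside the crop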
checkpoints/AEs/AE_inpainting_2.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:547baac83984f8bf8b433882236b87e77eb4d2f5c71e3d7a04b8dec2fe02b81f
size 334640988
checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4076c90467a907dcb8cde15776bfda4473010fe845739490341db74e82cd2267
size 4059026213
checkpoints/st-step=100000+la-step=100000-v1.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:edea71eb83b6be72c33ef787a7122a810a7b9257bf97a276ef322707d5769878
size 6148465904
configs/demo.yaml
ADDED
@@ -0,0 +1,29 @@
type: "demo"

# path
load_ckpt_path: "./checkpoints/st-step=100000+la-step=100000-v1.ckpt"
model_cfg_path: "./configs/test/textdesign_sd_2.yaml"

# param
H: 512
W: 512
txt_len: [1, 12]
seq_len: 12
batch_size: 1

channel: 4 # AE latent channel
factor: 8 # AE downsample factor
scale: [5.0, 0.0] # content scale, style scale
noise_iters: 0
force_uc_zero_embeddings: ["label"]
aae_enabled: False
detailed: False

# runtime
steps: 50
init_step: 0
num_workers: 0
use_gpu: True
gpu: 0
max_iter: 100
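app.py reads this file with OmegaConf and accesses the keys above as attributes (cfgs.H, cfgs.txt_len, cfgs.scale[0], ...). A minimal sketch of that access pattern, assuming it is run from the repository root:

from omegaconf import OmegaConf

cfgs = OmegaConf.load("./configs/demo.yaml")
print(cfgs.H, cfgs.W)        # 512 512 -> target resolution used by process()/Resize
print(list(cfgs.txt_len))    # [1, 12] -> allowed length of the text to render
print(cfgs.scale[0])         # 5.0 -> CFG content scale; the demo slider overrides it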
configs/test/textdesign_sd_2.yaml
ADDED
@@ -0,0 +1,137 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    input_key: image
    scale_factor: 0.18215
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetAddModel
      params:
        use_checkpoint: False
        in_channels: 9
        out_channels: 4
        ctrl_channels: 0
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        attn_type: add_attn
        attn_layers:
          - output_blocks.6.1
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 0
        add_context_dim: 2048
        legacy: False

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          # - is_trainable: False
          #   input_key: txt
          #   target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          #   params:
          #     arch: ViT-H-14
          #     version: ./checkpoints/encoders/OpenCLIP/ViT-H-14/open_clip_pytorch_model.bin
          #     layer: penultimate
          # add crossattn cond
          - is_trainable: False
            input_key: label
            target: sgm.modules.encoders.modules.LabelEncoder
            params:
              is_add_embedder: True
              max_len: 12
              emb_dim: 2048
              n_heads: 8
              n_trans_layers: 12
              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt # ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
          # concat cond
          - is_trainable: False
            input_key: mask
            target: sgm.modules.encoders.modules.IdentityEncoder
          - is_trainable: False
            input_key: masked
            target: sgm.modules.encoders.modules.LatentEncoder
            params:
              scale_factor: 0.18215
              config:
                target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
                params:
                  ckpt_path: ./checkpoints/AEs/AE_inpainting_2.safetensors
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.FullLoss # StandardDiffusionLoss
      params:
        seq_len: 12
        kernel_size: 3
        gaussian_sigma: 0.5
        min_attn_size: 16
        lambda_local_loss: 0.02
        lambda_ocr_loss: 0.001
        ocr_enabled: False

        predictor_config:
          target: sgm.modules.predictors.model.ParseqPredictor
          params:
            ckpt_path: "./checkpoints/predictors/parseq-bb5792a6.pt"

        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
demo/examples/DIRTY_0_0.png
ADDED
demo/examples/ENGINE_0_0.png
ADDED
demo/examples/FAVOURITE_0_0.jpeg
ADDED
demo/examples/FRONTIER_0_0.png
ADDED
demo/examples/Peaceful_0_0.jpeg
ADDED
demo/examples/Scamps_0_0.png
ADDED
demo/examples/TREE_0_0.png
ADDED
demo/examples/better_0_0.jpg
ADDED
demo/examples/tested_0_0.png
ADDED
demo/teaser.png
ADDED
Stored with Git LFS.
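These filenames double as labels: app.py strips the extension and everything after the first underscore to prefill the text box for each example. A small sketch of that parsing, using one of the example paths above (POSIX path separator assumed):

import os

example_path = "./demo/examples/FAVOURITE_0_0.jpeg"
label = example_path.split(os.sep)[-1].split(".")[0].split("_")[0]
print(label)  # FAVOURITE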
requirements.txt
ADDED
@@ -0,0 +1,28 @@
colorlover==0.3.0
einops==0.6.1
gradio==3.41.0
imageio==2.31.2
img2dataset==1.42.0
kornia==0.6.9
lpips==0.1.4
matplotlib==3.7.2
numpy==1.25.1
omegaconf==2.3.0
open-clip-torch==2.20.0
opencv-python==4.6.0.66
Pillow==9.5.0
pytorch-fid==0.3.0
pytorch-lightning==2.0.1
safetensors==0.3.1
scikit-learn==1.3.0
scipy==1.11.1
seaborn==0.12.2
tensorboard==2.14.0
timm==0.9.2
tokenizers==0.13.3
torch==2.1.0
torchvision==0.16.0
tqdm==4.65.0
transformers==4.30.2
xformers==0.0.22.post7
sgm/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .models import AutoencodingEngine, DiffusionEngine
from .util import instantiate_from_config
sgm/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (267 Bytes).
sgm/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (314 Bytes).
sgm/__pycache__/lr_scheduler.cpython-311.pyc
ADDED
Binary file (6.56 kB).
sgm/__pycache__/util.cpython-310.pyc
ADDED
Binary file (8.07 kB).
sgm/__pycache__/util.cpython-311.pyc
ADDED
Binary file (13.5 kB).
sgm/lr_scheduler.py
ADDED
@@ -0,0 +1,135 @@
import numpy as np


class LambdaWarmUpCosineScheduler:
    """
    note: use with a base_lr of 1.0
    """

    def __init__(
        self,
        warm_up_steps,
        lr_min,
        lr_max,
        lr_start,
        max_decay_steps,
        verbosity_interval=0,
    ):
        self.lr_warm_up_steps = warm_up_steps
        self.lr_start = lr_start
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.lr_max_decay_steps = max_decay_steps
        self.last_lr = 0.0
        self.verbosity_interval = verbosity_interval

    def schedule(self, n, **kwargs):
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
        if n < self.lr_warm_up_steps:
            lr = (
                self.lr_max - self.lr_start
            ) / self.lr_warm_up_steps * n + self.lr_start
            self.last_lr = lr
            return lr
        else:
            t = (n - self.lr_warm_up_steps) / (
                self.lr_max_decay_steps - self.lr_warm_up_steps
            )
            t = min(t, 1.0)
            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
                1 + np.cos(t * np.pi)
            )
            self.last_lr = lr
            return lr

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)


class LambdaWarmUpCosineScheduler2:
    """
    supports repeated iterations, configurable via lists
    note: use with a base_lr of 1.0.
    """

    def __init__(
        self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0
    ):
        assert (
            len(warm_up_steps)
            == len(f_min)
            == len(f_max)
            == len(f_start)
            == len(cycle_lengths)
        )
        self.lr_warm_up_steps = warm_up_steps
        self.f_start = f_start
        self.f_min = f_min
        self.f_max = f_max
        self.cycle_lengths = cycle_lengths
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
        self.last_f = 0.0
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        interval = 0
        for cl in self.cum_cycles[1:]:
            if n <= cl:
                return interval
            interval += 1

    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(
                    f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                    f"current cycle {cycle}"
                )
        if n < self.lr_warm_up_steps[cycle]:
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
                cycle
            ] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            t = (n - self.lr_warm_up_steps[cycle]) / (
                self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
            )
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
                1 + np.cos(t * np.pi)
            )
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)


class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0:
                print(
                    f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                    f"current cycle {cycle}"
                )

        if n < self.lr_warm_up_steps[cycle]:
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[
                cycle
            ] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (
                self.cycle_lengths[cycle] - n
            ) / (self.cycle_lengths[cycle])
            self.last_f = f
            return f
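As the docstrings note, these schedulers return an LR multiplier per step and are meant to be used with a base learning rate of 1.0. A minimal usage sketch with torch's LambdaLR follows; the warm-up and decay values here are illustrative assumptions, not values from this repository's training configs.

import torch
from sgm.lr_scheduler import LambdaWarmUpCosineScheduler

sched_fn = LambdaWarmUpCosineScheduler(
    warm_up_steps=1000, lr_min=1e-6, lr_max=1e-4, lr_start=1e-7, max_decay_steps=100000
)
opt = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=1.0)  # base_lr of 1.0
scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=sched_fn)

for step in range(3):
    opt.step()
    scheduler.step()  # effective lr at step n is 1.0 * sched_fn(n)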
sgm/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .autoencoder import AutoencodingEngine
from .diffusion import DiffusionEngine
sgm/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (250 Bytes).
sgm/models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (291 Bytes).
sgm/models/__pycache__/autoencoder.cpython-310.pyc
ADDED
Binary file (11.5 kB).
sgm/models/__pycache__/autoencoder.cpython-311.pyc
ADDED
Binary file (20.2 kB).
sgm/models/__pycache__/diffusion.cpython-310.pyc
ADDED
Binary file (10.8 kB).
sgm/models/__pycache__/diffusion.cpython-311.pyc
ADDED
Binary file (20.2 kB).
sgm/models/autoencoder.py
ADDED
@@ -0,0 +1,335 @@
import re
from abc import abstractmethod
from contextlib import contextmanager
from typing import Any, Dict, Tuple, Union

import pytorch_lightning as pl
import torch
from omegaconf import ListConfig
from packaging import version
from safetensors.torch import load_file as load_safetensors

from ..modules.diffusionmodules.model import Decoder, Encoder
from ..modules.distributions.distributions import DiagonalGaussianDistribution
from ..modules.ema import LitEma
from ..util import default, get_obj_from_str, instantiate_from_config


class AbstractAutoencoder(pl.LightningModule):
    """
    This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
    unCLIP models, etc. Hence, it is fairly general, and specific features
    (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
    """

    def __init__(
        self,
        ema_decay: Union[None, float] = None,
        monitor: Union[None, str] = None,
        input_key: str = "jpg",
        ckpt_path: Union[None, str] = None,
        ignore_keys: Union[Tuple, list, ListConfig] = (),
    ):
        super().__init__()
        self.input_key = input_key
        self.use_ema = ema_decay is not None
        if monitor is not None:
            self.monitor = monitor

        if self.use_ema:
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        if version.parse(torch.__version__) >= version.parse("2.0.0"):
            self.automatic_optimization = False

    def init_from_ckpt(
        self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple()
    ) -> None:
        if path.endswith("ckpt"):
            sd = torch.load(path, map_location="cpu")["state_dict"]
        elif path.endswith("safetensors"):
            sd = load_safetensors(path)
        else:
            raise NotImplementedError

        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if re.match(ik, k):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(
            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
        )
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    @abstractmethod
    def get_input(self, batch) -> Any:
        raise NotImplementedError()

    def on_train_batch_end(self, *args, **kwargs):
        # for EMA computation
        if self.use_ema:
            self.model_ema(self)

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    @abstractmethod
    def encode(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError("encode()-method of abstract base class called")

    @abstractmethod
    def decode(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError("decode()-method of abstract base class called")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
        print(f"loading >>> {cfg['target']} <<< optimizer from config")
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )

    def configure_optimizers(self) -> Any:
        raise NotImplementedError()


class AutoencodingEngine(AbstractAutoencoder):
    """
    Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
    (we also restore them explicitly as special cases for legacy reasons).
    Regularizations such as KL or VQ are moved to the regularizer class.
    """

    def __init__(
        self,
        *args,
        encoder_config: Dict,
        decoder_config: Dict,
        loss_config: Dict,
        regularizer_config: Dict,
        optimizer_config: Union[Dict, None] = None,
        lr_g_factor: float = 1.0,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # todo: add options to freeze encoder/decoder
        self.encoder = instantiate_from_config(encoder_config)
        self.decoder = instantiate_from_config(decoder_config)
        self.loss = instantiate_from_config(loss_config)
        self.regularization = instantiate_from_config(regularizer_config)
        self.optimizer_config = default(
            optimizer_config, {"target": "torch.optim.Adam"}
        )
        self.lr_g_factor = lr_g_factor

    def get_input(self, batch: Dict) -> torch.Tensor:
        # assuming unified data format, dataloader returns a dict.
        # image tensors should be scaled to -1 ... 1 and in channels-first format (e.g., bchw instead of bhwc)
        return batch[self.input_key]

    def get_autoencoder_params(self) -> list:
        params = (
            list(self.encoder.parameters())
            + list(self.decoder.parameters())
            + list(self.regularization.get_trainable_parameters())
            + list(self.loss.get_trainable_autoencoder_parameters())
        )
        return params

    def get_discriminator_params(self) -> list:
        params = list(self.loss.get_trainable_parameters())  # e.g., discriminator
        return params

    def get_last_layer(self):
        return self.decoder.get_last_layer()

    def encode(self, x: Any, return_reg_log: bool = False) -> Any:
        z = self.encoder(x)
        z, reg_log = self.regularization(z)
        if return_reg_log:
            return z, reg_log
        return z

    def decode(self, z: Any) -> torch.Tensor:
        x = self.decoder(z)
        return x

    def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        z, reg_log = self.encode(x, return_reg_log=True)
        dec = self.decode(z)
        return z, dec, reg_log

    def training_step(self, batch, batch_idx, optimizer_idx) -> Any:
        x = self.get_input(batch)
        z, xrec, regularization_log = self(x)

        if optimizer_idx == 0:
            # autoencode
            aeloss, log_dict_ae = self.loss(
                regularization_log,
                x,
                xrec,
                optimizer_idx,
                self.global_step,
                last_layer=self.get_last_layer(),
                split="train",
            )

            self.log_dict(
                log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True
            )
            return aeloss

        if optimizer_idx == 1:
            # discriminator
            discloss, log_dict_disc = self.loss(
                regularization_log,
                x,
                xrec,
                optimizer_idx,
                self.global_step,
                last_layer=self.get_last_layer(),
                split="train",
            )
            self.log_dict(
                log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True
            )
            return discloss

    def validation_step(self, batch, batch_idx) -> Dict:
        log_dict = self._validation_step(batch, batch_idx)
        with self.ema_scope():
            log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
            log_dict.update(log_dict_ema)
        return log_dict

    def _validation_step(self, batch, batch_idx, postfix="") -> Dict:
        x = self.get_input(batch)

        z, xrec, regularization_log = self(x)
        aeloss, log_dict_ae = self.loss(
            regularization_log,
            x,
            xrec,
            0,
            self.global_step,
            last_layer=self.get_last_layer(),
            split="val" + postfix,
        )

        discloss, log_dict_disc = self.loss(
            regularization_log,
            x,
            xrec,
            1,
            self.global_step,
            last_layer=self.get_last_layer(),
            split="val" + postfix,
        )
        self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
        log_dict_ae.update(log_dict_disc)
        self.log_dict(log_dict_ae)
        return log_dict_ae

    def configure_optimizers(self) -> Any:
        ae_params = self.get_autoencoder_params()
        disc_params = self.get_discriminator_params()

        opt_ae = self.instantiate_optimizer_from_config(
            ae_params,
            default(self.lr_g_factor, 1.0) * self.learning_rate,
            self.optimizer_config,
        )
        opt_disc = self.instantiate_optimizer_from_config(
            disc_params, self.learning_rate, self.optimizer_config
        )

        return [opt_ae, opt_disc], []

    @torch.no_grad()
    def log_images(self, batch: Dict, **kwargs) -> Dict:
        log = dict()
        x = self.get_input(batch)
        _, xrec, _ = self(x)
        log["inputs"] = x
        log["reconstructions"] = xrec
        with self.ema_scope():
            _, xrec_ema, _ = self(x)
            log["reconstructions_ema"] = xrec_ema
        return log


class AutoencoderKL(AutoencodingEngine):
    def __init__(self, embed_dim: int, **kwargs):
        ddconfig = kwargs.pop("ddconfig")
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", ())
        super().__init__(
            encoder_config={"target": "torch.nn.Identity"},
            decoder_config={"target": "torch.nn.Identity"},
            regularizer_config={"target": "torch.nn.Identity"},
            loss_config=kwargs.pop("lossconfig"),
            **kwargs,
        )
        assert ddconfig["double_z"]
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def encode(self, x):
        assert (
            not self.training
        ), f"{self.__class__.__name__} only supports inference currently"
        h = self.encoder(x)
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z, **decoder_kwargs):
        z = self.post_quant_conv(z)
        dec = self.decoder(z, **decoder_kwargs)
        return dec


class AutoencoderKLInferenceWrapper(AutoencoderKL):
    def encode(self, x):
        return super().encode(x).sample()


class IdentityFirstStage(AbstractAutoencoder):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_input(self, x: Any) -> Any:
        return x

    def encode(self, x: Any, *args, **kwargs) -> Any:
        return x

    def decode(self, x: Any, *args, **kwargs) -> Any:
        return x
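AutoencoderKLInferenceWrapper is the class the configs above point at for both the first stage and the masked-image LatentEncoder. A minimal sketch of building it directly from the first_stage_config block of configs/test/textdesign_sd_2.yaml; weights are random here, since that block carries no ckpt_path (the full demo checkpoint populates them instead):

from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

model_cfg = OmegaConf.load("./configs/test/textdesign_sd_2.yaml")
ae = instantiate_from_config(model_cfg.model.params.first_stage_config).eval()

print(type(ae).__name__)                        # AutoencoderKLInferenceWrapper
print(sum(p.numel() for p in ae.parameters()))  # parameter count of the KL autoencoder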
sgm/models/diffusion.py
ADDED
@@ -0,0 +1,328 @@
from contextlib import contextmanager
from typing import Any, Dict, List, Tuple, Union

import pytorch_lightning as pl
import torch
from omegaconf import ListConfig, OmegaConf
from safetensors.torch import load_file as load_safetensors
from torch.optim.lr_scheduler import LambdaLR

from ..modules import UNCONDITIONAL_CONFIG
from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
from ..modules.ema import LitEma
from ..util import (
    default,
    disabled_train,
    get_obj_from_str,
    instantiate_from_config,
    log_txt_as_img,
)


class DiffusionEngine(pl.LightningModule):
    def __init__(
        self,
        network_config,
        denoiser_config,
        first_stage_config,
        conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        network_wrapper: Union[None, str] = None,
        ckpt_path: Union[None, str] = None,
        use_ema: bool = False,
        ema_decay_rate: float = 0.9999,
        scale_factor: float = 1.0,
        disable_first_stage_autocast=False,
        input_key: str = "jpg",
        log_keys: Union[List, None] = None,
        no_cond_log: bool = False,
        compile_model: bool = False,
        opt_keys: Union[List, None] = None
    ):
        super().__init__()
        self.opt_keys = opt_keys
        self.log_keys = log_keys
        self.input_key = input_key
        self.optimizer_config = default(
            optimizer_config, {"target": "torch.optim.AdamW"}
        )
        model = instantiate_from_config(network_config)
        self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
            model, compile_model=compile_model
        )

        self.denoiser = instantiate_from_config(denoiser_config)
        self.sampler = (
            instantiate_from_config(sampler_config)
            if sampler_config is not None
            else None
        )
        self.conditioner = instantiate_from_config(
            default(conditioner_config, UNCONDITIONAL_CONFIG)
        )
        self.scheduler_config = scheduler_config
        self._init_first_stage(first_stage_config)

        self.loss_fn = (
            instantiate_from_config(loss_fn_config)
            if loss_fn_config is not None
            else None
        )

        self.use_ema = use_ema
        if self.use_ema:
            self.model_ema = LitEma(self.model, decay=ema_decay_rate)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        self.scale_factor = scale_factor
        self.disable_first_stage_autocast = disable_first_stage_autocast
        self.no_cond_log = no_cond_log

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path)

    def init_from_ckpt(
        self,
        path: str,
    ) -> None:
        if path.endswith("ckpt"):
            sd = torch.load(path, map_location="cpu")["state_dict"]
        elif path.endswith("safetensors"):
            sd = load_safetensors(path)
        else:
            raise NotImplementedError

        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(
            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
        )
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def freeze(self):

        for param in self.parameters():
            param.requires_grad_(False)

    def _init_first_stage(self, config):
        model = instantiate_from_config(config).eval()
        model.train = disabled_train
        for param in model.parameters():
            param.requires_grad = False
        self.first_stage_model = model

    def get_input(self, batch):
        # assuming unified data format, dataloader returns a dict.
        # image tensors should be scaled to -1 ... 1 and in bchw format
        return batch[self.input_key]

    @torch.no_grad()
    def decode_first_stage(self, z):
        z = 1.0 / self.scale_factor * z
        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
            out = self.first_stage_model.decode(z)
        return out

    @torch.no_grad()
    def encode_first_stage(self, x):
        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
            z = self.first_stage_model.encode(x)
        z = self.scale_factor * z
        return z

    def forward(self, x, batch):

        loss, loss_dict = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch, self.first_stage_model, self.scale_factor)

        return loss, loss_dict

    def shared_step(self, batch: Dict) -> Any:
        x = self.get_input(batch)
        x = self.encode_first_stage(x)
        batch["global_step"] = self.global_step
        loss, loss_dict = self(x, batch)
        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        loss, loss_dict = self.shared_step(batch)

        self.log_dict(
            loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False
        )

        self.log(
            "global_step",
            float(self.global_step),
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=False,
        )

        lr = self.optimizers().param_groups[0]["lr"]
        self.log(
            "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False
        )

        return loss

    def on_train_start(self, *args, **kwargs):
        if self.sampler is None or self.loss_fn is None:
            raise ValueError("Sampler and loss function need to be set for training.")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.model_ema(self.model)

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.model_ema.store(self.model.parameters())
            self.model_ema.copy_to(self.model)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.model.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )

    def configure_optimizers(self):
        lr = self.learning_rate
        params = []
        print("Trainable parameter list: ")
        print("-"*20)
        for name, param in self.model.named_parameters():
            if any([key in name for key in self.opt_keys]):
                params.append(param)
                print(name)
            else:
                param.requires_grad_(False)
        for embedder in self.conditioner.embedders:
            if embedder.is_trainable:
                for name, param in embedder.named_parameters():
                    params.append(param)
                    print(name)
        print("-"*20)
        opt = self.instantiate_optimizer_from_config(params, lr, self.optimizer_config)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda epoch: 0.95**epoch)

        return [opt], scheduler

    @torch.no_grad()
    def sample(
        self,
        cond: Dict,
        uc: Union[Dict, None] = None,
        batch_size: int = 16,
        shape: Union[None, Tuple, List] = None,
        **kwargs,
    ):
        randn = torch.randn(batch_size, *shape).to(self.device)

        denoiser = lambda input, sigma, c: self.denoiser(
            self.model, input, sigma, c, **kwargs
        )
        samples = self.sampler(denoiser, randn, cond, uc=uc)
        return samples

    @torch.no_grad()
    def log_conditionings(self, batch: Dict, n: int) -> Dict:
        """
        Defines heuristics to log different conditionings.
        These can be lists of strings (text-to-image), tensors, ints, ...
        """
        image_h, image_w = batch[self.input_key].shape[2:]
        log = dict()

        for embedder in self.conditioner.embedders:
            if (
                (self.log_keys is None) or (embedder.input_key in self.log_keys)
            ) and not self.no_cond_log:
                x = batch[embedder.input_key][:n]
                if isinstance(x, torch.Tensor):
                    if x.dim() == 1:
                        # class-conditional, convert integer to string
                        x = [str(x[i].item()) for i in range(x.shape[0])]
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
                    elif x.dim() == 2:
                        # size and crop cond and the like
                        x = [
                            "x".join([str(xx) for xx in x[i].tolist()])
                            for i in range(x.shape[0])
                        ]
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
                    else:
                        raise NotImplementedError()
                elif isinstance(x, (List, ListConfig)):
                    if isinstance(x[0], str):
                        # strings
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
                    else:
                        raise NotImplementedError()
                else:
                    raise NotImplementedError()
                log[embedder.input_key] = xc
        return log

    @torch.no_grad()
    def log_images(
        self,
        batch: Dict,
        N: int = 8,
        sample: bool = True,
        ucg_keys: List[str] = None,
        **kwargs,
    ) -> Dict:
        conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
        if ucg_keys:
            assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
                "Each defined ucg key for sampling must be in the provided conditioner input keys,"
                f"but we have {ucg_keys} vs. {conditioner_input_keys}"
            )
        else:
            ucg_keys = conditioner_input_keys
        log = dict()

        x = self.get_input(batch)

        c, uc = self.conditioner.get_unconditional_conditioning(
            batch,
            force_uc_zero_embeddings=ucg_keys
            if len(self.conditioner.embedders) > 0
            else [],
        )

        sampling_kwargs = {}

        N = min(x.shape[0], N)
        x = x.to(self.device)[:N]
        log["inputs"] = x
        z = self.encode_first_stage(x)
        log["reconstructions"] = self.decode_first_stage(z)
        log.update(self.log_conditionings(batch, N))

        for k in c:
            if isinstance(c[k], torch.Tensor):
                c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc))

        if sample:
            with self.ema_scope("Plotting"):
                samples = self.sample(
                    c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
                )
            samples = self.decode_first_stage(samples)
            log["samples"] = samples
        return log
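In the demo, DiffusionEngine is built through util.init_model(cfgs) in app.py; the sketch below is an illustrative assumption of the equivalent manual steps (load the model config referenced by demo.yaml, point ckpt_path at the released checkpoint, freeze, and move to the configured device), not the repository's own helper.

import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

cfgs = OmegaConf.load("./configs/demo.yaml")
model_cfg = OmegaConf.load(cfgs.model_cfg_path)
model_cfg.model.params.ckpt_path = cfgs.load_ckpt_path  # restore the uploaded .ckpt (assumption)

model = instantiate_from_config(model_cfg.model).eval()
model.freeze()  # DiffusionEngine.freeze() above: requires_grad_(False) on everything
device = "cuda" if (cfgs.use_gpu and torch.cuda.is_available()) else "cpu"
model = model.to(device)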
sgm/modules/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .encoders.modules import GeneralConditioner, DualConditioner

UNCONDITIONAL_CONFIG = {
    "target": "sgm.modules.GeneralConditioner",
    "params": {"emb_models": []},
}
sgm/modules/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (337 Bytes).
sgm/modules/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (388 Bytes).
sgm/modules/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (21.6 kB).
sgm/modules/__pycache__/attention.cpython-311.pyc
ADDED
Binary file (45.1 kB).
sgm/modules/__pycache__/ema.cpython-310.pyc
ADDED
Binary file (3.21 kB).
sgm/modules/__pycache__/ema.cpython-311.pyc
ADDED
Binary file (5.82 kB).
sgm/modules/attention.py
ADDED
@@ -0,0 +1,976 @@
|
1 |
+
import math
|
2 |
+
from inspect import isfunction
|
3 |
+
from typing import Any, Optional
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from einops import rearrange, repeat
|
8 |
+
from packaging import version
|
9 |
+
from torch import nn, einsum
|
10 |
+
|
11 |
+
|
12 |
+
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
13 |
+
SDP_IS_AVAILABLE = True
|
14 |
+
from torch.backends.cuda import SDPBackend, sdp_kernel
|
15 |
+
|
16 |
+
BACKEND_MAP = {
|
17 |
+
SDPBackend.MATH: {
|
18 |
+
"enable_math": True,
|
19 |
+
"enable_flash": False,
|
20 |
+
"enable_mem_efficient": False,
|
21 |
+
},
|
22 |
+
SDPBackend.FLASH_ATTENTION: {
|
23 |
+
"enable_math": False,
|
24 |
+
"enable_flash": True,
|
25 |
+
"enable_mem_efficient": False,
|
26 |
+
},
|
27 |
+
SDPBackend.EFFICIENT_ATTENTION: {
|
28 |
+
"enable_math": False,
|
29 |
+
"enable_flash": False,
|
30 |
+
"enable_mem_efficient": True,
|
31 |
+
},
|
32 |
+
None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
|
33 |
+
}
|
34 |
+
else:
|
35 |
+
from contextlib import nullcontext
|
36 |
+
|
37 |
+
SDP_IS_AVAILABLE = False
|
38 |
+
sdp_kernel = nullcontext
|
39 |
+
BACKEND_MAP = {}
|
40 |
+
print(
|
41 |
+
f"No SDP backend available, likely because you are running in pytorch versions < 2.0. In fact, "
|
42 |
+
f"you are using PyTorch {torch.__version__}. You might want to consider upgrading."
|
43 |
+
)
|
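For reference, a hedged sketch of how BACKEND_MAP pairs with torch.backends.cuda.sdp_kernel to pin scaled_dot_product_attention to a single backend (PyTorch 2.0.x API; newer releases deprecate sdp_kernel in favour of torch.nn.attention.sdpa_kernel):

# Sketch: force the memory-efficient kernel for one attention call.
import torch
import torch.nn.functional as F
from torch.backends.cuda import sdp_kernel

q = k = v = torch.randn(2, 8, 64, 32)  # (batch, heads, tokens, head_dim)
flags = {"enable_math": False, "enable_flash": False, "enable_mem_efficient": True}
with sdp_kernel(**flags):
    out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([2, 8, 64, 32])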
44 |
+
|
45 |
+
try:
|
46 |
+
import xformers
|
47 |
+
import xformers.ops
|
48 |
+
|
49 |
+
XFORMERS_IS_AVAILABLE = True
|
50 |
+
except ImportError:
|
51 |
+
XFORMERS_IS_AVAILABLE = False
|
52 |
+
print("no module 'xformers'. Processing without...")
|
53 |
+
|
54 |
+
from .diffusionmodules.util import checkpoint
|
55 |
+
|
56 |
+
|
57 |
+
def exists(val):
|
58 |
+
return val is not None
|
59 |
+
|
60 |
+
|
61 |
+
def uniq(arr):
|
62 |
+
return {el: True for el in arr}.keys()
|
63 |
+
|
64 |
+
|
65 |
+
def default(val, d):
|
66 |
+
if exists(val):
|
67 |
+
return val
|
68 |
+
return d() if isfunction(d) else d
|
69 |
+
|
70 |
+
|
71 |
+
def max_neg_value(t):
|
72 |
+
return -torch.finfo(t.dtype).max
|
73 |
+
|
74 |
+
|
75 |
+
def init_(tensor):
|
76 |
+
dim = tensor.shape[-1]
|
77 |
+
std = 1 / math.sqrt(dim)
|
78 |
+
tensor.uniform_(-std, std)
|
79 |
+
return tensor
|
80 |
+
|
81 |
+
# feedforward
|
82 |
+
class GEGLU(nn.Module):
|
83 |
+
def __init__(self, dim_in, dim_out):
|
84 |
+
super().__init__()
|
85 |
+
self.proj = nn.Linear(dim_in, dim_out * 2)
|
86 |
+
|
87 |
+
def forward(self, x):
|
88 |
+
x, gate = self.proj(x).chunk(2, dim=-1)
|
89 |
+
return x * F.gelu(gate)
|
90 |
+
|
91 |
+
|
92 |
+
class FeedForward(nn.Module):
|
93 |
+
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
|
94 |
+
super().__init__()
|
95 |
+
inner_dim = int(dim * mult)
|
96 |
+
dim_out = default(dim_out, dim)
|
97 |
+
project_in = (
|
98 |
+
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
|
99 |
+
if not glu
|
100 |
+
else GEGLU(dim, inner_dim)
|
101 |
+
)
|
102 |
+
|
103 |
+
self.net = nn.Sequential(
|
104 |
+
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
105 |
+
)
|
106 |
+
|
107 |
+
def forward(self, x):
|
108 |
+
return self.net(x)
|
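Shape sketch for the two feed-forward variants above, assuming the classes defined in this file are importable (sizes are illustrative): GEGLU projects to twice the inner width and gates one half with GELU of the other, so both paths keep the token dimension.

# Sketch: both blocks map (batch, tokens, dim) -> (batch, tokens, dim).
import torch

x = torch.randn(2, 16, 320)
ff_plain = FeedForward(320)             # Linear -> GELU -> Linear
ff_geglu = FeedForward(320, glu=True)   # GEGLU gate -> Linear
print(ff_plain(x).shape, ff_geglu(x).shape)  # both torch.Size([2, 16, 320])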
109 |
+
|
110 |
+
|
111 |
+
def zero_module(module):
|
112 |
+
"""
|
113 |
+
Zero out the parameters of a module and return it.
|
114 |
+
"""
|
115 |
+
for p in module.parameters():
|
116 |
+
p.detach().zero_()
|
117 |
+
return module
|
118 |
+
|
119 |
+
|
120 |
+
def Normalize(in_channels):
|
121 |
+
return torch.nn.GroupNorm(
|
122 |
+
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
123 |
+
)
|
124 |
+
|
125 |
+
|
126 |
+
class LinearAttention(nn.Module):
|
127 |
+
def __init__(self, dim, heads=4, dim_head=32):
|
128 |
+
super().__init__()
|
129 |
+
self.heads = heads
|
130 |
+
hidden_dim = dim_head * heads
|
131 |
+
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
|
132 |
+
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
|
133 |
+
|
134 |
+
def forward(self, x):
|
135 |
+
b, c, h, w = x.shape
|
136 |
+
qkv = self.to_qkv(x)
|
137 |
+
q, k, v = rearrange(
|
138 |
+
qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
|
139 |
+
)
|
140 |
+
k = k.softmax(dim=-1)
|
141 |
+
context = torch.einsum("bhdn,bhen->bhde", k, v)
|
142 |
+
out = torch.einsum("bhde,bhdn->bhen", context, q)
|
143 |
+
out = rearrange(
|
144 |
+
out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
|
145 |
+
)
|
146 |
+
return self.to_out(out)
|
147 |
+
|
148 |
+
|
149 |
+
class SpatialSelfAttention(nn.Module):
|
150 |
+
def __init__(self, in_channels):
|
151 |
+
super().__init__()
|
152 |
+
self.in_channels = in_channels
|
153 |
+
|
154 |
+
self.norm = Normalize(in_channels)
|
155 |
+
self.q = torch.nn.Conv2d(
|
156 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
157 |
+
)
|
158 |
+
self.k = torch.nn.Conv2d(
|
159 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
160 |
+
)
|
161 |
+
self.v = torch.nn.Conv2d(
|
162 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
163 |
+
)
|
164 |
+
self.proj_out = torch.nn.Conv2d(
|
165 |
+
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
166 |
+
)
|
167 |
+
|
168 |
+
def forward(self, x):
|
169 |
+
h_ = x
|
170 |
+
h_ = self.norm(h_)
|
171 |
+
q = self.q(h_)
|
172 |
+
k = self.k(h_)
|
173 |
+
v = self.v(h_)
|
174 |
+
|
175 |
+
# compute attention
|
176 |
+
b, c, h, w = q.shape
|
177 |
+
q = rearrange(q, "b c h w -> b (h w) c")
|
178 |
+
k = rearrange(k, "b c h w -> b c (h w)")
|
179 |
+
w_ = torch.einsum("bij,bjk->bik", q, k)
|
180 |
+
|
181 |
+
w_ = w_ * (int(c) ** (-0.5))
|
182 |
+
w_ = torch.nn.functional.softmax(w_, dim=2)
|
183 |
+
|
184 |
+
# attend to values
|
185 |
+
v = rearrange(v, "b c h w -> b c (h w)")
|
186 |
+
w_ = rearrange(w_, "b i j -> b j i")
|
187 |
+
h_ = torch.einsum("bij,bjk->bik", v, w_)
|
188 |
+
h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
|
189 |
+
h_ = self.proj_out(h_)
|
190 |
+
|
191 |
+
return x + h_
|
192 |
+
|
193 |
+
|
194 |
+
class CrossAttention(nn.Module):
|
195 |
+
def __init__(
|
196 |
+
self,
|
197 |
+
query_dim,
|
198 |
+
context_dim=None,
|
199 |
+
heads=8,
|
200 |
+
dim_head=64,
|
201 |
+
dropout=0.0,
|
202 |
+
backend=None,
|
203 |
+
):
|
204 |
+
super().__init__()
|
205 |
+
inner_dim = dim_head * heads
|
206 |
+
context_dim = default(context_dim, query_dim)
|
207 |
+
|
208 |
+
self.scale = dim_head**-0.5
|
209 |
+
self.heads = heads
|
210 |
+
|
211 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
212 |
+
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
213 |
+
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
214 |
+
|
215 |
+
self.to_out = zero_module(nn.Sequential(
|
216 |
+
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
217 |
+
))
|
218 |
+
self.backend = backend
|
219 |
+
|
220 |
+
self.attn_map_cache = None
|
221 |
+
|
222 |
+
def forward(
|
223 |
+
self,
|
224 |
+
x,
|
225 |
+
context=None,
|
226 |
+
mask=None,
|
227 |
+
additional_tokens=None,
|
228 |
+
n_times_crossframe_attn_in_self=0,
|
229 |
+
):
|
230 |
+
h = self.heads
|
231 |
+
|
232 |
+
if additional_tokens is not None:
|
233 |
+
# get the number of masked tokens at the beginning of the output sequence
|
234 |
+
n_tokens_to_mask = additional_tokens.shape[1]
|
235 |
+
# add additional token
|
236 |
+
x = torch.cat([additional_tokens, x], dim=1)
|
237 |
+
|
238 |
+
q = self.to_q(x)
|
239 |
+
context = default(context, x)
|
240 |
+
k = self.to_k(context)
|
241 |
+
v = self.to_v(context)
|
242 |
+
|
243 |
+
if n_times_crossframe_attn_in_self:
|
244 |
+
# reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
|
245 |
+
assert x.shape[0] % n_times_crossframe_attn_in_self == 0
|
246 |
+
n_cp = x.shape[0] // n_times_crossframe_attn_in_self
|
247 |
+
k = repeat(
|
248 |
+
k[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
|
249 |
+
)
|
250 |
+
v = repeat(
|
251 |
+
v[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
|
252 |
+
)
|
253 |
+
|
254 |
+
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
|
255 |
+
|
256 |
+
## old
|
257 |
+
|
258 |
+
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
259 |
+
del q, k
|
260 |
+
|
261 |
+
if exists(mask):
|
262 |
+
mask = rearrange(mask, 'b ... -> b (...)')
|
263 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
264 |
+
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
265 |
+
sim.masked_fill_(~mask, max_neg_value)
|
266 |
+
|
267 |
+
# attention, what we cannot get enough of
|
268 |
+
sim = sim.softmax(dim=-1)
|
269 |
+
|
270 |
+
# save attn_map
|
271 |
+
if self.attn_map_cache is not None:
|
272 |
+
bh, n, l = sim.shape
|
273 |
+
size = int(n**0.5)
|
274 |
+
self.attn_map_cache["size"] = size
|
275 |
+
self.attn_map_cache["attn_map"] = sim
|
276 |
+
|
277 |
+
out = einsum('b i j, b j d -> b i d', sim, v)
|
278 |
+
out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
|
279 |
+
|
280 |
+
## new
|
281 |
+
# with sdp_kernel(**BACKEND_MAP[self.backend]):
|
282 |
+
# # print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
|
283 |
+
# out = F.scaled_dot_product_attention(
|
284 |
+
# q, k, v, attn_mask=mask
|
285 |
+
# ) # scale is dim_head ** -0.5 per default
|
286 |
+
|
287 |
+
# del q, k, v
|
288 |
+
# out = rearrange(out, "b h n d -> b n (h d)", h=h)
|
289 |
+
|
290 |
+
if additional_tokens is not None:
|
291 |
+
# remove additional token
|
292 |
+
out = out[:, n_tokens_to_mask:]
|
293 |
+
return self.to_out(out)
|
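A hedged usage sketch for CrossAttention: with a context tensor it attends from the query tokens to the context tokens, without one it degenerates to self-attention, and setting attn_map_cache to a dict makes the block stash the softmaxed attention map (all sizes below are illustrative).

# Sketch: 256 query tokens attend to 77 context tokens; output keeps the query shape.
import torch

attn = CrossAttention(query_dim=320, context_dim=1024, heads=8, dim_head=40)
x = torch.randn(2, 256, 320)    # (batch, query tokens, query_dim)
ctx = torch.randn(2, 77, 1024)  # (batch, context tokens, context_dim)
attn.attn_map_cache = {}        # opt in to attention-map caching
out = attn(x, context=ctx)
print(out.shape)                               # torch.Size([2, 256, 320])
print(attn.attn_map_cache["attn_map"].shape)   # torch.Size([16, 256, 77])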
294 |
+
|
295 |
+
|
296 |
+
class MemoryEfficientCrossAttention(nn.Module):
|
297 |
+
# https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
|
298 |
+
def __init__(
|
299 |
+
self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
|
300 |
+
):
|
301 |
+
super().__init__()
|
302 |
+
# print(
|
303 |
+
# f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
|
304 |
+
# f"{heads} heads with a dimension of {dim_head}."
|
305 |
+
# )
|
306 |
+
inner_dim = dim_head * heads
|
307 |
+
context_dim = default(context_dim, query_dim)
|
308 |
+
|
309 |
+
self.heads = heads
|
310 |
+
self.dim_head = dim_head
|
311 |
+
|
312 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
313 |
+
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
314 |
+
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
315 |
+
|
316 |
+
self.to_out = nn.Sequential(
|
317 |
+
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
318 |
+
)
|
319 |
+
self.attention_op: Optional[Any] = None
|
320 |
+
|
321 |
+
def forward(
|
322 |
+
self,
|
323 |
+
x,
|
324 |
+
context=None,
|
325 |
+
mask=None,
|
326 |
+
additional_tokens=None,
|
327 |
+
n_times_crossframe_attn_in_self=0,
|
328 |
+
):
|
329 |
+
if additional_tokens is not None:
|
330 |
+
# get the number of masked tokens at the beginning of the output sequence
|
331 |
+
n_tokens_to_mask = additional_tokens.shape[1]
|
332 |
+
# add additional token
|
333 |
+
x = torch.cat([additional_tokens, x], dim=1)
|
334 |
+
q = self.to_q(x)
|
335 |
+
context = default(context, x)
|
336 |
+
k = self.to_k(context)
|
337 |
+
v = self.to_v(context)
|
338 |
+
|
339 |
+
if n_times_crossframe_attn_in_self:
|
340 |
+
# reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
|
341 |
+
assert x.shape[0] % n_times_crossframe_attn_in_self == 0
|
342 |
+
# n_cp = x.shape[0]//n_times_crossframe_attn_in_self
|
343 |
+
k = repeat(
|
344 |
+
k[::n_times_crossframe_attn_in_self],
|
345 |
+
"b ... -> (b n) ...",
|
346 |
+
n=n_times_crossframe_attn_in_self,
|
347 |
+
)
|
348 |
+
v = repeat(
|
349 |
+
v[::n_times_crossframe_attn_in_self],
|
350 |
+
"b ... -> (b n) ...",
|
351 |
+
n=n_times_crossframe_attn_in_self,
|
352 |
+
)
|
353 |
+
|
354 |
+
b, _, _ = q.shape
|
355 |
+
q, k, v = map(
|
356 |
+
lambda t: t.unsqueeze(3)
|
357 |
+
.reshape(b, t.shape[1], self.heads, self.dim_head)
|
358 |
+
.permute(0, 2, 1, 3)
|
359 |
+
.reshape(b * self.heads, t.shape[1], self.dim_head)
|
360 |
+
.contiguous(),
|
361 |
+
(q, k, v),
|
362 |
+
)
|
363 |
+
|
364 |
+
# actually compute the attention, what we cannot get enough of
|
365 |
+
out = xformers.ops.memory_efficient_attention(
|
366 |
+
q, k, v, attn_bias=None, op=self.attention_op
|
367 |
+
)
|
368 |
+
|
369 |
+
# TODO: Use this directly in the attention operation, as a bias
|
370 |
+
if exists(mask):
|
371 |
+
raise NotImplementedError
|
372 |
+
out = (
|
373 |
+
out.unsqueeze(0)
|
374 |
+
.reshape(b, self.heads, out.shape[1], self.dim_head)
|
375 |
+
.permute(0, 2, 1, 3)
|
376 |
+
.reshape(b, out.shape[1], self.heads * self.dim_head)
|
377 |
+
)
|
378 |
+
if additional_tokens is not None:
|
379 |
+
# remove additional token
|
380 |
+
out = out[:, n_tokens_to_mask:]
|
381 |
+
return self.to_out(out)
|
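The unsqueeze/reshape/permute chain above performs the same head split that CrossAttention expresses with einops; a small hedged equivalence check:

# Sketch: the manual reshape equals rearrange(t, "b n (h d) -> (b h) n d").
import torch
from einops import rearrange

b, n, h, d = 2, 64, 8, 40
t = torch.randn(b, n, h * d)
manual = (
    t.unsqueeze(3)
    .reshape(b, n, h, d)
    .permute(0, 2, 1, 3)
    .reshape(b * h, n, d)
    .contiguous()
)
print(torch.equal(manual, rearrange(t, "b n (h d) -> (b h) n d")))  # True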
382 |
+
|
383 |
+
|
384 |
+
class BasicTransformerBlock(nn.Module):
|
385 |
+
ATTENTION_MODES = {
|
386 |
+
"softmax": CrossAttention, # vanilla attention
|
387 |
+
"softmax-xformers": MemoryEfficientCrossAttention, # ampere
|
388 |
+
}
|
389 |
+
|
390 |
+
def __init__(
|
391 |
+
self,
|
392 |
+
dim,
|
393 |
+
n_heads,
|
394 |
+
d_head,
|
395 |
+
dropout=0.0,
|
396 |
+
context_dim=None,
|
397 |
+
add_context_dim=None,
|
398 |
+
gated_ff=True,
|
399 |
+
checkpoint=True,
|
400 |
+
disable_self_attn=False,
|
401 |
+
attn_mode="softmax",
|
402 |
+
sdp_backend=None,
|
403 |
+
):
|
404 |
+
super().__init__()
|
405 |
+
assert attn_mode in self.ATTENTION_MODES
|
406 |
+
if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
|
407 |
+
print(
|
408 |
+
f"Attention mode '{attn_mode}' is not available. Falling back to native attention. "
|
409 |
+
f"This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version {torch.__version__}"
|
410 |
+
)
|
411 |
+
attn_mode = "softmax"
|
412 |
+
elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
|
413 |
+
print(
|
414 |
+
"We do not support vanilla attention anymore, as it is too expensive. Sorry."
|
415 |
+
)
|
416 |
+
if not XFORMERS_IS_AVAILABLE:
|
417 |
+
assert (
|
418 |
+
False
|
419 |
+
), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
|
420 |
+
else:
|
421 |
+
print("Falling back to xformers efficient attention.")
|
422 |
+
attn_mode = "softmax-xformers"
|
423 |
+
attn_cls = self.ATTENTION_MODES[attn_mode]
|
424 |
+
if version.parse(torch.__version__) >= version.parse("2.0.0"):
|
425 |
+
assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
|
426 |
+
else:
|
427 |
+
assert sdp_backend is None
|
428 |
+
self.disable_self_attn = disable_self_attn
|
429 |
+
self.attn1 = MemoryEfficientCrossAttention(
|
430 |
+
query_dim=dim,
|
431 |
+
heads=n_heads,
|
432 |
+
dim_head=d_head,
|
433 |
+
dropout=dropout,
|
434 |
+
context_dim=context_dim if self.disable_self_attn else None,
|
435 |
+
backend=sdp_backend,
|
436 |
+
) # is a self-attention if not self.disable_self_attn
|
437 |
+
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
438 |
+
if context_dim is not None and context_dim > 0:
|
439 |
+
self.attn2 = attn_cls(
|
440 |
+
query_dim=dim,
|
441 |
+
context_dim=context_dim,
|
442 |
+
heads=n_heads,
|
443 |
+
dim_head=d_head,
|
444 |
+
dropout=dropout,
|
445 |
+
backend=sdp_backend,
|
446 |
+
) # is self-attn if context is none
|
447 |
+
if add_context_dim is not None and add_context_dim > 0:
|
448 |
+
self.add_attn = attn_cls(
|
449 |
+
query_dim=dim,
|
450 |
+
context_dim=add_context_dim,
|
451 |
+
heads=n_heads,
|
452 |
+
dim_head=d_head,
|
453 |
+
dropout=dropout,
|
454 |
+
backend=sdp_backend,
|
455 |
+
) # is self-attn if context is none
|
456 |
+
self.add_norm = nn.LayerNorm(dim)
|
457 |
+
self.norm1 = nn.LayerNorm(dim)
|
458 |
+
self.norm2 = nn.LayerNorm(dim)
|
459 |
+
self.norm3 = nn.LayerNorm(dim)
|
460 |
+
self.checkpoint = checkpoint
|
461 |
+
|
462 |
+
def forward(
|
463 |
+
self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
|
464 |
+
):
|
465 |
+
kwargs = {"x": x}
|
466 |
+
|
467 |
+
if context is not None:
|
468 |
+
kwargs.update({"context": context})
|
469 |
+
|
470 |
+
if additional_tokens is not None:
|
471 |
+
kwargs.update({"additional_tokens": additional_tokens})
|
472 |
+
|
473 |
+
if n_times_crossframe_attn_in_self:
|
474 |
+
kwargs.update(
|
475 |
+
{"n_times_crossframe_attn_in_self": n_times_crossframe_attn_in_self}
|
476 |
+
)
|
477 |
+
|
478 |
+
return checkpoint(
|
479 |
+
self._forward, (x, context, add_context), self.parameters(), self.checkpoint
|
480 |
+
)
|
481 |
+
|
482 |
+
def _forward(
|
483 |
+
self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
|
484 |
+
):
|
485 |
+
x = (
|
486 |
+
self.attn1(
|
487 |
+
self.norm1(x),
|
488 |
+
context=context if self.disable_self_attn else None,
|
489 |
+
additional_tokens=additional_tokens,
|
490 |
+
n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
|
491 |
+
if not self.disable_self_attn
|
492 |
+
else 0,
|
493 |
+
)
|
494 |
+
+ x
|
495 |
+
)
|
496 |
+
if hasattr(self, "attn2"):
|
497 |
+
x = (
|
498 |
+
self.attn2(
|
499 |
+
self.norm2(x), context=context, additional_tokens=additional_tokens
|
500 |
+
)
|
501 |
+
+ x
|
502 |
+
)
|
503 |
+
if hasattr(self, "add_attn"):
|
504 |
+
x = (
|
505 |
+
self.add_attn(
|
506 |
+
self.add_norm(x), context=add_context, additional_tokens=additional_tokens
|
507 |
+
)
|
508 |
+
+ x
|
509 |
+
)
|
510 |
+
x = self.ff(self.norm3(x)) + x
|
511 |
+
return x
|
512 |
+
|
513 |
+
|
514 |
+
class BasicTransformerSingleLayerBlock(nn.Module):
|
515 |
+
ATTENTION_MODES = {
|
516 |
+
"softmax": CrossAttention, # vanilla attention
|
517 |
+
"softmax-xformers": MemoryEfficientCrossAttention # on the A100s not quite as fast as the above version
|
518 |
+
# (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
|
519 |
+
}
|
520 |
+
|
521 |
+
def __init__(
|
522 |
+
self,
|
523 |
+
dim,
|
524 |
+
n_heads,
|
525 |
+
d_head,
|
526 |
+
dropout=0.0,
|
527 |
+
context_dim=None,
|
528 |
+
gated_ff=True,
|
529 |
+
checkpoint=True,
|
530 |
+
attn_mode="softmax",
|
531 |
+
):
|
532 |
+
super().__init__()
|
533 |
+
assert attn_mode in self.ATTENTION_MODES
|
534 |
+
attn_cls = self.ATTENTION_MODES[attn_mode]
|
535 |
+
self.attn1 = attn_cls(
|
536 |
+
query_dim=dim,
|
537 |
+
heads=n_heads,
|
538 |
+
dim_head=d_head,
|
539 |
+
dropout=dropout,
|
540 |
+
context_dim=context_dim,
|
541 |
+
)
|
542 |
+
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
543 |
+
self.norm1 = nn.LayerNorm(dim)
|
544 |
+
self.norm2 = nn.LayerNorm(dim)
|
545 |
+
self.checkpoint = checkpoint
|
546 |
+
|
547 |
+
def forward(self, x, context=None):
|
548 |
+
return checkpoint(
|
549 |
+
self._forward, (x, context), self.parameters(), self.checkpoint
|
550 |
+
)
|
551 |
+
|
552 |
+
def _forward(self, x, context=None):
|
553 |
+
x = self.attn1(self.norm1(x), context=context) + x
|
554 |
+
x = self.ff(self.norm2(x)) + x
|
555 |
+
return x
|
556 |
+
|
557 |
+
|
558 |
+
class SpatialTransformer(nn.Module):
|
559 |
+
"""
|
560 |
+
Transformer block for image-like data.
|
561 |
+
First, project the input (aka embedding)
|
562 |
+
and reshape to b, t, d.
|
563 |
+
Then apply standard transformer action.
|
564 |
+
Finally, reshape to image
|
565 |
+
NEW: use_linear for more efficiency instead of the 1x1 convs
|
566 |
+
"""
|
567 |
+
|
568 |
+
def __init__(
|
569 |
+
self,
|
570 |
+
in_channels,
|
571 |
+
n_heads,
|
572 |
+
d_head,
|
573 |
+
depth=1,
|
574 |
+
dropout=0.0,
|
575 |
+
context_dim=None,
|
576 |
+
add_context_dim=None,
|
577 |
+
disable_self_attn=False,
|
578 |
+
use_linear=False,
|
579 |
+
attn_type="softmax",
|
580 |
+
use_checkpoint=True,
|
581 |
+
# sdp_backend=SDPBackend.FLASH_ATTENTION
|
582 |
+
sdp_backend=None,
|
583 |
+
):
|
584 |
+
super().__init__()
|
585 |
+
# print(
|
586 |
+
# f"constructing {self.__class__.__name__} of depth {depth} w/ {in_channels} channels and {n_heads} heads"
|
587 |
+
# )
|
588 |
+
from omegaconf import ListConfig
|
589 |
+
|
590 |
+
if exists(context_dim) and not isinstance(context_dim, (list, ListConfig)):
|
591 |
+
context_dim = [context_dim]
|
592 |
+
if exists(context_dim) and isinstance(context_dim, list):
|
593 |
+
if depth != len(context_dim):
|
594 |
+
# print(
|
595 |
+
# f"WARNING: {self.__class__.__name__}: Found context dims {context_dim} of depth {len(context_dim)}, "
|
596 |
+
# f"which does not match the specified 'depth' of {depth}. Setting context_dim to {depth * [context_dim[0]]} now."
|
597 |
+
# )
|
598 |
+
# depth does not match context dims.
|
599 |
+
assert all(
|
600 |
+
map(lambda x: x == context_dim[0], context_dim)
|
601 |
+
), "need homogenous context_dim to match depth automatically"
|
602 |
+
context_dim = depth * [context_dim[0]]
|
603 |
+
elif context_dim is None:
|
604 |
+
context_dim = [None] * depth
|
605 |
+
self.in_channels = in_channels
|
606 |
+
inner_dim = n_heads * d_head
|
607 |
+
self.norm = Normalize(in_channels)
|
608 |
+
if not use_linear:
|
609 |
+
self.proj_in = nn.Conv2d(
|
610 |
+
in_channels, inner_dim, kernel_size=1, stride=1, padding=0
|
611 |
+
)
|
612 |
+
else:
|
613 |
+
self.proj_in = nn.Linear(in_channels, inner_dim)
|
614 |
+
|
615 |
+
self.transformer_blocks = nn.ModuleList(
|
616 |
+
[
|
617 |
+
BasicTransformerBlock(
|
618 |
+
inner_dim,
|
619 |
+
n_heads,
|
620 |
+
d_head,
|
621 |
+
dropout=dropout,
|
622 |
+
context_dim=context_dim[d],
|
623 |
+
add_context_dim=add_context_dim,
|
624 |
+
disable_self_attn=disable_self_attn,
|
625 |
+
attn_mode=attn_type,
|
626 |
+
checkpoint=use_checkpoint,
|
627 |
+
sdp_backend=sdp_backend,
|
628 |
+
)
|
629 |
+
for d in range(depth)
|
630 |
+
]
|
631 |
+
)
|
632 |
+
if not use_linear:
|
633 |
+
self.proj_out = zero_module(
|
634 |
+
nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
635 |
+
)
|
636 |
+
else:
|
637 |
+
# self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
|
638 |
+
self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
|
639 |
+
self.use_linear = use_linear
|
640 |
+
|
641 |
+
def forward(self, x, context=None, add_context=None):
|
642 |
+
# note: if no context is given, cross-attention defaults to self-attention
|
643 |
+
if not isinstance(context, list):
|
644 |
+
context = [context]
|
645 |
+
b, c, h, w = x.shape
|
646 |
+
x_in = x
|
647 |
+
x = self.norm(x)
|
648 |
+
if not self.use_linear:
|
649 |
+
x = self.proj_in(x)
|
650 |
+
x = rearrange(x, "b c h w -> b (h w) c").contiguous()
|
651 |
+
if self.use_linear:
|
652 |
+
x = self.proj_in(x)
|
653 |
+
for i, block in enumerate(self.transformer_blocks):
|
654 |
+
if i > 0 and len(context) == 1:
|
655 |
+
i = 0 # use same context for each block
|
656 |
+
x = block(x, context=context[i], add_context=add_context)
|
657 |
+
if self.use_linear:
|
658 |
+
x = self.proj_out(x)
|
659 |
+
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
|
660 |
+
if not self.use_linear:
|
661 |
+
x = self.proj_out(x)
|
662 |
+
return x + x_in
|
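A hedged usage sketch for SpatialTransformer: it flattens the feature map to (h*w) tokens, runs the transformer blocks with optional cross-attention context, and returns a residual-added map of the original shape. Note that attn1 in BasicTransformerBlock above is hard-wired to the xformers path, so this sketch assumes a CUDA device with xformers installed; all sizes are illustrative.

# Sketch only: 32x32 feature map with 320 channels cross-attending to 77 context tokens.
import torch

st = SpatialTransformer(in_channels=320, n_heads=8, d_head=40, depth=1,
                        context_dim=1024, use_linear=True, use_checkpoint=False).cuda()
feat = torch.randn(2, 320, 32, 32, device="cuda")  # NCHW feature map
ctx = torch.randn(2, 77, 1024, device="cuda")      # context tokens
out = st(feat, context=ctx)
print(out.shape)  # torch.Size([2, 320, 32, 32]) -- residual keeps the input shape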
663 |
+
|
664 |
+
|
665 |
+
def benchmark_attn():
|
666 |
+
# Lets define a helpful benchmarking function:
|
667 |
+
# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
|
668 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
669 |
+
import torch.nn.functional as F
|
670 |
+
import torch.utils.benchmark as benchmark
|
671 |
+
|
672 |
+
def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
|
673 |
+
t0 = benchmark.Timer(
|
674 |
+
stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
|
675 |
+
)
|
676 |
+
return t0.blocked_autorange().mean * 1e6
|
677 |
+
|
678 |
+
# Lets define the hyper-parameters of our input
|
679 |
+
batch_size = 32
|
680 |
+
max_sequence_len = 1024
|
681 |
+
num_heads = 32
|
682 |
+
embed_dimension = 32
|
683 |
+
|
684 |
+
dtype = torch.float16
|
685 |
+
|
686 |
+
query = torch.rand(
|
687 |
+
batch_size,
|
688 |
+
num_heads,
|
689 |
+
max_sequence_len,
|
690 |
+
embed_dimension,
|
691 |
+
device=device,
|
692 |
+
dtype=dtype,
|
693 |
+
)
|
694 |
+
key = torch.rand(
|
695 |
+
batch_size,
|
696 |
+
num_heads,
|
697 |
+
max_sequence_len,
|
698 |
+
embed_dimension,
|
699 |
+
device=device,
|
700 |
+
dtype=dtype,
|
701 |
+
)
|
702 |
+
value = torch.rand(
|
703 |
+
batch_size,
|
704 |
+
num_heads,
|
705 |
+
max_sequence_len,
|
706 |
+
embed_dimension,
|
707 |
+
device=device,
|
708 |
+
dtype=dtype,
|
709 |
+
)
|
710 |
+
|
711 |
+
print(f"q/k/v shape:", query.shape, key.shape, value.shape)
|
712 |
+
|
713 |
+
# Lets explore the speed of each of the 3 implementations
|
714 |
+
from torch.backends.cuda import SDPBackend, sdp_kernel
|
715 |
+
|
716 |
+
# Helpful arguments mapper
|
717 |
+
backend_map = {
|
718 |
+
SDPBackend.MATH: {
|
719 |
+
"enable_math": True,
|
720 |
+
"enable_flash": False,
|
721 |
+
"enable_mem_efficient": False,
|
722 |
+
},
|
723 |
+
SDPBackend.FLASH_ATTENTION: {
|
724 |
+
"enable_math": False,
|
725 |
+
"enable_flash": True,
|
726 |
+
"enable_mem_efficient": False,
|
727 |
+
},
|
728 |
+
SDPBackend.EFFICIENT_ATTENTION: {
|
729 |
+
"enable_math": False,
|
730 |
+
"enable_flash": False,
|
731 |
+
"enable_mem_efficient": True,
|
732 |
+
},
|
733 |
+
}
|
734 |
+
|
735 |
+
from torch.profiler import ProfilerActivity, profile, record_function
|
736 |
+
|
737 |
+
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
|
738 |
+
|
739 |
+
print(
|
740 |
+
f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
|
741 |
+
)
|
742 |
+
with profile(
|
743 |
+
activities=activities, record_shapes=False, profile_memory=True
|
744 |
+
) as prof:
|
745 |
+
with record_function("Default detailed stats"):
|
746 |
+
for _ in range(25):
|
747 |
+
o = F.scaled_dot_product_attention(query, key, value)
|
748 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
749 |
+
|
750 |
+
print(
|
751 |
+
f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
|
752 |
+
)
|
753 |
+
with sdp_kernel(**backend_map[SDPBackend.MATH]):
|
754 |
+
with profile(
|
755 |
+
activities=activities, record_shapes=False, profile_memory=True
|
756 |
+
) as prof:
|
757 |
+
with record_function("Math implmentation stats"):
|
758 |
+
for _ in range(25):
|
759 |
+
o = F.scaled_dot_product_attention(query, key, value)
|
760 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
761 |
+
|
762 |
+
with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
|
763 |
+
try:
|
764 |
+
print(
|
765 |
+
f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
|
766 |
+
)
|
767 |
+
except RuntimeError:
|
768 |
+
print("FlashAttention is not supported. See warnings for reasons.")
|
769 |
+
with profile(
|
770 |
+
activities=activities, record_shapes=False, profile_memory=True
|
771 |
+
) as prof:
|
772 |
+
with record_function("FlashAttention stats"):
|
773 |
+
for _ in range(25):
|
774 |
+
o = F.scaled_dot_product_attention(query, key, value)
|
775 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
776 |
+
|
777 |
+
with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
|
778 |
+
try:
|
779 |
+
print(
|
780 |
+
f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
|
781 |
+
)
|
782 |
+
except RuntimeError:
|
783 |
+
print("EfficientAttention is not supported. See warnings for reasons.")
|
784 |
+
with profile(
|
785 |
+
activities=activities, record_shapes=False, profile_memory=True
|
786 |
+
) as prof:
|
787 |
+
with record_function("EfficientAttention stats"):
|
788 |
+
for _ in range(25):
|
789 |
+
o = F.scaled_dot_product_attention(query, key, value)
|
790 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
791 |
+
|
792 |
+
|
793 |
+
def run_model(model, x, context):
|
794 |
+
return model(x, context)
|
795 |
+
|
796 |
+
|
797 |
+
def benchmark_transformer_blocks():
|
798 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
799 |
+
import torch.utils.benchmark as benchmark
|
800 |
+
|
801 |
+
def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
|
802 |
+
t0 = benchmark.Timer(
|
803 |
+
stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
|
804 |
+
)
|
805 |
+
return t0.blocked_autorange().mean * 1e6
|
806 |
+
|
807 |
+
checkpoint = True
|
808 |
+
compile = False
|
809 |
+
|
810 |
+
batch_size = 32
|
811 |
+
h, w = 64, 64
|
812 |
+
context_len = 77
|
813 |
+
embed_dimension = 1024
|
814 |
+
context_dim = 1024
|
815 |
+
d_head = 64
|
816 |
+
|
817 |
+
transformer_depth = 4
|
818 |
+
|
819 |
+
n_heads = embed_dimension // d_head
|
820 |
+
|
821 |
+
dtype = torch.float16
|
822 |
+
|
823 |
+
model_native = SpatialTransformer(
|
824 |
+
embed_dimension,
|
825 |
+
n_heads,
|
826 |
+
d_head,
|
827 |
+
context_dim=context_dim,
|
828 |
+
use_linear=True,
|
829 |
+
use_checkpoint=checkpoint,
|
830 |
+
attn_type="softmax",
|
831 |
+
depth=transformer_depth,
|
832 |
+
sdp_backend=SDPBackend.FLASH_ATTENTION,
|
833 |
+
).to(device)
|
834 |
+
model_efficient_attn = SpatialTransformer(
|
835 |
+
embed_dimension,
|
836 |
+
n_heads,
|
837 |
+
d_head,
|
838 |
+
context_dim=context_dim,
|
839 |
+
use_linear=True,
|
840 |
+
depth=transformer_depth,
|
841 |
+
use_checkpoint=checkpoint,
|
842 |
+
attn_type="softmax-xformers",
|
843 |
+
).to(device)
|
844 |
+
if not checkpoint and compile:
|
845 |
+
print("compiling models")
|
846 |
+
model_native = torch.compile(model_native)
|
847 |
+
model_efficient_attn = torch.compile(model_efficient_attn)
|
848 |
+
|
849 |
+
x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
|
850 |
+
c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)
|
851 |
+
|
852 |
+
from torch.profiler import ProfilerActivity, profile, record_function
|
853 |
+
|
854 |
+
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
|
855 |
+
|
856 |
+
with torch.autocast("cuda"):
|
857 |
+
print(
|
858 |
+
f"The native model runs in {benchmark_torch_function_in_microseconds(model_native.forward, x, c):.3f} microseconds"
|
859 |
+
)
|
860 |
+
print(
|
861 |
+
f"The efficientattn model runs in {benchmark_torch_function_in_microseconds(model_efficient_attn.forward, x, c):.3f} microseconds"
|
862 |
+
)
|
863 |
+
|
864 |
+
print(75 * "+")
|
865 |
+
print("NATIVE")
|
866 |
+
print(75 * "+")
|
867 |
+
torch.cuda.reset_peak_memory_stats()
|
868 |
+
with profile(
|
869 |
+
activities=activities, record_shapes=False, profile_memory=True
|
870 |
+
) as prof:
|
871 |
+
with record_function("NativeAttention stats"):
|
872 |
+
for _ in range(25):
|
873 |
+
model_native(x, c)
|
874 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
875 |
+
print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by native block")
|
876 |
+
|
877 |
+
print(75 * "+")
|
878 |
+
print("Xformers")
|
879 |
+
print(75 * "+")
|
880 |
+
torch.cuda.reset_peak_memory_stats()
|
881 |
+
with profile(
|
882 |
+
activities=activities, record_shapes=False, profile_memory=True
|
883 |
+
) as prof:
|
884 |
+
with record_function("xformers stats"):
|
885 |
+
for _ in range(25):
|
886 |
+
model_efficient_attn(x, c)
|
887 |
+
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
|
888 |
+
print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by xformers block")
|
889 |
+
|
890 |
+
|
891 |
+
def test01():
|
892 |
+
# conv1x1 vs linear
|
893 |
+
from ..util import count_params
|
894 |
+
|
895 |
+
conv = nn.Conv2d(3, 32, kernel_size=1).cuda()
|
896 |
+
print(count_params(conv))
|
897 |
+
linear = torch.nn.Linear(3, 32).cuda()
|
898 |
+
print(count_params(linear))
|
899 |
+
|
900 |
+
print(conv.weight.shape)
|
901 |
+
|
902 |
+
# use same initialization
|
903 |
+
linear.weight = torch.nn.Parameter(conv.weight.squeeze(-1).squeeze(-1))
|
904 |
+
linear.bias = torch.nn.Parameter(conv.bias)
|
905 |
+
|
906 |
+
print(linear.weight.shape)
|
907 |
+
|
908 |
+
x = torch.randn(11, 3, 64, 64).cuda()
|
909 |
+
|
910 |
+
xr = rearrange(x, "b c h w -> b (h w) c").contiguous()
|
911 |
+
print(xr.shape)
|
912 |
+
out_linear = linear(xr)
|
913 |
+
print(out_linear.mean(), out_linear.shape)
|
914 |
+
|
915 |
+
out_conv = conv(x)
|
916 |
+
print(out_conv.mean(), out_conv.shape)
|
917 |
+
print("done with test01.\n")
|
918 |
+
|
919 |
+
|
920 |
+
def test02():
|
921 |
+
# try cosine flash attention
|
922 |
+
import time
|
923 |
+
|
924 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
925 |
+
torch.backends.cudnn.allow_tf32 = True
|
926 |
+
torch.backends.cudnn.benchmark = True
|
927 |
+
print("testing cosine flash attention...")
|
928 |
+
DIM = 1024
|
929 |
+
SEQLEN = 4096
|
930 |
+
BS = 16
|
931 |
+
|
932 |
+
print(" softmax (vanilla) first...")
|
933 |
+
model = BasicTransformerBlock(
|
934 |
+
dim=DIM,
|
935 |
+
n_heads=16,
|
936 |
+
d_head=64,
|
937 |
+
dropout=0.0,
|
938 |
+
context_dim=None,
|
939 |
+
attn_mode="softmax",
|
940 |
+
).cuda()
|
941 |
+
try:
|
942 |
+
x = torch.randn(BS, SEQLEN, DIM).cuda()
|
943 |
+
tic = time.time()
|
944 |
+
y = model(x)
|
945 |
+
toc = time.time()
|
946 |
+
print(y.shape, toc - tic)
|
947 |
+
except RuntimeError as e:
|
948 |
+
# likely oom
|
949 |
+
print(str(e))
|
950 |
+
|
951 |
+
print("\n now flash-cosine...")
|
952 |
+
model = BasicTransformerBlock(
|
953 |
+
dim=DIM,
|
954 |
+
n_heads=16,
|
955 |
+
d_head=64,
|
956 |
+
dropout=0.0,
|
957 |
+
context_dim=None,
|
958 |
+
attn_mode="flash-cosine",
|
959 |
+
).cuda()
|
960 |
+
x = torch.randn(BS, SEQLEN, DIM).cuda()
|
961 |
+
tic = time.time()
|
962 |
+
y = model(x)
|
963 |
+
toc = time.time()
|
964 |
+
print(y.shape, toc - tic)
|
965 |
+
print("done with test02.\n")
|
966 |
+
|
967 |
+
|
968 |
+
if __name__ == "__main__":
|
969 |
+
# test01()
|
970 |
+
# test02()
|
971 |
+
# test03()
|
972 |
+
|
973 |
+
# benchmark_attn()
|
974 |
+
benchmark_transformer_blocks()
|
975 |
+
|
976 |
+
print("done.")
|
sgm/modules/autoencoding/__init__.py
ADDED
File without changes
|
sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (161 Bytes). View file
|
|
sgm/modules/autoencoding/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (172 Bytes). View file
|
|
sgm/modules/autoencoding/losses/__init__.py
ADDED
@@ -0,0 +1,246 @@
|
1 |
+
from typing import Any, Union
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from einops import rearrange
|
6 |
+
from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
|
7 |
+
from taming.modules.losses.lpips import LPIPS
|
8 |
+
from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
|
9 |
+
|
10 |
+
from ....util import default, instantiate_from_config
|
11 |
+
|
12 |
+
|
13 |
+
def adopt_weight(weight, global_step, threshold=0, value=0.0):
|
14 |
+
if global_step < threshold:
|
15 |
+
weight = value
|
16 |
+
return weight
|
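adopt_weight simply overrides a loss weight (to 0.0 by default) until global_step reaches the warm-up threshold; a small sketch:

# Sketch: the discriminator term stays at 0.0 until step 5000, then uses 1.0.
for step in (0, 4999, 5000, 12000):
    print(step, adopt_weight(1.0, step, threshold=5000))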
17 |
+
|
18 |
+
|
19 |
+
class LatentLPIPS(nn.Module):
|
20 |
+
def __init__(
|
21 |
+
self,
|
22 |
+
decoder_config,
|
23 |
+
perceptual_weight=1.0,
|
24 |
+
latent_weight=1.0,
|
25 |
+
scale_input_to_tgt_size=False,
|
26 |
+
scale_tgt_to_input_size=False,
|
27 |
+
perceptual_weight_on_inputs=0.0,
|
28 |
+
):
|
29 |
+
super().__init__()
|
30 |
+
self.scale_input_to_tgt_size = scale_input_to_tgt_size
|
31 |
+
self.scale_tgt_to_input_size = scale_tgt_to_input_size
|
32 |
+
self.init_decoder(decoder_config)
|
33 |
+
self.perceptual_loss = LPIPS().eval()
|
34 |
+
self.perceptual_weight = perceptual_weight
|
35 |
+
self.latent_weight = latent_weight
|
36 |
+
self.perceptual_weight_on_inputs = perceptual_weight_on_inputs
|
37 |
+
|
38 |
+
def init_decoder(self, config):
|
39 |
+
self.decoder = instantiate_from_config(config)
|
40 |
+
if hasattr(self.decoder, "encoder"):
|
41 |
+
del self.decoder.encoder
|
42 |
+
|
43 |
+
def forward(self, latent_inputs, latent_predictions, image_inputs, split="train"):
|
44 |
+
log = dict()
|
45 |
+
loss = (latent_inputs - latent_predictions) ** 2
|
46 |
+
log[f"{split}/latent_l2_loss"] = loss.mean().detach()
|
47 |
+
image_reconstructions = None
|
48 |
+
if self.perceptual_weight > 0.0:
|
49 |
+
image_reconstructions = self.decoder.decode(latent_predictions)
|
50 |
+
image_targets = self.decoder.decode(latent_inputs)
|
51 |
+
perceptual_loss = self.perceptual_loss(
|
52 |
+
image_targets.contiguous(), image_reconstructions.contiguous()
|
53 |
+
)
|
54 |
+
loss = (
|
55 |
+
self.latent_weight * loss.mean()
|
56 |
+
+ self.perceptual_weight * perceptual_loss.mean()
|
57 |
+
)
|
58 |
+
log[f"{split}/perceptual_loss"] = perceptual_loss.mean().detach()
|
59 |
+
|
60 |
+
if self.perceptual_weight_on_inputs > 0.0:
|
61 |
+
image_reconstructions = default(
|
62 |
+
image_reconstructions, self.decoder.decode(latent_predictions)
|
63 |
+
)
|
64 |
+
if self.scale_input_to_tgt_size:
|
65 |
+
image_inputs = torch.nn.functional.interpolate(
|
66 |
+
image_inputs,
|
67 |
+
image_reconstructions.shape[2:],
|
68 |
+
mode="bicubic",
|
69 |
+
antialias=True,
|
70 |
+
)
|
71 |
+
elif self.scale_tgt_to_input_size:
|
72 |
+
image_reconstructions = torch.nn.functional.interpolate(
|
73 |
+
image_reconstructions,
|
74 |
+
image_inputs.shape[2:],
|
75 |
+
mode="bicubic",
|
76 |
+
antialias=True,
|
77 |
+
)
|
78 |
+
|
79 |
+
perceptual_loss2 = self.perceptual_loss(
|
80 |
+
image_inputs.contiguous(), image_reconstructions.contiguous()
|
81 |
+
)
|
82 |
+
loss = loss + self.perceptual_weight_on_inputs * perceptual_loss2.mean()
|
83 |
+
log[f"{split}/perceptual_loss_on_inputs"] = perceptual_loss2.mean().detach()
|
84 |
+
return loss, log
|
85 |
+
|
86 |
+
|
87 |
+
class GeneralLPIPSWithDiscriminator(nn.Module):
|
88 |
+
def __init__(
|
89 |
+
self,
|
90 |
+
disc_start: int,
|
91 |
+
logvar_init: float = 0.0,
|
92 |
+
pixelloss_weight=1.0,
|
93 |
+
disc_num_layers: int = 3,
|
94 |
+
disc_in_channels: int = 3,
|
95 |
+
disc_factor: float = 1.0,
|
96 |
+
disc_weight: float = 1.0,
|
97 |
+
perceptual_weight: float = 1.0,
|
98 |
+
disc_loss: str = "hinge",
|
99 |
+
scale_input_to_tgt_size: bool = False,
|
100 |
+
dims: int = 2,
|
101 |
+
learn_logvar: bool = False,
|
102 |
+
regularization_weights: Union[None, dict] = None,
|
103 |
+
):
|
104 |
+
super().__init__()
|
105 |
+
self.dims = dims
|
106 |
+
if self.dims > 2:
|
107 |
+
print(
|
108 |
+
f"running with dims={dims}. This means that for perceptual loss calculation, "
|
109 |
+
f"the LPIPS loss will be applied to each frame independently. "
|
110 |
+
)
|
111 |
+
self.scale_input_to_tgt_size = scale_input_to_tgt_size
|
112 |
+
assert disc_loss in ["hinge", "vanilla"]
|
113 |
+
self.pixel_weight = pixelloss_weight
|
114 |
+
self.perceptual_loss = LPIPS().eval()
|
115 |
+
self.perceptual_weight = perceptual_weight
|
116 |
+
# output log variance
|
117 |
+
self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
|
118 |
+
self.learn_logvar = learn_logvar
|
119 |
+
|
120 |
+
self.discriminator = NLayerDiscriminator(
|
121 |
+
input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False
|
122 |
+
).apply(weights_init)
|
123 |
+
self.discriminator_iter_start = disc_start
|
124 |
+
self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
|
125 |
+
self.disc_factor = disc_factor
|
126 |
+
self.discriminator_weight = disc_weight
|
127 |
+
self.regularization_weights = default(regularization_weights, {})
|
128 |
+
|
129 |
+
def get_trainable_parameters(self) -> Any:
|
130 |
+
return self.discriminator.parameters()
|
131 |
+
|
132 |
+
def get_trainable_autoencoder_parameters(self) -> Any:
|
133 |
+
if self.learn_logvar:
|
134 |
+
yield self.logvar
|
135 |
+
yield from ()
|
136 |
+
|
137 |
+
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
|
138 |
+
if last_layer is not None:
|
139 |
+
nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
|
140 |
+
g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
|
141 |
+
else:
|
142 |
+
nll_grads = torch.autograd.grad(
|
143 |
+
nll_loss, self.last_layer[0], retain_graph=True
|
144 |
+
)[0]
|
145 |
+
g_grads = torch.autograd.grad(
|
146 |
+
g_loss, self.last_layer[0], retain_graph=True
|
147 |
+
)[0]
|
148 |
+
|
149 |
+
d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
|
150 |
+
d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
|
151 |
+
d_weight = d_weight * self.discriminator_weight
|
152 |
+
return d_weight
|
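calculate_adaptive_weight balances the GAN term against the reconstruction term by the ratio of their gradient norms at the decoder's last layer; a hedged, self-contained numeric sketch of the same formula outside the class:

# Sketch: d_weight = ||grad(nll)|| / (||grad(g)|| + 1e-4), clamped, scaled by disc_weight.
import torch

last_layer = torch.randn(8, requires_grad=True)
nll_loss = (last_layer ** 2).sum()
g_loss = last_layer.sum()
nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
d_weight = torch.clamp(torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4), 0.0, 1e4)
print(float(d_weight))  # this factor scales the g_loss contribution in the total loss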
153 |
+
|
154 |
+
def forward(
|
155 |
+
self,
|
156 |
+
regularization_log,
|
157 |
+
inputs,
|
158 |
+
reconstructions,
|
159 |
+
optimizer_idx,
|
160 |
+
global_step,
|
161 |
+
last_layer=None,
|
162 |
+
split="train",
|
163 |
+
weights=None,
|
164 |
+
):
|
165 |
+
if self.scale_input_to_tgt_size:
|
166 |
+
inputs = torch.nn.functional.interpolate(
|
167 |
+
inputs, reconstructions.shape[2:], mode="bicubic", antialias=True
|
168 |
+
)
|
169 |
+
|
170 |
+
if self.dims > 2:
|
171 |
+
inputs, reconstructions = map(
|
172 |
+
lambda x: rearrange(x, "b c t h w -> (b t) c h w"),
|
173 |
+
(inputs, reconstructions),
|
174 |
+
)
|
175 |
+
|
176 |
+
rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
|
177 |
+
if self.perceptual_weight > 0:
|
178 |
+
p_loss = self.perceptual_loss(
|
179 |
+
inputs.contiguous(), reconstructions.contiguous()
|
180 |
+
)
|
181 |
+
rec_loss = rec_loss + self.perceptual_weight * p_loss
|
182 |
+
|
183 |
+
nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
|
184 |
+
weighted_nll_loss = nll_loss
|
185 |
+
if weights is not None:
|
186 |
+
weighted_nll_loss = weights * nll_loss
|
187 |
+
weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
|
188 |
+
nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
|
189 |
+
|
190 |
+
# now the GAN part
|
191 |
+
if optimizer_idx == 0:
|
192 |
+
# generator update
|
193 |
+
logits_fake = self.discriminator(reconstructions.contiguous())
|
194 |
+
g_loss = -torch.mean(logits_fake)
|
195 |
+
|
196 |
+
if self.disc_factor > 0.0:
|
197 |
+
try:
|
198 |
+
d_weight = self.calculate_adaptive_weight(
|
199 |
+
nll_loss, g_loss, last_layer=last_layer
|
200 |
+
)
|
201 |
+
except RuntimeError:
|
202 |
+
assert not self.training
|
203 |
+
d_weight = torch.tensor(0.0)
|
204 |
+
else:
|
205 |
+
d_weight = torch.tensor(0.0)
|
206 |
+
|
207 |
+
disc_factor = adopt_weight(
|
208 |
+
self.disc_factor, global_step, threshold=self.discriminator_iter_start
|
209 |
+
)
|
210 |
+
loss = weighted_nll_loss + d_weight * disc_factor * g_loss
|
211 |
+
log = dict()
|
212 |
+
for k in regularization_log:
|
213 |
+
if k in self.regularization_weights:
|
214 |
+
loss = loss + self.regularization_weights[k] * regularization_log[k]
|
215 |
+
log[f"{split}/{k}"] = regularization_log[k].detach().mean()
|
216 |
+
|
217 |
+
log.update(
|
218 |
+
{
|
219 |
+
"{}/total_loss".format(split): loss.clone().detach().mean(),
|
220 |
+
"{}/logvar".format(split): self.logvar.detach(),
|
221 |
+
"{}/nll_loss".format(split): nll_loss.detach().mean(),
|
222 |
+
"{}/rec_loss".format(split): rec_loss.detach().mean(),
|
223 |
+
"{}/d_weight".format(split): d_weight.detach(),
|
224 |
+
"{}/disc_factor".format(split): torch.tensor(disc_factor),
|
225 |
+
"{}/g_loss".format(split): g_loss.detach().mean(),
|
226 |
+
}
|
227 |
+
)
|
228 |
+
|
229 |
+
return loss, log
|
230 |
+
|
231 |
+
if optimizer_idx == 1:
|
232 |
+
# second pass for discriminator update
|
233 |
+
logits_real = self.discriminator(inputs.contiguous().detach())
|
234 |
+
logits_fake = self.discriminator(reconstructions.contiguous().detach())
|
235 |
+
|
236 |
+
disc_factor = adopt_weight(
|
237 |
+
self.disc_factor, global_step, threshold=self.discriminator_iter_start
|
238 |
+
)
|
239 |
+
d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
|
240 |
+
|
241 |
+
log = {
|
242 |
+
"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
|
243 |
+
"{}/logits_real".format(split): logits_real.detach().mean(),
|
244 |
+
"{}/logits_fake".format(split): logits_fake.detach().mean(),
|
245 |
+
}
|
246 |
+
return d_loss, log
|