diff --git a/.gitignore b/.gitignore
index 3c76fd4add404777ce8d8130784b3ad40911382c..04dc5a589e6655d965d09876f45ec6bd99b196bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
-# ignored folders
-models
-
 # ignored folders
 tmp/*
 
@@ -23,7 +20,6 @@ version.py
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
-*.pyc
 *.py[cod]
 *$py.class
 
@@ -125,4 +121,4 @@ venv.bak/
 /site
 
 # mypy
-.mypy_cache/
\ No newline at end of file
+.mypy_cache/
diff --git a/app.py b/app.py
index d72a6e917ab82ba7df5b96ae8bb63ed79c5fe1d9..25436c6935d5e3a6aba4a617c8fa86b7b8853b11 100755
--- a/app.py
+++ b/app.py
@@ -1,29 +1,44 @@
+# demo inspired by https://huggingface.co/spaces/lambdalabs/image-mixer-demo
+import argparse
+import copy
 import os
-# os.system('pip3 install openmim')
-os.system('mim install mmcv-full==1.7.0')
-# os.system('pip3 install mmpose')
-# os.system('pip3 install mmdet')
-# os.system('pip3 install gradio==3.19.1')
-#os.system('pip3 install psutil')
-
-from demo.model import Model_all
+import shlex
+import subprocess
+from functools import partial
+from itertools import chain
+
+import cv2
 import gradio as gr
-from demo.demos import create_demo_keypose, create_demo_sketch, create_demo_draw, create_demo_seg, create_demo_depth, create_demo_depth_keypose, create_demo_color, create_demo_color_sketch, create_demo_openpose, create_demo_style_sketch, create_demo_canny
 import torch
-import subprocess
-import shlex
+from basicsr.utils import tensor2img
 from huggingface_hub import hf_hub_url
+from pytorch_lightning import seed_everything
+from torch import autocast
 
+from ldm.inference_base import (DEFAULT_NEGATIVE_PROMPT, diffusion_inference,
+                                get_adapters, get_sd_models)
+from ldm.modules.extra_condition import api
+from ldm.modules.extra_condition.api import (ExtraCondition,
+                                             get_adapter_feature,
+                                             get_cond_model)
+
+torch.set_grad_enabled(False)
+
+supported_cond = ['style', 'color', 'canny', 'sketch', 'openpose', 'depth']
+
+# download the checkpoints
 urls = {
-    'TencentARC/T2I-Adapter':['models/t2iadapter_keypose_sd14v1.pth', 'models/t2iadapter_color_sd14v1.pth', 'models/t2iadapter_openpose_sd14v1.pth', 'models/t2iadapter_seg_sd14v1.pth', 'models/t2iadapter_sketch_sd14v1.pth', 'models/t2iadapter_depth_sd14v1.pth','third-party-models/body_pose_model.pth', "models/t2iadapter_style_sd14v1.pth", "models/t2iadapter_canny_sd14v1.pth"],
-    'CompVis/stable-diffusion-v-1-4-original':['sd-v1-4.ckpt'],
-    'andite/anything-v4.0':['anything-v4.0-pruned.ckpt', 'anything-v4.0.vae.pt'],
+    'TencentARC/T2I-Adapter': [
+        'models/t2iadapter_keypose_sd14v1.pth', 'models/t2iadapter_color_sd14v1.pth',
+        'models/t2iadapter_openpose_sd14v1.pth', 'models/t2iadapter_seg_sd14v1.pth',
+        'models/t2iadapter_sketch_sd14v1.pth', 'models/t2iadapter_depth_sd14v1.pth',
+        'third-party-models/body_pose_model.pth', "models/t2iadapter_style_sd14v1.pth",
+        "models/t2iadapter_canny_sd14v1.pth", "third-party-models/table5_pidinet.pth"
+    ],
+    'runwayml/stable-diffusion-v1-5': ['v1-5-pruned-emaonly.ckpt'],
+    'andite/anything-v4.0': ['anything-v4.0-pruned.ckpt', 'anything-v4.0.vae.pt'],
 }
-urls_mmpose = [
-    'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth',
-    'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth',
-    'https://github.com/kazuto1011/deeplab-pytorch/releases/download/v1.0/deeplabv2_resnet101_msc-cocostuff164k-100000.pth'
-]
+
 if os.path.exists('models') == False:
     os.mkdir('models')
 for repo in urls:
@@ -31,58 +46,257 @@ for repo in urls:
     for file in files:
         url = hf_hub_url(repo, file)
         name_ckp = url.split('/')[-1]
-        save_path = os.path.join('models',name_ckp)
+        save_path = os.path.join('models', name_ckp)
         if os.path.exists(save_path) == False:
             subprocess.run(shlex.split(f'wget {url} -O {save_path}'))
 
-for url in urls_mmpose:
-    name_ckp = url.split('/')[-1]
-    save_path = os.path.join('models',name_ckp)
-    if os.path.exists(save_path) == False:
-        subprocess.run(shlex.split(f'wget {url} -O {save_path}'))
+# config: command-line options plus fixed demo settings, all stored on the global_opt namespace
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--sd_ckpt',
+    type=str,
+    default='models/v1-5-pruned-emaonly.ckpt',  # saved under models/ by the download loop above
+    help='path to checkpoint of stable diffusion model, both .ckpt and .safetensor are supported',
+)
+parser.add_argument(
+    '--vae_ckpt',
+    type=str,
+    default=None,
+    help='vae checkpoint, anime SD models usually have seperate vae ckpt that need to be loaded',
+)
+global_opt = parser.parse_args()  # run() deep-copies this namespace for every request
+global_opt.config = 'configs/stable-diffusion/sd-v1-inference.yaml'
+for cond_name in supported_cond:  # one adapter checkpoint path per condition, e.g. canny_adapter_ckpt
+    setattr(global_opt, f'{cond_name}_adapter_ckpt', f'models/t2iadapter_{cond_name}_sd14v1.pth')
+global_opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+global_opt.max_resolution = 512 * 512  # NOTE(review): presumably a pixel-count cap on inputs - confirm in ldm
+global_opt.sampler = 'ddim'
+global_opt.cond_weight = 1.0  # NOTE(review): presumably a default; run() sets each adapter's 'cond_weight' from the UI
+global_opt.C = 4  # NOTE(review): presumably the latent channel count - confirm against the inference code
+global_opt.f = 8  # NOTE(review): presumably the image-to-latent downsampling factor - confirm
+
+# stable-diffusion model and sampler, loaded once at import time
+sd_model, sampler = get_sd_models(global_opt)
+# adapters and condition-processing models: filled lazily by run(), keyed by condition name
+adapters = {}
+cond_models = {}
+torch.cuda.empty_cache()
+
+
+def run(*args):
+    """Generate images from the flat list of Gradio input values.
+
+    ``args`` holds, in order, the input-type choices, images, condition maps and
+    condition weights (``len(supported_cond)`` of each), then 8 generation options.
+    Returns the generated images and the processed condition maps.
+    """
+    with torch.inference_mode(), \
+            sd_model.ema_scope(), \
+            autocast('cuda'):
+        inps = [args[i:i + len(supported_cond)] for i in range(0, len(args) - 8, len(supported_cond))]
+
+        opt = copy.deepcopy(global_opt)
+        opt.prompt, opt.neg_prompt, opt.scale, opt.n_samples, opt.seed, opt.steps, opt.resize_short_edge, opt.cond_tau \
+            = args[-8:]
+
+        conds = []
+        activated_conds = []
+
+        ims1 = []
+        ims2 = []
+        # The first structure input (idx > 1) that has an image sets the common size. It stays
+        # None when there is none, so resizing is skipped rather than failing on unbound h/w.
+        ref_size = None
+        for idx, (b, im1, im2, cond_weight) in enumerate(zip(*inps)):
+            if idx > 1 and (im1 is not None or im2 is not None):
+                h, w, _ = (im1 if im1 is not None else im2).shape
+                ref_size = (w, h)  # cv2.resize expects (width, height)
+                break
+        # resize all the images to the same size
+        for idx, (b, im1, im2, cond_weight) in enumerate(zip(*inps)):
+            if idx > 0 and ref_size is not None:  # the style input (idx 0) is never resized
+                if im1 is not None:
+                    im1 = cv2.resize(im1, ref_size, interpolation=cv2.INTER_CUBIC)
+                if im2 is not None:
+                    im2 = cv2.resize(im2, ref_size, interpolation=cv2.INTER_CUBIC)
+            ims1.append(im1)
+            ims2.append(im2)
+
+        for idx, (b, _, _, cond_weight) in enumerate(zip(*inps)):
+            cond_name = supported_cond[idx]
+            if b == 'Nothing':
+                if cond_name in adapters:
+                    adapters[cond_name]['model'] = adapters[cond_name]['model'].cpu()
+            else:
+                activated_conds.append(cond_name)
+                if cond_name in adapters:
+                    adapters[cond_name]['model'] = adapters[cond_name]['model'].to(opt.device)
+                else:
+                    adapters[cond_name] = get_adapters(opt, getattr(ExtraCondition, cond_name))
+                adapters[cond_name]['cond_weight'] = cond_weight
+
+                process_cond_module = getattr(api, f'get_cond_{cond_name}')
 
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model = Model_all(device)
+                if b == 'Image':
+                    if cond_name not in cond_models:
+                        cond_models[cond_name] = get_cond_model(opt, getattr(ExtraCondition, cond_name))
+                    conds.append(process_cond_module(opt, ims1[idx], 'image', cond_models[cond_name]))
+                else:
+                    conds.append(process_cond_module(opt, ims2[idx], cond_name, None))
 
-DESCRIPTION = '''# T2I-Adapter
+        adapter_features, append_to_context = get_adapter_feature(
+            conds, [adapters[cond_name] for cond_name in activated_conds])
 
-Gradio demo for **T2I-Adapter**: [[GitHub]](https://github.com/TencentARC/T2I-Adapter), [[Paper]](https://arxiv.org/abs/2302.08453).
+        output_conds = []
+        for cond in conds:
+            output_conds.append(tensor2img(cond, rgb2bgr=False))
 
-It also supports **multiple adapters** in the follwing tabs showing **"A adapter + B adapter"**.
+        ims = []
+        seed_everything(opt.seed)
+        for _ in range(opt.n_samples):
+            result = diffusion_inference(opt, sd_model, sampler, adapter_features, append_to_context)
+            ims.append(tensor2img(result, rgb2bgr=False))
 
-If T2I-Adapter is helpful, please help to ⭐ the [Github Repo](https://github.com/TencentARC/T2I-Adapter) and recommend it to your friends 😊
-'''
+        # Clear GPU memory cache so less likely to OOM
+        torch.cuda.empty_cache()
+        return ims, output_conds
+
+
+def change_visible(im1, im2, val):
+    """Build the visibility updates for one condition's two image widgets.
+
+    "Image" shows ``im1``, "Nothing" hides both, and any other value shows
+    ``im2``. Returns a dict mapping each component to its ``gr.update``.
+    """
+    show_im1 = val == "Image"
+    show_im2 = val not in ("Image", "Nothing")
+    return {
+        im1: gr.update(visible=show_im1),
+        im2: gr.update(visible=show_im2),
+    }
+
+
+DESCRIPTION = '# [Composable T2I-Adapter](https://github.com/TencentARC/T2I-Adapter)'
+
+DESCRIPTION += f'<p>Gradio demo for **T2I-Adapter**: [[GitHub]](https://github.com/TencentARC/T2I-Adapter), [[Paper]](https://arxiv.org/abs/2302.08453). If T2I-Adapter is helpful, please help to ⭐ the [Github Repo](https://github.com/TencentARC/T2I-Adapter) and recommend it to your friends 😊 </p>'
+
+DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/Adapter/T2I-Adapter?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 
 with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
-    
-    gr.HTML("""<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
-    <br/>
-    <a href="https://huggingface.co/spaces/Adapter/T2I-Adapter?duplicate=true">
-    <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-    <p/>""")
-
-    with gr.Tabs():
-        with gr.TabItem('Openpose'):
-            create_demo_openpose(model.process_openpose)
-        with gr.TabItem('Keypose'):
-            create_demo_keypose(model.process_keypose)
-        with gr.TabItem('Canny'):
-            create_demo_canny(model.process_canny)
-        with gr.TabItem('Sketch'):
-            create_demo_sketch(model.process_sketch)
-        with gr.TabItem('Draw'):
-            create_demo_draw(model.process_draw)
-        with gr.TabItem('Depth'):
-            create_demo_depth(model.process_depth)
-        with gr.TabItem('Depth + Keypose'):
-            create_demo_depth_keypose(model.process_depth_keypose)
-        with gr.TabItem('Color'):
-            create_demo_color(model.process_color)
-        with gr.TabItem('Color + Sketch'):
-            create_demo_color_sketch(model.process_color_sketch)
-        with gr.TabItem('Style + Sketch'):
-            create_demo_style_sketch(model.process_style_sketch)
-        with gr.TabItem('Segmentation'):
-            create_demo_seg(model.process_seg)
-demo.queue().launch(debug=True, server_name='0.0.0.0')
\ No newline at end of file
+    # Per-condition widgets, appended in supported_cond order so run() can match them by index
+    btns = []
+    ims1 = []
+    ims2 = []
+    cond_weights = []
+
+    with gr.Row():
+        with gr.Column(scale=1.9):
+            with gr.Box():
+                gr.Markdown("<h5><center>Style & Color</center></h5>")
+                with gr.Row():
+                    for cond_name in supported_cond[:2]:  # 'style' and 'color'
+                        with gr.Box():
+                            with gr.Column():
+                                if cond_name == 'style':  # style offers only a raw image, no condition-map choice
+                                    btn1 = gr.Radio(
+                                        choices=["Image", "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                else:
+                                    btn1 = gr.Radio(
+                                        choices=["Image", cond_name, "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                im1 = gr.Image(
+                                    source='upload', label="Image", interactive=True, visible=False, type="numpy")
+                                im2 = gr.Image(
+                                    source='upload', label=cond_name, interactive=True, visible=False, type="numpy")
+                                cond_weight = gr.Slider(
+                                    label="Condition weight",
+                                    minimum=0,
+                                    maximum=5,
+                                    step=0.05,
+                                    value=1,
+                                    interactive=True)
+
+                                fn = partial(change_visible, im1, im2)  # radio value arrives as the third argument
+                                btn1.change(fn=fn, inputs=[btn1], outputs=[im1, im2], queue=False)
+
+                                btns.append(btn1)
+                                ims1.append(im1)
+                                ims2.append(im2)
+                                cond_weights.append(cond_weight)
+        with gr.Column(scale=4):
+            with gr.Box():
+                gr.Markdown("<h5><center>Structure</center></h5>")
+                with gr.Row():
+                    for cond_name in supported_cond[2:6]:  # 'canny', 'sketch', 'openpose', 'depth'
+                        with gr.Box():
+                            with gr.Column():
+                                if cond_name == 'openpose':  # shown as 'pose'; run() treats any non-Image/Nothing value alike
+                                    btn1 = gr.Radio(
+                                        choices=["Image", 'pose', "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                else:
+                                    btn1 = gr.Radio(
+                                        choices=["Image", cond_name, "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                im1 = gr.Image(
+                                    source='upload', label="Image", interactive=True, visible=False, type="numpy")
+                                im2 = gr.Image(
+                                    source='upload', label=cond_name, interactive=True, visible=False, type="numpy")
+                                cond_weight = gr.Slider(
+                                    label="Condition weight",
+                                    minimum=0,
+                                    maximum=5,
+                                    step=0.05,
+                                    value=1,
+                                    interactive=True)
+
+                                fn = partial(change_visible, im1, im2)  # radio value arrives as the third argument
+                                btn1.change(fn=fn, inputs=[btn1], outputs=[im1, im2], queue=False)
+
+                                btns.append(btn1)
+                                ims1.append(im1)
+                                ims2.append(im2)
+                                cond_weights.append(cond_weight)
+
+    with gr.Column():
+        prompt = gr.Textbox(label="Prompt")
+
+        with gr.Accordion('Advanced options', open=False):
+            neg_prompt = gr.Textbox(label="Negative Prompt", value=DEFAULT_NEGATIVE_PROMPT)
+            scale = gr.Slider(
+                label="Guidance Scale (Classifier free guidance)", value=7.5, minimum=1, maximum=20, step=0.1)
+            n_samples = gr.Slider(label="Num samples", value=1, minimum=1, maximum=8, step=1)
+            seed = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
+            steps = gr.Slider(label="Steps", value=50, minimum=10, maximum=100, step=1)
+            resize_short_edge = gr.Slider(label="Image resolution", value=512, minimum=320, maximum=1024, step=1)
+            cond_tau = gr.Slider(
+                label="timestamp parameter that determines until which step the adapter is applied",
+                value=1.0,
+                minimum=0.1,
+                maximum=1.0,
+                step=0.05)
+
+    with gr.Row():
+        submit = gr.Button("Generate")
+    output = gr.Gallery().style(grid=2, height='auto')
+    cond = gr.Gallery().style(grid=2, height='auto')
+
+    inps = list(chain(btns, ims1, ims2, cond_weights))  # flat layout run() slices: radios, images, maps, weights
+
+    inps.extend([prompt, neg_prompt, scale, n_samples, seed, steps, resize_short_edge, cond_tau])  # run() reads these as args[-8:]
+    submit.click(fn=run, inputs=inps, outputs=[output, cond])
+demo.launch(server_name='0.0.0.0', share=False, server_port=47313)  # NOTE(review): no demo.queue() (the removed launch call queued) and a fixed port - confirm intended
diff --git a/models/faster_rcnn_r50_fpn_coco.py b/configs/mm/faster_rcnn_r50_fpn_coco.py
similarity index 96%
rename from models/faster_rcnn_r50_fpn_coco.py
rename to configs/mm/faster_rcnn_r50_fpn_coco.py
index 3010d61cea6b6a25c425cc893c82aee022424ec0..a9ad9528b22163ae7ce1390375b69227fd6eafd9 100644
--- a/models/faster_rcnn_r50_fpn_coco.py
+++ b/configs/mm/faster_rcnn_r50_fpn_coco.py
@@ -1,182 +1,182 @@
-checkpoint_config = dict(interval=1)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[
-        dict(type='TextLoggerHook'),
-        # dict(type='TensorboardLoggerHook')
-    ])
-# yapf:enable
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
-# optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-optimizer_config = dict(grad_clip=None)
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.001,
-    step=[8, 11])
-total_epochs = 12
-
-model = dict(
-    type='FasterRCNN',
-    pretrained='torchvision://resnet50',
-    backbone=dict(
-        type='ResNet',
-        depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=True),
-        norm_eval=True,
-        style='pytorch'),
-    neck=dict(
-        type='FPN',
-        in_channels=[256, 512, 1024, 2048],
-        out_channels=256,
-        num_outs=5),
-    rpn_head=dict(
-        type='RPNHead',
-        in_channels=256,
-        feat_channels=256,
-        anchor_generator=dict(
-            type='AnchorGenerator',
-            scales=[8],
-            ratios=[0.5, 1.0, 2.0],
-            strides=[4, 8, 16, 32, 64]),
-        bbox_coder=dict(
-            type='DeltaXYWHBBoxCoder',
-            target_means=[.0, .0, .0, .0],
-            target_stds=[1.0, 1.0, 1.0, 1.0]),
-        loss_cls=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
-        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
-    roi_head=dict(
-        type='StandardRoIHead',
-        bbox_roi_extractor=dict(
-            type='SingleRoIExtractor',
-            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
-            out_channels=256,
-            featmap_strides=[4, 8, 16, 32]),
-        bbox_head=dict(
-            type='Shared2FCBBoxHead',
-            in_channels=256,
-            fc_out_channels=1024,
-            roi_feat_size=7,
-            num_classes=80,
-            bbox_coder=dict(
-                type='DeltaXYWHBBoxCoder',
-                target_means=[0., 0., 0., 0.],
-                target_stds=[0.1, 0.1, 0.2, 0.2]),
-            reg_class_agnostic=False,
-            loss_cls=dict(
-                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
-            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
-    # model training and testing settings
-    train_cfg=dict(
-        rpn=dict(
-            assigner=dict(
-                type='MaxIoUAssigner',
-                pos_iou_thr=0.7,
-                neg_iou_thr=0.3,
-                min_pos_iou=0.3,
-                match_low_quality=True,
-                ignore_iof_thr=-1),
-            sampler=dict(
-                type='RandomSampler',
-                num=256,
-                pos_fraction=0.5,
-                neg_pos_ub=-1,
-                add_gt_as_proposals=False),
-            allowed_border=-1,
-            pos_weight=-1,
-            debug=False),
-        rpn_proposal=dict(
-            nms_pre=2000,
-            max_per_img=1000,
-            nms=dict(type='nms', iou_threshold=0.7),
-            min_bbox_size=0),
-        rcnn=dict(
-            assigner=dict(
-                type='MaxIoUAssigner',
-                pos_iou_thr=0.5,
-                neg_iou_thr=0.5,
-                min_pos_iou=0.5,
-                match_low_quality=False,
-                ignore_iof_thr=-1),
-            sampler=dict(
-                type='RandomSampler',
-                num=512,
-                pos_fraction=0.25,
-                neg_pos_ub=-1,
-                add_gt_as_proposals=True),
-            pos_weight=-1,
-            debug=False)),
-    test_cfg=dict(
-        rpn=dict(
-            nms_pre=1000,
-            max_per_img=1000,
-            nms=dict(type='nms', iou_threshold=0.7),
-            min_bbox_size=0),
-        rcnn=dict(
-            score_thr=0.05,
-            nms=dict(type='nms', iou_threshold=0.5),
-            max_per_img=100)
-        # soft-nms is also supported for rcnn testing
-        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
-    ))
-
-dataset_type = 'CocoDataset'
-data_root = 'data/coco'
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='LoadAnnotations', with_bbox=True),
-    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
-]
-test_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(
-        type='MultiScaleFlipAug',
-        img_scale=(1333, 800),
-        flip=False,
-        transforms=[
-            dict(type='Resize', keep_ratio=True),
-            dict(type='RandomFlip'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(type='DefaultFormatBundle'),
-            dict(type='Collect', keys=['img']),
-        ])
-]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_train2017.json',
-        img_prefix=f'{data_root}/train2017/',
-        pipeline=train_pipeline),
-    val=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        pipeline=test_pipeline),
-    test=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        pipeline=test_pipeline))
-evaluation = dict(interval=1, metric='bbox')
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+total_epochs = 12
+
+model = dict(
+    type='FasterRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/models/hrnet_w48_coco_256x192.py b/configs/mm/hrnet_w48_coco_256x192.py
similarity index 96%
rename from models/hrnet_w48_coco_256x192.py
rename to configs/mm/hrnet_w48_coco_256x192.py
index 898864bd6b328d69dcb492e1a42e1d3d6b1f2e8c..9755e6773cd3a8c0d2ac684c612d716cfd44b0ca 100644
--- a/models/hrnet_w48_coco_256x192.py
+++ b/configs/mm/hrnet_w48_coco_256x192.py
@@ -1,169 +1,169 @@
-# _base_ = [
-#     '../../../../_base_/default_runtime.py',
-#     '../../../../_base_/datasets/coco.py'
-# ]
-evaluation = dict(interval=10, metric='mAP', save_best='AP')
-
-optimizer = dict(
-    type='Adam',
-    lr=5e-4,
-)
-optimizer_config = dict(grad_clip=None)
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.001,
-    step=[170, 200])
-total_epochs = 210
-channel_cfg = dict(
-    num_output_channels=17,
-    dataset_joints=17,
-    dataset_channel=[
-        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
-    ],
-    inference_channel=[
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-    ])
-
-# model settings
-model = dict(
-    type='TopDown',
-    pretrained='https://download.openmmlab.com/mmpose/'
-    'pretrain_models/hrnet_w48-8ef0771d.pth',
-    backbone=dict(
-        type='HRNet',
-        in_channels=3,
-        extra=dict(
-            stage1=dict(
-                num_modules=1,
-                num_branches=1,
-                block='BOTTLENECK',
-                num_blocks=(4, ),
-                num_channels=(64, )),
-            stage2=dict(
-                num_modules=1,
-                num_branches=2,
-                block='BASIC',
-                num_blocks=(4, 4),
-                num_channels=(48, 96)),
-            stage3=dict(
-                num_modules=4,
-                num_branches=3,
-                block='BASIC',
-                num_blocks=(4, 4, 4),
-                num_channels=(48, 96, 192)),
-            stage4=dict(
-                num_modules=3,
-                num_branches=4,
-                block='BASIC',
-                num_blocks=(4, 4, 4, 4),
-                num_channels=(48, 96, 192, 384))),
-    ),
-    keypoint_head=dict(
-        type='TopdownHeatmapSimpleHead',
-        in_channels=48,
-        out_channels=channel_cfg['num_output_channels'],
-        num_deconv_layers=0,
-        extra=dict(final_conv_kernel=1, ),
-        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
-    train_cfg=dict(),
-    test_cfg=dict(
-        flip_test=True,
-        post_process='default',
-        shift_heatmap=True,
-        modulate_kernel=11))
-
-data_cfg = dict(
-    image_size=[192, 256],
-    heatmap_size=[48, 64],
-    num_output_channels=channel_cfg['num_output_channels'],
-    num_joints=channel_cfg['dataset_joints'],
-    dataset_channel=channel_cfg['dataset_channel'],
-    inference_channel=channel_cfg['inference_channel'],
-    soft_nms=False,
-    nms_thr=1.0,
-    oks_thr=0.9,
-    vis_thr=0.2,
-    use_gt_bbox=False,
-    det_bbox_thr=0.0,
-    bbox_file='data/coco/person_detection_results/'
-    'COCO_val2017_detections_AP_H_56_person.json',
-)
-
-train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='TopDownGetBboxCenterScale', padding=1.25),
-    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
-    dict(type='TopDownRandomFlip', flip_prob=0.5),
-    dict(
-        type='TopDownHalfBodyTransform',
-        num_joints_half_body=8,
-        prob_half_body=0.3),
-    dict(
-        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
-    dict(type='TopDownAffine'),
-    dict(type='ToTensor'),
-    dict(
-        type='NormalizeTensor',
-        mean=[0.485, 0.456, 0.406],
-        std=[0.229, 0.224, 0.225]),
-    dict(type='TopDownGenerateTarget', sigma=2),
-    dict(
-        type='Collect',
-        keys=['img', 'target', 'target_weight'],
-        meta_keys=[
-            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
-            'rotation', 'bbox_score', 'flip_pairs'
-        ]),
-]
-
-val_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='TopDownGetBboxCenterScale', padding=1.25),
-    dict(type='TopDownAffine'),
-    dict(type='ToTensor'),
-    dict(
-        type='NormalizeTensor',
-        mean=[0.485, 0.456, 0.406],
-        std=[0.229, 0.224, 0.225]),
-    dict(
-        type='Collect',
-        keys=['img'],
-        meta_keys=[
-            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
-            'flip_pairs'
-        ]),
-]
-
-test_pipeline = val_pipeline
-
-data_root = 'data/coco'
-data = dict(
-    samples_per_gpu=32,
-    workers_per_gpu=2,
-    val_dataloader=dict(samples_per_gpu=32),
-    test_dataloader=dict(samples_per_gpu=32),
-    train=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
-        img_prefix=f'{data_root}/train2017/',
-        data_cfg=data_cfg,
-        pipeline=train_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-    val=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        data_cfg=data_cfg,
-        pipeline=val_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-    test=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        data_cfg=data_cfg,
-        pipeline=test_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-)
+# _base_ = [
+#     '../../../../_base_/default_runtime.py',
+#     '../../../../_base_/datasets/coco.py'
+# ]
+evaluation = dict(interval=10, metric='mAP', save_best='AP')
+
+optimizer = dict(
+    type='Adam',
+    lr=5e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+channel_cfg = dict(
+    num_output_channels=17,
+    dataset_joints=17,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    ])
+
+# model settings
+model = dict(
+    type='TopDown',
+    pretrained='https://download.openmmlab.com/mmpose/'
+    'pretrain_models/hrnet_w48-8ef0771d.pth',
+    backbone=dict(
+        type='HRNet',
+        in_channels=3,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(48, 96)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(48, 96, 192)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(48, 96, 192, 384))),
+    ),
+    keypoint_head=dict(
+        type='TopdownHeatmapSimpleHead',
+        in_channels=48,
+        out_channels=channel_cfg['num_output_channels'],
+        num_deconv_layers=0,
+        extra=dict(final_conv_kernel=1, ),
+        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=True,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+
+data_cfg = dict(
+    image_size=[192, 256],
+    heatmap_size=[48, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'],
+    soft_nms=False,
+    nms_thr=1.0,
+    oks_thr=0.9,
+    vis_thr=0.2,
+    use_gt_bbox=False,
+    det_bbox_thr=0.0,
+    bbox_file='data/coco/person_detection_results/'
+    'COCO_val2017_detections_AP_H_56_person.json',
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
+    dict(type='TopDownRandomFlip', flip_prob=0.5),
+    dict(
+        type='TopDownHalfBodyTransform',
+        num_joints_half_body=8,
+        prob_half_body=0.3),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTarget', sigma=2),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs'
+        ]),
+]
+
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[
+            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs'
+        ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+    samples_per_gpu=32,
+    workers_per_gpu=2,
+    val_dataloader=dict(samples_per_gpu=32),
+    test_dataloader=dict(samples_per_gpu=32),
+    train=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        data_cfg=data_cfg,
+        pipeline=train_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    val=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=val_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    test=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=test_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+)
diff --git a/configs/stable-diffusion/sd-v1-inference.yaml b/configs/stable-diffusion/sd-v1-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dba409bc86df919bbaa687e1c85fefd641b963de
--- /dev/null
+++ b/configs/stable-diffusion/sd-v1-inference.yaml
@@ -0,0 +1,65 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.WebUIFrozenCLIPEmebedder
+      params:
+        version: openai/clip-vit-large-patch14
+        layer: last
diff --git a/configs/stable-diffusion/sd-v1-train.yaml b/configs/stable-diffusion/sd-v1-train.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3c22ae71c977c229d0bbf0d618a838196c601804
--- /dev/null
+++ b/configs/stable-diffusion/sd-v1-train.yaml
@@ -0,0 +1,86 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: !!float 1e4  # explicit tag, like lr above: YAML 1.1 loaders read a bare 1e4 (no dot) as a string
\ No newline at end of file
diff --git a/configs/stable-diffusion/train_keypose.yaml b/configs/stable-diffusion/train_keypose.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cd25843a0d854ee3a36807ed69b666a66ada16ab
--- /dev/null
+++ b/configs/stable-diffusion/train_keypose.yaml
@@ -0,0 +1,87 @@
+name: train_keypose
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: !!float 1e4  # explicit tag, like lr above: YAML 1.1 loaders read a bare 1e4 (no dot) as a string
\ No newline at end of file
diff --git a/configs/stable-diffusion/train_mask.yaml b/configs/stable-diffusion/train_mask.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ab298114683416d3687bfc9a2a0b24b51fb1e62
--- /dev/null
+++ b/configs/stable-diffusion/train_mask.yaml
@@ -0,0 +1,87 @@
+name: train_mask
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: !!float 1e4  # explicit tag, like lr above: YAML 1.1 loaders read a bare 1e4 (no dot) as a string
\ No newline at end of file
diff --git a/configs/stable-diffusion/train_sketch.yaml b/configs/stable-diffusion/train_sketch.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..90d44870ec68d327b2cf85a6dfae280bd397a825
--- /dev/null
+++ b/configs/stable-diffusion/train_sketch.yaml
@@ -0,0 +1,87 @@
+name: train_sketch
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: !!float 1e4  # explicit tag, like lr above: YAML 1.1 loaders read a bare 1e4 (no dot) as a string
\ No newline at end of file
diff --git a/demo/demos.py b/demo/demos.py
deleted file mode 100755
index fb35178cb48b0561f2ddbdba6d5111718f36605a..0000000000000000000000000000000000000000
--- a/demo/demos.py
+++ /dev/null
@@ -1,309 +0,0 @@
-import gradio as gr
-import numpy as np
-import psutil
-
-def create_map():
-    return np.zeros(shape=(512, 512), dtype=np.uint8)+255
-
-def get_system_memory():
-    memory = psutil.virtual_memory()
-    memory_percent = memory.percent
-    memory_used = memory.used / (1024.0 ** 3)
-    memory_total = memory.total / (1024.0 ** 3)
-    return {"percent": f"{memory_percent}%", "used": f"{memory_used:.3f}GB", "total": f"{memory_total:.3f}GB"}
-
-
-
-def create_demo_keypose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Keypose)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Keypose', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a keypose map)')
-                    fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed to produce a fixed output)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the keypose to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-        ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_openpose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Openpose)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Openpose', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a openpose map)')
-                    fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed to produce a fixed output)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the openpose to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-        ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Sketch)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_canny(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Canny)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Canny', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a canny map)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the canny background\n (Only work for canny input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the canny to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_color_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Color + Sketch)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_sketch = gr.Image(source='upload', type="numpy", label='Sketch guidance')
-                    input_img_color = gr.Image(source='upload', type="numpy", label='Color guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                type_in_color = gr.inputs.Radio(['ColorMap', 'Image'], type="value", default='Image', label='Input Types of Color\n (You can input an image or a color map)')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types of Sketch\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                with gr.Row():
-                    w_sketch = gr.Slider(label="Sketch guidance weight", minimum=0, maximum=2, value=1.0, step=0.1)
-                    w_color = gr.Slider(label="Color guidance weight", minimum=0, maximum=2, value=1.2, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=3, height='auto')
-            ips = [input_img_sketch, input_img_color, type_in, type_in_color, w_sketch, w_color, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_style_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Style + Sketch)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_sketch = gr.Image(source='upload', type="numpy", label='Sketch guidance')
-                    input_img_style = gr.Image(source='upload', type="numpy", label='Style guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types of Sketch\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img_sketch, input_img_style, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_color(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Color)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy", label='Color guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                type_in_color = gr.inputs.Radio(['ColorMap', 'Image'], type="value", default='Image', label='Input Types of Color\n (You can input an image or a color map)')
-                w_color = gr.Slider(label="Color guidance weight", minimum=0, maximum=2, value=1, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, prompt, neg_prompt, pos_prompt, w_color, type_in_color, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_seg(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Segmentation)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Segmentation', 'Image'], type="value", default='Image', label='You can input an image or a segmentation. If you choose to input a segmentation, it must correspond to the coco-stuff')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the segmentation to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_depth(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Depth)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Depth', 'Image'], type="value", default='Image', label='You can input an image or a depth map')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the depth map to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_depth_keypose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Depth & Keypose)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_depth = gr.Image(source='upload', type="numpy", label='Depth guidance')
-                    input_img_keypose = gr.Image(source='upload', type="numpy", label='Keypose guidance')
-
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in_depth = gr.inputs.Radio(['Depth', 'Image'], type="value", default='Image', label='You can input an image or a depth map')
-                    type_in_keypose = gr.inputs.Radio(['Keypose', 'Image'], type="value", default='Image', label='You can input an image or a keypose map (mmpose style)')
-                with gr.Row():
-                    w_depth = gr.Slider(label="Depth guidance weight", minimum=0, maximum=2, value=1.0, step=0.1)
-                    w_keypose = gr.Slider(label="Keypose guidance weight", minimum=0, maximum=2, value=1.5, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the multi-guidance to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=3, height='auto')
-            ips = [input_img_depth, input_img_keypose, type_in_depth, type_in_keypose, w_depth, w_keypose, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-
-def create_demo_draw(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Hand-free drawing)')
-        with gr.Row():
-            with gr.Column():
-                create_button = gr.Button(label="Start", value='Hand-free drawing')
-                input_img = gr.Image(source='upload', type="numpy",tool='sketch')
-                create_button.click(fn=create_map, outputs=[input_img], queue=False)
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
\ No newline at end of file
diff --git a/demo/model.py b/demo/model.py
deleted file mode 100755
index 275bf80081a96f2377c04285ee7ff40689e4d3c4..0000000000000000000000000000000000000000
--- a/demo/model.py
+++ /dev/null
@@ -1,979 +0,0 @@
-import torch
-from basicsr.utils import img2tensor, tensor2img
-from pytorch_lightning import seed_everything
-from ldm.models.diffusion.plms import PLMSSampler
-from ldm.modules.encoders.adapter import Adapter, Adapter_light, StyleAdapter
-from ldm.util import instantiate_from_config
-from ldm.modules.structure_condition.model_edge import pidinet
-from ldm.modules.structure_condition.model_seg import seger, Colorize
-from ldm.modules.structure_condition.midas.api import MiDaSInference
-import gradio as gr
-from omegaconf import OmegaConf
-import mmcv
-from mmdet.apis import inference_detector, init_detector
-from mmpose.apis import (inference_top_down_pose_model, init_pose_model, process_mmdet_results, vis_pose_result)
-import os
-import cv2
-import numpy as np
-import torch.nn.functional as F
-from transformers import CLIPProcessor, CLIPVisionModel
-from PIL import Image
-
-
-def preprocessing(image, device):
-    # Resize
-    scale = 640 / max(image.shape[:2])
-    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
-    raw_image = image.astype(np.uint8)
-
-    # Subtract mean values
-    image = image.astype(np.float32)
-    image -= np.array(
-        [
-            float(104.008),
-            float(116.669),
-            float(122.675),
-        ]
-    )
-
-    # Convert to torch.Tensor and add "batch" axis
-    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
-    image = image.to(device)
-
-    return image, raw_image
-
-
-def imshow_keypoints(img,
-                     pose_result,
-                     skeleton=None,
-                     kpt_score_thr=0.1,
-                     pose_kpt_color=None,
-                     pose_link_color=None,
-                     radius=4,
-                     thickness=1):
-    """Draw keypoints and links on an image.
-
-    Args:
-            img (ndarry): The image to draw poses on.
-            pose_result (list[kpts]): The poses to draw. Each element kpts is
-                a set of K keypoints as an Kx3 numpy.ndarray, where each
-                keypoint is represented as x, y, score.
-            kpt_score_thr (float, optional): Minimum score of keypoints
-                to be shown. Default: 0.3.
-            pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None,
-                the keypoint will not be drawn.
-            pose_link_color (np.array[Mx3]): Color of M links. If None, the
-                links will not be drawn.
-            thickness (int): Thickness of lines.
-    """
-
-    img_h, img_w, _ = img.shape
-    img = np.zeros(img.shape)
-
-    for idx, kpts in enumerate(pose_result):
-        if idx > 1:
-            continue
-        kpts = kpts['keypoints']
-        kpts = np.array(kpts, copy=False)
-
-        # draw each point on image
-        if pose_kpt_color is not None:
-            assert len(pose_kpt_color) == len(kpts)
-
-            for kid, kpt in enumerate(kpts):
-                x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2]
-
-                if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None:
-                    # skip the point that should not be drawn
-                    continue
-
-                color = tuple(int(c) for c in pose_kpt_color[kid])
-                cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1)
-
-        # draw links
-        if skeleton is not None and pose_link_color is not None:
-            assert len(pose_link_color) == len(skeleton)
-
-            for sk_id, sk in enumerate(skeleton):
-                pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1]))
-                pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1]))
-
-                if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0
-                        or pos2[0] >= img_w or pos2[1] <= 0 or pos2[1] >= img_h or kpts[sk[0], 2] < kpt_score_thr
-                        or kpts[sk[1], 2] < kpt_score_thr or pose_link_color[sk_id] is None):
-                    # skip the link that should not be drawn
-                    continue
-                color = tuple(int(c) for c in pose_link_color[sk_id])
-                cv2.line(img, pos1, pos2, color, thickness=thickness)
-
-    return img
-
-
-def load_model_from_config(config, ckpt, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    if "state_dict" in pl_sd:
-        sd = pl_sd["state_dict"]
-    else:
-        sd = pl_sd
-    model = instantiate_from_config(config.model)
-    _, _ = model.load_state_dict(sd, strict=False)
-
-    model.cuda()
-    model.eval()
-    return model
-
-
-class Model_all:
-    def __init__(self, device='cpu'):
-        # common part
-        self.device = device
-        self.config = OmegaConf.load("configs/stable-diffusion/app.yaml")
-        self.config.model.params.cond_stage_config.params.device = device
-        self.base_model = load_model_from_config(self.config, "models/sd-v1-4.ckpt").to(device)
-        self.current_base = 'sd-v1-4.ckpt'
-        self.sampler = PLMSSampler(self.base_model)
-
-        # sketch part
-        self.model_canny = Adapter(channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                    use_conv=False).to(device)
-        self.model_canny.load_state_dict(torch.load("models/t2iadapter_canny_sd14v1.pth", map_location=device))
-        self.model_sketch = Adapter(channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                    use_conv=False).to(device)
-        self.model_sketch.load_state_dict(torch.load("models/t2iadapter_sketch_sd14v1.pth", map_location=device))
-        self.model_edge = pidinet().to(device)
-        self.model_edge.load_state_dict({k.replace('module.', ''): v for k, v in
-                                         torch.load('models/table5_pidinet.pth', map_location=device)[
-                                             'state_dict'].items()})
-
-        # segmentation part
-        self.model_seger = seger().to(device)
-        self.model_seger.eval()
-        self.coler = Colorize(n=182)
-        self.model_seg = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                 use_conv=False).to(device)
-        self.model_seg.load_state_dict(torch.load("models/t2iadapter_seg_sd14v1.pth", map_location=device))
-
-        # depth part
-        self.depth_model = MiDaSInference(model_type='dpt_hybrid').to(device)
-        self.model_depth = Adapter(cin=3 * 64, channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                   use_conv=False).to(device)
-        self.model_depth.load_state_dict(torch.load("models/t2iadapter_depth_sd14v1.pth", map_location=device))
-
-        # keypose part
-        self.model_pose = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                  use_conv=False).to(device)
-        self.model_pose.load_state_dict(torch.load("models/t2iadapter_keypose_sd14v1.pth", map_location=device))
-
-        # openpose part
-        self.model_openpose = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                  use_conv=False).to(device)
-        self.model_openpose.load_state_dict(torch.load("models/t2iadapter_openpose_sd14v1.pth", map_location=device))
-
-        # color part
-        self.model_color = Adapter_light(cin=int(3 * 64), channels=[320, 640, 1280, 1280], nums_rb=4).to(device)
-        self.model_color.load_state_dict(torch.load("models/t2iadapter_color_sd14v1.pth", map_location=device))
-
-        # style part
-        self.model_style = StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8).to(device)
-        self.model_style.load_state_dict(torch.load("models/t2iadapter_style_sd14v1.pth", map_location=device))
-        self.clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
-        self.clip_vision_model = CLIPVisionModel.from_pretrained('openai/clip-vit-large-patch14').to(device)
-
-        device = 'cpu'
-        ## mmpose
-        det_config = 'models/faster_rcnn_r50_fpn_coco.py'
-        det_checkpoint = 'models/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
-        pose_config = 'models/hrnet_w48_coco_256x192.py'
-        pose_checkpoint = 'models/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
-        self.det_cat_id = 1
-        self.bbox_thr = 0.2
-        ## detector
-        det_config_mmcv = mmcv.Config.fromfile(det_config)
-        self.det_model = init_detector(det_config_mmcv, det_checkpoint, device=device)
-        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
-        self.pose_model = init_pose_model(pose_config_mmcv, pose_checkpoint, device=device)
-        ## color
-        self.skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8],
-                         [7, 9], [8, 10],
-                         [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]
-        self.pose_kpt_color = [[51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255],
-                               [0, 255, 0],
-                               [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0],
-                               [255, 128, 0],
-                               [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0]]
-        self.pose_link_color = [[0, 255, 0], [0, 255, 0], [255, 128, 0], [255, 128, 0],
-                                [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0],
-                                [255, 128, 0],
-                                [0, 255, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255],
-                                [51, 153, 255],
-                                [51, 153, 255], [51, 153, 255], [51, 153, 255]]
-
-    def load_vae(self):
-        vae_sd = torch.load(os.path.join('models', 'anything-v4.0.vae.pt'), map_location="cuda")
-        sd = vae_sd["state_dict"]
-        self.base_model.first_stage_model.load_state_dict(sd, strict=False)
-
-    @torch.no_grad()
-    def process_sketch(self, input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                       con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_edge, x_samples_ddim]
-    
-    @torch.no_grad()
-    def process_canny(self, input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                       con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Canny':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-        elif type_in == 'Image':
-            im = cv2.Canny(im,100,200)
-            im = img2tensor(im[..., None], bgr2rgb=True, float32=True).unsqueeze(0) / 255.
-            im_edge = tensor2img(im)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_canny(im.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_edge, x_samples_ddim]
-    
-    @torch.no_grad()
-    def process_color_sketch(self, input_img_sketch, input_img_color, type_in, type_in_color, w_sketch, w_color, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img_sketch, (512, 512))
-
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]#.cuda()
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-        if type_in_color == 'Image':
-            input_img_color = cv2.resize(input_img_color,(512//64, 512//64), interpolation=cv2.INTER_CUBIC)  
-            input_img_color = cv2.resize(input_img_color,(512,512), interpolation=cv2.INTER_NEAREST)
-        else:
-            input_img_color = cv2.resize(input_img_color, (512, 512))
-        im_color = input_img_color.copy()
-        im_color_tensor = img2tensor(input_img_color, bgr2rgb=False).unsqueeze(0) / 255.
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter_sketch = self.model_sketch(im.to(self.device))
-        features_adapter_color = self.model_color(im_color_tensor.to(self.device))
-        features_adapter = [fs*w_sketch+fc*w_color for fs, fc in zip(features_adapter_sketch,features_adapter_color)]
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_edge, im_color, x_samples_ddim]
-    
-    @torch.no_grad()
-    def process_style_sketch(self, input_img_sketch, input_img_style, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img_sketch, (512, 512))
-
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]#.cuda()
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-        
-        style = Image.fromarray(input_img_style)
-        style_for_clip = self.clip_processor(images=style, return_tensors="pt")['pixel_values']
-        style_feat = self.clip_vision_model(style_for_clip.to(self.device))['last_hidden_state']
-        style_feat = self.model_style(style_feat)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='style',
-                                              con_strength=con_strength,
-                                              style_feature=style_feat)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_edge, x_samples_ddim]
-
-    @torch.no_grad()
-    def process_color(self, input_img, prompt, neg_prompt, pos_prompt, w_color, type_in_color, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        if type_in_color == 'Image':
-            input_img = cv2.resize(input_img,(512//64, 512//64), interpolation=cv2.INTER_CUBIC)  
-            input_img = cv2.resize(input_img,(512,512), interpolation=cv2.INTER_NEAREST)
-        else:
-            input_img = cv2.resize(input_img, (512, 512))
-
-        im_color = input_img.copy()
-        im = img2tensor(input_img, bgr2rgb=False).unsqueeze(0) / 255.
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_color(im.to(self.device))
-        features_adapter = [fi*w_color for fi in features_adapter]
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_color, x_samples_ddim]
-    
-    @torch.no_grad()
-    def process_depth(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                      con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Depth':
-            im_depth = im.copy()
-            depth = img2tensor(im).unsqueeze(0) / 255.
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 127.5 - 1.0
-            depth = self.depth_model(im.to(self.device)).repeat(1, 3, 1, 1)
-            depth -= torch.min(depth)
-            depth /= torch.max(depth)
-            im_depth = tensor2img(depth)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_depth(depth.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_depth, x_samples_ddim]
-
-    @torch.no_grad()
-    def process_depth_keypose(self, input_img_depth, input_img_keypose, type_in_depth, type_in_keypose, w_depth,
-                              w_keypose, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        if fix_sample == 'True':
-            seed_everything(42)
-        im_depth = cv2.resize(input_img_depth, (512, 512))
-        im_keypose = cv2.resize(input_img_keypose, (512, 512))
-
-        # get depth 
-        if type_in_depth == 'Depth':
-            im_depth_out = im_depth.copy()
-            depth = img2tensor(im_depth).unsqueeze(0) / 255.
-        elif type_in_depth == 'Image':
-            im_depth = img2tensor(im_depth).unsqueeze(0) / 127.5 - 1.0
-            depth = self.depth_model(im_depth.to(self.device)).repeat(1, 3, 1, 1)
-            depth -= torch.min(depth)
-            depth /= torch.max(depth)
-            im_depth_out = tensor2img(depth)
-
-        # get keypose
-        if type_in_keypose == 'Keypose':
-            im_keypose_out = im_keypose.copy()[:,:,::-1]
-        elif type_in_keypose == 'Image':
-            image = im_keypose.copy()
-            im_keypose = img2tensor(im_keypose).unsqueeze(0) / 255.
-            mmdet_results = inference_detector(self.det_model, image)
-            # keep the person class bounding boxes.
-            person_results = process_mmdet_results(mmdet_results, self.det_cat_id)
-
-            # optional
-            return_heatmap = False
-            dataset = self.pose_model.cfg.data['test']['type']
-
-            # e.g. use ('backbone', ) to return backbone feature
-            output_layer_names = None
-            pose_results, _ = inference_top_down_pose_model(
-                self.pose_model,
-                image,
-                person_results,
-                bbox_thr=self.bbox_thr,
-                format='xyxy',
-                dataset=dataset,
-                dataset_info=None,
-                return_heatmap=return_heatmap,
-                outputs=output_layer_names)
-
-            # show the results
-            im_keypose_out = imshow_keypoints(
-                image,
-                pose_results,
-                skeleton=self.skeleton,
-                pose_kpt_color=self.pose_kpt_color,
-                pose_link_color=self.pose_link_color,
-                radius=2,
-                thickness=2)
-            im_keypose_out = im_keypose_out.astype(np.uint8)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter_depth = self.model_depth(depth.to(self.device))
-        pose = img2tensor(im_keypose_out, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter_keypose = self.model_pose(pose.to(self.device))
-        features_adapter = [f_d * w_depth + f_k * w_keypose for f_d, f_k in
-                            zip(features_adapter_depth, features_adapter_keypose)]
-        shape = [4, 64, 64]
-
-        # sampling
-        con_strength = int((1 - con_strength) * 50)
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_depth_out, im_keypose_out[:, :, ::-1], x_samples_ddim]
-
-    @torch.no_grad()
-    def process_seg(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                    con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Segmentation':
-            im_seg = im.copy()
-            im = img2tensor(im).unsqueeze(0) / 255.
-            labelmap = im.float()
-        elif type_in == 'Image':
-            im, _ = preprocessing(im, self.device)
-            _, _, H, W = im.shape
-
-            # Image -> Probability map
-            logits = self.model_seger(im)
-            logits = F.interpolate(logits, size=(H, W), mode="bilinear", align_corners=False)
-            probs = F.softmax(logits, dim=1)[0]
-            probs = probs.cpu().data.numpy()
-            labelmap = np.argmax(probs, axis=0)
-
-            labelmap = self.coler(labelmap)
-            labelmap = np.transpose(labelmap, (1, 2, 0))
-            labelmap = cv2.resize(labelmap, (512, 512))
-            labelmap = img2tensor(labelmap, bgr2rgb=False, float32=True) / 255.
-            im_seg = tensor2img(labelmap)[:, :, ::-1]
-            labelmap = labelmap.unsqueeze(0)
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_seg(labelmap.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_seg, x_samples_ddim]
-
-    @torch.no_grad()
-    def process_draw(self, input_img, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        input_img = input_img['mask']
-        c = input_img[:, :, 0:3].astype(np.float32)
-        a = input_img[:, :, 3:4].astype(np.float32) / 255.0
-        im = c * a + 255.0 * (1.0 - a)
-        im = im.clip(0, 255).astype(np.uint8)
-        im = cv2.resize(im, (512, 512))
-
-        im_edge = im.copy()
-        im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-        im = im > 0.5
-        im = im.float()
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_edge, x_samples_ddim]
-
-    @torch.no_grad()
-    def process_keypose(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength,
-                        base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Keypose':
-            im_pose = im.copy()[:,:,::-1]
-        elif type_in == 'Image':
-            image = im.copy()
-            im = img2tensor(im).unsqueeze(0) / 255.
-            mmdet_results = inference_detector(self.det_model, image)
-            # keep the person class bounding boxes.
-            person_results = process_mmdet_results(mmdet_results, self.det_cat_id)
-
-            # optional
-            return_heatmap = False
-            dataset = self.pose_model.cfg.data['test']['type']
-
-            # e.g. use ('backbone', ) to return backbone feature
-            output_layer_names = None
-            pose_results, _ = inference_top_down_pose_model(
-                self.pose_model,
-                image,
-                person_results,
-                bbox_thr=self.bbox_thr,
-                format='xyxy',
-                dataset=dataset,
-                dataset_info=None,
-                return_heatmap=return_heatmap,
-                outputs=output_layer_names)
-
-            # show the results
-            im_pose = imshow_keypoints(
-                image,
-                pose_results,
-                skeleton=self.skeleton,
-                pose_kpt_color=self.pose_kpt_color,
-                pose_link_color=self.pose_link_color,
-                radius=2,
-                thickness=2)
-        # im_pose = cv2.resize(im_pose, (512, 512))
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        pose = img2tensor(im_pose, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter = self.model_pose(pose.to(self.device))
-
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_pose[:, :, ::-1].astype(np.uint8), x_samples_ddim]
-    
-    @torch.no_grad()
-    def process_openpose(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength,
-                        base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-
-        if type_in == 'Openpose':
-            im_pose = im.copy()[:,:,::-1]
-        elif type_in == 'Image':
-            from ldm.modules.structure_condition.openpose.api import OpenposeInference
-            model = OpenposeInference()
-            keypose = model(im[:,:,::-1])
-            im_pose = keypose.copy()
-
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        pose = img2tensor(im_pose, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter = self.model_openpose(pose.to(self.device))
-
-        shape = [4, 64, 64]
-
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-
-        return [im_pose[:, :, ::-1].astype(np.uint8), x_samples_ddim]
-
-
-if __name__ == '__main__':
-    model = Model_all('cpu')
\ No newline at end of file
diff --git a/dist_util.py b/dist_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..47441a48932a86d5556b1167ef327aa3b1ec8173
--- /dev/null
+++ b/dist_util.py
@@ -0,0 +1,91 @@
+# Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py  # noqa: E501
+import functools
+import os
+import subprocess
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+
+
+def init_dist(launcher, backend='nccl', **kwargs):
+    """Default the start method to 'spawn', then init the backend for `launcher`."""
+    if mp.get_start_method(allow_none=True) is None:
+        mp.set_start_method('spawn')
+    if launcher == 'pytorch':
+        return _init_dist_pytorch(backend, **kwargs)
+    if launcher == 'slurm':
+        return _init_dist_slurm(backend, **kwargs)
+    raise ValueError(f'Invalid launcher type: {launcher}')
+
+
+def _init_dist_pytorch(backend, **kwargs):
+    """Select CUDA device ``RANK % device_count`` and init the process group."""
+    global_rank = int(os.environ['RANK'])
+    torch.cuda.set_device(global_rank % torch.cuda.device_count())
+    dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_slurm(backend, port=None):
+    """Set up ``torch.distributed`` from the environment of a SLURM job.
+
+    The master port is ``port`` when given; otherwise an already exported
+    ``MASTER_PORT`` is kept, and ``29500`` is used when neither is available.
+    The master address is the first line printed by ``scontrol show hostname``
+    for ``SLURM_NODELIST``.
+
+    Args:
+        backend (str): Backend of torch.distributed.
+        port (int, optional): Master port. Defaults to None.
+    """
+    rank = int(os.environ['SLURM_PROCID'])
+    world_size = int(os.environ['SLURM_NTASKS'])
+    nodes = os.environ['SLURM_NODELIST']
+    num_devices = torch.cuda.device_count()
+    local_rank = rank % num_devices
+    torch.cuda.set_device(local_rank)
+    master_addr = subprocess.getoutput(f'scontrol show hostname {nodes} | head -n1')
+    if port is not None:
+        os.environ['MASTER_PORT'] = str(port)
+    else:
+        # keep an exported MASTER_PORT; 29500 is the torch.distributed default
+        os.environ.setdefault('MASTER_PORT', '29500')
+    os.environ.update(
+        MASTER_ADDR=master_addr,
+        WORLD_SIZE=str(world_size),
+        LOCAL_RANK=str(local_rank),
+        RANK=str(rank))
+    dist.init_process_group(backend=backend)
+
+
+def get_dist_info():
+    """Return ``(rank, world_size)`` of the current process.
+
+    Returns:
+        tuple[int, int]: Values reported by ``torch.distributed`` when it is
+            available and initialized, otherwise ``(0, 1)``.
+    """
+    # `and` short-circuits, so is_initialized() is only queried when the
+    # distributed package is available.
+    if not (dist.is_available() and dist.is_initialized()):
+        return 0, 1
+    return dist.get_rank(), dist.get_world_size()
+
+
+def master_only(func):
+    """Decorate `func` to run only where `get_dist_info` reports rank 0."""
+    @functools.wraps(func)
+    def rank_zero_call(*args, **kwargs):
+        # every other rank skips the call and gets None back
+        if get_dist_info()[0] != 0:
+            return None
+        return func(*args, **kwargs)
+    return rank_zero_call
+
+def get_bare_model(net):
+    """Return the module wrapped by DataParallel / DistributedDataParallel.
+
+    Anything that is not one of those two wrappers is returned unchanged.
+    """
+    wrapped = isinstance(net, (DataParallel, DistributedDataParallel))
+    return net.module if wrapped else net
diff --git a/docs/AdapterZoo.md b/docs/AdapterZoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..ffdf9a9c4588367796f463a575cccdddf65ab513
--- /dev/null
+++ b/docs/AdapterZoo.md
@@ -0,0 +1,16 @@
+# Adapter Zoo
+
+You can download the adapters from <https://huggingface.co/TencentARC/T2I-Adapter/tree/main>
+
+All the following adapters are trained with Stable Diffusion (SD) V1.4, and they can be used directly on custom models as long as those models are fine-tuned from the same text-to-image model, such as Anything-4.0 or the models on <https://civitai.com/>.
+
+| Adapter Name  | Adapter Description | Demos|Model Parameters|  Model Storage | |
+| --- | --- |--- |--- |--- |---|
+| t2iadapter_color_sd14v1.pth | Spatial color palette → image | [Demos](examples.md#color-adapter-spatial-palette) |18 M | 75 MB | |
+| t2iadapter_style_sd14v1.pth | Image style → image | [Demos](examples.md#style-adapter)|| 154MB |  Preliminary model. Style adapters with finer controls are on the way|
+| t2iadapter_openpose_sd14v1.pth | Openpose → image| [Demos](examples.md#openpose-adapter) |77 M| 309 MB | |
+| t2iadapter_canny_sd14v1.pth | Canny edges → image | [Demos](examples.md#canny-adapter-edge )|77 M | 309 MB ||
+| t2iadapter_sketch_sd14v1.pth | sketch → image ||77 M| 308 MB | |
+| t2iadapter_keypose_sd14v1.pth | keypose → image || 77 M| 309 MB | mmpose style |
+| t2iadapter_seg_sd14v1.pth | segmentation → image ||77 M| 309 MB ||
+| t2iadapter_depth_sd14v1.pth | depth maps → image ||77 M | 309 MB | Not the final model, still under training|
diff --git a/docs/FAQ.md b/docs/FAQ.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b34bb16e54c63afaee471d54405afc0164b601f
--- /dev/null
+++ b/docs/FAQ.md
@@ -0,0 +1,5 @@
+# FAQ
+
+- **Q: The openpose adapter (t2iadapter_openpose_sd14v1) outputs gray-scale images.**
+
+    **A:** You can add `colorful` to the prompt to avoid this problem.
diff --git a/docs/examples.md b/docs/examples.md
new file mode 100644
index 0000000000000000000000000000000000000000..4e422ee622b7a6e2042776df3944b255368cdb49
--- /dev/null
+++ b/docs/examples.md
@@ -0,0 +1,41 @@
+# Demos
+
+## Style Adapter
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222734169-d47789e8-e83c-48c2-80ef-a896c2bafbb0.png" height=450>
+</p>
+
+## Color Adapter (Spatial Palette)
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222915829-ccfb0366-13a8-484a-9561-627fabd87d29.png" height=450>
+</p>
+
+## Openpose Adapter
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222733916-dc26a66e-d786-4407-8889-b81804862b1a.png" height=450>
+</p>
+
+## Canny Adapter (Edge)
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222915813-c8f264bd-1be6-4496-97ff-aec4f6b53788.png" height=450>
+</p>
+
+## Multi-adapters
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/220939329-379f88b7-444f-4a3a-9de0-8f90605d1d34.png" height=450>
+</p>
+
+<div align="center">
+
+*T2I adapters naturally support using multiple adapters together.*
+
+</div><br />
+The testing script usage for this example is similar to the command line given below, except that we replaced the pretrained SD model with Anything 4.5 and Kenshi
+
+>python test_composable_adapters.py --prompt "1girl, computer desk, best quality, extremely detailed" --neg_prompt "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality" --depth_cond_path examples/depth/desk_depth.png --depth_cond_weight 1.0 --depth_ckpt models/t2iadapter_depth_sd14v1.pth --depth_type_in depth --pose_cond_path examples/keypose/person_keypose.png --pose_cond_weight 1.5 --ckpt models/anything-v4.0-pruned.ckpt --n_sample 4 --max_resolution 524288
+
+[Image source](https://twitter.com/toyxyz3/status/1628375164781211648)
diff --git a/environment.yaml b/environment.yaml
deleted file mode 100755
index 025ced87018b87c8e23a80fd77cde85e4715d897..0000000000000000000000000000000000000000
--- a/environment.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: ldm
-channels:
-  - pytorch
-  - defaults
-dependencies:
-  - python=3.8.5
-  - pip=20.3
-  - cudatoolkit=11.3
-  - pytorch=1.11.0
-  - torchvision=0.12.0
-  - numpy=1.19.2
-  - pip:
-    - albumentations==0.4.3
-    - diffusers
-    - opencv-python==4.1.2.30
-    - pudb==2019.2
-    - invisible-watermark
-    - imageio==2.9.0
-    - imageio-ffmpeg==0.4.2
-    - pytorch-lightning==1.4.2
-    - omegaconf==2.1.1
-    - test-tube>=0.7.5
-    - streamlit>=0.73.1
-    - einops==0.3.0
-    - torch-fidelity==0.3.0
-    - transformers==4.19.2
-    - torchmetrics==0.6.0
-    - kornia==0.6
-    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
-    - -e git+https://github.com/openai/CLIP.git@main#egg=clip
-    - -e .
diff --git a/ldm/modules/structure_condition/midas/__init__.py b/experiments/README.md
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/__init__.py
rename to experiments/README.md
diff --git a/ldm/data/base.py b/ldm/data/base.py
deleted file mode 100755
index b196c2f7aa583a3e8bc4aad9f943df0c4dae0da7..0000000000000000000000000000000000000000
--- a/ldm/data/base.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from abc import abstractmethod
-from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
-
-
-class Txt2ImgIterableBaseDataset(IterableDataset):
-    '''
-    Define an interface to make the IterableDatasets for text2img data chainable
-    '''
-    def __init__(self, num_records=0, valid_ids=None, size=256):
-        super().__init__()
-        self.num_records = num_records
-        self.valid_ids = valid_ids
-        self.sample_ids = valid_ids
-        self.size = size
-
-        print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
-
-    def __len__(self):
-        return self.num_records
-
-    @abstractmethod
-    def __iter__(self):
-        pass
\ No newline at end of file
diff --git a/ldm/data/dataset_coco.py b/ldm/data/dataset_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b4aa4facb12be8534522c9240ca6e63ce4a68b5
--- /dev/null
+++ b/ldm/data/dataset_coco.py
@@ -0,0 +1,36 @@
+import json
+import cv2
+import os
+from basicsr.utils import img2tensor
+
+
+class dataset_coco_mask_color():
+    """COCO caption entries, each served with its image and mask resized to 512x512."""
+
+    def __init__(self, path_json, root_path_im, root_path_mask, image_size):
+        # NOTE(review): image_size is accepted but not used; sizes are fixed at 512.
+        super(dataset_coco_mask_color, self).__init__()
+        with open(path_json, 'r', encoding='utf-8') as fp:
+            annotations = json.load(fp)['annotations']
+        self.root_path_im = root_path_im
+        self.root_path_mask = root_path_mask
+        # one entry per caption; the mask file name is the zero-padded image id
+        self.files = [{'name': "%012d.png" % ann['image_id'], 'sentence': ann['caption']}
+                      for ann in annotations]
+
+    def __getitem__(self, idx):
+        entry = self.files[idx]
+        mask_name = entry['name']
+        image_name = mask_name.replace('.png', '.jpg')
+        # both files go through the same read -> resize -> tensor / 255 steps
+        tensors = []
+        for path in (os.path.join(self.root_path_im, image_name),
+                     os.path.join(self.root_path_mask, mask_name)):
+            pixels = cv2.resize(cv2.imread(path), (512, 512))
+            tensors.append(img2tensor(pixels, bgr2rgb=True, float32=True) / 255.)
+        im, mask = tensors
+        return {'im': im, 'mask': mask, 'sentence': entry['sentence']}
+
+    def __len__(self):
+        """Number of caption entries."""
+        return len(self.files)
diff --git a/ldm/data/dataset_depth.py b/ldm/data/dataset_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3afe28da237c62795625574b89b60072da79cd2
--- /dev/null
+++ b/ldm/data/dataset_depth.py
@@ -0,0 +1,35 @@
+import json
+import cv2
+import os
+from basicsr.utils import img2tensor
+
+
+class DepthDataset():
+    """Image / depth-map / caption triplets listed in a meta file.
+
+    Each non-empty line of `meta_file` is an image path; the depth map and the
+    caption are read from `<stem>.depth.png` and `<stem>.txt` next to it.
+    """
+
+    def __init__(self, meta_file):
+        super(DepthDataset, self).__init__()
+        self.files = []
+        with open(meta_file, 'r') as f:
+            for line in f:
+                img_path = line.strip()
+                if not img_path:
+                    continue  # a blank line would yield an unreadable entry
+                stem = img_path.rsplit('.', 1)[0]
+                self.files.append({'img_path': img_path, 'depth_img_path': stem + '.depth.png',
+                                   'txt_path': stem + '.txt'})
+
+    def __getitem__(self, idx):
+        file = self.files[idx]
+        im = img2tensor(cv2.imread(file['img_path']), bgr2rgb=True, float32=True) / 255.
+        depth = img2tensor(cv2.imread(file['depth_img_path']), bgr2rgb=True, float32=True) / 255.
+        with open(file['txt_path'], 'r') as fs:
+            sentence = fs.readline().strip()  # only the first line is used as caption
+        return {'im': im, 'depth': depth, 'sentence': sentence}
+
+    def __len__(self):
+        return len(self.files)
diff --git a/ldm/data/dataset_laion.py b/ldm/data/dataset_laion.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1807b1d87e27e09656daf6e7144bd5fba6adce
--- /dev/null
+++ b/ldm/data/dataset_laion.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import os
+import pytorch_lightning as pl
+import torch
+import webdataset as wds
+from torchvision.transforms import transforms
+
+from ldm.util import instantiate_from_config
+
+
+def dict_collation_fn(samples, combine_tensors=True, combine_scalars=True):
+    """Collate a list of sample dicts into one batch dict.
+
+    Only keys present in every sample are kept, and a key is dropped from the
+    batch when the combine flag for its value type is False.
+    :param list samples: list of samples (dicts)
+    :param bool combine_tensors: stack torch tensors / merge ndarrays per key
+    :param bool combine_scalars: turn int / float values into one ndarray per key
+    :returns: single sample consisting of a batch; other value types stay lists
+    :rtype: dict
+    """
+    keys = set.intersection(*[set(sample.keys()) for sample in samples])
+    batched = {key: [sample[key] for sample in samples] for key in keys}
+
+    result = {}
+    for key, values in batched.items():
+        first = values[0]  # the first sample decides how the key is collated
+        if isinstance(first, (int, float)):
+            if combine_scalars:
+                result[key] = np.array(values)
+        elif isinstance(first, torch.Tensor):
+            if combine_tensors:
+                result[key] = torch.stack(values)
+        elif isinstance(first, np.ndarray):
+            if combine_tensors:
+                result[key] = np.array(values)
+        else:
+            result[key] = list(values)
+    return result
+
+
+class WebDataModuleFromConfig(pl.LightningDataModule):
+    """Lightning data module that builds a WebDataset training loader from a config."""
+    def __init__(self,
+                 tar_base,
+                 batch_size,
+                 train=None,
+                 validation=None,
+                 test=None,
+                 num_workers=4,
+                 multinode=True,
+                 min_size=None,
+                 max_pwatermark=1.0,
+                 **kwargs):  # extra keyword arguments are accepted and ignored
+        super().__init__()
+        print(f'Setting tar base to {tar_base}')
+        self.tar_base = tar_base
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.train = train
+        self.validation = validation  # stored only; val_dataloader returns None
+        self.test = test  # stored only; test_dataloader returns None
+        self.multinode = multinode
+        self.min_size = min_size  # filter out very small images
+        self.max_pwatermark = max_pwatermark  # filter out watermarked images
+
+    def make_loader(self, dataset_config):  # build the WebLoader for one dataset config
+        image_transforms = [instantiate_from_config(tt) for tt in dataset_config.image_transforms]
+        image_transforms = transforms.Compose(image_transforms)
+
+        process = instantiate_from_config(dataset_config['process'])
+
+        shuffle = dataset_config.get('shuffle', 0)
+        shardshuffle = shuffle > 0  # shards are shuffled only when 'shuffle' is positive
+
+        nodesplitter = wds.shardlists.split_by_node if self.multinode else wds.shardlists.single_node_only
+
+        tars = os.path.join(self.tar_base, dataset_config.shards)
+
+        dset = wds.WebDataset(
+            tars, nodesplitter=nodesplitter, shardshuffle=shardshuffle,
+            handler=wds.warn_and_continue).repeat().shuffle(shuffle)
+        print(f'Loading webdataset with {len(dset.pipeline[0].urls)} shards.')
+        # keep jpg+txt samples -> decode to PIL -> size/watermark filter -> transform jpg -> process
+        dset = (
+            dset.select(self.filter_keys).decode('pil',
+                                                 handler=wds.warn_and_continue).select(self.filter_size).map_dict(
+                                                     jpg=image_transforms, handler=wds.warn_and_continue).map(process))
+        dset = (dset.batched(self.batch_size, partial=False, collation_fn=dict_collation_fn))
+
+        loader = wds.WebLoader(dset, batch_size=None, shuffle=False, num_workers=self.num_workers)  # dset is already batched
+
+        return loader
+
+    def filter_size(self, x):  # True keeps the sample, False drops it
+        if self.min_size is None:  # NOTE(review): this early return also skips the pwatermark check
+            return True
+        try:
+            return x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size and x[
+                'json']['pwatermark'] <= self.max_pwatermark
+        except Exception:  # any error while reading the metadata drops the sample
+            return False
+
+    def filter_keys(self, x):  # keep only samples that carry both a 'jpg' and a 'txt' entry
+        try:
+            return ("jpg" in x) and ("txt" in x)
+        except Exception:  # any error during the membership test drops the sample
+            return False
+
+    def train_dataloader(self):  # NOTE(review): self.train defaults to None; make_loader needs a config
+        return self.make_loader(self.train)
+
+    def val_dataloader(self):  # no validation loader is provided
+        return None
+
+    def test_dataloader(self):  # no test loader is provided
+        return None
+
+
+if __name__ == '__main__':  # manual smoke test: build the training loader and print batch info
+    from omegaconf import OmegaConf
+    config = OmegaConf.load("configs/stable-diffusion/train_canny_sd_v1.yaml")
+    datamod = WebDataModuleFromConfig(**config["data"]["params"])
+    dataloader = datamod.train_dataloader()
+    # NOTE(review): make_loader builds the dataset with .repeat(); presumably this loop runs until interrupted
+    for batch in dataloader:
+        print(batch.keys())
+        print(batch['jpg'].shape)
diff --git a/ldm/data/dataset_wikiart.py b/ldm/data/dataset_wikiart.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7a2de87ccbba147580fed82e3c5e5a5ab38761e
--- /dev/null
+++ b/ldm/data/dataset_wikiart.py
@@ -0,0 +1,67 @@
+import json
+import os.path
+
+from PIL import Image
+from torch.utils.data import DataLoader
+
+from transformers import CLIPProcessor
+from torchvision.transforms import transforms
+
+import pytorch_lightning as pl
+
+
+class WikiArtDataset():
+    """WikiArt images with a caption parsed from each file name and CLIP-preprocessed pixels for style."""
+
+    def __init__(self, meta_file):
+        super(WikiArtDataset, self).__init__()
+        self.files = []
+        with open(meta_file, 'r') as f:
+            js = json.load(f)
+            for img_path in js:
+                img_name = os.path.splitext(os.path.basename(img_path))[0]
+                caption = img_name.split('_')[-1].split('-')  # words of the part after the last '_'
+                j = len(caption) - 1
+                while j >= 0:  # walk back over trailing all-digit tokens
+                    if not caption[j].isdigit():
+                        break
+                    j -= 1
+                if j < 0:  # only digits: no usable caption, skip this image
+                    continue
+                sentence = ' '.join(caption[:j + 1])
+                self.files.append({'img_path': os.path.join('datasets/wikiart', img_path), 'sentence': sentence})
+
+        version = 'openai/clip-vit-large-patch14'
+        self.processor = CLIPProcessor.from_pretrained(version)
+
+        self.jpg_transform = transforms.Compose([
+            transforms.Resize(512),
+            transforms.RandomCrop(512),
+            transforms.ToTensor(),
+        ])
+
+    def __getitem__(self, idx):
+        """Return {'jpg': cropped image tensor, 'style': CLIP pixel values, 'txt': caption} for item idx."""
+        file = self.files[idx]
+        # Force RGB: grayscale/RGBA/palette files would otherwise yield tensors whose channel count differs
+        # from the rest of the batch, which the default DataLoader collate cannot stack.
+        with Image.open(file['img_path']) as img:
+            im = img.convert('RGB')
+        im_tensor = self.jpg_transform(im)
+        clip_im = self.processor(images=im, return_tensors="pt")['pixel_values'][0]
+        return {'jpg': im_tensor, 'style': clip_im, 'txt': file['sentence']}
+
+    def __len__(self):
+        return len(self.files)
+
+
+class WikiArtDataModule(pl.LightningDataModule):  # Lightning data module exposing WikiArtDataset for training
+    def __init__(self, meta_file, batch_size, num_workers):
+        super(WikiArtDataModule, self).__init__()
+        self.train_dataset = WikiArtDataset(meta_file)  # built eagerly, at construction time
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+    def train_dataloader(self):  # shuffled loader over train_dataset; no val/test loader is defined in this class
+        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers,
+                          pin_memory=True)
diff --git a/ldm/data/imagenet.py b/ldm/data/imagenet.py
deleted file mode 100755
index 1c473f9c6965b22315dbb289eff8247c71bdc790..0000000000000000000000000000000000000000
--- a/ldm/data/imagenet.py
+++ /dev/null
@@ -1,394 +0,0 @@
-import os, yaml, pickle, shutil, tarfile, glob
-import cv2
-import albumentations
-import PIL
-import numpy as np
-import torchvision.transforms.functional as TF
-from omegaconf import OmegaConf
-from functools import partial
-from PIL import Image
-from tqdm import tqdm
-from torch.utils.data import Dataset, Subset
-
-import taming.data.utils as tdu
-from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
-from taming.data.imagenet import ImagePaths
-
-from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
-
-
-def synset2idx(path_to_yaml="data/index_synset.yaml"):
-    with open(path_to_yaml) as f:
-        di2s = yaml.load(f)
-    return dict((v,k) for k,v in di2s.items())
-
-
-class ImageNetBase(Dataset):
-    def __init__(self, config=None):
-        self.config = config or OmegaConf.create()
-        if not type(self.config)==dict:
-            self.config = OmegaConf.to_container(self.config)
-        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
-        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
-        self._prepare()
-        self._prepare_synset_to_human()
-        self._prepare_idx_to_synset()
-        self._prepare_human_to_integer_label()
-        self._load()
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, i):
-        return self.data[i]
-
-    def _prepare(self):
-        raise NotImplementedError()
-
-    def _filter_relpaths(self, relpaths):
-        ignore = set([
-            "n06596364_9591.JPEG",
-        ])
-        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
-        if "sub_indices" in self.config:
-            indices = str_to_indices(self.config["sub_indices"])
-            synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn)  # returns a list of strings
-            self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
-            files = []
-            for rpath in relpaths:
-                syn = rpath.split("/")[0]
-                if syn in synsets:
-                    files.append(rpath)
-            return files
-        else:
-            return relpaths
-
-    def _prepare_synset_to_human(self):
-        SIZE = 2655750
-        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
-        self.human_dict = os.path.join(self.root, "synset_human.txt")
-        if (not os.path.exists(self.human_dict) or
-                not os.path.getsize(self.human_dict)==SIZE):
-            download(URL, self.human_dict)
-
-    def _prepare_idx_to_synset(self):
-        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
-        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
-        if (not os.path.exists(self.idx2syn)):
-            download(URL, self.idx2syn)
-
-    def _prepare_human_to_integer_label(self):
-        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
-        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
-        if (not os.path.exists(self.human2integer)):
-            download(URL, self.human2integer)
-        with open(self.human2integer, "r") as f:
-            lines = f.read().splitlines()
-            assert len(lines) == 1000
-            self.human2integer_dict = dict()
-            for line in lines:
-                value, key = line.split(":")
-                self.human2integer_dict[key] = int(value)
-
-    def _load(self):
-        with open(self.txt_filelist, "r") as f:
-            self.relpaths = f.read().splitlines()
-            l1 = len(self.relpaths)
-            self.relpaths = self._filter_relpaths(self.relpaths)
-            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
-
-        self.synsets = [p.split("/")[0] for p in self.relpaths]
-        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
-
-        unique_synsets = np.unique(self.synsets)
-        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
-        if not self.keep_orig_class_label:
-            self.class_labels = [class_dict[s] for s in self.synsets]
-        else:
-            self.class_labels = [self.synset2idx[s] for s in self.synsets]
-
-        with open(self.human_dict, "r") as f:
-            human_dict = f.read().splitlines()
-            human_dict = dict(line.split(maxsplit=1) for line in human_dict)
-
-        self.human_labels = [human_dict[s] for s in self.synsets]
-
-        labels = {
-            "relpath": np.array(self.relpaths),
-            "synsets": np.array(self.synsets),
-            "class_label": np.array(self.class_labels),
-            "human_label": np.array(self.human_labels),
-        }
-
-        if self.process_images:
-            self.size = retrieve(self.config, "size", default=256)
-            self.data = ImagePaths(self.abspaths,
-                                   labels=labels,
-                                   size=self.size,
-                                   random_crop=self.random_crop,
-                                   )
-        else:
-            self.data = self.abspaths
-
-
-class ImageNetTrain(ImageNetBase):
-    NAME = "ILSVRC2012_train"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
-    FILES = [
-        "ILSVRC2012_img_train.tar",
-    ]
-    SIZES = [
-        147897477120,
-    ]
-
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.process_images = process_images
-        self.data_root = data_root
-        super().__init__(**kwargs)
-
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 1281167
-        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
-                                    default=True)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-
-                print("Extracting sub-tars.")
-                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
-                for subpath in tqdm(subpaths):
-                    subdir = subpath[:-len(".tar")]
-                    os.makedirs(subdir, exist_ok=True)
-                    with tarfile.open(subpath, "r:") as tar:
-                        tar.extractall(path=subdir)
-
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-
-            tdu.mark_prepared(self.root)
-
-
-class ImageNetValidation(ImageNetBase):
-    NAME = "ILSVRC2012_validation"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
-    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
-    FILES = [
-        "ILSVRC2012_img_val.tar",
-        "validation_synset.txt",
-    ]
-    SIZES = [
-        6744924160,
-        1950000,
-    ]
-
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.data_root = data_root
-        self.process_images = process_images
-        super().__init__(**kwargs)
-
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 50000
-        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
-                                    default=False)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-
-                vspath = os.path.join(self.root, self.FILES[1])
-                if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
-                    download(self.VS_URL, vspath)
-
-                with open(vspath, "r") as f:
-                    synset_dict = f.read().splitlines()
-                    synset_dict = dict(line.split() for line in synset_dict)
-
-                print("Reorganizing into synset folders")
-                synsets = np.unique(list(synset_dict.values()))
-                for s in synsets:
-                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
-                for k, v in synset_dict.items():
-                    src = os.path.join(datadir, k)
-                    dst = os.path.join(datadir, v)
-                    shutil.move(src, dst)
-
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-
-            tdu.mark_prepared(self.root)
-
-
-
-class ImageNetSR(Dataset):
-    def __init__(self, size=None,
-                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
-                 random_crop=True):
-        """
-        Imagenet Superresolution Dataloader
-        Performs following ops in order:
-        1.  crops a crop of size s from image either as random or center crop
-        2.  resizes crop to size with cv2.area_interpolation
-        3.  degrades resized crop with degradation_fn
-
-        :param size: resizing to size after cropping
-        :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
-        :param downscale_f: Low Resolution Downsample factor
-        :param min_crop_f: determines crop size s,
-          where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
-        :param max_crop_f: ""
-        :param data_root:
-        :param random_crop:
-        """
-        self.base = self.get_base()
-        assert size
-        assert (size / downscale_f).is_integer()
-        self.size = size
-        self.LR_size = int(size / downscale_f)
-        self.min_crop_f = min_crop_f
-        self.max_crop_f = max_crop_f
-        assert(max_crop_f <= 1.)
-        self.center_crop = not random_crop
-
-        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
-
-        self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
-
-        if degradation == "bsrgan":
-            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
-
-        elif degradation == "bsrgan_light":
-            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
-
-        else:
-            interpolation_fn = {
-            "cv_nearest": cv2.INTER_NEAREST,
-            "cv_bilinear": cv2.INTER_LINEAR,
-            "cv_bicubic": cv2.INTER_CUBIC,
-            "cv_area": cv2.INTER_AREA,
-            "cv_lanczos": cv2.INTER_LANCZOS4,
-            "pil_nearest": PIL.Image.NEAREST,
-            "pil_bilinear": PIL.Image.BILINEAR,
-            "pil_bicubic": PIL.Image.BICUBIC,
-            "pil_box": PIL.Image.BOX,
-            "pil_hamming": PIL.Image.HAMMING,
-            "pil_lanczos": PIL.Image.LANCZOS,
-            }[degradation]
-
-            self.pil_interpolation = degradation.startswith("pil_")
-
-            if self.pil_interpolation:
-                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
-
-            else:
-                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
-                                                                          interpolation=interpolation_fn)
-
-    def __len__(self):
-        return len(self.base)
-
-    def __getitem__(self, i):
-        example = self.base[i]
-        image = Image.open(example["file_path_"])
-
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        image = np.array(image).astype(np.uint8)
-
-        min_side_len = min(image.shape[:2])
-        crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
-        crop_side_len = int(crop_side_len)
-
-        if self.center_crop:
-            self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
-
-        else:
-            self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
-
-        image = self.cropper(image=image)["image"]
-        image = self.image_rescaler(image=image)["image"]
-
-        if self.pil_interpolation:
-            image_pil = PIL.Image.fromarray(image)
-            LR_image = self.degradation_process(image_pil)
-            LR_image = np.array(LR_image).astype(np.uint8)
-
-        else:
-            LR_image = self.degradation_process(image=image)["image"]
-
-        example["image"] = (image/127.5 - 1.0).astype(np.float32)
-        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
-
-        return example
-
-
-class ImageNetSRTrain(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def get_base(self):
-        with open("data/imagenet_train_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetTrain(process_images=False,)
-        return Subset(dset, indices)
-
-
-class ImageNetSRValidation(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def get_base(self):
-        with open("data/imagenet_val_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetValidation(process_images=False,)
-        return Subset(dset, indices)
diff --git a/ldm/data/lsun.py b/ldm/data/lsun.py
deleted file mode 100755
index 6256e45715ff0b57c53f985594d27cbbbff0e68e..0000000000000000000000000000000000000000
--- a/ldm/data/lsun.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import numpy as np
-import PIL
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-
-
-class LSUNBase(Dataset):
-    def __init__(self,
-                 txt_file,
-                 data_root,
-                 size=None,
-                 interpolation="bicubic",
-                 flip_p=0.5
-                 ):
-        self.data_paths = txt_file
-        self.data_root = data_root
-        with open(self.data_paths, "r") as f:
-            self.image_paths = f.read().splitlines()
-        self._length = len(self.image_paths)
-        self.labels = {
-            "relative_file_path_": [l for l in self.image_paths],
-            "file_path_": [os.path.join(self.data_root, l)
-                           for l in self.image_paths],
-        }
-
-        self.size = size
-        self.interpolation = {"linear": PIL.Image.LINEAR,
-                              "bilinear": PIL.Image.BILINEAR,
-                              "bicubic": PIL.Image.BICUBIC,
-                              "lanczos": PIL.Image.LANCZOS,
-                              }[interpolation]
-        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, i):
-        example = dict((k, self.labels[k][i]) for k in self.labels)
-        image = Image.open(example["file_path_"])
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-        crop = min(img.shape[0], img.shape[1])
-        h, w, = img.shape[0], img.shape[1]
-        img = img[(h - crop) // 2:(h + crop) // 2,
-              (w - crop) // 2:(w + crop) // 2]
-
-        image = Image.fromarray(img)
-        if self.size is not None:
-            image = image.resize((self.size, self.size), resample=self.interpolation)
-
-        image = self.flip(image)
-        image = np.array(image).astype(np.uint8)
-        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
-        return example
-
-
-class LSUNChurchesTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
-
-
-class LSUNChurchesValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
-                         flip_p=flip_p, **kwargs)
-
-
-class LSUNBedroomsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
-
-
-class LSUNBedroomsValidation(LSUNBase):
-    def __init__(self, flip_p=0.0, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
-                         flip_p=flip_p, **kwargs)
-
-
-class LSUNCatsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
-
-
-class LSUNCatsValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
-                         flip_p=flip_p, **kwargs)
diff --git a/ldm/data/utils.py b/ldm/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ece8c92b4aca12d6c65908900460cc4beaf522e
--- /dev/null
+++ b/ldm/data/utils.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+import cv2
+import numpy as np
+from torchvision.transforms import transforms
+from torchvision.transforms.functional import to_tensor
+from transformers import CLIPProcessor
+
+from basicsr.utils import img2tensor
+
+
+class AddCannyFreezeThreshold(object):  # sample transform: adds a Canny edge map computed with fixed thresholds
+
+    def __init__(self, low_threshold=100, high_threshold=200):
+        self.low_threshold = low_threshold  # passed to cv2.Canny as its first threshold
+        self.high_threshold = high_threshold  # passed to cv2.Canny as its second threshold
+
+    def __call__(self, sample):
+        # sample['jpg'] is PIL image; it is replaced by its tensor form and sample['canny'] is added
+        x = sample['jpg']
+        img = cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR)
+        canny = cv2.Canny(img, self.low_threshold, self.high_threshold)[..., None]  # [..., None] adds a trailing axis
+        sample['canny'] = img2tensor(canny, bgr2rgb=True, float32=True) / 255.
+        sample['jpg'] = to_tensor(x)
+        return sample  # the same dict object, mutated in place
+
+
+class AddStyle(object):  # sample transform: adds CLIP-preprocessed pixel values under sample['style']
+
+    def __init__(self, version):
+        self.processor = CLIPProcessor.from_pretrained(version)  # version: pretrained name/path for the processor
+        self.pil_to_tensor = transforms.ToTensor()  # NOTE(review): not used in this class; __call__ calls to_tensor()
+
+    def __call__(self, sample):
+        # sample['jpg'] is PIL image; it is replaced by its tensor form after the style input is computed
+        x = sample['jpg']
+        style = self.processor(images=x, return_tensors="pt")['pixel_values'][0]  # [0]: entry for the one image passed
+        sample['style'] = style
+        sample['jpg'] = to_tensor(x)
+        return sample
diff --git a/ldm/inference_base.py b/ldm/inference_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7b62e852b4b52881e06ff66d478185b3a928396
--- /dev/null
+++ b/ldm/inference_base.py
@@ -0,0 +1,282 @@
+import argparse
+import torch
+from omegaconf import OmegaConf
+
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.modules.encoders.adapter import Adapter, StyleAdapter, Adapter_light
+from ldm.modules.extra_condition.api import ExtraCondition
+from ldm.util import fix_cond_shapes, load_model_from_config, read_state_dict
+
+DEFAULT_NEGATIVE_PROMPT = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                          'fewer digits, cropped, worst quality, low quality'
+
+
+def get_base_argument_parser() -> argparse.ArgumentParser:
+    """Build and return the ArgumentParser holding the options shared by the inference scripts."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--outdir',
+        type=str,
+        help='dir to write results to',
+        default=None,
+    )
+
+    parser.add_argument(
+        '--prompt',
+        type=str,
+        nargs='?',  # the flag may also be given without a value; argparse then stores const (None here)
+        default=None,
+        help='positive prompt',
+    )
+
+    parser.add_argument(
+        '--neg_prompt',
+        type=str,
+        default=DEFAULT_NEGATIVE_PROMPT,
+        help='negative prompt',
+    )
+
+    parser.add_argument(
+        '--cond_path',
+        type=str,
+        default=None,
+        help='condition image path',
+    )
+
+    parser.add_argument(
+        '--cond_inp_type',
+        type=str,
+        default='image',
+        help='the type of the input condition image, take depth T2I as example, the input can be raw image, '
+        'which depth will be calculated, or the input can be a directly a depth map image',
+    )
+
+    parser.add_argument(
+        '--sampler',
+        type=str,
+        default='ddim',
+        choices=['ddim', 'plms'],
+        help='sampling algorithm, currently, only ddim and plms are supported, more are on the way',
+    )
+
+    parser.add_argument(
+        '--steps',
+        type=int,
+        default=50,
+        help='number of sampling steps',
+    )
+
+    parser.add_argument(
+        '--sd_ckpt',
+        type=str,
+        default='models/sd-v1-4.ckpt',
+        help='path to checkpoint of stable diffusion model, both .ckpt and .safetensor are supported',
+    )
+
+    parser.add_argument(
+        '--vae_ckpt',
+        type=str,
+        default=None,
+        help='vae checkpoint, anime SD models usually have seperate vae ckpt that need to be loaded',
+    )
+
+    parser.add_argument(
+        '--adapter_ckpt',
+        type=str,
+        default=None,
+        help='path to checkpoint of adapter',
+    )
+
+    parser.add_argument(
+        '--config',
+        type=str,
+        default='configs/stable-diffusion/sd-v1-inference.yaml',
+        help='path to config which constructs SD model',
+    )
+
+    parser.add_argument(
+        '--max_resolution',
+        type=float,
+        default=512 * 512,
+        help='max image height * width, only for computer with limited vram',
+    )
+
+    parser.add_argument(
+        '--resize_short_edge',
+        type=int,
+        default=None,
+        help='resize short edge of the input image, if this arg is set, max_resolution will not be used',
+    )
+
+    parser.add_argument(
+        '--C',
+        type=int,
+        default=4,
+        help='latent channels',
+    )
+
+    parser.add_argument(
+        '--f',
+        type=int,
+        default=8,
+        help='downsampling factor',
+    )
+
+    parser.add_argument(
+        '--scale',
+        type=float,
+        default=7.5,
+        help='unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))',
+    )
+
+    parser.add_argument(
+        '--cond_tau',
+        type=float,
+        default=1.0,
+        help='timestamp parameter that determines until which step the adapter is applied, '
+        'similar as Prompt-to-Prompt tau')
+
+    parser.add_argument(
+        '--cond_weight',
+        type=float,
+        default=1.0,
+        help='the adapter features are multiplied by the cond_weight. The larger the cond_weight, the more aligned '
+        'the generated image and condition will be, but the generated quality may be reduced',
+    )
+
+    parser.add_argument(
+        '--seed',
+        type=int,
+        default=42,
+    )
+
+    parser.add_argument(
+        '--n_samples',
+        type=int,
+        default=4,
+        help='# of samples to generate',
+    )
+
+    return parser
+
+
+def get_sd_models(opt):
+    """Build the stable diffusion model on ``opt.device`` and the sampler named by ``opt.sampler``.
+
+    Returns ``(sd_model, sampler)``; raises NotImplementedError for a sampler other than 'plms' or 'ddim'.
+    """
+    # SD
+    base_model = load_model_from_config(OmegaConf.load(f"{opt.config}"), opt.sd_ckpt, opt.vae_ckpt)
+    sd_model = base_model.to(opt.device)
+
+    # sampler
+    if opt.sampler == 'plms':
+        sampler_cls = PLMSSampler
+    elif opt.sampler == 'ddim':
+        sampler_cls = DDIMSampler
+    else:
+        raise NotImplementedError
+
+    return sd_model, sampler_cls(base_model)
+
+
+def get_t2i_adapter_models(opt):
+    """Build the SD model with the adapter checkpoint merged in, plus the sampler named by ``opt.sampler``.
+
+    Adapter keys that lack the ``adapter.`` prefix get it added before a non-strict ``load_state_dict``.
+    """
+    sd_config = OmegaConf.load(f"{opt.config}")
+    model = load_model_from_config(sd_config, opt.sd_ckpt, opt.vae_ckpt)
+
+    # a per-condition ``<which_cond>_adapter_ckpt`` option takes precedence over the generic ``adapter_ckpt``
+    adapter_ckpt_path = getattr(opt, f'{opt.which_cond}_adapter_ckpt', None)
+    if adapter_ckpt_path is None:
+        adapter_ckpt_path = getattr(opt, 'adapter_ckpt')
+    prefixed_weights = {
+        (key if key.startswith('adapter.') else f'adapter.{key}'): weight
+        for key, weight in read_state_dict(adapter_ckpt_path).items()
+    }
+    _, unexpected = model.load_state_dict(prefixed_weights, strict=False)
+    if len(unexpected) > 0:
+        print(f"unexpected keys in loading adapter ckpt {adapter_ckpt_path}:")
+        print(unexpected)
+
+    model = model.to(opt.device)
+
+    # sampler
+    if opt.sampler == 'plms':
+        return model, PLMSSampler(model)
+    if opt.sampler == 'ddim':
+        return model, DDIMSampler(model)
+    raise NotImplementedError
+
+
+def get_cond_ch(cond_type: ExtraCondition):
+    """Return 1 for the sketch and canny conditions and 3 for every other condition type."""
+    single_channel = (ExtraCondition.sketch, ExtraCondition.canny)
+    return 1 if cond_type in single_channel else 3
+
+
+def get_adapters(opt, cond_type: ExtraCondition):
+    """Build the adapter for ``cond_type`` on ``opt.device`` and load its checkpoint.
+
+    Per-condition ``opt.<cond>_weight`` / ``opt.<cond>_adapter_ckpt`` take precedence over the generic
+    ``opt.cond_weight`` / ``opt.adapter_ckpt``. Returns ``{'cond_weight': ..., 'model': ...}``.
+    """
+    cond_weight = getattr(opt, f'{cond_type.name}_weight', None)
+    if cond_weight is None:
+        cond_weight = getattr(opt, 'cond_weight')
+
+    if cond_type == ExtraCondition.style:
+        model = StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8)
+    elif cond_type == ExtraCondition.color:
+        model = Adapter_light(cin=64 * get_cond_ch(cond_type), channels=[320, 640, 1280, 1280], nums_rb=4)
+    else:
+        model = Adapter(
+            cin=64 * get_cond_ch(cond_type), channels=[320, 640, 1280, 1280], nums_rb=2, ksize=1, sk=True,
+            use_conv=False)
+    model = model.to(opt.device)
+
+    ckpt_path = getattr(opt, f'{cond_type.name}_adapter_ckpt', None)
+    if ckpt_path is None:
+        ckpt_path = getattr(opt, 'adapter_ckpt')
+    # map_location='cpu': a checkpoint saved from a GPU must still load on a CPU-only host;
+    # load_state_dict then copies the tensors onto whatever device the module already lives on.
+    model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+
+    return {'cond_weight': cond_weight, 'model': model}
+
+
+def diffusion_inference(opt, model, sampler, adapter_features, append_to_context=None):
+    """Sample one image for ``opt.prompt`` under the given adapter features and return it decoded in [0, 1]."""
+    # text embeddings; the negative prompt is encoded only when opt.scale differs from 1.0
+    cond = model.get_learned_conditioning([opt.prompt])
+    uncond = model.get_learned_conditioning([opt.neg_prompt]) if opt.scale != 1.0 else None
+    cond, uncond = fix_cond_shapes(model, cond, uncond)
+
+    # default to 512x512 when the caller has not put a height on opt
+    if not hasattr(opt, 'H'):
+        opt.H = 512
+        opt.W = 512
+    latent_shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+
+    sample_kwargs = dict(
+        S=opt.steps,
+        conditioning=cond,
+        batch_size=1,
+        shape=latent_shape,
+        verbose=False,
+        unconditional_guidance_scale=opt.scale,
+        unconditional_conditioning=uncond,
+        x_T=None,
+        features_adapter=adapter_features,
+        append_to_context=append_to_context,
+        cond_tau=opt.cond_tau,
+    )
+    latents, _ = sampler.sample(**sample_kwargs)
+
+    # rescale the decoded output with (x + 1) / 2 and clamp it into [0, 1]
+    decoded = model.decode_first_stage(latents)
+
+    return torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
diff --git a/ldm/models/autoencoder.py b/ldm/models/autoencoder.py
index 6a9c4f45498561953b8085981609b2a3298a5473..e3ff5fe3ed0f70de8b31f1af27e107b93fbb94ca 100755
--- a/ldm/models/autoencoder.py
+++ b/ldm/models/autoencoder.py
@@ -1,64 +1,65 @@
 import torch
 import pytorch_lightning as pl
 import torch.nn.functional as F
+import torch.nn as nn
 from contextlib import contextmanager
 
-from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
-
 from ldm.modules.diffusionmodules.model import Encoder, Decoder
 from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
 
 from ldm.util import instantiate_from_config
+from ldm.modules.ema import LitEma
 
 
-class VQModel(pl.LightningModule):
+class AutoencoderKL(pl.LightningModule):
     def __init__(self,
                  ddconfig,
                  lossconfig,
-                 n_embed,
                  embed_dim,
                  ckpt_path=None,
                  ignore_keys=[],
                  image_key="image",
                  colorize_nlabels=None,
                  monitor=None,
-                 batch_resize_range=None,
-                 scheduler_config=None,
-                 lr_g_factor=1.0,
-                 remap=None,
-                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
-                 use_ema=False
+                 ema_decay=None,
+                 learn_logvar=False
                  ):
         super().__init__()
-        self.embed_dim = embed_dim
-        self.n_embed = n_embed
+        self.learn_logvar = learn_logvar
         self.image_key = image_key
         self.encoder = Encoder(**ddconfig)
         self.decoder = Decoder(**ddconfig)
         self.loss = instantiate_from_config(lossconfig)
-        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
-                                        remap=remap,
-                                        sane_index_shape=sane_index_shape)
-        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
         if colorize_nlabels is not None:
             assert type(colorize_nlabels)==int
             self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
         if monitor is not None:
             self.monitor = monitor
-        self.batch_resize_range = batch_resize_range
-        if self.batch_resize_range is not None:
-            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
 
-        self.use_ema = use_ema
+        self.use_ema = ema_decay is not None
         if self.use_ema:
-            self.model_ema = LitEma(self)
+            self.ema_decay = ema_decay
+            assert 0. < ema_decay < 1.
+            self.model_ema = LitEma(self, decay=ema_decay)
             print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
 
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-        self.scheduler_config = scheduler_config
-        self.lr_g_factor = lr_g_factor
+
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        """Load the "state_dict" from `path` non-strictly, skipping keys prefixed by `ignore_keys`."""
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        prefixes = tuple(ignore_keys)
+        # test all prefixes at once: a key matched by several prefixes is deleted once, not twice (KeyError)
+        for k in [k for k in sd if k.startswith(prefixes)]:
+            print("Deleting key {} from state_dict.".format(k))
+            del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
 
     @contextmanager
     def ema_scope(self, context=None):
@@ -75,252 +76,10 @@ class VQModel(pl.LightningModule):
                 if context is not None:
                     print(f"{context}: Restored training weights")
 
-    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-            print(f"Unexpected Keys: {unexpected}")
-
     def on_train_batch_end(self, *args, **kwargs):
         if self.use_ema:
             self.model_ema(self)
 
-    def encode(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        quant, emb_loss, info = self.quantize(h)
-        return quant, emb_loss, info
-
-    def encode_to_prequant(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-
-    def decode(self, quant):
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-
-    def decode_code(self, code_b):
-        quant_b = self.quantize.embed_code(code_b)
-        dec = self.decode(quant_b)
-        return dec
-
-    def forward(self, input, return_pred_indices=False):
-        quant, diff, (_,_,ind) = self.encode(input)
-        dec = self.decode(quant)
-        if return_pred_indices:
-            return dec, diff, ind
-        return dec, diff
-
-    def get_input(self, batch, k):
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
-        if self.batch_resize_range is not None:
-            lower_size = self.batch_resize_range[0]
-            upper_size = self.batch_resize_range[1]
-            if self.global_step <= 4:
-                # do the first few batches with max size to avoid later oom
-                new_resize = upper_size
-            else:
-                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
-            if new_resize != x.shape[2]:
-                x = F.interpolate(x, size=new_resize, mode="bicubic")
-            x = x.detach()
-        return x
-
-    def training_step(self, batch, batch_idx, optimizer_idx):
-        # https://github.com/pytorch/pytorch/issues/37142
-        # try not to fool the heuristics
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
-
-        if optimizer_idx == 0:
-            # autoencode
-            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train",
-                                            predicted_indices=ind)
-
-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return aeloss
-
-        if optimizer_idx == 1:
-            # discriminator
-            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return discloss
-
-    def validation_step(self, batch, batch_idx):
-        log_dict = self._validation_step(batch, batch_idx)
-        with self.ema_scope():
-            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
-        return log_dict
-
-    def _validation_step(self, batch, batch_idx, suffix=""):
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
-        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
-                                        self.global_step,
-                                        last_layer=self.get_last_layer(),
-                                        split="val"+suffix,
-                                        predicted_indices=ind
-                                        )
-
-        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
-                                            self.global_step,
-                                            last_layer=self.get_last_layer(),
-                                            split="val"+suffix,
-                                            predicted_indices=ind
-                                            )
-        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log(f"val{suffix}/rec_loss", rec_loss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        self.log(f"val{suffix}/aeloss", aeloss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            del log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log_dict(log_dict_ae)
-        self.log_dict(log_dict_disc)
-        return self.log_dict
-
-    def configure_optimizers(self):
-        lr_d = self.learning_rate
-        lr_g = self.lr_g_factor*self.learning_rate
-        print("lr_d", lr_d)
-        print("lr_g", lr_g)
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quantize.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr_g, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr_d, betas=(0.5, 0.9))
-
-        if self.scheduler_config is not None:
-            scheduler = instantiate_from_config(self.scheduler_config)
-
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-                {
-                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-            ]
-            return [opt_ae, opt_disc], scheduler
-        return [opt_ae, opt_disc], []
-
-    def get_last_layer(self):
-        return self.decoder.conv_out.weight
-
-    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.image_key)
-        x = x.to(self.device)
-        if only_inputs:
-            log["inputs"] = x
-            return log
-        xrec, _ = self(x)
-        if x.shape[1] > 3:
-            # colorize with random projection
-            assert xrec.shape[1] > 3
-            x = self.to_rgb(x)
-            xrec = self.to_rgb(xrec)
-        log["inputs"] = x
-        log["reconstructions"] = xrec
-        if plot_ema:
-            with self.ema_scope():
-                xrec_ema, _ = self(x)
-                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
-                log["reconstructions_ema"] = xrec_ema
-        return log
-
-    def to_rgb(self, x):
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
-        x = F.conv2d(x, weight=self.colorize)
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
-        return x
-
-
-class VQModelInterface(VQModel):
-    def __init__(self, embed_dim, *args, **kwargs):
-        super().__init__(embed_dim=embed_dim, *args, **kwargs)
-        self.embed_dim = embed_dim
-
-    def encode(self, x):
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-
-    def decode(self, h, force_not_quantize=False):
-        # also go through quantization layer
-        if not force_not_quantize:
-            quant, emb_loss, info = self.quantize(h)
-        else:
-            quant = h
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-
-
-class AutoencoderKL(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 ):
-        super().__init__()
-        self.image_key = image_key
-        self.encoder = Encoder(**ddconfig)
-        self.decoder = Decoder(**ddconfig)
-        self.loss = instantiate_from_config(lossconfig)
-        assert ddconfig["double_z"]
-        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
-        self.embed_dim = embed_dim
-        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
-        if monitor is not None:
-            self.monitor = monitor
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-
-    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path}")
-
     def encode(self, x):
         h = self.encoder(x)
         moments = self.quant_conv(h)
@@ -370,25 +129,33 @@ class AutoencoderKL(pl.LightningModule):
             return discloss
 
     def validation_step(self, batch, batch_idx):
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():  # second pass logs under the "val_ema" split; its return value is not needed
+            self._validation_step(batch, batch_idx, postfix="_ema")
+        return log_dict
+
+    def _validation_step(self, batch, batch_idx, postfix=""):
         inputs = self.get_input(batch, self.image_key)
         reconstructions, posterior = self(inputs)
         aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
-                                        last_layer=self.get_last_layer(), split="val")
+                                        last_layer=self.get_last_layer(), split="val"+postfix)
 
         discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
-                                            last_layer=self.get_last_layer(), split="val")
+                                            last_layer=self.get_last_layer(), split="val"+postfix)
 
-        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
         self.log_dict(log_dict_ae)
         self.log_dict(log_dict_disc)
         return self.log_dict
 
     def configure_optimizers(self):
         lr = self.learning_rate
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
+        ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
+            self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
+        if self.learn_logvar:
+            print(f"{self.__class__.__name__}: Learning logvar")
+            ae_params_list.append(self.loss.logvar)
+        opt_ae = torch.optim.Adam(ae_params_list,
                                   lr=lr, betas=(0.5, 0.9))
         opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                     lr=lr, betas=(0.5, 0.9))
@@ -398,7 +165,7 @@ class AutoencoderKL(pl.LightningModule):
         return self.decoder.conv_out.weight
 
     @torch.no_grad()
-    def log_images(self, batch, only_inputs=False, **kwargs):
+    def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
         log = dict()
         x = self.get_input(batch, self.image_key)
         x = x.to(self.device)
@@ -423,9 +190,9 @@ class AutoencoderKL(pl.LightningModule):
         return x
 
 
-class IdentityFirstStage(torch.nn.Module):
+class IdentityFirstStage(nn.Module):
     def __init__(self, *args, vq_interface=False, **kwargs):
-        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
+        self.vq_interface = vq_interface
         super().__init__()
 
     def encode(self, x, *args, **kwargs):
@@ -441,3 +208,4 @@ class IdentityFirstStage(torch.nn.Module):
 
     def forward(self, x, *args, **kwargs):
         return x
+
diff --git a/ldm/models/diffusion/classifier.py b/ldm/models/diffusion/classifier.py
deleted file mode 100755
index 67e98b9d8ffb96a150b517497ace0a242d7163ef..0000000000000000000000000000000000000000
--- a/ldm/models/diffusion/classifier.py
+++ /dev/null
@@ -1,267 +0,0 @@
-import os
-import torch
-import pytorch_lightning as pl
-from omegaconf import OmegaConf
-from torch.nn import functional as F
-from torch.optim import AdamW
-from torch.optim.lr_scheduler import LambdaLR
-from copy import deepcopy
-from einops import rearrange
-from glob import glob
-from natsort import natsorted
-
-from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
-from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
-
-__models__ = {
-    'class_label': EncoderUNetModel,
-    'segmentation': UNetModel
-}
-
-
-def disabled_train(self, mode=True):
-    """Overwrite model.train with this function to make sure train/eval mode
-    does not change anymore."""
-    return self
-
-
-class NoisyLatentImageClassifier(pl.LightningModule):
-
-    def __init__(self,
-                 diffusion_path,
-                 num_classes,
-                 ckpt_path=None,
-                 pool='attention',
-                 label_key=None,
-                 diffusion_ckpt_path=None,
-                 scheduler_config=None,
-                 weight_decay=1.e-2,
-                 log_steps=10,
-                 monitor='val/loss',
-                 *args,
-                 **kwargs):
-        super().__init__(*args, **kwargs)
-        self.num_classes = num_classes
-        # get latest config of diffusion model
-        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
-        self.diffusion_config = OmegaConf.load(diffusion_config).model
-        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
-        self.load_diffusion()
-
-        self.monitor = monitor
-        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
-        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
-        self.log_steps = log_steps
-
-        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
-            else self.diffusion_model.cond_stage_key
-
-        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
-
-        if self.label_key not in __models__:
-            raise NotImplementedError()
-
-        self.load_classifier(ckpt_path, pool)
-
-        self.scheduler_config = scheduler_config
-        self.use_scheduler = self.scheduler_config is not None
-        self.weight_decay = weight_decay
-
-    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
-        sd = torch.load(path, map_location="cpu")
-        if "state_dict" in list(sd.keys()):
-            sd = sd["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
-            sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-        if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
-
-    def load_diffusion(self):
-        model = instantiate_from_config(self.diffusion_config)
-        self.diffusion_model = model.eval()
-        self.diffusion_model.train = disabled_train
-        for param in self.diffusion_model.parameters():
-            param.requires_grad = False
-
-    def load_classifier(self, ckpt_path, pool):
-        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
-        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
-        model_config.out_channels = self.num_classes
-        if self.label_key == 'class_label':
-            model_config.pool = pool
-
-        self.model = __models__[self.label_key](**model_config)
-        if ckpt_path is not None:
-            print('#####################################################################')
-            print(f'load from ckpt "{ckpt_path}"')
-            print('#####################################################################')
-            self.init_from_ckpt(ckpt_path)
-
-    @torch.no_grad()
-    def get_x_noisy(self, x, t, noise=None):
-        noise = default(noise, lambda: torch.randn_like(x))
-        continuous_sqrt_alpha_cumprod = None
-        if self.diffusion_model.use_continuous_noise:
-            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
-            # todo: make sure t+1 is correct here
-
-        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
-                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
-
-    def forward(self, x_noisy, t, *args, **kwargs):
-        return self.model(x_noisy, t)
-
-    @torch.no_grad()
-    def get_input(self, batch, k):
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-        x = x.to(memory_format=torch.contiguous_format).float()
-        return x
-
-    @torch.no_grad()
-    def get_conditioning(self, batch, k=None):
-        if k is None:
-            k = self.label_key
-        assert k is not None, 'Needs to provide label key'
-
-        targets = batch[k].to(self.device)
-
-        if self.label_key == 'segmentation':
-            targets = rearrange(targets, 'b h w c -> b c h w')
-            for down in range(self.numd):
-                h, w = targets.shape[-2:]
-                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
-
-            # targets = rearrange(targets,'b c h w -> b h w c')
-
-        return targets
-
-    def compute_top_k(self, logits, labels, k, reduction="mean"):
-        _, top_ks = torch.topk(logits, k, dim=1)
-        if reduction == "mean":
-            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
-        elif reduction == "none":
-            return (top_ks == labels[:, None]).float().sum(dim=-1)
-
-    def on_train_epoch_start(self):
-        # save some memory
-        self.diffusion_model.model.to('cpu')
-
-    @torch.no_grad()
-    def write_logs(self, loss, logits, targets):
-        log_prefix = 'train' if self.training else 'val'
-        log = {}
-        log[f"{log_prefix}/loss"] = loss.mean()
-        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
-            logits, targets, k=1, reduction="mean"
-        )
-        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
-            logits, targets, k=5, reduction="mean"
-        )
-
-        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
-        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
-        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
-        lr = self.optimizers().param_groups[0]['lr']
-        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
-
-    def shared_step(self, batch, t=None):
-        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
-        targets = self.get_conditioning(batch)
-        if targets.dim() == 4:
-            targets = targets.argmax(dim=1)
-        if t is None:
-            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
-        else:
-            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
-        x_noisy = self.get_x_noisy(x, t)
-        logits = self(x_noisy, t)
-
-        loss = F.cross_entropy(logits, targets, reduction='none')
-
-        self.write_logs(loss.detach(), logits.detach(), targets.detach())
-
-        loss = loss.mean()
-        return loss, logits, x_noisy, targets
-
-    def training_step(self, batch, batch_idx):
-        loss, *_ = self.shared_step(batch)
-        return loss
-
-    def reset_noise_accs(self):
-        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
-                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
-
-    def on_validation_start(self):
-        self.reset_noise_accs()
-
-    @torch.no_grad()
-    def validation_step(self, batch, batch_idx):
-        loss, *_ = self.shared_step(batch)
-
-        for t in self.noisy_acc:
-            _, logits, _, targets = self.shared_step(batch, t)
-            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
-            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
-
-        return loss
-
-    def configure_optimizers(self):
-        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
-
-        if self.use_scheduler:
-            scheduler = instantiate_from_config(self.scheduler_config)
-
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                }]
-            return [optimizer], scheduler
-
-        return optimizer
-
-    @torch.no_grad()
-    def log_images(self, batch, N=8, *args, **kwargs):
-        log = dict()
-        x = self.get_input(batch, self.diffusion_model.first_stage_key)
-        log['inputs'] = x
-
-        y = self.get_conditioning(batch)
-
-        if self.label_key == 'class_label':
-            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-            log['labels'] = y
-
-        if ismap(y):
-            log['labels'] = self.diffusion_model.to_rgb(y)
-
-            for step in range(self.log_steps):
-                current_time = step * self.log_time_interval
-
-                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
-
-                log[f'inputs@t{current_time}'] = x_noisy
-
-                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
-                pred = rearrange(pred, 'b h w c -> b c h w')
-
-                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
-
-        for key in log:
-            log[key] = log[key][:N]
-
-        return log
diff --git a/ldm/models/diffusion/ddim.py b/ldm/models/diffusion/ddim.py
index fb31215db5c3f3f703f15987d7eee6a179c9f7ec..9f19c803246a0125d9c67c31df49da351c7552f0 100755
--- a/ldm/models/diffusion/ddim.py
+++ b/ldm/models/diffusion/ddim.py
@@ -3,7 +3,6 @@
 import torch
 import numpy as np
 from tqdm import tqdm
-from functools import partial
 
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
     extract_into_tensor
@@ -24,7 +23,7 @@ class DDIMSampler(object):
 
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
@@ -43,14 +42,14 @@ class DDIMSampler(object):
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
+                                                                                   eta=ddim_eta, verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
 
     @torch.no_grad()
@@ -75,6 +74,9 @@ class DDIMSampler(object):
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
+               features_adapter=None,
+               append_to_context=None,
+               cond_tau=0.4,
                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
                **kwargs
                ):
@@ -107,6 +109,9 @@ class DDIMSampler(object):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
+                                                    features_adapter=features_adapter,
+                                                    append_to_context=append_to_context,
+                                                    cond_tau=cond_tau,
                                                     )
         return samples, intermediates
 
@@ -116,7 +121,8 @@ class DDIMSampler(object):
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      append_to_context=None, cond_tau=0.4):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
@@ -131,7 +137,7 @@ class DDIMSampler(object):
             timesteps = self.ddim_timesteps[:subset_end]
 
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
+        time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running DDIM Sampling with {total_steps} timesteps")
 
@@ -151,7 +157,13 @@ class DDIMSampler(object):
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning)
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      features_adapter=None if index < int(
+                                          (1 - cond_tau) * total_steps) else features_adapter,
+                                      # TODO support style_cond_tau
+                                      append_to_context=None if index < int(
+                                          0.5 * total_steps) else append_to_context,
+                                      )
             img, pred_x0 = outs
             if callback: callback(i)
             if img_callback: img_callback(pred_x0, i)
@@ -165,20 +177,55 @@ class DDIMSampler(object):
     @torch.no_grad()
     def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None):
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      append_to_context=None):
         b, *_, device = *x.shape, x.device
 
         if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-            e_t = self.model.apply_model(x, t, c)
+            if append_to_context is not None:
+                model_output = self.model.apply_model(x, t, torch.cat([c, append_to_context], dim=1),
+                                                      features_adapter=features_adapter)
+            else:
+                model_output = self.model.apply_model(x, t, c, features_adapter=features_adapter)
         else:
             x_in = torch.cat([x] * 2)
             t_in = torch.cat([t] * 2)
-            c_in = torch.cat([unconditional_conditioning, c])
-            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+            if isinstance(c, dict):
+                assert isinstance(unconditional_conditioning, dict)
+                c_in = dict()
+                for k in c:
+                    if isinstance(c[k], list):
+                        c_in[k] = [torch.cat([
+                            unconditional_conditioning[k][i],
+                            c[k][i]]) for i in range(len(c[k]))]
+                    else:
+                        c_in[k] = torch.cat([
+                            unconditional_conditioning[k],
+                            c[k]])
+            elif isinstance(c, list):
+                c_in = list()
+                assert isinstance(unconditional_conditioning, list)
+                for i in range(len(c)):
+                    c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
+            else:
+                if append_to_context is not None:
+                    pad_len = append_to_context.size(1)
+                    new_unconditional_conditioning = torch.cat(
+                        [unconditional_conditioning, unconditional_conditioning[:, -pad_len:, :]], dim=1)
+                    new_c = torch.cat([c, append_to_context], dim=1)
+                    c_in = torch.cat([new_unconditional_conditioning, new_c])
+                else:
+                    c_in = torch.cat([unconditional_conditioning, c])
+            model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in, features_adapter=features_adapter).chunk(2)
+            model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
+
+        if self.model.parameterization == "v":
+            e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
+        else:
+            e_t = model_output
 
         if score_corrector is not None:
-            assert self.model.parameterization == "eps"
+            assert self.model.parameterization == "eps", 'not implemented'
             e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
 
         alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
@@ -189,14 +236,18 @@ class DDIMSampler(object):
         a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
         a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
         sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
 
         # current prediction for x_0
-        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        if self.model.parameterization != "v":
+            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        else:
+            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
+
         if quantize_denoised:
             pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
         # direction pointing to x_t
-        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+        dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
         noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
         if noise_dropout > 0.:
             noise = torch.nn.functional.dropout(noise, p=noise_dropout)
@@ -238,4 +289,4 @@ class DDIMSampler(object):
             x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                           unconditional_guidance_scale=unconditional_guidance_scale,
                                           unconditional_conditioning=unconditional_conditioning)
-        return x_dec
\ No newline at end of file
+        return x_dec
diff --git a/ldm/models/diffusion/ddpm.py b/ldm/models/diffusion/ddpm.py
index 6e05db7cf98be1fb14f30455422b1a7b3058eb96..263840b499ec9df0be40a02a665e0245b32a2f29 100755
--- a/ldm/models/diffusion/ddpm.py
+++ b/ldm/models/diffusion/ddpm.py
@@ -12,16 +12,18 @@ import numpy as np
 import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR
 from einops import rearrange, repeat
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from functools import partial
+import itertools
 from tqdm import tqdm
 from torchvision.utils import make_grid
 from pytorch_lightning.utilities.distributed import rank_zero_only
+from omegaconf import ListConfig
 
 from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
 from ldm.modules.ema import LitEma
 from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
+from ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL
 from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
 from ldm.models.diffusion.ddim import DDIMSampler
 
@@ -71,9 +73,13 @@ class DDPM(pl.LightningModule):
                  use_positional_encodings=False,
                  learn_logvar=False,
                  logvar_init=0.,
+                 make_it_fit=False,
+                 ucg_training=None,
+                 reset_ema=False,
+                 reset_num_ema_updates=False,
                  ):
         super().__init__()
-        assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
+        assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"'
         self.parameterization = parameterization
         print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
         self.cond_stage_model = None
@@ -100,8 +106,18 @@ class DDPM(pl.LightningModule):
 
         if monitor is not None:
             self.monitor = monitor
+        self.make_it_fit = make_it_fit
+        if reset_ema: assert exists(ckpt_path)
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+            if reset_ema:
+                assert self.use_ema
+                print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
 
         self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
                                linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
@@ -113,6 +129,9 @@ class DDPM(pl.LightningModule):
         if self.learn_logvar:
             self.logvar = nn.Parameter(self.logvar, requires_grad=True)
 
+        self.ucg_training = ucg_training or dict()
+        if self.ucg_training:
+            self.ucg_prng = np.random.RandomState()
 
     def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
@@ -146,7 +165,7 @@ class DDPM(pl.LightningModule):
 
         # calculations for posterior q(x_{t-1} | x_t, x_0)
         posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
-                    1. - alphas_cumprod) + self.v_posterior * betas
+                1. - alphas_cumprod) + self.v_posterior * betas
         # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
         self.register_buffer('posterior_variance', to_torch(posterior_variance))
         # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
@@ -158,12 +177,14 @@ class DDPM(pl.LightningModule):
 
         if self.parameterization == "eps":
             lvlb_weights = self.betas ** 2 / (
-                        2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
+                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
         elif self.parameterization == "x0":
             lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
+        elif self.parameterization == "v":
+            lvlb_weights = torch.ones_like(self.betas ** 2 / (
+                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)))
         else:
             raise NotImplementedError("mu not supported")
-        # TODO how to choose this term
         lvlb_weights[0] = lvlb_weights[1]
         self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
         assert not torch.isnan(self.lvlb_weights).all()
@@ -183,6 +204,7 @@ class DDPM(pl.LightningModule):
                 if context is not None:
                     print(f"{context}: Restored training weights")
 
+    @torch.no_grad()
     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         sd = torch.load(path, map_location="cpu")
         if "state_dict" in list(sd.keys()):
@@ -193,13 +215,57 @@ class DDPM(pl.LightningModule):
                 if k.startswith(ik):
                     print("Deleting key {} from state_dict.".format(k))
                     del sd[k]
+        if self.make_it_fit:
+            n_params = len([name for name, _ in
+                            itertools.chain(self.named_parameters(),
+                                            self.named_buffers())])
+            for name, param in tqdm(
+                    itertools.chain(self.named_parameters(),
+                                    self.named_buffers()),
+                    desc="Fitting old weights to new weights",
+                    total=n_params
+            ):
+                if not name in sd:
+                    continue
+                old_shape = sd[name].shape
+                new_shape = param.shape
+                assert len(old_shape) == len(new_shape)
+                if len(new_shape) > 2:
+                    # we only modify first two axes
+                    assert new_shape[2:] == old_shape[2:]
+                # assumes first axis corresponds to output dim
+                if not new_shape == old_shape:
+                    new_param = param.clone()
+                    old_param = sd[name]
+                    if len(new_shape) == 1:
+                        for i in range(new_param.shape[0]):
+                            new_param[i] = old_param[i % old_shape[0]]
+                    elif len(new_shape) >= 2:
+                        for i in range(new_param.shape[0]):
+                            for j in range(new_param.shape[1]):
+                                new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]]
+
+                        n_used_old = torch.ones(old_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_old[j % old_shape[1]] += 1
+                        n_used_new = torch.zeros(new_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_new[j] = n_used_old[j % old_shape[1]]
+
+                        n_used_new = n_used_new[None, :]
+                        while len(n_used_new.shape) < len(new_shape):
+                            n_used_new = n_used_new.unsqueeze(-1)
+                        new_param /= n_used_new
+
+                    sd[name] = new_param
+
         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
             sd, strict=False)
         print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
         if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
+            print(f"Missing Keys:\n {missing}")
         if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
+            print(f"\nUnexpected Keys:\n {unexpected}")
 
     def q_mean_variance(self, x_start, t):
         """
@@ -219,6 +285,20 @@ class DDPM(pl.LightningModule):
                 extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
         )
 
+    def predict_start_from_z_and_v(self, x_t, t, v):
+        # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+        # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
+        )
+
+    def predict_eps_from_z_and_v(self, x_t, t, v):
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t
+        )
+
     def q_posterior(self, x_start, x_t, t):
         posterior_mean = (
                 extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
@@ -276,6 +356,12 @@ class DDPM(pl.LightningModule):
         return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                 extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
 
+    def get_v(self, x, noise, t):
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x
+        )
+
     def get_loss(self, pred, target, mean=True):
         if self.loss_type == 'l1':
             loss = (target - pred).abs()
@@ -301,6 +387,8 @@ class DDPM(pl.LightningModule):
             target = noise
         elif self.parameterization == "x0":
             target = x_start
+        elif self.parameterization == "v":
+            target = self.get_v(x_start, noise, t)
         else:
             raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")
 
@@ -328,10 +416,10 @@ class DDPM(pl.LightningModule):
 
     def get_input(self, batch, k):
         x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-        x = x.to(memory_format=torch.contiguous_format).float()
+        # if len(x.shape) == 3:
+        #     x = x[..., None]
+        # x = rearrange(x, 'b h w c -> b c h w')
+        # x = x.to(memory_format=torch.contiguous_format).float()
         return x
 
     def shared_step(self, batch):
@@ -421,41 +509,12 @@ class DDPM(pl.LightningModule):
         return opt
 
 
-class DiffusionWrapper(pl.LightningModule):
-    def __init__(self, diff_model_config, conditioning_key):
-        super().__init__()
-        self.diffusion_model = instantiate_from_config(diff_model_config)
-        self.conditioning_key = conditioning_key
-        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']
-
-    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, features_adapter=None):
-        if self.conditioning_key is None:
-            out = self.diffusion_model(x, t, features_adapter=features_adapter)
-        elif self.conditioning_key == 'concat':
-            xc = torch.cat([x] + c_concat, dim=1)
-            out = self.diffusion_model(xc, t, features_adapter=features_adapter)
-        elif self.conditioning_key == 'crossattn':
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(x, t, context=cc, features_adapter=features_adapter)
-        elif self.conditioning_key == 'hybrid':
-            xc = torch.cat([x] + c_concat, dim=1)
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(xc, t, context=cc, features_adapter=features_adapter)
-        elif self.conditioning_key == 'adm':
-            cc = c_crossattn[0]
-            out = self.diffusion_model(x, t, y=cc, features_adapter=features_adapter)
-        else:
-            raise NotImplementedError()
-
-        return out
-
-
 class LatentDiffusion(DDPM):
     """main class"""
+
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
-                 unet_config,
                  num_timesteps_cond=None,
                  cond_stage_key="image",
                  cond_stage_trainable=False,
@@ -474,9 +533,10 @@ class LatentDiffusion(DDPM):
         if cond_stage_config == '__is_unconditional__':
             conditioning_key = None
         ckpt_path = kwargs.pop("ckpt_path", None)
+        reset_ema = kwargs.pop("reset_ema", False)
+        reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False)
         ignore_keys = kwargs.pop("ignore_keys", [])
-        super().__init__(conditioning_key=conditioning_key, unet_config=unet_config, *args, **kwargs)
-        self.model = DiffusionWrapper(unet_config, conditioning_key)
+        super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
         self.cond_stage_key = cond_stage_key
@@ -492,35 +552,27 @@ class LatentDiffusion(DDPM):
         self.instantiate_cond_stage(cond_stage_config)
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
-        self.bbox_tokenizer = None  
+        self.bbox_tokenizer = None
 
         self.restarted_from_ckpt = False
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys)
             self.restarted_from_ckpt = True
+            if reset_ema:
+                assert self.use_ema
+                print(
+                    f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
 
     def make_cond_schedule(self, ):
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
         self.cond_ids[:self.num_timesteps_cond] = ids
 
-    @rank_zero_only
-    @torch.no_grad()
-    def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
-        # only for very first batch
-        if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
-            assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
-            # set rescale weight to 1./std of encodings
-            print("### USING STD-RESCALING ###")
-            x = super().get_input(batch, self.first_stage_key)
-            x = x.to(self.device)
-            encoder_posterior = self.encode_first_stage(x)
-            z = self.get_first_stage_encoding(encoder_posterior).detach()
-            del self.scale_factor
-            self.register_buffer('scale_factor', 1. / z.flatten().std())
-            print(f"setting self.scale_factor to {self.scale_factor}")
-            print("### USING STD-RESCALING ###")
-
     def register_schedule(self,
                           given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
@@ -562,7 +614,7 @@ class LatentDiffusion(DDPM):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):
             denoise_row.append(self.decode_first_stage(zd.to(self.device),
-                                                            force_not_quantize=force_no_decoder_quantization))
+                                                       force_not_quantize=force_no_decoder_quantization))
         n_imgs_per_row = len(denoise_row)
         denoise_row = torch.stack(denoise_row)  # n_log_step, n_row, C, H, W
         denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
@@ -695,9 +747,9 @@ class LatentDiffusion(DDPM):
             if cond_key is None:
                 cond_key = self.cond_stage_key
             if cond_key != self.first_stage_key:
-                if cond_key in ['caption', 'coordinates_bbox']:
+                if cond_key in ['caption', 'coordinates_bbox', "txt"]:
                     xc = batch[cond_key]
-                elif cond_key == 'class_label':
+                elif cond_key in ['class_label', 'cls']:
                     xc = batch
                 else:
                     xc = super().get_input(batch, cond_key).to(self.device)
@@ -742,181 +794,28 @@ class LatentDiffusion(DDPM):
             z = rearrange(z, 'b h w c -> b c h w').contiguous()
 
         z = 1. / self.scale_factor * z
-
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
-
-    # same as above but without decorator
-    def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
-        if predict_cids:
-            if z.dim() == 4:
-                z = torch.argmax(z.exp(), dim=1).long()
-            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
-            z = rearrange(z, 'b h w c -> b c h w').contiguous()
-
-        z = 1. / self.scale_factor * z
-
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):  
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
+        return self.first_stage_model.decode(z)
 
     @torch.no_grad()
     def encode_first_stage(self, x):
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                df = self.split_input_params["vqf"]
-                self.split_input_params['original_image_size'] = x.shape[-2:]
-                bs, nc, h, w = x.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-
-                fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
-                z = unfold(x)  # (bn, nc * prod(**ks), L)
-                # Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                output_list = [self.first_stage_model.encode(z[:, :, :, :, i])
-                               for i in range(z.shape[-1])]
-
-                o = torch.stack(output_list, axis=-1)
-                o = o * weighting
-
-                # Reverse reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization
-                return decoded
-
-            else:
-                return self.first_stage_model.encode(x)
-        else:
-            return self.first_stage_model.encode(x)
+        return self.first_stage_model.encode(x)
 
     def shared_step(self, batch, **kwargs):
         x, c = self.get_input(batch, self.first_stage_key)
-        loss = self(x, c)
+        loss = self(x, c, **kwargs)
         return loss
 
-    def forward(self, x, c, features_adapter=None, *args, **kwargs):
-        t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
-
-        return self.p_losses(x, c, t, features_adapter, *args, **kwargs)
-
-    def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset
-        def rescale_bbox(bbox):
-            x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
-            y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
-            w = min(bbox[2] / crop_coordinates[2], 1 - x0)
-            h = min(bbox[3] / crop_coordinates[3], 1 - y0)
-            return x0, y0, w, h
-
-        return [rescale_bbox(b) for b in bboxes]
+    def forward(self, x, c, *args, **kwargs):
+        if 't' not in kwargs:
+            t = torch.randint(0, self.num_timesteps, (x.shape[0], ), device=self.device).long()
+        else:
+            t = kwargs.pop('t')
 
-    def apply_model(self, x_noisy, t, cond, features_adapter=None, return_ids=False):
+        return self.p_losses(x, c, t, *args, **kwargs)
 
+    def apply_model(self, x_noisy, t, cond, return_ids=False, **kwargs):
         if isinstance(cond, dict):
-            # hybrid case, cond is exptected to be a dict
+            # hybrid case, cond is expected to be a dict
             pass
         else:
             if not isinstance(cond, list):
@@ -924,98 +823,7 @@ class LatentDiffusion(DDPM):
             key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
             cond = {key: cond}
 
-        if hasattr(self, "split_input_params"):
-            assert len(cond) == 1  # todo can only deal with one conditioning atm
-            assert not return_ids  
-            ks = self.split_input_params["ks"]  # eg. (128, 128)
-            stride = self.split_input_params["stride"]  # eg. (64, 64)
-
-            h, w = x_noisy.shape[-2:]
-
-            fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)
-
-            z = unfold(x_noisy)  # (bn, nc * prod(**ks), L)
-            # Reshape to img shape
-            z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-            z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]
-
-            if self.cond_stage_key in ["image", "LR_image", "segmentation",
-                                       'bbox_img'] and self.model.conditioning_key:  # todo check for completeness
-                c_key = next(iter(cond.keys()))  # get key
-                c = next(iter(cond.values()))  # get value
-                assert (len(c) == 1)  # todo extend to list with more than one elem
-                c = c[0]  # get element
-
-                c = unfold(c)
-                c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-
-                cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]
-
-            elif self.cond_stage_key == 'coordinates_bbox':
-                assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'
-
-                # assuming padding of unfold is always 0 and its dilation is always 1
-                n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
-                full_img_h, full_img_w = self.split_input_params['original_image_size']
-                # as we are operating on latents, we need the factor from the original image size to the
-                # spatial latent size to properly rescale the crops for regenerating the bbox annotations
-                num_downs = self.first_stage_model.encoder.num_resolutions - 1
-                rescale_latent = 2 ** (num_downs)
-
-                # get top left postions of patches as conforming for the bbbox tokenizer, therefore we
-                # need to rescale the tl patch coordinates to be in between (0,1)
-                tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
-                                         rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
-                                        for patch_nr in range(z.shape[-1])]
-
-                # patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
-                patch_limits = [(x_tl, y_tl,
-                                 rescale_latent * ks[0] / full_img_w,
-                                 rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]
-                # patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]
-
-                # tokenize crop coordinates for the bounding boxes of the respective patches
-                patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
-                                      for bbox in patch_limits]  # list of length l with tensors of shape (1, 2)
-                print(patch_limits_tknzd[0].shape)
-                # cut tknzd crop position from conditioning
-                assert isinstance(cond, dict), 'cond must be dict to be fed into model'
-                cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)
-                print(cut_cond.shape)
-
-                adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
-                adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
-                print(adapted_cond.shape)
-                adapted_cond = self.get_learned_conditioning(adapted_cond)
-                print(adapted_cond.shape)
-                adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])
-                print(adapted_cond.shape)
-
-                cond_list = [{'c_crossattn': [e]} for e in adapted_cond]
-
-            else:
-                cond_list = [cond for i in range(z.shape[-1])]  # Todo make this more efficient
-
-            # apply model by loop over crops
-            if features_adapter is not None:
-                output_list = [self.model(z_list[i], t, **cond_list[i], features_adapter=features_adapter) for i in range(z.shape[-1])]
-            else:
-                output_list = [self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])]
-            assert not isinstance(output_list[0],
-                                  tuple)  # todo cant deal with multiple model outputs check this never happens
-
-            o = torch.stack(output_list, axis=-1)
-            o = o * weighting
-            # Reverse reshape to img shape
-            o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-            # stitch crops together
-            x_recon = fold(o) / normalization
-
-        else:
-            if features_adapter is not None:
-                x_recon = self.model(x_noisy, t, **cond, features_adapter=features_adapter)
-            else:
-                x_recon = self.model(x_noisy, t, **cond)
+        x_recon = self.model(x_noisy, t, **cond, **kwargs)
 
         if isinstance(x_recon, tuple) and not return_ids:
             return x_recon[0]
@@ -1040,10 +848,10 @@ class LatentDiffusion(DDPM):
         kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
         return mean_flat(kl_prior) / np.log(2.0)
 
-    def p_losses(self, x_start, cond, t, features_adapter=None, noise=None):
+    def p_losses(self, x_start, cond, t, noise=None, **kwargs):
         noise = default(noise, lambda: torch.randn_like(x_start))
         x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-        model_output = self.apply_model(x_noisy, t, cond, features_adapter)
+        model_output = self.apply_model(x_noisy, t, cond, **kwargs)
 
         loss_dict = {}
         prefix = 'train' if self.training else 'val'
@@ -1052,6 +860,8 @@ class LatentDiffusion(DDPM):
             target = x_start
         elif self.parameterization == "eps":
             target = noise
+        elif self.parameterization == "v":
+            target = self.get_v(x_start, noise, t)
         else:
             raise NotImplementedError()
 
@@ -1247,7 +1057,7 @@ class LatentDiffusion(DDPM):
     @torch.no_grad()
     def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
                verbose=True, timesteps=None, quantize_denoised=False,
-               mask=None, x0=None, shape=None,**kwargs):
+               mask=None, x0=None, shape=None, **kwargs):
         if shape is None:
             shape = (batch_size, self.channels, self.image_size, self.image_size)
         if cond is not None:
@@ -1263,26 +1073,51 @@ class LatentDiffusion(DDPM):
                                   mask=mask, x0=x0)
 
     @torch.no_grad()
-    def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
-
+    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
         if ddim:
             ddim_sampler = DDIMSampler(self)
             shape = (self.channels, self.image_size, self.image_size)
-            samples, intermediates =ddim_sampler.sample(ddim_steps,batch_size,
-                                                        shape,cond,verbose=False,**kwargs)
+            samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size,
+                                                         shape, cond, verbose=False, **kwargs)
 
         else:
             samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
-                                                 return_intermediates=True,**kwargs)
+                                                 return_intermediates=True, **kwargs)
 
         return samples, intermediates
 
+    @torch.no_grad()
+    def get_unconditional_conditioning(self, batch_size, null_label=None):
+        if null_label is not None:
+            xc = null_label
+            if isinstance(xc, ListConfig):
+                xc = list(xc)
+            if isinstance(xc, dict) or isinstance(xc, list):
+                c = self.get_learned_conditioning(xc)
+            else:
+                if hasattr(xc, "to"):
+                    xc = xc.to(self.device)
+                c = self.get_learned_conditioning(xc)
+        else:
+            if self.cond_stage_key in ["class_label", "cls"]:
+                xc = self.cond_stage_model.get_unconditional_conditioning(batch_size, device=self.device)
+                return self.get_learned_conditioning(xc)
+            else:
+                raise NotImplementedError("todo")
+        if isinstance(c, list):  # in case the encoder gives us a list
+            for i in range(len(c)):
+                c[i] = repeat(c[i], '1 ... -> b ...', b=batch_size).to(self.device)
+        else:
+            c = repeat(c, '1 ... -> b ...', b=batch_size).to(self.device)
+        return c
 
     @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
+    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None,
                    quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, **kwargs):
-
+                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
+                   use_ema_scope=True,
+                   **kwargs):
+        ema_scope = self.ema_scope if use_ema_scope else nullcontext
         use_ddim = ddim_steps is not None
 
         log = dict()
@@ -1299,12 +1134,16 @@ class LatentDiffusion(DDPM):
             if hasattr(self.cond_stage_model, "decode"):
                 xc = self.cond_stage_model.decode(c)
                 log["conditioning"] = xc
-            elif self.cond_stage_key in ["caption"]:
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["caption"])
+            elif self.cond_stage_key in ["caption", "txt"]:
+                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
                 log["conditioning"] = xc
-            elif self.cond_stage_key == 'class_label':
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-                log['conditioning'] = xc
+            elif self.cond_stage_key in ['class_label', "cls"]:
+                try:
+                    xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
+                    log['conditioning'] = xc
+                except KeyError:
+                    # probably no "human_label" in batch
+                    pass
             elif isimage(xc):
                 log["conditioning"] = xc
             if ismap(xc):
@@ -1330,9 +1169,9 @@ class LatentDiffusion(DDPM):
 
         if sample:
             # get denoise row
-            with self.ema_scope("Plotting"):
-                samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                         ddim_steps=ddim_steps,eta=ddim_eta)
+            with ema_scope("Sampling"):
+                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                         ddim_steps=ddim_steps, eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
@@ -1343,39 +1182,52 @@ class LatentDiffusion(DDPM):
             if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                     self.first_stage_model, IdentityFirstStage):
                 # also display when quantizing x0 while sampling
-                with self.ema_scope("Plotting Quantized Denoised"):
-                    samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                             ddim_steps=ddim_steps,eta=ddim_eta,
+                with ema_scope("Plotting Quantized Denoised"):
+                    samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                             ddim_steps=ddim_steps, eta=ddim_eta,
                                                              quantize_denoised=True)
                     # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
                     #                                      quantize_denoised=True)
                 x_samples = self.decode_first_stage(samples.to(self.device))
                 log["samples_x0_quantized"] = x_samples
 
-            if inpaint:
-                # make a simple center square
-                b, h, w = z.shape[0], z.shape[2], z.shape[3]
-                mask = torch.ones(N, h, w).to(self.device)
-                # zeros will be filled in
-                mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
-                mask = mask[:, None, ...]
-                with self.ema_scope("Plotting Inpaint"):
-
-                    samples, _ = self.sample_log(cond=c,batch_size=N,ddim=use_ddim, eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_inpainting"] = x_samples
-                log["mask"] = mask
-
-                # outpaint
-                with self.ema_scope("Plotting Outpaint"):
-                    samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_outpainting"] = x_samples
+        if unconditional_guidance_scale > 1.0:
+            uc = self.get_unconditional_conditioning(N, unconditional_guidance_label)
+            if self.model.conditioning_key == "crossattn-adm":
+                uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
+            with ema_scope("Sampling with classifier-free guidance"):
+                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                 ddim_steps=ddim_steps, eta=ddim_eta,
+                                                 unconditional_guidance_scale=unconditional_guidance_scale,
+                                                 unconditional_conditioning=uc,
+                                                 )
+                x_samples_cfg = self.decode_first_stage(samples_cfg)
+                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
+
+        if inpaint:
+            # make a simple center square
+            b, h, w = z.shape[0], z.shape[2], z.shape[3]
+            mask = torch.ones(N, h, w).to(self.device)
+            # zeros will be filled in
+            mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
+            mask = mask[:, None, ...]
+            with ema_scope("Plotting Inpaint"):
+                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
+                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+            x_samples = self.decode_first_stage(samples.to(self.device))
+            log["samples_inpainting"] = x_samples
+            log["mask"] = mask
+
+            # outpaint
+            mask = 1. - mask
+            with ema_scope("Plotting Outpaint"):
+                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
+                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+            x_samples = self.decode_first_stage(samples.to(self.device))
+            log["samples_outpainting"] = x_samples
 
         if plot_progressive_rows:
-            with self.ema_scope("Plotting Progressives"):
+            with ema_scope("Plotting Progressives"):
                 img, progressives = self.progressive_denoising(c,
                                                                shape=(self.channels, self.image_size, self.image_size),
                                                                batch_size=N)
@@ -1422,25 +1274,40 @@ class LatentDiffusion(DDPM):
         x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
         return x
 
-class Layout2ImgDiffusion(LatentDiffusion):
-    # TODO: move all layout-specific hacks to this class
-    def __init__(self, cond_stage_key, *args, **kwargs):
-        assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
-        super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)
-
-    def log_images(self, batch, N=8, *args, **kwargs):
-        logs = super().log_images(batch=batch, N=N, *args, **kwargs)
 
-        key = 'train' if self.training else 'validation'
-        dset = self.trainer.datamodule.datasets[key]
-        mapper = dset.conditional_builders[self.cond_stage_key]
+class DiffusionWrapper(pl.LightningModule):
+    def __init__(self, diff_model_config, conditioning_key):
+        super().__init__()
+        self.diffusion_model = instantiate_from_config(diff_model_config)
+        self.conditioning_key = conditioning_key
+        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']
 
-        bbox_imgs = []
-        map_fn = lambda catno: dset.get_textual_label(dset.get_category_id(catno))
-        for tknzd_bbox in batch[self.cond_stage_key][:N]:
-            bboximg = mapper.plot(tknzd_bbox.detach().cpu(), map_fn, (256, 256))
-            bbox_imgs.append(bboximg)
+    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None, **kwargs):
+        if self.conditioning_key is None:
+            out = self.diffusion_model(x, t, **kwargs)
+        elif self.conditioning_key == 'concat':
+            xc = torch.cat([x] + c_concat, dim=1)
+            out = self.diffusion_model(xc, t, **kwargs)
+        elif self.conditioning_key == 'crossattn':
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(x, t, context=cc, **kwargs)
+        elif self.conditioning_key == 'hybrid':
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, **kwargs)
+        elif self.conditioning_key == 'hybrid-adm':
+            assert c_adm is not None
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, y=c_adm, **kwargs)
+        elif self.conditioning_key == 'crossattn-adm':
+            assert c_adm is not None
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(x, t, context=cc, y=c_adm, **kwargs)
+        elif self.conditioning_key == 'adm':
+            cc = c_crossattn[0]
+            out = self.diffusion_model(x, t, y=cc, **kwargs)
+        else:
+            raise NotImplementedError()
 
-        cond_img = torch.stack(bbox_imgs, dim=0)
-        logs['bbox_image'] = cond_img
-        return logs
+        return out
diff --git a/ldm/models/diffusion/dpm_solver/dpm_solver.py b/ldm/models/diffusion/dpm_solver/dpm_solver.py
index bdb64e0c78cc3520f92d79db3124c85fc3cfb9b4..23ebfebf167a6c16f3b57e09d491998c4adf68db 100755
--- a/ldm/models/diffusion/dpm_solver/dpm_solver.py
+++ b/ldm/models/diffusion/dpm_solver/dpm_solver.py
@@ -1,6 +1,7 @@
 import torch
 import torch.nn.functional as F
 import math
+from tqdm import tqdm
 
 
 class NoiseScheduleVP:
@@ -11,7 +12,7 @@ class NoiseScheduleVP:
             alphas_cumprod=None,
             continuous_beta_0=0.1,
             continuous_beta_1=20.,
-        ):
+    ):
         """Create a wrapper class for the forward SDE (VP type).
 
         ***
@@ -93,7 +94,9 @@ class NoiseScheduleVP:
         """
 
         if schedule not in ['discrete', 'linear', 'cosine']:
-            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))
+            raise ValueError(
+                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
+                    schedule))
 
         self.schedule = schedule
         if schedule == 'discrete':
@@ -112,7 +115,8 @@ class NoiseScheduleVP:
             self.beta_1 = continuous_beta_1
             self.cosine_s = 0.008
             self.cosine_beta_max = 999.
-            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
+            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
             self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
             self.schedule = schedule
             if schedule == 'cosine':
@@ -127,12 +131,13 @@ class NoiseScheduleVP:
         Compute log(alpha_t) of a given continuous-time label t in [0, T].
         """
         if self.schedule == 'discrete':
-            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
+            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
+                                  self.log_alpha_array.to(t.device)).reshape((-1))
         elif self.schedule == 'linear':
             return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
         elif self.schedule == 'cosine':
             log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
-            log_alpha_t =  log_alpha_fn(t) - self.cosine_log_alpha_0
+            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
             return log_alpha_t
 
     def marginal_alpha(self, t):
@@ -161,30 +166,32 @@ class NoiseScheduleVP:
         """
         if self.schedule == 'linear':
             tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            Delta = self.beta_0**2 + tmp
+            Delta = self.beta_0 ** 2 + tmp
             return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
         elif self.schedule == 'discrete':
             log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
-            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
+            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
+                               torch.flip(self.t_array.to(lamb.device), [1]))
             return t.reshape((-1,))
         else:
             log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
+            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
             t = t_fn(log_alpha)
             return t
 
 
 def model_wrapper(
-    model,
-    noise_schedule,
-    model_type="noise",
-    model_kwargs={},
-    guidance_type="uncond",
-    condition=None,
-    unconditional_condition=None,
-    guidance_scale=1.,
-    classifier_fn=None,
-    classifier_kwargs={},
+        model,
+        noise_schedule,
+        model_type="noise",
+        model_kwargs={},
+        guidance_type="uncond",
+        condition=None,
+        unconditional_condition=None,
+        guidance_scale=1.,
+        classifier_fn=None,
+        classifier_kwargs={},
 ):
     """Create a wrapper function for the noise prediction model.
 
@@ -392,7 +399,7 @@ class DPM_Solver:
         alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
         x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
         if self.thresholding:
-            p = 0.995   # A hyperparameter in the paper of "Imagen" [1].
+            p = 0.995  # A hyperparameter in the paper of "Imagen" [1].
             s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
             s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
             x0 = torch.clamp(x0, -s, s) / s
@@ -431,10 +438,11 @@ class DPM_Solver:
             return torch.linspace(t_T, t_0, N + 1).to(device)
         elif skip_type == 'time_quadratic':
             t_order = 2
-            t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
+            t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device)
             return t
         else:
-            raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
+            raise ValueError(
+                "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
 
     def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
         """
@@ -471,28 +479,29 @@ class DPM_Solver:
         if order == 3:
             K = steps // 3 + 1
             if steps % 3 == 0:
-                orders = [3,] * (K - 2) + [2, 1]
+                orders = [3, ] * (K - 2) + [2, 1]
             elif steps % 3 == 1:
-                orders = [3,] * (K - 1) + [1]
+                orders = [3, ] * (K - 1) + [1]
             else:
-                orders = [3,] * (K - 1) + [2]
+                orders = [3, ] * (K - 1) + [2]
         elif order == 2:
             if steps % 2 == 0:
                 K = steps // 2
-                orders = [2,] * K
+                orders = [2, ] * K
             else:
                 K = steps // 2 + 1
-                orders = [2,] * (K - 1) + [1]
+                orders = [2, ] * (K - 1) + [1]
         elif order == 1:
             K = 1
-            orders = [1,] * steps
+            orders = [1, ] * steps
         else:
             raise ValueError("'order' must be '1' or '2' or '3'.")
         if skip_type == 'logSNR':
             # To reproduce the results in DPM-Solver paper
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
         else:
-            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders)).to(device)]
+            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
+                torch.cumsum(torch.tensor([0, ] + orders), dim=0).to(device)]
         return timesteps_outer, orders
 
     def denoise_to_zero_fn(self, x, s):
@@ -528,8 +537,8 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(sigma_t / sigma_s, dims) * x
-                - expand_dims(alpha_t * phi_1, dims) * model_s
+                    expand_dims(sigma_t / sigma_s, dims) * x
+                    - expand_dims(alpha_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
@@ -540,15 +549,16 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                - expand_dims(sigma_t * phi_1, dims) * model_s
+                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                    - expand_dims(sigma_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
             else:
                 return x_t
 
-    def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type='dpm_solver'):
+    def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
+                                            solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-2 from time `s` to time `t`.
 
@@ -575,7 +585,8 @@ class DPM_Solver:
         h = lambda_t - lambda_s
         lambda_s1 = lambda_s + r1 * h
         s1 = ns.inverse_lambda(lambda_s1)
-        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(t)
+        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
+            s1), ns.marginal_log_mean_coeff(t)
         sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
         alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
 
@@ -586,21 +597,22 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(sigma_s1 / sigma_s, dims) * x
-                - expand_dims(alpha_s1 * phi_11, dims) * model_s
+                    expand_dims(sigma_s1 / sigma_s, dims) * x
+                    - expand_dims(alpha_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (model_s1 - model_s)
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (
+                                    model_s1 - model_s)
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -609,28 +621,29 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s1 * phi_11, dims) * model_s
+                    expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
+                    - expand_dims(sigma_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
                 )
         if return_intermediate:
             return x_t, {'model_s': model_s, 'model_s1': model_s1}
         else:
             return x_t
 
-    def singlestep_dpm_solver_third_update(self, x, s, t, r1=1./3., r2=2./3., model_s=None, model_s1=None, return_intermediate=False, solver_type='dpm_solver'):
+    def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
+                                           return_intermediate=False, solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-3 from time `s` to time `t`.
 
@@ -664,8 +677,10 @@ class DPM_Solver:
         lambda_s2 = lambda_s + r2 * h
         s1 = ns.inverse_lambda(lambda_s1)
         s2 = ns.inverse_lambda(lambda_s2)
-        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
-        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t)
+        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
+            s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
+        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
+            s2), ns.marginal_std(t)
         alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
 
         if self.predict_x0:
@@ -680,21 +695,21 @@ class DPM_Solver:
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(sigma_s1 / sigma_s, dims) * x
-                    - expand_dims(alpha_s1 * phi_11, dims) * model_s
+                        expand_dims(sigma_s1 / sigma_s, dims) * x
+                        - expand_dims(alpha_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(sigma_s2 / sigma_s, dims) * x
-                - expand_dims(alpha_s2 * phi_12, dims) * model_s
-                + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
+                    expand_dims(sigma_s2 / sigma_s, dims) * x
+                    - expand_dims(alpha_s2 * phi_12, dims) * model_s
+                    + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -702,10 +717,10 @@ class DPM_Solver:
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + expand_dims(alpha_t * phi_2, dims) * D1
-                    - expand_dims(alpha_t * phi_3, dims) * D2
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + expand_dims(alpha_t * phi_2, dims) * D1
+                        - expand_dims(alpha_t * phi_3, dims) * D2
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -719,21 +734,21 @@ class DPM_Solver:
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                    - expand_dims(sigma_s1 * phi_11, dims) * model_s
+                        expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
+                        - expand_dims(sigma_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s2 * phi_12, dims) * model_s
-                - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
+                    expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
+                    - expand_dims(sigma_s2 * phi_12, dims) * model_s
+                    - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -741,10 +756,10 @@ class DPM_Solver:
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - expand_dims(sigma_t * phi_2, dims) * D1
-                    - expand_dims(sigma_t * phi_3, dims) * D2
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - expand_dims(sigma_t * phi_2, dims) * D1
+                        - expand_dims(sigma_t * phi_3, dims) * D2
                 )
 
         if return_intermediate:
@@ -772,7 +787,8 @@ class DPM_Solver:
         dims = x.dim()
         model_prev_1, model_prev_0 = model_prev_list
         t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
+        lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
+            t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -784,28 +800,28 @@ class DPM_Solver:
         if self.predict_x0:
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
+                        expand_dims(sigma_t / sigma_prev_0, dims) * x
+                        - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                        - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
+                        expand_dims(sigma_t / sigma_prev_0, dims) * x
+                        - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                        + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
                 )
         else:
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                        - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                        - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                        - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                        - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
                 )
         return x_t
 
@@ -827,7 +843,8 @@ class DPM_Solver:
         dims = x.dim()
         model_prev_2, model_prev_1, model_prev_0 = model_prev_list
         t_prev_2, t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
+        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(
+            t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -842,21 +859,22 @@ class DPM_Solver:
         D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1)
         if self.predict_x0:
             x_t = (
-                expand_dims(sigma_t / sigma_prev_0, dims) * x
-                - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
-                - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h**2 - 0.5), dims) * D2
+                    expand_dims(sigma_t / sigma_prev_0, dims) * x
+                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                    + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
+                    - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2
             )
         else:
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
-                - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h**2 - 0.5), dims) * D2
+                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                    - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
+                    - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2
             )
         return x_t
 
-    def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None, r2=None):
+    def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None,
+                                     r2=None):
         """
         Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
 
@@ -876,9 +894,11 @@ class DPM_Solver:
         if order == 1:
             return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
         elif order == 2:
-            return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1)
+            return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate,
+                                                            solver_type=solver_type, r1=r1)
         elif order == 3:
-            return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2)
+            return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate,
+                                                           solver_type=solver_type, r1=r1, r2=r2)
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
 
@@ -906,7 +926,8 @@ class DPM_Solver:
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
 
-    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type='dpm_solver'):
+    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
+                            solver_type='dpm_solver'):
         """
         The adaptive step size solver based on singlestep DPM-Solver.
 
@@ -938,11 +959,17 @@ class DPM_Solver:
         if order == 2:
             r1 = 0.5
             lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
+            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
+                                                                                               solver_type=solver_type,
+                                                                                               **kwargs)
         elif order == 3:
             r1, r2 = 1. / 3., 2. / 3.
-            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
+            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
+                                                                                    return_intermediate=True,
+                                                                                    solver_type=solver_type)
+            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2,
+                                                                                              solver_type=solver_type,
+                                                                                              **kwargs)
         else:
             raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
         while torch.abs((s - t_0)).mean() > t_err:
@@ -963,9 +990,9 @@ class DPM_Solver:
         return x
 
     def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
-        method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
-        atol=0.0078, rtol=0.05,
-    ):
+               method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
+               atol=0.0078, rtol=0.05,
+               ):
         """
         Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
 
@@ -1073,7 +1100,8 @@ class DPM_Solver:
         device = x.device
         if method == 'adaptive':
             with torch.no_grad():
-                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type)
+                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
+                                             solver_type=solver_type)
         elif method == 'multistep':
             assert steps >= order
             timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
@@ -1083,19 +1111,21 @@ class DPM_Solver:
                 model_prev_list = [self.model_fn(x, vec_t)]
                 t_prev_list = [vec_t]
                 # Init the first `order` values by lower order multistep DPM-Solver.
-                for init_order in range(1, order):
+                for init_order in tqdm(range(1, order), desc="DPM init order"):
                     vec_t = timesteps[init_order].expand(x.shape[0])
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order, solver_type=solver_type)
+                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order,
+                                                         solver_type=solver_type)
                     model_prev_list.append(self.model_fn(x, vec_t))
                     t_prev_list.append(vec_t)
                 # Compute the remaining values by `order`-th order multistep DPM-Solver.
-                for step in range(order, steps + 1):
+                for step in tqdm(range(order, steps + 1), desc="DPM multistep"):
                     vec_t = timesteps[step].expand(x.shape[0])
                     if lower_order_final and steps < 15:
                         step_order = min(order, steps + 1 - step)
                     else:
                         step_order = order
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order, solver_type=solver_type)
+                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order,
+                                                         solver_type=solver_type)
                     for i in range(order - 1):
                         t_prev_list[i] = t_prev_list[i + 1]
                         model_prev_list[i] = model_prev_list[i + 1]
@@ -1105,14 +1135,18 @@ class DPM_Solver:
                         model_prev_list[-1] = self.model_fn(x, vec_t)
         elif method in ['singlestep', 'singlestep_fixed']:
             if method == 'singlestep':
-                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device)
+                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order,
+                                                                                              skip_type=skip_type,
+                                                                                              t_T=t_T, t_0=t_0,
+                                                                                              device=device)
             elif method == 'singlestep_fixed':
                 K = steps // order
-                orders = [order,] * K
+                orders = [order, ] * K
                 timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
             for i, order in enumerate(orders):
                 t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
-                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(), N=order, device=device)
+                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(),
+                                                      N=order, device=device)
                 lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                 vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0])
                 h = lambda_inner[-1] - lambda_inner[0]
@@ -1124,7 +1158,6 @@ class DPM_Solver:
         return x
 
 
-
 #############################################################
 # other utility functions
 #############################################################
@@ -1181,4 +1214,4 @@ def expand_dims(v, dims):
     Returns:
         a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
     """
-    return v[(...,) + (None,)*(dims - 1)]
\ No newline at end of file
+    return v[(...,) + (None,) * (dims - 1)]
diff --git a/ldm/models/diffusion/dpm_solver/sampler.py b/ldm/models/diffusion/dpm_solver/sampler.py
index 2c42d6f964d92658e769df95a81dec92250e5a99..fc2c96baf2bf5f8de3684c198bcd1b0df5b51149 100755
--- a/ldm/models/diffusion/dpm_solver/sampler.py
+++ b/ldm/models/diffusion/dpm_solver/sampler.py
@@ -1,10 +1,15 @@
 """SAMPLING ONLY."""
-
 import torch
 
 from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
 
 
+MODEL_TYPES = {
+    "eps": "noise",
+    "v": "v"
+}
+
+
 class DPMSolverSampler(object):
     def __init__(self, model, **kwargs):
         super().__init__()
@@ -56,7 +61,7 @@ class DPMSolverSampler(object):
         C, H, W = shape
         size = (batch_size, C, H, W)
 
-        # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')
+        print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')
 
         device = self.model.betas.device
         if x_T is None:
@@ -69,7 +74,7 @@ class DPMSolverSampler(object):
         model_fn = model_wrapper(
             lambda x, t, c: self.model.apply_model(x, t, c),
             ns,
-            model_type="noise",
+            model_type=MODEL_TYPES[self.model.parameterization],
             guidance_type="classifier-free",
             condition=conditioning,
             unconditional_condition=unconditional_conditioning,
diff --git a/ldm/models/diffusion/plms.py b/ldm/models/diffusion/plms.py
index f5a998b0ce4fea0f9069358a63850e72aa393176..273ffbebaf952ffc25f6b92506b7c91b4af4c3bf 100755
--- a/ldm/models/diffusion/plms.py
+++ b/ldm/models/diffusion/plms.py
@@ -3,10 +3,9 @@
 import torch
 import numpy as np
 from tqdm import tqdm
-from functools import partial
-import copy
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
 
+
 class PLMSSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
         super().__init__()
@@ -24,7 +23,7 @@ class PLMSSampler(object):
         if ddim_eta != 0:
             raise ValueError('ddim_eta must be 0 for PLMS')
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
@@ -43,14 +42,14 @@ class PLMSSampler(object):
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
+                                                                                   eta=ddim_eta, verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
 
     @torch.no_grad()
@@ -75,11 +74,8 @@ class PLMSSampler(object):
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
-               features_adapter1=None,
-               features_adapter2=None,
-               mode = 'sketch',
-               con_strength=30,
-               style_feature=None,
+               features_adapter=None,
+               cond_tau=0.4,
                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
                **kwargs
                ):
@@ -113,11 +109,8 @@ class PLMSSampler(object):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
-                                                    features_adapter1=copy.deepcopy(features_adapter1),
-                                                    features_adapter2=copy.deepcopy(features_adapter2),
-                                                    mode = mode,
-                                                    con_strength = con_strength, 
-                                                    style_feature=style_feature#.clone()
+                                                    features_adapter=features_adapter,
+                                                    cond_tau=cond_tau
                                                     )
         return samples, intermediates
 
@@ -127,7 +120,8 @@ class PLMSSampler(object):
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,features_adapter1=None, features_adapter2=None, mode='sketch', con_strength=30, style_feature=None):
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      cond_tau=0.4):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
@@ -141,7 +135,7 @@ class PLMSSampler(object):
             timesteps = self.ddim_timesteps[:subset_end]
 
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
+        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running PLMS Sampling with {total_steps} timesteps")
 
@@ -152,41 +146,21 @@ class PLMSSampler(object):
             index = total_steps - i - 1
             ts = torch.full((b,), step, device=device, dtype=torch.long)
             ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
-            cond_in = cond
-            unconditional_conditioning_in = unconditional_conditioning
 
-            if mask is not None :#and index>=10:
+            if mask is not None:  # and index>=10:
                 assert x0 is not None
                 img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                 img = img_orig * mask + (1. - mask) * img
 
-            if mode == 'sketch':
-                if index<con_strength:
-                    features_adapter = None
-                else:
-                    features_adapter = features_adapter1
-            elif mode == 'style':
-                if index<con_strength:
-                    features_adapter = None
-                else:
-                    features_adapter = features_adapter1
-
-                if index>25:
-                    cond_in = torch.cat([cond, style_feature.clone()], dim=1)
-                    unconditional_conditioning_in = torch.cat(
-                        [unconditional_conditioning, unconditional_conditioning[:, -8:, :]], dim=1)
-            elif mode == 'mul':
-                features_adapter = [a1i*0.5 + a2i for a1i, a2i in zip(features_adapter1, features_adapter2)]
-            else:
-                features_adapter = features_adapter1
-
-            outs = self.p_sample_plms(img, cond_in, ts, index=index, use_original_steps=ddim_use_original_steps,
+            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                       quantize_denoised=quantize_denoised, temperature=temperature,
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning_in,
-                                      old_eps=old_eps, t_next=ts_next, features_adapter=copy.deepcopy(features_adapter))
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      old_eps=old_eps, t_next=ts_next,
+                                      features_adapter=None if index < int(
+                                          (1 - cond_tau) * total_steps) else features_adapter)
 
             img, pred_x0, e_t = outs
             old_eps.append(e_t)
@@ -204,17 +178,18 @@ class PLMSSampler(object):
     @torch.no_grad()
     def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, features_adapter=None):
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
+                      features_adapter=None):
         b, *_, device = *x.shape, x.device
 
         def get_model_output(x, t):
             if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-                e_t = self.model.apply_model(x, t, c, copy.deepcopy(features_adapter))
+                e_t = self.model.apply_model(x, t, c, features_adapter=features_adapter)
             else:
                 x_in = torch.cat([x] * 2)
                 t_in = torch.cat([t] * 2)
                 c_in = torch.cat([unconditional_conditioning, c])
-                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in, copy.deepcopy(features_adapter)).chunk(2)
+                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in, features_adapter=features_adapter).chunk(2)
                 e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
 
             if score_corrector is not None:
@@ -233,14 +208,14 @@ class PLMSSampler(object):
             a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
             a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
             sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
 
             # current prediction for x_0
             pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
             if quantize_denoised:
                 pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
             # direction pointing to x_t
-            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+            dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
             noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
             if noise_dropout > 0.:
                 noise = torch.nn.functional.dropout(noise, p=noise_dropout)
diff --git a/ldm/modules/attention.py b/ldm/modules/attention.py
index 8c6e174b2f8d01a8ce09990b655bf410caf1509e..88a4d4727a4a337206ecd1dcf559ce90efa3401e 100755
--- a/ldm/modules/attention.py
+++ b/ldm/modules/attention.py
@@ -20,6 +20,10 @@ except:
 import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 
+if os.environ.get("DISABLE_XFORMERS", "false").lower() == 'true':
+    XFORMERS_IS_AVAILBLE = False
+
+
 def exists(val):
     return val is not None
 
diff --git a/ldm/modules/diffusionmodules/openaimodel.py b/ldm/modules/diffusionmodules/openaimodel.py
index d6e089a6786da3e977398cc7a1f83ec7fd4a4ff6..09972d58e1a65b88909dfe35c12c9126851da5cf 100755
--- a/ldm/modules/diffusionmodules/openaimodel.py
+++ b/ldm/modules/diffusionmodules/openaimodel.py
@@ -1,7 +1,6 @@
 from abc import abstractmethod
-from functools import partial
 import math
-from typing import Iterable
+import torch
 
 import numpy as np
 import torch as th
@@ -18,6 +17,7 @@ from ldm.modules.diffusionmodules.util import (
     timestep_embedding,
 )
 from ldm.modules.attention import SpatialTransformer
+from ldm.util import exists
 
 
 # dummy replace
@@ -270,8 +270,6 @@ class ResBlock(TimestepBlock):
             h = out_norm(h) * (1 + scale) + shift
             h = out_rest(h)
         else:
-            # print(h.shape, emb_out.shape)
-            # exit(0)
             h = h + emb_out
             h = self.out_layers(h)
         return self.skip_connection(x) + h
@@ -468,16 +466,16 @@ class UNetModel(nn.Module):
         context_dim=None,                 # custom transformer support
         n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
         legacy=True,
-        # l_cond = 4,
+        disable_self_attentions=None,
+        num_attention_blocks=None,
+        disable_middle_self_attn=False,
+        use_linear_in_transformer=False,
     ):
         super().__init__()
-        
-        # print('UNet', context_dim)
         if use_spatial_transformer:
             assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
 
         if context_dim is not None:
-            # print('UNet not none', context_dim, context_dim is not None, context_dim != None, context_dim == "None")
             assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
             from omegaconf.listconfig import ListConfig
             if type(context_dim) == ListConfig:
@@ -496,7 +494,24 @@ class UNetModel(nn.Module):
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
+                                 "as a list/tuple (per-level) with the same length as channel_mult")
+            self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
+            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                  f"attention will still not be set.")
+
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
@@ -508,9 +523,6 @@ class UNetModel(nn.Module):
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
         self.predict_codebook_ids = n_embed is not None
-        # self.l_cond = l_cond
-        # print(self.l_cond)
-        # exit(0)
 
         time_embed_dim = model_channels * 4
         self.time_embed = nn.Sequential(
@@ -520,7 +532,13 @@ class UNetModel(nn.Module):
         )
 
         if self.num_classes is not None:
-            self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            else:
+                raise ValueError()
 
         self.input_blocks = nn.ModuleList(
             [
@@ -534,7 +552,7 @@ class UNetModel(nn.Module):
         ch = model_channels
         ds = 1
         for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
+            for nr in range(self.num_res_blocks[level]):
                 layers = [
                     ResBlock(
                         ch,
@@ -556,17 +574,25 @@ class UNetModel(nn.Module):
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+
+                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                use_checkpoint=use_checkpoint,
+                                num_heads=num_heads,
+                                num_head_channels=dim_head,
+                                use_new_attention_order=use_new_attention_order,
+                            ) if not use_spatial_transformer else SpatialTransformer(
+                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+                                use_checkpoint=use_checkpoint
+                            )
                         )
-                    )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
@@ -618,8 +644,10 @@ class UNetModel(nn.Module):
                 num_heads=num_heads,
                 num_head_channels=dim_head,
                 use_new_attention_order=use_new_attention_order,
-            ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+            ) if not use_spatial_transformer else SpatialTransformer(  # self-attn unless disable_middle_self_attn
+                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                            disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
+                            use_checkpoint=use_checkpoint
                         ),
             ResBlock(
                 ch,
@@ -634,7 +662,7 @@ class UNetModel(nn.Module):
 
         self.output_blocks = nn.ModuleList([])
         for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(num_res_blocks + 1):
+            for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
                     ResBlock(
@@ -657,18 +685,26 @@ class UNetModel(nn.Module):
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads_upsample,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+
+                    if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                use_checkpoint=use_checkpoint,
+                                num_heads=num_heads_upsample,
+                                num_head_channels=dim_head,
+                                use_new_attention_order=use_new_attention_order,
+                            ) if not use_spatial_transformer else SpatialTransformer(
+                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+                                use_checkpoint=use_checkpoint
+                            )
                         )
-                    )
-                if level and i == num_res_blocks:
+                if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
                         ResBlock(
@@ -716,7 +752,7 @@ class UNetModel(nn.Module):
         self.middle_block.apply(convert_module_to_f32)
         self.output_blocks.apply(convert_module_to_f32)
 
-    def forward(self, x, timesteps=None, context=None, y=None, features_adapter=None, step_cur=0,**kwargs):
+    def forward(self, x, timesteps=None, context=None, y=None, features_adapter=None, append_to_context=None, **kwargs):
         """
         Apply the model to an input batch.
         :param x: an [N x C x ...] Tensor of inputs.
@@ -733,21 +769,26 @@ class UNetModel(nn.Module):
         emb = self.time_embed(t_emb)
 
         if self.num_classes is not None:
-            assert y.shape == (x.shape[0],)
+            assert y.shape[0] == x.shape[0]
             emb = emb + self.label_emb(y)
 
         h = x.type(self.dtype)
 
+        if append_to_context is not None:
+            context = torch.cat([context, append_to_context], dim=1)
+
+        adapter_idx = 0
         for id, module in enumerate(self.input_blocks):
             h = module(h, emb, context)
-            if ((id+1)%3 == 0) and features_adapter is not None and len(features_adapter):
-                h = h + features_adapter.pop(0)
+            if ((id+1)%3 == 0) and features_adapter is not None:
+                h = h + features_adapter[adapter_idx]
+                adapter_idx += 1
             hs.append(h)
         if features_adapter is not None:
-            assert len(features_adapter)==0, 'Wrong features_adapter'
+            assert len(features_adapter)==adapter_idx, 'Wrong features_adapter'
 
         h = self.middle_block(h, emb, context)
-        for id, module in enumerate(self.output_blocks):
+        for module in self.output_blocks:
             h = th.cat([h, hs.pop()], dim=1)
             h = module(h, emb, context)
         h = h.type(x.dtype)
@@ -755,222 +796,3 @@ class UNetModel(nn.Module):
             return self.id_predictor(h)
         else:
             return self.out(h)
-
-
-class EncoderUNetModel(nn.Module):
-    """
-    The half UNet model with attention and timestep embedding.
-    For usage, see UNet.
-    """
-
-    def __init__(
-        self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        pool="adaptive",
-        *args,
-        **kwargs
-    ):
-        super().__init__()
-
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-
-        self.in_channels = in_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.use_checkpoint = use_checkpoint
-        self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-
-        time_embed_dim = model_channels * 4
-        self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
-            nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
-        )
-
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
-                layers = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=num_head_channels,
-                            use_new_attention_order=use_new_attention_order,
-                        )
-                    )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True,
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-
-        self.middle_block = TimestepEmbedSequential(
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-            AttentionBlock(
-                ch,
-                use_checkpoint=use_checkpoint,
-                num_heads=num_heads,
-                num_head_channels=num_head_channels,
-                use_new_attention_order=use_new_attention_order,
-            ),
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-        )
-        self._feature_size += ch
-        self.pool = pool
-        if pool == "adaptive":
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                nn.AdaptiveAvgPool2d((1, 1)),
-                zero_module(conv_nd(dims, ch, out_channels, 1)),
-                nn.Flatten(),
-            )
-        elif pool == "attention":
-            assert num_head_channels != -1
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                AttentionPool2d(
-                    (image_size // ds), ch, num_head_channels, out_channels
-                ),
-            )
-        elif pool == "spatial":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                nn.ReLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        elif pool == "spatial_v2":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                normalization(2048),
-                nn.SiLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        else:
-            raise NotImplementedError(f"Unexpected {pool} pooling")
-
-    def convert_to_fp16(self):
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)
-        self.middle_block.apply(convert_module_to_f16)
-
-    def convert_to_fp32(self):
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)
-        self.middle_block.apply(convert_module_to_f32)
-
-    def forward(self, x, timesteps):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :return: an [N x K] Tensor of outputs.
-        """
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
-
-        results = []
-        h = x.type(self.dtype)
-        for module in self.input_blocks:
-            h = module(h, emb)
-            if self.pool.startswith("spatial"):
-                results.append(h.type(x.dtype).mean(dim=(2, 3)))
-        h = self.middle_block(h, emb)
-        if self.pool.startswith("spatial"):
-            results.append(h.type(x.dtype).mean(dim=(2, 3)))
-            h = th.cat(results, axis=-1)
-            return self.out(h)
-        else:
-            h = h.type(x.dtype)
-            return self.out(h)
-
diff --git a/ldm/modules/diffusionmodules/util.py b/ldm/modules/diffusionmodules/util.py
index a952e6c40308c33edd422da0ce6a60f47e73661b..637363dfe34799e70cfdbcd11445212df9d9ca1f 100755
--- a/ldm/modules/diffusionmodules/util.py
+++ b/ldm/modules/diffusionmodules/util.py
@@ -122,7 +122,9 @@ class CheckpointFunction(torch.autograd.Function):
         ctx.run_function = run_function
         ctx.input_tensors = list(args[:length])
         ctx.input_params = list(args[length:])
-
+        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
+                                   "dtype": torch.get_autocast_gpu_dtype(),
+                                   "cache_enabled": torch.is_autocast_cache_enabled()}
         with torch.no_grad():
             output_tensors = ctx.run_function(*ctx.input_tensors)
         return output_tensors
@@ -130,7 +132,8 @@ class CheckpointFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, *output_grads):
         ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
-        with torch.enable_grad():
+        with torch.enable_grad(), \
+                torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
             # Fixes a bug where the first op in run_function modifies the
             # Tensor storage in place, which is not allowed for detach()'d
             # Tensors.
diff --git a/ldm/modules/ema.py b/ldm/modules/ema.py
index c8c75af43565f6e140287644aaaefa97dd6e67c5..bded25019b9bcbcd0260f0b8185f8c7859ca58c4 100755
--- a/ldm/modules/ema.py
+++ b/ldm/modules/ema.py
@@ -10,24 +10,28 @@ class LitEma(nn.Module):
 
         self.m_name2s_name = {}
         self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
-        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
-                             else torch.tensor(-1,dtype=torch.int))
+        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates
+        else torch.tensor(-1, dtype=torch.int))
 
         for name, p in model.named_parameters():
             if p.requires_grad:
-                #remove as '.'-character is not allowed in buffers
-                s_name = name.replace('.','')
-                self.m_name2s_name.update({name:s_name})
-                self.register_buffer(s_name,p.clone().detach().data)
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace('.', '')
+                self.m_name2s_name.update({name: s_name})
+                self.register_buffer(s_name, p.clone().detach().data)
 
         self.collected_params = []
 
-    def forward(self,model):
+    def reset_num_updates(self) -> None:  # restart the counter at 0 (this also turns counting on if it was -1)
+        del self.num_updates  # drop the current buffer before registering it again
+        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int))
+
+    def forward(self, model):
         decay = self.decay
 
         if self.num_updates >= 0:
             self.num_updates += 1
-            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
+            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
 
         one_minus_decay = 1.0 - decay
 
diff --git a/ldm/modules/encoders/adapter.py b/ldm/modules/encoders/adapter.py
index 36a5ec987ff2829606860f781ab54782c2aaf7de..0eef97edcaca1186835f32dc1b0c7bcb9c4bd3ec 100755
--- a/ldm/modules/encoders/adapter.py
+++ b/ldm/modules/encoders/adapter.py
@@ -1,9 +1,8 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from ldm.modules.attention import SpatialTransformer, BasicTransformerBlock
 from collections import OrderedDict
 
+
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
@@ -16,6 +15,7 @@ def conv_nd(dims, *args, **kwargs):
         return nn.Conv3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 
+
 def avg_pool_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D average pooling module.
@@ -28,6 +28,7 @@ def avg_pool_nd(dims, *args, **kwargs):
         return nn.AvgPool3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 
+
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
@@ -37,7 +38,7 @@ class Downsample(nn.Module):
                  downsampling occurs in the inner-two dimensions.
     """
 
-    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
@@ -60,15 +61,16 @@ class Downsample(nn.Module):
 class ResnetBlock(nn.Module):
     def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
         super().__init__()
-        ps = ksize//2
-        if in_c != out_c or sk==False:
+        ps = ksize // 2
+        if in_c != out_c or sk == False:
             self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
+            # same channel count and sk enabled: skip the input conv
             self.in_conv = None
         self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
         self.act = nn.ReLU()
         self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
-        if sk==False:
+        if sk == False:
             self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
             self.skep = None
@@ -80,7 +82,7 @@ class ResnetBlock(nn.Module):
     def forward(self, x):
         if self.down == True:
             x = self.down_opt(x)
-        if self.in_conv is not None: # edit
+        if self.in_conv is not None:  # edit
             x = self.in_conv(x)
 
         h = self.block1(x)
@@ -101,12 +103,14 @@ class Adapter(nn.Module):
         self.body = []
         for i in range(len(channels)):
             for j in range(nums_rb):
-                if (i!=0) and (j==0):
-                    self.body.append(ResnetBlock(channels[i-1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
+                if (i != 0) and (j == 0):
+                    self.body.append(
+                        ResnetBlock(channels[i - 1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
                 else:
-                    self.body.append(ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
+                    self.body.append(
+                        ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
         self.body = nn.ModuleList(self.body)
-        self.conv_in = nn.Conv2d(cin,channels[0], 3, 1, 1)
+        self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)
 
     def forward(self, x):
         # unshuffle
@@ -116,12 +120,79 @@ class Adapter(nn.Module):
         x = self.conv_in(x)
         for i in range(len(self.channels)):
             for j in range(self.nums_rb):
-                idx = i*self.nums_rb +j
+                idx = i * self.nums_rb + j
                 x = self.body[idx](x)
             features.append(x)
 
         return features
-    
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16: normalize in float32, return the input's dtype."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        orig_type = x.dtype  # remember the caller's dtype so it can be restored
+        ret = super().forward(x.type(torch.float32))  # run nn.LayerNorm on x cast to float32
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+    """Activation that multiplies the input by ``sigmoid(1.702 * x)``, element-wise."""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """Transformer block: LayerNorm -> self-attention, then LayerNorm -> MLP, each added back to its input."""
+    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
+                         ("c_proj", nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask  # optional mask; None means unmasked attention
+
+    def attention(self, x: torch.Tensor):  # q = k = v = x; the mask (if any) is cast to x's dtype/device and kept
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))  # residual around the normalized self-attention
+        x = x + self.mlp(self.ln_2(x))  # residual around the normalized MLP
+        return x
+
+
+class StyleAdapter(nn.Module):
+    """Appends ``num_token`` learned style tokens to a feature sequence and projects them to ``context_dim``."""
+    def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4):
+        super().__init__()
+
+        scale = width ** -0.5
+        self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)])
+        self.num_token = num_token
+        self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale)
+        self.ln_post = LayerNorm(width)
+        self.ln_pre = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, context_dim))
+
+    def forward(self, x):
+        # x shape [N, HW+1, C]
+        # Broadcast the learned tokens over the batch as a view: no zero tensor, no dtype promotion.
+        style_embedding = self.style_embedding.expand(x.shape[0], -1, -1)
+        x = torch.cat([x, style_embedding], dim=1)
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer_layes(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, -self.num_token:, :])  # keep only the appended style tokens
+        x = x @ self.proj
+
+        return x  # [N, num_token, context_dim]
+
 
 class ResnetBlock_light(nn.Module):
     def __init__(self, in_c):
@@ -185,66 +256,3 @@ class Adapter_light(nn.Module):
             features.append(x)
 
         return features
-
-class QuickGELU(nn.Module):
-
-    def forward(self, x: torch.Tensor):
-        return x * torch.sigmoid(1.702 * x)
-    
-class ResidualAttentionBlock(nn.Module):
-
-    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
-        super().__init__()
-
-        self.attn = nn.MultiheadAttention(d_model, n_head)
-        self.ln_1 = LayerNorm(d_model)
-        self.mlp = nn.Sequential(
-            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
-                         ("c_proj", nn.Linear(d_model * 4, d_model))]))
-        self.ln_2 = LayerNorm(d_model)
-        self.attn_mask = attn_mask
-
-    def attention(self, x: torch.Tensor):
-        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
-        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attention(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-class LayerNorm(nn.LayerNorm):
-    """Subclass torch's LayerNorm to handle fp16."""
-
-    def forward(self, x: torch.Tensor):
-        orig_type = x.dtype
-        ret = super().forward(x.type(torch.float32))
-        return ret.type(orig_type)
-    
-class StyleAdapter(nn.Module):
-
-    def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4):
-        super().__init__()
-
-        scale = width ** -0.5
-        self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)])
-        self.num_token = num_token
-        self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale)
-        self.ln_post = LayerNorm(width)
-        self.ln_pre = LayerNorm(width)
-        self.proj = nn.Parameter(scale * torch.randn(width, context_dim))
-
-    def forward(self, x):
-        # x shape [N, HW+1, C]
-        style_embedding = self.style_embedding + torch.zeros(
-            (x.shape[0], self.num_token, self.style_embedding.shape[-1]), device=x.device)
-        x = torch.cat([x, style_embedding], dim=1)
-        x = self.ln_pre(x)
-        x = x.permute(1, 0, 2)  # NLD -> LND
-        x = self.transformer_layes(x)
-        x = x.permute(1, 0, 2)  # LND -> NLD
-
-        x = self.ln_post(x[:, -self.num_token:, :])
-        x = x @ self.proj
-
-        return x
\ No newline at end of file
diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
index ededbe43e9e0466b9979079060692e38f561d4d3..d59229ac1c97980e811e3b808f3431311c4f3b7d 100755
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -1,12 +1,13 @@
 import torch
 import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
-from transformers import CLIPTokenizer, CLIPTextModel
-import kornia
+import math
+from torch.utils.checkpoint import checkpoint
 
-from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
+from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel, CLIPModel
+
+import open_clip
+import re
+from ldm.util import default, count_params
 
 
 class AbstractEncoder(nn.Module):
@@ -17,6 +18,11 @@ class AbstractEncoder(nn.Module):
         raise NotImplementedError
 
 
+class IdentityEncoder(AbstractEncoder):
+
+    def encode(self, x):
+        return x
+
 
 class ClassEmbedder(nn.Module):
     def __init__(self, embed_dim, n_classes=1000, key='class'):
@@ -33,116 +39,48 @@ class ClassEmbedder(nn.Module):
         return c
 
 
-class TransformerEmbedder(AbstractEncoder):
-    """Some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
+class FrozenT5Embedder(AbstractEncoder):
+    """Uses the T5 transformer encoder for text"""
+    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
         super().__init__()
+        self.tokenizer = T5Tokenizer.from_pretrained(version)
+        self.transformer = T5EncoderModel.from_pretrained(version)
         self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
-
-    def forward(self, tokens):
-        tokens = tokens.to(self.device)  # meh
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
+        self.max_length = max_length   # TODO: typical value?
+        if freeze:
+            self.freeze()
 
-    def encode(self, x):
-        return self(x)
-
-
-class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
-    def __init__(self, device="cuda", vq_interface=True, max_length=77):
-        super().__init__()
-        from transformers import BertTokenizerFast  # TODO: add to reuquirements
-        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-        self.device = device
-        self.vq_interface = vq_interface
-        self.max_length = max_length
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+        #self.train = disabled_train
+        for param in self.parameters():
+            param.requires_grad = False
 
     def forward(self, text):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        return tokens
-
-    @torch.no_grad()
-    def encode(self, text):
-        tokens = self(text)
-        if not self.vq_interface:
-            return tokens
-        return None, None, [None, None, tokens]
-
-    def decode(self, text):
-        return text
-
-
-class BERTEmbedder(AbstractEncoder):
-    """Uses the BERT tokenizr model and add some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
-                 device="cuda",use_tokenizer=True, embedding_dropout=0.0):
-        super().__init__()
-        self.use_tknz_fn = use_tokenizer
-        if self.use_tknz_fn:
-            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
-                                              emb_dropout=embedding_dropout)
+        outputs = self.transformer(input_ids=tokens)
 
-    def forward(self, text):
-        if self.use_tknz_fn:
-            tokens = self.tknz_fn(text)#.to(self.device)
-        else:
-            tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
+        z = outputs.last_hidden_state
         return z
 
     def encode(self, text):
-        # output of length 77
         return self(text)
 
 
-class SpatialRescaler(nn.Module):
-    def __init__(self,
-                 n_stages=1,
-                 method='bilinear',
-                 multiplier=0.5,
-                 in_channels=3,
-                 out_channels=None,
-                 bias=False):
-        super().__init__()
-        self.n_stages = n_stages
-        assert self.n_stages >= 0
-        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
-        self.multiplier = multiplier
-        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
-        self.remap_output = out_channels is not None
-        if self.remap_output:
-            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
-            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
-
-    def forward(self,x):
-        for stage in range(self.n_stages):
-            x = self.interpolator(x, scale_factor=self.multiplier)
-
-
-        if self.remap_output:
-            x = self.channel_mapper(x)
-        return x
-
-    def encode(self, x):
-        return self(x)
-
 class FrozenCLIPEmbedder(AbstractEncoder):
-    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
+    """Uses the CLIP transformer encoder for text (from huggingface)"""
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
+                 freeze=True, layer="last"):  # clip-vit-base-patch32
         super().__init__()
         self.tokenizer = CLIPTokenizer.from_pretrained(version)
-        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.transformer = CLIPModel.from_pretrained(version).text_model
         self.device = device
         self.max_length = max_length
-        self.freeze()
+        if freeze:
+            self.freeze()
+        self.layer = layer
 
     def freeze(self):
         self.transformer = self.transformer.eval()
@@ -153,26 +91,47 @@ class FrozenCLIPEmbedder(AbstractEncoder):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer != 'last')
 
-        z = outputs.last_hidden_state
+        if self.layer == 'penultimate':
+            z = outputs.hidden_states[-2]
+            z = self.transformer.final_layer_norm(z)
+        else:
+            z = outputs.last_hidden_state
         return z
 
     def encode(self, text):
         return self(text)
 
 
-class FrozenCLIPTextEmbedder(nn.Module):
+class FrozenOpenCLIPEmbedder(AbstractEncoder):
     """
-    Uses the CLIP transformer encoder for text.
+    Uses the OpenCLIP transformer encoder for text
     """
-    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
+    LAYERS = [
+        #"pooled",
+        "last",
+        "penultimate"
+    ]
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
+                 freeze=True, layer="last"):
         super().__init__()
-        self.model, _ = clip.load(version, jit=False, device="cpu")
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
+        del model.visual
+        self.model = model
+
         self.device = device
         self.max_length = max_length
-        self.n_repeat = n_repeat
-        self.normalize = normalize
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
 
     def freeze(self):
         self.model = self.model.eval()
@@ -180,55 +139,303 @@ class FrozenCLIPTextEmbedder(nn.Module):
             param.requires_grad = False
 
     def forward(self, text):
-        tokens = clip.tokenize(text).to(self.device)
-        z = self.model.encode_text(tokens)
-        if self.normalize:
-            z = z / torch.linalg.norm(z, dim=1, keepdim=True)
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
         return z
 
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+
     def encode(self, text):
-        z = self(text)
-        if z.ndim==2:
-            z = z[:, None, :]
-        z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
-        return z
+        return self(text)
 
 
-class FrozenClipImageEmbedder(nn.Module):
-    """
-        Uses the CLIP image encoder.
-        """
-    def __init__(
-            self,
-            model,
-            jit=False,
-            device='cuda' if torch.cuda.is_available() else 'cpu',
-            antialias=False,
-        ):
+class FrozenCLIPT5Encoder(AbstractEncoder):
+    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
+                 clip_max_length=77, t5_max_length=77):
         super().__init__()
-        self.model, _ = clip.load(name=model, device=device, jit=jit)
+        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
+        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
+        print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, "
+              f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.")
 
-        self.antialias = antialias
+    def encode(self, text):
+        return self(text)
 
-        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
-        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+    def forward(self, text):
+        clip_z = self.clip_encoder.encode(text)
+        t5_z = self.t5_encoder.encode(text)
+        return [clip_z, t5_z]
+
+
+# code from sd-webui
+re_attention = re.compile(r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""", re.X)
+
+
+def parse_prompt_attention(text):
+    r"""
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \( - literal character '('
+      \[ - literal character '['
+      \) - literal character ')'
+      \] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\(literal\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []  # indices into res where each still-open '(' started
+    square_brackets = []  # indices into res where each still-open '[' started
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)  # only set when the match is ':<number>)'
+
+        if text.startswith('\\'):
+            res.append([text[1:], 1.0])  # escaped character: drop the backslash, keep the rest as plain text
+        elif text == '(':
+            round_brackets.append(len(res))
+        elif text == '[':
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))  # explicit weight closes the innermost '('
+        elif text == ')' and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == ']' and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([text, 1.0])
+
+    for pos in round_brackets:  # unclosed '(' applies to everything after it
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:  # unclosed '[' applies to everything after it
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+class WebUIFrozenCLIPEmebedder(AbstractEncoder):
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", freeze=True, layer="penultimate"):
+        super(WebUIFrozenCLIPEmebedder, self).__init__()
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPModel.from_pretrained(version).text_model
+        self.device = device
+        self.layer = layer
+        if freeze:
+            self.freeze()
+
+        self.comma_token = [v for k, v in self.tokenizer.get_vocab().items() if k == ',</w>'][0]
+        self.comma_padding_backtrack = 20
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def tokenize(self, texts):
+        tokenized = self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
+        return tokenized
+
+    def encode_with_transformers(self, tokens):
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer!='last')
+
+        if self.layer == 'penultimate':
+            z = outputs.hidden_states[-2]
+            z = self.transformer.final_layer_norm(z)
+        else:
+            z = outputs.last_hidden_state
+
+        return z
+
+    def tokenize_line(self, line):
+        parsed = parse_prompt_attention(line)
+        # print(parsed)
+
+        tokenized = self.tokenize([text for text, _ in parsed])
+
+        remade_tokens = []
+        multipliers = []
+        last_comma = -1
+
+        for tokens, (text, weight) in zip(tokenized, parsed):
+            i = 0
+            while i < len(tokens):
+                token = tokens[i]
+
+                if token == self.comma_token:
+                    last_comma = len(remade_tokens)
+                elif self.comma_padding_backtrack != 0 and max(len(remade_tokens),
+                                                               1) % 75 == 0 and last_comma != -1 and len(
+                        remade_tokens) - last_comma <= self.comma_padding_backtrack:
+                    last_comma += 1
+                    reloc_tokens = remade_tokens[last_comma:]
+                    reloc_mults = multipliers[last_comma:]
+
+                    remade_tokens = remade_tokens[:last_comma]
+                    length = len(remade_tokens)
+
+                    rem = int(math.ceil(length / 75)) * 75 - length
+                    remade_tokens += [self.tokenizer.eos_token_id] * rem + reloc_tokens
+                    multipliers = multipliers[:last_comma] + [1.0] * rem + reloc_mults
+
+                remade_tokens.append(token)
+                multipliers.append(weight)
+                i += 1
+
+        token_count = len(remade_tokens)
+        prompt_target_length = math.ceil(max(token_count, 1) / 75) * 75
+        tokens_to_add = prompt_target_length - len(remade_tokens)
+
+        remade_tokens = remade_tokens + [self.tokenizer.eos_token_id] * tokens_to_add
+        multipliers = multipliers + [1.0] * tokens_to_add
+
+        return remade_tokens, multipliers, token_count
+
+    def process_text(self, texts):
+        remade_batch_tokens = []
+        token_count = 0
+
+        cache = {}
+        batch_multipliers = []
+        for line in texts:
+            if line in cache:
+                remade_tokens, multipliers = cache[line]
+            else:
+                remade_tokens, multipliers, current_token_count = self.tokenize_line(line)
+                token_count = max(current_token_count, token_count)
+
+                cache[line] = (remade_tokens, multipliers)
+
+            remade_batch_tokens.append(remade_tokens)
+            batch_multipliers.append(multipliers)
+
+        return batch_multipliers, remade_batch_tokens, token_count
+
+    def process_tokens(self, remade_batch_tokens, batch_multipliers):
+        remade_batch_tokens = [[self.tokenizer.bos_token_id] + x[:75] + [self.tokenizer.eos_token_id] for x in remade_batch_tokens]
+        batch_multipliers = [[1.0] + x[:75] + [1.0] for x in batch_multipliers]
+
+        tokens = torch.asarray(remade_batch_tokens).to(self.device)
+
+        z = self.encode_with_transformers(tokens)
+
+        # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
+        batch_multipliers_of_same_length = [x + [1.0] * (75 - len(x)) for x in batch_multipliers]
+        batch_multipliers = torch.asarray(batch_multipliers_of_same_length).to(self.device)
+        original_mean = z.mean()
+        z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
+        new_mean = z.mean()
+        z *= original_mean / new_mean
+
+        return z
+
+    def forward(self, text):
+        batch_multipliers, remade_batch_tokens, token_count = self.process_text(text)
+
+        z = None
+        i = 0
+        while max(map(len, remade_batch_tokens)) != 0:
+            rem_tokens = [x[75:] for x in remade_batch_tokens]
+            rem_multipliers = [x[75:] for x in batch_multipliers]
+
+            tokens = []
+            multipliers = []
+            for j in range(len(remade_batch_tokens)):
+                if len(remade_batch_tokens[j]) > 0:
+                    tokens.append(remade_batch_tokens[j][:75])
+                    multipliers.append(batch_multipliers[j][:75])
+                else:
+                    tokens.append([self.tokenizer.eos_token_id] * 75)
+                    multipliers.append([1.0] * 75)
+
+            z1 = self.process_tokens(tokens, multipliers)
+            z = z1 if z is None else torch.cat((z, z1), axis=-2)
+
+            remade_batch_tokens = rem_tokens
+            batch_multipliers = rem_multipliers
+            i += 1
+
+        return z
+
+    def encode(self, text):
+        return self(text)
 
-    def forward(self, x):
-        # x is assumed to be in range [-1,1]
-        return self.model.encode_image(self.preprocess(x))
 
 
 if __name__ == "__main__":
-    from ldm.util import count_params
     model = FrozenCLIPEmbedder()
-    count_params(model, verbose=True)
\ No newline at end of file
+    count_params(model, verbose=True)
diff --git a/ldm/modules/structure_condition/__init__.py b/ldm/modules/extra_condition/__init__.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/__init__.py
rename to ldm/modules/extra_condition/__init__.py
diff --git a/ldm/modules/extra_condition/api.py b/ldm/modules/extra_condition/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6968ef9dd4a087c862f8e66b05108eb12f671f4
--- /dev/null
+++ b/ldm/modules/extra_condition/api.py
@@ -0,0 +1,269 @@
+from enum import Enum, unique
+
+import cv2
+import torch
+from basicsr.utils import img2tensor
+from ldm.util import resize_numpy_image
+from PIL import Image
+from torch import autocast
+
+
+@unique
+class ExtraCondition(Enum):
+    sketch = 0
+    keypose = 1
+    seg = 2
+    depth = 3
+    canny = 4
+    style = 5
+    color = 6
+    openpose = 7
+
+
+def get_cond_model(opt, cond_type: ExtraCondition):
+    if cond_type == ExtraCondition.sketch:
+        from ldm.modules.extra_condition.model_edge import pidinet
+        model = pidinet()
+        ckp = torch.load('models/table5_pidinet.pth', map_location='cpu')['state_dict']
+        model.load_state_dict({k.replace('module.', ''): v for k, v in ckp.items()}, strict=True)
+        model.to(opt.device)
+        return model
+    elif cond_type == ExtraCondition.seg:
+        raise NotImplementedError
+    elif cond_type == ExtraCondition.keypose:
+        import mmcv
+        from mmdet.apis import init_detector
+        from mmpose.apis import init_pose_model
+        det_config = 'configs/mm/faster_rcnn_r50_fpn_coco.py'
+        det_checkpoint = 'models/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+        pose_config = 'configs/mm/hrnet_w48_coco_256x192.py'
+        pose_checkpoint = 'models/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
+        det_config_mmcv = mmcv.Config.fromfile(det_config)
+        det_model = init_detector(det_config_mmcv, det_checkpoint, device=opt.device)
+        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
+        pose_model = init_pose_model(pose_config_mmcv, pose_checkpoint, device=opt.device)
+        return {'pose_model': pose_model, 'det_model': det_model}
+    elif cond_type == ExtraCondition.depth:
+        from ldm.modules.extra_condition.midas.api import MiDaSInference
+        model = MiDaSInference(model_type='dpt_hybrid').to(opt.device)
+        return model
+    elif cond_type == ExtraCondition.canny:
+        return None
+    elif cond_type == ExtraCondition.style:
+        from transformers import CLIPProcessor, CLIPVisionModel
+        version = 'openai/clip-vit-large-patch14'
+        processor = CLIPProcessor.from_pretrained(version)
+        clip_vision_model = CLIPVisionModel.from_pretrained(version).to(opt.device)
+        return {'processor': processor, 'clip_vision_model': clip_vision_model}
+    elif cond_type == ExtraCondition.color:
+        return None
+    elif cond_type == ExtraCondition.openpose:
+        from ldm.modules.extra_condition.openpose.api import OpenposeInference
+        model = OpenposeInference().to(opt.device)
+        return model
+    else:
+        raise NotImplementedError
+
+
+def get_cond_sketch(opt, cond_image, cond_inp_type, cond_model=None):
+    if isinstance(cond_image, str):
+        edge = cv2.imread(cond_image)
+    else:
+        # Gradio passes an RGB numpy array, so convert it to OpenCV's BGR channel order
+        edge = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    edge = resize_numpy_image(edge, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = edge.shape[:2]
+    if cond_inp_type == 'sketch':
+        edge = img2tensor(edge)[0].unsqueeze(0).unsqueeze(0) / 255.
+        edge = edge.to(opt.device)
+    elif cond_inp_type == 'image':
+        edge = img2tensor(edge).unsqueeze(0) / 255.
+        edge = cond_model(edge.to(opt.device))[-1]
+    else:
+        raise NotImplementedError
+
+    # edge = 1-edge # for white background
+    edge = edge > 0.5
+    edge = edge.float()
+
+    return edge
+
+
+def get_cond_seg(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        seg = cv2.imread(cond_image)
+    else:
+        seg = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    seg = resize_numpy_image(seg, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = seg.shape[:2]
+    if cond_inp_type == 'seg':
+        seg = img2tensor(seg).unsqueeze(0) / 255.
+        seg = seg.to(opt.device)
+    else:
+        raise NotImplementedError
+
+    return seg
+
+
+def get_cond_keypose(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        pose = cv2.imread(cond_image)
+    else:
+        pose = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    pose = resize_numpy_image(pose, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = pose.shape[:2]
+    if cond_inp_type == 'keypose':
+        pose = img2tensor(pose).unsqueeze(0) / 255.
+        pose = pose.to(opt.device)
+    elif cond_inp_type == 'image':
+        from ldm.modules.extra_condition.utils import imshow_keypoints
+        from mmdet.apis import inference_detector
+        from mmpose.apis import (inference_top_down_pose_model, process_mmdet_results)
+
+        # mmpose does not seem to be compatible with fp16 autocast, so run it in float32
+        with autocast("cuda", dtype=torch.float32):
+            mmdet_results = inference_detector(cond_model['det_model'], pose)
+            # keep the person class bounding boxes.
+            person_results = process_mmdet_results(mmdet_results, 1)
+
+            # optional
+            return_heatmap = False
+            dataset = cond_model['pose_model'].cfg.data['test']['type']
+
+            # e.g. use ('backbone', ) to return backbone feature
+            output_layer_names = None
+            pose_results, returned_outputs = inference_top_down_pose_model(
+                cond_model['pose_model'],
+                pose,
+                person_results,
+                bbox_thr=0.2,
+                format='xyxy',
+                dataset=dataset,
+                dataset_info=None,
+                return_heatmap=return_heatmap,
+                outputs=output_layer_names)
+
+        # show the results
+        pose = imshow_keypoints(pose, pose_results, radius=2, thickness=2)
+        pose = img2tensor(pose).unsqueeze(0) / 255.
+        pose = pose.to(opt.device)
+    else:
+        raise NotImplementedError
+
+    return pose
+
+
+def get_cond_depth(opt, cond_image, cond_inp_type='image', cond_model=None):
+    """Return the depth condition tensor for an image path or RGB array; also sets opt.H and opt.W."""
+    if isinstance(cond_image, str):
+        depth = cv2.imread(cond_image)
+    else:
+        depth = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)  # non-path input is treated as RGB
+    depth = resize_numpy_image(depth, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = depth.shape[:2]  # record the resized size on opt
+    if cond_inp_type == 'depth':
+        depth = img2tensor(depth).unsqueeze(0) / 255.  # input is already a depth map: scale only
+        depth = depth.to(opt.device)
+    elif cond_inp_type == 'image':
+        depth = img2tensor(depth).unsqueeze(0) / 127.5 - 1.0  # scale pixel values to [-1, 1] for cond_model
+        depth = cond_model(depth.to(opt.device)).repeat(1, 3, 1, 1)
+        depth -= torch.min(depth)
+        depth /= torch.max(depth).clamp(min=torch.finfo(depth.dtype).tiny)  # flat map: avoid 0/0 -> NaN
+    else:
+        raise NotImplementedError
+    return depth
+
+
+def get_cond_canny(opt, cond_image, cond_inp_type='image', cond_model=None):
+    """Build the canny edge condition tensor; stores the resized height/width on ``opt.H``/``opt.W``.
+
+    ``cond_inp_type`` 'canny' keeps slice ``[0:1]`` of a ready-made edge map after tensor conversion,
+    'image' runs ``cv2.Canny(..., 100, 200)`` on the picture. ``cond_model`` is not used here.
+    Raises ``NotImplementedError`` for any other ``cond_inp_type``.
+    """
+    bgr = cv2.imread(cond_image) if isinstance(cond_image, str) else cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    bgr = resize_numpy_image(bgr, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = bgr.shape[:2]
+    if cond_inp_type == 'canny':
+        edges = img2tensor(bgr)[0:1]
+    elif cond_inp_type == 'image':
+        # a trailing axis is added to the cv2.Canny output before it is converted to a tensor
+        edges = img2tensor(cv2.Canny(bgr, 100, 200)[..., None])
+    else:
+        raise NotImplementedError
+    return (edges.unsqueeze(0) / 255.).to(opt.device)
+
+
+def get_cond_style(opt, cond_image, cond_inp_type='image', cond_model=None):
+    """Encode a style reference image and return the vision model's 'last_hidden_state'.
+
+    ``cond_model`` must provide a 'processor' and a 'clip_vision_model' entry; only
+    ``cond_inp_type == 'image'`` is accepted (enforced with an assert).
+    """
+    assert cond_inp_type == 'image'
+    # a str is opened as a file with PIL; anything else is treated as a numpy image
+    pil_image = Image.open(cond_image) if isinstance(cond_image, str) else Image.fromarray(cond_image)
+    pixels = cond_model['processor'](images=pil_image, return_tensors="pt")['pixel_values']
+    encoded = cond_model['clip_vision_model'](pixels.to(opt.device))
+    return encoded['last_hidden_state']
+
+
+def get_cond_color(opt, cond_image, cond_inp_type='image', cond_model=None):
+    """Build the colour condition tensor; stores the resized height/width on ``opt.H``/``opt.W``.
+
+    For ``cond_inp_type == 'image'`` the picture is shrunk by a factor of 64 (bicubic) and scaled back
+    up with nearest-neighbour; any other type is used as given. ``cond_model`` is not used here.
+    """
+    bgr = cv2.imread(cond_image) if isinstance(cond_image, str) else cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    bgr = resize_numpy_image(bgr, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = bgr.shape[:2]
+    if cond_inp_type == 'image':
+        coarse = cv2.resize(bgr, (opt.W // 64, opt.H // 64), interpolation=cv2.INTER_CUBIC)
+        bgr = cv2.resize(coarse, (opt.W, opt.H), interpolation=cv2.INTER_NEAREST)
+    return (img2tensor(bgr).unsqueeze(0) / 255.).to(opt.device)
+
+
+def get_cond_openpose(opt, cond_image, cond_inp_type='image', cond_model=None):
+    """Build the openpose condition tensor; stores the resized height/width on ``opt.H``/``opt.W``.
+
+    Args:
+        opt: options; reads ``max_resolution``, ``resize_short_edge`` and ``device``, writes ``H`` and ``W``.
+        cond_image: file path (loaded with ``cv2.imread``) or an array that is converted RGB->BGR.
+        cond_inp_type: 'openpose' to use the input directly, 'image' to run ``cond_model`` on it first.
+        cond_model: callable applied to the resized BGR array in the 'image' case.
+
+    Raises:
+        NotImplementedError: for any other ``cond_inp_type``.
+    """
+    pose = cv2.imread(cond_image) if isinstance(cond_image, str) else cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    pose = resize_numpy_image(pose, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = pose.shape[:2]
+    if cond_inp_type not in ('openpose', 'image'):
+        raise NotImplementedError
+    if cond_inp_type == 'image':
+        with autocast('cuda', dtype=torch.float32):
+            pose = cond_model(pose)
+    return (img2tensor(pose).unsqueeze(0) / 255.).to(opt.device)
+
+
+def get_adapter_feature(inputs, adapters):
+    """Run every adapter on its input and merge the results into ``(feature_map, feature_seq)``.
+
+    List outputs are scaled by the adapter's 'cond_weight' and summed element-wise into ``feature_map``;
+    any other output is concatenated along dim 1, unweighted, into ``feature_seq``. A slot that no
+    adapter contributed to is returned as ``None``.
+    """
+    # a single input/adapter pair is handled as a one-element list
+    if not isinstance(inputs, list):
+        inputs, adapters = [inputs], [adapters]
+    feat_map, feat_seq = None, None
+    for cond, adapter in zip(inputs, adapters):
+        feature = adapter['model'](cond)
+        if not isinstance(feature, list):
+            # non-list feature: appended as is, 'cond_weight' is not read for it
+            feat_seq = feature if feat_seq is None else torch.cat([feat_seq, feature], dim=1)
+        elif feat_map is None:
+            feat_map = [level * adapter['cond_weight'] for level in feature]
+        else:
+            feat_map = [acc + level * adapter['cond_weight'] for acc, level in zip(feat_map, feature)]
+    return feat_map, feat_seq
diff --git a/ldm/modules/structure_condition/midas/midas/__init__.py b/ldm/modules/extra_condition/midas/__init__.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/__init__.py
rename to ldm/modules/extra_condition/midas/__init__.py
diff --git a/ldm/modules/structure_condition/midas/api.py b/ldm/modules/extra_condition/midas/api.py
old mode 100755
new mode 100644
similarity index 93%
rename from ldm/modules/structure_condition/midas/api.py
rename to ldm/modules/extra_condition/midas/api.py
index a601c72480732339b8737813e7154a52ffca6fa7..9a6e194545c40ec263e65a140678b53a5a2abd54
--- a/ldm/modules/structure_condition/midas/api.py
+++ b/ldm/modules/extra_condition/midas/api.py
@@ -6,10 +6,10 @@ import torch
 import torch.nn as nn
 from torchvision.transforms import Compose
 
-from ldm.modules.structure_condition.midas.midas.dpt_depth import DPTDepthModel
-from ldm.modules.structure_condition.midas.midas.midas_net import MidasNet
-from ldm.modules.structure_condition.midas.midas.midas_net_custom import MidasNet_small
-from ldm.modules.structure_condition.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet
+from ldm.modules.extra_condition.midas.midas.dpt_depth import DPTDepthModel
+from ldm.modules.extra_condition.midas.midas.midas_net import MidasNet
+from ldm.modules.extra_condition.midas.midas.midas_net_custom import MidasNet_small
+from ldm.modules.extra_condition.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet
 
 
 ISL_PATHS = {
diff --git a/ldm/modules/structure_condition/openpose/__init__.py b/ldm/modules/extra_condition/midas/midas/__init__.py
similarity index 100%
rename from ldm/modules/structure_condition/openpose/__init__.py
rename to ldm/modules/extra_condition/midas/midas/__init__.py
diff --git a/ldm/modules/structure_condition/midas/midas/base_model.py b/ldm/modules/extra_condition/midas/midas/base_model.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/base_model.py
rename to ldm/modules/extra_condition/midas/midas/base_model.py
diff --git a/ldm/modules/structure_condition/midas/midas/blocks.py b/ldm/modules/extra_condition/midas/midas/blocks.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/blocks.py
rename to ldm/modules/extra_condition/midas/midas/blocks.py
diff --git a/ldm/modules/structure_condition/midas/midas/dpt_depth.py b/ldm/modules/extra_condition/midas/midas/dpt_depth.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/dpt_depth.py
rename to ldm/modules/extra_condition/midas/midas/dpt_depth.py
diff --git a/ldm/modules/structure_condition/midas/midas/midas_net.py b/ldm/modules/extra_condition/midas/midas/midas_net.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/midas_net.py
rename to ldm/modules/extra_condition/midas/midas/midas_net.py
diff --git a/ldm/modules/structure_condition/midas/midas/midas_net_custom.py b/ldm/modules/extra_condition/midas/midas/midas_net_custom.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/midas_net_custom.py
rename to ldm/modules/extra_condition/midas/midas/midas_net_custom.py
diff --git a/ldm/modules/structure_condition/midas/midas/transforms.py b/ldm/modules/extra_condition/midas/midas/transforms.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/transforms.py
rename to ldm/modules/extra_condition/midas/midas/transforms.py
diff --git a/ldm/modules/structure_condition/midas/midas/vit.py b/ldm/modules/extra_condition/midas/midas/vit.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/midas/vit.py
rename to ldm/modules/extra_condition/midas/midas/vit.py
diff --git a/ldm/modules/structure_condition/midas/utils.py b/ldm/modules/extra_condition/midas/utils.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/midas/utils.py
rename to ldm/modules/extra_condition/midas/utils.py
diff --git a/ldm/modules/structure_condition/model_edge.py b/ldm/modules/extra_condition/model_edge.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/model_edge.py
rename to ldm/modules/extra_condition/model_edge.py
diff --git a/ldm/modules/extra_condition/openpose/__init__.py b/ldm/modules/extra_condition/openpose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ldm/modules/structure_condition/openpose/api.py b/ldm/modules/extra_condition/openpose/api.py
similarity index 88%
rename from ldm/modules/structure_condition/openpose/api.py
rename to ldm/modules/extra_condition/openpose/api.py
index fde41115af7da97b6d1ad39edec5ff88339c9c52..dbe7a8c1c0f9c035cdff8660d33348c58a0579c5 100644
--- a/ldm/modules/structure_condition/openpose/api.py
+++ b/ldm/modules/extra_condition/openpose/api.py
@@ -1,6 +1,5 @@
-import os
-
 import numpy as np
+import os
 import torch.nn as nn
 
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
@@ -11,7 +10,7 @@ import torch
 from . import util
 from .body import Body
 
-remote_model_path = "https://drive.google.com/file/d/1EULkcH_hhSU28qVc1jSJpCh2hGOrzpjK/view?usp=share_link"
+remote_model_path = "https://huggingface.co/TencentARC/T2I-Adapter/blob/main/third-party-models/body_pose_model.pth"
 
 
 class OpenposeInference(nn.Module):
diff --git a/ldm/modules/structure_condition/openpose/body.py b/ldm/modules/extra_condition/openpose/body.py
similarity index 96%
rename from ldm/modules/structure_condition/openpose/body.py
rename to ldm/modules/extra_condition/openpose/body.py
index 6f60943b09c79e80e532dd830fd547afdaf9a13f..ecfa8a0946ee9f653f7c00e928ae54b0109a9bdf 100644
--- a/ldm/modules/structure_condition/openpose/body.py
+++ b/ldm/modules/extra_condition/openpose/body.py
@@ -1,10 +1,9 @@
-import math
-import time
-
 import cv2
+import math
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
+import time
 import torch
 from scipy.ndimage.filters import gaussian_filter
 from torchvision import transforms
@@ -210,15 +209,3 @@ class Body(object):
         # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
         # candidate: x, y, score, id
         return candidate, subset
-
-
-if __name__ == "__main__":
-    body_estimation = Body('../model/body_pose_model.pth')
-
-    test_image = '/group/30042/liangbinxie/Projects/mmpose/test_data/twitter/1.png'
-    oriImg = cv2.imread(test_image)  # B,G,R order
-    candidate, subset = body_estimation(oriImg)
-    print(candidate, subset)
-    canvas = util.draw_bodypose(oriImg, candidate, subset)
-    plt.imshow(canvas[:, :, [2, 1, 0]])
-    plt.show()
diff --git a/ldm/modules/structure_condition/openpose/hand.py b/ldm/modules/extra_condition/openpose/hand.py
similarity index 89%
rename from ldm/modules/structure_condition/openpose/hand.py
rename to ldm/modules/extra_condition/openpose/hand.py
index 3d0bf17165ad7eb225332b51f4a2aa16718664b2..1100239e21d561cf0da050ff506bcd86c3b5fa04 100644
--- a/ldm/modules/structure_condition/openpose/hand.py
+++ b/ldm/modules/extra_condition/openpose/hand.py
@@ -1,18 +1,20 @@
 import cv2
 import json
-import numpy as np
 import math
-import time
-from scipy.ndimage.filters import gaussian_filter
-import matplotlib.pyplot as plt
 import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import time
 import torch
+from scipy.ndimage.filters import gaussian_filter
 from skimage.measure import label
 
-from .model import handpose_model
 from . import util
+from .model import handpose_model
+
 
 class Hand(object):
+
     def __init__(self, model_path):
         self.model = handpose_model()
         if torch.cuda.is_available():
@@ -73,14 +75,3 @@ class Hand(object):
             y, x = util.npmax(map_ori)
             all_peaks.append([x, y])
         return np.array(all_peaks)
-
-if __name__ == "__main__":
-    hand_estimation = Hand('../model/hand_pose_model.pth')
-
-    # test_image = '../images/hand.jpg'
-    test_image = '../images/hand.jpg'
-    oriImg = cv2.imread(test_image)  # B,G,R order
-    peaks = hand_estimation(oriImg)
-    canvas = util.draw_handpose(oriImg, peaks, True)
-    cv2.imshow('', canvas)
-    cv2.waitKey(0)
\ No newline at end of file
diff --git a/ldm/modules/extra_condition/openpose/model.py b/ldm/modules/extra_condition/openpose/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d8eb6b7e4af7e2a4fc21fe500b29f02ff176d
--- /dev/null
+++ b/ldm/modules/extra_condition/openpose/model.py
@@ -0,0 +1,178 @@
+import torch
+import torch.nn as nn
+from collections import OrderedDict
+
+
+def make_layers(block, no_relu_layers):
+    """Build an ``nn.Sequential`` from an ordered ``{name: spec}`` mapping. Names containing 'pool' become
+    ``nn.MaxPool2d`` (spec: kernel, stride, padding); all others become ``nn.Conv2d`` (spec: in, out, kernel,
+    stride, padding) followed by an in-place ReLU named 'relu_<name>' unless listed in ``no_relu_layers``."""
+    modules = OrderedDict()
+    for name, spec in block.items():
+        if 'pool' in name:
+            modules[name] = nn.MaxPool2d(kernel_size=spec[0], stride=spec[1], padding=spec[2])
+            continue
+        modules[name] = nn.Conv2d(spec[0], spec[1], kernel_size=spec[2], stride=spec[3], padding=spec[4])
+        if name not in no_relu_layers:
+            modules['relu_' + name] = nn.ReLU(inplace=True)
+    return nn.Sequential(modules)
+
+
+class bodypose_model(nn.Module):
+    """Two-branch, six-stage body pose CNN assembled from layer tables with ``make_layers``.
+
+    ``model0`` is a shared trunk ending in 128 channels. In every stage the ``L1`` branch ends in 38
+    channels and the ``L2`` branch in 19; stages 2-6 take the previous stage's two outputs concatenated
+    with the trunk features (38 + 19 + 128 = 185 channels).
+    """
+
+    def __init__(self):
+        super(bodypose_model, self).__init__()
+        # The closing conv of each branch in every stage is not followed by a ReLU.
+        # NOTE: both stage-6 heads have to be listed; without 'Mconv7_stage6_L2' a ReLU would be
+        # appended to the final L2 output.
+        no_relu_layers = [
+            'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', 'Mconv7_stage2_L2', 'Mconv7_stage3_L1',
+            'Mconv7_stage3_L2', 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', 'Mconv7_stage5_L2',
+            'Mconv7_stage6_L1', 'Mconv7_stage6_L2'
+        ]
+        blocks = {}
+        # conv spec: [in, out, kernel, stride, padding]; pool spec: [kernel, stride, padding]
+        block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]),
+                              ('pool1_stage1', [2, 2, 0]), ('conv2_1', [64, 128, 3, 1, 1]),
+                              ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]),
+                              ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]),
+                              ('conv3_3', [256, 256, 3, 1, 1]), ('conv3_4', [256, 256, 3, 1, 1]),
+                              ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]),
+                              ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3_CPM', [512, 256, 3, 1, 1]),
+                              ('conv4_4_CPM', [256, 128, 3, 1, 1])])
+
+        # Stage 1
+        block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
+                                ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
+                                ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])])
+        block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
+                                ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
+                                ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])])
+        blocks['block1_1'] = block1_1
+        blocks['block1_2'] = block1_2
+
+        self.model0 = make_layers(block0, no_relu_layers)
+
+        # Stages 2 - 6
+        for i in range(2, 7):
+            blocks['block%d_1' % i] = OrderedDict([('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
+                                                   ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
+                                                   ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])])
+            blocks['block%d_2' % i] = OrderedDict([('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
+                                                   ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                                                   ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
+                                                   ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])])
+
+        for k in blocks.keys():
+            blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+        self.model1_1 = blocks['block1_1']
+        self.model2_1 = blocks['block2_1']
+        self.model3_1 = blocks['block3_1']
+        self.model4_1 = blocks['block4_1']
+        self.model5_1 = blocks['block5_1']
+        self.model6_1 = blocks['block6_1']
+
+        self.model1_2 = blocks['block1_2']
+        self.model2_2 = blocks['block2_2']
+        self.model3_2 = blocks['block3_2']
+        self.model4_2 = blocks['block4_2']
+        self.model5_2 = blocks['block5_2']
+        self.model6_2 = blocks['block6_2']
+
+    def forward(self, x):
+        """Return ``(out6_1, out6_2)``: the stage-6 outputs of the L1 and L2 branches for input ``x``."""
+        out1 = self.model0(x)
+        out1_1 = self.model1_1(out1)
+        out1_2 = self.model1_2(out1)
+        out2 = torch.cat([out1_1, out1_2, out1], 1)
+        out2_1 = self.model2_1(out2)
+        out2_2 = self.model2_2(out2)
+        out3 = torch.cat([out2_1, out2_2, out1], 1)
+        out3_1 = self.model3_1(out3)
+        out3_2 = self.model3_2(out3)
+        out4 = torch.cat([out3_1, out3_2, out1], 1)
+        out4_1 = self.model4_1(out4)
+        out4_2 = self.model4_2(out4)
+        out5 = torch.cat([out4_1, out4_2, out1], 1)
+        out5_1 = self.model5_1(out5)
+        out5_2 = self.model5_2(out5)
+        out6 = torch.cat([out5_1, out5_2, out1], 1)
+        out6_1 = self.model6_1(out6)
+        out6_2 = self.model6_2(out6)
+        return out6_1, out6_2
+
+
+class handpose_model(nn.Module):
+    """Six-stage hand pose CNN: a convolutional trunk, a first prediction head and five refinement stages.
+
+    Attribute and layer names end up in the ``state_dict`` keys (presumably matching the pretrained
+    checkpoint - confirm before renaming anything).
+
+    Attributes:
+        model1_0: trunk, 3 input channels down to 128 feature channels (three 2x2 max-pools on the way).
+        model1_1: stage-1 head, 128 -> 512 -> 22 channels.
+        model2 ... model6: refinement stages; each takes the previous 22-channel prediction concatenated
+            with the trunk features (22 + 128 = 150 channels) and predicts 22 channels again.
+    """
+
+    def __init__(self):
+        super(handpose_model, self).__init__()
+
+        # the closing conv of every stage is left without a ReLU
+        no_relu_layers = [
+            'conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6'
+        ]
+
+        # stage 1: trunk followed by the first prediction head
+        trunk_spec = OrderedDict([
+            ('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]), ('pool1_stage1', [2, 2, 0]),
+            ('conv2_1', [64, 128, 3, 1, 1]), ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]),
+            ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]), ('conv3_3', [256, 256, 3, 1, 1]),
+            ('conv3_4', [256, 256, 3, 1, 1]), ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]),
+            ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3', [512, 512, 3, 1, 1]), ('conv4_4', [512, 512, 3, 1, 1]),
+            ('conv5_1', [512, 512, 3, 1, 1]), ('conv5_2', [512, 512, 3, 1, 1]), ('conv5_3_CPM', [512, 128, 3, 1, 1])
+        ])
+        head_spec = OrderedDict([('conv6_1_CPM', [128, 512, 1, 1, 0]), ('conv6_2_CPM', [512, 22, 1, 1, 0])])
+        self.model1_0 = make_layers(trunk_spec, no_relu_layers)
+        self.model1_1 = make_layers(head_spec, no_relu_layers)
+
+        # stages 2-6: identical refinement stacks, registered as model2 ... model6
+        for i in range(2, 7):
+            stage_spec = OrderedDict([('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
+                                      ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
+                                      ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
+                                      ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
+                                      ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
+                                      ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
+                                      ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])])
+            setattr(self, 'model%d' % i, make_layers(stage_spec, no_relu_layers))
+
+    def forward(self, x):
+        """Run all six stages and return the last stage's prediction.
+
+        Args:
+            x: input batch; the first conv (``conv1_1``) expects 3 channels.
+
+        Returns:
+            Output of ``model6`` (22 channels).
+        """
+        trunk = self.model1_0(x)
+        heat = self.model1_1(trunk)
+        for refine in (self.model2, self.model3, self.model4, self.model5, self.model6):
+            # every refinement stage sees the previous prediction next to the trunk features
+            heat = refine(torch.cat([heat, trunk], 1))
+        return heat
diff --git a/ldm/modules/structure_condition/openpose/util.py b/ldm/modules/extra_condition/openpose/util.py
similarity index 100%
rename from ldm/modules/structure_condition/openpose/util.py
rename to ldm/modules/extra_condition/openpose/util.py
diff --git a/ldm/modules/structure_condition/utils.py b/ldm/modules/extra_condition/utils.py
old mode 100755
new mode 100644
similarity index 100%
rename from ldm/modules/structure_condition/utils.py
rename to ldm/modules/extra_condition/utils.py
diff --git a/ldm/modules/image_degradation/bsrgan_light.py b/ldm/modules/image_degradation/bsrgan_light.py
index 9e1f823996bf559e9b015ea9aa2b3cd38dd13af1..808c7f882cb75e2ba2340d5b55881d11927351f0 100755
--- a/ldm/modules/image_degradation/bsrgan_light.py
+++ b/ldm/modules/image_degradation/bsrgan_light.py
@@ -25,7 +25,6 @@ import ldm.modules.image_degradation.utils_image as util
 # --------------------------------------------
 """
 
-
 def modcrop_np(img, sf):
     '''
     Args:
@@ -254,7 +253,7 @@ def srmd_degradation(x, k, sf=3):
           year={2018}
         }
     '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
+    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
     x = bicubic_degradation(x, sf=sf)
     return x
 
@@ -277,7 +276,7 @@ def dpsr_degradation(x, k, sf=3):
         }
     '''
     x = bicubic_degradation(x, sf=sf)
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
     return x
 
 
@@ -290,7 +289,7 @@ def classical_degradation(x, k, sf=3):
     Return:
         downsampled LR image
     '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
+    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
     # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
     st = 0
     return x[st::sf, st::sf, ...]
@@ -335,7 +334,7 @@ def add_blur(img, sf=4):
         k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
     else:
         k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
-    img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
+    img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
 
     return img
 
@@ -497,7 +496,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
                 k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                 k_shifted = shift_pixel(k, sf)
                 k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
                 img = img[0::sf, 0::sf, ...]  # nearest downsampling
             img = np.clip(img, 0.0, 1.0)
 
@@ -531,7 +530,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
 
 
 # todo no isp_model?
-def degradation_bsrgan_variant(image, sf=4, isp_model=None):
+def degradation_bsrgan_variant(image, sf=4, isp_model=None, up=False):
     """
     This is the degradation model of BSRGAN from the paper
     "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
@@ -589,7 +588,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
                 k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                 k_shifted = shift_pixel(k, sf)
                 k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
                 image = image[0::sf, 0::sf, ...]  # nearest downsampling
 
             image = np.clip(image, 0.0, 1.0)
@@ -617,6 +616,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
     # add final JPEG compression noise
     image = add_JPEG_noise(image)
     image = util.single2uint(image)
+    if up:
+        image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_CUBIC)  # todo: random, as above? want to condition on it then
     example = {"image": image}
     return example
 
diff --git a/ldm/modules/losses/__init__.py b/ldm/modules/losses/__init__.py
deleted file mode 100755
index 876d7c5bd6e3245ee77feb4c482b7a8143604ad5..0000000000000000000000000000000000000000
--- a/ldm/modules/losses/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
\ No newline at end of file
diff --git a/ldm/modules/losses/contperceptual.py b/ldm/modules/losses/contperceptual.py
deleted file mode 100755
index 672c1e32a1389def02461c0781339681060c540e..0000000000000000000000000000000000000000
--- a/ldm/modules/losses/contperceptual.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import torch
-import torch.nn as nn
-
-from taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/no?
-
-
-class LPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_loss="hinge"):
-
-        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
-        self.kl_weight = kl_weight
-        self.pixel_weight = pixelloss_weight
-        self.perceptual_loss = LPIPS().eval()
-        self.perceptual_weight = perceptual_weight
-        # output log variance
-        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
-
-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm
-                                                 ).apply(weights_init)
-        self.discriminator_iter_start = disc_start
-        self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
-        self.disc_factor = disc_factor
-        self.discriminator_weight = disc_weight
-        self.disc_conditional = disc_conditional
-
-    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
-        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
-        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
-
-        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
-        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
-        d_weight = d_weight * self.discriminator_weight
-        return d_weight
-
-    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train",
-                weights=None):
-        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
-        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
-            rec_loss = rec_loss + self.perceptual_weight * p_loss
-
-        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
-        weighted_nll_loss = nll_loss
-        if weights is not None:
-            weighted_nll_loss = weights*nll_loss
-        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
-        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
-        kl_loss = posteriors.kl()
-        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
-
-        # now the GAN part
-        if optimizer_idx == 0:
-            # generator update
-            if cond is None:
-                assert not self.disc_conditional
-                logits_fake = self.discriminator(reconstructions.contiguous())
-            else:
-                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
-            g_loss = -torch.mean(logits_fake)
-
-            if self.disc_factor > 0.0:
-                try:
-                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
-                except RuntimeError:
-                    assert not self.training
-                    d_weight = torch.tensor(0.0)
-            else:
-                d_weight = torch.tensor(0.0)
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
-
-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
-                   "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
-            return loss, log
-
-        if optimizer_idx == 1:
-            # second pass for discriminator update
-            if cond is None:
-                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
-            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
-
-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
-            return d_loss, log
-
diff --git a/ldm/modules/losses/vqperceptual.py b/ldm/modules/losses/vqperceptual.py
deleted file mode 100755
index f69981769e4bd5462600458c4fcf26620f7e4306..0000000000000000000000000000000000000000
--- a/ldm/modules/losses/vqperceptual.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import torch
-from torch import nn
-import torch.nn.functional as F
-from einops import repeat
-
-from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
-from taming.modules.losses.lpips import LPIPS
-from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
-
-
-def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
-    assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
-    loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
-    loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
-    loss_real = (weights * loss_real).sum() / weights.sum()
-    loss_fake = (weights * loss_fake).sum() / weights.sum()
-    d_loss = 0.5 * (loss_real + loss_fake)
-    return d_loss
-
-def adopt_weight(weight, global_step, threshold=0, value=0.):
-    if global_step < threshold:
-        weight = value
-    return weight
-
-
-def measure_perplexity(predicted_indices, n_embed):
-    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
-    # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
-    encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
-    avg_probs = encodings.mean(0)
-    perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
-    cluster_use = torch.sum(avg_probs > 0)
-    return perplexity, cluster_use
-
-def l1(x, y):
-    return torch.abs(x-y)
-
-
-def l2(x, y):
-    return torch.pow((x-y), 2)
-
-
-class VQLPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
-                 pixel_loss="l1"):
-        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
-        assert perceptual_loss in ["lpips", "clips", "dists"]
-        assert pixel_loss in ["l1", "l2"]
-        self.codebook_weight = codebook_weight
-        self.pixel_weight = pixelloss_weight
-        if perceptual_loss == "lpips":
-            print(f"{self.__class__.__name__}: Running with LPIPS.")
-            self.perceptual_loss = LPIPS().eval()
-        else:
-            raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
-        self.perceptual_weight = perceptual_weight
-
-        if pixel_loss == "l1":
-            self.pixel_loss = l1
-        else:
-            self.pixel_loss = l2
-
-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm,
-                                                 ndf=disc_ndf
-                                                 ).apply(weights_init)
-        self.discriminator_iter_start = disc_start
-        if disc_loss == "hinge":
-            self.disc_loss = hinge_d_loss
-        elif disc_loss == "vanilla":
-            self.disc_loss = vanilla_d_loss
-        else:
-            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
-        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
-        self.disc_factor = disc_factor
-        self.discriminator_weight = disc_weight
-        self.disc_conditional = disc_conditional
-        self.n_classes = n_classes
-
-    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
-        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
-        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
-
-        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
-        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
-        d_weight = d_weight * self.discriminator_weight
-        return d_weight
-
-    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
-        if not exists(codebook_loss):
-            codebook_loss = torch.tensor([0.]).to(inputs.device)
-        #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
-        rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
-        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
-            rec_loss = rec_loss + self.perceptual_weight * p_loss
-        else:
-            p_loss = torch.tensor([0.0])
-
-        nll_loss = rec_loss
-        #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
-        nll_loss = torch.mean(nll_loss)
-
-        # now the GAN part
-        if optimizer_idx == 0:
-            # generator update
-            if cond is None:
-                assert not self.disc_conditional
-                logits_fake = self.discriminator(reconstructions.contiguous())
-            else:
-                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
-            g_loss = -torch.mean(logits_fake)
-
-            try:
-                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
-            except RuntimeError:
-                assert not self.training
-                d_weight = torch.tensor(0.0)
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
-
-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
-                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
-                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/p_loss".format(split): p_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
-            if predicted_indices is not None:
-                assert self.n_classes is not None
-                with torch.no_grad():
-                    perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
-                log[f"{split}/perplexity"] = perplexity
-                log[f"{split}/cluster_usage"] = cluster_usage
-            return loss, log
-
-        if optimizer_idx == 1:
-            # second pass for discriminator update
-            if cond is None:
-                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
-            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
-
-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
-
-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
-            return d_loss, log
diff --git a/ldm/modules/structure_condition/model_seg.py b/ldm/modules/structure_condition/model_seg.py
deleted file mode 100755
index ab8e8d368321acaa3d481009ef07adf7f107b207..0000000000000000000000000000000000000000
--- a/ldm/modules/structure_condition/model_seg.py
+++ /dev/null
@@ -1,283 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import cv2
-from basicsr.utils import img2tensor, tensor2img
-
-_BATCH_NORM = nn.BatchNorm2d
-_BOTTLENECK_EXPANSION = 4
-
-import blobfile as bf
-
-def _list_image_files_recursively(data_dir):
-    results = []
-    for entry in sorted(bf.listdir(data_dir)):
-        full_path = bf.join(data_dir, entry)
-        ext = entry.split(".")[-1]
-        if "." in entry and ext.lower() in ["jpg", "jpeg", "png", "gif"]:
-            results.append(full_path)
-        elif bf.isdir(full_path):
-            results.extend(_list_image_files_recursively(full_path))
-    return results
-    
-def uint82bin(n, count=8):
-    """returns the binary of integer n, count refers to amount of bits"""
-    return ''.join([str((n >> y) & 1) for y in range(count - 1, -1, -1)])
-
-
-def labelcolormap(N):
-    if N == 35:  # cityscape
-        cmap = np.array([(0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (111, 74, 0), (81, 0, 81),
-                         (128, 64, 128), (244, 35, 232), (250, 170, 160), (230, 150, 140), (70, 70, 70), (102, 102, 156), (190, 153, 153),
-                         (180, 165, 180), (150, 100, 100), (150, 120, 90), (153, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0),
-                         (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
-                         (0, 60, 100), (0, 0, 90), (0, 0, 110), (0, 80, 100), (0, 0, 230), (119, 11, 32), (0, 0, 142)],
-                        dtype=np.uint8)
-    else:
-        cmap = np.zeros((N, 3), dtype=np.uint8)
-        for i in range(N):
-            r, g, b = 0, 0, 0
-            id = i + 1  # let's give 0 a color
-            for j in range(7):
-                str_id = uint82bin(id)
-                r = r ^ (np.uint8(str_id[-1]) << (7 - j))
-                g = g ^ (np.uint8(str_id[-2]) << (7 - j))
-                b = b ^ (np.uint8(str_id[-3]) << (7 - j))
-                id = id >> 3
-            cmap[i, 0] =  r
-            cmap[i, 1] =  g
-            cmap[i, 2] =  b
-     
-    return cmap
-
-
-class Colorize(object):
-    def __init__(self, n=182):
-        self.cmap = labelcolormap(n)
-
-    def __call__(self, gray_image):
-        size = gray_image.shape
-        color_image = np.zeros((3, size[0], size[1])) 
-     
-        for label in range(0, len(self.cmap)):
-            mask = (label == gray_image ) 
-            color_image[0][mask] = self.cmap[label][0]
-            color_image[1][mask] = self.cmap[label][1]
-            color_image[2][mask] = self.cmap[label][2]
-
-        return color_image
-
-class _ConvBnReLU(nn.Sequential):
-    """
-    Cascade of 2D convolution, batch norm, and ReLU.
-    """
-
-    BATCH_NORM = _BATCH_NORM
-
-    def __init__(
-        self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
-    ):
-        super(_ConvBnReLU, self).__init__()
-        self.add_module(
-            "conv",
-            nn.Conv2d(
-                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
-            ),
-        )
-        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=1 - 0.999))
-
-        if relu:
-            self.add_module("relu", nn.ReLU())
-
-class _Bottleneck(nn.Module):
-    """
-    Bottleneck block of MSRA ResNet.
-    """
-
-    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
-        super(_Bottleneck, self).__init__()
-        mid_ch = out_ch // _BOTTLENECK_EXPANSION
-        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
-        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
-        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
-        self.shortcut = (
-            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
-            if downsample
-            else nn.Identity()
-        )
-
-    def forward(self, x):
-        h = self.reduce(x)
-        h = self.conv3x3(h)
-        h = self.increase(h)
-        h += self.shortcut(x)
-        return F.relu(h)
-
-class _ResLayer(nn.Sequential):
-    """
-    Residual layer with multi grids
-    """
-
-    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
-        super(_ResLayer, self).__init__()
-
-        if multi_grids is None:
-            multi_grids = [1 for _ in range(n_layers)]
-        else:
-            assert n_layers == len(multi_grids)
-
-        # Downsampling is only in the first block
-        for i in range(n_layers):
-            self.add_module(
-                "block{}".format(i + 1),
-                _Bottleneck(
-                    in_ch=(in_ch if i == 0 else out_ch),
-                    out_ch=out_ch,
-                    stride=(stride if i == 0 else 1),
-                    dilation=dilation * multi_grids[i],
-                    downsample=(True if i == 0 else False),
-                ),
-            )
-
-class _Stem(nn.Sequential):
-    """
-    The 1st conv layer.
-    Note that the max pooling is different from both MSRA and FAIR ResNet.
-    """
-
-    def __init__(self, out_ch):
-        super(_Stem, self).__init__()
-        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
-        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))
-
-class _ASPP(nn.Module):
-    """
-    Atrous spatial pyramid pooling (ASPP)
-    """
-
-    def __init__(self, in_ch, out_ch, rates):
-        super(_ASPP, self).__init__()
-        for i, rate in enumerate(rates):
-            self.add_module(
-                "c{}".format(i),
-                nn.Conv2d(in_ch, out_ch, 3, 1, padding=rate, dilation=rate, bias=True),
-            )
-
-        for m in self.children():
-            nn.init.normal_(m.weight, mean=0, std=0.01)
-            nn.init.constant_(m.bias, 0)
-
-    def forward(self, x):
-        return sum([stage(x) for stage in self.children()])
-
-class MSC(nn.Module):
-    """
-    Multi-scale inputs
-    """
-
-    def __init__(self, base, scales=None):
-        super(MSC, self).__init__()
-        self.base = base
-        if scales:
-            self.scales = scales
-        else:
-            self.scales = [0.5, 0.75]
-
-    def forward(self, x):
-        # Original
-        logits = self.base(x)
-        _, _, H, W = logits.shape
-        interp = lambda l: F.interpolate(
-            l, size=(H, W), mode="bilinear", align_corners=False
-        )
-
-        # Scaled
-        logits_pyramid = []
-        for p in self.scales:
-            h = F.interpolate(x, scale_factor=p, mode="bilinear", align_corners=False)
-            logits_pyramid.append(self.base(h))
-
-        # Pixel-wise max
-        logits_all = [logits] + [interp(l) for l in logits_pyramid]
-        logits_max = torch.max(torch.stack(logits_all), dim=0)[0]
-
-        return logits_max
-
-class DeepLabV2(nn.Sequential):
-    """
-    DeepLab v2: Dilated ResNet + ASPP
-    Output stride is fixed at 8
-    """
-
-    def __init__(self, n_classes=182, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]):
-        super(DeepLabV2, self).__init__()
-        ch = [64 * 2 ** p for p in range(6)]
-        self.add_module("layer1", _Stem(ch[0]))
-        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
-        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
-        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
-        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
-        self.add_module("aspp", _ASPP(ch[5], n_classes, atrous_rates))
-
-    def freeze_bn(self):
-        for m in self.modules():
-            if isinstance(m, _ConvBnReLU.BATCH_NORM):
-                m.eval()
-
-def preprocessing(image, device):
-    # Resize
-    scale = 640 / max(image.shape[:2])
-    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
-    raw_image = image.astype(np.uint8)
-
-    # Subtract mean values
-    image = image.astype(np.float32)
-    image -= np.array(
-        [
-            float(104.008),
-            float(116.669),
-            float(122.675),
-        ]
-    )
-
-    # Convert to torch.Tensor and add "batch" axis
-    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
-    image = image.to(device)
-
-    return image, raw_image
-
-# Model setup
-def seger():
-    model = MSC(
-            base=DeepLabV2(
-                n_classes=182, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]
-            ),
-            scales=[0.5, 0.75],
-        )
-    state_dict = torch.load('models/deeplabv2_resnet101_msc-cocostuff164k-100000.pth')
-    model.load_state_dict(state_dict)  # to skip ASPP
-
-    return model
-
-if __name__ == '__main__':
-    device = 'cuda'
-    model = seger()
-    model.to(device)
-    model.eval()
-    with torch.no_grad():
-        im = cv2.imread('/group/30042/chongmou/ft_local/Diffusion/baselines/SPADE/datasets/coco_stuff/val_img/000000000785.jpg', cv2.IMREAD_COLOR)
-        im, raw_im = preprocessing(im, 'cuda')
-        _, _, H, W = im.shape
-
-        # Image -> Probability map
-        logits = model(im)
-        logits = F.interpolate(logits, size=(H, W), mode="bilinear", align_corners=False)
-        probs = F.softmax(logits, dim=1)[0]
-        probs = probs.cpu().data.numpy()
-        labelmap = np.argmax(probs, axis=0)
-        print(labelmap.shape, np.max(labelmap), np.min(labelmap))
-        cv2.imwrite('mask.png', labelmap)
-
-
-   
\ No newline at end of file
diff --git a/ldm/modules/structure_condition/openpose/model.py b/ldm/modules/structure_condition/openpose/model.py
deleted file mode 100644
index 5dfc80de827a17beccb9b0f3f7588545be78c9de..0000000000000000000000000000000000000000
--- a/ldm/modules/structure_condition/openpose/model.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import torch
-from collections import OrderedDict
-
-import torch
-import torch.nn as nn
-
-def make_layers(block, no_relu_layers):
-    layers = []
-    for layer_name, v in block.items():
-        if 'pool' in layer_name:
-            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
-                                    padding=v[2])
-            layers.append((layer_name, layer))
-        else:
-            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
-                               kernel_size=v[2], stride=v[3],
-                               padding=v[4])
-            layers.append((layer_name, conv2d))
-            if layer_name not in no_relu_layers:
-                layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
-
-    return nn.Sequential(OrderedDict(layers))
-
-class bodypose_model(nn.Module):
-    def __init__(self):
-        super(bodypose_model, self).__init__()
-
-        # these layers have no relu layer
-        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
-                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
-                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
-                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
-        blocks = {}
-        block0 = OrderedDict([
-                      ('conv1_1', [3, 64, 3, 1, 1]),
-                      ('conv1_2', [64, 64, 3, 1, 1]),
-                      ('pool1_stage1', [2, 2, 0]),
-                      ('conv2_1', [64, 128, 3, 1, 1]),
-                      ('conv2_2', [128, 128, 3, 1, 1]),
-                      ('pool2_stage1', [2, 2, 0]),
-                      ('conv3_1', [128, 256, 3, 1, 1]),
-                      ('conv3_2', [256, 256, 3, 1, 1]),
-                      ('conv3_3', [256, 256, 3, 1, 1]),
-                      ('conv3_4', [256, 256, 3, 1, 1]),
-                      ('pool3_stage1', [2, 2, 0]),
-                      ('conv4_1', [256, 512, 3, 1, 1]),
-                      ('conv4_2', [512, 512, 3, 1, 1]),
-                      ('conv4_3_CPM', [512, 256, 3, 1, 1]),
-                      ('conv4_4_CPM', [256, 128, 3, 1, 1])
-                  ])
-
-
-        # Stage 1
-        block1_1 = OrderedDict([
-                        ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
-                        ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
-                        ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
-                        ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
-                        ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
-                    ])
-
-        block1_2 = OrderedDict([
-                        ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
-                        ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
-                        ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
-                        ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
-                        ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
-                    ])
-        blocks['block1_1'] = block1_1
-        blocks['block1_2'] = block1_2
-
-        self.model0 = make_layers(block0, no_relu_layers)
-
-        # Stages 2 - 6
-        for i in range(2, 7):
-            blocks['block%d_1' % i] = OrderedDict([
-                    ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
-                    ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
-                    ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
-                ])
-
-            blocks['block%d_2' % i] = OrderedDict([
-                    ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
-                    ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
-                    ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
-                ])
-
-        for k in blocks.keys():
-            blocks[k] = make_layers(blocks[k], no_relu_layers)
-
-        self.model1_1 = blocks['block1_1']
-        self.model2_1 = blocks['block2_1']
-        self.model3_1 = blocks['block3_1']
-        self.model4_1 = blocks['block4_1']
-        self.model5_1 = blocks['block5_1']
-        self.model6_1 = blocks['block6_1']
-
-        self.model1_2 = blocks['block1_2']
-        self.model2_2 = blocks['block2_2']
-        self.model3_2 = blocks['block3_2']
-        self.model4_2 = blocks['block4_2']
-        self.model5_2 = blocks['block5_2']
-        self.model6_2 = blocks['block6_2']
-
-
-    def forward(self, x):
-
-        out1 = self.model0(x)
-
-        out1_1 = self.model1_1(out1)
-        out1_2 = self.model1_2(out1)
-        out2 = torch.cat([out1_1, out1_2, out1], 1)
-
-        out2_1 = self.model2_1(out2)
-        out2_2 = self.model2_2(out2)
-        out3 = torch.cat([out2_1, out2_2, out1], 1)
-
-        out3_1 = self.model3_1(out3)
-        out3_2 = self.model3_2(out3)
-        out4 = torch.cat([out3_1, out3_2, out1], 1)
-
-        out4_1 = self.model4_1(out4)
-        out4_2 = self.model4_2(out4)
-        out5 = torch.cat([out4_1, out4_2, out1], 1)
-
-        out5_1 = self.model5_1(out5)
-        out5_2 = self.model5_2(out5)
-        out6 = torch.cat([out5_1, out5_2, out1], 1)
-
-        out6_1 = self.model6_1(out6)
-        out6_2 = self.model6_2(out6)
-
-        return out6_1, out6_2
-
-class handpose_model(nn.Module):
-    def __init__(self):
-        super(handpose_model, self).__init__()
-
-        # these layers have no relu layer
-        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
-                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
-        # stage 1
-        block1_0 = OrderedDict([
-                ('conv1_1', [3, 64, 3, 1, 1]),
-                ('conv1_2', [64, 64, 3, 1, 1]),
-                ('pool1_stage1', [2, 2, 0]),
-                ('conv2_1', [64, 128, 3, 1, 1]),
-                ('conv2_2', [128, 128, 3, 1, 1]),
-                ('pool2_stage1', [2, 2, 0]),
-                ('conv3_1', [128, 256, 3, 1, 1]),
-                ('conv3_2', [256, 256, 3, 1, 1]),
-                ('conv3_3', [256, 256, 3, 1, 1]),
-                ('conv3_4', [256, 256, 3, 1, 1]),
-                ('pool3_stage1', [2, 2, 0]),
-                ('conv4_1', [256, 512, 3, 1, 1]),
-                ('conv4_2', [512, 512, 3, 1, 1]),
-                ('conv4_3', [512, 512, 3, 1, 1]),
-                ('conv4_4', [512, 512, 3, 1, 1]),
-                ('conv5_1', [512, 512, 3, 1, 1]),
-                ('conv5_2', [512, 512, 3, 1, 1]),
-                ('conv5_3_CPM', [512, 128, 3, 1, 1])
-            ])
-
-        block1_1 = OrderedDict([
-            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
-            ('conv6_2_CPM', [512, 22, 1, 1, 0])
-        ])
-
-        blocks = {}
-        blocks['block1_0'] = block1_0
-        blocks['block1_1'] = block1_1
-
-        # stage 2-6
-        for i in range(2, 7):
-            blocks['block%d' % i] = OrderedDict([
-                    ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
-                    ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
-                    ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
-                    ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
-                ])
-
-        for k in blocks.keys():
-            blocks[k] = make_layers(blocks[k], no_relu_layers)
-
-        self.model1_0 = blocks['block1_0']
-        self.model1_1 = blocks['block1_1']
-        self.model2 = blocks['block2']
-        self.model3 = blocks['block3']
-        self.model4 = blocks['block4']
-        self.model5 = blocks['block5']
-        self.model6 = blocks['block6']
-
-    def forward(self, x):
-        out1_0 = self.model1_0(x)
-        out1_1 = self.model1_1(out1_0)
-        concat_stage2 = torch.cat([out1_1, out1_0], 1)
-        out_stage2 = self.model2(concat_stage2)
-        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
-        out_stage3 = self.model3(concat_stage3)
-        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
-        out_stage4 = self.model4(concat_stage4)
-        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
-        out_stage5 = self.model5(concat_stage5)
-        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
-        out_stage6 = self.model6(concat_stage6)
-        return out_stage6
-
-
diff --git a/ldm/modules/x_transformer.py b/ldm/modules/x_transformer.py
deleted file mode 100755
index 5fc15bf9cfe0111a910e7de33d04ffdec3877576..0000000000000000000000000000000000000000
--- a/ldm/modules/x_transformer.py
+++ /dev/null
@@ -1,641 +0,0 @@
-"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers"""
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-from functools import partial
-from inspect import isfunction
-from collections import namedtuple
-from einops import rearrange, repeat, reduce
-
-# constants
-
-DEFAULT_DIM_HEAD = 64
-
-Intermediates = namedtuple('Intermediates', [
-    'pre_softmax_attn',
-    'post_softmax_attn'
-])
-
-LayerIntermediates = namedtuple('Intermediates', [
-    'hiddens',
-    'attn_intermediates'
-])
-
-
-class AbsolutePositionalEmbedding(nn.Module):
-    def __init__(self, dim, max_seq_len):
-        super().__init__()
-        self.emb = nn.Embedding(max_seq_len, dim)
-        self.init_()
-
-    def init_(self):
-        nn.init.normal_(self.emb.weight, std=0.02)
-
-    def forward(self, x):
-        n = torch.arange(x.shape[1], device=x.device)
-        return self.emb(n)[None, :, :]
-
-
-class FixedPositionalEmbedding(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq)
-
-    def forward(self, x, seq_dim=1, offset=0):
-        t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
-        sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
-        emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
-        return emb[None, :, :]
-
-
-# helpers
-
-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    if exists(val):
-        return val
-    return d() if isfunction(d) else d
-
-
-def always(val):
-    def inner(*args, **kwargs):
-        return val
-    return inner
-
-
-def not_equals(val):
-    def inner(x):
-        return x != val
-    return inner
-
-
-def equals(val):
-    def inner(x):
-        return x == val
-    return inner
-
-
-def max_neg_value(tensor):
-    return -torch.finfo(tensor.dtype).max
-
-
-# keyword argument helpers
-
-def pick_and_pop(keys, d):
-    values = list(map(lambda key: d.pop(key), keys))
-    return dict(zip(keys, values))
-
-
-def group_dict_by_key(cond, d):
-    return_val = [dict(), dict()]
-    for key in d.keys():
-        match = bool(cond(key))
-        ind = int(not match)
-        return_val[ind][key] = d[key]
-    return (*return_val,)
-
-
-def string_begins_with(prefix, str):
-    return str.startswith(prefix)
-
-
-def group_by_key_prefix(prefix, d):
-    return group_dict_by_key(partial(string_begins_with, prefix), d)
-
-
-def groupby_prefix_and_trim(prefix, d):
-    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
-    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
-    return kwargs_without_prefix, kwargs
-
-
-# classes
-class Scale(nn.Module):
-    def __init__(self, value, fn):
-        super().__init__()
-        self.value = value
-        self.fn = fn
-
-    def forward(self, x, **kwargs):
-        x, *rest = self.fn(x, **kwargs)
-        return (x * self.value, *rest)
-
-
-class Rezero(nn.Module):
-    def __init__(self, fn):
-        super().__init__()
-        self.fn = fn
-        self.g = nn.Parameter(torch.zeros(1))
-
-    def forward(self, x, **kwargs):
-        x, *rest = self.fn(x, **kwargs)
-        return (x * self.g, *rest)
-
-
-class ScaleNorm(nn.Module):
-    def __init__(self, dim, eps=1e-5):
-        super().__init__()
-        self.scale = dim ** -0.5
-        self.eps = eps
-        self.g = nn.Parameter(torch.ones(1))
-
-    def forward(self, x):
-        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
-        return x / norm.clamp(min=self.eps) * self.g
-
-
-class RMSNorm(nn.Module):
-    def __init__(self, dim, eps=1e-8):
-        super().__init__()
-        self.scale = dim ** -0.5
-        self.eps = eps
-        self.g = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x):
-        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
-        return x / norm.clamp(min=self.eps) * self.g
-
-
-class Residual(nn.Module):
-    def forward(self, x, residual):
-        return x + residual
-
-
-class GRUGating(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.gru = nn.GRUCell(dim, dim)
-
-    def forward(self, x, residual):
-        gated_output = self.gru(
-            rearrange(x, 'b n d -> (b n) d'),
-            rearrange(residual, 'b n d -> (b n) d')
-        )
-
-        return gated_output.reshape_as(x)
-
-
-# feedforward
-
-class GEGLU(nn.Module):
-    def __init__(self, dim_in, dim_out):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2)
-
-    def forward(self, x):
-        x, gate = self.proj(x).chunk(2, dim=-1)
-        return x * F.gelu(gate)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
-
-        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
-        )
-
-    def forward(self, x):
-        return self.net(x)
-
-
-# attention.
-class Attention(nn.Module):
-    def __init__(
-            self,
-            dim,
-            dim_head=DEFAULT_DIM_HEAD,
-            heads=8,
-            causal=False,
-            mask=None,
-            talking_heads=False,
-            sparse_topk=None,
-            use_entmax15=False,
-            num_mem_kv=0,
-            dropout=0.,
-            on_attn=False
-    ):
-        super().__init__()
-        if use_entmax15:
-            raise NotImplementedError("Check out entmax activation instead of softmax activation!")
-        self.scale = dim_head ** -0.5
-        self.heads = heads
-        self.causal = causal
-        self.mask = mask
-
-        inner_dim = dim_head * heads
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(dim, inner_dim, bias=False)
-        self.dropout = nn.Dropout(dropout)
-
-        # talking heads
-        self.talking_heads = talking_heads
-        if talking_heads:
-            self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
-            self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))
-
-        # explicit topk sparse attention
-        self.sparse_topk = sparse_topk
-
-        # entmax
-        #self.attn_fn = entmax15 if use_entmax15 else F.softmax
-        self.attn_fn = F.softmax
-
-        # add memory key / values
-        self.num_mem_kv = num_mem_kv
-        if num_mem_kv > 0:
-            self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
-            self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
-
-        # attention on attention
-        self.attn_on_attn = on_attn
-        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
-
-    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            rel_pos=None,
-            sinusoidal_emb=None,
-            prev_attn=None,
-            mem=None
-    ):
-        b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
-        kv_input = default(context, x)
-
-        q_input = x
-        k_input = kv_input
-        v_input = kv_input
-
-        if exists(mem):
-            k_input = torch.cat((mem, k_input), dim=-2)
-            v_input = torch.cat((mem, v_input), dim=-2)
-
-        if exists(sinusoidal_emb):
-            # in shortformer, the query would start at a position offset depending on the past cached memory
-            offset = k_input.shape[-2] - q_input.shape[-2]
-            q_input = q_input + sinusoidal_emb(q_input, offset=offset)
-            k_input = k_input + sinusoidal_emb(k_input)
-
-        q = self.to_q(q_input)
-        k = self.to_k(k_input)
-        v = self.to_v(v_input)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
-
-        input_mask = None
-        if any(map(exists, (mask, context_mask))):
-            q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
-            k_mask = q_mask if not exists(context) else context_mask
-            k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
-            q_mask = rearrange(q_mask, 'b i -> b () i ()')
-            k_mask = rearrange(k_mask, 'b j -> b () () j')
-            input_mask = q_mask * k_mask
-
-        if self.num_mem_kv > 0:
-            mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
-            k = torch.cat((mem_k, k), dim=-2)
-            v = torch.cat((mem_v, v), dim=-2)
-            if exists(input_mask):
-                input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
-
-        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
-        mask_value = max_neg_value(dots)
-
-        if exists(prev_attn):
-            dots = dots + prev_attn
-
-        pre_softmax_attn = dots
-
-        if talking_heads:
-            dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
-
-        if exists(rel_pos):
-            dots = rel_pos(dots)
-
-        if exists(input_mask):
-            dots.masked_fill_(~input_mask, mask_value)
-            del input_mask
-
-        if self.causal:
-            i, j = dots.shape[-2:]
-            r = torch.arange(i, device=device)
-            mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
-            mask = F.pad(mask, (j - i, 0), value=False)
-            dots.masked_fill_(mask, mask_value)
-            del mask
-
-        if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]:
-            top, _ = dots.topk(self.sparse_topk, dim=-1)
-            vk = top[..., -1].unsqueeze(-1).expand_as(dots)
-            mask = dots < vk
-            dots.masked_fill_(mask, mask_value)
-            del mask
-
-        attn = self.attn_fn(dots, dim=-1)
-        post_softmax_attn = attn
-
-        attn = self.dropout(attn)
-
-        if talking_heads:
-            attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
-
-        out = einsum('b h i j, b h j d -> b h i d', attn, v)
-        out = rearrange(out, 'b h n d -> b n (h d)')
-
-        intermediates = Intermediates(
-            pre_softmax_attn=pre_softmax_attn,
-            post_softmax_attn=post_softmax_attn
-        )
-
-        return self.to_out(out), intermediates
-
-
-class AttentionLayers(nn.Module):
-    def __init__(
-            self,
-            dim,
-            depth,
-            heads=8,
-            causal=False,
-            cross_attend=False,
-            only_cross=False,
-            use_scalenorm=False,
-            use_rmsnorm=False,
-            use_rezero=False,
-            rel_pos_num_buckets=32,
-            rel_pos_max_distance=128,
-            position_infused_attn=False,
-            custom_layers=None,
-            sandwich_coef=None,
-            par_ratio=None,
-            residual_attn=False,
-            cross_residual_attn=False,
-            macaron=False,
-            pre_norm=True,
-            gate_residual=False,
-            **kwargs
-    ):
-        super().__init__()
-        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
-        attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs)
-
-        dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
-
-        self.dim = dim
-        self.depth = depth
-        self.layers = nn.ModuleList([])
-
-        self.has_pos_emb = position_infused_attn
-        self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
-        self.rotary_pos_emb = always(None)
-
-        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
-        self.rel_pos = None
-
-        self.pre_norm = pre_norm
-
-        self.residual_attn = residual_attn
-        self.cross_residual_attn = cross_residual_attn
-
-        norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
-        norm_class = RMSNorm if use_rmsnorm else norm_class
-        norm_fn = partial(norm_class, dim)
-
-        norm_fn = nn.Identity if use_rezero else norm_fn
-        branch_fn = Rezero if use_rezero else None
-
-        if cross_attend and not only_cross:
-            default_block = ('a', 'c', 'f')
-        elif cross_attend and only_cross:
-            default_block = ('c', 'f')
-        else:
-            default_block = ('a', 'f')
-
-        if macaron:
-            default_block = ('f',) + default_block
-
-        if exists(custom_layers):
-            layer_types = custom_layers
-        elif exists(par_ratio):
-            par_depth = depth * len(default_block)
-            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
-            default_block = tuple(filter(not_equals('f'), default_block))
-            par_attn = par_depth // par_ratio
-            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
-            par_width = (depth_cut + depth_cut // par_attn) // par_attn
-            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
-            par_block = default_block + ('f',) * (par_width - len(default_block))
-            par_head = par_block * par_attn
-            layer_types = par_head + ('f',) * (par_depth - len(par_head))
-        elif exists(sandwich_coef):
-            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
-            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
-        else:
-            layer_types = default_block * depth
-
-        self.layer_types = layer_types
-        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
-
-        for layer_type in self.layer_types:
-            if layer_type == 'a':
-                layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
-            elif layer_type == 'c':
-                layer = Attention(dim, heads=heads, **attn_kwargs)
-            elif layer_type == 'f':
-                layer = FeedForward(dim, **ff_kwargs)
-                layer = layer if not macaron else Scale(0.5, layer)
-            else:
-                raise Exception(f'invalid layer type {layer_type}')
-
-            if isinstance(layer, Attention) and exists(branch_fn):
-                layer = branch_fn(layer)
-
-            if gate_residual:
-                residual_fn = GRUGating(dim)
-            else:
-                residual_fn = Residual()
-
-            self.layers.append(nn.ModuleList([
-                norm_fn(),
-                layer,
-                residual_fn
-            ]))
-
-    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            mems=None,
-            return_hiddens=False
-    ):
-        hiddens = []
-        intermediates = []
-        prev_attn = None
-        prev_cross_attn = None
-
-        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
-
-        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
-            is_last = ind == (len(self.layers) - 1)
-
-            if layer_type == 'a':
-                hiddens.append(x)
-                layer_mem = mems.pop(0)
-
-            residual = x
-
-            if self.pre_norm:
-                x = norm(x)
-
-            if layer_type == 'a':
-                out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
-                                   prev_attn=prev_attn, mem=layer_mem)
-            elif layer_type == 'c':
-                out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
-            elif layer_type == 'f':
-                out = block(x)
-
-            x = residual_fn(out, residual)
-
-            if layer_type in ('a', 'c'):
-                intermediates.append(inter)
-
-            if layer_type == 'a' and self.residual_attn:
-                prev_attn = inter.pre_softmax_attn
-            elif layer_type == 'c' and self.cross_residual_attn:
-                prev_cross_attn = inter.pre_softmax_attn
-
-            if not self.pre_norm and not is_last:
-                x = norm(x)
-
-        if return_hiddens:
-            intermediates = LayerIntermediates(
-                hiddens=hiddens,
-                attn_intermediates=intermediates
-            )
-
-            return x, intermediates
-
-        return x
-
-
-class Encoder(AttentionLayers):
-    def __init__(self, **kwargs):
-        assert 'causal' not in kwargs, 'cannot set causality on encoder'
-        super().__init__(causal=False, **kwargs)
-
-
-
-class TransformerWrapper(nn.Module):
-    def __init__(
-            self,
-            *,
-            num_tokens,
-            max_seq_len,
-            attn_layers,
-            emb_dim=None,
-            max_mem_len=0.,
-            emb_dropout=0.,
-            num_memory_tokens=None,
-            tie_embedding=False,
-            use_pos_emb=True
-    ):
-        super().__init__()
-        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
-
-        dim = attn_layers.dim
-        emb_dim = default(emb_dim, dim)
-
-        self.max_seq_len = max_seq_len
-        self.max_mem_len = max_mem_len
-        self.num_tokens = num_tokens
-
-        self.token_emb = nn.Embedding(num_tokens, emb_dim)
-        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
-                    use_pos_emb and not attn_layers.has_pos_emb) else always(0)
-        self.emb_dropout = nn.Dropout(emb_dropout)
-
-        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
-        self.attn_layers = attn_layers
-        self.norm = nn.LayerNorm(dim)
-
-        self.init_()
-
-        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
-
-        # memory tokens (like [cls]) from Memory Transformers paper
-        num_memory_tokens = default(num_memory_tokens, 0)
-        self.num_memory_tokens = num_memory_tokens
-        if num_memory_tokens > 0:
-            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
-
-            # let funnel encoder know number of memory tokens, if specified
-            if hasattr(attn_layers, 'num_memory_tokens'):
-                attn_layers.num_memory_tokens = num_memory_tokens
-
-    def init_(self):
-        nn.init.normal_(self.token_emb.weight, std=0.02)
-
-    def forward(
-            self,
-            x,
-            return_embeddings=False,
-            mask=None,
-            return_mems=False,
-            return_attn=False,
-            mems=None,
-            **kwargs
-    ):
-        b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
-        x = self.token_emb(x)
-        x += self.pos_emb(x)
-        x = self.emb_dropout(x)
-
-        x = self.project_emb(x)
-
-        if num_mem > 0:
-            mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
-            x = torch.cat((mem, x), dim=1)
-
-            # auto-handle masking after appending memory tokens
-            if exists(mask):
-                mask = F.pad(mask, (num_mem, 0), value=True)
-
-        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
-        x = self.norm(x)
-
-        mem, x = x[:, :num_mem], x[:, num_mem:]
-
-        out = self.to_logits(x) if not return_embeddings else x
-
-        if return_mems:
-            hiddens = intermediates.hiddens
-            new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
-            new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems))
-            return out, new_mems
-
-        if return_attn:
-            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
-            return out, attn_maps
-
-        return out
-
diff --git a/ldm/util.py b/ldm/util.py
index 8ba38853e7a07228cc2c187742b5c45d7359b3f9..dc9e3c48b1924fbc1ac3ecdf7a2192e1a46d9228 100755
--- a/ldm/util.py
+++ b/ldm/util.py
@@ -1,14 +1,12 @@
 import importlib
+import math
 
+import cv2
 import torch
 import numpy as np
-from collections import abc
-from einops import rearrange
-from functools import partial
 
-import multiprocessing as mp
-from threading import Thread
-from queue import Queue
+import os
+from safetensors.torch import load_file
 
 from inspect import isfunction
 from PIL import Image, ImageDraw, ImageFont
@@ -22,7 +20,7 @@ def log_txt_as_img(wh, xc, size=10):
     for bi in range(b):
         txt = Image.new("RGB", wh, color="white")
         draw = ImageDraw.Draw(txt)
-        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        font = ImageFont.truetype('assets/DejaVuSans.ttf', size=size)
         nc = int(40 * (wh[0] / 256))
         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
 
@@ -93,111 +91,110 @@ def get_obj_from_str(string, reload=False):
     return getattr(importlib.import_module(module, package=None), cls)
 
 
-def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
-    # create dummy dataset instance
+checkpoint_dict_replacements = {
+    'cond_stage_model.transformer.text_model.embeddings.': 'cond_stage_model.transformer.embeddings.',
+    'cond_stage_model.transformer.text_model.encoder.': 'cond_stage_model.transformer.encoder.',
+    'cond_stage_model.transformer.text_model.final_layer_norm.': 'cond_stage_model.transformer.final_layer_norm.',
+}
 
-    # run prefetching
-    if idx_to_fn:
-        res = func(data, worker_id=idx)
-    else:
-        res = func(data)
-    Q.put([idx, res])
-    Q.put("Done")
-
-
-def parallel_data_prefetch(
-        func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
-):
-    # if target_data_type not in ["ndarray", "list"]:
-    #     raise ValueError(
-    #         "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
-    #     )
-    if isinstance(data, np.ndarray) and target_data_type == "list":
-        raise ValueError("list expected but function got ndarray.")
-    elif isinstance(data, abc.Iterable):
-        if isinstance(data, dict):
-            print(
-                f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
-            )
-            data = list(data.values())
-        if target_data_type == "ndarray":
-            data = np.asarray(data)
-        else:
-            data = list(data)
-    else:
-        raise TypeError(
-            f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
-        )
 
-    if cpu_intensive:
-        Q = mp.Queue(1000)
-        proc = mp.Process
-    else:
-        Q = Queue(1000)
-        proc = Thread
-    # spawn processes
-    if target_data_type == "ndarray":
-        arguments = [
-            [func, Q, part, i, use_worker_id]
-            for i, part in enumerate(np.array_split(data, n_proc))
-        ]
+def transform_checkpoint_dict_key(k):
+    for text, replacement in checkpoint_dict_replacements.items():
+        if k.startswith(text):
+            k = replacement + k[len(text):]
+
+    return k
+
+
+def get_state_dict_from_checkpoint(pl_sd):
+    pl_sd = pl_sd.pop("state_dict", pl_sd)
+    pl_sd.pop("state_dict", None)
+
+    sd = {}
+    for k, v in pl_sd.items():
+        new_key = transform_checkpoint_dict_key(k)
+
+        if new_key is not None:
+            sd[new_key] = v
+
+    pl_sd.clear()
+    pl_sd.update(sd)
+
+    return pl_sd
+
+
+def read_state_dict(checkpoint_file, print_global_state=False):
+    _, extension = os.path.splitext(checkpoint_file)
+    if extension.lower() == ".safetensors":
+        pl_sd = load_file(checkpoint_file, device='cpu')
     else:
-        step = (
-            int(len(data) / n_proc + 1)
-            if len(data) % n_proc != 0
-            else int(len(data) / n_proc)
-        )
-        arguments = [
-            [func, Q, part, i, use_worker_id]
-            for i, part in enumerate(
-                [data[i: i + step] for i in range(0, len(data), step)]
-            )
-        ]
-    processes = []
-    for i in range(n_proc):
-        p = proc(target=_do_parallel_data_prefetch, args=arguments[i])
-        processes += [p]
-
-    # start processes
-    print(f"Start prefetching...")
-    import time
-
-    start = time.time()
-    gather_res = [[] for _ in range(n_proc)]
-    try:
-        for p in processes:
-            p.start()
-
-        k = 0
-        while k < n_proc:
-            # get result
-            res = Q.get()
-            if res == "Done":
-                k += 1
-            else:
-                gather_res[res[0]] = res[1]
-
-    except Exception as e:
-        print("Exception: ", e)
-        for p in processes:
-            p.terminate()
-
-        raise e
-    finally:
-        for p in processes:
-            p.join()
-        print(f"Prefetching complete. [{time.time() - start} sec.]")
-
-    if target_data_type == 'ndarray':
-        if not isinstance(gather_res[0], np.ndarray):
-            return np.concatenate([np.asarray(r) for r in gather_res], axis=0)
-
-        # order outputs
-        return np.concatenate(gather_res, axis=0)
-    elif target_data_type == 'list':
-        out = []
-        for r in gather_res:
-            out.extend(r)
-        return out
+        pl_sd = torch.load(checkpoint_file, map_location='cpu')
+
+    if print_global_state and "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+
+    sd = get_state_dict_from_checkpoint(pl_sd)
+    return sd
+
+
+def load_model_from_config(config, ckpt, vae_ckpt=None, verbose=False):
+    print(f"Loading model from {ckpt}")
+    sd = read_state_dict(ckpt)
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    if 'anything' in ckpt.lower() and vae_ckpt is None:
+        vae_ckpt = 'models/anything-v4.0.vae.pt'
+
+    if vae_ckpt is not None and vae_ckpt != 'None':
+        print(f"Loading vae model from {vae_ckpt}")
+        vae_sd = torch.load(vae_ckpt, map_location="cpu")
+        if "global_step" in vae_sd:
+            print(f"Global Step: {vae_sd['global_step']}")
+        sd = vae_sd["state_dict"]
+        m, u = model.first_stage_model.load_state_dict(sd, strict=False)
+        if len(m) > 0 and verbose:
+            print("missing keys:")
+            print(m)
+        if len(u) > 0 and verbose:
+            print("unexpected keys:")
+            print(u)
+
+    model.cuda()
+    model.eval()
+    return model
+
+
+def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
+    h, w = image.shape[:2]
+    if resize_short_edge is not None:
+        k = resize_short_edge / min(h, w)
     else:
-        return gather_res
+        k = max_resolution / (h * w)
+        k = k**0.5
+    h = int(np.round(h * k / 64)) * 64
+    w = int(np.round(w * k / 64)) * 64
+    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
+    return image
+
+
+# make uc and prompt shapes match via padding for long prompts
+null_cond = None
+
+def fix_cond_shapes(model, prompt_condition, uc):
+    if uc is None:
+        return prompt_condition, uc
+    global null_cond
+    if null_cond is None:
+        null_cond = model.get_learned_conditioning([""])
+    while prompt_condition.shape[1] > uc.shape[1]:
+        uc = torch.cat((uc, null_cond.repeat((uc.shape[0], 1, 1))), axis=1)
+    while prompt_condition.shape[1] < uc.shape[1]:
+        prompt_condition = torch.cat((prompt_condition, null_cond.repeat((prompt_condition.shape[0], 1, 1))), axis=1)
+    return prompt_condition, uc
diff --git a/models/table5_pidinet.pth b/models/table5_pidinet.pth
deleted file mode 100644
index 1ceba1de87e7bb3c81961b80acbb3a106ca249c0..0000000000000000000000000000000000000000
--- a/models/table5_pidinet.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:80860ac267258b5f27486e0ef152a211d0b08120f62aeb185a050acc30da486c
-size 2871148
diff --git a/requirements.txt b/requirements.txt
index 614404fca39c73a052e7318697032e762d5105af..22a11291fd310d712100382ddbb040399b64d94e 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,19 @@
-torch==1.13.0
-torchvision
 transformers==4.19.2
 diffusers==0.11.1
 invisible_watermark==0.1.5
 basicsr==1.4.2
 einops==0.6.0
 omegaconf==2.3.0
-taming-transformers-rom1504==0.0.6
-pytorch_lightning==1.8.6
-clip==0.2.0
+pytorch_lightning==1.5.9
 kornia==0.6.8
-
-
-openmim
-mmpose
-mmdet
-psutil
-blobfile
-timm
\ No newline at end of file
+gradio
+opencv-python
+pudb
+imageio
+imageio-ffmpeg
+k-diffusion
+webdataset
+open-clip-torch
+kornia
+safetensors
+timm
diff --git a/style.css b/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..c4739b4ea5fc35e774a049e3dacc443f7f0eac19
--- /dev/null
+++ b/style.css
@@ -0,0 +1,3 @@
+h1 {
+  text-align: center;
+}
diff --git a/test_adapter.py b/test_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa8f7ae0cd5817eac836b3ab66d51480aa7bede4
--- /dev/null
+++ b/test_adapter.py
@@ -0,0 +1,80 @@
+import os
+
+import cv2
+import torch
+from basicsr.utils import tensor2img
+from pytorch_lightning import seed_everything
+from torch import autocast
+
+from ldm.inference_base import (diffusion_inference, get_adapters, get_base_argument_parser, get_sd_models)
+from ldm.modules.extra_condition import api
+from ldm.modules.extra_condition.api import (ExtraCondition, get_adapter_feature, get_cond_model)
+
+torch.set_grad_enabled(False)
+
+
+def main():
+    supported_cond = [e.name for e in ExtraCondition]
+    parser = get_base_argument_parser()
+    parser.add_argument(
+        '--which_cond',
+        type=str,
+        required=True,
+        choices=supported_cond,
+        help='which condition modality you want to test',
+    )
+    opt = parser.parse_args()
+    which_cond = opt.which_cond
+    if opt.outdir is None:
+        opt.outdir = f'outputs/test-{which_cond}'
+    os.makedirs(opt.outdir, exist_ok=True)
+    if opt.resize_short_edge is None:
+        print(f"you don't specify the resize_short_edge, so the maximum resolution is set to {opt.max_resolution}")
+    opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    # supports two test modes: single-image test, and batch test (through a txt file)
+    if opt.prompt.endswith('.txt'):
+        assert opt.prompt.endswith('.txt')
+        image_paths = []
+        prompts = []
+        with open(opt.prompt, 'r') as f:
+            lines = f.readlines()
+            for line in lines:
+                line = line.strip()
+                image_paths.append(line.split('; ')[0])
+                prompts.append(line.split('; ')[1])
+    else:
+        image_paths = [opt.cond_path]
+        prompts = [opt.prompt]
+    print(image_paths)
+
+    # prepare models
+    sd_model, sampler = get_sd_models(opt)
+    adapter = get_adapters(opt, getattr(ExtraCondition, which_cond))
+    cond_model = None
+    if opt.cond_inp_type == 'image':
+        cond_model = get_cond_model(opt, getattr(ExtraCondition, which_cond))
+
+    process_cond_module = getattr(api, f'get_cond_{which_cond}')
+
+    # inference
+    with torch.inference_mode(), \
+            sd_model.ema_scope(), \
+            autocast('cuda'):
+        for test_idx, (cond_path, prompt) in enumerate(zip(image_paths, prompts)):
+            seed_everything(opt.seed)
+            for v_idx in range(opt.n_samples):
+                # seed_everything(opt.seed+v_idx+test_idx)
+                cond = process_cond_module(opt, cond_path, opt.cond_inp_type, cond_model)
+
+                base_count = len(os.listdir(opt.outdir)) // 2
+                cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_{which_cond}.png'), tensor2img(cond))
+
+                adapter_features, append_to_context = get_adapter_feature(cond, adapter)
+                opt.prompt = prompt
+                result = diffusion_inference(opt, sd_model, sampler, adapter_features, append_to_context)
+                cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_result.png'), tensor2img(result))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/test_composable_adapters.py b/test_composable_adapters.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e814e949c381d096581d6b46029649be982a22e
--- /dev/null
+++ b/test_composable_adapters.py
@@ -0,0 +1,101 @@
+import cv2
+import os
+import torch
+from pytorch_lightning import seed_everything
+from torch import autocast
+
+from basicsr.utils import tensor2img
+from ldm.inference_base import diffusion_inference, get_adapters, get_base_argument_parser, get_sd_models
+from ldm.modules.extra_condition import api
+from ldm.modules.extra_condition.api import ExtraCondition, get_adapter_feature, get_cond_model
+
+torch.set_grad_enabled(False)
+
+
+def main():
+    supported_cond = [e.name for e in ExtraCondition]
+    parser = get_base_argument_parser()
+    for cond_name in supported_cond:
+        parser.add_argument(
+            f'--{cond_name}_path',
+            type=str,
+            default=None,
+            help=f'condition image path for {cond_name}',
+        )
+        parser.add_argument(
+            f'--{cond_name}_inp_type',
+            type=str,
+            default='image',
+            help=f'the type of the input condition image, can be image or {cond_name}',
+            choices=['image', cond_name],
+        )
+        parser.add_argument(
+            f'--{cond_name}_adapter_ckpt',
+            type=str,
+            default=None,
+            help=f'path to checkpoint of the {cond_name} adapter, '
+                 f'if {cond_name}_path is not None, this should not be None too',
+        )
+        parser.add_argument(
+            f'--{cond_name}_weight',
+            type=float,
+            default=1.0,
+            help=f'the {cond_name} adapter features are multiplied by the {cond_name}_weight and then summed up together',
+        )
+    opt = parser.parse_args()
+
+    # process argument
+    activated_conds = []
+    cond_paths = []
+    adapter_ckpts = []
+    for cond_name in supported_cond:
+        if getattr(opt, f'{cond_name}_path') is None:
+            continue
+        assert getattr(opt, f'{cond_name}_adapter_ckpt') is not None, f'you should specify the {cond_name}_adapter_ckpt'
+        activated_conds.append(cond_name)
+        cond_paths.append(getattr(opt, f'{cond_name}_path'))
+        adapter_ckpts.append(getattr(opt, f'{cond_name}_adapter_ckpt'))
+    assert len(activated_conds) != 0, 'you did not input any condition'
+
+    if opt.outdir is None:
+        opt.outdir = f'outputs/test-composable-adapters'
+    os.makedirs(opt.outdir, exist_ok=True)
+    if opt.resize_short_edge is None:
+        print(f"you don't specify the resize_short_edge, so the maximum resolution is set to {opt.max_resolution}")
+    opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    # prepare models
+    adapters = []
+    cond_models = []
+    cond_inp_types = []
+    process_cond_modules = []
+    for cond_name in activated_conds:
+        adapters.append(get_adapters(opt, getattr(ExtraCondition, cond_name)))
+        cond_inp_type = getattr(opt, f'{cond_name}_inp_type', 'image')
+        if cond_inp_type == 'image':
+            cond_models.append(get_cond_model(opt, getattr(ExtraCondition, cond_name)))
+        else:
+            cond_models.append(None)
+        cond_inp_types.append(cond_inp_type)
+        process_cond_modules.append(getattr(api, f'get_cond_{cond_name}'))
+    sd_model, sampler = get_sd_models(opt)
+
+    # inference
+    with torch.inference_mode(), \
+            sd_model.ema_scope(), \
+            autocast('cuda'):
+        seed_everything(opt.seed)
+        conds = []
+        for cond_idx, cond_name in enumerate(activated_conds):
+            conds.append(process_cond_modules[cond_idx](
+                opt, cond_paths[cond_idx], cond_inp_types[cond_idx], cond_models[cond_idx],
+            ))
+        adapter_features, append_to_context = get_adapter_feature(conds, adapters)
+        for v_idx in range(opt.n_samples):
+            result = diffusion_inference(opt, sd_model, sampler, adapter_features, append_to_context)
+            base_count = len(os.listdir(opt.outdir))
+            cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_result.png'), tensor2img(result))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/train_depth.py b/train_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..af9a203bdd8b0904440bdcc55f2127c26aab7ebd
--- /dev/null
+++ b/train_depth.py
@@ -0,0 +1,281 @@
+import argparse
+import logging
+import os
+import os.path as osp
+import torch
+from basicsr.utils import (get_env_info, get_root_logger, get_time_str,
+                           scandir)
+from basicsr.utils.options import copy_opt_file, dict2str
+from omegaconf import OmegaConf
+
+from ldm.data.dataset_depth import DepthDataset
+from basicsr.utils.dist_util import get_dist_info, init_dist, master_only
+from ldm.modules.encoders.adapter import Adapter
+from ldm.util import load_model_from_config
+
+
+@master_only
+def mkdir_and_rename(path):
+    """mkdirs. If path exists, rename it with timestamp and create a new one.
+
+    Args:
+        path (str): Folder path.
+    """
+    if osp.exists(path):
+        new_name = path + '_archived_' + get_time_str()
+        print(f'Path already exists. Rename it to {new_name}', flush=True)
+        os.rename(path, new_name)
+    os.makedirs(path, exist_ok=True)
+    os.makedirs(osp.join(path, 'models'))
+    os.makedirs(osp.join(path, 'training_states'))
+    os.makedirs(osp.join(path, 'visualization'))
+
+
+def load_resume_state(opt):
+    resume_state_path = None
+    if opt.auto_resume:
+        state_path = osp.join('experiments', opt.name, 'training_states')
+        if osp.isdir(state_path):
+            states = list(scandir(state_path, suffix='state', recursive=False, full_path=False))
+            if len(states) != 0:
+                states = [float(v.split('.state')[0]) for v in states]
+                resume_state_path = osp.join(state_path, f'{max(states):.0f}.state')
+                opt.resume_state_path = resume_state_path
+
+    if resume_state_path is None:
+        resume_state = None
+    else:
+        device_id = torch.cuda.current_device()
+        resume_state = torch.load(resume_state_path, map_location=lambda storage, loc: storage.cuda(device_id))
+    return resume_state
+
+
+def parsr_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bsize",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=10000,
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--plms",
+        action='store_true',
+        help="use plms sampling",
+    )
+    parser.add_argument(
+        "--auto_resume",
+        action='store_true',
+        help="use plms sampling",
+    )
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="models/sd-v1-4.ckpt",
+        help="path to checkpoint of model",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="configs/stable-diffusion/sd-v1-train.yaml",
+        help="path to config which constructs model",
+    )
+    parser.add_argument(
+        "--name",
+        type=str,
+        default="train_depth",
+        help="experiment name",
+    )
+    parser.add_argument(
+        "--print_fq",
+        type=int,
+        default=100,
+        help="path to config which constructs model",
+    )
+    parser.add_argument(
+        "--H",
+        type=int,
+        default=512,
+        help="image height, in pixel space",
+    )
+    parser.add_argument(
+        "--W",
+        type=int,
+        default=512,
+        help="image width, in pixel space",
+    )
+    parser.add_argument(
+        "--C",
+        type=int,
+        default=4,
+        help="latent channels",
+    )
+    parser.add_argument(
+        "--f",
+        type=int,
+        default=8,
+        help="downsampling factor",
+    )
+    parser.add_argument(
+        "--sample_steps",
+        type=int,
+        default=50,
+        help="number of ddim sampling steps",
+    )
+    parser.add_argument(
+        "--n_samples",
+        type=int,
+        default=1,
+        help="how many samples to produce for each given prompt. A.k.a. batch size",
+    )
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=7.5,
+        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+    )
+    parser.add_argument(
+        "--gpus",
+        default=[0, 1, 2, 3],
+        help="gpu idx",
+    )
+    parser.add_argument(
+        '--local_rank',
+        default=0,
+        type=int,
+        help='node rank for distributed training'
+    )
+    parser.add_argument(
+        '--launcher',
+        default='pytorch',
+        type=str,
+        help='node rank for distributed training'
+    )
+    opt = parser.parse_args()
+    return opt
+
+
+def main():
+    opt = parsr_args()
+    config = OmegaConf.load(f"{opt.config}")
+
+    # distributed setting
+    init_dist(opt.launcher)
+    torch.backends.cudnn.benchmark = True
+    device = 'cuda'
+    torch.cuda.set_device(opt.local_rank)
+
+    # dataset
+    train_dataset = DepthDataset('datasets/laion_depth_meta_v1.txt')
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=opt.bsize,
+        shuffle=(train_sampler is None),
+        num_workers=opt.num_workers,
+        pin_memory=True,
+        sampler=train_sampler)
+
+    # stable diffusion
+    model = load_model_from_config(config, f"{opt.ckpt}").to(device)
+
+    # depth encoder
+    model_ad = Adapter(cin=3 * 64, channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False).to(
+        device)
+
+    # to gpus
+    model_ad = torch.nn.parallel.DistributedDataParallel(
+        model_ad,
+        device_ids=[opt.local_rank],
+        output_device=opt.local_rank)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[opt.local_rank],
+        output_device=opt.local_rank)
+
+    # optimizer
+    params = list(model_ad.parameters())
+    optimizer = torch.optim.AdamW(params, lr=config['training']['lr'])
+
+    experiments_root = osp.join('experiments', opt.name)
+
+    # resume state
+    resume_state = load_resume_state(opt)
+    if resume_state is None:
+        mkdir_and_rename(experiments_root)
+        start_epoch = 0
+        current_iter = 0
+        # WARNING: should not use get_root_logger in the above codes, including the called functions
+        # Otherwise the logger will not be properly initialized
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+    else:
+        # WARNING: should not use get_root_logger in the above codes, including the called functions
+        # Otherwise the logger will not be properly initialized
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+        resume_optimizers = resume_state['optimizers']
+        optimizer.load_state_dict(resume_optimizers)
+        logger.info(f"Resuming training from epoch: {resume_state['epoch']}, " f"iter: {resume_state['iter']}.")
+        start_epoch = resume_state['epoch']
+        current_iter = resume_state['iter']
+
+    # copy the yml file to the experiment root
+    copy_opt_file(opt.config, experiments_root)
+
+    # training
+    logger.info(f'Start training from epoch: {start_epoch}, iter: {current_iter}')
+    for epoch in range(start_epoch, opt.epochs):
+        train_dataloader.sampler.set_epoch(epoch)
+        # train
+        for _, data in enumerate(train_dataloader):
+            current_iter += 1
+            with torch.no_grad():
+                c = model.module.get_learned_conditioning(data['sentence'])
+                z = model.module.encode_first_stage((data['im'] * 2 - 1.).to(device))
+                z = model.module.get_first_stage_encoding(z)
+
+            optimizer.zero_grad()
+            model.zero_grad()
+            features_adapter = model_ad(data['depth'].to(device))
+            l_pixel, loss_dict = model(z, c=c, features_adapter=features_adapter)
+            l_pixel.backward()
+            optimizer.step()
+
+            if (current_iter + 1) % opt.print_fq == 0:
+                logger.info(loss_dict)
+
+            # save checkpoint
+            rank, _ = get_dist_info()
+            if (rank == 0) and ((current_iter + 1) % config['training']['save_freq'] == 0):
+                save_filename = f'model_ad_{current_iter + 1}.pth'
+                save_path = os.path.join(experiments_root, 'models', save_filename)
+                save_dict = {}
+                state_dict = model_ad.state_dict()
+                for key, param in state_dict.items():
+                    if key.startswith('module.'):  # remove unnecessary 'module.'
+                        key = key[7:]
+                    save_dict[key] = param.cpu()
+                torch.save(save_dict, save_path)
+                # save state
+                state = {'epoch': epoch, 'iter': current_iter + 1, 'optimizers': optimizer.state_dict()}
+                save_filename = f'{current_iter + 1}.state'
+                save_path = os.path.join(experiments_root, 'training_states', save_filename)
+                torch.save(state, save_path)
+
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()
diff --git a/train_seg.py b/train_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ed0724ef757a93e9f9fdd4ef3ada4a0203f906
--- /dev/null
+++ b/train_seg.py
@@ -0,0 +1,372 @@
+import cv2
+import torch
+import os
+from basicsr.utils import img2tensor, tensor2img, scandir, get_time_str, get_root_logger, get_env_info
+from ldm.data.dataset_coco import dataset_coco_mask_color
+import argparse
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.models.diffusion.dpm_solver import DPMSolverSampler
+from omegaconf import OmegaConf
+from ldm.util import instantiate_from_config
+from ldm.modules.encoders.adapter import Adapter
+from PIL import Image
+import numpy as np
+import torch.nn as nn
+import matplotlib.pyplot as plt
+import time
+import os.path as osp
+from basicsr.utils.options import copy_opt_file, dict2str
+import logging
+from dist_util import init_dist, master_only, get_bare_model, get_dist_info
+
+def load_model_from_config(config, ckpt, verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    if "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+    sd = pl_sd["state_dict"]
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    model.cuda()
+    model.eval()
+    return model
+
+@master_only
+def mkdir_and_rename(path):
+    """mkdirs. If path exists, rename it with timestamp and create a new one.
+
+    Args:
+        path (str): Folder path.
+    """
+    if osp.exists(path):
+        new_name = path + '_archived_' + get_time_str()
+        print(f'Path already exists. Rename it to {new_name}', flush=True)
+        os.rename(path, new_name)
+    os.makedirs(path, exist_ok=True)
+    os.makedirs(osp.join(experiments_root, 'models'))
+    os.makedirs(osp.join(experiments_root, 'training_states'))
+    os.makedirs(osp.join(experiments_root, 'visualization'))
+
+def load_resume_state(opt):
+    resume_state_path = None
+    if opt.auto_resume:
+        state_path = osp.join('experiments', opt.name, 'training_states')
+        if osp.isdir(state_path):
+            states = list(scandir(state_path, suffix='state', recursive=False, full_path=False))
+            if len(states) != 0:
+                states = [float(v.split('.state')[0]) for v in states]
+                resume_state_path = osp.join(state_path, f'{max(states):.0f}.state')
+                opt.resume_state_path = resume_state_path
+    # else:
+    #     if opt['path'].get('resume_state'):
+    #         resume_state_path = opt['path']['resume_state']
+
+    if resume_state_path is None:
+        resume_state = None
+    else:
+        device_id = torch.cuda.current_device()
+        resume_state = torch.load(resume_state_path, map_location=lambda storage, loc: storage.cuda(device_id))
+        # check_resume(opt, resume_state['iter'])
+    return resume_state
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--bsize",
+    type=int,
+    default=8,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--epochs",
+    type=int,
+    default=10000,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--num_workers",
+    type=int,
+    default=8,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--use_shuffle",
+    type=bool,
+    default=True,
+    help="the prompt to render"
+)
+parser.add_argument(
+        "--dpm_solver",
+        action='store_true',
+        help="use dpm_solver sampling",
+)
+parser.add_argument(
+        "--plms",
+        action='store_true',
+        help="use plms sampling",
+)
+parser.add_argument(
+        "--auto_resume",
+        action='store_true',
+        help="use plms sampling",
+)
+parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="ckp/sd-v1-4.ckpt",
+        help="path to checkpoint of model",
+)
+parser.add_argument(
+        "--config",
+        type=str,
+        default="configs/stable-diffusion/train_mask.yaml",
+        help="path to config which constructs model",
+)
+parser.add_argument(
+        "--print_fq",
+        type=int,
+        default=100,
+        help="path to config which constructs model",
+)
+parser.add_argument(
+        "--H",
+        type=int,
+        default=512,
+        help="image height, in pixel space",
+)
+parser.add_argument(
+    "--W",
+    type=int,
+    default=512,
+    help="image width, in pixel space",
+)
+parser.add_argument(
+    "--C",
+    type=int,
+    default=4,
+    help="latent channels",
+)
+parser.add_argument(
+    "--f",
+    type=int,
+    default=8,
+    help="downsampling factor",
+)
+parser.add_argument(
+        "--ddim_steps",
+        type=int,
+        default=50,
+        help="number of ddim sampling steps",
+)
+parser.add_argument(
+        "--n_samples",
+        type=int,
+        default=1,
+        help="how many samples to produce for each given prompt. A.k.a. batch size",
+)
+parser.add_argument(
+        "--ddim_eta",
+        type=float,
+        default=0.0,
+        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
+)
+parser.add_argument(
+        "--scale",
+        type=float,
+        default=7.5,
+        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+)
+parser.add_argument(
+        "--gpus",
+        default=[0,1,2,3],
+        help="gpu idx",
+)
+parser.add_argument(
+        '--local_rank', 
+        default=0, 
+        type=int,
+        help='node rank for distributed training'
+)
+parser.add_argument(
+        '--launcher', 
+        default='pytorch', 
+        type=str,
+        help='node rank for distributed training'
+)
+opt = parser.parse_args()
+
+if __name__ == '__main__':
+    config = OmegaConf.load(f"{opt.config}")
+    opt.name = config['name']
+    
+    # distributed setting
+    init_dist(opt.launcher)
+    torch.backends.cudnn.benchmark = True
+    device='cuda'
+    torch.cuda.set_device(opt.local_rank)
+
+    # dataset
+    path_json_train = 'coco_stuff/mask/annotations/captions_train2017.json'
+    path_json_val = 'coco_stuff/mask/annotations/captions_val2017.json'
+    train_dataset = dataset_coco_mask_color(path_json_train, 
+    root_path_im='coco/train2017',
+    root_path_mask='coco_stuff/mask/train2017_color',
+    image_size=512
+    )
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    val_dataset = dataset_coco_mask_color(path_json_val, 
+    root_path_im='coco/val2017',
+    root_path_mask='coco_stuff/mask/val2017_color',
+    image_size=512
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+            train_dataset,
+            batch_size=opt.bsize,
+            shuffle=(train_sampler is None),
+            num_workers=opt.num_workers,
+            pin_memory=True,
+            sampler=train_sampler)
+    val_dataloader = torch.utils.data.DataLoader(
+            val_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=1,
+            pin_memory=False)
+
+    # stable diffusion
+    model = load_model_from_config(config, f"{opt.ckpt}").to(device)
+    
+    # sketch encoder
+    model_ad = Adapter(cin=int(3*64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False).to(device)
+
+
+    # to gpus
+    model_ad = torch.nn.parallel.DistributedDataParallel(
+        model_ad,
+        device_ids=[opt.local_rank], 
+        output_device=opt.local_rank)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[opt.local_rank], 
+        output_device=opt.local_rank)
+        # device_ids=[torch.cuda.current_device()])
+
+    # optimizer
+    params = list(model_ad.parameters())
+    optimizer = torch.optim.AdamW(params, lr=config['training']['lr'])
+
+    experiments_root = osp.join('experiments', opt.name)
+
+    # resume state
+    resume_state = load_resume_state(opt)
+    if resume_state is None:
+        mkdir_and_rename(experiments_root)
+        start_epoch = 0
+        current_iter = 0
+        # WARNING: should not use get_root_logger in the above codes, including the called functions
+        # Otherwise the logger will not be properly initialized
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+    else:
+        # WARNING: should not use get_root_logger in the above codes, including the called functions
+        # Otherwise the logger will not be properly initialized
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+        resume_optimizers = resume_state['optimizers']
+        optimizer.load_state_dict(resume_optimizers)
+        logger.info(f"Resuming training from epoch: {resume_state['epoch']}, " f"iter: {resume_state['iter']}.")
+        start_epoch = resume_state['epoch']
+        current_iter = resume_state['iter']
+
+    # copy the yml file to the experiment root
+    copy_opt_file(opt.config, experiments_root)
+
+    # training
+    logger.info(f'Start training from epoch: {start_epoch}, iter: {current_iter}')
+    for epoch in range(start_epoch, opt.epochs):
+        train_dataloader.sampler.set_epoch(epoch)
+        # train
+        for _, data in enumerate(train_dataloader):
+            current_iter += 1
+            with torch.no_grad():
+                c = model.module.get_learned_conditioning(data['sentence'])
+                z = model.module.encode_first_stage((data['im']*2-1.).cuda(non_blocking=True))
+                z = model.module.get_first_stage_encoding(z)
+
+            mask = data['mask']
+            optimizer.zero_grad()
+            model.zero_grad()
+            features_adapter = model_ad(mask)
+            l_pixel, loss_dict = model(z, c=c, features_adapter = features_adapter)
+            l_pixel.backward()
+            optimizer.step()
+
+            if (current_iter+1)%opt.print_fq == 0:
+                logger.info(loss_dict)
+            
+            # save checkpoint
+            rank, _ = get_dist_info()
+            if (rank==0) and ((current_iter+1)%config['training']['save_freq'] == 0):
+                save_filename = f'model_ad_{current_iter+1}.pth'
+                save_path = os.path.join(experiments_root, 'models', save_filename)
+                save_dict = {}
+                model_ad_bare = get_bare_model(model_ad)
+                state_dict = model_ad_bare.state_dict()
+                for key, param in state_dict.items():
+                    if key.startswith('module.'):  # remove unnecessary 'module.'
+                        key = key[7:]
+                    save_dict[key] = param.cpu()
+                torch.save(save_dict, save_path)
+            # save state
+                state = {'epoch': epoch, 'iter': current_iter+1, 'optimizers': optimizer.state_dict()}
+                save_filename = f'{current_iter+1}.state'
+                save_path = os.path.join(experiments_root, 'training_states', save_filename)
+                torch.save(state, save_path)
+
+        # val
+        rank, _ = get_dist_info()
+        if rank==0:
+            for data in val_dataloader:
+                with torch.no_grad():
+                    if opt.dpm_solver:
+                        sampler = DPMSolverSampler(model.module)
+                    elif opt.plms:
+                        sampler = PLMSSampler(model.module)
+                    else:
+                        sampler = DDIMSampler(model.module)
+                    c = model.module.get_learned_conditioning(data['sentence'])
+                    mask = data['mask']
+                    im_mask = tensor2img(mask)
+                    cv2.imwrite(os.path.join(experiments_root, 'visualization', 'mask_%04d.png'%epoch), im_mask)
+                    features_adapter = model_ad(mask)
+                    shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+                    samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
+                                                        conditioning=c,
+                                                        batch_size=opt.n_samples,
+                                                        shape=shape,
+                                                        verbose=False,
+                                                        unconditional_guidance_scale=opt.scale,
+                                                        unconditional_conditioning=model.module.get_learned_conditioning(opt.n_samples * [""]),
+                                                        eta=opt.ddim_eta,
+                                                        x_T=None,
+                                                        features_adapter=features_adapter)
+                    x_samples_ddim = model.module.decode_first_stage(samples_ddim)
+                    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                    x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+                    for id_sample, x_sample in enumerate(x_samples_ddim):
+                        x_sample = 255.*x_sample
+                        img = x_sample.astype(np.uint8)
+                        img = cv2.putText(img.copy(), data['sentence'][0], (10,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 2)
+                        cv2.imwrite(os.path.join(experiments_root, 'visualization', 'sample_e%04d_s%04d.png'%(epoch, id_sample)), img[:,:,::-1])
+                    break
diff --git a/train_sketch.py b/train_sketch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9ab8d2f0e742f8a0395578b697bbad00415ccfa
--- /dev/null
+++ b/train_sketch.py
@@ -0,0 +1,399 @@
+import argparse
+import logging
+import os
+import os.path as osp
+import time
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+from basicsr.utils import (get_env_info, get_root_logger, get_time_str,
+                           img2tensor, scandir, tensor2img)
+from basicsr.utils.options import copy_opt_file, dict2str
+from omegaconf import OmegaConf
+from PIL import Image
+
+from ldm.data.dataset_coco import dataset_coco_mask_color
+from dist_util import get_bare_model, get_dist_info, init_dist, master_only
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.dpm_solver import DPMSolverSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.modules.encoders.adapter import Adapter
+from ldm.util import instantiate_from_config
+from ldm.modules.structure_condition.model_edge import pidinet
+
+
+def load_model_from_config(config, ckpt, verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    if "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+    sd = pl_sd["state_dict"]
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    model.cuda()
+    model.eval()
+    return model
+
+@master_only
+def mkdir_and_rename(path):
+    """mkdirs. If path exists, rename it with timestamp and create a new one.
+
+    Args:
+        path (str): Folder path.
+    """
+    if osp.exists(path):
+        new_name = path + '_archived_' + get_time_str()
+        print(f'Path already exists. Rename it to {new_name}', flush=True)
+        os.rename(path, new_name)
+    os.makedirs(path, exist_ok=True)
+    os.makedirs(osp.join(experiments_root, 'models'))
+    os.makedirs(osp.join(experiments_root, 'training_states'))
+    os.makedirs(osp.join(experiments_root, 'visualization'))
+
+def load_resume_state(opt):
+    resume_state_path = None
+    if opt.auto_resume:
+        state_path = osp.join('experiments', opt.name, 'training_states')
+        if osp.isdir(state_path):
+            states = list(scandir(state_path, suffix='state', recursive=False, full_path=False))
+            if len(states) != 0:
+                states = [float(v.split('.state')[0]) for v in states]
+                resume_state_path = osp.join(state_path, f'{max(states):.0f}.state')
+                opt.resume_state_path = resume_state_path
+    # else:
+    #     if opt['path'].get('resume_state'):
+    #         resume_state_path = opt['path']['resume_state']
+
+    if resume_state_path is None:
+        resume_state = None
+    else:
+        device_id = torch.cuda.current_device()
+        resume_state = torch.load(resume_state_path, map_location=lambda storage, loc: storage.cuda(device_id))
+        # check_resume(opt, resume_state['iter'])
+    return resume_state
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--bsize",
+    type=int,
+    default=8,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--epochs",
+    type=int,
+    default=10000,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--num_workers",
+    type=int,
+    default=8,
+    help="the prompt to render"
+)
+parser.add_argument(
+    "--use_shuffle",
+    type=bool,
+    default=True,
+    help="the prompt to render"
+)
+parser.add_argument(
+        "--dpm_solver",
+        action='store_true',
+        help="use dpm_solver sampling",
+)
+parser.add_argument(
+        "--plms",
+        action='store_true',
+        help="use plms sampling",
+)
+parser.add_argument(
+        "--auto_resume",
+        action='store_true',
+        help="use plms sampling",
+)
+parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="models/sd-v1-4.ckpt",
+        help="path to checkpoint of model",
+)
+parser.add_argument(
+        "--config",
+        type=str,
+        default="configs/stable-diffusion/train_sketch.yaml",
+        help="path to config which constructs model",
+)
+parser.add_argument(
+        "--print_fq",
+        type=int,
+        default=100,
+        help="path to config which constructs model",
+)
+parser.add_argument(
+        "--H",
+        type=int,
+        default=512,
+        help="image height, in pixel space",
+)
+parser.add_argument(
+    "--W",
+    type=int,
+    default=512,
+    help="image width, in pixel space",
+)
+parser.add_argument(
+    "--C",
+    type=int,
+    default=4,
+    help="latent channels",
+)
+parser.add_argument(
+    "--f",
+    type=int,
+    default=8,
+    help="downsampling factor",
+)
+parser.add_argument(
+        "--ddim_steps",
+        type=int,
+        default=50,
+        help="number of ddim sampling steps",
+)
+parser.add_argument(
+        "--n_samples",
+        type=int,
+        default=1,
+        help="how many samples to produce for each given prompt. A.k.a. batch size",
+)
+parser.add_argument(
+        "--ddim_eta",
+        type=float,
+        default=0.0,
+        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
+)
+parser.add_argument(
+        "--scale",
+        type=float,
+        default=7.5,
+        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+)
+parser.add_argument(
+        "--gpus",
+        default=[0,1,2,3],
+        help="gpu idx",
+)
+parser.add_argument(
+        '--local_rank',
+        default=0,
+        type=int,
+        help='node rank for distributed training'
+)
+parser.add_argument(
+        '--launcher',
+        default='pytorch',
+        type=str,
+        help='node rank for distributed training'
+)
+parser.add_argument(
+        '--l_cond',
+        default=4,
+        type=int,
+        help='number of scales'
+)
+opt = parser.parse_args()
+
+if __name__ == '__main__':
+    config = OmegaConf.load(f"{opt.config}")
+    opt.name = config['name']
+
+    # distributed setting
+    init_dist(opt.launcher)
+    torch.backends.cudnn.benchmark = True
+    device='cuda'
+    torch.cuda.set_device(opt.local_rank)
+
+    # dataset
+    path_json_train = 'coco_stuff/mask/annotations/captions_train2017.json'
+    path_json_val = 'coco_stuff/mask/annotations/captions_val2017.json'
+    train_dataset = dataset_coco_mask_color(path_json_train,
+    root_path_im='coco/train2017',
+    root_path_mask='coco_stuff/mask/train2017_color',
+    image_size=512
+    )
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    val_dataset = dataset_coco_mask_color(path_json_val,
+    root_path_im='coco/val2017',
+    root_path_mask='coco_stuff/mask/val2017_color',
+    image_size=512
+    )
+    train_dataloader = torch.utils.data.DataLoader(
+            train_dataset,
+            batch_size=opt.bsize,
+            shuffle=(train_sampler is None),
+            num_workers=opt.num_workers,
+            pin_memory=True,
+            sampler=train_sampler)
+    val_dataloader = torch.utils.data.DataLoader(
+            val_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=1,
+            pin_memory=False)
+
+    # edge_generator
+    net_G = pidinet()
+    ckp = torch.load('models/table5_pidinet.pth', map_location='cpu')['state_dict']
+    net_G.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
+    net_G.cuda()
+
+    # stable diffusion
+    model = load_model_from_config(config, f"{opt.ckpt}").to(device)
+
+    # sketch encoder
+    model_ad = Adapter(channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False).to(device)
+
+    # to gpus
+    model_ad = torch.nn.parallel.DistributedDataParallel(
+        model_ad,
+        device_ids=[opt.local_rank],
+        output_device=opt.local_rank)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[opt.local_rank],
+        output_device=opt.local_rank)
+        # device_ids=[torch.cuda.current_device()])
+    net_G = torch.nn.parallel.DistributedDataParallel(
+        net_G,
+        device_ids=[opt.local_rank],
+        output_device=opt.local_rank)
+        # device_ids=[torch.cuda.current_device()])
+
+    # optimizer
+    params = list(model_ad.parameters())
+    optimizer = torch.optim.AdamW(params, lr=config['training']['lr'])
+
+    experiments_root = osp.join('experiments', opt.name)
+
+    # resume state
+    resume_state = load_resume_state(opt)
+    if resume_state is None:
+        mkdir_and_rename(experiments_root)
+        start_epoch = 0
+        current_iter = 0
+        # WARNING: do not call get_root_logger in any code above this point, including called functions;
+        # otherwise the logger will not be properly initialized.
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+    else:
+        # WARNING: do not call get_root_logger in any code above this point, including called functions;
+        # otherwise the logger will not be properly initialized.
+        log_file = osp.join(experiments_root, f"train_{opt.name}_{get_time_str()}.log")
+        logger = get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=log_file)
+        logger.info(get_env_info())
+        logger.info(dict2str(config))
+        resume_optimizers = resume_state['optimizers']
+        optimizer.load_state_dict(resume_optimizers)
+        logger.info(f"Resuming training from epoch: {resume_state['epoch']}, " f"iter: {resume_state['iter']}.")
+        start_epoch = resume_state['epoch']
+        current_iter = resume_state['iter']
+
+    # copy the yml file to the experiment root
+    copy_opt_file(opt.config, experiments_root)
+
+
+    # training
+    logger.info(f'Start training from epoch: {start_epoch}, iter: {current_iter}')
+    for epoch in range(start_epoch, opt.epochs):
+        train_dataloader.sampler.set_epoch(epoch)
+        # train
+        for _, data in enumerate(train_dataloader):
+            current_iter += 1
+            with torch.no_grad():
+                edge = net_G(data['im'].cuda(non_blocking=True))[-1]
+                edge = edge>0.5
+                edge = edge.float()
+                c = model.module.get_learned_conditioning(data['sentence'])
+                z = model.module.encode_first_stage((data['im']*2-1.).cuda(non_blocking=True))
+                z = model.module.get_first_stage_encoding(z)
+
+            optimizer.zero_grad()
+            model.zero_grad()
+            features_adapter = model_ad(edge)
+            l_pixel, loss_dict = model(z, c=c, features_adapter = features_adapter)
+            l_pixel.backward()
+            optimizer.step()
+
+            if (current_iter+1)%opt.print_fq == 0:
+                logger.info(loss_dict)
+
+            # save checkpoint
+            rank, _ = get_dist_info()
+            if (rank==0) and ((current_iter+1)%config['training']['save_freq'] == 0):
+                save_filename = f'model_ad_{current_iter+1}.pth'
+                save_path = os.path.join(experiments_root, 'models', save_filename)
+                save_dict = {}
+                model_ad_bare = get_bare_model(model_ad)
+                state_dict = model_ad_bare.state_dict()
+                for key, param in state_dict.items():
+                    if key.startswith('module.'):  # remove unnecessary 'module.'
+                        key = key[7:]
+                    save_dict[key] = param.cpu()
+                torch.save(save_dict, save_path)
+                # save the training state (epoch, iteration, optimizer state) used when resuming
+                state = {'epoch': epoch, 'iter': current_iter+1, 'optimizers': optimizer.state_dict()}
+                save_filename = f'{current_iter+1}.state'
+                save_path = os.path.join(experiments_root, 'training_states', save_filename)
+                torch.save(state, save_path)
+
+        # val
+        rank, _ = get_dist_info()
+        if rank==0:
+            for data in val_dataloader:
+                with torch.no_grad():
+                    if opt.dpm_solver:
+                        sampler = DPMSolverSampler(model.module)
+                    elif opt.plms:
+                        sampler = PLMSSampler(model.module)
+                    else:
+                        sampler = DDIMSampler(model.module)
+                    print(data['im'].shape)
+                    c = model.module.get_learned_conditioning(data['sentence'])
+                    edge = net_G(data['im'].cuda(non_blocking=True))[-1]
+                    edge = edge>0.5
+                    edge = edge.float()
+                    im_edge = tensor2img(edge)
+                    cv2.imwrite(os.path.join(experiments_root, 'visualization', 'edge_%04d.png'%epoch), im_edge)
+                    features_adapter = model_ad(edge)
+                    shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+                    samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
+                                                        conditioning=c,
+                                                        batch_size=opt.n_samples,
+                                                        shape=shape,
+                                                        verbose=False,
+                                                        unconditional_guidance_scale=opt.scale,
+                                                        unconditional_conditioning=model.module.get_learned_conditioning(opt.n_samples * [""]),
+                                                        eta=opt.ddim_eta,
+                                                        x_T=None,
+                                                        features_adapter=features_adapter)
+                    x_samples_ddim = model.module.decode_first_stage(samples_ddim)
+                    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                    x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+                    for id_sample, x_sample in enumerate(x_samples_ddim):
+                        x_sample = 255.*x_sample
+                        img = x_sample.astype(np.uint8)
+                        img = cv2.putText(img.copy(), data['sentence'][0], (10,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 2)
+                        cv2.imwrite(os.path.join(experiments_root, 'visualization', 'sample_e%04d_s%04d.png'%(epoch, id_sample)), img[:,:,::-1])
+                    break