toto10 committed on
Commit 7b641e8
1 parent: be5e120

4d1c6f1ff2b7f9073e03e5dc47df66713da13a02a88c27243d5c27b1a6b63784

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +16 -0
  2. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/plates_out.jpeg +0 -0
  3. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png +3 -0
  4. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png +3 -0
  5. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002025.png +0 -0
  6. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002035.png +0 -0
  7. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png +3 -0
  8. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png +3 -0
  9. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png +3 -0
  10. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png +3 -0
  11. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png +3 -0
  12. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png +3 -0
  13. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png +3 -0
  14. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png +3 -0
  15. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png +3 -0
  16. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png +3 -0
  17. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png +3 -0
  18. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png +3 -0
  19. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png +3 -0
  20. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png +3 -0
  21. repositories/stable-diffusion-stability-ai/checkpoints/checkpoints.txt +1 -0
  22. repositories/stable-diffusion-stability-ai/configs/karlo/decoder_900M_vit_l.yaml +37 -0
  23. repositories/stable-diffusion-stability-ai/configs/karlo/improved_sr_64_256_1.4B.yaml +27 -0
  24. repositories/stable-diffusion-stability-ai/configs/karlo/prior_1B_vit_l.yaml +21 -0
  25. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-bf16.yaml +71 -0
  26. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-fp32.yaml +70 -0
  27. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml +72 -0
  28. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml +71 -0
  29. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml +80 -0
  30. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml +83 -0
  31. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference-v.yaml +68 -0
  32. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml +67 -0
  33. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inpainting-inference.yaml +158 -0
  34. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-midas-inference.yaml +74 -0
  35. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/x4-upscaling.yaml +76 -0
  36. repositories/stable-diffusion-stability-ai/doc/UNCLIP.MD +88 -0
  37. repositories/stable-diffusion-stability-ai/environment.yaml +29 -0
  38. repositories/stable-diffusion-stability-ai/ldm/__pycache__/util.cpython-310.pyc +0 -0
  39. repositories/stable-diffusion-stability-ai/ldm/data/__init__.py +0 -0
  40. repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/__init__.cpython-310.pyc +0 -0
  41. repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/util.cpython-310.pyc +0 -0
  42. repositories/stable-diffusion-stability-ai/ldm/data/util.py +24 -0
  43. repositories/stable-diffusion-stability-ai/ldm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
  44. repositories/stable-diffusion-stability-ai/ldm/models/autoencoder.py +219 -0
  45. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__init__.py +0 -0
  46. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  47. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
  48. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
  49. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/plms.cpython-310.pyc +0 -0
  50. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/sampling_util.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -46,3 +46,19 @@
  repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png filter=lfs diff=lfs merge=lfs -text
  repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png filter=lfs diff=lfs merge=lfs -text
  repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png filter=lfs diff=lfs merge=lfs -text
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/plates_out.jpeg ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png ADDED

Git LFS Details

  • SHA256: e9bc45418c5c4ded4fe8ef054c6fc85fa23efe9bab4cdbc42d3ec55f2a57bc39
  • Pointer size: 132 Bytes
  • Size of remote file: 1.77 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png ADDED

Git LFS Details

  • SHA256: 83e2cdb18e95cb074db4d6d78dc7c2333936ab641e27945c358b2c4160eeb6da
  • Pointer size: 132 Bytes
  • Size of remote file: 1.54 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002025.png ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002035.png ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png ADDED

Git LFS Details

  • SHA256: ed10e1df0f4c0f83794310e59a77098b4836d96a2b12cc809ddf39e77b1b6c94
  • Pointer size: 132 Bytes
  • Size of remote file: 4.63 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png ADDED

Git LFS Details

  • SHA256: b4a009d112633ac788fbbe7a7176d4002e95407b64672afa8104755534bb4641
  • Pointer size: 132 Bytes
  • Size of remote file: 3.46 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png ADDED

Git LFS Details

  • SHA256: 84adec03ab2e2d54990950a113af97750eab90135596461689efe5ebfa1ebf92
  • Pointer size: 132 Bytes
  • Size of remote file: 3.83 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png ADDED

Git LFS Details

  • SHA256: 8ac938c03b4b554e1c475a4b3c5df50b72a890eec21542fa7911a8ff01bf13f4
  • Pointer size: 132 Bytes
  • Size of remote file: 4.1 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png ADDED

Git LFS Details

  • SHA256: 849b877a80752b4578afc2ece4ea0726768809298ee9a38284ba4e159d0a817c
  • Pointer size: 132 Bytes
  • Size of remote file: 2.17 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png ADDED

Git LFS Details

  • SHA256: 2d31dbbf76633677be3b8eba933e9eec82825925535ef9c557a3003daf16ad42
  • Pointer size: 132 Bytes
  • Size of remote file: 4.37 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png ADDED

Git LFS Details

  • SHA256: 71ca5f77befffa10a2ef6d4b69f8bb721e7ebd7ea03538e2c359dc44f526b0e8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png ADDED

Git LFS Details

  • SHA256: 9fde5a40c512d61e2390e70d9f14b0d33f0af84cbde2dcd9d86e1f9b38072266
  • Pointer size: 132 Bytes
  • Size of remote file: 2.27 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png ADDED

Git LFS Details

  • SHA256: a417aadc1d91b91531ca6bbf89840a36f432d8e9382aaa953610bedce22ff76f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png ADDED

Git LFS Details

  • SHA256: 1d55ba7d103da275b4612976e93f405fcb593f7e6a6fda31f2e180b41c8e4f59
  • Pointer size: 132 Bytes
  • Size of remote file: 2.64 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png ADDED

Git LFS Details

  • SHA256: 920ccf908b7fa5073a7c5cd3f4e109b5e66f7e29517ef5462ca55e931d0b5689
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png ADDED

Git LFS Details

  • SHA256: d85d15bd51b3fa162f2b020ccac5a64b10ced728c6f22dcba183dc65ab6e8b5a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.82 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png ADDED

Git LFS Details

  • SHA256: d4f2aaa8eb3054cda0a6e8577170d09c1494809cceca21973497602e17a22f1e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.16 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png ADDED

Git LFS Details

  • SHA256: fe8231dddcf77ada4b46f6949b4ea7757ff2d006253e1807ba6e1168077aad19
  • Pointer size: 132 Bytes
  • Size of remote file: 3.89 MB
repositories/stable-diffusion-stability-ai/checkpoints/checkpoints.txt ADDED
@@ -0,0 +1 @@
+ Put unCLIP checkpoints here.
repositories/stable-diffusion-stability-ai/configs/karlo/decoder_900M_vit_l.yaml ADDED
@@ -0,0 +1,37 @@
+ model:
+   type: t2i-decoder
+   diffusion_sampler: uniform
+   hparams:
+     image_size: 64
+     num_channels: 320
+     num_res_blocks: 3
+     channel_mult: ''
+     attention_resolutions: 32,16,8
+     num_heads: -1
+     num_head_channels: 64
+     num_heads_upsample: -1
+     use_scale_shift_norm: true
+     dropout: 0.1
+     clip_dim: 768
+     clip_emb_mult: 4
+     text_ctx: 77
+     xf_width: 1536
+     xf_layers: 0
+     xf_heads: 0
+     xf_final_ln: false
+     resblock_updown: true
+     learn_sigma: true
+     text_drop: 0.3
+     clip_emb_type: image
+     clip_emb_drop: 0.1
+     use_plm: true
+
+ diffusion:
+   steps: 1000
+   learn_sigma: true
+   sigma_small: false
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: false
+   rescale_learned_sigmas: true
+   timestep_respacing: ''
repositories/stable-diffusion-stability-ai/configs/karlo/improved_sr_64_256_1.4B.yaml ADDED
@@ -0,0 +1,27 @@
+ model:
+   type: improved_sr_64_256
+   diffusion_sampler: uniform
+   hparams:
+     channels: 320
+     depth: 3
+     channels_multiple:
+       - 1
+       - 2
+       - 3
+       - 4
+     dropout: 0.0
+
+ diffusion:
+   steps: 1000
+   learn_sigma: false
+   sigma_small: true
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: false
+   rescale_learned_sigmas: true
+   timestep_respacing: '7'
+
+
+ sampling:
+   timestep_respacing: '7' # fix
+   clip_denoise: true
repositories/stable-diffusion-stability-ai/configs/karlo/prior_1B_vit_l.yaml ADDED
@@ -0,0 +1,21 @@
+ model:
+   type: prior
+   diffusion_sampler: uniform
+   hparams:
+     text_ctx: 77
+     xf_width: 2048
+     xf_layers: 20
+     xf_heads: 32
+     xf_final_ln: true
+     text_drop: 0.2
+     clip_dim: 768
+
+ diffusion:
+   steps: 1000
+   learn_sigma: false
+   sigma_small: true
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: true
+   rescale_learned_sigmas: false
+   timestep_respacing: ''
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-bf16.yaml ADDED
@@ -0,0 +1,71 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         use_bf16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-fp32.yaml ADDED
@@ -0,0 +1,70 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml ADDED
@@ -0,0 +1,72 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         use_bf16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml ADDED
@@ -0,0 +1,71 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml ADDED
@@ -0,0 +1,80 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+   params:
+     embedding_dropout: 0.25
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 96
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn-adm
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     use_ema: False
+
+     embedder_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+
+     noise_aug_config:
+       target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+       params:
+         timestep_dim: 1024
+         noise_schedule_config:
+           timesteps: 1000
+           beta_schedule: squaredcos_cap_v2
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         num_classes: "sequential"
+         adm_in_channels: 2048
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml ADDED
@@ -0,0 +1,83 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+   params:
+     embedding_dropout: 0.25
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 96
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn-adm
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     use_ema: False
+
+     embedder_config:
+       target: ldm.modules.encoders.modules.ClipImageEmbedder
+       params:
+         model: "ViT-L/14"
+
+     noise_aug_config:
+       target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+       params:
+         clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
+         timestep_dim: 768
+         noise_schedule_config:
+           timesteps: 1000
+           beta_schedule: squaredcos_cap_v2
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         num_classes: "sequential"
+         adm_in_channels: 1536
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference-v.yaml ADDED
@@ -0,0 +1,68 @@
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         use_fp16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml ADDED
@@ -0,0 +1,67 @@
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         use_fp16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
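
Not part of the committed files: a minimal sketch of how an inference config such as v2-inference.yaml is typically consumed in this codebase, pairing OmegaConf with `ldm.util.instantiate_from_config` (the checkpoint filename below is a placeholder).

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

# Load the YAML and build the LatentDiffusion model named by `target` with `params`.
config = OmegaConf.load("repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml")
model = instantiate_from_config(config.model)

# Restore weights from a separately downloaded checkpoint (placeholder path).
state = torch.load("checkpoints/sd21.ckpt", map_location="cpu")
missing, unexpected = model.load_state_dict(state["state_dict"], strict=False)
model.eval()  # use_ema is False in the config, so the plain weights are used for inference
```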
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inpainting-inference.yaml ADDED
@@ -0,0 +1,158 @@
+ model:
+   base_learning_rate: 5.0e-05
+   target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: hybrid
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     finetune_keys: null
+     use_ema: False
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 9
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
+
+ data:
+   target: ldm.data.laion.WebDataModuleFromConfig
+   params:
+     tar_base: null # for concat as in LAION-A
+     p_unsafe_threshold: 0.1
+     filter_word_list: "data/filters.yaml"
+     max_pwatermark: 0.45
+     batch_size: 8
+     num_workers: 6
+     multinode: True
+     min_size: 512
+     train:
+       shards:
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
+       shuffle: 10000
+       image_key: jpg
+       image_transforms:
+       - target: torchvision.transforms.Resize
+         params:
+           size: 512
+           interpolation: 3
+       - target: torchvision.transforms.RandomCrop
+         params:
+           size: 512
+       postprocess:
+         target: ldm.data.laion.AddMask
+         params:
+           mode: "512train-large"
+           p_drop: 0.25
+     # NOTE use enough shards to avoid empty validation loops in workers
+     validation:
+       shards:
+         - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
+       shuffle: 0
+       image_key: jpg
+       image_transforms:
+       - target: torchvision.transforms.Resize
+         params:
+           size: 512
+           interpolation: 3
+       - target: torchvision.transforms.CenterCrop
+         params:
+           size: 512
+       postprocess:
+         target: ldm.data.laion.AddMask
+         params:
+           mode: "512train-large"
+           p_drop: 0.25
+
+ lightning:
+   find_unused_parameters: True
+   modelcheckpoint:
+     params:
+       every_n_train_steps: 5000
+
+   callbacks:
+     metrics_over_trainsteps_checkpoint:
+       params:
+         every_n_train_steps: 10000
+
+     image_logger:
+       target: main.ImageLogger
+       params:
+         enable_autocast: False
+         disabled: False
+         batch_frequency: 1000
+         max_images: 4
+         increase_log_steps: False
+         log_first_step: False
+         log_images_kwargs:
+           use_ema_scope: False
+           inpaint: False
+           plot_progressive_rows: False
+           plot_diffusion_rows: False
+           N: 4
+           unconditional_guidance_scale: 5.0
+           unconditional_guidance_label: [""]
+           ddim_steps: 50 # todo check these out for depth2img,
+           ddim_eta: 0.0 # todo check these out for depth2img,
+
+   trainer:
+     benchmark: True
+     val_check_interval: 5000000
+     num_sanity_val_steps: 0
+     accumulate_grad_batches: 1
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-midas-inference.yaml ADDED
@@ -0,0 +1,74 @@
+ model:
+   base_learning_rate: 5.0e-07
+   target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: hybrid
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     finetune_keys: null
+     use_ema: False
+
+     depth_stage_config:
+       target: ldm.modules.midas.api.MiDaSInference
+       params:
+         model_type: "dpt_hybrid"
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 5
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
+
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/x4-upscaling.yaml ADDED
@@ -0,0 +1,76 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
+   params:
+     parameterization: "v"
+     low_scale_key: "lr"
+     linear_start: 0.0001
+     linear_end: 0.02
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 128
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: "hybrid-adm"
+     monitor: val/loss_simple_ema
+     scale_factor: 0.08333
+     use_ema: False
+
+     low_scale_config:
+       target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
+       params:
+         noise_schedule_config: # image space
+           linear_start: 0.0001
+           linear_end: 0.02
+         max_noise_level: 350
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
+         image_size: 128
+         in_channels: 7
+         out_channels: 4
+         model_channels: 256
+         attention_resolutions: [ 2,4,8]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 2, 4]
+         disable_self_attentions: [True, True, True, False]
+         disable_middle_self_attn: False
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+         use_linear_in_transformer: True
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         ddconfig:
+           # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
+           double_z: True
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
repositories/stable-diffusion-stability-ai/doc/UNCLIP.MD ADDED
@@ -0,0 +1,88 @@
+ ### Stable unCLIP
+
+ [unCLIP](https://openai.com/dall-e-2/) is the approach behind OpenAI's [DALL·E 2](https://openai.com/dall-e-2/),
+ trained to invert CLIP image embeddings.
+ We finetuned SD 2.1 to accept a CLIP ViT-L/14 image embedding in addition to the text encodings.
+ This means that the model can be used to produce image variations, but can also be combined with a text-to-image
+ embedding prior to yield a full text-to-image model at 768x768 resolution.
+
+ If you would like to try a demo of this model on the web, please visit https://clipdrop.co/stable-diffusion-reimagine
+
+ We provide two models, trained on OpenAI CLIP-L and OpenCLIP-H image embeddings, respectively,
+ available from [https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/tree/main).
+ To use them, download from Hugging Face, and put the weights into the `checkpoints` folder.
+
+ #### Image Variations
+ ![image-variations-l-1](../assets/stable-samples/stable-unclip/unclip-variations.png)
+
+ Diffusers integration
+ Stable UnCLIP Image Variations is integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library.
+ ```python
+ #pip install git+https://github.com/huggingface/diffusers.git transformers accelerate
+ import requests
+ import torch
+ from PIL import Image
+ from io import BytesIO
+
+ from diffusers import StableUnCLIPImg2ImgPipeline
+
+ #Start the StableUnCLIP Image variations pipeline
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+     "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16"
+ )
+ pipe = pipe.to("cuda")
+
+ #Get image from URL
+ url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png"
+ response = requests.get(url)
+ init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+ #Pipe to make the variation
+ images = pipe(init_image).images
+ images[0].save("tarsila_variation.png")
+ ```
+ Check out the [Stable UnCLIP pipeline docs here](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_unclip)
+
+ Streamlit UI demo
+
+ ```
+ streamlit run scripts/streamlit/stableunclip.py
+ ```
+ to launch a streamlit script that can be used to make image variations with both models (CLIP-L and OpenCLIP-H).
+ These models can process a `noise_level`, which specifies an amount of Gaussian noise added to the CLIP embeddings.
+ This can be used to increase output variance as in the following examples.
+
+ ![image-variations-noise](../assets/stable-samples/stable-unclip/unclip-variations_noise.png)
+
+
+ ### Stable Diffusion Meets Karlo
+ ![panda](../assets/stable-samples/stable-unclip/panda.jpg)
+
+ Recently, [KakaoBrain](https://kakaobrain.com/) openly released [Karlo](https://github.com/kakaobrain/karlo), a pretrained, large-scale replication of [unCLIP](https://arxiv.org/abs/2204.06125).
+ We introduce _Stable Karlo_, a combination of the Karlo CLIP image embedding prior, and Stable Diffusion v2.1-768.
+
+ To run the model, first download the KARLO checkpoints
+ ```shell
+ mkdir -p checkpoints/karlo_models
+ cd checkpoints/karlo_models
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/096db1af569b284eb76b3881534822d9/ViT-L-14.pt
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/0b62380a75e56f073e2844ab5199153d/ViT-L-14_stats.th
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/85626483eaca9f581e2a78d31ff905ca/prior-ckpt-step%3D01000000-of-01000000.ckpt
+ cd ../../
+ ```
+ and the finetuned SD2.1 unCLIP-L checkpoint from [here](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt), and put the ckpt into the `checkpoints` folder.
+
+ Then, run
+
+ ```
+ streamlit run scripts/streamlit/stableunclip.py
+ ```
+ and pick the `use_karlo` option in the GUI.
+ The script optionally supports sampling from the full Karlo model. To use it, download the 64x64 decoder and 64->256 upscaler
+ via
+ ```shell
+ cd checkpoints/karlo_models
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/efdf6206d8ed593961593dc029a8affa/decoder-ckpt-step%3D01000000-of-01000000.ckpt
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/4226b831ae0279020d134281f3c31590/improved-sr-ckpt-step%3D1.2M.ckpt
+ cd ../../
+ ```
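
Not part of UNCLIP.MD: a small follow-on sketch of the `noise_level` knob described above, reusing the diffusers pipeline from the snippet in the doc; it assumes the installed diffusers release exposes `noise_level` on the pipeline call, and the input filename is a placeholder.

```python
import torch
from PIL import Image
from diffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("input.png").convert("RGB")  # placeholder input image

# Higher noise_level adds more Gaussian noise to the CLIP image embedding,
# trading faithfulness to the input for more output variance.
for noise_level in (0, 250, 500):
    image = pipe(init_image, noise_level=noise_level).images[0]
    image.save(f"variation_noise_{noise_level}.png")
```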
repositories/stable-diffusion-stability-ai/environment.yaml ADDED
@@ -0,0 +1,29 @@
+ name: ldm
+ channels:
+   - pytorch
+   - defaults
+ dependencies:
+   - python=3.8.5
+   - pip=20.3
+   - cudatoolkit=11.3
+   - pytorch=1.12.1
+   - torchvision=0.13.1
+   - numpy=1.23.1
+   - pip:
+     - albumentations==1.3.0
+     - opencv-python==4.6.0.66
+     - imageio==2.9.0
+     - imageio-ffmpeg==0.4.2
+     - pytorch-lightning==1.4.2
+     - omegaconf==2.1.1
+     - test-tube>=0.7.5
+     - streamlit==1.12.1
+     - einops==0.3.0
+     - transformers==4.19.2
+     - webdataset==0.2.5
+     - kornia==0.6
+     - open_clip_torch==2.0.2
+     - invisible-watermark>=0.1.5
+     - streamlit-drawable-canvas==0.8.0
+     - torchmetrics==0.6.0
+     - -e .
repositories/stable-diffusion-stability-ai/ldm/__pycache__/util.cpython-310.pyc ADDED
Binary file (6.65 kB).
 
repositories/stable-diffusion-stability-ai/ldm/data/__init__.py ADDED
File without changes
repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (180 Bytes).
 
repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/util.cpython-310.pyc ADDED
Binary file (1.17 kB).
 
repositories/stable-diffusion-stability-ai/ldm/data/util.py ADDED
@@ -0,0 +1,24 @@
+ import torch
+
+ from ldm.modules.midas.api import load_midas_transform
+
+
+ class AddMiDaS(object):
+     def __init__(self, model_type):
+         super().__init__()
+         self.transform = load_midas_transform(model_type)
+
+     def pt2np(self, x):
+         x = ((x + 1.0) * .5).detach().cpu().numpy()
+         return x
+
+     def np2pt(self, x):
+         x = torch.from_numpy(x) * 2 - 1.
+         return x
+
+     def __call__(self, sample):
+         # sample['jpg'] is tensor hwc in [-1, 1] at this point
+         x = self.pt2np(sample['jpg'])
+         x = self.transform({"image": x})["image"]
+         sample['midas_in'] = x
+         return sample
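
Not part of the committed file: a quick illustration of how `AddMiDaS` is meant to slot into a data pipeline. `sample["jpg"]` is an HWC tensor in [-1, 1], as the comment in `__call__` notes, and `"dpt_hybrid"` matches the `model_type` used by v2-midas-inference.yaml; the toy tensor below is made up for the example.

```python
import torch
from ldm.data.util import AddMiDaS

add_midas = AddMiDaS(model_type="dpt_hybrid")  # builds the MiDaS preprocessing transform

sample = {"jpg": torch.rand(384, 384, 3) * 2.0 - 1.0}  # toy HWC image in [-1, 1]
sample = add_midas(sample)

# The preprocessed depth-model input is stored under "midas_in".
print(type(sample["midas_in"]), sample["midas_in"].shape)
```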
repositories/stable-diffusion-stability-ai/ldm/models/__pycache__/autoencoder.cpython-310.pyc ADDED
Binary file (7.76 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/autoencoder.py ADDED
@@ -0,0 +1,219 @@
+ import torch
+ import pytorch_lightning as pl
+ import torch.nn.functional as F
+ from contextlib import contextmanager
+
+ from ldm.modules.diffusionmodules.model import Encoder, Decoder
+ from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+
+ from ldm.util import instantiate_from_config
+ from ldm.modules.ema import LitEma
+
+
+ class AutoencoderKL(pl.LightningModule):
+     def __init__(self,
+                  ddconfig,
+                  lossconfig,
+                  embed_dim,
+                  ckpt_path=None,
+                  ignore_keys=[],
+                  image_key="image",
+                  colorize_nlabels=None,
+                  monitor=None,
+                  ema_decay=None,
+                  learn_logvar=False
+                  ):
+         super().__init__()
+         self.learn_logvar = learn_logvar
+         self.image_key = image_key
+         self.encoder = Encoder(**ddconfig)
+         self.decoder = Decoder(**ddconfig)
+         self.loss = instantiate_from_config(lossconfig)
+         assert ddconfig["double_z"]
+         self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+         self.embed_dim = embed_dim
+         if colorize_nlabels is not None:
+             assert type(colorize_nlabels)==int
+             self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+         if monitor is not None:
+             self.monitor = monitor
+
+         self.use_ema = ema_decay is not None
+         if self.use_ema:
+             self.ema_decay = ema_decay
+             assert 0. < ema_decay < 1.
+             self.model_ema = LitEma(self, decay=ema_decay)
+             print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+         if ckpt_path is not None:
+             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+
+     def init_from_ckpt(self, path, ignore_keys=list()):
+         sd = torch.load(path, map_location="cpu")["state_dict"]
+         keys = list(sd.keys())
+         for k in keys:
+             for ik in ignore_keys:
+                 if k.startswith(ik):
+                     print("Deleting key {} from state_dict.".format(k))
+                     del sd[k]
+         self.load_state_dict(sd, strict=False)
+         print(f"Restored from {path}")
+
+     @contextmanager
+     def ema_scope(self, context=None):
+         if self.use_ema:
+             self.model_ema.store(self.parameters())
+             self.model_ema.copy_to(self)
+             if context is not None:
+                 print(f"{context}: Switched to EMA weights")
+         try:
+             yield None
+         finally:
+             if self.use_ema:
+                 self.model_ema.restore(self.parameters())
+                 if context is not None:
+                     print(f"{context}: Restored training weights")
+
+     def on_train_batch_end(self, *args, **kwargs):
+         if self.use_ema:
+             self.model_ema(self)
+
+     def encode(self, x):
+         h = self.encoder(x)
+         moments = self.quant_conv(h)
+         posterior = DiagonalGaussianDistribution(moments)
+         return posterior
+
+     def decode(self, z):
+         z = self.post_quant_conv(z)
+         dec = self.decoder(z)
+         return dec
+
+     def forward(self, input, sample_posterior=True):
+         posterior = self.encode(input)
+         if sample_posterior:
+             z = posterior.sample()
+         else:
+             z = posterior.mode()
+         dec = self.decode(z)
+         return dec, posterior
+
+     def get_input(self, batch, k):
+         x = batch[k]
+         if len(x.shape) == 3:
+             x = x[..., None]
+         x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+         return x
+
+     def training_step(self, batch, batch_idx, optimizer_idx):
+         inputs = self.get_input(batch, self.image_key)
+         reconstructions, posterior = self(inputs)
+
+         if optimizer_idx == 0:
+             # train encoder+decoder+logvar
+             aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                             last_layer=self.get_last_layer(), split="train")
+             self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+             self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+             return aeloss
+
+         if optimizer_idx == 1:
+             # train the discriminator
+             discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                 last_layer=self.get_last_layer(), split="train")
+
+             self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+             self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+             return discloss
+
+     def validation_step(self, batch, batch_idx):
+         log_dict = self._validation_step(batch, batch_idx)
+         with self.ema_scope():
+             log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
+         return log_dict
+
+     def _validation_step(self, batch, batch_idx, postfix=""):
+         inputs = self.get_input(batch, self.image_key)
+         reconstructions, posterior = self(inputs)
+         aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                         last_layer=self.get_last_layer(), split="val"+postfix)
+
+         discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                             last_layer=self.get_last_layer(), split="val"+postfix)
+
+         self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
+         self.log_dict(log_dict_ae)
+         self.log_dict(log_dict_disc)
+         return self.log_dict
+
+     def configure_optimizers(self):
+         lr = self.learning_rate
+         ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
+             self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
+         if self.learn_logvar:
+             print(f"{self.__class__.__name__}: Learning logvar")
+             ae_params_list.append(self.loss.logvar)
+         opt_ae = torch.optim.Adam(ae_params_list,
+                                   lr=lr, betas=(0.5, 0.9))
+         opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                     lr=lr, betas=(0.5, 0.9))
+         return [opt_ae, opt_disc], []
+
+     def get_last_layer(self):
+         return self.decoder.conv_out.weight
+
+     @torch.no_grad()
+     def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
+         log = dict()
+         x = self.get_input(batch, self.image_key)
+         x = x.to(self.device)
+         if not only_inputs:
+             xrec, posterior = self(x)
+             if x.shape[1] > 3:
+                 # colorize with random projection
+                 assert xrec.shape[1] > 3
+                 x = self.to_rgb(x)
+                 xrec = self.to_rgb(xrec)
+             log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+             log["reconstructions"] = xrec
+             if log_ema or self.use_ema:
+                 with self.ema_scope():
+                     xrec_ema, posterior_ema = self(x)
+                     if x.shape[1] > 3:
+                         # colorize with random projection
+                         assert xrec_ema.shape[1] > 3
+                         xrec_ema = self.to_rgb(xrec_ema)
+                     log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
+                     log["reconstructions_ema"] = xrec_ema
+         log["inputs"] = x
+         return log
+
+     def to_rgb(self, x):
+         assert self.image_key == "segmentation"
+         if not hasattr(self, "colorize"):
+             self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+         x = F.conv2d(x, weight=self.colorize)
+         x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+         return x
+
+
+ class IdentityFirstStage(torch.nn.Module):
+     def __init__(self, *args, vq_interface=False, **kwargs):
+         self.vq_interface = vq_interface
+         super().__init__()
+
+     def encode(self, x, *args, **kwargs):
+         return x
+
+     def decode(self, x, *args, **kwargs):
+         return x
+
+     def quantize(self, x, *args, **kwargs):
+         if self.vq_interface:
+             return x, None, [None, None, None]
+         return x
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
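
Not part of the committed file: a minimal round-trip sketch for the `AutoencoderKL` defined above, built from the `first_stage_config` of one of the v2 inference configs in this commit. Without a checkpoint the weights are random, so this only illustrates shapes and the encode/decode API.

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

cfg = OmegaConf.load("repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml")
ae = instantiate_from_config(cfg.model.params.first_stage_config).eval()

x = torch.randn(1, 3, 256, 256)  # fake RGB batch in [-1, 1]
with torch.no_grad():
    posterior = ae.encode(x)     # DiagonalGaussianDistribution over the latent
    z = posterior.mode()         # deterministic latent, here (1, 4, 32, 32)
    rec = ae.decode(z)           # reconstruction, back to (1, 3, 256, 256)
print(z.shape, rec.shape)
```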
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__init__.py ADDED
File without changes
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (192 Bytes).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc ADDED
Binary file (9.39 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc ADDED
Binary file (55.6 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/plms.cpython-310.pyc ADDED
Binary file (7.57 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/sampling_util.cpython-310.pyc ADDED
Binary file (1.11 kB).