toto10 committed on
Commit 7b641e8
1 parent: be5e120

4d1c6f1ff2b7f9073e03e5dc47df66713da13a02a88c27243d5c27b1a6b63784

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +16 -0
  2. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/plates_out.jpeg +0 -0
  3. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png +3 -0
  4. repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png +3 -0
  5. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002025.png +0 -0
  6. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002035.png +0 -0
  7. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png +3 -0
  8. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png +3 -0
  9. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png +3 -0
  10. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png +3 -0
  11. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png +3 -0
  12. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png +3 -0
  13. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png +3 -0
  14. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png +3 -0
  15. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png +3 -0
  16. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png +3 -0
  17. repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png +3 -0
  18. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png +3 -0
  19. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png +3 -0
  20. repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png +3 -0
  21. repositories/stable-diffusion-stability-ai/checkpoints/checkpoints.txt +1 -0
  22. repositories/stable-diffusion-stability-ai/configs/karlo/decoder_900M_vit_l.yaml +37 -0
  23. repositories/stable-diffusion-stability-ai/configs/karlo/improved_sr_64_256_1.4B.yaml +27 -0
  24. repositories/stable-diffusion-stability-ai/configs/karlo/prior_1B_vit_l.yaml +21 -0
  25. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-bf16.yaml +71 -0
  26. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-fp32.yaml +70 -0
  27. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml +72 -0
  28. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml +71 -0
  29. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml +80 -0
  30. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml +83 -0
  31. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference-v.yaml +68 -0
  32. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml +67 -0
  33. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inpainting-inference.yaml +158 -0
  34. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-midas-inference.yaml +74 -0
  35. repositories/stable-diffusion-stability-ai/configs/stable-diffusion/x4-upscaling.yaml +76 -0
  36. repositories/stable-diffusion-stability-ai/doc/UNCLIP.MD +88 -0
  37. repositories/stable-diffusion-stability-ai/environment.yaml +29 -0
  38. repositories/stable-diffusion-stability-ai/ldm/__pycache__/util.cpython-310.pyc +0 -0
  39. repositories/stable-diffusion-stability-ai/ldm/data/__init__.py +0 -0
  40. repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/__init__.cpython-310.pyc +0 -0
  41. repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/util.cpython-310.pyc +0 -0
  42. repositories/stable-diffusion-stability-ai/ldm/data/util.py +24 -0
  43. repositories/stable-diffusion-stability-ai/ldm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
  44. repositories/stable-diffusion-stability-ai/ldm/models/autoencoder.py +219 -0
  45. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__init__.py +0 -0
  46. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  47. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc +0 -0
  48. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
  49. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/plms.cpython-310.pyc +0 -0
  50. repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/sampling_util.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -46,3 +46,19 @@
  repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png filter=lfs diff=lfs merge=lfs -text
  repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png filter=lfs diff=lfs merge=lfs -text
  repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png filter=lfs diff=lfs merge=lfs -text
+ repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png filter=lfs diff=lfs merge=lfs -text
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/plates_out.jpeg ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations.png ADDED

Git LFS Details

  • SHA256: e9bc45418c5c4ded4fe8ef054c6fc85fa23efe9bab4cdbc42d3ec55f2a57bc39
  • Pointer size: 132 Bytes
  • Size of remote file: 1.77 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/unclip-variations_noise.png ADDED

Git LFS Details

  • SHA256: 83e2cdb18e95cb074db4d6d78dc7c2333936ab641e27945c358b2c4160eeb6da
  • Pointer size: 132 Bytes
  • Size of remote file: 1.54 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002025.png ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/000002035.png ADDED
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0001.png ADDED

Git LFS Details

  • SHA256: ed10e1df0f4c0f83794310e59a77098b4836d96a2b12cc809ddf39e77b1b6c94
  • Pointer size: 132 Bytes
  • Size of remote file: 4.63 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0002.png ADDED

Git LFS Details

  • SHA256: b4a009d112633ac788fbbe7a7176d4002e95407b64672afa8104755534bb4641
  • Pointer size: 132 Bytes
  • Size of remote file: 3.46 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0003.png ADDED

Git LFS Details

  • SHA256: 84adec03ab2e2d54990950a113af97750eab90135596461689efe5ebfa1ebf92
  • Pointer size: 132 Bytes
  • Size of remote file: 3.83 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0004.png ADDED

Git LFS Details

  • SHA256: 8ac938c03b4b554e1c475a4b3c5df50b72a890eec21542fa7911a8ff01bf13f4
  • Pointer size: 132 Bytes
  • Size of remote file: 4.1 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0005.png ADDED

Git LFS Details

  • SHA256: 849b877a80752b4578afc2ece4ea0726768809298ee9a38284ba4e159d0a817c
  • Pointer size: 132 Bytes
  • Size of remote file: 2.17 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/768/merged-0006.png ADDED

Git LFS Details

  • SHA256: 2d31dbbf76633677be3b8eba933e9eec82825925535ef9c557a3003daf16ad42
  • Pointer size: 132 Bytes
  • Size of remote file: 4.37 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0001.png ADDED

Git LFS Details

  • SHA256: 71ca5f77befffa10a2ef6d4b69f8bb721e7ebd7ea03538e2c359dc44f526b0e8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0003.png ADDED

Git LFS Details

  • SHA256: 9fde5a40c512d61e2390e70d9f14b0d33f0af84cbde2dcd9d86e1f9b38072266
  • Pointer size: 132 Bytes
  • Size of remote file: 2.27 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0005.png ADDED

Git LFS Details

  • SHA256: a417aadc1d91b91531ca6bbf89840a36f432d8e9382aaa953610bedce22ff76f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0006.png ADDED

Git LFS Details

  • SHA256: 1d55ba7d103da275b4612976e93f405fcb593f7e6a6fda31f2e180b41c8e4f59
  • Pointer size: 132 Bytes
  • Size of remote file: 2.64 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/txt2img/merged-0007.png ADDED

Git LFS Details

  • SHA256: 920ccf908b7fa5073a7c5cd3f4e109b5e66f7e29517ef5462ca55e931d0b5689
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/merged-dog.png ADDED

Git LFS Details

  • SHA256: d85d15bd51b3fa162f2b020ccac5a64b10ced728c6f22dcba183dc65ab6e8b5a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.82 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/sampled-bear-x4.png ADDED

Git LFS Details

  • SHA256: d4f2aaa8eb3054cda0a6e8577170d09c1494809cceca21973497602e17a22f1e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.16 MB
repositories/stable-diffusion-stability-ai/assets/stable-samples/upscaling/snow-leopard-x4.png ADDED

Git LFS Details

  • SHA256: fe8231dddcf77ada4b46f6949b4ea7757ff2d006253e1807ba6e1168077aad19
  • Pointer size: 132 Bytes
  • Size of remote file: 3.89 MB
repositories/stable-diffusion-stability-ai/checkpoints/checkpoints.txt ADDED
@@ -0,0 +1 @@
+ Put unCLIP checkpoints here.
repositories/stable-diffusion-stability-ai/configs/karlo/decoder_900M_vit_l.yaml ADDED
@@ -0,0 +1,37 @@
+ model:
+   type: t2i-decoder
+   diffusion_sampler: uniform
+   hparams:
+     image_size: 64
+     num_channels: 320
+     num_res_blocks: 3
+     channel_mult: ''
+     attention_resolutions: 32,16,8
+     num_heads: -1
+     num_head_channels: 64
+     num_heads_upsample: -1
+     use_scale_shift_norm: true
+     dropout: 0.1
+     clip_dim: 768
+     clip_emb_mult: 4
+     text_ctx: 77
+     xf_width: 1536
+     xf_layers: 0
+     xf_heads: 0
+     xf_final_ln: false
+     resblock_updown: true
+     learn_sigma: true
+     text_drop: 0.3
+     clip_emb_type: image
+     clip_emb_drop: 0.1
+     use_plm: true
+
+ diffusion:
+   steps: 1000
+   learn_sigma: true
+   sigma_small: false
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: false
+   rescale_learned_sigmas: true
+   timestep_respacing: ''
repositories/stable-diffusion-stability-ai/configs/karlo/improved_sr_64_256_1.4B.yaml ADDED
@@ -0,0 +1,27 @@
+ model:
+   type: improved_sr_64_256
+   diffusion_sampler: uniform
+   hparams:
+     channels: 320
+     depth: 3
+     channels_multiple:
+       - 1
+       - 2
+       - 3
+       - 4
+     dropout: 0.0
+
+ diffusion:
+   steps: 1000
+   learn_sigma: false
+   sigma_small: true
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: false
+   rescale_learned_sigmas: true
+   timestep_respacing: '7'
+
+
+ sampling:
+   timestep_respacing: '7' # fix
+   clip_denoise: true
repositories/stable-diffusion-stability-ai/configs/karlo/prior_1B_vit_l.yaml ADDED
@@ -0,0 +1,21 @@
+ model:
+   type: prior
+   diffusion_sampler: uniform
+   hparams:
+     text_ctx: 77
+     xf_width: 2048
+     xf_layers: 20
+     xf_heads: 32
+     xf_final_ln: true
+     text_drop: 0.2
+     clip_dim: 768
+
+ diffusion:
+   steps: 1000
+   learn_sigma: false
+   sigma_small: true
+   noise_schedule: squaredcos_cap_v2
+   use_kl: false
+   predict_xstart: true
+   rescale_learned_sigmas: false
+   timestep_respacing: ''
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-bf16.yaml ADDED
@@ -0,0 +1,71 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         use_bf16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-fp32.yaml ADDED
@@ -0,0 +1,70 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml ADDED
@@ -0,0 +1,72 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         use_bf16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml ADDED
@@ -0,0 +1,71 @@
+ # Copyright (C) 2022 Intel Corporation
+ # SPDX-License-Identifier: MIT
+
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: False
+         use_fp16: False
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml ADDED
@@ -0,0 +1,80 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+   params:
+     embedding_dropout: 0.25
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 96
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn-adm
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     use_ema: False
+
+     embedder_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+
+     noise_aug_config:
+       target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+       params:
+         timestep_dim: 1024
+         noise_schedule_config:
+           timesteps: 1000
+           beta_schedule: squaredcos_cap_v2
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         num_classes: "sequential"
+         adm_in_channels: 2048
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml ADDED
@@ -0,0 +1,83 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+   params:
+     embedding_dropout: 0.25
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 96
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn-adm
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     use_ema: False
+
+     embedder_config:
+       target: ldm.modules.encoders.modules.ClipImageEmbedder
+       params:
+         model: "ViT-L/14"
+
+     noise_aug_config:
+       target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+       params:
+         clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
+         timestep_dim: 768
+         noise_schedule_config:
+           timesteps: 1000
+           beta_schedule: squaredcos_cap_v2
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         num_classes: "sequential"
+         adm_in_channels: 1536
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference-v.yaml ADDED
@@ -0,0 +1,68 @@
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     parameterization: "v"
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         use_fp16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml ADDED
@@ -0,0 +1,67 @@
+ model:
+   base_learning_rate: 1.0e-4
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False # we set this to false because this is an inference only config
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         use_fp16: True
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
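
Not part of the committed files: a minimal sketch of how an inference config such as v2-inference.yaml is typically consumed in this codebase, pairing OmegaConf with `ldm.util.instantiate_from_config` (the checkpoint filename below is a placeholder).

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

# Load the YAML and build the LatentDiffusion model named by `target` with `params`.
config = OmegaConf.load("repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml")
model = instantiate_from_config(config.model)

# Restore weights from a separately downloaded checkpoint (placeholder path).
state = torch.load("checkpoints/sd21.ckpt", map_location="cpu")
missing, unexpected = model.load_state_dict(state["state_dict"], strict=False)
model.eval()  # use_ema is False in the config, so the plain weights are used for inference
```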
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inpainting-inference.yaml ADDED
@@ -0,0 +1,158 @@
+ model:
+   base_learning_rate: 5.0e-05
+   target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: hybrid
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     finetune_keys: null
+     use_ema: False
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 9
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
+
+ data:
+   target: ldm.data.laion.WebDataModuleFromConfig
+   params:
+     tar_base: null # for concat as in LAION-A
+     p_unsafe_threshold: 0.1
+     filter_word_list: "data/filters.yaml"
+     max_pwatermark: 0.45
+     batch_size: 8
+     num_workers: 6
+     multinode: True
+     min_size: 512
+     train:
+       shards:
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
+         - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
+       shuffle: 10000
+       image_key: jpg
+       image_transforms:
+       - target: torchvision.transforms.Resize
+         params:
+           size: 512
+           interpolation: 3
+       - target: torchvision.transforms.RandomCrop
+         params:
+           size: 512
+       postprocess:
+         target: ldm.data.laion.AddMask
+         params:
+           mode: "512train-large"
+           p_drop: 0.25
+     # NOTE use enough shards to avoid empty validation loops in workers
+     validation:
+       shards:
+         - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
+       shuffle: 0
+       image_key: jpg
+       image_transforms:
+       - target: torchvision.transforms.Resize
+         params:
+           size: 512
+           interpolation: 3
+       - target: torchvision.transforms.CenterCrop
+         params:
+           size: 512
+       postprocess:
+         target: ldm.data.laion.AddMask
+         params:
+           mode: "512train-large"
+           p_drop: 0.25
+
+ lightning:
+   find_unused_parameters: True
+   modelcheckpoint:
+     params:
+       every_n_train_steps: 5000
+
+   callbacks:
+     metrics_over_trainsteps_checkpoint:
+       params:
+         every_n_train_steps: 10000
+
+     image_logger:
+       target: main.ImageLogger
+       params:
+         enable_autocast: False
+         disabled: False
+         batch_frequency: 1000
+         max_images: 4
+         increase_log_steps: False
+         log_first_step: False
+         log_images_kwargs:
+           use_ema_scope: False
+           inpaint: False
+           plot_progressive_rows: False
+           plot_diffusion_rows: False
+           N: 4
+           unconditional_guidance_scale: 5.0
+           unconditional_guidance_label: [""]
+           ddim_steps: 50 # todo check these out for depth2img,
+           ddim_eta: 0.0 # todo check these out for depth2img,
+
+   trainer:
+     benchmark: True
+     val_check_interval: 5000000
+     num_sanity_val_steps: 0
+     accumulate_grad_batches: 1
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-midas-inference.yaml ADDED
@@ -0,0 +1,74 @@
+ model:
+   base_learning_rate: 5.0e-07
+   target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: hybrid
+     scale_factor: 0.18215
+     monitor: val/loss_simple_ema
+     finetune_keys: null
+     use_ema: False
+
+     depth_stage_config:
+       target: ldm.modules.midas.api.MiDaSInference
+       params:
+         model_type: "dpt_hybrid"
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         image_size: 32 # unused
+         in_channels: 5
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_head_channels: 64 # need to fix for flash-attn
+         use_spatial_transformer: True
+         use_linear_in_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           #attn_type: "vanilla-xformers"
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
+
repositories/stable-diffusion-stability-ai/configs/stable-diffusion/x4-upscaling.yaml ADDED
@@ -0,0 +1,76 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
+   params:
+     parameterization: "v"
+     low_scale_key: "lr"
+     linear_start: 0.0001
+     linear_end: 0.02
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 128
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: "hybrid-adm"
+     monitor: val/loss_simple_ema
+     scale_factor: 0.08333
+     use_ema: False
+
+     low_scale_config:
+       target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
+       params:
+         noise_schedule_config: # image space
+           linear_start: 0.0001
+           linear_end: 0.02
+         max_noise_level: 350
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         use_checkpoint: True
+         num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
+         image_size: 128
+         in_channels: 7
+         out_channels: 4
+         model_channels: 256
+         attention_resolutions: [ 2,4,8]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 2, 4]
+         disable_self_attentions: [True, True, True, False]
+         disable_middle_self_attn: False
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 1024
+         legacy: False
+         use_linear_in_transformer: True
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         ddconfig:
+           # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
+           double_z: True
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
+           num_res_blocks: 2
+           attn_resolutions: [ ]
+           dropout: 0.0
+
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+       params:
+         freeze: True
+         layer: "penultimate"
+
repositories/stable-diffusion-stability-ai/doc/UNCLIP.MD ADDED
@@ -0,0 +1,88 @@
+ ### Stable unCLIP
+
+ [unCLIP](https://openai.com/dall-e-2/) is the approach behind OpenAI's [DALL·E 2](https://openai.com/dall-e-2/),
+ trained to invert CLIP image embeddings.
+ We finetuned SD 2.1 to accept a CLIP ViT-L/14 image embedding in addition to the text encodings.
+ This means that the model can be used to produce image variations, but can also be combined with a text-to-image
+ embedding prior to yield a full text-to-image model at 768x768 resolution.
+
+ If you would like to try a demo of this model on the web, please visit https://clipdrop.co/stable-diffusion-reimagine
+
+ We provide two models, trained on OpenAI CLIP-L and OpenCLIP-H image embeddings, respectively,
+ available from [https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/tree/main).
+ To use them, download from Hugging Face, and put the weights into the `checkpoints` folder.
+
+ #### Image Variations
+ ![image-variations-l-1](../assets/stable-samples/stable-unclip/unclip-variations.png)
+
+ Diffusers integration
+ Stable UnCLIP Image Variations is integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library.
+ ```python
+ #pip install git+https://github.com/huggingface/diffusers.git transformers accelerate
+ import requests
+ import torch
+ from PIL import Image
+ from io import BytesIO
+
+ from diffusers import StableUnCLIPImg2ImgPipeline
+
+ #Start the StableUnCLIP Image variations pipeline
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+     "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16"
+ )
+ pipe = pipe.to("cuda")
+
+ #Get image from URL
+ url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png"
+ response = requests.get(url)
+ init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+ #Pipe to make the variation
+ images = pipe(init_image).images
+ images[0].save("tarsila_variation.png")
+ ```
+ Check out the [Stable UnCLIP pipeline docs here](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_unclip)
+
+ Streamlit UI demo
+
+ ```
+ streamlit run scripts/streamlit/stableunclip.py
+ ```
+ to launch a streamlit script that can be used to make image variations with both models (CLIP-L and OpenCLIP-H).
+ These models can process a `noise_level`, which specifies an amount of Gaussian noise added to the CLIP embeddings.
+ This can be used to increase output variance as in the following examples.
+
+ ![image-variations-noise](../assets/stable-samples/stable-unclip/unclip-variations_noise.png)
+
+
+ ### Stable Diffusion Meets Karlo
+ ![panda](../assets/stable-samples/stable-unclip/panda.jpg)
+
+ Recently, [KakaoBrain](https://kakaobrain.com/) openly released [Karlo](https://github.com/kakaobrain/karlo), a pretrained, large-scale replication of [unCLIP](https://arxiv.org/abs/2204.06125).
+ We introduce _Stable Karlo_, a combination of the Karlo CLIP image embedding prior, and Stable Diffusion v2.1-768.
+
+ To run the model, first download the KARLO checkpoints
+ ```shell
+ mkdir -p checkpoints/karlo_models
+ cd checkpoints/karlo_models
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/096db1af569b284eb76b3881534822d9/ViT-L-14.pt
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/0b62380a75e56f073e2844ab5199153d/ViT-L-14_stats.th
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/85626483eaca9f581e2a78d31ff905ca/prior-ckpt-step%3D01000000-of-01000000.ckpt
+ cd ../../
+ ```
+ and the finetuned SD2.1 unCLIP-L checkpoint from [here](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt), and put the ckpt into the `checkpoints` folder.
+
+ Then, run
+
+ ```
+ streamlit run scripts/streamlit/stableunclip.py
+ ```
+ and pick the `use_karlo` option in the GUI.
+ The script optionally supports sampling from the full Karlo model. To use it, download the 64x64 decoder and 64->256 upscaler
+ via
+ ```shell
+ cd checkpoints/karlo_models
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/efdf6206d8ed593961593dc029a8affa/decoder-ckpt-step%3D01000000-of-01000000.ckpt
+ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/4226b831ae0279020d134281f3c31590/improved-sr-ckpt-step%3D1.2M.ckpt
+ cd ../../
+ ```
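
Not part of UNCLIP.MD: a small follow-on sketch of the `noise_level` knob described above, reusing the diffusers pipeline from the snippet in the doc; it assumes the installed diffusers release exposes `noise_level` on the pipeline call, and the input filename is a placeholder.

```python
import torch
from PIL import Image
from diffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("input.png").convert("RGB")  # placeholder input image

# Higher noise_level adds more Gaussian noise to the CLIP image embedding,
# trading faithfulness to the input for more output variance.
for noise_level in (0, 250, 500):
    image = pipe(init_image, noise_level=noise_level).images[0]
    image.save(f"variation_noise_{noise_level}.png")
```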
repositories/stable-diffusion-stability-ai/environment.yaml ADDED
@@ -0,0 +1,29 @@
+ name: ldm
+ channels:
+   - pytorch
+   - defaults
+ dependencies:
+   - python=3.8.5
+   - pip=20.3
+   - cudatoolkit=11.3
+   - pytorch=1.12.1
+   - torchvision=0.13.1
+   - numpy=1.23.1
+   - pip:
+     - albumentations==1.3.0
+     - opencv-python==4.6.0.66
+     - imageio==2.9.0
+     - imageio-ffmpeg==0.4.2
+     - pytorch-lightning==1.4.2
+     - omegaconf==2.1.1
+     - test-tube>=0.7.5
+     - streamlit==1.12.1
+     - einops==0.3.0
+     - transformers==4.19.2
+     - webdataset==0.2.5
+     - kornia==0.6
+     - open_clip_torch==2.0.2
+     - invisible-watermark>=0.1.5
+     - streamlit-drawable-canvas==0.8.0
+     - torchmetrics==0.6.0
+     - -e .
repositories/stable-diffusion-stability-ai/ldm/__pycache__/util.cpython-310.pyc ADDED
Binary file (6.65 kB).
 
repositories/stable-diffusion-stability-ai/ldm/data/__init__.py ADDED
File without changes
repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (180 Bytes).
 
repositories/stable-diffusion-stability-ai/ldm/data/__pycache__/util.cpython-310.pyc ADDED
Binary file (1.17 kB).
 
repositories/stable-diffusion-stability-ai/ldm/data/util.py ADDED
@@ -0,0 +1,24 @@
+ import torch
+
+ from ldm.modules.midas.api import load_midas_transform
+
+
+ class AddMiDaS(object):
+     def __init__(self, model_type):
+         super().__init__()
+         self.transform = load_midas_transform(model_type)
+
+     def pt2np(self, x):
+         x = ((x + 1.0) * .5).detach().cpu().numpy()
+         return x
+
+     def np2pt(self, x):
+         x = torch.from_numpy(x) * 2 - 1.
+         return x
+
+     def __call__(self, sample):
+         # sample['jpg'] is tensor hwc in [-1, 1] at this point
+         x = self.pt2np(sample['jpg'])
+         x = self.transform({"image": x})["image"]
+         sample['midas_in'] = x
+         return sample
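
Not part of the committed file: a quick illustration of how `AddMiDaS` is meant to slot into a data pipeline. `sample["jpg"]` is an HWC tensor in [-1, 1], as the comment in `__call__` notes, and `"dpt_hybrid"` matches the `model_type` used by v2-midas-inference.yaml; the toy tensor below is made up for the example.

```python
import torch
from ldm.data.util import AddMiDaS

add_midas = AddMiDaS(model_type="dpt_hybrid")  # builds the MiDaS preprocessing transform

sample = {"jpg": torch.rand(384, 384, 3) * 2.0 - 1.0}  # toy HWC image in [-1, 1]
sample = add_midas(sample)

# The preprocessed depth-model input is stored under "midas_in".
print(type(sample["midas_in"]), sample["midas_in"].shape)
```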
repositories/stable-diffusion-stability-ai/ldm/models/__pycache__/autoencoder.cpython-310.pyc ADDED
Binary file (7.76 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/autoencoder.py ADDED
@@ -0,0 +1,219 @@
+ import torch
+ import pytorch_lightning as pl
+ import torch.nn.functional as F
+ from contextlib import contextmanager
+
+ from ldm.modules.diffusionmodules.model import Encoder, Decoder
+ from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+
+ from ldm.util import instantiate_from_config
+ from ldm.modules.ema import LitEma
+
+
+ class AutoencoderKL(pl.LightningModule):
+     def __init__(self,
+                  ddconfig,
+                  lossconfig,
+                  embed_dim,
+                  ckpt_path=None,
+                  ignore_keys=[],
+                  image_key="image",
+                  colorize_nlabels=None,
+                  monitor=None,
+                  ema_decay=None,
+                  learn_logvar=False
+                  ):
+         super().__init__()
+         self.learn_logvar = learn_logvar
+         self.image_key = image_key
+         self.encoder = Encoder(**ddconfig)
+         self.decoder = Decoder(**ddconfig)
+         self.loss = instantiate_from_config(lossconfig)
+         assert ddconfig["double_z"]
+         self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+         self.embed_dim = embed_dim
+         if colorize_nlabels is not None:
+             assert type(colorize_nlabels)==int
+             self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+         if monitor is not None:
+             self.monitor = monitor
+
+         self.use_ema = ema_decay is not None
+         if self.use_ema:
+             self.ema_decay = ema_decay
+             assert 0. < ema_decay < 1.
+             self.model_ema = LitEma(self, decay=ema_decay)
+             print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+         if ckpt_path is not None:
+             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+
+     def init_from_ckpt(self, path, ignore_keys=list()):
+         sd = torch.load(path, map_location="cpu")["state_dict"]
+         keys = list(sd.keys())
+         for k in keys:
+             for ik in ignore_keys:
+                 if k.startswith(ik):
+                     print("Deleting key {} from state_dict.".format(k))
+                     del sd[k]
+         self.load_state_dict(sd, strict=False)
+         print(f"Restored from {path}")
+
+     @contextmanager
+     def ema_scope(self, context=None):
+         if self.use_ema:
+             self.model_ema.store(self.parameters())
+             self.model_ema.copy_to(self)
+             if context is not None:
+                 print(f"{context}: Switched to EMA weights")
+         try:
+             yield None
+         finally:
+             if self.use_ema:
+                 self.model_ema.restore(self.parameters())
+                 if context is not None:
+                     print(f"{context}: Restored training weights")
+
+     def on_train_batch_end(self, *args, **kwargs):
+         if self.use_ema:
+             self.model_ema(self)
+
+     def encode(self, x):
+         h = self.encoder(x)
+         moments = self.quant_conv(h)
+         posterior = DiagonalGaussianDistribution(moments)
+         return posterior
+
+     def decode(self, z):
+         z = self.post_quant_conv(z)
+         dec = self.decoder(z)
+         return dec
+
+     def forward(self, input, sample_posterior=True):
+         posterior = self.encode(input)
+         if sample_posterior:
+             z = posterior.sample()
+         else:
+             z = posterior.mode()
+         dec = self.decode(z)
+         return dec, posterior
+
+     def get_input(self, batch, k):
+         x = batch[k]
+         if len(x.shape) == 3:
+             x = x[..., None]
+         x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+         return x
+
+     def training_step(self, batch, batch_idx, optimizer_idx):
+         inputs = self.get_input(batch, self.image_key)
+         reconstructions, posterior = self(inputs)
+
+         if optimizer_idx == 0:
+             # train encoder+decoder+logvar
+             aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                             last_layer=self.get_last_layer(), split="train")
+             self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+             self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+             return aeloss
+
+         if optimizer_idx == 1:
+             # train the discriminator
+             discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                 last_layer=self.get_last_layer(), split="train")
+
+             self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+             self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+             return discloss
+
+     def validation_step(self, batch, batch_idx):
+         log_dict = self._validation_step(batch, batch_idx)
+         with self.ema_scope():
+             log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
+         return log_dict
+
+     def _validation_step(self, batch, batch_idx, postfix=""):
+         inputs = self.get_input(batch, self.image_key)
+         reconstructions, posterior = self(inputs)
+         aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                         last_layer=self.get_last_layer(), split="val"+postfix)
+
+         discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                             last_layer=self.get_last_layer(), split="val"+postfix)
+
+         self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
+         self.log_dict(log_dict_ae)
+         self.log_dict(log_dict_disc)
+         return self.log_dict
+
+     def configure_optimizers(self):
+         lr = self.learning_rate
+         ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
+             self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
+         if self.learn_logvar:
+             print(f"{self.__class__.__name__}: Learning logvar")
+             ae_params_list.append(self.loss.logvar)
+         opt_ae = torch.optim.Adam(ae_params_list,
+                                   lr=lr, betas=(0.5, 0.9))
+         opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                     lr=lr, betas=(0.5, 0.9))
+         return [opt_ae, opt_disc], []
+
+     def get_last_layer(self):
+         return self.decoder.conv_out.weight
+
+     @torch.no_grad()
+     def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
+         log = dict()
+         x = self.get_input(batch, self.image_key)
+         x = x.to(self.device)
+         if not only_inputs:
+             xrec, posterior = self(x)
+             if x.shape[1] > 3:
+                 # colorize with random projection
+                 assert xrec.shape[1] > 3
+                 x = self.to_rgb(x)
+                 xrec = self.to_rgb(xrec)
+             log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+             log["reconstructions"] = xrec
+             if log_ema or self.use_ema:
+                 with self.ema_scope():
+                     xrec_ema, posterior_ema = self(x)
+                     if x.shape[1] > 3:
+                         # colorize with random projection
+                         assert xrec_ema.shape[1] > 3
+                         xrec_ema = self.to_rgb(xrec_ema)
+                     log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
+                     log["reconstructions_ema"] = xrec_ema
+         log["inputs"] = x
+         return log
+
+     def to_rgb(self, x):
+         assert self.image_key == "segmentation"
+         if not hasattr(self, "colorize"):
+             self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+         x = F.conv2d(x, weight=self.colorize)
+         x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+         return x
+
+
+ class IdentityFirstStage(torch.nn.Module):
+     def __init__(self, *args, vq_interface=False, **kwargs):
+         self.vq_interface = vq_interface
+         super().__init__()
+
+     def encode(self, x, *args, **kwargs):
+         return x
+
+     def decode(self, x, *args, **kwargs):
+         return x
+
+     def quantize(self, x, *args, **kwargs):
+         if self.vq_interface:
+             return x, None, [None, None, None]
+         return x
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
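
Not part of the committed file: a minimal round-trip sketch for the `AutoencoderKL` defined above, built from the `first_stage_config` of one of the v2 inference configs in this commit. Without a checkpoint the weights are random, so this only illustrates shapes and the encode/decode API.

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

cfg = OmegaConf.load("repositories/stable-diffusion-stability-ai/configs/stable-diffusion/v2-inference.yaml")
ae = instantiate_from_config(cfg.model.params.first_stage_config).eval()

x = torch.randn(1, 3, 256, 256)  # fake RGB batch in [-1, 1]
with torch.no_grad():
    posterior = ae.encode(x)     # DiagonalGaussianDistribution over the latent
    z = posterior.mode()         # deterministic latent, here (1, 4, 32, 32)
    rec = ae.decode(z)           # reconstruction, back to (1, 3, 256, 256)
print(z.shape, rec.shape)
```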
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__init__.py ADDED
File without changes
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (192 Bytes).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddim.cpython-310.pyc ADDED
Binary file (9.39 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/ddpm.cpython-310.pyc ADDED
Binary file (55.6 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/plms.cpython-310.pyc ADDED
Binary file (7.57 kB).
 
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/__pycache__/sampling_util.cpython-310.pyc ADDED
Binary file (1.11 kB).