README.md CHANGED
@@ -65,6 +65,11 @@ aesthetic prompts. Specifically, Stable Cascade (30 inference steps) was compare
65
  steps), SDXL (50 inference steps), SDXL Turbo (1 inference step) and Würstchen v2 (30 inference steps).
66
 
67
  ## Code Example
 
 
 
 
 
68
  ```python
69
  import torch
70
  from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
@@ -73,7 +78,7 @@ device = "cuda"
73
  dtype = torch.bfloat16
74
  num_images_per_prompt = 2
75
 
76
- prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=dtype).to(device)
77
  decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=dtype).to(device)
78
 
79
  prompt = "Anthropomorphic cat dressed as a pilot"
 
65
  steps), SDXL (50 inference steps), SDXL Turbo (1 inference step) and Würstchen v2 (30 inference steps).
66
 
67
  ## Code Example
68
+ ```shell
69
+ # Install `diffusers` from this branch while the upstream PR is still a work in progress
70
+ pip install git+https://github.com/kashif/diffusers.git@wuerstchen-v3
71
+ ```
72
+
73
  ```python
74
  import torch
75
  from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
 
78
  dtype = torch.bfloat16
79
  num_images_per_prompt = 2
80
 
81
+ prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", torch_dtype=dtype).to(device)
82
  decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=dtype).to(device)
83
 
84
  prompt = "Anthropomorphic cat dressed as a pilot"
decoder/config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableCascadeUnet",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "StableCascade/decoder",
5
+ "block_repeat": [
6
+ [
7
+ 1,
8
+ 1,
9
+ 1,
10
+ 1
11
+ ],
12
+ [
13
+ 3,
14
+ 3,
15
+ 2,
16
+ 2
17
+ ]
18
+ ],
19
+ "blocks": [
20
+ [
21
+ 2,
22
+ 6,
23
+ 28,
24
+ 6
25
+ ],
26
+ [
27
+ 6,
28
+ 28,
29
+ 6,
30
+ 2
31
+ ]
32
+ ],
33
+ "c_clip_img": null,
34
+ "c_clip_seq": 4,
35
+ "c_clip_text": null,
36
+ "c_clip_text_pooled": 1280,
37
+ "c_cond": 1280,
38
+ "c_effnet": 16,
39
+ "c_hidden": [
40
+ 320,
41
+ 640,
42
+ 1280,
43
+ 1280
44
+ ],
45
+ "c_in": 4,
46
+ "c_out": 4,
47
+ "c_pixels": 3,
48
+ "c_r": 64,
49
+ "dropout": [
50
+ 0,
51
+ 0,
52
+ 0.1,
53
+ 0.1
54
+ ],
55
+ "kernel_size": 3,
56
+ "level_config": [
57
+ "CT",
58
+ "CT",
59
+ "CTA",
60
+ "CTA"
61
+ ],
62
+ "nhead": [
63
+ -1,
64
+ -1,
65
+ 20,
66
+ 20
67
+ ],
68
+ "patch_size": 2,
69
+ "self_attn": true,
70
+ "switch_level": null,
71
+ "t_conds": [
72
+ "sca"
73
+ ]
74
+ }
decoder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f9575dfa6c2535ad65733d6257d17a7b1e1b54b7eafb251ce9556595f3bc0c9
3
+ size 3126071088
model_index.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableCascadeDecoderPipeline",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "StableCascade/",
5
+ "decoder": [
6
+ "stable_cascade",
7
+ "StableCascadeUnet"
8
+ ],
9
+ "latent_dim_scale": 10.67,
10
+ "scheduler": [
11
+ "diffusers",
12
+ "DDPMWuerstchenScheduler"
13
+ ],
14
+ "text_encoder": [
15
+ "transformers",
16
+ "CLIPTextModelWithProjection"
17
+ ],
18
+ "tokenizer": [
19
+ "transformers",
20
+ "CLIPTokenizerFast"
21
+ ],
22
+ "vqgan": [
23
+ "wuerstchen",
24
+ "PaellaVQModel"
25
+ ]
26
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDPMWuerstchenScheduler",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "s": 0.008,
5
+ "scaler": 1.0
6
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "StableCascade/text_encoder",
3
+ "architectures": [
4
+ "CLIPTextModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 49406,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 49407,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1280,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5120,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 20,
19
+ "num_hidden_layers": 32,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 1280,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.38.0.dev0",
24
+ "vocab_size": 49408
25
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:260e0127aca3c89db813637ae659ebb822cb07af71fedc16cbd980e9518dfdcd
3
+ size 1389382688
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|startoftext|>",
22
+ "clean_up_tokenization_spaces": true,
23
+ "do_lower_case": true,
24
+ "eos_token": "<|endoftext|>",
25
+ "errors": "replace",
26
+ "model_max_length": 77,
27
+ "pad_token": "<|endoftext|>",
28
+ "tokenizer_class": "CLIPTokenizer",
29
+ "unk_token": "<|endoftext|>"
30
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
vqgan/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PaellaVQModel",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "StableCascade/vqgan",
5
+ "bottleneck_blocks": 12,
6
+ "embed_dim": 384,
7
+ "in_channels": 3,
8
+ "latent_channels": 4,
9
+ "levels": 2,
10
+ "num_vq_embeddings": 8192,
11
+ "out_channels": 3,
12
+ "scale_factor": 0.3764,
13
+ "up_down_scale_factor": 2
14
+ }
vqgan/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac32fab5177329dac907b2480c8c00aeefc712dfd92c2d52263a9c64b426b26
3
+ size 36825828