add beta1

Files changed (16) hide show

README.md +18 -0
checkpoints/wd15-beta1-fp16.ckpt +3 -0
checkpoints/wd15-beta1-fp16.safetensors +3 -0
checkpoints/wd15-beta1-fp16.yaml +68 -0
checkpoints/wd15-beta1-fp32.ckpt +3 -0
checkpoints/wd15-beta1-fp32.safetensors +3 -0
checkpoints/wd15-beta1-fp32.yaml +68 -0
feature_extractor/preprocessor_config.json +28 -0
model_index.json +33 -0
scheduler/scheduler_config.json +14 -0
text_encoder/config.json +25 -0
text_encoder/model.safetensors +3 -0
unet/config.json +50 -0
unet/diffusion_pytorch_model.safetensors +3 -0
vae/config.json +30 -0
vae/diffusion_pytorch_model.safetensors +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,21 @@
 ---
 license: other
 ---

 ---
 license: other
 ---
+IMPORTANT: this is a BETA MODEL! It is not done!
+# Release Notes
+https://cafeai.notion.site/WD-1-5-Beta-Release-Notes-967d3a5ece054d07bb02cba02e8199b7
+# Checkpoints
+Checkpoints are located in the "checkpoints" folder, under the files tab
+# Aesthetic Embeddings
+I've included a "wdgoodprompt" and "wdbadprompt" embedding in the embeddings folder to help make generation easier. With in progress models, its common to have to use long prompts for good results. Using these embeddings helps alleviate some of that.
+# Generation
+With Waifu Diffusion 1.5, best results are generated from generating at a resolution of somewhere between 500 and 1000 and then using 2x latent upscale hiresfix
+## License
+WD 1.5 is released under the AGPL v3 ([https://www.gnu.org/licenses/agpl-3.0.en.html](https://www.gnu.org/licenses/agpl-3.0.en.html)) and the CreativeML Open RAIL++-M License ([https://huggingface.co/stabilityai/stable-diffusion-2/raw/main/LICENSE-MODEL](https://huggingface.co/stabilityai/stable-diffusion-2/raw/main/LICENSE-MODEL)), with both licenses being applicable. If any derivative of this model is made, please share your changes accordingly.

checkpoints/wd15-beta1-fp16.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da47dfd5acb2ba0302e67b7688ad0c1d30f6b0b5a66838156a4f8fbd4e1affcb
+size 2580326690

checkpoints/wd15-beta1-fp16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a5a5cf857fc145f37439b6db89245101018c8f538a77d0c30e77f946d35b9cf
+size 2580068692

checkpoints/wd15-beta1-fp16.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"

checkpoints/wd15-beta1-fp32.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:620db5529f7762dc9b5510faaf2188a326d844d4535bfbfdc3c61992b71a50c2
+size 5160236890

checkpoints/wd15-beta1-fp32.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8ea25c308554b99fed79e359f4d70b3b25d6218589ab295b225c2f2bcbb5568
+size 5159974886

checkpoints/wd15-beta1-fp32.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPFeatureExtractor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_class_name": "StableDiffusionPipeline",
+  "_diffusers_version": "0.12.1",
+  "feature_extractor": [
+    "transformers",
+    "CLIPFeatureExtractor"
+  ],
+  "requires_safety_checker": true,
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "PNDMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.12.1",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "trained_betas": null
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "/mnt/sd-finetune-data/cafe-output/wd-15-official-step-2-2023-02-03-18-08/step_120000",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1024,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 23,
+  "pad_token_id": 1,
+  "projection_dim": 512,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0",
+  "vocab_size": 49408
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cce6febb0b6d876ee5eb24af35e27e764eb4f9b1d0b7c026c8c3333d4cfc916c
+size 1361597018

unet/config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.12.1",
+  "_name_or_path": "/mnt/sd-finetune-data/cafe-output/wd-15-official-step-2-2023-02-03-18-08/step_120000",
+  "act_fn": "silu",
+  "attention_head_dim": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dual_cross_attention": false,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "resnet_time_scale_shift": "default",
+  "sample_size": 96,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": true,
+  "use_linear_projection": true
+}

unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81b6cb73e529ad559d88bd674a187d21bad1679d43a8ba9cff19ed1cc6453856
+size 3463726467

vae/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.12.1",
+  "_name_or_path": "/mnt/sd-finetune-data/cafe/cafe-tools/vae-v2/vae-v2",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 512,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ba2a6f13818912597aabe11b15e3cb20c4b80fbb28c469284687b2c5758d474
+size 334643245