diff --git a/control_v11p_sd15_depth_fp16.safetensors b/control_v11p_sd15_depth_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..115fe3656c731fa0f42d2dfff58ea79dbb82aef7 --- /dev/null +++ b/control_v11p_sd15_depth_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bab8043519c0f563853459c1e4f4e93445a87cef1dcdfa3e1e70115b3c83553 +size 722601100 diff --git a/control_v11p_sd15_depth_fp16.yaml b/control_v11p_sd15_depth_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_depth_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_inpaint.yaml b/control_v11p_sd15_inpaint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_inpaint.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_inpaint_fp16.safetensors b/control_v11p_sd15_inpaint_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bc721fd8edfa9e6b7df2daed9bf1932f0d9bc320 --- /dev/null +++ b/control_v11p_sd15_inpaint_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677a4fe351edecd40cd0d7cc210a8686b59d4e55207317f12319ef746a7a5a89 +size 722601100 diff --git a/control_v11p_sd15_inpaint_fp16.yaml b/control_v11p_sd15_inpaint_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_inpaint_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_lineart.yaml b/control_v11p_sd15_lineart.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_lineart.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_lineart_fp16.safetensors b/control_v11p_sd15_lineart_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7affa7f156cbd231392e1a19d5bb45a7c127bfa6 --- /dev/null +++ b/control_v11p_sd15_lineart_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10559106d1bb8196298b7a81565ede9279295d2b2df15165b9dbe189994def56 +size 722601100 diff --git a/control_v11p_sd15_lineart_fp16.yaml b/control_v11p_sd15_lineart_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_lineart_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_mlsd.yaml b/control_v11p_sd15_mlsd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_mlsd.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_mlsd_fp16.safetensors b/control_v11p_sd15_mlsd_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64a8f1fb09783d5d83e04b52e61b031eae7479ea --- /dev/null +++ b/control_v11p_sd15_mlsd_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d689682fcc9341581b788524ab813cda789acdbc16bdbecbd1b9d2221e119b7 +size 722601100 diff --git a/control_v11p_sd15_mlsd_fp16.yaml b/control_v11p_sd15_mlsd_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_mlsd_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_normalbae.yaml b/control_v11p_sd15_normalbae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_normalbae.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_normalbae_fp16.safetensors b/control_v11p_sd15_normalbae_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f09b754ec6eeebe291fe9586cd8a8ffc1394df7 --- /dev/null +++ b/control_v11p_sd15_normalbae_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79864bfc3a1df58ed35bd92fc8dd1e4d7b85cac424cc427b9049ddc7647cceec +size 722601100 diff --git a/control_v11p_sd15_normalbae_fp16.yaml b/control_v11p_sd15_normalbae_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_normalbae_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_openpose.yaml b/control_v11p_sd15_openpose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_openpose.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_openpose_fp16.safetensors b/control_v11p_sd15_openpose_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec5a3b296ac5a2002e9dd8d69ff870ded494cd2b --- /dev/null +++ b/control_v11p_sd15_openpose_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4003c1da17b0e4ba444e02140e1c0d83bb24b79e4dcfd613c3a554d38f0f89c7 +size 722601100 diff --git a/control_v11p_sd15_openpose_fp16.yaml b/control_v11p_sd15_openpose_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_openpose_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_scribble.yaml b/control_v11p_sd15_scribble.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_scribble.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_scribble_fp16.safetensors b/control_v11p_sd15_scribble_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3d62447a2fdbc308baf2eddb3375d199837f04b --- /dev/null +++ b/control_v11p_sd15_scribble_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99edfd25b54c18c0ab19fba8c5618f741aac1f8c3101e7fa62cce925ad87ae68 +size 722601100 diff --git a/control_v11p_sd15_scribble_fp16.yaml b/control_v11p_sd15_scribble_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_scribble_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_seg.yaml b/control_v11p_sd15_seg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_seg.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_seg_fp16.safetensors b/control_v11p_sd15_seg_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65c09abfbe53ab108c239044ae2d6364098ae07d --- /dev/null +++ b/control_v11p_sd15_seg_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd4dd3df2da2f1f2b9dd00f4504cc0d98b20afb608e25f1789a95c0ccdba14a +size 722601100 diff --git a/control_v11p_sd15_seg_fp16.yaml b/control_v11p_sd15_seg_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_seg_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_softedge.yaml b/control_v11p_sd15_softedge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_softedge.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_softedge_fp16.safetensors b/control_v11p_sd15_softedge_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..976eb04e7aa2c0e5a722b9c2c4325c0a537c0dbe --- /dev/null +++ b/control_v11p_sd15_softedge_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e78fea5b4599fec2ecd7e3f14b171feb290b88200c95d569ec0ff59a19bc3478 +size 722601100 diff --git a/control_v11p_sd15_softedge_fp16.yaml b/control_v11p_sd15_softedge_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_softedge_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15s2_lineart_anime.yaml b/control_v11p_sd15s2_lineart_anime.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15s2_lineart_anime.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15s2_lineart_anime_fp16.safetensors b/control_v11p_sd15s2_lineart_anime_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b54f7c2014edb5cba82766eb1f91ba14bf5c0046 --- /dev/null +++ b/control_v11p_sd15s2_lineart_anime_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067bf845109afdd1049bd9553d44faed1ad53691bd6b5ac9ee31c87466ef7c27 +size 722601100 diff --git a/control_v11p_sd15s2_lineart_anime_fp16.yaml b/control_v11p_sd15s2_lineart_anime_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15s2_lineart_anime_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11u_sd15_tile.yaml b/control_v11u_sd15_tile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11u_sd15_tile.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11u_sd15_tile_fp16.safetensors b/control_v11u_sd15_tile_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..74f7ff0ad8d0ec38f9fe5e92a010b93e460f1796 --- /dev/null +++ b/control_v11u_sd15_tile_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:387ac74221099dcd163b3a76060d0fae441301e169646f186875f620b1287a8f +size 722601100 diff --git a/control_v11u_sd15_tile_fp16.yaml b/control_v11u_sd15_tile_fp16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11u_sd15_tile_fp16.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/image_adapter_v14.yaml b/image_adapter_v14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/image_adapter_v14.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/sketch_adapter_v14.yaml b/sketch_adapter_v14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/sketch_adapter_v14.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_canny_sd14v1.pth b/t2iadapter_canny_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..01cf956e10656f111da78340013d7d354f7a176b --- /dev/null +++ b/t2iadapter_canny_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb35d666889f622f7c499c5c11a8e94dabd9231029a13a5efd736364e76a987 +size 308013107 diff --git a/t2iadapter_canny_sd14v1.yaml b/t2iadapter_canny_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_canny_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_canny_sd15v2.yaml b/t2iadapter_canny_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_canny_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_color_sd14v1.pth b/t2iadapter_color_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd05ea3480769abb5e92451b30a31af9b7af18b7 --- /dev/null +++ b/t2iadapter_color_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea46a3d2f26023179a8daccb2259a198e992044570d2f9bd18e412d479fd591 +size 74780341 diff --git a/t2iadapter_color_sd14v1.yaml b/t2iadapter_color_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6780dd94ca6abfe58b4e3dcce1206b902fc3d540 --- /dev/null +++ b/t2iadapter_color_sd14v1.yaml @@ -0,0 +1,6 @@ +model: + target: scripts.adapter.Adapter_light + params: + channels: [320, 640, 1280, 1280] + nums_rb: 4 + cin: 192 \ No newline at end of file diff --git a/t2iadapter_depth_sd14v1.pth b/t2iadapter_depth_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb25f2247850d76bbc61ca77c25e71077a2621a --- /dev/null +++ b/t2iadapter_depth_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6adc765d71cbd07567688a9320add25db9a0758eef2e3b6897cbca71e1cf9d36 +size 309487667 diff --git a/t2iadapter_depth_sd14v1.yaml b/t2iadapter_depth_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_depth_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_depth_sd15v2.yaml b/t2iadapter_depth_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_depth_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_keypose_sd14v1.pth b/t2iadapter_keypose_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..838f7b1066d7106b7dd3f90c5c134e798bbe64ab --- /dev/null +++ b/t2iadapter_keypose_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edca0cef5eb32feba66c8e0524d996204b87a19eace640d27826d04229d17d7e +size 309487667 diff --git a/t2iadapter_keypose_sd14v1.yaml b/t2iadapter_keypose_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_keypose_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_openpose_sd14v1.pth b/t2iadapter_openpose_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..316f6c509865e713899078c757c07e19a1183595 --- /dev/null +++ b/t2iadapter_openpose_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d8027cbeae188dcae57b7475243b7dec20f2620c3c0fe7778319c56bf1ec6b +size 309487667 diff --git a/t2iadapter_openpose_sd14v1.yaml b/t2iadapter_openpose_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_openpose_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_seg_sd14v1.pth b/t2iadapter_seg_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1033a956541a4d758ea61f58d1bd18c3bdd987d --- /dev/null +++ b/t2iadapter_seg_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01a17ecd473e3e375cea152c1e4135274f7113a495be35909b497d65b87713a +size 309487667 diff --git a/t2iadapter_seg_sd14v1.yaml b/t2iadapter_seg_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_seg_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_sketch_sd14v1.pth b/t2iadapter_sketch_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..16eee33d08ea401e5c387f13c1f15a5760b1a8e8 --- /dev/null +++ b/t2iadapter_sketch_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e006de134c9db86d1d44e6f13783e0846a78879afa3bd6f73feb3d7f6a5715b1 +size 308013107 diff --git a/t2iadapter_sketch_sd14v1.yaml b/t2iadapter_sketch_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_sketch_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_sketch_sd15v2.yaml b/t2iadapter_sketch_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_sketch_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_style_sd14v1.pth b/t2iadapter_style_sd14v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e12a85c3939994265faa36fd36d9480efab7d6c9 --- /dev/null +++ b/t2iadapter_style_sd14v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b41febaddb32c4338272b9ad78b7d2b2584749ca5750d6b1d972766eb2fb731b +size 154363687 diff --git a/t2iadapter_style_sd14v1.yaml b/t2iadapter_style_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f634fbe7e46b9e4057298af395e0a28ac1516cf --- /dev/null +++ b/t2iadapter_style_sd14v1.yaml @@ -0,0 +1,8 @@ +model: + target: scripts.adapter.StyleAdapter + params: + width: 1024 + context_dim: 768 + num_head: 8 + n_layes: 3 + num_token: 8 \ No newline at end of file diff --git a/t2iadapter_zoedepth_sd15v1.yaml b/t2iadapter_zoedepth_sd15v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_zoedepth_sd15v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file