# Diffusion inpainting model config (sgm DiffusionEngine).
# 9-channel UNet input = 4 (noised latent) + 1 (mask) + 4 (masked-image latent);
# conditioning comes from a trainable-frozen LabelEncoder (add-crossattn) plus
# concat conditions (mask + masked-image latent).
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    input_key: image
    scale_factor: 0.18215
    disable_first_stage_autocast: true

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000
        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetAddModel
      params:
        use_checkpoint: false
        in_channels: 9
        out_channels: 4
        ctrl_channels: 0
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        attn_type: add_attn
        attn_layers:
          - output_blocks.6.1
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 0
        add_context_dim: 2048
        legacy: false

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond (disabled)
          # - is_trainable: False
          #   input_key: txt
          #   target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          #   params:
          #     arch: ViT-H-14
          #     version: ./checkpoints/encoders/OpenCLIP/ViT-H-14/open_clip_pytorch_model.bin
          #     layer: penultimate
          # add crossattn cond
          - is_trainable: false
            input_key: label
            target: sgm.modules.encoders.modules.LabelEncoder
            params:
              is_add_embedder: true
              max_len: 12
              emb_dim: 2048
              n_heads: 8
              n_trans_layers: 12
              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
          # concat cond
          - is_trainable: false
            input_key: mask
            target: sgm.modules.encoders.modules.IdentityEncoder
          - is_trainable: false
            input_key: masked
            target: sgm.modules.encoders.modules.LatentEncoder
            params:
              scale_factor: 0.18215
              config:
                target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
                params:
                  ckpt_path: ./checkpoints/AEs/AE_inpainting_2.safetensors
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.FullLoss  # StandardDiffusionLoss
      params:
        seq_len: 12
        kernel_size: 3
        gaussian_sigma: 0.5
        min_attn_size: 16
        lambda_local_loss: 0.02
        lambda_ocr_loss: 0.001
        ocr_enabled: false
        predictor_config:
          target: sgm.modules.predictors.model.ParseqPredictor
          params:
            ckpt_path: "./checkpoints/predictors/parseq-bb5792a6.pt"

    sigma_sampler_config:
      target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
      params:
        num_idx: 1000
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization