---
# Model and run configuration: a `model` section whose `target` is
# cldm.cldm.ControlLDM, followed by dataset / checkpoint / logging settings.
#
# NOTE(review): this file was recovered from a copy in which every key had
# been collapsed onto a single line (not parseable as YAML). The nesting
# below is reconstructed from the key names and the repeated
# `<name>_config: {target, params}` pattern -- confirm against the code that
# consumes this file, in particular the spots marked NOTE(review).

model:
  target: cldm.cldm.ControlLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000

    # Batch keys. `first_stage_key_cond` and `mask1_key` both reference
    # "agn_mask"; `cond_stage_key` and `control_key` both reference "cloth".
    first_stage_key: "image"
    first_stage_key_cond: ["agn", "agn_mask", "image_densepose"]
    cond_stage_key: "cloth"
    control_key: "cloth"

    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false
    use_VAEdownsample: true
    use_lastzc: true
    use_imageCLIP: true
    use_pbe_weight: true
    u_cond_percent: 0.2
    use_attn_mask: false
    mask1_key: "agn_mask"
    mask2_key: "cloth_mask"

    # Shares its architecture values (image_size, in_channels,
    # model_channels, attention_resolutions, channel_mult, ...) with
    # `unet_config` below; keep the two in sync when editing.
    control_stage_config:
      target: cldm.warping_cldm_network.NoZeroConvControlNet
      params:
        image_size: 32
        in_channels: 13
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false
        cond_first_ch: 4

    unet_config:
      target: cldm.warping_cldm_network.StableVITON
      params:
        image_size: 32
        in_channels: 13
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false
        dim_head_denorm: 1

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    # NOTE(review): placed under `model.params` because it sits between
    # `first_stage_config` and `cond_stage_config` in the original key
    # order -- confirm the consumer reads it from here.
    validation_config:
      ddim_steps: 50
      eta: 0.0
      scale: 1.0

    # Only a `target` is given; no `params` mapping appears in the original.
    cond_stage_config:
      target: ldm.modules.image_encoders.modules.FrozenCLIPImageEmbedder

# NOTE(review): the keys below are assumed to be top-level (siblings of
# `model`) rather than entries of `model.params` -- verify against the
# training / inference scripts that load this file.
dataset_name: VITONHDDataset
resume_path: ./pretrained_models/VITONHD_PBE_pose.ckpt
default_prompt: ""
log_images_kwargs:
  unconditional_guidance_scale: 5.0