model:
  target: cldm.cldm.ControlLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
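    # control_stage_config below builds the trainable ControlNet branch: a copy of the
    # SD encoder whose outputs are added into the locked U-Net. hint_channels: 3 is the
    # RGB control image; in_channels: 4 matches the VAE latent channels.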
					
						
    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        image_size: 32
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
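    # unet_config below is the locked Stable Diffusion U-Net (ControlledUnetModel),
    # which consumes the ControlNet residuals; its hyperparameters mirror the SD v1.5
    # U-Net (320 base channels, CLIP text context_dim 768).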
					
						
    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
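    # first_stage_config below is the SD KL autoencoder (VAE) that maps 3-channel
    # images to the 4-channel latents the U-Net operates on; lossconfig is Identity
    # because the VAE is not trained in this setup.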
					
						
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
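    # cond_stage_config below is the frozen CLIP text encoder used for the
    # cross-attention conditioning (matching cond_stage_trainable: false above).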
					
						
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
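# Note: a config like this is typically loaded with OmegaConf and instantiated via
# instantiate_from_config (e.g. the create_model helper in the ControlNet repo's
# cldm/model.py); the dataloader is then expected to yield dicts containing the
# "jpg", "txt" and "hint" keys referenced by first_stage_key / cond_stage_key /
# control_key above.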