# UNet model configuration entries.
#
# Each top-level key names one entry. An entry carries a `type` string and,
# in most cases, an `args` mapping of constructor-style parameters.
# NOTE(review): how `type`, `super_cfg` and `MODEL(...)` are resolved is
# decided by the config loader, which is not visible here — confirm there.

# Entry of type `openai_unet`: 4 input / 4 output channels, 320 base channels.
openai_unet_sd:
  type: openai_unet
  args:
    image_size: null  # not used
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions: [4, 2, 1]
    num_res_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    num_heads: 8
    use_spatial_transformer: true
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: true
    legacy: false

# Names `openai_unet_sd` as its `super_cfg` and sets only a different `type`.
# NOTE(review): presumably the remaining settings come from the parent entry —
# verify against the loader.
openai_unet_dual_context:
  super_cfg: openai_unet_sd
  type: openai_unet_dual_context

########################
# Code cleaned version #
########################

# Same `type` as `openai_unet_2d`, but with 8 input / 8 output channels,
# 192 base channels, checkpointing off, and no `use_video_architecture` key.
openai_unet_2d_audio:
  type: openai_unet_2d
  args:
    input_channels: 8
    model_channels: 192
    output_channels: 8
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    with_attn: [true, true, true, false]
    channel_mult_connector: [1, 2, 4]
    num_noattn_blocks_connector: [1, 1, 1]
    with_connector: [true, true, true, false]
    connector_output_channel: 1280
    num_heads: 8
    context_dim: 768
    use_checkpoint: false

# Entry of type `openai_unet_2d`: 4 input / 4 output channels, 320 base
# channels, with `use_video_architecture` enabled.
openai_unet_2d:
  type: openai_unet_2d
  args:
    input_channels: 4
    model_channels: 320
    output_channels: 4
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    with_attn: [true, true, true, false]
    channel_mult_connector: [1, 2, 4]
    num_noattn_blocks_connector: [1, 1, 1]
    with_connector: [true, true, true, false]
    connector_output_channel: 1280
    num_heads: 8
    context_dim: 768
    use_checkpoint: true
    use_video_architecture: true

# Entry of type `openai_unet_0dmd`: 768 input / 768 output channels.
# Unlike the 2d entries it sets `second_dim` / `second_dim_connector` and has
# no `channel_mult_connector` key.
# NOTE(review): confirm that omission is intentional.
openai_unet_0dmd:
  type: openai_unet_0dmd
  args:
    input_channels: 768
    model_channels: 320
    output_channels: 768
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    second_dim: [4, 4, 4, 4]
    with_attn: [true, true, true, false]
    num_noattn_blocks_connector: [1, 1, 1]
    second_dim_connector: [4, 4, 4]
    with_connector: [true, true, true, false]
    connector_output_channel: 1280
    num_heads: 8
    context_dim: 768
    use_checkpoint: true

# Composite entry: refers to three entries defined above through `MODEL(...)`
# strings and lists the enabled model types.
openai_unet_codi:
  type: openai_unet_codi
  args:
    unet_image_cfg: MODEL(openai_unet_2d)
    unet_text_cfg: MODEL(openai_unet_0dmd)
    unet_audio_cfg: MODEL(openai_unet_2d_audio)
    # Alternative selections kept for reference:
    # model_type: ['video', 'image']
    # model_type: ['text']
    model_type: ['audio', 'image', 'video', 'text']