{
  "sample_size": 128,
  "in_channels": 4,
  "out_channels": 4,
  "center_input_sample": false,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "down_block_types": [
    "DownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D"
  ],
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "up_block_types": [
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "UpBlock2D"
  ],
  "block_out_channels": [320, 640, 1280],
  "layers_per_block": 2,
  "cross_attention_dim": 2048,
  "transformer_layers_per_block": 10,
  "attention_head_dim": 8,
  "num_attention_heads": 16,
  "use_linear_projection": true
}