{
  "architectures": [
    "STDiT3"
  ],
  "caption_channels": 4096,
  "class_dropout_prob": 0.1,
  "depth": 28,
  "drop_path": 0.0,
  "enable_flash_attn": true,
  "enable_layernorm_kernel": true,
  "enable_sequence_parallelism": false,
  "freeze_y_embedder": true,
  "hidden_size": 1152,
  "in_channels": 4,
  "input_size": [
    null,
    null,
    null
  ],
  "input_sq_size": 512,
  "mlp_ratio": 4.0,
  "model_max_length": 300,
  "model_type": "STDiT3",
  "num_heads": 16,
  "only_train_temporal": false,
  "patch_size": [
    1,
    2,
    2
  ],
  "pred_sigma": true,
  "qk_norm": true,
  "skip_y_embedder": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3"
}