{
  "_class_name": "TransformerMAE",
  "_diffusers_version": "0.24.0",
  "_name_or_path": "/shared/jiarui/results/video-diffusers/output/mae_p16_text_ed_enc_cat_n1024_arxiv_l400m_224_600k_bs1024x8/amp/hf_pipeline/mask_image_model",
  "cross_attention_dim": 1024,
  "decoder_cross_attention": true,
  "decoder_cross_attention_cat_encoder": true,
  "decoder_depth": 8,
  "decoder_embed_dim": 512,
  "decoder_num_heads": 16,
  "encoder_cross_attention": true,
  "encoder_cross_attention_cat_encoder": true,
  "encoder_depth": 24,
  "encoder_embed_dim": 1024,
  "encoder_num_heads": 16,
  "in_channels": 1024,
  "in_context_cross_attention": false,
  "num_embed": null,
  "out_channels": 1024,
  "sample_size": 14,
  "with_cls_token": true
}