{
  "architectures": [
    "Qwen2VLVAEForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "vision_start_token_id": 151652,
  "vision_end_token_id": 151653,
  "vision_token_id": 151654,
  "image_token_id": 151655,
  "video_token_id": 151656,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2_vl_vae",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vision_config": {
    "in_channels": 12,
    "patch_size": 2,
    "hidden_size": 3584,
    "vae_path": "genmo/mochi-1-preview",
    "vae_subfolder": "vae",
    "vae_config": {
      "_class_name": "AutoencoderKLMochi",
      "_diffusers_version": "0.32.0.dev0",
      "act_fn": "silu",
      "add_attention_block": [
        false,
        true,
        true,
        true,
        true
      ],
      "decoder_block_out_channels": [
        128,
        256,
        512,
        768
      ],
      "encoder_block_out_channels": [
        64,
        128,
        256,
        384
      ],
      "in_channels": 15,
      "latent_channels": 12,
      "latents_mean": [
        -0.06730895953510081,
        -0.038011381506090416,
        -0.07477820912866141,
        -0.05565264470995561,
        0.012767231469026969,
        -0.04703542746246419,
        0.043896967884726704,
        -0.09346305707025976,
        -0.09918314763016893,
        -0.008729793427399178,
        -0.011931556316503654,
        -0.0321993391887285
      ],
      "latents_std": [
        0.9263795028493863,
        0.9248894543193766,
        0.9393059390890617,
        0.959253732819592,
        0.8244560132752793,
        0.917259975397747,
        0.9294154431013696,
        1.3720942357788521,
        0.881393668867029,
        0.9168315692124348,
        0.9185249279345552,
        0.9274757570805041
      ],
      "layers_per_block": [
        3,
        3,
        4,
        6,
        3
      ],
      "out_channels": 3,
      "scaling_factor": 1.0,
      "spatial_expansions": [
        2,
        2,
        2
      ],
      "temporal_expansions": [
        1,
        2,
        3
      ]
    }
  },
  "rope_scaling": {
    "type": "mrope",
    "mrope_section": [
      16,
      24,
      24
    ]
  },
  "vocab_size": 152064
}