{
  "architectures": [
    "Qwen3AudioWrappedForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "audio_adapter_configs": [
    {
      "adapter_embedding_dim": 1024,
      "adapter_name": "downsampler_conformer",
      "audio_encoder_layers": [
        7,
        15,
        23,
        31
      ],
      "downsampler_depth": 1,
      "encoder_embedding_dim": 1280,
      "encoder_name": "whisper",
      "layer_fusion_config": {
        "layer_fusion_type": "weighted_average"
      },
      "llm_embedding_dim": 2560,
      "norm_type": "batch",
      "pre_average": true,
      "use_conformer": false
    },
    {
      "adapter_embedding_dim": 1024,
      "adapter_name": "identity",
      "audio_encoder_layers": [
        3,
        7,
        11
      ],
      "encoder_name": "sslam",
      "layer_fusion_config": {
        "layer_fusion_type": "weighted_average"
      },
      "llm_embedding_dim": 2560,
      "pre_average": true,
      "use_llm_proj": false
    },
    {
      "adapter_embedding_dim": 1024,
      "adapter_name": "identity",
      "audio_encoder_layers": [
        4,
        8,
        12
      ],
      "encoder_name": "muq",
      "layer_fusion_config": {
        "layer_fusion_type": "weighted_average"
      },
      "llm_embedding_dim": 2560,
      "pre_average": true,
      "use_llm_proj": false
    },
    {
      "adapter_embedding_dim": 1024,
      "adapter_name": "identity",
      "audio_encoder_layers": [
        6,
        10,
        15,
        20,
        24
      ],
      "encoder_name": "w2vbert",
      "layer_fusion_config": {
        "layer_fusion_type": "weighted_average"
      },
      "llm_embedding_dim": 2560,
      "pre_average": true,
      "use_llm_proj": false
    }
  ],
  "audio_encoder_configs": [
    {
      "encoder_name": "whisper"
    },
    {
      "encoder_name": "sslam"
    },
    {
      "encoder_name": "muq"
    },
    {
      "encoder_name": "w2vbert"
    }
  ],
  "audio_fusion_config": {
    "conditional_embedding_dim": [
      1024,
      1024,
      768
    ],
    "conditional_encoders": [
      "w2vbert",
      "muq",
      "sslam"
    ],
    "first_cross_attention_layer_shared": [
      true,
      true,
      true
    ],
    "first_self_attention_block_shared": [
      true,
      true,
      true
    ],
    "fusion_type": "multiperceiver",
    "llm_embedding_dim": 2560,
    "main_encoder": "whisper",
    "num_cross_attention_heads": [
      4,
      4,
      4
    ],
    "num_cross_attention_layers": [
      1,
      1,
      1
    ],
    "num_latent_channels": [
      768,
      768,
      768
    ],
    "num_latents": [
      20,
      20,
      20
    ],
    "num_self_attention_blocks": [
      3,
      3,
      3
    ],
    "num_self_attention_heads": [
      4,
      4,
      4
    ],
    "num_self_attention_layers_per_block": [
      6,
      6,
      6
    ]
  },
  "audio_postprocessing_config": {
    "postprocessing_type": "identity"
  },
  "audio_sep_d_embed": 2560,
  "bos_token_id": 151643,
  "dtype": "float32",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 262144,
  "max_window_layers": 36,
  "model_type": "qwen3_audio",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 5000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": false,
  "use_explicit_audio_tokens": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}
|
|