{ "_name_or_path": "_", "architectures": [ "OmniForCausalLM" ], "attention_qkv_bias": true, "attention_qkv_pack": true, "audio_config": { "audio_head_transformer_layers": 3, "audio_delim_token_id": 151693, "audio_end_token_id": 151677, "audio_pad_token_id": 151678, "audio_start_token_id": 151676, "audiogen_end_token_id": 151701, "audiogen_start_token_id": 151700, "audiotext_end_token_id": 151698, "audiotext_pad_token_id": 151699, "audiotext_start_token_id": 151697, "avg_pooler": 4, "d_model": 1280, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_kernel_size": 3, "decoder_layers": 8, "decoder_stride_size": 2, "enable": true, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_layers": 32, "hop_length": 160, "kernel_size": 3, "max_audio_seconds": 30, "n_fft": 400, "num_mel_bins": 128, "sampling_rate": 16000, "stride_size": 2, "split_overlap": 0.0, "vq_config":{ "enable": true, "codebook_sizes": [8192, 4096, 2048, 1024, 1024, 1024, 1024, 1024] } }, "auto_map": { "AutoConfig": "configuration_omni.OmniConfig", "AutoModelForCausalLM": "modeling_omni.OmniForCausalLM" }, "omni_tokenizer_type": "auto", "bos_token_id": 1, "eos_token_id": 2, "flow_matching_config": { "enable": true, "use_hires_mel": true, "sampling_rate": 24000, "hop_length": 480, "max_audio_seconds": 30, "split_overlap": 0.1, "use_hidden_states_before_dconv2": true, "prenet_in_dim": 1280, "prenet_out_dim": 80, "prenet_d_model": 512, "prenet_attention_heads": 8, "prenet_ffn_dim": 2048, "prenet_nlayers": 12, "prenet_activation_function": "gelu", "prenet_max_source_positions": 5000, "prenet_target_mel_length_scale_ratio": 1.0, "prenet_loss_weight": 1.0, "unet_use_omni_attn": false, "loss_weight": 1.0, "in_channels": 80, "spk_emb_dim": 0, "diffusion_steps": 10, "channels": [256], "dropout": 0.0, "attention_head_dim": 64, "n_blocks": 4, "num_mid_blocks": 12, "num_heads": 8, "act_fn": "gelu", "cal_mel_mae": true, "cfm_params": { "sigma_min": 1e-6, "solver": "euler", "t_scheduler": "cosine", "training_cfg_rate": 0.2, "inference_cfg_rate": 0.7, "reg_loss_type": "l1" } }, "head_dim": 128, "hidden_act": "silu", "hidden_size": 3584, "initializer_range": 0.02, "intermediate_size": 18944, "max_position_embeddings": 65536, "max_window_layers": 28, "model_type": "omni", "multimodal": [ "audio", "image", "video", "audiogen" ], "multimodal_special_token_list": [ 151676, 151677, 151678, 151679, 151680, 151681, 151682, 151683, 151684, 151685, 151686, 151687, 151688, 151693, 151694, 151695, 151696, 151697, 151698, 151699, 151700, 151701 ], "num_attention_heads": 28, "num_hidden_layers": 28, "num_key_value_heads": 4, "pad_token_id": 0, "position_embedding_type": "rope", "rms_norm_eps": 1e-06, "rope_theta": 1000000.0, "sliding_window": 131072, "sparse_attention_heads": null, "sparse_attention_layers": [], "tie_word_embeddings": false, "torch_dtype": "bfloat16", "train_multimodal_special_tokens_only": false, "transformers_version": "4.45.0.dev0", "use_cache": false, "use_norm_head": false, "use_sliding_window": false, "video_config": { "_name_or_path": "", "_attn_implementation": "flash_attention_2", "decode_way": "1fps", "depth": 32, "embed_dim": 1280, "enable": true, "hidden_act": "quick_gelu", "hidden_size": 3584, "image_delimiter_token_id": 151688, "image_end_token_id": 151680, "image_line_token_id": 151682, "image_mean": [ 0.48145466, 0.4578275, 0.40821073 ], "image_pad_token_id": 151681, "image_size": 224, "image_start_token_id": 151679, "image_std": [ 0.26862954, 0.26130258, 0.27577711 ], "in_channels": 3, "in_chans": 3, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "max_frame_num": 32, "max_length": 20, "max_pixels": 602112, "merge_size": 2, "min_length": 0, "min_pixels": 3136, "mlp_ratio": 4, "model_type": "clip_vision_model", "num_attention_heads": 12, "num_channels": 3, "num_heads": 16, "num_hidden_layers": 12, "patch_size": 14, "spatial_merge_size": 2, "spatial_patch_size": 14, "temporal_patch_size": 2, "video_end_token_id": 151696, "video_place_token_id": 151694, "video_start_token_id": 151695 }, "visual_config": { "_name_or_path": "", "_attn_implementation": "flash_attention_2", "depth": 32, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "embed_dim": 1280, "enable": true, "hidden_act": "quick_gelu", "hidden_size": 3584, "image_delimiter_token_id": 151688, "image_end_token_id": 151680, "image_line_token_id": 151682, "image_mean": [ 0.48145466, 0.4578275, 0.40821073 ], "image_pad_token_id": 151681, "image_size": 224, "image_start_token_id": 151679, "image_std": [ 0.26862954, 0.26130258, 0.27577711 ], "in_channels": 3, "in_chans": 3, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "length_penalty": 1.0, "max_length": 20, "max_pixels": 3211264, "merge_size": 2, "min_length": 0, "min_pixels": 3136, "mlp_ratio": 4, "model_type": "clip_vision_model", "num_attention_heads": 12, "num_channels": 3, "num_heads": 16, "num_hidden_layers": 12, "patch_size": 14, "projection_dim": 512, "spatial_merge_size": 2, "spatial_patch_size": 14, "temporal_patch_size": 2 }, "vocab_size": 152064, "vocoder_config":{ "enable": true, "enable_multi_scale": true, "max_audio_seconds": 30, "sampling_rate": 16000, "hop_length": 256, "split_overlap": 0.0, "n_fft": 1024, "num_mel_bins": 80, "channels": [256, 256, 256, 256, 256] } }