| from transformers.configuration_utils import PretrainedConfig |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
| from transformers.models.whisper.configuration_whisper import WhisperConfig |
|
|
| from .configuration_longcat_ngram import LongcatFlashNgramConfig |
|
|
class LongcatNextConfig(LongcatFlashNgramConfig):
    """Configuration for the multimodal LongcatNext model.

    Extends :class:`LongcatFlashNgramConfig` with vision/audio sub-configs and
    the vocabulary offsets used to map visual/audio tokens into the shared
    embedding table.

    Args (multimodal additions; the remaining args are forwarded verbatim to
    the base class):
        text_vocab_size: Size of the pure-text vocabulary.
        text_vocab_plus_multimodal_special_token_size: Text vocab size plus the
            multimodal special tokens appended after it.
        visual_embedding_layer_intermediate_size: Hidden size of the visual
            embedding projection MLP.
        visual_embedding_layer_hidden_act: Activation of that MLP.
        visual_offset: First token id of the visual token range.
        audio_offset: First token id of the audio token range.
        visual_config: Dict used to build a :class:`LongcatNextVisualConfig`.
        audio_config: Dict used to build a :class:`LongcatNextAudioConfig`.
        oe_ignored_token_ids: Token ids excluded from the output embedding;
            when falsy, defaults to the multimodal special-token range
            ``[text_vocab_size, text_vocab_plus_multimodal_special_token_size)``.
    """

    model_type = "longcat_next"

    def __init__(
        self,
        vocab_size=131072,
        hidden_size=6144,
        num_hidden_layers=56,
        num_layers=28,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        ffn_hidden_size=12288,
        q_lora_rank=1536,
        kv_lora_rank=512,
        qk_nope_head_dim=128,
        qk_rope_head_dim=64,
        head_dim=64,
        v_head_dim=128,
        qk_head_dim=None,
        moe_topk=12,
        n_routed_experts=512,
        zero_expert_num=256,
        expert_ffn_hidden_size=2048,
        routed_scaling_factor=6.0,
        emb_neighbor_num=None,
        emb_split_num=None,
        ngram_vocab_size_ratio=None,
        oe_ignored_token_ids=None,  # None/[] both mean "use the default range below"
        text_vocab_size=131072,
        text_vocab_plus_multimodal_special_token_size=131125,
        visual_embedding_layer_intermediate_size=8192,
        visual_embedding_layer_hidden_act="silu",
        visual_offset=150581,
        audio_offset=131125,
        visual_config=None,
        audio_config=None,
        **kwargs,
    ):
        self.text_vocab_size = text_vocab_size
        self.text_vocab_plus_multimodal_special_token_size = text_vocab_plus_multimodal_special_token_size
        self.visual_embedding_layer_intermediate_size = visual_embedding_layer_intermediate_size
        self.visual_embedding_layer_hidden_act = visual_embedding_layer_hidden_act
        self.visual_offset = visual_offset
        self.audio_offset = audio_offset
        # None-sentinel defaults avoid the shared-mutable-default pitfall; an
        # empty/missing dict builds a sub-config with all defaults.
        self.visual_config = LongcatNextVisualConfig(**(visual_config or {}))
        self.audio_config = LongcatNextAudioConfig(**(audio_config or {}))
        # By default, ignore the multimodal special tokens appended after the
        # text vocabulary in the output embedding.
        oe_ignored_token_ids = oe_ignored_token_ids or list(
            range(self.text_vocab_size, self.text_vocab_plus_multimodal_special_token_size)
        )

        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            ffn_hidden_size=ffn_hidden_size,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
            qk_nope_head_dim=qk_nope_head_dim,
            qk_rope_head_dim=qk_rope_head_dim,
            head_dim=head_dim,
            v_head_dim=v_head_dim,
            qk_head_dim=qk_head_dim,
            moe_topk=moe_topk,
            n_routed_experts=n_routed_experts,
            zero_expert_num=zero_expert_num,
            expert_ffn_hidden_size=expert_ffn_hidden_size,
            routed_scaling_factor=routed_scaling_factor,
            emb_neighbor_num=emb_neighbor_num,
            emb_split_num=emb_split_num,
            ngram_vocab_size_ratio=ngram_vocab_size_ratio,
            oe_ignored_token_ids=oe_ignored_token_ids,
            **kwargs,
        )
|
|
class LongcatNextVisualConfig(Qwen2_5_VLVisionConfig):
    """Vision-side configuration for LongcatNext.

    Extends :class:`Qwen2_5_VLVisionConfig` with the image special-token ids
    and nested VQ / visual-decoder sub-configurations.

    Args:
        image_start_token_id: Id of the image-start special token.
        image_end_token_id: Id of the image-end special token.
        image_pad_token_id: Id of the image-padding special token.
        image_newline_token_id: Id of the image-newline special token.
        vq_config: Dict wrapped into a generic :class:`PretrainedConfig`.
        visual_decoder_config: Dict wrapped into a :class:`PretrainedConfig`;
            its ``image_decoder_config``, ``transformer_config``,
            ``vae_config`` and ``scheduler_config`` entries are likewise
            promoted to :class:`PretrainedConfig` objects.
    """

    model_type = "longcat_next_visual"
    base_config_key = ""

    def __init__(
        self,
        image_start_token_id=131106,
        image_end_token_id=131107,
        image_pad_token_id=131108,
        image_newline_token_id=131109,
        vq_config=None,
        visual_decoder_config=None,
        **kwargs,
    ):
        self.image_start_token_id = image_start_token_id
        self.image_end_token_id = image_end_token_id
        self.image_pad_token_id = image_pad_token_id
        self.image_newline_token_id = image_newline_token_id
        # None-sentinel defaults avoid sharing a mutable {} across calls.
        self.vq_config = PretrainedConfig(**(vq_config or {}))
        self.visual_decoder_config = PretrainedConfig(**(visual_decoder_config or {}))
        # Promote nested dict sub-configs to PretrainedConfig objects
        # (missing entries become empty configs).
        for sub_name in ("image_decoder_config", "transformer_config", "vae_config", "scheduler_config"):
            sub_dict = getattr(self.visual_decoder_config, sub_name, {})
            setattr(self.visual_decoder_config, sub_name, PretrainedConfig(**sub_dict))
        super().__init__(**kwargs)
|
|
class LongcatNextAudioConfig(WhisperConfig):
    """Audio-side configuration for LongcatNext.

    Extends :class:`WhisperConfig` with nested sub-configurations for the
    audio VQ, vocoders and flow-matching components.

    Args:
        vq_config: Dict wrapped into a generic :class:`PretrainedConfig`.
        vocoder_config: Dict wrapped into a :class:`PretrainedConfig`.
        flow_matching_config: Dict wrapped into a :class:`PretrainedConfig`;
            its ``cfm_params`` entry is likewise promoted.
        cosy24kvocoder_config: Dict wrapped into a :class:`PretrainedConfig`.
    """

    model_type = "longcat_next_audio"
    base_config_key = ""

    def __init__(
        self,
        vq_config=None,
        vocoder_config=None,
        flow_matching_config=None,
        cosy24kvocoder_config=None,
        **kwargs,
    ):
        # None-sentinel defaults avoid sharing a mutable {} across calls.
        self.vq_config = PretrainedConfig(**(vq_config or {}))
        self.vocoder_config = PretrainedConfig(**(vocoder_config or {}))
        self.flow_matching_config = PretrainedConfig(**(flow_matching_config or {}))
        # Promote the nested cfm_params dict (missing -> empty config).
        self.flow_matching_config.cfm_params = PretrainedConfig(
            **getattr(self.flow_matching_config, "cfm_params", {})
        )
        self.cosy24kvocoder_config = PretrainedConfig(**(cosy24kvocoder_config or {}))
        super().__init__(**kwargs)
|
|
|
|
# Public API of this module.
__all__ = ["LongcatNextConfig", "LongcatNextVisualConfig", "LongcatNextAudioConfig"]
|
|