# LongCat-Next-4bit / configuration_longcat_next.py
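"""Configuration classes for the LongCat-Next multimodal model (text MoE backbone
with a Qwen2.5-VL-based vision sub-config and a Whisper-based audio sub-config)."""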
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
from transformers.models.whisper.configuration_whisper import WhisperConfig
from .configuration_longcat_ngram import LongcatFlashNgramConfig


class LongcatNextConfig(LongcatFlashNgramConfig):
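    """Top-level LongCat-Next configuration.

    Extends LongcatFlashNgramConfig with a multimodal vocabulary layout and nested
    vision/audio sub-configurations. With the defaults below, text token ids occupy
    [0, text_vocab_size), multimodal special tokens occupy
    [text_vocab_size, text_vocab_plus_multimodal_special_token_size), audio tokens
    begin at audio_offset, and visual tokens begin at visual_offset.
    """
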
model_type = "longcat_next"

    def __init__(
self,
vocab_size=131072,
hidden_size=6144,
num_hidden_layers=56,
num_layers=28,
num_attention_heads=64,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=131072,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=10000000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
ffn_hidden_size=12288,
q_lora_rank=1536,
kv_lora_rank=512,
qk_nope_head_dim=128,
qk_rope_head_dim=64,
head_dim=64,
v_head_dim=128,
qk_head_dim=None,
moe_topk=12,
n_routed_experts=512,
zero_expert_num=256,
expert_ffn_hidden_size=2048,
routed_scaling_factor=6.0,
emb_neighbor_num=None,
emb_split_num=None,
ngram_vocab_size_ratio=None,
        oe_ignored_token_ids=None,
        text_vocab_size=131072,  # vocab_size = text + multimodal special + audio + visual tokens
text_vocab_plus_multimodal_special_token_size=131125,
visual_embedding_layer_intermediate_size=8192,
visual_embedding_layer_hidden_act="silu",
visual_offset=150581,
audio_offset=131125,
        visual_config=None,
        audio_config=None,
**kwargs,
):
self.text_vocab_size = text_vocab_size
self.text_vocab_plus_multimodal_special_token_size = text_vocab_plus_multimodal_special_token_size
self.visual_embedding_layer_intermediate_size = visual_embedding_layer_intermediate_size
self.visual_embedding_layer_hidden_act = visual_embedding_layer_hidden_act
self.visual_offset = visual_offset
self.audio_offset = audio_offset
        self.visual_config = LongcatNextVisualConfig(**(visual_config or {}))
        self.audio_config = LongcatNextAudioConfig(**(audio_config or {}))
        # By default, exclude the multimodal special-token id range.
        oe_ignored_token_ids = oe_ignored_token_ids or list(
            range(self.text_vocab_size, self.text_vocab_plus_multimodal_special_token_size)
        )
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_layers=num_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
rms_norm_eps=rms_norm_eps,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
ffn_hidden_size=ffn_hidden_size,
q_lora_rank=q_lora_rank,
kv_lora_rank=kv_lora_rank,
qk_nope_head_dim=qk_nope_head_dim,
qk_rope_head_dim=qk_rope_head_dim,
head_dim=head_dim,
v_head_dim=v_head_dim,
qk_head_dim=qk_head_dim,
moe_topk=moe_topk,
n_routed_experts=n_routed_experts,
zero_expert_num=zero_expert_num,
expert_ffn_hidden_size=expert_ffn_hidden_size,
routed_scaling_factor=routed_scaling_factor,
emb_neighbor_num=emb_neighbor_num,
emb_split_num=emb_split_num,
ngram_vocab_size_ratio=ngram_vocab_size_ratio,
oe_ignored_token_ids=oe_ignored_token_ids,
**kwargs,
)


class LongcatNextVisualConfig(Qwen2_5_VLVisionConfig):
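    """Vision sub-configuration based on Qwen2.5-VL's vision config, extended with
    image-delimiter token ids and nested VQ / visual-decoder configs."""
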
model_type = "longcat_next_visual"
    base_config_key = ""  # override the parent's sub-config key so this config loads standalone

    def __init__(
self,
image_start_token_id=131106,
image_end_token_id=131107,
image_pad_token_id=131108,
image_newline_token_id=131109,
        vq_config=None,
        visual_decoder_config=None,
**kwargs,
):
self.image_start_token_id = image_start_token_id
self.image_end_token_id = image_end_token_id
self.image_pad_token_id = image_pad_token_id
self.image_newline_token_id = image_newline_token_id
        self.vq_config = PretrainedConfig(**(vq_config or {}))
        self.visual_decoder_config = PretrainedConfig(**(visual_decoder_config or {}))
        # Promote the decoder's nested dicts to PretrainedConfig objects as well.
        for key in ("image_decoder_config", "transformer_config", "vae_config", "scheduler_config"):
            setattr(
                self.visual_decoder_config,
                key,
                PretrainedConfig(**getattr(self.visual_decoder_config, key, {})),
            )
super().__init__(**kwargs)


class LongcatNextAudioConfig(WhisperConfig):
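    """Audio sub-configuration based on WhisperConfig, extended with nested VQ,
    vocoder, flow-matching, and cosy24kvocoder configs."""
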
model_type = "longcat_next_audio"
base_config_key = ""

    def __init__(
self,
        vq_config=None,
        vocoder_config=None,
        flow_matching_config=None,
        cosy24kvocoder_config=None,
        **kwargs,
):
        self.vq_config = PretrainedConfig(**(vq_config or {}))
        self.vocoder_config = PretrainedConfig(**(vocoder_config or {}))
        self.flow_matching_config = PretrainedConfig(**(flow_matching_config or {}))
        # Promote the nested CFM parameter dict to a PretrainedConfig object as well.
        self.flow_matching_config.cfm_params = PretrainedConfig(
            **getattr(self.flow_matching_config, "cfm_params", {})
        )
        self.cosy24kvocoder_config = PretrainedConfig(**(cosy24kvocoder_config or {}))
super().__init__(**kwargs)


__all__ = ["LongcatNextConfig", "LongcatNextVisualConfig", "LongcatNextAudioConfig"]
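

# Minimal usage sketch (illustrative only, assuming the module is imported as part
# of its package so the relative import above resolves). Nested dicts passed for
# the sub-configs are promoted to config objects; unknown keys, like the made-up
# "codebook_size" below, become attributes on a plain PretrainedConfig.
if __name__ == "__main__":
    config = LongcatNextConfig(
        visual_config={"image_pad_token_id": 131108},
        audio_config={"vq_config": {"codebook_size": 1024}},  # hypothetical VQ field
    )
    print(type(config.visual_config).__name__)          # -> LongcatNextVisualConfig
    print(config.audio_config.vq_config.codebook_size)  # -> 1024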