from transformers import PretrainedConfig


class BatGPTConfig(PretrainedConfig):
    """Configuration class for the BatGPT model, extending PretrainedConfig."""

    model_type = "batgpt"

    def __init__(
        self,
        vocab_size=65024,
        emb_dim=5632,
        hidden_size=5632,
        n_layer=48,
        n_head=44,
        layer_norm_epsilon=1e-5,
        use_multi_query_attn=True,
        num_heads_per_kv=2,
        qkv_bias=True,
        use_native_attn_impl=True,
        mlp_activation="swiglu",
        hidden_dropout=0.0,
        ffn_hidden_size=13696,
        prefix_size=None,
        prefix_proj=False,
        max_seq_len=32768,
        pos_emb_impl="rope",
        use_emb_factorization=False,
        empty_init=True,
        **kwargs,
    ):
        # Vocabulary and embedding dimensions.
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.hidden_size = hidden_size
        # Transformer depth, attention heads, and normalization.
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        # Multi-query / grouped attention settings.
        self.use_multi_query_attn = use_multi_query_attn
        self.num_heads_per_kv = num_heads_per_kv
        self.qkv_bias = qkv_bias
        self.use_native_attn_impl = use_native_attn_impl
        # Feed-forward block.
        self.mlp_activation = mlp_activation
        self.hidden_dropout = hidden_dropout
        self.ffn_hidden_size = ffn_hidden_size
        # Optional prefix-prompt settings.
        self.prefix_size = prefix_size
        self.prefix_proj = prefix_proj
        # Context length and positional embedding implementation.
        self.max_seq_len = max_seq_len
        self.pos_emb_impl = pos_emb_impl
        # Embedding factorization and initialization behavior.
        self.use_emb_factorization = use_emb_factorization
        self.empty_init = empty_init
        super().__init__(**kwargs)
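

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the config class itself):
# instantiate the config with a couple of overridden hyperparameters, then
# round-trip it through save_pretrained/from_pretrained. The "./batgpt-small"
# directory name is a hypothetical example path.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Override a few defaults, e.g. a shallower model for local experiments.
    config = BatGPTConfig(n_layer=24, max_seq_len=8192)
    assert config.hidden_size == 5632          # unchanged default
    config.save_pretrained("./batgpt-small")   # writes config.json
    reloaded = BatGPTConfig.from_pretrained("./batgpt-small")
    assert reloaded.n_layer == 24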