|
from transformers import PretrainedConfig
|
|
|
|
|
|
class LingLongConfig(PretrainedConfig):
|
|
model_type = 'linglong'
|
|
|
|
def __init__(
|
|
self,
|
|
vocab_size: int = 13312,
|
|
n_position: int = 1024,
|
|
n_embd: int = 1024,
|
|
n_layer: int = 24,
|
|
n_head: int = 16,
|
|
n_inner: int | None = None,
|
|
activation_function: str = 'gelu_new',
|
|
resid_pdrop: float = 0.1,
|
|
embd_pdrop: float = 0.1,
|
|
attn_pdrop: float = 0.1,
|
|
layer_norm_epsilon: float = 1e-8,
|
|
initializer_range: float = 0.02,
|
|
scale_attn_weights: bool = True,
|
|
use_cache: bool = True,
|
|
bos_token_id: int = 10,
|
|
eos_token_id: int = 8,
|
|
pad_token_id: int = 0,
|
|
scale_attn_by_inverse_layer_idx: bool = False,
|
|
reorder_and_upcast_attn: bool = False,
|
|
attn_mode: str = 'sparse',
|
|
attn_stride: int | None = 128,
|
|
attn_c: int | None = 8,
|
|
use_pinyin: bool = False,
|
|
backward: bool = False,
|
|
tokenizer_class: str | None = 'LingLongTokenizer',
|
|
**kwargs,
|
|
):
|
|
self.vocab_size = vocab_size
|
|
self.n_position = n_position
|
|
self.n_embd = n_embd
|
|
self.n_layer = n_layer
|
|
self.n_head = n_head
|
|
self.n_inner = n_inner
|
|
self.activation_function = activation_function
|
|
self.resid_pdrop = resid_pdrop
|
|
self.embd_pdrop = embd_pdrop
|
|
self.attn_pdrop = attn_pdrop
|
|
self.layer_norm_epsilon = layer_norm_epsilon
|
|
self.initializer_range = initializer_range
|
|
self.scale_attn_weights = scale_attn_weights
|
|
self.use_cache = use_cache
|
|
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
|
|
self.reorder_and_upcast_attn = reorder_and_upcast_attn
|
|
self.attn_mode = attn_mode
|
|
self.attn_stride = attn_stride
|
|
self.attn_c = attn_c
|
|
self.use_pinyin = use_pinyin
|
|
self.backward = backward
|
|
|
|
self.bos_token_id = bos_token_id
|
|
self.eos_token_id = eos_token_id
|
|
self.pad_token_id = pad_token_id
|
|
|
|
super().__init__(
|
|
bos_token_id=bos_token_id,
|
|
eos_token_id=eos_token_id,
|
|
pad_token_id=pad_token_id,
|
|
tokenizer_class=tokenizer_class,
|
|
**kwargs,
|
|
)
|
|
|