# custom-seq2seq / configuration_custom_seq2seq_llm.py
from transformers import PretrainedConfig


class Seq2SeqConfig(PretrainedConfig):
    """Configuration for a custom seq2seq model with separate encoder/decoder depths,
    grouped key/value heads, and rotary position embeddings."""
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_encoder_layers=6,
        num_decoder_layers=12,
        num_attention_heads=12,
        num_key_value_heads=4,
        intermediate_size=3072,
        hidden_act="silu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        use_cache=True,
        rotary_emb_dim=0,
        rotary_emb_base=10000.0,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rotary_emb_base = rotary_emb_base
        self.rotary_emb_scale_base = rotary_emb_scale_base
        self.rotary_emb_interleaved = rotary_emb_interleaved

        # Per-head dimension, derived from the hidden size and number of attention heads.
        self.head_dim = self.hidden_size // self.num_attention_heads
        # rotary_emb_dim=0 (the default) means "derive it as head_dim // 2";
        # an explicit nonzero value is used as given.
        self.rotary_emb_dim = rotary_emb_dim if rotary_emb_dim else self.head_dim // 2
        # Ensure rotary_emb_dim is not larger than head_dim.
        if self.rotary_emb_dim > self.head_dim:
            print(f"Warning: rotary_emb_dim ({self.rotary_emb_dim}) is larger than head_dim ({self.head_dim}). Setting rotary_emb_dim to head_dim.")
            self.rotary_emb_dim = self.head_dim
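

# A minimal usage sketch: build a config, inspect the derived rotary fields, and
# round-trip it through the standard PretrainedConfig save/load API. The argument
# values and the directory name below are illustrative, not shipped defaults.
if __name__ == "__main__":
    config = Seq2SeqConfig(hidden_size=768, num_attention_heads=12)
    # head_dim = 768 // 12 = 64; rotary_emb_dim defaults to head_dim // 2 = 32.
    print(config.head_dim, config.rotary_emb_dim)

    config.save_pretrained("./custom_seq2seq_config")  # writes config.json
    reloaded = Seq2SeqConfig.from_pretrained("./custom_seq2seq_config")
    assert reloaded.rotary_emb_dim == config.rotary_emb_dim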