from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class GPTRefactConfig(PretrainedConfig):
    """Configuration for the GPT-Refact model.

    Stores the hyperparameters that define the model architecture and is
    consumed by the model class at construction time.
    """

    model_type = "gpt_refact"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Canonical Hugging Face attribute names resolve to the model-specific
    # fields below (e.g. config.hidden_size reads/writes config.n_embd).
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size: int = 49216,
        n_positions: int = 4096,
        n_embd: int = 1024,
        n_layer: int = 32,
        n_head: int = 64,
        max_position_embeddings: int = 4096,
        multi_query: bool = True,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        eos_token_id: int = 0,
        attention_softmax_in_fp32: bool = True,
        scale_attention_softmax_in_fp32: bool = True,
        attention_bias_in_fp32: bool = True,
        torch_dtype: str = "bfloat16",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        # No explicit inner (feed-forward) dimension; the model uses its default.
        self.n_inner = None
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        # Numerical-stability switches: compute the attention softmax (and its
        # scaling and bias) in fp32 even when the model itself runs in bfloat16.
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
        self.attention_bias_in_fp32 = attention_bias_in_fp32
        # Multi-query attention: all query heads share a single key/value head.
        self.multi_query = multi_query
        # Alias of n_positions via attribute_map, so this assignment writes to
        # n_positions (both default to 4096).
        self.max_position_embeddings = max_position_embeddings
        self.torch_dtype = torch_dtype
        super().__init__(eos_token_id=eos_token_id, **kwargs)
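

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal example, assuming only that `transformers` is installed: it builds
# the default config, exercises the attribute_map aliases, and round-trips the
# config through a dict as Hugging Face does when serializing to config.json.
if __name__ == "__main__":
    config = GPTRefactConfig()

    # Canonical names resolve through attribute_map to the native fields.
    assert config.hidden_size == config.n_embd == 1024
    assert config.num_hidden_layers == config.n_layer == 32
    assert config.num_attention_heads == config.n_head == 64
    assert config.max_position_embeddings == config.n_positions == 4096

    # Round-trip through a plain dict; values survive serialization.
    restored = GPTRefactConfig.from_dict(config.to_dict())
    assert restored.vocab_size == config.vocab_size == 49216
    assert restored.multi_query is True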