"""Configuration class for DeciCoder, built on top of the Llama configuration."""

from packaging import version
import transformers

# Fail fast at import time when the installed transformers release is older
# than the minimum this module requires. The Llama imports below are kept
# after this guard so the user sees this message rather than a less helpful
# import failure.
if version.parse(transformers.__version__) < version.parse("4.31.0"):
    raise ImportError(
        f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciCoder. Please upgrade transformers."
    )

from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

# Intentionally empty; nothing in this file reads it.
# NOTE(review): presumably kept for parity with the upstream Llama
# configuration module - confirm before removing.
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeciCoderConfig(LlamaConfig):
    r"""
    This is the configuration class to store the configuration of a DeciCoder model. It extends [`LlamaConfig`] with
    three flags that select the attention implementation used in the different generation phases; every other
    argument is forwarded unchanged to [`LlamaConfig`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        naive_attention_prefill (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during prefill.
        naive_attention_decode_batched (`bool`, *optional*, defaults to `True`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1.
        naive_attention_decode_single (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1.
        kwargs:
            Remaining keyword arguments, passed as-is to [`LlamaConfig`].
    """

    # NOTE(review): the type string is "llama" rather than a DeciCoder-specific
    # value; presumably intentional so the config is handled like a Llama
    # config - confirm before changing.
    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        naive_attention_prefill: bool = False,
        naive_attention_decode_batched: bool = True,
        naive_attention_decode_single: bool = False,
        **kwargs,
    ) -> None:
        # The DeciCoder-specific flags are stored before delegating to the
        # parent initializer; this ordering is kept exactly as in the original.
        self.naive_attention_prefill = naive_attention_prefill
        self.naive_attention_decode_batched = naive_attention_decode_batched
        self.naive_attention_decode_single = naive_attention_decode_single

        super().__init__(**kwargs)