chgk13
/

decicoder-1b-openvino-int8

+from packaging import version  # version parsing for the compatibility gate below
+import transformers
+if version.parse(transformers.__version__) < version.parse("4.31.0"):  # fail at import time on an unsupported transformers release
+    raise ImportError(
+        f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciCoder. Please upgrade transformers."
+    )
+from transformers.models.llama.configuration_llama import LlamaConfig  # imported only after the version gate has passed
+from transformers.utils import logging
+logger = logging.get_logger(__name__)  # module-level logger; not used in the visible part of this file
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # empty mapping; no entries are registered in the visible code
+class DeciCoderConfig(LlamaConfig):
+    r"""
+   This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
+   model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+   defaults will yield a similar configuration to that of the LLaMA-7B.
+   Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+   documentation from [`PretrainedConfig`] for more information.
+    Args:
+        naive_attention_prefill (`bool`, *optional*, defaults to False):
+            Whether to use naive matmul or scaled dot product attention during prefill.
+        naive_attention_decode_batched (`bool`, *optional*, defaults to True):
+            Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1.
+        naive_attention_decode_single (`bool`, *optional*, defaults to False):
+            Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1.
+       ```"""
+    model_type = "llama"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        naive_attention_prefill: bool = False,
+        naive_attention_decode_batched: bool = True,
+        naive_attention_decode_single: bool = False,
+        **kwargs,
+    ):
+        self.naive_attention_prefill = naive_attention_prefill
+        self.naive_attention_decode_batched = naive_attention_decode_batched
+        self.naive_attention_decode_single = naive_attention_decode_single
+        super().__init__(**kwargs,)