upload model

Browse files

Files changed (11) hide show

config.json +40 -0
configuration_granite.py +98 -0
generation_config.json +7 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +329 -0
modeling_granite.py +1374 -0
special_tokens_map.json +51 -0
tokenizer.json +0 -0
tokenizer_config.json +187 -0

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "activation_function": "swiglu",
+  "add_bias": true,
+  "apply_residual_connection_post_layernorm": false,
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_head_type": "mha",
+  "attention_multiplier": null,
+  "attention_softmax_in_fp32": true,
+  "attn_pdrop": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_granite.GraniteConfig",
+    "AutoModel": "modeling_granite.GraniteModel",
+    "AutoModelForCausalLM": "modeling_granite.GraniteForCausalLM"
+  },
+  "bos_token_id": 0,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 0,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "granite",
+  "n_embd": 2560,
+  "n_head": 32,
+  "n_inner": 10240,
+  "n_layer": 32,
+  "n_positions": 2048,
+  "normalization_function": "rmsnorm",
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "position_embedding_type": "rope",
+  "resid_pdrop": 0.1,
+  "rope_theta": 10000,
+  "scale_attention_softmax_in_fp32": true,
+  "scale_attn_weights": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 49152
+}

configuration_granite.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from transformers import PretrainedConfig
+class GraniteConfig(PretrainedConfig):
+    """Configuration holding the hyperparameters of a Granite model.
+
+    Every constructor argument is stored unchanged as an attribute of the same
+    name, with these exceptions:
+
+    * ``n_inner`` falls back to ``4 * n_embd`` when it is ``None``.
+    * ``num_key_value_heads`` falls back to ``n_head`` for ``"mha"`` and to ``1``
+      for ``"mqa"`` when it is ``None``; it is mandatory for ``"gqa"``.
+    * ``bos_token_id``, ``eos_token_id``, ``pad_token_id`` and any extra keyword
+      arguments are forwarded to ``PretrainedConfig.__init__``.
+
+    A derived boolean ``multi_query`` is also set; it is true only for ``"mqa"``.
+
+    Raises:
+        AssertionError: if ``attention_multiplier`` is set while
+            ``scale_attn_weights`` is falsy, or if ``num_key_value_heads`` is
+            inconsistent with ``attention_head_type``.
+        ValueError: if ``attention_head_type`` is not ``"mha"``, ``"mqa"`` or
+            ``"gqa"``.
+    """
+
+    model_type = "granite"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Generic attribute names (left) resolved to the attributes this class stores (right).
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 50257,
+        n_positions: int = 1024,
+        n_embd: int = 768,
+        n_layer: int = 12,
+        n_head: int = 12,
+        num_key_value_heads: int = None,
+        n_inner: int = None,
+        activation_function: str = "gelu_pytorch_tanh",
+        attention_head_type: str = "mqa",
+        resid_pdrop: float = 0.1,
+        embd_pdrop: float = 0.1,
+        attn_pdrop: float = 0.1,
+        normalization_function: str = "layernorm",
+        layer_norm_epsilon: float = 1e-5,
+        initializer_range: float = 0.02,
+        scale_attn_weights: bool = True,
+        attention_multiplier: float = None,
+        use_cache: bool = True,
+        bos_token_id: int = 50256,
+        eos_token_id: int = 50256,
+        pad_token_id: int = 50256,
+        attention_softmax_in_fp32: bool = True,
+        scale_attention_softmax_in_fp32: bool = True,
+        add_bias: bool = True,
+        position_embedding_type: str = "learned_absolute",
+        rope_theta: int = 10000,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.num_key_value_heads = num_key_value_heads
+        # An unspecified inner width defaults to four times the embedding width.
+        self.n_inner = 4 * n_embd if n_inner is None else n_inner
+        self.activation_function = activation_function
+        self.attention_head_type = attention_head_type
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.normalization_function = normalization_function
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.attention_multiplier = attention_multiplier
+        self.use_cache = use_cache
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
+        self.position_embedding_type = position_embedding_type
+        self.add_bias = add_bias
+        self.rope_theta = rope_theta
+        # An explicit multiplier is only accepted when attention-weight scaling is on.
+        if self.attention_multiplier is not None:
+            assert self.scale_attn_weights
+        # for compatibility with some features
+        self.multi_query = attention_head_type == "mqa"
+        # Fill in / validate the key-value head count for the chosen attention variant.
+        if attention_head_type == "mha":
+            if self.num_key_value_heads is None:
+                self.num_key_value_heads = self.n_head
+            assert (
+                self.n_head == self.num_key_value_heads
+            ), "MultiHeadAttention should have same number of heads for query, keys and values"
+        elif attention_head_type == "mqa":
+            if self.num_key_value_heads is None:
+                self.num_key_value_heads = 1
+            assert self.num_key_value_heads == 1, "MultiQueryAttention should have 1 head for keys and values"
+        elif attention_head_type == "gqa":
+            assert (
+                self.num_key_value_heads is not None
+            ), "`num_key_value_heads` needs to be specified with GroupedQueryAttention"
+            # The message now states the condition that is actually checked (divisibility).
+            assert (
+                self.n_head % self.num_key_value_heads == 0
+            ), "GroupedQueryAttention requires `n_head` to be divisible by `num_key_value_heads`"
+        else:
+            raise ValueError(f"unexpected attention_head_type ({attention_head_type})")
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.38.1"
+}

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:612678f5629dbc658d29e393ef01b0ddc26c0ddcc7eb15e98bc1145c2f66c20b
+size 4804086856

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:085bbe4bfa511e3b8cd345c2f65ca44075894cb967e2748bf2a5780b195b10f9
+size 4930111520

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c05dc3ccf273010235c01cadae113ec29a366a467cd78a5f2c46e713e2bedf3
+size 4195850696

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,329 @@

+{
+  "metadata": {
+    "total_size": 13930014720
+  },
+  "weight_map": {
+    "transformer.h.0.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.0.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.0.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.1.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.1.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.10.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.10.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.10.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.10.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.10.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.10.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.11.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.11.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.11.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.12.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.12.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.12.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.13.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.13.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.13.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.14.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.14.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.14.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.15.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.15.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.15.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.16.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.16.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.16.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.17.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.17.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.17.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.18.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.18.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.18.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.19.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.19.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.19.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.2.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.2.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.2.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.20.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.20.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.20.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.20.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.21.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.21.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.ln_2.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+    "transformer.h.21.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.22.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.22.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.22.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.ln_1.weight": "model-00002-of-00003.safetensors",
+    "transformer.h.22.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.22.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.23.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.23.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.23.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.24.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.24.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.24.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.25.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.25.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.25.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.26.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.26.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.26.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.27.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.27.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.27.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.28.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.28.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.28.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.29.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.29.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.29.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.3.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.3.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.3.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.30.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.30.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.30.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.30.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.31.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.31.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.ln_1.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.ln_2.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+    "transformer.h.31.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+    "transformer.h.4.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.4.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.4.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.5.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.5.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.6.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.6.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.7.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.7.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.8.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.8.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.9.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.9.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.ln_1.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.ln_2.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+    "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+    "transformer.ln_f.weight": "model-00003-of-00003.safetensors",
+    "transformer.wte.weight": "model-00001-of-00003.safetensors"
+  }
+}

modeling_granite.py ADDED Viewed

	@@ -0,0 +1,1374 @@

+import numbers
+from enum import Enum
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import DynamicCache, PreTrainedModel
+from transformers.activations import get_activation as get_base_activation
+from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
+from transformers.utils import is_flash_attn_2_available
+from .configuration_granite import GraniteConfig
+class PositionEmbeddingType(Enum):
+    """Closed set of position-embedding identifiers; each member's value equals its name."""
+
+    learned_absolute = "learned_absolute"
+    alibi = "alibi"
+    rope = "rope"
+class AttentionHeadType(Enum):
+    """Closed set of attention-head layout identifiers; each member's value equals its name."""
+
+    mha = "mha"
+    mqa = "mqa"
+    gqa = "gqa"
+if is_flash_attn_2_available():
+    from flash_attn.bert_padding import IndexFirstAxis, pad_input, unpad_input
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
    """Derive the indexing data used to drop masked-out positions from a batch.

    Args:
        attention_mask: mask that is summed along its last dimension; non-zero entries mark
            the positions to keep. Presumably (batch_size, key_length) -- confirm with callers.

    Returns:
        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)` holding the positions of the
        non-zero entries in the flattened mask, the int32 cumulative per-row lengths with a
        leading 0, and the largest per-row length as a plain Python int.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    # `.item()` turns the 0-d tensor into a Python int, hence the `int` in the return type
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # left-pad with a single 0 so that row i spans cu_seqlens[i]:cu_seqlens[i + 1]
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch
def repeat_key_value(x: torch.Tensor, num_heads: int, num_key_value_heads: int) -> torch.Tensor:
    """Broadcast key/value heads along dimension 1 so that there are `num_heads` of them.

    Args:
        x: tensor whose dimension 1 indexes the key/value heads.
        num_heads: number of query heads.
        num_key_value_heads: number of key/value heads currently held by `x`.

    Returns:
        `x` itself when one key/value head maps to one query head, an expanded view when there
        is a single shared key/value head, and otherwise a copy in which every head is repeated
        `num_heads // num_key_value_heads` times in place.
    """
    heads_per_kv = num_heads // num_key_value_heads

    # mha: nothing to repeat
    if heads_per_kv == 1:
        return x

    # gqa: each key/value head serves a contiguous group of query heads
    if num_key_value_heads != 1:
        return x.repeat_interleave(heads_per_kv, dim=1)

    # mqa: a single head is shared by all query heads, `expand` avoids a copy
    return x.expand(-1, num_heads, -1, -1)
+##################################################
+# activation functions
# short GLU variant name -> name of the base activation it is built on
_GLU_BASE_MAPPING = dict(
    geglu="gelu",
    miglu="mish",
    mishglu="mish",
    swiglu="swish",
)
class GLUActivation(nn.Module):
    """Gated linear unit whose gate is passed through a configurable activation."""

    def __init__(self, base_activation: nn.Module) -> None:
        """Store `base_activation`, the module applied to the gate half of the input."""
        super().__init__()
        self.base_activation = base_activation

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split `x` in two along the last dimension and return `first * base_activation(second)`."""
        pieces = torch.chunk(x, 2, dim=-1)
        gate = self.base_activation(pieces[1])
        return pieces[0] * gate
def is_glu(name: str) -> bool:
    """Return True when the activation name ends in "glu", i.e. names a gated variant."""
    glu_suffix = "glu"
    return name.endswith(glu_suffix)
def _strip_glu_suffix(name: str) -> str:
    """Return `name` without its trailing "_glu"; the caller guarantees the suffix is present."""
    return name[: -len("_glu")]


def get_activation_function(name: str) -> nn.Module:
    """Build the activation module identified by `name`.

    Args:
        name: activation name. Names ending in "glu" select a gated activation; every other
            name is resolved directly by `get_base_activation`.

    Returns:
        `nn.GLU` for "glu" / "sigmoid_glu", a `GLUActivation` wrapping the resolved base
        activation for the other gated names, or the base activation itself.

    Raises:
        ValueError: if `name` ends in "glu" but is neither a known alias nor of the form
            "<base>_glu".
    """
    if not is_glu(name):
        return get_base_activation(name)

    # for glu and sigmoid_glu, we directly return the pytorch's GLU
    if name in ["glu", "sigmoid_glu"]:
        return nn.modules.GLU()

    if name in _GLU_BASE_MAPPING:
        base_name = _GLU_BASE_MAPPING[name]
    elif name.endswith("_glu"):
        # `str.rstrip("_glu")` removes any trailing run of the characters "_", "g", "l", "u",
        # which mangles names such as "gelu_glu" -> "ge"; drop exactly the suffix instead
        base_name = _strip_glu_suffix(name)
    else:
        raise ValueError("invalid activation function")

    return GLUActivation(get_base_activation(base_name))
+##################################################
+# normalization functions
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable scale."""

    def __init__(self, normalized_shape: int, eps: float = 1e-6) -> None:
        """Create a scale of ones with shape `normalized_shape`; `eps` is added to the mean square."""
        super().__init__()

        self.eps = eps
        self.weight = nn.Parameter(torch.ones(normalized_shape))

        # store the shape as a tuple, wrapping a bare integer
        self.normalized_shape = (
            (normalized_shape,) if isinstance(normalized_shape, numbers.Integral) else normalized_shape
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Normalize `input` by its root mean square along the last dimension and apply the scale."""
        original_dtype = input.dtype

        # the statistics are computed in float32 regardless of the incoming dtype
        upcast = input.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.eps)

        return self.weight * normalized.to(original_dtype)

    def extra_repr(self) -> str:
        """Return the shape and epsilon shown in the module's repr."""
        return f"{self.normalized_shape}, eps={self.eps}"

    def reset_parameters(self) -> None:
        """Reset the scale to all ones."""
        nn.init.ones_(self.weight)
# normalization name accepted by `get_normalization_function` -> module class to instantiate
_NORMALIZATION_FUNCTIONS = dict(
    layernorm=nn.LayerNorm,
    rmsnorm=RMSNorm,
)
def get_normalization_function(name: str, normalized_shape: int, eps: float = 1e-5) -> nn.Module:
    """Instantiate the normalization layer registered under `name`.

    Args:
        name: key into `_NORMALIZATION_FUNCTIONS`.
        normalized_shape: forwarded positionally to the layer's constructor.
        eps: forwarded as the `eps` keyword argument.

    Raises:
        ValueError: if `name` is not a registered normalization function.
    """
    if name not in _NORMALIZATION_FUNCTIONS:
        raise ValueError(f"unexpected `normalization_function` {name}")

    norm_class = _NORMALIZATION_FUNCTIONS[name]
    return norm_class(normalized_shape, eps=eps)
+##################################################
+# attention modules
class GraniteAttention(nn.Module):
    """Eager self-attention with a fused QKV projection and MHA, MQA or GQA head layouts.

    The fused projection `c_attn` produces queries, keys and values in one matmul; the
    `_prepare_qkv_for_forward_*` helpers undo the layout that corresponds to the configured
    `attention_head_type`. Subclasses reuse everything except `forward`.
    """

    def __init__(self, config: GraniteConfig, causal: bool, layer_idx: Optional[int] = None) -> None:
        """Read the attention hyper-parameters from `config` and build the projections.

        Args:
            config: model configuration supplying sizes, head layout, scaling and dropout settings.
            causal: stored on the instance; the eager `forward` of this class does not read it.
            layer_idx: index handed to the key/value cache when it is updated.
        """
        super().__init__()

        self.causal = causal
        self.layer_idx = layer_idx

        self.hidden_size = config.n_embd
        self.num_heads = config.n_head
        self.num_key_value_heads = config.num_key_value_heads
        self.add_bias = config.add_bias

        assert (
            self.hidden_size % self.num_heads == 0
        ), f"`hidden_size` ({self.hidden_size}) must be divisible by `num_heads` ({self.num_heads})"
        self.head_dim = self.hidden_size // self.num_heads

        self.attention_head_type = AttentionHeadType(config.attention_head_type)
        self.position_embedding_type = PositionEmbeddingType(config.position_embedding_type)
        self.scale_attn_weights = config.scale_attn_weights
        self.attention_multiplier = config.attention_multiplier

        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        self.scale_attention_softmax_in_fp32 = (
            config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
        )

        # validate (and default) the number of key/value heads for the chosen layout
        head_type = self.attention_head_type
        if head_type == AttentionHeadType.mha:
            if self.num_key_value_heads is None:
                self.num_key_value_heads = self.num_heads
            assert (
                self.num_heads == self.num_key_value_heads
            ), f"{self.__class__.__name__} should have same number of heads for query, keys and values"
        elif head_type == AttentionHeadType.gqa:
            assert (
                self.num_key_value_heads is not None
            ), "`num_key_value_heads` needs to be specified with GroupedQueryAttention"
            assert self.num_heads % self.num_key_value_heads == 0, (
                f"`num_heads` ({self.num_heads}) should be a multiple of `num_key_value_heads` "
                f"({self.num_key_value_heads})"
            )
        elif head_type == AttentionHeadType.mqa:
            if self.num_key_value_heads is None:
                self.num_key_value_heads = 1
            assert self.num_key_value_heads == 1, f"{self.__class__.__name__} should have 1 head for keys and values"
        else:
            raise ValueError(f"unexpected attention_head_type ({self.attention_head_type})")

        # one output slice for the queries plus one key and one value slice per key/value head;
        # how those slices are interleaved depends on the head layout (see the helpers below)
        fused_qkv_features = self.hidden_size + 2 * self.num_key_value_heads * self.head_dim
        self.c_attn = nn.Linear(self.hidden_size, fused_qkv_features, bias=self.add_bias)
        self.c_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.add_bias)

        self.attn_pdrop = config.attn_pdrop
        self.resid_pdrop = config.resid_pdrop

        # a zero dropout probability is replaced by a no-op module
        self.attn_dropout = nn.Dropout(self.attn_pdrop) if self.attn_pdrop != 0 else nn.Identity()
        self.resid_dropout = nn.Dropout(self.resid_pdrop) if self.resid_pdrop != 0 else nn.Identity()

    def _prepare_qkv_for_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Project `hidden_states` with `c_attn` and split the result into per-head query, key, value.

        Input is (batch_size, query_length, hidden_size). The returned query is
        (batch_size, num_heads, query_length, head_dim); key and value are
        (batch_size, num_key_value_heads, query_length, head_dim).
        """
        fused = self.c_attn(hidden_states)

        # fused -> (batch_size, query_length, [num_heads + num_key_value_heads * 2] * head_dim)
        splitters = {
            AttentionHeadType.mha: self._prepare_qkv_for_forward_mha,
            AttentionHeadType.gqa: self._prepare_qkv_for_forward_gqa,
            AttentionHeadType.mqa: self._prepare_qkv_for_forward_mqa,
        }
        splitter = splitters.get(self.attention_head_type)
        if splitter is None:
            raise ValueError(f"unexpected attention_head_type ({self.attention_head_type})")

        query, key, value = splitter(fused)
        return query, key, value

    def _prepare_qkv_for_forward_mha(
        self, hidden_states: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Split a fused projection laid out head by head as [query | key | value]."""
        batch_size, query_length = hidden_states.shape[:-1]

        # a single transpose is enough here because every head owns one contiguous q/k/v slice
        per_head = hidden_states.view(batch_size, query_length, self.num_heads, -1).transpose(1, 2)
        query, key, value = per_head.chunk(3, dim=-1)

        return query, key, value

    def _prepare_qkv_for_forward_gqa(
        self, hidden_states: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Split a fused projection laid out per key/value group as [group queries | key | value]."""
        batch_size, query_length = hidden_states.shape[:-1]
        queries_per_group = self.num_heads // self.num_key_value_heads

        grouped = hidden_states.view(batch_size, query_length, self.num_key_value_heads, -1)
        query, key, value = grouped.split(
            (queries_per_group * self.head_dim, self.head_dim, self.head_dim), dim=-1
        )

        # the query slice is not contiguous after the split, so `view` cannot be used here
        query = query.reshape(batch_size, query_length, -1, self.head_dim).transpose(1, 2)

        return query, key.transpose(1, 2), value.transpose(1, 2)

    def _prepare_qkv_for_forward_mqa(
        self, hidden_states: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Split a fused projection laid out as [all queries | shared key | shared value]."""
        batch_size, query_length = hidden_states.shape[:-1]

        query, key, value = hidden_states.split((self.hidden_size, self.head_dim, self.head_dim), dim=-1)
        query = query.view(batch_size, query_length, self.num_heads, -1).transpose(1, 2)

        # the single shared key/value head gets an explicit head dimension of size 1
        return query, key.unsqueeze(1), value.unsqueeze(1)

    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_values: Optional[DynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Compute attention with explicit matmuls and a softmax.

        Args:
            hidden_states: (batch_size, query_length, num_heads * head_dim).
            past_key_values: optional cache; when given, its `update` supplies the full keys/values.
            attention_mask: optional tensor added to the attention scores before the softmax.
            rope_cos_sin: cos/sin pair used when the position embedding type is rope.

        Returns:
            Tensor of shape (batch_size, query_length, num_heads * head_dim).
        """
        query, key, value = self._prepare_qkv_for_forward(hidden_states)

        if self.position_embedding_type == PositionEmbeddingType.rope:
            query = apply_rotary_pos_emb(query, rope_cos_sin)
            key = apply_rotary_pos_emb(key, rope_cos_sin)

        if past_key_values is not None:
            key, value = past_key_values.update(key, value, self.layer_idx)

        # key -> (batch_size, num_key_value_heads, head_dim, key_length)
        key = key.transpose(-1, -2)

        compute_dtype = query.dtype
        softmax_dtype = torch.float32 if self.attention_softmax_in_fp32 else compute_dtype

        if not self.scale_attn_weights:
            scale_factor = 1
        elif self.attention_multiplier is None:
            scale_factor = 1 / self.head_dim**0.5
        else:
            scale_factor = self.attention_multiplier

        batch_size, query_length = query.shape[0], query.shape[2]
        key_length = key.shape[-1]

        key = repeat_key_value(key, self.num_heads, self.num_key_value_heads)
        value = repeat_key_value(value, self.num_heads, self.num_key_value_heads)

        # fold batch and head dimensions together for the batched matmul
        flat_batch = batch_size * self.num_heads
        query = query.reshape(flat_batch, query_length, self.head_dim)
        key = key.reshape(flat_batch, self.head_dim, key_length)

        # with beta=0 the uninitialised buffer only provides shape, device and dtype
        scores = torch.empty((flat_batch, query_length, key_length), device=query.device, dtype=query.dtype)
        scores = torch.baddbmm(scores, query, key, beta=0, alpha=scale_factor)

        # scores -> (batch_size, num_heads, query_length, key_length)
        scores = scores.view(batch_size, self.num_heads, query_length, key_length).to(softmax_dtype)

        if attention_mask is not None:
            scores = scores + attention_mask

        probabilities = F.softmax(scores, dim=-1).to(compute_dtype)
        probabilities = self.attn_dropout(probabilities)

        # context -> (batch_size, num_heads, query_length, head_dim)
        context = torch.matmul(probabilities, value)

        # context -> (batch_size, query_length, num_heads * head_dim)
        context = context.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)

        return self.resid_dropout(self.c_proj(context))
class GraniteSDPA(GraniteAttention):
    """Attention variant that hands the core computation to `F.scaled_dot_product_attention`."""

    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_values: Optional[DynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Compute attention through PyTorch's fused scaled-dot-product kernel.

        Args:
            hidden_states: (batch_size, query_length, num_heads * head_dim).
            past_key_values: optional cache; when given, its `update` supplies the full keys/values.
            attention_mask: optional mask passed to the kernel as `attn_mask`; when it is absent
                the kernel's own causal masking is enabled according to `self.causal`.
            rope_cos_sin: cos/sin pair used when the position embedding type is rope.

        Returns:
            Tensor of shape (batch_size, query_length, num_heads * head_dim).
        """
        query, key, value = self._prepare_qkv_for_forward(hidden_states)

        uses_rope = self.position_embedding_type == PositionEmbeddingType.rope
        if uses_rope:
            query = apply_rotary_pos_emb(query, rope_cos_sin)
            key = apply_rotary_pos_emb(key, rope_cos_sin)

        if past_key_values is not None:
            key, value = past_key_values.update(key, value, self.layer_idx)

        # key / value -> (batch_size, num_heads, key_length, head_dim)
        key = repeat_key_value(key, self.num_heads, self.num_key_value_heads)
        value = repeat_key_value(value, self.num_heads, self.num_key_value_heads)

        # attention dropout is only active in training mode
        dropout_probability = self.attn_pdrop if self.training else 0
        # an explicit mask and the kernel's built-in causal mask are never combined
        use_builtin_causal_mask = self.causal if attention_mask is None else False
        # a `None` multiplier is passed through unchanged when scaling is enabled
        softmax_scale = self.attention_multiplier if self.scale_attn_weights else 1

        attn_output = F.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=dropout_probability,
            is_causal=use_builtin_causal_mask,
            scale=softmax_scale,
        )

        # attn_output -> (batch_size, query_length, num_heads * head_dim)
        batch_size = attn_output.shape[0]
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)

        return self.resid_dropout(self.c_proj(attn_output))
class GraniteFlashAttention2(GraniteAttention):
    """Attention variant that runs the variable-length flash-attention kernel on unpadded tokens."""

    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_values: Optional[DynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Compute attention with `flash_attn_varlen_func`.

        Args:
            hidden_states: (batch_size, query_length, num_heads * head_dim).
            past_key_values: optional cache; when given, its `update` supplies the full keys/values.
            attention_mask: padding mask handed to `get_unpad_data`; it is used unconditionally,
                so this path presumably expects a 2D (batch_size, key_length) mask -- confirm
                against the caller.
            rope_cos_sin: cos/sin pair used when the position embedding type is rope.

        Returns:
            Tensor of shape (batch_size, query_length, num_heads * head_dim).
        """
        query, key, value = self._prepare_qkv_for_forward(hidden_states)

        if self.position_embedding_type == PositionEmbeddingType.rope:
            query = apply_rotary_pos_emb(query, rope_cos_sin)
            key = apply_rotary_pos_emb(key, rope_cos_sin)

        if past_key_values is not None:
            key, value = past_key_values.update(key, value, self.layer_idx)

        # move the sequence dimension in front of the head dimension
        # TODO avoid this extra transpose
        query = query.transpose(1, 2)
        if self.attention_head_type == AttentionHeadType.mqa:
            # relocate the size-1 head dimension instead of transposing
            key = key.squeeze(1).unsqueeze(2)
            value = value.squeeze(1).unsqueeze(2)
        else:
            key = key.transpose(1, 2)
            value = value.transpose(1, 2)

        # query -> (batch_size, query_length, num_heads, head_dim)
        # key / value -> (batch_size, key_length, num_key_value_heads, head_dim)
        batch_size, query_length = query.shape[:2]
        key_length = key.shape[1]

        indices_k, cu_seqlens_k, max_seqlen_k = get_unpad_data(attention_mask)

        # flatten batch and sequence, then keep only the positions selected by the mask
        flat_kv_shape = (batch_size * key_length, self.num_key_value_heads, self.head_dim)
        key = IndexFirstAxis.apply(key.reshape(flat_kv_shape), indices_k)
        value = IndexFirstAxis.apply(value.reshape(flat_kv_shape), indices_k)

        if query_length == key_length:
            # queries cover the same positions as the keys, so the key indexing is reused
            query = IndexFirstAxis.apply(
                query.reshape(batch_size * key_length, self.num_heads, self.head_dim), indices_k
            )
            indices_q = indices_k
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_q = max_seqlen_k
        elif query_length == 1:
            # one query token per batch entry
            max_seqlen_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query = query.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(query, attention_mask)

        # dropout only in training mode; a `None` multiplier is passed through when scaling is on
        dropout_probability = self.attn_pdrop if self.training else 0
        softmax_scale = self.attention_multiplier if self.scale_attn_weights else 1

        # query / key / value now have the tokens of all sequences packed along dimension 0
        attn_output = flash_attn_varlen_func(
            query,
            key,
            value,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            dropout_p=dropout_probability,
            softmax_scale=softmax_scale,
            causal=self.causal,
        )

        # scatter the packed tokens back into a padded (batch_size, query_length, ...) layout
        attn_output = pad_input(attn_output, indices_q, batch_size, query_length)
        attn_output = attn_output.view(batch_size, query_length, -1)

        return self.resid_dropout(self.c_proj(attn_output))
# attention implementation name accepted by `get_attention_module` -> attention class
_ATTENTION_MODULES = dict(
    eager=GraniteAttention,
    sdpa=GraniteSDPA,
    flash_attention_2=GraniteFlashAttention2,
)
def get_attention_module(
    config: GraniteConfig, causal: bool, attention_implementation: str, layer_idx: int
) -> GraniteAttention:
    """Instantiate the attention class registered under `attention_implementation`.

    Args:
        config: forwarded positionally to the attention class.
        causal: forwarded as the `causal` keyword argument.
        attention_implementation: key into `_ATTENTION_MODULES`.
        layer_idx: forwarded as the `layer_idx` keyword argument.

    Raises:
        ValueError: if `attention_implementation` is not a registered implementation.
    """
    if attention_implementation not in _ATTENTION_MODULES:
        raise ValueError(f"unexpected `attention_implementation` {attention_implementation}")

    attention_class = _ATTENTION_MODULES[attention_implementation]
    return attention_class(config, causal=causal, layer_idx=layer_idx)
+##################################################
+# position embeddings
class Alibi(nn.Module):
    """Builds the ALiBi (attention with linear biases) position bias from fixed per-head slopes.

    Link to paper: https://arxiv.org/abs/2108.12409
    """

    def __init__(self, num_heads: int) -> None:
        """Store `num_heads` and compute the per-head slopes."""
        super().__init__()
        self.num_heads = num_heads
        self.reset_parameters()

    def forward(
        self,
        attention_mask: Optional[torch.Tensor],
        batch_size: int,
        key_length: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> torch.Tensor:
        """
        Alibi tensor is not causal as the original paper mentions, it relies on a translation invariance of softmax
        for quick implementation: with l being a tensor, and a fixed value `softmax(l+a) = softmax(l)`. Based on
        https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
        TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

        Args:
            attention_mask (Optional[torch.Tensor]): attention_mask tensor of shape (`batch_size`, `key_length`),
                or None to use the positions 0..key_length-1 for every batch entry
            batch_size (int): `batch_size`, only used when `attention_mask` is None
            key_length (int): `key_length`, only used when `attention_mask` is None
            device (torch.device): device for the position tensor built when `attention_mask` is None
            dtype (torch.dtype): dtype of the returned tensor

        Returns:
            torch.Tensor: alibi tensor of shape (`batch_size`, `num_heads`, `key_length`)
        """
        # Note: alibi will added to the attention bias that will be applied to the query, key product of attention
        # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
        # => the caller is expected to add the query_length dimension, which then broadcasts
        # This is more or less identical to T5's relative position bias:
        # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
        if attention_mask is None:
            arange_tensor = (
                torch.arange(key_length, device=device).unsqueeze(0).unsqueeze(0).expand(batch_size, -1, -1)
            )
        else:
            # positions count only unmasked tokens; masked slots are pinned to position 0
            arange_tensor = (attention_mask.cumsum(dim=-1) - 1).masked_fill_(attention_mask == 0, 0).unsqueeze(1)

        # (num_heads, 1) * (batch_size, 1, key_length) -> (batch_size, num_heads, key_length)
        alibi = self.slopes.unsqueeze(1) * arange_tensor
        return alibi.to(dtype)

    def reset_parameters(self) -> None:
        """Compute the per-head slopes and register them as the non-persistent buffer `slopes`."""
        # function-scope import: `math` is not among the imports visible at the top of this
        # module, so relying on a module-level name would raise NameError on construction
        import math

        closest_power_of_2 = 2 ** math.floor(math.log2(self.num_heads))
        base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32)
        powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
        slopes = torch.pow(base, powers)

        # head counts that are not a power of two take their remaining slopes from the odd
        # powers of the base belonging to the next power of two
        if closest_power_of_2 != self.num_heads:
            extra_base = torch.tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32)
            num_remaining_heads = min(closest_power_of_2, self.num_heads - closest_power_of_2)
            extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=torch.int32)
            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)

        self.register_buffer("slopes", slopes, persistent=False)
class RoPE(nn.Module):
    """Rotary position embedding tables: cached cos/sin values indexed by position."""

    def __init__(
        self,
        head_dim: int,
        max_position_embeddings: int = 2048,
        base: int = 10000,
    ) -> None:
        """Store the settings and pre-compute the tables for `max_position_embeddings` positions."""
        super().__init__()

        self.head_dim = head_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # multiplier applied to both tables when they are built
        self.mscale = 1

        self.reset_parameters()

    def forward(self, seq_len: int, dtype: torch.dtype, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the first `seq_len` rows of the cos and sin tables cast to `dtype`.

        The tables are rebuilt on `device` when `seq_len` exceeds the cached length.
        """
        cache_too_short = seq_len > self.max_seq_len_cached
        if cache_too_short:
            self._set_cos_sin_cache(seq_len=seq_len, device=device, dtype=dtype)

        return self.cos_cached[:seq_len].to(dtype), self.sin_cached[:seq_len].to(dtype)

    def reset_parameters(self) -> None:
        """Register the inverse frequencies and build the initial cos/sin tables."""
        exponents = torch.arange(0, self.head_dim, 2).float() / self.head_dim
        inv_freq = 1.0 / (self.base**exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=self.max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    @torch.no_grad()
    def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> None:
        """(Re)build the cos/sin tables for `seq_len` positions as non-persistent buffers."""
        self.max_seq_len_cached = seq_len

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        angles = torch.outer(positions, self.inv_freq)

        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((angles, angles), dim=-1)

        cos_table = (emb.cos() * self.mscale).to(dtype)
        sin_table = (emb.sin() * self.mscale).to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
def apply_rotary_pos_emb(x: torch.Tensor, cos_sin: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
    """Rotate `x` with the given `(cos, sin)` pair: `x * cos + rotate_half(x) * sin`."""
    cos, sin = cos_sin
    rotated = _rotate_half(x)
    return (x * cos) + (rotated * sin)
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Split the last dimension into two chunks `(a, b)` and return `(-b, a)` concatenated."""
    first_half, second_half = x.chunk(2, dim=-1)
    return torch.cat((second_half.neg(), first_half), dim=-1)
+##################################################
+# MLP
class GraniteMLP(nn.Module):
    """Feed-forward block: up projection, activation, down projection, dropout."""

    def __init__(self, config: GraniteConfig) -> None:
        """Build the projections from `config` (`n_embd`, `n_inner`, `activation_function`,
        `add_bias`, `resid_pdrop`)."""
        super().__init__()

        hidden_size = config.n_embd
        intermediate_size = config.n_inner
        activation_function = config.activation_function
        add_bias = config.add_bias
        residual_dropout = config.resid_pdrop

        # gated activations halve their input, so the up projection is twice as wide for them
        up_features = intermediate_size
        if is_glu(activation_function):
            up_features = 2 * intermediate_size

        self.c_fc = nn.Linear(hidden_size, up_features, bias=add_bias)
        self.act = get_activation_function(activation_function)
        self.c_proj = nn.Linear(intermediate_size, hidden_size, bias=add_bias)

        # a zero dropout probability is replaced by a no-op module
        self.dropout = nn.Dropout(residual_dropout) if residual_dropout != 0 else nn.Identity()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `c_fc`, the activation, `c_proj` and dropout, in that order."""
        projected_up = self.c_fc(hidden_states)
        activated = self.act(projected_up)
        projected_down = self.c_proj(activated)
        return self.dropout(projected_down)
+##################################################
+# transformer layer
+class GraniteBlock(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        attention_implementation: str,
+        layer_idx: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.inner_dim = config.n_inner
+        self.layer_idx = layer_idx
+        self.ln_1 = get_normalization_function(
+            config.normalization_function,
+            hidden_size,
+            eps=config.layer_norm_epsilon,
+        )
+        self.attn = get_attention_module(config, True, attention_implementation, layer_idx)
+        self.ln_2 = get_normalization_function(
+            config.normalization_function,
+            hidden_size,
+            eps=config.layer_norm_epsilon,
+        )
+        self.mlp = GraniteMLP(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        past_key_values: Optional[DynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rope_cos_sin: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            rope_cos_sin=rope_cos_sin,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+##################################################
+# model classes
+class GranitePreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = GraniteConfig
+    base_model_prefix = "transformer"
+    causal = True
+    _no_split_modules = ["GraniteBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_sdpa = True
+    _supports_flash_attn_2 = True
+    def __init__(self, config: GraniteConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.attention_implementation = self.config._attn_implementation
+        self._use_eager_attention = self.attention_implementation == "eager"
+        self._use_sdpa = self.attention_implementation == "sdpa"
+        self._use_flash_attention_2 = self.attention_implementation == "flash_attention_2"
+        self.initializer_range = config.initializer_range
+    def _init_weights(self, module: nn.Module) -> None:
+        if isinstance(module, (nn.LayerNorm, RMSNorm, RoPE)):
+            module.reset_parameters()
+        elif isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, mean=0, std=self.initializer_range)
+            if module.bias is not None:
+                module.bias.zero_()
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0, std=self.initializer_range)
+            if module.padding_idx is not None:
+                module.weight[module.padding_idx].zero_()
+class GraniteModel(GranitePreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
+    mask_value = None
+    def __init__(self, config: GraniteConfig, **kwargs) -> None:
+        """
+        Build the token embedding, the stack of `GraniteBlock` layers, the final norm and the position-embedding
+        module selected by `config.position_embedding_type`.
+        Raises:
+            AssertionError: if `hidden_size` is not divisible by `num_attention_heads`, or if alibi is requested
+                together with flash attention.
+            NotImplementedError: for a position embedding type other than learned_absolute, alibi or rope.
+        """
+        super().__init__(config, **kwargs)
+        self.attention_head_type = AttentionHeadType(config.attention_head_type)
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        assert (
+            self.embed_dim % self.num_heads == 0
+        ), f"`embed_dim` ({self.embed_dim}) must be divisible by `num_heads` ({self.num_heads})"
+        self.head_dim = self.embed_dim // self.num_heads
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        # a dropout probability of exactly 0 is replaced by a no-op module
+        self.drop = nn.Identity() if config.embd_pdrop == 0 else nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList(
+            [GraniteBlock(config, self.attention_implementation, layer_idx=i) for i in range(config.num_hidden_layers)]
+        )
+        self.ln_f = get_normalization_function(
+            config.normalization_function,
+            self.embed_dim,
+            eps=config.layer_norm_epsilon,
+        )
+        # exactly one position-embedding module is created, depending on the configured type
+        self.position_embedding_type = PositionEmbeddingType(config.position_embedding_type)
+        if self.position_embedding_type == PositionEmbeddingType.learned_absolute:
+            self.wpe = nn.Embedding(config.n_positions, self.embed_dim)
+        elif self.position_embedding_type == PositionEmbeddingType.alibi:
+            assert not self._use_flash_attention_2, "alibi is not implemented with FlashAttention"
+            self.alibi = Alibi(self.num_heads)
+        elif self.position_embedding_type == PositionEmbeddingType.rope:
+            self.rope = RoPE(self.head_dim, max_position_embeddings=config.n_positions, base=config.rope_theta)
+        else:
+            raise NotImplementedError()
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Embedding:
+        """Return the token embedding module (`wte`)."""
+        return self.wte
+    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
+        """Replace the token embedding module (`wte`) with `new_embeddings`."""
+        self.wte = new_embeddings
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[DynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        (
+            output_hidden_states,
+            use_cache,
+            return_dict,
+            input_shape,
+            hidden_states,
+            attention_mask,
+            position_ids,
+            rope_cos_sin,
+            past_key_values,
+        ) = self._prepare_a_bunch_of_stuff(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # ==========================================================================================
+        # flash:
+        #     attention_mask -> (batch_size, key_length)
+        # else:
+        #     attention_mask -> (batch_size, 1, query_length, key_length)
+        # ==========================================================================================
+        output_shape = input_shape + (hidden_states.size(-1),)
+        past_key_values = DynamicCache() if use_cache and past_key_values is None else past_key_values
+        all_hidden_states = () if output_hidden_states else None
+        for block in self.h:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            hidden_states = block(
+                hidden_states,
+                past_key_values=past_key_values,
+                attention_mask=attention_mask,
+                rope_cos_sin=rope_cos_sin,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, past_key_values, all_hidden_states] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+        )
+    def _get_position_ids(
+        self, attention_mask: torch.Tensor, past_length: int, query_length: int, key_length: int, device: torch.device
+    ) -> torch.Tensor:
+        if attention_mask is not None and len(attention_mask.shape) == 2:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 0)
+            if past_length > 0:
+                position_ids = position_ids[:, past_length:key_length:]
+        else:
+            position_ids = torch.arange(past_length, key_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0).view(-1, query_length)
+        return position_ids
+    def _get_alibi_bias(
+        self,
+        attention_mask: torch.Tensor,
+        batch_size: int,
+        query_length: int,
+        key_length: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        if self.position_embedding_type != PositionEmbeddingType.alibi:
+            return None
+        alibi_bias = self.alibi(attention_mask, batch_size, key_length, device, dtype)
+        # ==========================================================================================
+        # alibi_bias -> (batch_size, num_heads, key_length)
+        # ==========================================================================================
+        alibi_bias = alibi_bias.unsqueeze(2)
+        if query_length != 1:
+            alibi_bias = alibi_bias.expand(-1, -1, query_length, -1)
+        # ==========================================================================================
+        # alibi_bias -> (batch_size, num_heads, query_length, key_length)
+        # ==========================================================================================
+        return alibi_bias
+    def _get_rope_cos_sin(
+        self, key_length: int, position_ids: torch.Tensor, dtype: torch.dtype, device: torch.device
+    ) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
+        if self.position_embedding_type == PositionEmbeddingType.rope:
+            cos, sin = self.rope(key_length, dtype=dtype, device=device)
+            cos = cos[position_ids].unsqueeze(1)
+            sin = sin[position_ids].unsqueeze(1)
+            return cos, sin
+    def _prepare_causal_attention_mask(
+        self, attention_mask: torch.Tensor, batch_size: int, query_length: int, key_length: int, device: torch.device
+    ) -> torch.Tensor:
+        past_length = key_length - query_length
+        # ==========================================================================================
+        # attention_mask -> (batch_size, key_length)
+        # ==========================================================================================
+        if query_length > 1:
+            # (query_length, key_length)
+            causal_mask = torch.empty((query_length, key_length), dtype=torch.bool, device=device)
+            causal_mask[:, past_length:] = torch.tril(
+                torch.ones(query_length, query_length, dtype=torch.bool, device=device)
+            )
+            if past_length > 0:
+                causal_mask[:, :past_length] = True
+            # (query_length, key_length) -> (1, query_length, key_length)
+            causal_mask = causal_mask.unsqueeze(0)
+            if attention_mask is None:
+                # (1, query_length, key_length) -> (batch_size, query_length, key_length)
+                causal_mask = causal_mask.expand(batch_size, -1, -1)
+            else:
+                # (1, query_length, key_length) & (batch_size, 1, key_length) -> (batch_size, query_length, key_length)
+                causal_mask = causal_mask & attention_mask.unsqueeze(1).to(torch.bool)
+        else:
+            if attention_mask is None:
+                # (batch_size, query_length, key_length)
+                causal_mask = torch.ones(batch_size, query_length, key_length, dtype=torch.bool, device=device)
+            else:
+                # (batch_size, query_length, key_length)
+                causal_mask = attention_mask.unsqueeze(1).to(dtype=torch.bool, device=device)
+        # ==========================================================================================
+        # attention_mask -> (batch_size, query_length, key_length)
+        # ==========================================================================================
+        causal_mask = causal_mask.unsqueeze(1)
+        # ==========================================================================================
+        # attention_mask -> (batch_size, 1, query_length, key_length)
+        # ==========================================================================================
+        return causal_mask
+    def _get_initial_hidden_state(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        token_type_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        if self.position_embedding_type == PositionEmbeddingType.learned_absolute:
+            inputs_embeds = inputs_embeds + self.wpe(position_ids)
+        if token_type_ids is not None:
+            inputs_embeds = inputs_embeds + self.wte(token_type_ids)
+        inputs_embeds = self.drop(inputs_embeds)
+        return inputs_embeds
+    def _prepare_a_bunch_of_stuff(
+        self,
+        input_ids: torch.Tensor = None,
+        past_key_values: DynamicCache = None,
+        attention_mask: torch.Tensor = None,
+        token_type_ids: torch.Tensor = None,
+        position_ids: torch.Tensor = None,
+        inputs_embeds: torch.Tensor = None,
+        use_cache: bool = None,
+        output_hidden_states: bool = None,
+        return_dict: bool = None,
+    ) -> Tuple[
+        bool,
+        bool,
+        bool,
+        torch.Size,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Optional[Tuple[torch.Tensor, torch.Tensor]],
+        DynamicCache,
+    ]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = self.config.use_cache if use_cache is None else use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            # TODO special handling for padding free transformer needed here if we support inputs_embeds argument
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        batch_size = input_shape[0]
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        if self.position_embedding_type == PositionEmbeddingType.alibi:
+            if position_ids is not None:
+                warnings.warn("`position_ids` have no functionality with Alibi.", FutureWarning)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+        # ==========================================================================================
+        # input_ids -> (batch_size, query_length)
+        # attention_mask -> None or (batch_size, key_length)
+        # position_ids -> None or (batch_size, key_length)
+        # ==========================================================================================
+        past_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+        query_length = input_shape[-1]
+        key_length = past_length + query_length
+        if position_ids is None:
+            position_ids = self._get_position_ids(attention_mask, past_length, query_length, key_length, device)
+        # ==========================================================================================
+        # input_ids -> (batch_size, query_length)
+        # attention_mask -> None or (batch_size, key_length)
+        # position_ids -> (batch_size, query_length)
+        # ==========================================================================================
+        hidden_states = self._get_initial_hidden_state(input_ids, inputs_embeds, position_ids, token_type_ids)
+        # ==========================================================================================
+        # hidden_states -> (batch_size, query_length, num_heads * head_dim)
+        # ==========================================================================================
+        alibi_bias = self._get_alibi_bias(
+            attention_mask, batch_size, query_length, key_length, device, hidden_states.dtype
+        )
+        # ==========================================================================================
+        # alibi_bias -> (batch_size, num_heads, query_length, key_length)
+        # ==========================================================================================
+        rope_cos_sin = self._get_rope_cos_sin(
+            key_length, position_ids, dtype=hidden_states.dtype, device=hidden_states.device
+        )
+        # ==========================================================================================
+        # rope_cos_sin -> 2 * (key_length, head_dim)
+        # ==========================================================================================
+        # prepare causal mask only if not using flash attention
+        if self._use_flash_attention_2:
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+        elif self._use_sdpa:
+            # we use the causal/non-causal argument of SDPA for attention in this case
+            if attention_mask is not None:
+                attention_mask = self._prepare_causal_attention_mask(
+                    attention_mask, batch_size, query_length, key_length, device
+                )
+                attention_mask = torch.where(
+                    attention_mask,
+                    ~attention_mask if alibi_bias is None else alibi_bias,
+                    self._get_mask_value(attention_mask.device, hidden_states.dtype),
+                )
+        else:
+            attention_mask = self._prepare_causal_attention_mask(
+                attention_mask, batch_size, query_length, key_length, device
+            )
+            attention_mask = torch.where(
+                attention_mask,
+                ~attention_mask if alibi_bias is None else alibi_bias,
+                self._get_mask_value(attention_mask.device, hidden_states.dtype),
+            )
+        return (
+            output_hidden_states,
+            use_cache,
+            return_dict,
+            input_shape,
+            hidden_states,
+            attention_mask,
+            position_ids,
+            rope_cos_sin,
+            past_key_values,
+        )
+    def _get_mask_value(self, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
+        # torch.where expects a tensor. We use a cache to avoid recreating it every time.
+        if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
+            self.mask_value = torch.full([], torch.finfo(torch.float16).min, dtype=dtype, device=device)
+        return self.mask_value
+class GraniteForCausalLM(GranitePreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
+    def __init__(self, config: GraniteConfig, **kwargs) -> None:
+        """Create the `GraniteModel` backbone and a bias-free linear head projecting `n_embd` to `vocab_size`."""
+        super().__init__(config, **kwargs)
+        self.transformer = GraniteModel(config, **kwargs)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Embedding:
+        """Return the backbone's token embedding module (`transformer.wte`)."""
+        return self.transformer.wte
+    def set_input_embeddings(self, value: nn.Embedding) -> None:
+        """Replace the backbone's token embedding module (`transformer.wte`) with `value`."""
+        self.transformer.wte = value
+    def get_output_embeddings(self) -> nn.Linear:
+        """Return the language-modelling head (`lm_head`)."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
+        """Replace the language-modelling head (`lm_head`) with `new_embeddings`."""
+        self.lm_head = new_embeddings
+    # FIXME typing
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: Optional[DynamicCache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        token_type_ids = kwargs.get("token_type_ids", None)
+        # Omit tokens covered by past_key_values
+        if past_key_values:
+            past_length = past_key_values.get_seq_length()
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+            input_ids = input_ids[:, remove_prefix_length:]
+            if token_type_ids is not None:
+                token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
+        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 0)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        else:
+            position_ids = None
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "position_ids": position_ids,
+                "attention_mask": attention_mask,
+                "token_type_ids": token_type_ids,
+            }
+        )
+        return model_inputs
+    def forward(
+        self,
+        input_ids: Optional[Union[torch.Tensor]] = None,
+        past_key_values: Optional[DynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[Union[torch.Tensor]] = None,
+        position_ids: Optional[Union[torch.Tensor]] = None,
+        inputs_embeds: Optional[Union[torch.Tensor]] = None,
+        labels: Optional[Union[torch.Tensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # ==========================================================================================
+        # input_ids -> (batch_size, query_length)
+        # attention_mask -> None or (batch_size, key_length)
+        # position_ids -> None or (batch_size, key_length)
+        # ==========================================================================================
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        lm_logits = self.lm_head(hidden_states)
+        loss = None
+        # Shift so that tokens < n predict n
+        if labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,187 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 9223372036854775807,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}