sharpenb committed
Commit 32d1273 (1 parent: 7a058fa)

f8a0b68f61f298f23c8d17d6b308fcaf71e02733fee948a5b38e124f27bfa18d

README.md ADDED
@@ -0,0 +1,84 @@
---
thumbnail: "https://assets-global.website-files.com/646b351987a8d8ce158d1940/64ec9e96b4334c0e1ac41504_Logo%20with%20white%20text.svg"
metrics:
- memory_disk
- memory_inference
- inference_latency
- inference_throughput
- inference_CO2_emissions
- inference_energy_consumption
tags:
- pruna-ai
---
<!-- header start -->
<!-- 200823 -->
<div style="width: auto; margin-left: auto; margin-right: auto">
  <a href="https://www.pruna.ai/" target="_blank" rel="noopener noreferrer">
    <img src="https://i.imgur.com/eDAlcgk.png" alt="PrunaAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
  </a>
</div>
<!-- header end -->

[![Twitter](https://img.shields.io/twitter/follow/PrunaAI?style=social)](https://twitter.com/PrunaAI)
[![GitHub](https://img.shields.io/github/followers/PrunaAI?label=Follow%20%40PrunaAI&style=social)](https://github.com/PrunaAI)
[![LinkedIn](https://img.shields.io/badge/LinkedIn-Connect-blue)](https://www.linkedin.com/company/93832878/admin/feed/posts/?feedType=following)
[![Discord](https://img.shields.io/badge/Discord-Join%20Us-blue?style=social&logo=discord)](https://discord.gg/CP4VSgck)

# Simply make AI models cheaper, smaller, faster, and greener!

- Give a thumbs up if you like this model!
- Contact us and tell us which model to compress next [here](https://www.pruna.ai/contact).
- Request access to easily compress your *own* AI models [here](https://z0halsaff74.typeform.com/pruna-access?typeform-source=www.pruna.ai).
- Read the documentation to learn more [here](https://pruna-ai-pruna.readthedocs-hosted.com/en/latest/).
- Join the Pruna AI community on Discord [here](https://discord.gg/CP4VSgck) to share feedback and suggestions or to get help.

## Results

![image info](./plots.png)

**Frequently Asked Questions**
- ***How does the compression work?*** The model is compressed with llm-int8.
- ***How does the model quality change?*** The quality of the model output might vary compared to the base model.
- ***How is the model efficiency evaluated?*** These results were obtained on an NVIDIA A100-PCIE-40GB with the configuration described in `model/smash_config.json`, after a hardware warmup. The smashed model is compared directly to the original base model. Efficiency results may vary in other settings (e.g. other hardware, image size, batch size, ...). We recommend running the model directly in your use-case conditions to find out whether the smashed model benefits you.
- ***What is the model format?*** We use safetensors.
- ***What calibration data has been used?*** If needed by the compression method, we used WikiText as the calibration data.
- ***What is the naming convention for Pruna Huggingface models?*** We take the original model name and append "turbo", "tiny", or "green" if the smashed model has a measured inference speed, inference memory, or inference energy consumption that is less than 90% of the original base model.
- ***How can I compress my own models?*** You can request premium access to more compression methods and tech support for your specific use-cases [here](https://z0halsaff74.typeform.com/pruna-access?typeform-source=www.pruna.ai).
- ***What are "first" metrics?*** Results mentioning "first" are obtained after the first run of the model. The first run might take more memory or be slower than the subsequent runs due to CUDA overheads.
- ***What are "Sync" and "Async" metrics?*** "Sync" metrics are obtained by syncing all GPU processes and stopping the measurement when all of them have finished. "Async" metrics are obtained without syncing all GPU processes and stop as soon as the model output can be used by the CPU. We provide both metrics since either could be the relevant one depending on the use-case, and we recommend testing the efficiency gains directly in your use-case; a minimal sketch of the difference is shown below.

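The following sketch only illustrates where the two measurements stop; it is not the actual Pruna benchmarking harness, and it assumes a CUDA device plus a `model` and `input_ids` loaded as in the Setup section below.

```python
import time

import torch


@torch.no_grad()
def measure_latency(model, input_ids, sync: bool) -> float:
    """Time one forward pass, optionally waiting for all GPU kernels to finish."""
    torch.cuda.synchronize()      # start from an idle device
    start = time.perf_counter()
    _ = model(input_ids)          # launches the GPU work
    if sync:
        torch.cuda.synchronize()  # "Sync": stop only once every kernel has executed
    # "Async": stop as soon as control returns to the CPU
    return time.perf_counter() - start
```
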
## Setup

You can run the smashed model with these steps:

0. Check that the requirements of the original repo bigcode/santacoder are installed. In particular, check the python, cuda, and transformers versions.
1. Make sure that you have installed the quantization-related packages.
```bash
pip install transformers accelerate "bitsandbytes>0.37.0"
```
2. Load & run the model.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code is required because the checkpoint ships custom multi-query GPT-2 code.
model = AutoModelForCausalLM.from_pretrained("PrunaAI/bigcode-santacoder-bnb-4bit-smashed",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")

input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]

outputs = model.generate(input_ids, max_new_tokens=216)
tokenizer.decode(outputs[0])
```
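
If you prefer to state the quantization and device placement explicitly instead of relying on the settings stored with the checkpoint, a minimal variant of the loading step could look as follows. This is a sketch only: it mirrors the `quantization_config` in `config.json` and assumes `accelerate` is installed for `device_map="auto"`.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Mirrors the quantization_config shipped in config.json; adjust the compute dtype if your GPU lacks bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "PrunaAI/bigcode-santacoder-bnb-4bit-smashed",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0]))
```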

## Configurations

The configuration info is in `smash_config.json`.

## Credits & License

The license of the smashed model follows the license of the original model. Please check the license of the original model bigcode/santacoder, which provided the base model, before using this model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on PyPI.

## Want to compress other models?

- Contact us and tell us which model to compress next [here](https://www.pruna.ai/contact).
- Request access to easily compress your own AI models [here](https://z0halsaff74.typeform.com/pruna-access?typeform-source=www.pruna.ai).
config.json ADDED
@@ -0,0 +1,51 @@
{
  "_name_or_path": "/tmp/tmppkofl_7p",
  "activation_function": "gelu_fast",
  "architectures": [
    "GPT2LMHeadCustomModel"
  ],
  "attention_head_type": "multiquery",
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_gpt2_mq.GPT2CustomConfig",
    "AutoModelForCausalLM": "modeling_gpt2_mq.GPT2LMHeadCustomModel"
  },
  "bos_token_id": 49152,
  "embd_pdrop": 0.1,
  "eos_token_id": 49152,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 2048,
  "n_head": 16,
  "n_inner": 8192,
  "n_layer": 24,
  "n_positions": 2048,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "lm_head"
    ],
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float16",
  "transformers_version": "4.37.1",
  "use_cache": true,
  "vocab_size": 49280
}
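
The `auto_map` entries above route `AutoConfig` and `AutoModelForCausalLM` to the custom classes defined in this repository, and the `quantization_config` block records the bitsandbytes 4-bit settings baked into the checkpoint. A small sketch (not part of the repository files) of inspecting these values without loading the weights:

```python
from transformers import AutoConfig

# trust_remote_code lets AutoConfig resolve configuration_gpt2_mq.GPT2CustomConfig via auto_map.
config = AutoConfig.from_pretrained(
    "PrunaAI/bigcode-santacoder-bnb-4bit-smashed",
    trust_remote_code=True,
)
print(type(config).__name__)                        # GPT2CustomConfig
print(config.attention_head_type)                   # "multiquery"
print(config.quantization_config["load_in_4bit"])   # True
```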
configuration_gpt2_mq.py ADDED
@@ -0,0 +1,201 @@
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and Hugging Face Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Custom GPT-2 configuration"""
from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from enum import Enum

from transformers import PreTrainedTokenizer, TensorType, is_torch_available

from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxConfigWithPast, PatchingSpec
from transformers.utils import logging


logger = logging.get_logger(__name__)

GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
    "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
    "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
    "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
    "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
}

MULTI_HEAD = "multihead"
MULTI_QUERY = "multiquery"

class GPT2CustomConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the GPT-2
    [gpt2](https://huggingface.co/gpt2) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`int`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        summary_type (`string`, *optional*, defaults to `"cls_index"`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Has to be one of the following options:

            - `"last"`: Take the last token hidden state (like XLNet).
            - `"first"`: Take the first token hidden state (like BERT).
            - `"mean"`: Take the mean of all tokens hidden states.
            - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
            - `"attn"`: Not implemented now, use multi-head attention.
        summary_use_proj (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Whether or not to add a projection after the vector extraction.
        summary_activation (`str`, *optional*):
            Argument used when doing sequence summary. Used for the multiple choice head in
            [`GPT2DoubleHeadsModel`].

            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
        summary_first_dropout (`float`, *optional*, defaults to 0.1):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            The dropout ratio to be used after the projection and activation.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(head_dim).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.

    Example:

    ```python
    >>> from transformers import GPT2Config, GPT2Model

    >>> # Initializing a GPT2 configuration
    >>> configuration = GPT2Config()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = GPT2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gpt2"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        attention_head_type=MULTI_HEAD,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn
        self.attention_head_type = attention_head_type
        # assert attention_head_type in [AttentionType.MULTI_HEAD, AttentionType.MULTI_QUERY]
        assert attention_head_type in [MULTI_HEAD, MULTI_QUERY]

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
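
For reference, a minimal sketch of instantiating this configuration class directly with the multi-query setting (the values are taken from `config.json`; the local import assumes this file is on your Python path):

```python
from configuration_gpt2_mq import MULTI_QUERY, GPT2CustomConfig

config = GPT2CustomConfig(
    vocab_size=49280,
    n_positions=2048,
    n_embd=2048,
    n_layer=24,
    n_head=16,
    n_inner=8192,
    activation_function="gelu_fast",
    attention_head_type=MULTI_QUERY,  # must be "multihead" or "multiquery"
)
print(config.hidden_size, config.num_attention_heads)  # attribute_map aliases: 2048 16
```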
generation_config.json ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 49152,
  "eos_token_id": 49152,
  "transformers_version": "4.37.1"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2dd5eb29012210f57e6646cfe88d931ee3cf621e4d652dd7a14e849be866b437
size 838045874
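
This entry is a Git LFS pointer rather than the weights themselves. A hedged sketch of fetching the actual safetensors file (about 838 MB) with `huggingface_hub`, using the repo id from the README:

```python
from huggingface_hub import hf_hub_download

# Resolves the LFS pointer above and downloads the real weight file into the local cache.
path = hf_hub_download(
    repo_id="PrunaAI/bigcode-santacoder-bnb-4bit-smashed",
    filename="model.safetensors",
)
print(path)
```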
modeling_gpt2_mq.py ADDED
@@ -0,0 +1,346 @@
"""PyTorch OpenAI GPT-2 model modified with MultiQuery attention"""


import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel, SequenceSummary
from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer

from transformers.utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel
from .configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD


class GPT2MQAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__()
        assert config.attention_head_type == MULTI_QUERY

        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
                1, 1, max_positions, max_positions
            ),
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        if is_cross_attention:
            raise NotImplementedError("Cross-attention not implemented for MQA")
        self.is_cross_attention = is_cross_attention

        # Layer-wise attention scaling, reordering, and upcasting
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            # self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
            # Keys and values are shared across heads
            self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # query: (b, num_heads * sq, head_dim)
        # key: (b, head_dim, sk)
        # value: (b, sk, head_dim)
        batch_size = query.size(0)
        query_length = query.size(1) // self.num_heads
        key_length = key.size(2)
        # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk)
        attn_weights = torch.bmm(query, key)
        # -> (b, num_heads, sq, sk)
        attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length)

        if self.scale_attn_weights:
            attn_weights = attn_weights / torch.tensor(
                value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
            )

        # Layer-wise attention scaling
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        if not self.is_cross_attention:
            # if only "normal" attention layer implements causal mask
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
            mask_value = torch.finfo(attn_weights.dtype).min
            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk)
        _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length)
        # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim)
        attn_output = torch.bmm(_attn_weights, value)
        attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)

        return attn_output, attn_weights

    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
        bsz, num_heads, q_seq_len, dk = query.size()
        _, _, k_seq_len, _ = key.size()

        # Preallocate attn_weights for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute Scale Factor
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
        with autocast(enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        if not self.is_cross_attention:
            # if only "normal" attention layer implements causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
            mask_value = torch.finfo(attn_weights.dtype).min
            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        if encoder_hidden_states is not None:
            raise NotImplementedError("Cross-attention not implemented for MQA")
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
                )

            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query = self.q_attn(hidden_states)
            key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2)


        batch_size, seq_length = query.shape[:2]
        # (query_length, batch, num_heads, head_dim)
        # (batch, num_heads * query_length, head_dim)

        # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim)
        query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3])
        # -> (batch, num_heads * query_length, head_dim)
        query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim)

        # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim)
        # query = query.view(
        #     batch_size, seq_length, self.num_heads, self.head_dim,
        # ).reshape(
        #     batch_size, seq_length * self.num_heads, self.head_dim
        # )
        key = key.permute(0, 2, 1)  # (batch_size, head_dim, seq_length)
        # value (batch_size, seq_length, head_dim)

        if layer_past is not None:
            past_key, past_value = layer_past
            # Concatenate on sequence dimension
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        if self.reorder_and_upcast_attn:
            raise NotImplementedError("Reorder and upcast attention not implemented for MQA")
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
        else:
            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)


# inherit from gpt_modeling.py, and override `attn` module
class GPT2CustomBlock(GPT2Block):

    def __init__(self, config: GPT2CustomConfig, layer_idx=None):
        super().__init__(config, layer_idx)
        # Override attention module if using multiquery
        if config.attention_head_type == MULTI_QUERY:
            self.attn = GPT2MQAttention(config, layer_idx=layer_idx)
            if config.add_cross_attention:
                raise NotImplementedError("Cross-attention not implemented for MQA")


# inherit from gpt_modeling.py and override `__init__` method
class GPT2CustomModel(GPT2Model):
    config_class = GPT2CustomConfig

    def __init__(self, config):
        GPT2PreTrainedModel.__init__(self, config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPT2CustomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()


class GPT2LMHeadCustomModel(GPT2LMHeadModel):
    config_class = GPT2CustomConfig

    def __init__(self, config):
        GPT2PreTrainedModel.__init__(self, config)
        self.transformer = GPT2CustomModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()
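
To make the shape bookkeeping in `GPT2MQAttention._attn` concrete, here is a small self-contained sketch (not part of the repository) that reproduces the multi-query matmuls with dummy tensors; the dimensions are illustrative only:

```python
import torch

# Illustrative sizes: 4 query heads sharing a single key/value head of width head_dim.
batch_size, num_heads, head_dim = 2, 4, 8
query_length = key_length = 5

query = torch.randn(batch_size, num_heads * query_length, head_dim)  # (b, num_heads * sq, head_dim)
key = torch.randn(batch_size, head_dim, key_length)                  # (b, head_dim, sk), shared across heads
value = torch.randn(batch_size, key_length, head_dim)                # (b, sk, head_dim), shared across heads

# (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk)
attn_weights = torch.bmm(query, key) / head_dim ** 0.5
attn_weights = attn_weights.view(batch_size, num_heads, query_length, key_length).softmax(dim=-1)

# Fold the heads back into one batched matmul against the single shared value tensor.
attn_output = torch.bmm(attn_weights.view(batch_size, num_heads * query_length, key_length), value)
print(attn_output.view(batch_size, num_heads, query_length, head_dim).shape)  # torch.Size([2, 4, 5, 8])
```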
plots.png ADDED
smash_config.json ADDED
@@ -0,0 +1,27 @@
{
    "api_key": null,
    "verify_url": "http://johnrachwan.pythonanywhere.com",
    "smash_config": {
        "pruners": "None",
        "factorizers": "None",
        "quantizers": "['llm-int8']",
        "compilers": "None",
        "task": "text_text_generation",
        "device": "cuda",
        "cache_dir": "/ceph/hdd/staff/charpent/.cache/modelsed4p5bm5",
        "batch_size": 1,
        "model_name": "bigcode/santacoder",
        "pruning_ratio": 0.0,
        "n_quantization_bits": 4,
        "output_deviation": 0.005,
        "max_batch_size": 1,
        "qtype_weight": "torch.qint8",
        "qtype_activation": "torch.quint8",
        "qobserver": "<class 'torch.ao.quantization.observer.MinMaxObserver'>",
        "qscheme": "torch.per_tensor_symmetric",
        "qconfig": "x86",
        "group_size": 128,
        "damp_percent": 0.1,
        "save_load_fn": "bitsandbytes"
    }
}
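
For completeness, a short sketch (not part of the repository files) of reading the compression settings back out of this file:

```python
import json

with open("smash_config.json") as f:
    smash = json.load(f)["smash_config"]

print(smash["quantizers"], smash["n_quantization_bits"])  # "['llm-int8']" 4
print(smash["model_name"])                                # bigcode/santacoder
```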