OlivierDehaene committed on
Commit
9b6553f
1 Parent(s): 157b030
README.md ADDED
@@ -0,0 +1,292 @@
1
+ ---
2
+ license: openrail
3
+ datasets:
4
+ - bigcode/the-stack
5
+ language:
6
+ - code
7
+ programming_language:
8
+ - Java
9
+ - JavaScript
10
+ - Python
11
+ pipeline_tag: text-generation
12
+ inference: false
13
+ widget:
14
+ - text: 'def print_hello_world():'
15
+ example_title: Hello world
16
+ group: Python
17
+
18
+ model-index:
19
+ - name: SantaCoder
20
+ results:
21
+ - task:
22
+ type: text-generation
23
+ dataset:
24
+ type: nuprl/MultiPL-E
25
+ name: MultiPL HumanEval (Python)
26
+ metrics:
27
+ - name: pass@1
28
+ type: pass@1
29
+ value: 0.18
30
+ verified: false
31
+ - name: pass@10
32
+ type: pass@10
33
+ value: 0.29
34
+ verified: false
35
+ - name: pass@100
36
+ type: pass@100
37
+ value: 0.49
38
+ verified: false
39
+ - task:
40
+ type: text-generation
41
+ dataset:
42
+ type: nuprl/MultiPL-E
43
+ name: MultiPL MBPP (Python)
44
+ metrics:
45
+ - name: pass@1
46
+ type: pass@1
47
+ value: 0.35
48
+ verified: false
49
+ - name: pass@10
50
+ type: pass@10
51
+ value: 0.58
52
+ verified: false
53
+ - name: pass@100
54
+ type: pass@100
55
+ value: 0.77
56
+ verified: false
57
+ - task:
58
+ type: text-generation
59
+ dataset:
60
+ type: nuprl/MultiPL-E
61
+ name: MultiPL HumanEval (JavaScript)
62
+ metrics:
63
+ - name: pass@1
64
+ type: pass@1
65
+ value: 0.16
66
+ verified: false
67
+ - name: pass@10
68
+ type: pass@10
69
+ value: 0.27
70
+ verified: false
71
+ - name: pass@100
72
+ type: pass@100
73
+ value: 0.47
74
+ verified: false
75
+ - task:
76
+ type: text-generation
77
+ dataset:
78
+ type: nuprl/MultiPL-E
79
+ name: MultiPL MBPP (JavaScript)
80
+ metrics:
81
+ - name: pass@1
82
+ type: pass@1
83
+ value: 0.28
84
+ verified: false
85
+ - name: pass@10
86
+ type: pass@10
87
+ value: 0.51
88
+ verified: false
89
+ - name: pass@100
90
+ type: pass@100
91
+ value: 0.70
92
+ verified: false
93
+ - task:
94
+ type: text-generation
95
+ dataset:
96
+ type: nuprl/MultiPL-E
97
+ name: MultiPL HumanEval (Java)
98
+ metrics:
99
+ - name: pass@1
100
+ type: pass@1
101
+ value: 0.15
102
+ verified: false
103
+ - name: pass@10
104
+ type: pass@10
105
+ value: 0.26
106
+ verified: false
107
+ - name: pass@100
108
+ type: pass@100
109
+ value: 0.41
110
+ verified: false
111
+ - task:
112
+ type: text-generation
113
+ dataset:
114
+ type: nuprl/MultiPL-E
115
+ name: MultiPL MBPP (Java)
116
+ metrics:
117
+ - name: pass@1
118
+ type: pass@1
119
+ value: 0.28
120
+ verified: false
121
+ - name: pass@10
122
+ type: pass@10
123
+ value: 0.44
124
+ verified: false
125
+ - name: pass@100
126
+ type: pass@100
127
+ value: 0.59
128
+ verified: false
129
+ - task:
130
+ type: text-generation
131
+ dataset:
132
+ type: loubnabnl/humaneval_infilling
133
+ name: HumanEval FIM (Python)
134
+ metrics:
135
+ - name: single_line
136
+ type: exact_match
137
+ value: 0.44
138
+ verified: false
139
+ - task:
140
+ type: text-generation
141
+ dataset:
142
+ type: nuprl/MultiPL-E
143
+ name: MultiPL HumanEval FIM (Java)
144
+ metrics:
145
+ - name: single_line
146
+ type: exact_match
147
+ value: 0.62
148
+ verified: false
149
+ - task:
150
+ type: text-generation
151
+ dataset:
152
+ type: nuprl/MultiPL-E
153
+ name: MultiPL HumanEval FIM (JavaScript)
154
+ metrics:
155
+ - name: single_line
156
+ type: exact_match
157
+ value: 0.60
158
+ verified: false
159
+ - task:
160
+ type: text-generation
161
+ dataset:
162
+ type: code_x_glue_ct_code_to_text
163
+ name: CodeXGLUE code-to-text (Python)
164
+ metrics:
165
+ - name: BLEU
166
+ type: bleu
167
+ value: 18.13
168
+ verified: false
169
+ ---
170
+
171
+ # SantaCoder
172
+
173
+ ![banner](https://huggingface.co/datasets/bigcode/admin/resolve/main/banner.png)
174
+
175
+ Play with the model on the [SantaCoder Space Demo](https://huggingface.co/spaces/bigcode/santacoder-demo).
176
+
177
+ # Table of Contents
178
+
179
+ 1. [Model Summary](#model-summary)
180
+ 2. [Use](#use)
181
+ 3. [Limitations](#limitations)
182
+ 4. [Training](#training)
183
+ 5. [License](#license)
184
+ 6. [Citation](#citation)
185
+
186
+ # Model Summary
187
+
188
+ The SantaCoder models are a series of 1.1B parameter models trained on the Python, Java, and JavaScript subset of [The Stack (v1.1)](https://huggingface.co/datasets/bigcode/the-stack) (which excluded opt-out requests).
189
+ The main model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150), was trained with near-deduplication and comment-to-code ratio as filtering criteria, and uses the [Fill-in-the-Middle objective](https://arxiv.org/abs/2207.14255).
190
+ In addition, there are several models that were trained on datasets with different filtering parameters and with architecture and objective variations.
191
+
192
+ - **Repository:** [bigcode/Megatron-LM](https://github.com/bigcode-project/Megatron-LM)
193
+ - **Project Website:** [bigcode-project.org](https://www.bigcode-project.org)
194
+ - **Paper:** [🎅SantaCoder: Don't reach for the stars!🌟](https://t.co/YV3pzUbYOr)
195
+ - **Point of Contact:** [contact@bigcode-project.org](mailto:contact@bigcode-project.org)
196
+ - **Languages:** Python, Java, and JavaScript
197
+
198
+ |Model|Architecture|Objective|Filtering|
199
+ |:-|:-|:-|:-|
200
+ |`mha`|MHA|AR + FIM| Base |
201
+ |`no-fim`| MQA | AR| Base |
202
+ |`fim`| MQA | AR + FIM | Base |
203
+ |`stars`| MQA | AR + FIM | GitHub stars |
204
+ |`fertility`| MQA | AR + FIM | Tokenizer fertility |
205
+ |`comments`| MQA | AR + FIM | Comment-to-code ratio |
206
+ |`dedup-alt`| MQA | AR + FIM | Stronger near-deduplication |
207
+ |`final`| MQA | AR + FIM | Stronger near-deduplication and comment-to-code ratio |
208
+
209
+ The `final` model is the best-performing model and was trained twice as long (236B tokens) as the others. This checkpoint is the default model and is available on the `main` branch. All other checkpoints are on separate branches with corresponding names.
210
+
211
+ # Use
212
+
213
+ ## Intended use
214
+
215
+ The model was trained on GitHub code. As such, it is _not_ an instruction model, and commands like "Write a function that computes the square root." do not work well.
216
+ You should phrase requests as they would appear in source code, such as in comments (e.g. `# the following function computes the sqrt`), or write a function signature and docstring and let the model complete the function body (see the sketch after the Generation example below).
217
+
218
+ **Feel free to share your generations in the Community tab!**
219
+
220
+ ## How to use
221
+
222
+ ### Generation
223
+ ```python
224
+ # pip install -q transformers
225
+ from transformers import AutoModelForCausalLM, AutoTokenizer
226
+
227
+ checkpoint = "bigcode/santacoder"
228
+ device = "cuda" # for GPU usage or "cpu" for CPU usage
229
+
230
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
231
+ model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)
232
+
233
+ inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
234
+ outputs = model.generate(inputs)
235
+ print(tokenizer.decode(outputs[0]))
236
+ ```
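+
+ As noted under [Intended use](#intended-use), prompts work best when phrased as source code, e.g. a function signature plus docstring. A minimal sketch reusing `model`, `tokenizer`, and `device` from the snippet above (the prompt text and `max_new_tokens` value are illustrative):
+
+ ```python
+ prompt = 'def fibonacci(n):\n    """Return the n-th Fibonacci number."""\n'
+ inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
+ outputs = model.generate(inputs, max_new_tokens=64)
+ print(tokenizer.decode(outputs[0]))
+ ```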
237
+
238
+ ### Fill-in-the-middle
239
+ Fill-in-the-middle uses special tokens to identify the prefix/middle/suffix parts of the input and output:
240
+
241
+ ```python
242
+ input_text = "<fim-prefix>def print_hello_world():\n <fim-suffix>\n print('Hello world!')<fim-middle>"
243
+ inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
244
+ outputs = model.generate(inputs)
245
+ print(tokenizer.decode(outputs[0]))
246
+ ```
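+
+ The generated sequence contains the prompt (prefix and suffix) followed by the infilled middle. A minimal sketch for extracting just the middle part, assuming the default FIM special tokens shown above and the `outputs` from the previous snippet:
+
+ ```python
+ generated = tokenizer.decode(outputs[0])
+ # everything after the <fim-middle> marker is the model's infill
+ middle = generated.split("<fim-middle>")[-1]
+ print(middle)
+ ```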
247
+
248
+ ### Load other checkpoints
249
+ We upload the checkpoint of each experiment to a separate branch, as well as the intermediate checkpoints as commits on those branches. You can load them with the `revision` flag:
250
+
251
+ ```python
252
+ model = AutoModelForCausalLM.from_pretrained(
253
+ "bigcode/santacoder",
254
+ revision="no-fim", # name of branch or commit hash
255
+ trust_remote_code=True
256
+ )
257
+ ```
258
+
259
+ ### Attribution & Other Requirements
260
+
261
+ The pretraining dataset of the model was filtered for permissive licenses only. Nevertheless, the model can generate source code verbatim from the dataset. The code's license might require attribution and/or impose other specific requirements that must be respected. We provide a [search index](https://huggingface.co/spaces/bigcode/santacoder-search) that lets you search through the pretraining data to identify where generated code came from and apply the proper attribution to your code.
262
+
263
+ # Limitations
264
+
265
+ The model has been trained on source code in Python, Java, and JavaScript. The predominant natural language in the source code is English, although other languages are also present. As such, the model is capable of generating code snippets provided some context, but the generated code is not guaranteed to work as intended. It can be inefficient and may contain bugs or exploits.
266
+
267
+ # Training
268
+
269
+ ## Model
270
+
271
+ - **Architecture:** GPT-2 model with multi-query attention and Fill-in-the-Middle objective
272
+ - **Pretraining steps:** 600K
273
+ - **Pretraining tokens:** 236 billion
274
+ - **Precision:** float16
275
+
276
+ ## Hardware
277
+
278
+ - **GPUs:** 96 Tesla V100
279
+ - **Training time:** 6.2 days
280
+ - **Total FLOPs:** 2.1 × 10^21
281
+
282
+ ## Software
283
+
284
+ - **Orchestration:** [Megatron-LM](https://github.com/bigcode-project/Megatron-LM)
285
+ - **Neural networks:** [PyTorch](https://github.com/pytorch/pytorch)
286
+ - **FP16 if applicable:** [apex](https://github.com/NVIDIA/apex)
287
+
288
+ # License
289
+ The model is licensed under the CodeML Open RAIL-M v0.1 license. You can find the full license [here](https://huggingface.co/spaces/bigcode/license).
290
+
291
+ # Citation
292
+ **TODO**
config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "_name_or_path": "bigcode/santacoder",
3
+ "activation_function": "gelu_fast",
4
+ "architectures": [
5
+ "GPT2LMHeadCustomModel"
6
+ ],
7
+ "attention_head_type": "multiquery",
8
+ "attn_pdrop": 0.1,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_gpt2_mq.GPT2CustomConfig",
11
+ "AutoModelForCausalLM": "modeling_gpt2_mq.GPT2LMHeadCustomModel"
12
+ },
13
+ "bos_token_id": 50256,
14
+ "embd_pdrop": 0.1,
15
+ "eos_token_id": 50256,
16
+ "initializer_range": 0.02,
17
+ "layer_norm_epsilon": 1e-05,
18
+ "model_type": "gpt2",
19
+ "n_embd": 2048,
20
+ "n_head": 16,
21
+ "n_inner": 8192,
22
+ "n_layer": 24,
23
+ "n_positions": 2048,
24
+ "reorder_and_upcast_attn": false,
25
+ "resid_pdrop": 0.1,
26
+ "scale_attn_by_inverse_layer_idx": false,
27
+ "scale_attn_weights": true,
28
+ "summary_activation": null,
29
+ "summary_first_dropout": 0.1,
30
+ "summary_proj_to_labels": true,
31
+ "summary_type": "cls_index",
32
+ "summary_use_proj": true,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.25.1",
35
+ "use_cache": true,
36
+ "vocab_size": 49280
37
+ }
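
The `auto_map` entries above are what route `AutoConfig` and `AutoModelForCausalLM` to the custom classes shipped in this repo, which is why `trust_remote_code=True` is required when loading. A quick sanity check (a sketch; the printed values follow from the config above):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bigcode/santacoder", trust_remote_code=True)
print(type(config).__name__)       # GPT2CustomConfig, resolved via auto_map
print(config.attention_head_type)  # multiquery
print(config.n_layer, config.n_head, config.n_embd)  # 24 16 2048
```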
configuration_gpt2_mq.py ADDED
@@ -0,0 +1,201 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and Hugging Face Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Custom GPT-2 configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Any, List, Mapping, Optional
19
+ from enum import Enum
20
+
21
+ from transformers import PreTrainedTokenizer, TensorType, is_torch_available
22
+
23
+ from transformers.configuration_utils import PretrainedConfig
24
+ from transformers.onnx import OnnxConfigWithPast, PatchingSpec
25
+ from transformers.utils import logging
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
31
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
32
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
33
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
34
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
35
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
36
+ }
37
+
38
+ MULTI_HEAD = "multihead"
39
+ MULTI_QUERY = "multiquery"
40
+
41
+
42
+ class GPT2CustomConfig(PretrainedConfig):
43
+ """
44
+ This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
45
+ instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
46
+ configuration with the defaults will yield a similar configuration to that of the GPT-2
47
+ [gpt2](https://huggingface.co/gpt2) architecture.
48
+
49
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
50
+ documentation from [`PretrainedConfig`] for more information.
51
+
52
+
53
+ Args:
54
+ vocab_size (`int`, *optional*, defaults to 50257):
55
+ Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
56
+ `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
57
+ n_positions (`int`, *optional*, defaults to 1024):
58
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
59
+ just in case (e.g., 512 or 1024 or 2048).
60
+ n_embd (`int`, *optional*, defaults to 768):
61
+ Dimensionality of the embeddings and hidden states.
62
+ n_layer (`int`, *optional*, defaults to 12):
63
+ Number of hidden layers in the Transformer encoder.
64
+ n_head (`int`, *optional*, defaults to 12):
65
+ Number of attention heads for each attention layer in the Transformer encoder.
66
+ n_inner (`int`, *optional*, defaults to None):
67
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
68
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
69
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
70
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
71
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
72
+ embd_pdrop (`int`, *optional*, defaults to 0.1):
73
+ The dropout ratio for the embeddings.
74
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
75
+ The dropout ratio for the attention.
76
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
77
+ The epsilon to use in the layer normalization layers.
78
+ initializer_range (`float`, *optional*, defaults to 0.02):
79
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
80
+ summary_type (`string`, *optional*, defaults to `"cls_index"`):
81
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
82
+ [`TFGPT2DoubleHeadsModel`].
83
+
84
+ Has to be one of the following options:
85
+
86
+ - `"last"`: Take the last token hidden state (like XLNet).
87
+ - `"first"`: Take the first token hidden state (like BERT).
88
+ - `"mean"`: Take the mean of all tokens hidden states.
89
+ - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
90
+ - `"attn"`: Not implemented now, use multi-head attention.
91
+ summary_use_proj (`bool`, *optional*, defaults to `True`):
92
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
93
+ [`TFGPT2DoubleHeadsModel`].
94
+
95
+ Whether or not to add a projection after the vector extraction.
96
+ summary_activation (`str`, *optional*):
97
+ Argument used when doing sequence summary. Used in for the multiple choice head in
98
+ [`GPT2DoubleHeadsModel`].
99
+
100
+ Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
101
+ summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
102
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
103
+ [`TFGPT2DoubleHeadsModel`].
104
+
105
+ Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
106
+ summary_first_dropout (`float`, *optional*, defaults to 0.1):
107
+ Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
108
+ [`TFGPT2DoubleHeadsModel`].
109
+
110
+ The dropout ratio to be used after the projection and activation.
111
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
112
+ Scale attention weights by dividing by sqrt(head_dim).
113
+ use_cache (`bool`, *optional*, defaults to `True`):
114
+ Whether or not the model should return the last key/values attentions (not used by all models).
115
+ scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
116
+ Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
117
+ reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
118
+ Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
119
+ dot-product/softmax to float() when training with mixed precision.
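+ attention_head_type (`str`, *optional*, defaults to `"multihead"`):
+ The attention layout: `"multihead"` for standard multi-head attention, or `"multiquery"` to share a
+ single key/value head across all query heads (see `GPT2MQAttention` in `modeling_gpt2_mq.py`).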
120
+
121
+ Example:
122
+
123
+ ```python
124
+ >>> from transformers import GPT2Config, GPT2Model
125
+
126
+ >>> # Initializing a GPT2 configuration
127
+ >>> configuration = GPT2Config()
128
+
129
+ >>> # Initializing a model (with random weights) from the configuration
130
+ >>> model = GPT2Model(configuration)
131
+
132
+ >>> # Accessing the model configuration
133
+ >>> configuration = model.config
134
+ ```"""
135
+
136
+ model_type = "gpt2"
137
+ keys_to_ignore_at_inference = ["past_key_values"]
138
+ attribute_map = {
139
+ "hidden_size": "n_embd",
140
+ "max_position_embeddings": "n_positions",
141
+ "num_attention_heads": "n_head",
142
+ "num_hidden_layers": "n_layer",
143
+ }
144
+
145
+ def __init__(
146
+ self,
147
+ vocab_size=50257,
148
+ n_positions=1024,
149
+ n_embd=768,
150
+ n_layer=12,
151
+ n_head=12,
152
+ n_inner=None,
153
+ activation_function="gelu_new",
154
+ resid_pdrop=0.1,
155
+ embd_pdrop=0.1,
156
+ attn_pdrop=0.1,
157
+ layer_norm_epsilon=1e-5,
158
+ initializer_range=0.02,
159
+ summary_type="cls_index",
160
+ summary_use_proj=True,
161
+ summary_activation=None,
162
+ summary_proj_to_labels=True,
163
+ summary_first_dropout=0.1,
164
+ scale_attn_weights=True,
165
+ use_cache=True,
166
+ bos_token_id=50256,
167
+ eos_token_id=50256,
168
+ scale_attn_by_inverse_layer_idx=False,
169
+ reorder_and_upcast_attn=False,
170
+ attention_head_type=MULTI_HEAD,
171
+ **kwargs,
172
+ ):
173
+ self.vocab_size = vocab_size
174
+ self.n_positions = n_positions
175
+ self.n_embd = n_embd
176
+ self.n_layer = n_layer
177
+ self.n_head = n_head
178
+ self.n_inner = n_inner
179
+ self.activation_function = activation_function
180
+ self.resid_pdrop = resid_pdrop
181
+ self.embd_pdrop = embd_pdrop
182
+ self.attn_pdrop = attn_pdrop
183
+ self.layer_norm_epsilon = layer_norm_epsilon
184
+ self.initializer_range = initializer_range
185
+ self.summary_type = summary_type
186
+ self.summary_use_proj = summary_use_proj
187
+ self.summary_activation = summary_activation
188
+ self.summary_first_dropout = summary_first_dropout
189
+ self.summary_proj_to_labels = summary_proj_to_labels
190
+ self.scale_attn_weights = scale_attn_weights
191
+ self.use_cache = use_cache
192
+ self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
193
+ self.reorder_and_upcast_attn = reorder_and_upcast_attn
194
+ self.attention_head_type = attention_head_type
195
+ # assert attention_head_type in [AttentionType.MULTI_HEAD, AttentionType.MULTI_QUERY]
196
+ assert attention_head_type in [MULTI_HEAD, MULTI_QUERY]
197
+
198
+ self.bos_token_id = bos_token_id
199
+ self.eos_token_id = eos_token_id
200
+
201
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba58d7bbc20355cd3083e789a88fa6b9016ec36ffaf113e94df03d1449ecadf6
3
+ size 4903283827
modeling_gpt2_mq.py ADDED
@@ -0,0 +1,498 @@
1
+ """PyTorch OpenAI GPT-2 model modified with MultiQuery attention"""
2
+
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import math
6
+ import torch
7
+ import torch.utils.checkpoint
8
+ from torch import nn
9
+
10
+ from transformers.activations import ACT2FN
11
+ from transformers.modeling_outputs import (
12
+ BaseModelOutputWithPastAndCrossAttentions,
13
+ )
14
+ from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel
15
+ from transformers.utils import logging
16
+ from configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY
17
+
18
+ logger = logging.get_logger(__name__)
19
+
20
+
21
+ def make_causal_mask(
22
+ input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
23
+ ) -> torch.BoolTensor:
24
+ """
25
+ Make causal mask used for self-attention.
26
+ """
27
+ batch_size, target_length = input_ids_shape
28
+ mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
29
+ # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
30
+ seq_ids = torch.arange(target_length, device=device)
31
+ mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :]
32
+
33
+ if past_key_values_length > 0:
34
+ mask[:, :past_key_values_length] = False
35
+
36
+ expanded_mask = mask[None, :, :].expand(batch_size, target_length, target_length + past_key_values_length)
37
+ return expanded_mask
38
+
39
+
40
+ def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
41
+ """
42
+ Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
43
+ """
44
+ batch_size, src_length = mask.shape
45
+ tgt_length = tgt_length if tgt_length is not None else src_length
46
+
47
+ expanded_mask = ~(mask[:, None, :].to(torch.bool))
48
+ return expanded_mask.expand(batch_size, tgt_length, src_length)
49
+
50
+
51
+ def prepare_attn_mask(
52
+ attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int
53
+ ) -> torch.BoolTensor:
54
+ # create causal mask
55
+ # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
56
+ combined_attention_mask = None
57
+ device = attention_mask.device
58
+ _, src_length = input_shape
59
+
60
+ if src_length > 1:
61
+ combined_attention_mask = make_causal_mask(
62
+ input_shape, device=device, past_key_values_length=past_key_values_length
63
+ )
64
+
65
+ # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
66
+ expanded_attn_mask = expand_mask(attention_mask, tgt_length=src_length)
67
+ combined_attention_mask = (
68
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
69
+ )
70
+
71
+ return combined_attention_mask
72
+
73
+
74
+ @torch.jit.script
75
+ def gelu_forward(x: torch.Tensor) -> torch.Tensor:
76
+ """
77
+ Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
78
+ make the model jitable.
79
+
80
+ Args:
81
+ x (`torch.tensor`, *required*):
82
+ input hidden states
83
+ """
84
+ return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
85
+
86
+
87
+ class LinearGPT2MLP(nn.Module):
88
+ def __init__(self, intermediate_size, config):
89
+ super().__init__()
90
+ embed_dim = config.hidden_size
91
+ self.c_fc = nn.Linear(embed_dim, intermediate_size)
92
+ self.c_proj = nn.Linear(intermediate_size, embed_dim)
93
+ self.act = ACT2FN[config.activation_function] if "gelu" not in config.activation_function else gelu_forward
94
+ self.dropout = nn.Dropout(config.resid_pdrop)
95
+
96
+ def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
97
+ hidden_states = self.c_fc(hidden_states)
98
+ hidden_states = self.act(hidden_states)
99
+ hidden_states = self.c_proj(hidden_states)
100
+ hidden_states = self.dropout(hidden_states)
101
+ return hidden_states
102
+
103
+
104
+ class GPT2MQAttention(nn.Module):
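+ """Multi-query self-attention (https://arxiv.org/abs/1911.02150): all query heads share a single
+ key/value head. `q_attn` produces `num_heads` query heads of size `head_dim`, while `kv_attn`
+ projects the hidden states to a single key and value of size `head_dim`, shrinking the KV cache."""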
105
+ def __init__(self, config, is_cross_attention=False, layer_idx=None):
106
+ super().__init__()
107
+ assert config.attention_head_type == MULTI_QUERY
108
+
109
+ self.embed_dim = config.hidden_size
110
+ self.num_heads = config.num_attention_heads
111
+ self.head_dim = self.embed_dim // self.num_heads
112
+ self.split_size = self.embed_dim
113
+ if self.head_dim * self.num_heads != self.embed_dim:
114
+ raise ValueError(
115
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
116
+ f" {self.num_heads})."
117
+ )
118
+
119
+ self.scale_attn_weights = config.scale_attn_weights
120
+ if is_cross_attention:
121
+ raise NotImplementedError("Cross-attention not implemented for MQA")
122
+ self.is_cross_attention = is_cross_attention
123
+
124
+ # Layer-wise attention scaling, reordering, and upcasting
125
+ self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
126
+ self.layer_idx = layer_idx
127
+ self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
128
+
129
+ if self.is_cross_attention:
130
+ raise NotImplementedError("Cross-attention not implemented for MQA")
131
+ else:
132
+ # self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
133
+ self.q_attn = nn.Linear(self.embed_dim, self.embed_dim)
134
+ # Keys and values are shared across heads
135
+ self.kv_attn = nn.Linear(self.embed_dim, 2 * self.head_dim)
136
+ self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
137
+
138
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
139
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
140
+
141
+ self.pruned_heads = set()
142
+ self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
143
+
144
+ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
145
+ # query: (b, num_heads * sq, head_dim)
146
+ # key: (b, head_dim, sk)
147
+ # value: (b, sk, head_dim)
148
+ batch_size = query.size(0)
149
+ query_length = query.size(1) // self.num_heads
150
+ key_length = key.size(2)
151
+ # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk)
152
+
153
+ if self.scale_attn_weights:
154
+ query *= self.inv_norm_factor
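+ # Scaling the queries by 1/sqrt(head_dim) before the matmul is equivalent to scaling the logits afterwards.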
155
+
156
+ attn_weights = torch.bmm(query, key)
157
+
158
+ # -> (b, num_heads, sq, sk)
159
+ attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length)
160
+
161
+ # Layer-wise attention scaling
162
+ if self.scale_attn_by_inverse_layer_idx:
163
+ attn_weights = attn_weights / float(self.layer_idx + 1)
164
+
165
+ if attention_mask is not None:
166
+ attn_weights = attn_weights.masked_fill_(attention_mask, torch.finfo(attn_weights.dtype).min)
167
+
168
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
169
+
170
+ # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
171
+ attn_weights = attn_weights.type(value.dtype)
172
+ attn_weights = self.attn_dropout(attn_weights)
173
+
174
+ # Mask heads if we want to
175
+ if head_mask is not None:
176
+ attn_weights = attn_weights * head_mask
177
+
178
+ # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk)
179
+ _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length)
180
+ # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim)
181
+ attn_output = torch.bmm(_attn_weights, value)
182
+ attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)
183
+
184
+ return attn_output, attn_weights
185
+
186
+ def _merge_heads(self, tensor):
187
+ """
188
+ Merges attn_head_size dim and num_attn_heads dim into hidden_size
189
+ """
190
+ batch_size, num_heads, seq_length, head_dim = tensor.shape
191
+
192
+ tensor = tensor.permute(0, 2, 1, 3)
193
+ return tensor.reshape(batch_size, seq_length, num_heads * head_dim)
194
+
195
+ def forward(
196
+ self,
197
+ hidden_states: Optional[Tuple[torch.FloatTensor]],
198
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
199
+ attention_mask: Optional[torch.FloatTensor] = None,
200
+ head_mask: Optional[torch.FloatTensor] = None,
201
+ encoder_hidden_states: Optional[torch.Tensor] = None,
202
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
203
+ use_cache: Optional[bool] = False,
204
+ output_attentions: Optional[bool] = False,
205
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
206
+ if encoder_hidden_states is not None:
207
+ raise NotImplementedError("Cross-attention not implemented for MQA")
208
+ else:
209
+ query = self.q_attn(hidden_states)
210
+ key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2)
211
+
212
+ batch_size, seq_length = query.shape[:2]
213
+ # (query_length, batch, num_heads, head_dim)
214
+ # (batch, num_heads * query_length, head_dim)
215
+
216
+ # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim)
217
+ query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
218
+ # -> (batch, num_heads * query_length, head_dim)
219
+ query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim)
220
+
221
+ key = key.transpose(1, 2) # (batch_size, head_dim, seq_length)
222
+
223
+ if layer_past is not None:
224
+ past_key, past_value = layer_past
225
+ # Concatenate on sequence dimension
226
+ key = torch.cat((past_key, key), dim=-1)
227
+ value = torch.cat((past_value, value), dim=-2)
228
+
229
+ if use_cache is True:
230
+ present = (key, value)
231
+ else:
232
+ present = None
233
+
234
+ if self.reorder_and_upcast_attn:
235
+ raise NotImplementedError("Reorder and upcast attention not implemented for MQA")
236
+ else:
237
+ attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
238
+
239
+ attn_output = self._merge_heads(attn_output)
240
+ attn_output = self.c_proj(attn_output)
241
+ attn_output = self.resid_dropout(attn_output)
242
+
243
+ outputs = (attn_output, present)
244
+ if output_attentions:
245
+ outputs += (attn_weights,)
246
+
247
+ return outputs # a, present, (attentions)
248
+
249
+
250
+ # inherit from gpt_modeling.py, and override `attn` module
251
+ class GPT2CustomBlock(GPT2Block):
252
+
253
+ def __init__(self, config: GPT2CustomConfig, layer_idx=None):
254
+ super().__init__(config, layer_idx)
255
+ # Override attention module if using multiquery
256
+ if config.attention_head_type == MULTI_QUERY:
257
+ self.attn = GPT2MQAttention(config, layer_idx=layer_idx)
258
+ if config.add_cross_attention:
259
+ raise NotImplementedError("Cross-attention not implemented for MQA")
260
+
261
+ hidden_size = config.hidden_size
262
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
263
+ self.mlp = LinearGPT2MLP(inner_dim, config)
264
+
265
+
266
+ # inherit from gpt_modeling.py and override `__init__` and `forward` methods
267
+ class GPT2CustomModel(GPT2Model):
268
+ config_class = GPT2CustomConfig
269
+
270
+ def __init__(self, config):
271
+ GPT2PreTrainedModel.__init__(self, config)
272
+
273
+ if config.attention_head_type != MULTI_QUERY:
274
+ raise NotImplementedError("optimized gpt2 is not implemented for MHA")
275
+
276
+ self.embed_dim = config.hidden_size
277
+
278
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
279
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
280
+
281
+ self.drop = nn.Dropout(config.embd_pdrop)
282
+ self.h = nn.ModuleList([GPT2CustomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
283
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
284
+
285
+ # Model parallel
286
+ self.model_parallel = False
287
+ self.device_map = None
288
+ self.gradient_checkpointing = False
289
+
290
+ # Initialize weights and apply final processing
291
+ self.post_init()
292
+
293
+ def forward(
294
+ self,
295
+ input_ids: Optional[torch.LongTensor] = None,
296
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
297
+ attention_mask: Optional[torch.FloatTensor] = None,
298
+ token_type_ids: Optional[torch.LongTensor] = None,
299
+ position_ids: Optional[torch.LongTensor] = None,
300
+ head_mask: Optional[torch.FloatTensor] = None,
301
+ inputs_embeds: Optional[torch.FloatTensor] = None,
302
+ encoder_hidden_states: Optional[torch.Tensor] = None,
303
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
304
+ use_cache: Optional[bool] = None,
305
+ output_attentions: Optional[bool] = None,
306
+ output_hidden_states: Optional[bool] = None,
307
+ return_dict: Optional[bool] = None,
308
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
309
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
310
+ output_hidden_states = (
311
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
312
+ )
313
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
314
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
315
+
316
+ if input_ids is not None and inputs_embeds is not None:
317
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
318
+ elif input_ids is not None:
319
+ input_shape = input_ids.size()
320
+ input_ids = input_ids.view(-1, input_shape[-1])
321
+ batch_size = input_ids.shape[0]
322
+ seq_length = input_ids.shape[1]
323
+ elif inputs_embeds is not None:
324
+ input_shape = inputs_embeds.size()[:-1]
325
+ batch_size = inputs_embeds.shape[0]
326
+ seq_length = inputs_embeds.shape[1]
327
+ else:
328
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
329
+
330
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
331
+
332
+ if token_type_ids is not None:
333
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
334
+ if position_ids is not None:
335
+ position_ids = position_ids.view(-1, input_shape[-1])
336
+
337
+ if past_key_values is None:
338
+ past_key_values = tuple([None] * len(self.h))
339
+
340
+ seq_length_with_past = seq_length
341
+ past_key_values_length = 0
342
+ if past_key_values[0] is not None:
343
+ past_key_values_length = past_key_values[0][0].shape[-1]
344
+ seq_length_with_past = seq_length_with_past + past_key_values_length
345
+ if position_ids is None:
346
+ position_ids = torch.arange(past_key_values_length, input_shape[-1] + past_key_values_length,
347
+ dtype=torch.long, device=device)
348
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
349
+
350
+ # GPT2Attention mask.
351
+ if attention_mask is None:
352
+ attention_mask = torch.ones((batch_size, seq_length_with_past), device=device)
353
+ else:
354
+ attention_mask = attention_mask.to(device)
355
+
356
+ attention_mask = prepare_attn_mask(
357
+ attention_mask,
358
+ input_shape=(batch_size, seq_length),
359
+ past_key_values_length=past_key_values_length,
360
+ )
361
+
362
+ attention_mask = attention_mask.unsqueeze(1).expand(batch_size, self.config.num_attention_heads,
363
+ *attention_mask.shape[1:])
364
+
365
+ # If a 2D or 3D attention mask is provided for the cross-attention
366
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
367
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
368
+ raise NotImplementedError
369
+ else:
370
+ encoder_attention_mask = None
371
+
372
+ # Prepare head mask if needed
373
+ # 1.0 in head_mask indicate we keep the head
374
+ # attention_probs has shape bsz x n_heads x N x N
375
+ # head_mask has shape n_layer x batch x n_heads x N x N
376
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
377
+
378
+ if inputs_embeds is None:
379
+ inputs_embeds = self.wte(input_ids)
380
+ position_embeds = self.wpe(position_ids)
381
+ hidden_states = inputs_embeds + position_embeds
382
+
383
+ if token_type_ids is not None:
384
+ token_type_embeds = self.wte(token_type_ids)
385
+ hidden_states = hidden_states + token_type_embeds
386
+
387
+ hidden_states = self.drop(hidden_states)
388
+
389
+ output_shape = input_shape + (hidden_states.size(-1),)
390
+
391
+ presents = () if use_cache else None
392
+ all_self_attentions = () if output_attentions else None
393
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
394
+ all_hidden_states = () if output_hidden_states else None
395
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
396
+
397
+ # Model parallel
398
+ if self.model_parallel:
399
+ torch.cuda.set_device(hidden_states.device)
400
+ # Ensure layer_past is on same device as hidden_states (might not be correct)
401
+ if layer_past is not None:
402
+ layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
403
+ # Ensure that attention_mask is always on the same device as hidden_states
404
+ if attention_mask is not None:
405
+ attention_mask = attention_mask.to(hidden_states.device)
406
+ if isinstance(head_mask, torch.Tensor):
407
+ head_mask = head_mask.to(hidden_states.device)
408
+ if output_hidden_states:
409
+ all_hidden_states = all_hidden_states + (hidden_states,)
410
+
411
+ if self.gradient_checkpointing and self.training:
412
+
413
+ if use_cache:
414
+ logger.warning(
415
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
416
+ )
417
+ use_cache = False
418
+
419
+ def create_custom_forward(module):
420
+ def custom_forward(*inputs):
421
+ # None for past_key_value
422
+ return module(*inputs, use_cache, output_attentions)
423
+
424
+ return custom_forward
425
+
426
+ outputs = torch.utils.checkpoint.checkpoint(
427
+ create_custom_forward(block),
428
+ hidden_states,
429
+ None,
430
+ attention_mask,
431
+ head_mask[i],
432
+ encoder_hidden_states,
433
+ encoder_attention_mask,
434
+ )
435
+ else:
436
+ outputs = block(
437
+ hidden_states,
438
+ layer_past=layer_past,
439
+ attention_mask=attention_mask,
440
+ head_mask=head_mask[i],
441
+ encoder_hidden_states=encoder_hidden_states,
442
+ encoder_attention_mask=encoder_attention_mask,
443
+ use_cache=use_cache,
444
+ output_attentions=output_attentions,
445
+ )
446
+
447
+ hidden_states = outputs[0]
448
+ if use_cache is True:
449
+ presents = presents + (outputs[1],)
450
+
451
+ if output_attentions:
452
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
453
+ if self.config.add_cross_attention:
454
+ all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
455
+
456
+ # Model Parallel: If it's the last layer for that device, put things on the next device
457
+ if self.model_parallel:
458
+ for k, v in self.device_map.items():
459
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
460
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
461
+
462
+ hidden_states = self.ln_f(hidden_states)
463
+
464
+ hidden_states = hidden_states.view(output_shape)
465
+ # Add last hidden state
466
+ if output_hidden_states:
467
+ all_hidden_states = all_hidden_states + (hidden_states,)
468
+
469
+ if not return_dict:
470
+ return tuple(
471
+ v
472
+ for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
473
+ if v is not None
474
+ )
475
+
476
+ return BaseModelOutputWithPastAndCrossAttentions(
477
+ last_hidden_state=hidden_states,
478
+ past_key_values=presents,
479
+ hidden_states=all_hidden_states,
480
+ attentions=all_self_attentions,
481
+ cross_attentions=all_cross_attentions,
482
+ )
483
+
484
+
485
+ class GPT2LMHeadCustomModel(GPT2LMHeadModel):
486
+ config_class = GPT2CustomConfig
487
+
488
+ def __init__(self, config):
489
+ GPT2PreTrainedModel.__init__(self, config)
490
+ self.transformer = GPT2CustomModel(config)
491
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
492
+
493
+ # Model parallel
494
+ self.model_parallel = False
495
+ self.device_map = None
496
+
497
+ # Initialize weights and apply final processing
498
+ self.post_init()
special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "name_or_path": "bigcode/digit-bytelevel-bpe-jss-v1.1-49152",
3
+ "special_tokens_map_file": "/Users/leandro/.cache/huggingface/hub/models--bigcode--digit-bytelevel-bpe-jss-v1.1-49152/snapshots/fa09b77949689a484afafc5f89534e6b6ba2c151/special_tokens_map.json",
4
+ "tokenizer_class": "PreTrainedTokenizerFast",
5
+ "vocab_size": 49152,
6
+ "model_max_length": 2048
7
+ }