Add files!

Browse files

Files changed (10) hide show

checkpoint-300/config.json +37 -0
checkpoint-300/configuration_gpt2_mq.py +201 -0
checkpoint-300/modeling_gpt2_mq.py +346 -0
checkpoint-300/optimizer.pt +3 -0
checkpoint-300/pytorch_model.bin +3 -0
checkpoint-300/rng_state.pth +3 -0
checkpoint-300/scaler.pt +3 -0
checkpoint-300/scheduler.pt +3 -0
checkpoint-300/trainer_state.json +2056 -0
checkpoint-300/training_args.bin +3 -0

checkpoint-300/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_name_or_path": "bigcode/santacoder",
+  "activation_function": "gelu_fast",
+  "architectures": [
+    "GPT2LMHeadCustomModel"
+  ],
+  "attention_head_type": "multiquery",
+  "attn_pdrop": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_gpt2_mq.GPT2CustomConfig",
+    "AutoModelForCausalLM": "modeling_gpt2_mq.GPT2LMHeadCustomModel"
+  },
+  "bos_token_id": 49152,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 49152,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 2048,
+  "n_head": 16,
+  "n_inner": 8192,
+  "n_layer": 24,
+  "n_positions": 2048,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 49280
+}

checkpoint-300/configuration_gpt2_mq.py ADDED Viewed

	@@ -0,0 +1,201 @@

+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and Hugging Face Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Custom GPT-2 configuration"""
+from collections import OrderedDict
+from typing import Any, List, Mapping, Optional
+from enum import Enum
+from transformers import PreTrainedTokenizer, TensorType, is_torch_available
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfigWithPast, PatchingSpec
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
+    "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
+    "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
+    "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
+    "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
+}
+MULTI_HEAD = "multihead"
+MULTI_QUERY = "multiquery"
+class GPT2CustomConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
+    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPT-2
+    [gpt2](https://huggingface.co/gpt2) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (`string`, *optional*, defaults to `"cls_index"`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+            Has to be one of the following options:
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            [`GPT2DoubleHeadsModel`].
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+            The dropout ratio to be used after the projection and activation.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(head_dim)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+    Example:
+    ```python
+    >>> from transformers import GPT2Config, GPT2Model
+    >>> # Initializing a GPT2 configuration
+    >>> configuration = GPT2Config()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = GPT2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "gpt2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        summary_type="cls_index",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        attention_head_type=MULTI_HEAD,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.attention_head_type = attention_head_type
+        # assert attention_head_type in [AttentionType.MULTI_HEAD, AttentionType.MULTI_QUERY]
+        assert attention_head_type in [MULTI_HEAD, MULTI_QUERY]
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

checkpoint-300/modeling_gpt2_mq.py ADDED Viewed

	@@ -0,0 +1,346 @@

+"""PyTorch OpenAI GPT-2 model modified with MultiQuery attention"""
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.cuda.amp import autocast
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel, SequenceSummary
+from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from transformers.utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
+from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel
+from .configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD
+class GPT2MQAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        assert config.attention_head_type == MULTI_QUERY
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
+                1, 1, max_positions, max_positions
+            ),
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4))
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale_attn_weights = config.scale_attn_weights
+        if is_cross_attention:
+            raise NotImplementedError("Cross-attention not implemented for MQA")
+        self.is_cross_attention = is_cross_attention
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+        if self.is_cross_attention:
+            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+        else:
+            # self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+            # Keys and values are shared across heads
+            self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.pruned_heads = set()
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+        # Update hyper params
+        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+        self.num_heads = self.num_heads - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # query: (b, num_heads * sq, head_dim)
+        # key: (b, head_dim, sk)
+        # value: (b, sk, head_dim)
+        batch_size = query.size(0)
+        query_length = query.size(1) // self.num_heads
+        key_length = key.size(2)
+        # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk)
+        attn_weights = torch.bmm(query, key)
+        # -> (b, num_heads, sq, sk)
+        attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length)
+        if self.scale_attn_weights:
+            attn_weights = attn_weights / torch.tensor(
+                value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+            )
+        # Layer-wise attention scaling
+        if self.scale_attn_by_inverse_layer_idx:
+            attn_weights = attn_weights / float(self.layer_idx + 1)
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+        # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk)
+        _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length)
+        # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim)
+        attn_output = torch.bmm(_attn_weights, value)
+        attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)
+        return attn_output, attn_weights
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+        # Preallocate attn_weights for `baddbmm`
+        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+        # Compute Scale Factor
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+        if self.scale_attn_by_inverse_layer_idx:
+            scale_factor /= float(self.layer_idx + 1)
+        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+        attn_output = torch.matmul(attn_weights, value)
+        return attn_output, attn_weights
+    def _split_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Splits hidden_size dim into attn_head_size and num_heads
+        """
+        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+        tensor = tensor.view(new_shape)
+        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+    def _merge_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden_size
+        """
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
+        return tensor.view(new_shape)
+    def forward(
+        self,
+        hidden_states: Optional[Tuple[torch.FloatTensor]],
+        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
+        if encoder_hidden_states is not None:
+            raise NotImplementedError("Cross-attention not implemented for MQA")
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
+                )
+            query = self.q_attn(hidden_states)
+            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
+            attention_mask = encoder_attention_mask
+        else:
+            query = self.q_attn(hidden_states)
+            key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2)
+        batch_size, seq_length = query.shape[:2]
+        # (query_length, batch, num_heads, head_dim)
+        # (batch, num_heads * query_length, head_dim)\
+        # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim)
+        query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3])
+        # -> (batch, num_heads * query_length, head_dim)
+        query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim)
+        # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim)
+        # query = query.view(
+        #     batch_size, seq_length, self.num_heads, self.head_dim,
+        # ).reshape(
+        #     batch_size, seq_length * self.num_heads, self.head_dim
+        # )
+        key = key.permute(0, 2, 1)  # (batch_size, head_dim, seq_length)
+        # value (batch_size, seq_length, head_dim)
+        if layer_past is not None:
+            past_key, past_value = layer_past
+            # Concatenate on sequence dimension
+            key = torch.cat((past_key, key), dim=-1)
+            value = torch.cat((past_value, value), dim=-2)
+        if use_cache is True:
+            present = (key, value)
+        else:
+            present = None
+        if self.reorder_and_upcast_attn:
+            raise NotImplementedError("Reorder and upcast attention not implemented for MQA")
+            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
+        else:
+            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+        outputs = (attn_output, present)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs  # a, present, (attentions)
+# inherit from gpt_modeling.py, and override `attn` module
+class GPT2CustomBlock(GPT2Block):
+    def __init__(self, config: GPT2CustomConfig, layer_idx=None):
+        super().__init__(config, layer_idx)
+        # Override attention module if using multiquery
+        if config.attention_head_type == MULTI_QUERY:
+            self.attn = GPT2MQAttention(config, layer_idx=layer_idx)
+            if config.add_cross_attention:
+                raise NotImplementedError("Cross-attention not implemented for MQA")
+# inherit from gpt_modeling.py and override `__init__` method
+class GPT2CustomModel(GPT2Model):
+    config_class = GPT2CustomConfig
+    def __init__(self, config):
+        GPT2PreTrainedModel.__init__(self, config)
+        self.embed_dim = config.hidden_size
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPT2CustomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+class GPT2LMHeadCustomModel(GPT2LMHeadModel):
+    config_class = GPT2CustomConfig
+    def __init__(self, config):
+        GPT2PreTrainedModel.__init__(self, config)
+        self.transformer = GPT2CustomModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        # Initialize weights and apply final processing
+        self.post_init()

checkpoint-300/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84d14294d941b0df2fd04f37973b38a81296ea1f41f9bf9e16b871c0ab8bd14b
+size 1459

checkpoint-300/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6928345a3054ec03006d7e1b519f551c79196893f038cb6cabc68a9a4f246e42
+size 4600336581

checkpoint-300/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7744e589658518cdc45ed0562339611dea49d59dab5f337b421d0fee1ff7d2e
+size 14575

checkpoint-300/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fa181fa360d46feed4180ea17c8b6a4a879a9b4231c2e91aff2be20be9076cc
+size 557

checkpoint-300/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11c20fa9a427ab3fa3faf10cbf9ec355ead8f7d713d92d58a7afc36da47f1a0b
+size 627

checkpoint-300/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2056 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.4642,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.4922,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.4662,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.4925,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4821,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4877,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4703,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4768,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.479,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4581,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4584,
+      "eval_samples_per_second": 34.575,
+      "eval_steps_per_second": 2.034,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.5086,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.5105,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4347,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.4871,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0,
+      "loss": 1.504,
+      "step": 15
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4708,
+      "step": 16
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.5296,
+      "step": 17
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4056,
+      "step": 18
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4928,
+      "step": 19
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4764,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4788,
+      "eval_samples_per_second": 34.291,
+      "eval_steps_per_second": 2.017,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4898,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4475,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4926,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.0,
+      "loss": 1.4854,
+      "step": 24
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4763,
+      "step": 25
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4496,
+      "step": 26
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4878,
+      "step": 27
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4838,
+      "step": 28
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.5233,
+      "step": 29
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4398,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 1.4974406957626343,
+      "eval_runtime": 2.4674,
+      "eval_samples_per_second": 34.449,
+      "eval_steps_per_second": 2.026,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4769,
+      "step": 31
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.5235,
+      "step": 32
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.4976,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 0.0,
+      "loss": 1.5019,
+      "step": 34
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4437,
+      "step": 35
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4414,
+      "step": 36
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.527,
+      "step": 37
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4977,
+      "step": 38
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4703,
+      "step": 39
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4633,
+      "step": 40
+    },
+    {
+      "epoch": 0.04,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4589,
+      "eval_samples_per_second": 34.568,
+      "eval_steps_per_second": 2.033,
+      "step": 40
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.5169,
+      "step": 41
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.5016,
+      "step": 42
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4505,
+      "step": 43
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.4539,
+      "step": 44
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 0.0,
+      "loss": 1.5177,
+      "step": 45
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4662,
+      "step": 46
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4824,
+      "step": 47
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4901,
+      "step": 48
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4714,
+      "step": 49
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4663,
+      "step": 50
+    },
+    {
+      "epoch": 0.05,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4708,
+      "eval_samples_per_second": 34.401,
+      "eval_steps_per_second": 2.024,
+      "step": 50
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4819,
+      "step": 51
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.495,
+      "step": 52
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4785,
+      "step": 53
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.0,
+      "loss": 1.4907,
+      "step": 54
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4766,
+      "step": 55
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4638,
+      "step": 56
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4695,
+      "step": 57
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4272,
+      "step": 58
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.5211,
+      "step": 59
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.5044,
+      "step": 60
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 1.4974411725997925,
+      "eval_runtime": 2.4841,
+      "eval_samples_per_second": 34.217,
+      "eval_steps_per_second": 2.013,
+      "step": 60
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.5065,
+      "step": 61
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4428,
+      "step": 62
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4665,
+      "step": 63
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.0,
+      "loss": 1.4986,
+      "step": 64
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4946,
+      "step": 65
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4675,
+      "step": 66
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4636,
+      "step": 67
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.5105,
+      "step": 68
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4805,
+      "step": 69
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4444,
+      "step": 70
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.7704,
+      "eval_samples_per_second": 30.682,
+      "eval_steps_per_second": 1.805,
+      "step": 70
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.5231,
+      "step": 71
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4438,
+      "step": 72
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4733,
+      "step": 73
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.4863,
+      "step": 74
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 0.0,
+      "loss": 1.5116,
+      "step": 75
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4434,
+      "step": 76
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4413,
+      "step": 77
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4878,
+      "step": 78
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4866,
+      "step": 79
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4683,
+      "step": 80
+    },
+    {
+      "epoch": 0.08,
+      "eval_loss": 1.4974406957626343,
+      "eval_runtime": 2.4599,
+      "eval_samples_per_second": 34.555,
+      "eval_steps_per_second": 2.033,
+      "step": 80
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4787,
+      "step": 81
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4832,
+      "step": 82
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4494,
+      "step": 83
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0,
+      "loss": 1.4606,
+      "step": 84
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4981,
+      "step": 85
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.5046,
+      "step": 86
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4937,
+      "step": 87
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4954,
+      "step": 88
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4731,
+      "step": 89
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.5075,
+      "step": 90
+    },
+    {
+      "epoch": 0.09,
+      "eval_loss": 1.4974411725997925,
+      "eval_runtime": 2.4585,
+      "eval_samples_per_second": 34.575,
+      "eval_steps_per_second": 2.034,
+      "step": 90
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4989,
+      "step": 91
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4831,
+      "step": 92
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4624,
+      "step": 93
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.0,
+      "loss": 1.4936,
+      "step": 94
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4631,
+      "step": 95
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4991,
+      "step": 96
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4646,
+      "step": 97
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4986,
+      "step": 98
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4815,
+      "step": 99
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4876,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.485,
+      "eval_samples_per_second": 34.205,
+      "eval_steps_per_second": 2.012,
+      "step": 100
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4663,
+      "step": 101
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4616,
+      "step": 102
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.4779,
+      "step": 103
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.5175,
+      "step": 104
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0,
+      "loss": 1.48,
+      "step": 105
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4722,
+      "step": 106
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4856,
+      "step": 107
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4342,
+      "step": 108
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4481,
+      "step": 109
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4997,
+      "step": 110
+    },
+    {
+      "epoch": 0.11,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4899,
+      "eval_samples_per_second": 34.137,
+      "eval_steps_per_second": 2.008,
+      "step": 110
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4932,
+      "step": 111
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.5032,
+      "step": 112
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.493,
+      "step": 113
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 0.0,
+      "loss": 1.4659,
+      "step": 114
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4918,
+      "step": 115
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.5082,
+      "step": 116
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4699,
+      "step": 117
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.464,
+      "step": 118
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4729,
+      "step": 119
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4768,
+      "step": 120
+    },
+    {
+      "epoch": 0.12,
+      "eval_loss": 1.497441291809082,
+      "eval_runtime": 2.4613,
+      "eval_samples_per_second": 34.535,
+      "eval_steps_per_second": 2.031,
+      "step": 120
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.473,
+      "step": 121
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4733,
+      "step": 122
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4845,
+      "step": 123
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4629,
+      "step": 124
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0,
+      "loss": 1.4659,
+      "step": 125
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4863,
+      "step": 126
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4926,
+      "step": 127
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4879,
+      "step": 128
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4636,
+      "step": 129
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.501,
+      "step": 130
+    },
+    {
+      "epoch": 0.13,
+      "eval_loss": 1.4974408149719238,
+      "eval_runtime": 2.4972,
+      "eval_samples_per_second": 34.037,
+      "eval_steps_per_second": 2.002,
+      "step": 130
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4754,
+      "step": 131
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4732,
+      "step": 132
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4862,
+      "step": 133
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0,
+      "loss": 1.4766,
+      "step": 134
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4898,
+      "step": 135
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4533,
+      "step": 136
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.491,
+      "step": 137
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4539,
+      "step": 138
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4875,
+      "step": 139
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.5224,
+      "step": 140
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 1.4974406957626343,
+      "eval_runtime": 2.4604,
+      "eval_samples_per_second": 34.547,
+      "eval_steps_per_second": 2.032,
+      "step": 140
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4881,
+      "step": 141
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4815,
+      "step": 142
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.474,
+      "step": 143
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4913,
+      "step": 144
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0,
+      "loss": 1.4527,
+      "step": 145
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4874,
+      "step": 146
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4907,
+      "step": 147
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4855,
+      "step": 148
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4746,
+      "step": 149
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4988,
+      "step": 150
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4749,
+      "eval_samples_per_second": 34.345,
+      "eval_steps_per_second": 2.02,
+      "step": 150
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.5209,
+      "step": 151
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4406,
+      "step": 152
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.501,
+      "step": 153
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.4227,
+      "step": 154
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0,
+      "loss": 1.474,
+      "step": 155
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.493,
+      "step": 156
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4953,
+      "step": 157
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4475,
+      "step": 158
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.5084,
+      "step": 159
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4582,
+      "step": 160
+    },
+    {
+      "epoch": 0.16,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4861,
+      "eval_samples_per_second": 34.19,
+      "eval_steps_per_second": 2.011,
+      "step": 160
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4891,
+      "step": 161
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.5041,
+      "step": 162
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4514,
+      "step": 163
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0,
+      "loss": 1.4876,
+      "step": 164
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.0,
+      "loss": 1.4778,
+      "step": 165
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.0,
+      "loss": 1.4555,
+      "step": 166
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 5e-06,
+      "loss": 1.5126,
+      "step": 167
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1e-05,
+      "loss": 1.4723,
+      "step": 168
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1.5e-05,
+      "loss": 1.4596,
+      "step": 169
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 2e-05,
+      "loss": 1.5069,
+      "step": 170
+    },
+    {
+      "epoch": 0.17,
+      "eval_loss": 1.497441291809082,
+      "eval_runtime": 2.4879,
+      "eval_samples_per_second": 34.165,
+      "eval_steps_per_second": 2.01,
+      "step": 170
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 2.5e-05,
+      "loss": 1.4832,
+      "step": 171
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 3e-05,
+      "loss": 1.4793,
+      "step": 172
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 3.5e-05,
+      "loss": 1.5123,
+      "step": 173
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 4e-05,
+      "loss": 1.4773,
+      "step": 174
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4608,
+      "step": 175
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 5e-05,
+      "loss": 1.4544,
+      "step": 176
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.999987412513878e-05,
+      "loss": 1.4933,
+      "step": 177
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.999949650182266e-05,
+      "loss": 1.4753,
+      "step": 178
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.999886713385432e-05,
+      "loss": 1.4745,
+      "step": 179
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9997986027571485e-05,
+      "loss": 1.5125,
+      "step": 180
+    },
+    {
+      "epoch": 0.18,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4894,
+      "eval_samples_per_second": 34.145,
+      "eval_steps_per_second": 2.009,
+      "step": 180
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9996853191846885e-05,
+      "loss": 1.4941,
+      "step": 181
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.999546863808815e-05,
+      "loss": 1.4606,
+      "step": 182
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9993832380237735e-05,
+      "loss": 1.4429,
+      "step": 183
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9991944434772734e-05,
+      "loss": 1.4993,
+      "step": 184
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9989804820704735e-05,
+      "loss": 1.4545,
+      "step": 185
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.9987413559579636e-05,
+      "loss": 1.5229,
+      "step": 186
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.99847706754774e-05,
+      "loss": 1.4328,
+      "step": 187
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.9981876195011844e-05,
+      "loss": 1.5037,
+      "step": 188
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.9978730147330355e-05,
+      "loss": 1.4765,
+      "step": 189
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.99753325641136e-05,
+      "loss": 1.4936,
+      "step": 190
+    },
+    {
+      "epoch": 0.19,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4637,
+      "eval_samples_per_second": 34.5,
+      "eval_steps_per_second": 2.029,
+      "step": 190
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.99716834795752e-05,
+      "loss": 1.5076,
+      "step": 191
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.996778293046141e-05,
+      "loss": 1.441,
+      "step": 192
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.996363095605069e-05,
+      "loss": 1.4712,
+      "step": 193
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.995922759815339e-05,
+      "loss": 1.4352,
+      "step": 194
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.9954572901111286e-05,
+      "loss": 1.5377,
+      "step": 195
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.994966691179711e-05,
+      "loss": 1.4753,
+      "step": 196
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.994450967961413e-05,
+      "loss": 1.474,
+      "step": 197
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.993910125649561e-05,
+      "loss": 1.4867,
+      "step": 198
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.993344169690431e-05,
+      "loss": 1.4881,
+      "step": 199
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.992753105783194e-05,
+      "loss": 1.4942,
+      "step": 200
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4763,
+      "eval_samples_per_second": 34.325,
+      "eval_steps_per_second": 2.019,
+      "step": 200
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.992136939879856e-05,
+      "loss": 1.4813,
+      "step": 201
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.991495678185202e-05,
+      "loss": 1.4725,
+      "step": 202
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.9908293271567286e-05,
+      "loss": 1.4791,
+      "step": 203
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.990137893504585e-05,
+      "loss": 1.4917,
+      "step": 204
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.989421384191499e-05,
+      "loss": 1.5229,
+      "step": 205
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.988679806432712e-05,
+      "loss": 1.4668,
+      "step": 206
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.987913167695904e-05,
+      "loss": 1.4752,
+      "step": 207
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.9871214757011176e-05,
+      "loss": 1.505,
+      "step": 208
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.9863047384206835e-05,
+      "loss": 1.4523,
+      "step": 209
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.985462964079137e-05,
+      "loss": 1.4324,
+      "step": 210
+    },
+    {
+      "epoch": 0.21,
+      "eval_loss": 1.4974406957626343,
+      "eval_runtime": 2.4644,
+      "eval_samples_per_second": 34.492,
+      "eval_steps_per_second": 2.029,
+      "step": 210
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.984596161153136e-05,
+      "loss": 1.4717,
+      "step": 211
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 1.493,
+      "step": 212
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.982787504714503e-05,
+      "loss": 1.4947,
+      "step": 213
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.981845669415022e-05,
+      "loss": 1.5107,
+      "step": 214
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.980878841957203e-05,
+      "loss": 1.4815,
+      "step": 215
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.9798870320769886e-05,
+      "loss": 1.4603,
+      "step": 216
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.978870249761893e-05,
+      "loss": 1.4934,
+      "step": 217
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.977828505250903e-05,
+      "loss": 1.4776,
+      "step": 218
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.9767618090343745e-05,
+      "loss": 1.4958,
+      "step": 219
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.975670171853926e-05,
+      "loss": 1.5113,
+      "step": 220
+    },
+    {
+      "epoch": 0.22,
+      "eval_loss": 1.4974405765533447,
+      "eval_runtime": 2.464,
+      "eval_samples_per_second": 34.497,
+      "eval_steps_per_second": 2.029,
+      "step": 220
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.9745536047023324e-05,
+      "loss": 1.4685,
+      "step": 221
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.973412118823412e-05,
+      "loss": 1.4465,
+      "step": 222
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.972245725711914e-05,
+      "loss": 1.4354,
+      "step": 223
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.971054437113406e-05,
+      "loss": 1.4926,
+      "step": 224
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.969838265024151e-05,
+      "loss": 1.4707,
+      "step": 225
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.968597221690986e-05,
+      "loss": 1.5009,
+      "step": 226
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.967331319611206e-05,
+      "loss": 1.4871,
+      "step": 227
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.96604057153243e-05,
+      "loss": 1.5048,
+      "step": 228
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.964724990452476e-05,
+      "loss": 1.4602,
+      "step": 229
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.963384589619233e-05,
+      "loss": 1.4779,
+      "step": 230
+    },
+    {
+      "epoch": 0.23,
+      "eval_loss": 1.4974411725997925,
+      "eval_runtime": 2.491,
+      "eval_samples_per_second": 34.123,
+      "eval_steps_per_second": 2.007,
+      "step": 230
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.962019382530521e-05,
+      "loss": 1.501,
+      "step": 231
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.9606293829339595e-05,
+      "loss": 1.4836,
+      "step": 232
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.959214604826831e-05,
+      "loss": 1.5055,
+      "step": 233
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.957775062455933e-05,
+      "loss": 1.4509,
+      "step": 234
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.9563107703174436e-05,
+      "loss": 1.4911,
+      "step": 235
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.9548217431567665e-05,
+      "loss": 1.4737,
+      "step": 236
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.95330799596839e-05,
+      "loss": 1.4336,
+      "step": 237
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.951769543995731e-05,
+      "loss": 1.5117,
+      "step": 238
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.9502064027309836e-05,
+      "loss": 1.4661,
+      "step": 239
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.948618587914963e-05,
+      "loss": 1.4962,
+      "step": 240
+    },
+    {
+      "epoch": 0.24,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.455,
+      "eval_samples_per_second": 34.623,
+      "eval_steps_per_second": 2.037,
+      "step": 240
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.947006115536947e-05,
+      "loss": 1.4783,
+      "step": 241
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.9453690018345144e-05,
+      "loss": 1.5097,
+      "step": 242
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.9437072632933814e-05,
+      "loss": 1.4832,
+      "step": 243
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.942020916647238e-05,
+      "loss": 1.4817,
+      "step": 244
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.9403099788775754e-05,
+      "loss": 1.5156,
+      "step": 245
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.938574467213518e-05,
+      "loss": 1.4654,
+      "step": 246
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.936814399131648e-05,
+      "loss": 1.4391,
+      "step": 247
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.935029792355834e-05,
+      "loss": 1.4874,
+      "step": 248
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.933220664857044e-05,
+      "loss": 1.4714,
+      "step": 249
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.931387034853173e-05,
+      "loss": 1.4763,
+      "step": 250
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4634,
+      "eval_samples_per_second": 34.505,
+      "eval_steps_per_second": 2.03,
+      "step": 250
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.929528920808854e-05,
+      "loss": 1.4579,
+      "step": 251
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.9276463414352757e-05,
+      "loss": 1.4982,
+      "step": 252
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.925739315689991e-05,
+      "loss": 1.4569,
+      "step": 253
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.923807862776728e-05,
+      "loss": 1.481,
+      "step": 254
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.921852002145196e-05,
+      "loss": 1.4755,
+      "step": 255
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.919871753490891e-05,
+      "loss": 1.4745,
+      "step": 256
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.917867136754893e-05,
+      "loss": 1.4845,
+      "step": 257
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.915838172123671e-05,
+      "loss": 1.4617,
+      "step": 258
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.913784880028878e-05,
+      "loss": 1.4859,
+      "step": 259
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.91170728114714e-05,
+      "loss": 1.4992,
+      "step": 260
+    },
+    {
+      "epoch": 0.26,
+      "eval_loss": 1.4974408149719238,
+      "eval_runtime": 2.4941,
+      "eval_samples_per_second": 34.081,
+      "eval_steps_per_second": 2.005,
+      "step": 260
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.909605396399856e-05,
+      "loss": 1.4671,
+      "step": 261
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.9074792469529815e-05,
+      "loss": 1.528,
+      "step": 262
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.9053288542168185e-05,
+      "loss": 1.4412,
+      "step": 263
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.9031542398457974e-05,
+      "loss": 1.4627,
+      "step": 264
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.9009554257382616e-05,
+      "loss": 1.4693,
+      "step": 265
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.898732434036244e-05,
+      "loss": 1.511,
+      "step": 266
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.896485287125246e-05,
+      "loss": 1.4986,
+      "step": 267
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.8942140076340135e-05,
+      "loss": 1.4677,
+      "step": 268
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.8919186184343046e-05,
+      "loss": 1.4725,
+      "step": 269
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.889599142640663e-05,
+      "loss": 1.5047,
+      "step": 270
+    },
+    {
+      "epoch": 0.27,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4593,
+      "eval_samples_per_second": 34.563,
+      "eval_steps_per_second": 2.033,
+      "step": 270
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.887255603610185e-05,
+      "loss": 1.4393,
+      "step": 271
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.8848880249422815e-05,
+      "loss": 1.5039,
+      "step": 272
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.8824964304784446e-05,
+      "loss": 1.5001,
+      "step": 273
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.880080844302004e-05,
+      "loss": 1.473,
+      "step": 274
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.877641290737884e-05,
+      "loss": 1.4691,
+      "step": 275
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.8751777943523634e-05,
+      "loss": 1.4921,
+      "step": 276
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.8726903799528234e-05,
+      "loss": 1.4846,
+      "step": 277
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.870179072587499e-05,
+      "loss": 1.4709,
+      "step": 278
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.8676438975452274e-05,
+      "loss": 1.4837,
+      "step": 279
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.865084880355193e-05,
+      "loss": 1.4911,
+      "step": 280
+    },
+    {
+      "epoch": 0.28,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4577,
+      "eval_samples_per_second": 34.585,
+      "eval_steps_per_second": 2.034,
+      "step": 280
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.862502046786671e-05,
+      "loss": 1.4743,
+      "step": 281
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.859895422848767e-05,
+      "loss": 1.4526,
+      "step": 282
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.8572650347901544e-05,
+      "loss": 1.5013,
+      "step": 283
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.854610909098812e-05,
+      "loss": 1.4916,
+      "step": 284
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.851933072501756e-05,
+      "loss": 1.4802,
+      "step": 285
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.849231551964771e-05,
+      "loss": 1.4403,
+      "step": 286
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.8465063746921395e-05,
+      "loss": 1.4976,
+      "step": 287
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.8437575681263656e-05,
+      "loss": 1.4676,
+      "step": 288
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.8409851599479015e-05,
+      "loss": 1.4919,
+      "step": 289
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.838189178074867e-05,
+      "loss": 1.4806,
+      "step": 290
+    },
+    {
+      "epoch": 0.29,
+      "eval_loss": 1.4974409341812134,
+      "eval_runtime": 2.4575,
+      "eval_samples_per_second": 34.588,
+      "eval_steps_per_second": 2.035,
+      "step": 290
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.835369650662767e-05,
+      "loss": 1.4784,
+      "step": 291
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.832526606104213e-05,
+      "loss": 1.4818,
+      "step": 292
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.829660073028631e-05,
+      "loss": 1.4512,
+      "step": 293
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.826770080301978e-05,
+      "loss": 1.4728,
+      "step": 294
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.823856657026448e-05,
+      "loss": 1.4996,
+      "step": 295
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.8209198325401815e-05,
+      "loss": 1.4781,
+      "step": 296
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.817959636416969e-05,
+      "loss": 1.4983,
+      "step": 297
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.8149760984659506e-05,
+      "loss": 1.4604,
+      "step": 298
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.811969248731323e-05,
+      "loss": 1.4986,
+      "step": 299
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.8089391174920275e-05,
+      "loss": 1.4972,
+      "step": 300
+    },
+    {
+      "epoch": 0.3,
+      "eval_loss": 1.497441053390503,
+      "eval_runtime": 2.4854,
+      "eval_samples_per_second": 34.2,
+      "eval_steps_per_second": 2.012,
+      "step": 300
+    }
+  ],
+  "max_steps": 1000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 3.00741461803008e+17,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-300/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a56f0a54e3cd2b9c7442d5e94de4b5fa438d7c3e0833995d3351c23b2e7dc832
+size 3451