Model save

Browse files

Files changed (11) hide show

README.md +79 -0
all_results.json +21 -0
configuration_stablelm_epoch.py +117 -0
eval_results.json +16 -0
generation_config.json +6 -0
model.safetensors +1 -1
modeling_stablelm_epoch.py +919 -0
runs/Feb19_23-08-06_cccxc542/events.out.tfevents.1708402191.cccxc542.93482.0 +2 -2
runs/Feb19_23-08-06_cccxc542/events.out.tfevents.1708406899.cccxc542.93482.1 +3 -0
train_results.json +8 -0
trainer_state.json +1220 -0

README.md ADDED Viewed

	@@ -0,0 +1,79 @@

+---
+license: other
+base_model: stabilityai/stablelm-2-zephyr-1_6b
+tags:
+- trl
+- dpo
+- generated_from_trainer
+model-index:
+- name: slm-2-dpo-full
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# slm-2-dpo-full
+This model is a fine-tuned version of [stabilityai/stablelm-2-zephyr-1_6b](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b) on an unspecified dataset.
+It achieves the following results on the evaluation set:
+- Loss: 31.9894
+- Rewards/chosen: 0.0244
+- Rewards/rejected: 0.0188
+- Rewards/accuracies: 0.5234
+- Rewards/margins: 0.0057
+- Logps/rejected: -2491.7576
+- Logps/chosen: -2806.6704
+- Logits/rejected: -1.6239
+- Logits/chosen: -1.6845
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-07
+- train_batch_size: 5
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 80
+- total_eval_batch_size: 64
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
+|:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
+| 16.8403       | 0.13  | 100  | 19.5118         | 0.0256         | 0.0173           | 0.5273             | 0.0082          | -2491.9011     | -2806.5552   | -1.6068         | -1.6730       |
+| 28.1241       | 0.26  | 200  | 32.5175         | 0.0085         | -0.0039          | 0.5234             | 0.0124          | -2494.0195     | -2808.2581   | -1.6183         | -1.6812       |
+| 84.7591       | 0.39  | 300  | 47.8043         | 0.0297         | 0.0136           | 0.5391             | 0.0161          | -2492.2703     | -2806.1406   | -1.5968         | -1.6601       |
+| 40.7835       | 0.52  | 400  | 30.6722         | 0.0168         | -0.0029          | 0.5547             | 0.0197          | -2493.9204     | -2807.4263   | -1.6288         | -1.6917       |
+| 36.2204       | 0.65  | 500  | 31.2202         | 0.0303         | 0.0209           | 0.5352             | 0.0095          | -2491.5447     | -2806.0762   | -1.6236         | -1.6843       |
+| 99.7738       | 0.78  | 600  | 33.7403         | 0.0476         | 0.0372           | 0.5391             | 0.0104          | -2489.9089     | -2804.3484   | -1.6222         | -1.6827       |
+| 41.8506       | 0.92  | 700  | 32.9133         | 0.0301         | 0.0195           | 0.5547             | 0.0106          | -2491.6851     | -2806.1006   | -1.6211         | -1.6823       |
+### Framework versions
+- Transformers 4.36.2
+- Pytorch 2.2.0+cu118
+- Datasets 2.14.6
+- Tokenizers 0.15.2

all_results.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+    "epoch": 1.0,
+    "eval_logits/chosen": -1.684511661529541,
+    "eval_logits/rejected": -1.6239439249038696,
+    "eval_logps/chosen": -2806.67041015625,
+    "eval_logps/rejected": -2491.757568359375,
+    "eval_loss": 31.98944854736328,
+    "eval_rewards/accuracies": 0.5234375,
+    "eval_rewards/chosen": 0.024405580013990402,
+    "eval_rewards/margins": 0.0056530386209487915,
+    "eval_rewards/rejected": 0.01875254511833191,
+    "eval_runtime": 110.5807,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 18.086,
+    "eval_steps_per_second": 0.289,
+    "train_loss": 46.613421885577296,
+    "train_runtime": 4597.6924,
+    "train_samples": 61135,
+    "train_samples_per_second": 13.297,
+    "train_steps_per_second": 0.166
+}

configuration_stablelm_epoch.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" StableLM Epoch model configuration"""
+from transformers import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class StableLMEpochConfig(PretrainedConfig):
+    r"""
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50_304):
+            Vocabulary size of the StableLM model. Defines the number of different tokens that
+            can be represented by the `inputs_ids` passed when calling [`StableLMEpochModel`].
+        intermediate_size (`int`, *optional*, defaults to 6912):
+            Dimension of the MLP representations.
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the decoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string).
+        rope_pct (`float`, *optional*, defaults to 1.0):
+            Percentage of hidden dimensions to allocate to rotary embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 1e-5):
+            The standard deviation of the truncated_normal_initializer for initializing
+             all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-8):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions
+            (not used by all models). Only relevant if `config.is_decoder=True`.
+        use_qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should use bias for qkv layers.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    """
+    model_type = "stablelm_epoch"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=50_304,
+        intermediate_size=6912,
+        hidden_size=2560,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        rope_pct=0.25,
+        rope_theta=10_000,
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        norm_eps=1.0e-5,
+        use_cache=True,
+        use_qkv_bias=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        attention_dropout: float = 0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.intermediate_size = intermediate_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rope_pct = rope_pct
+        self.rope_theta = rope_theta
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.use_qkv_bias = use_qkv_bias
+        self.tie_word_embeddings = tie_word_embeddings
+        self.attention_dropout = attention_dropout
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

eval_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 1.0,
+    "eval_logits/chosen": -1.684511661529541,
+    "eval_logits/rejected": -1.6239439249038696,
+    "eval_logps/chosen": -2806.67041015625,
+    "eval_logps/rejected": -2491.757568359375,
+    "eval_loss": 31.98944854736328,
+    "eval_rewards/accuracies": 0.5234375,
+    "eval_rewards/chosen": 0.024405580013990402,
+    "eval_rewards/margins": 0.0056530386209487915,
+    "eval_rewards/rejected": 0.01875254511833191,
+    "eval_runtime": 110.5807,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 18.086,
+    "eval_steps_per_second": 0.289
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 100257,
+  "eos_token_id": 100257,
+  "transformers_version": "4.36.2"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31fe86514c36173de9e3fe7c6dae2ff5aa36b5e17acce167c2c060ea90d78f77
 size 3289069520

 version https://git-lfs.github.com/spec/v1
+oid sha256:699753fbaa57b9da7029d3c4177187bad5eefe713a604dc0f9e0a2e5757ffe81
 size 3289069520

modeling_stablelm_epoch.py ADDED Viewed

	@@ -0,0 +1,919 @@

+# coding=utf-8
+# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This code is based off the following work:
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+""" PyTorch StableLM Epoch model. """
+from typing import Optional, Tuple, Union
+import math
+import warnings
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging, is_flash_attn_greater_or_equal_2_10
+from .configuration_stablelm_epoch import StableLMEpochConfig
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
+except:
+    flash_attn_func, flash_attn_varlen_func = None, None
+    index_first_axis, pad_input, unpad_input = None, None, None
+logger = logging.get_logger(__name__)
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+):
+    """Make causal mask used for bi-directional self-attention."""
+    batch_size, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(torch.float16).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(batch_size, 1, tgt_len, tgt_len + past_key_values_length)
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, tgt_seq_len, src_seq_len]`."""
+    batch_size, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(batch_size, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(
+        inverted_mask.to(torch.bool), torch.finfo(dtype).min
+    )
+class RotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        max_position_embeddings: int,
+        base: int = 10_000,
+        device: Optional[torch.device] = None,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype(),
+        )
+    def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
+        # Don't do einsum, it converts fp32 to fp16 under AMP
+        # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+    def forward(self, x: torch.Tensor, seq_len: Optional[int] = None):
+        # x: [batch_size, num_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.get_default_dtype())
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+def rotate_half(x: torch.Tensor):
+    """Rotates half the hidden dims of the input."""
+    x1, x2 = torch.chunk(x, 2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [batch_size, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [batch_size, 1, seq_len, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class MLP(nn.Module):
+    def __init__(self, config: StableLMEpochConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.act_fn = nn.SiLU()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class Attention(nn.Module):
+    def __init__(self, config: StableLMEpochConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_qkv_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self._init_rope()
+    def _init_rope(self):
+        self.rotary_ndims = int(self.head_dim * self.config.rope_pct)
+        self.rotary_emb = RotaryEmbedding(
+            self.rotary_ndims,
+            max_position_embeddings=self.config.max_position_embeddings,
+            base=self.config.rope_theta,
+        )
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        position_ids: torch.LongTensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        query_rot = query_states[..., : self.rotary_ndims]
+        query_pass = query_states[..., self.rotary_ndims :]
+        key_rot = key_states[..., : self.rotary_ndims]
+        key_pass = key_states[..., self.rotary_ndims :]
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+        # [batch_size, num_heads, seq_len, head_dim]
+        query_states = torch.cat((query_states, query_pass), dim=-1)
+        key_states = torch.cat((key_states, key_pass), dim=-1)
+        if past_key_value is not None:
+            # Reuse k, v, self_attention
+            key_states = torch.cat((past_key_value[0], key_states), dim=2)
+            value_states = torch.cat((past_key_value[1], value_states), dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        # Repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        # Upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # Merge heads
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        # Final linear projection
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class FlashAttention2(Attention):
+    """
+    Attention variant that reuses the projections and rotary embedding of `Attention` but computes the attention
+    itself with the `flash_attn` kernels. Attention weights are never returned by this implementation.
+    Reference: https://github.com/huggingface/transformers/blob/5d36025ca13d05151b7a0c761e90d429c4644a30/src/transformers/models/llama/modeling_llama.py#L456
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Run flash attention over `hidden_states` and return `(attn_output, None, past_key_value)`.
+        `attention_mask` is handed to `_flash_attention_forward` as the padding mask; a deprecated `padding_mask`
+        keyword argument, if present, overrides it. `output_attentions` is ignored and forced to `False`.
+        """
+        # Legacy keyword: `padding_mask` replaces `attention_mask` when supplied.
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+        # FlashAttention2 attention does not support output_attentions
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # The projections are first laid out as [batch_size, num_heads, seq_len, head_dim] so that the rotary
+        # embedding and the KV cache can be handled exactly as in the eager path; they are transposed back to the
+        # [batch_size, seq_len, num_heads, head_dim] layout expected by Flash Attention further below.
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # Only the first `rotary_ndims` channels of each head are rotated; the rest pass through unchanged.
+        query_rot = query_states[..., : self.rotary_ndims]
+        query_pass = query_states[..., self.rotary_ndims :]
+        key_rot = key_states[..., : self.rotary_ndims]
+        key_pass = key_states[..., self.rotary_ndims :]
+        # Total key length includes whatever is already cached.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+        # [batch_size, num_heads, seq_len, head_dim]
+        query_states = torch.cat((query_states, query_pass), dim=-1)
+        key_states = torch.cat((key_states, key_pass), dim=-1)
+        if past_key_value is not None:
+            # Reuse k, v, self_attention
+            key_states = torch.cat((past_key_value[0], key_states), dim=2)
+            value_states = torch.cat((past_key_value[1], value_states), dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        # Dropout is only active in training mode.
+        dropout_rate = self.attention_dropout if self.training else 0.0
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+        # `output_attentions` was forced to False above, so this branch always runs and defines `attn_weights`.
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            query_length (`int`):
+                Length of the query sequence; used when unpadding and when re-padding the output.
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for ROCm is bumped to 2.1. For details, please see the comment in FlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+        # A mask was supplied: take the variable-length path (unpad, attend, re-pad).
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+            # Scatter the unpadded outputs back into a dense `batch_size` x `query_length` layout.
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+        return attn_output
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """
+        Remove padded positions from the query/key/value tensors according to `attention_mask`.
+        Returns the unpadded query, key and value tensors, the query indices needed to re-pad the output, the
+        `(query, key)` cumulative sequence lengths and the `(query, key)` maximum sequence lengths.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        # Flatten batch and sequence axes, then keep only the rows selected by the mask.
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            # Queries and keys cover the same positions, so the key indices apply to the queries as well.
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            # Single-token queries: every batch row contributes exactly one query.
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+# Lookup table from the `config._attn_implementation` string to the attention module class;
+# `DecoderLayer.__init__` indexes it to build its `self_attn` sub-module.
+ATTENTION_CLASSES = {
+    "eager": Attention,
+    "flash_attention_2": FlashAttention2,
+}
+class DecoderLayer(nn.Module):
+    def __init__(self, config: StableLMEpochConfig):
+        super().__init__()
+        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config=config)
+        self.mlp = MLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class StableLMEpochPreTrainedModel(PreTrainedModel):
+    """An abstract class to handle weights initialization and a simple interface
+    for downloading and loading pretrained models.
+    """
+    config_class = StableLMEpochConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def _set_gradient_checkpointing(self, module: nn.Module, value=False):
+        if isinstance(module, StableLMEpochModel):
+            module.gradient_checkpointing = value
+class StableLMEpochModel(StableLMEpochPreTrainedModel):
+    def __init__(self, config: StableLMEpochConfig):
+        super().__init__(config)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value: nn.Module):
+        self.embed_tokens = value
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+        past_key_values_length: int,
+    ):
+        # Create causal mask
+        # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            ).to(inputs_embeds.device)
+            combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+        return combined_attention_mask
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Embed positions
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            if attention_mask is None:
+                attention_mask = torch.ones(
+                    (batch_size, seq_length_with_past),
+                    dtype=torch.bool,
+                    device=inputs_embeds.device,
+                )
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # Decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, past_key_value, output_attentions)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # Add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class StableLMEpochForCausalLM(StableLMEpochPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config: StableLMEpochConfig):
+        super().__init__(config)
+        self.model = StableLMEpochModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings: nn.Module):
+        self.lm_head = new_embeddings
+    def get_decoder(self):
+        return self.model
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states).float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        # Trim decoder_input_ids if past is used
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[2]
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+            input_ids = input_ids[:, remove_prefix_length:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # Create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # If `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "position_ids": position_ids,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(
+                    past_state.index_select(0, beam_idx.to(past_state.device))
+                    for past_state in layer_past
+                ),
+            )
+        return reordered_past
+StableLMEpochConfig.register_for_auto_class()
+StableLMEpochForCausalLM.register_for_auto_class("AutoModelForCausalLM")

runs/Feb19_23-08-06_cccxc542/events.out.tfevents.1708402191.cccxc542.93482.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a780bbc9c5101256ec2c7ee35478510a10fe4ac9ebcbb721ac1923eb04936be7
-size 54855

 version https://git-lfs.github.com/spec/v1
+oid sha256:1065ec687b6baaf0f460adf547c466cb2226a5e05ee5483d5bc2a5db5e551894
+size 59013

runs/Feb19_23-08-06_cccxc542/events.out.tfevents.1708406899.cccxc542.93482.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b880746d534205b79b0a80157a8b930699e5b566b7a8e0db0a8cf7a7a483a0f7
+size 828

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 1.0,
+    "train_loss": 46.613421885577296,
+    "train_runtime": 4597.6924,
+    "train_samples": 61135,
+    "train_samples_per_second": 13.297,
+    "train_steps_per_second": 0.166
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1220 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.999345977763244,
+  "eval_steps": 100,
+  "global_step": 764,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 6.493506493506494e-09,
+      "logits/chosen": -2.0615594387054443,
+      "logits/rejected": -1.9222214221954346,
+      "logps/chosen": -3380.6083984375,
+      "logps/rejected": -2521.2978515625,
+      "loss": 0.0001,
+      "rewards/accuracies": 0.0,
+      "rewards/chosen": 0.0,
+      "rewards/margins": 0.0,
+      "rewards/rejected": 0.0,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 6.493506493506492e-08,
+      "logits/chosen": -1.674426555633545,
+      "logits/rejected": -1.637134313583374,
+      "logps/chosen": -2549.3515625,
+      "logps/rejected": -2319.4013671875,
+      "loss": 10.0505,
+      "rewards/accuracies": 0.4333333373069763,
+      "rewards/chosen": 0.0008169158827513456,
+      "rewards/margins": 0.0011402772506698966,
+      "rewards/rejected": -0.00032336192089132965,
+      "step": 10
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 1.2987012987012984e-07,
+      "logits/chosen": -1.6043205261230469,
+      "logits/rejected": -1.5535523891448975,
+      "logps/chosen": -2340.101318359375,
+      "logps/rejected": -2224.145263671875,
+      "loss": 7.4843,
+      "rewards/accuracies": 0.5200000405311584,
+      "rewards/chosen": 0.00018432810611557215,
+      "rewards/margins": 0.0009077669237740338,
+      "rewards/rejected": -0.0007234388613142073,
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 1.948051948051948e-07,
+      "logits/chosen": -1.6847426891326904,
+      "logits/rejected": -1.6577625274658203,
+      "logps/chosen": -2983.23681640625,
+      "logps/rejected": -2513.237060546875,
+      "loss": 9.1379,
+      "rewards/accuracies": 0.48000001907348633,
+      "rewards/chosen": 0.010261936113238335,
+      "rewards/margins": 0.004135974682867527,
+      "rewards/rejected": 0.006125961430370808,
+      "step": 30
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 2.597402597402597e-07,
+      "logits/chosen": -1.859400987625122,
+      "logits/rejected": -1.8100417852401733,
+      "logps/chosen": -2879.57470703125,
+      "logps/rejected": -2273.878173828125,
+      "loss": 12.271,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.012033696286380291,
+      "rewards/margins": 0.005555520299822092,
+      "rewards/rejected": 0.006478174589574337,
+      "step": 40
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 3.2467532467532465e-07,
+      "logits/chosen": -1.828608751296997,
+      "logits/rejected": -1.805625319480896,
+      "logps/chosen": -2893.784423828125,
+      "logps/rejected": -2551.77294921875,
+      "loss": 8.7411,
+      "rewards/accuracies": 0.5600000023841858,
+      "rewards/chosen": 0.02166888490319252,
+      "rewards/margins": 0.007775471545755863,
+      "rewards/rejected": 0.013893413357436657,
+      "step": 50
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 3.896103896103896e-07,
+      "logits/chosen": -1.7459495067596436,
+      "logits/rejected": -1.6628999710083008,
+      "logps/chosen": -3231.689697265625,
+      "logps/rejected": -2554.42919921875,
+      "loss": 9.758,
+      "rewards/accuracies": 0.559999942779541,
+      "rewards/chosen": 0.027519574388861656,
+      "rewards/margins": 0.008895651437342167,
+      "rewards/rejected": 0.018623923882842064,
+      "step": 60
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 4.545454545454545e-07,
+      "logits/chosen": -1.8072433471679688,
+      "logits/rejected": -1.7838470935821533,
+      "logps/chosen": -2829.386474609375,
+      "logps/rejected": -2542.68701171875,
+      "loss": 11.0017,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.024034958332777023,
+      "rewards/margins": 0.006175906863063574,
+      "rewards/rejected": 0.017859051004052162,
+      "step": 70
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 4.99976474872689e-07,
+      "logits/chosen": -1.7730411291122437,
+      "logits/rejected": -1.7399647235870361,
+      "logps/chosen": -2769.705322265625,
+      "logps/rejected": -2476.75634765625,
+      "loss": 15.623,
+      "rewards/accuracies": 0.5400000214576721,
+      "rewards/chosen": 0.008623984642326832,
+      "rewards/margins": 0.008157819509506226,
+      "rewards/rejected": 0.0004661638231482357,
+      "step": 80
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 4.995583735427465e-07,
+      "logits/chosen": -1.790204644203186,
+      "logits/rejected": -1.7226215600967407,
+      "logps/chosen": -2688.0732421875,
+      "logps/rejected": -2436.649658203125,
+      "loss": 11.9811,
+      "rewards/accuracies": 0.6100000143051147,
+      "rewards/chosen": 0.017978714779019356,
+      "rewards/margins": 0.017238261178135872,
+      "rewards/rejected": 0.0007404519128613174,
+      "step": 90
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 4.986184978516146e-07,
+      "logits/chosen": -1.7211675643920898,
+      "logits/rejected": -1.6991230249404907,
+      "logps/chosen": -2611.177001953125,
+      "logps/rejected": -2212.4033203125,
+      "loss": 16.8403,
+      "rewards/accuracies": 0.5200001001358032,
+      "rewards/chosen": 0.024822045117616653,
+      "rewards/margins": 0.00336282467469573,
+      "rewards/rejected": 0.021459218114614487,
+      "step": 100
+    },
+    {
+      "epoch": 0.13,
+      "eval_logits/chosen": -1.6729556322097778,
+      "eval_logits/rejected": -1.6068017482757568,
+      "eval_logps/chosen": -2806.55517578125,
+      "eval_logps/rejected": -2491.901123046875,
+      "eval_loss": 19.51178741455078,
+      "eval_rewards/accuracies": 0.52734375,
+      "eval_rewards/chosen": 0.025559017434716225,
+      "eval_rewards/margins": 0.008243386633694172,
+      "eval_rewards/rejected": 0.017315629869699478,
+      "eval_runtime": 115.2508,
+      "eval_samples_per_second": 17.353,
+      "eval_steps_per_second": 0.278,
+      "step": 100
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 4.971588128827782e-07,
+      "logits/chosen": -1.7473026514053345,
+      "logits/rejected": -1.6806236505508423,
+      "logps/chosen": -3125.757080078125,
+      "logps/rejected": -2645.337158203125,
+      "loss": 26.9149,
+      "rewards/accuracies": 0.6000000238418579,
+      "rewards/chosen": 0.016931097954511642,
+      "rewards/margins": 0.002864243695512414,
+      "rewards/rejected": 0.014066850766539574,
+      "step": 110
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 4.951823705321981e-07,
+      "logits/chosen": -1.7069530487060547,
+      "logits/rejected": -1.6579583883285522,
+      "logps/chosen": -2828.78662109375,
+      "logps/rejected": -2442.76416015625,
+      "loss": 33.872,
+      "rewards/accuracies": 0.5600000023841858,
+      "rewards/chosen": 0.013961514458060265,
+      "rewards/margins": 0.00896529946476221,
+      "rewards/rejected": 0.004996216390281916,
+      "step": 120
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 4.926933031274343e-07,
+      "logits/chosen": -1.7224699258804321,
+      "logits/rejected": -1.6934731006622314,
+      "logps/chosen": -2923.9306640625,
+      "logps/rejected": -2566.210693359375,
+      "loss": 39.0757,
+      "rewards/accuracies": 0.550000011920929,
+      "rewards/chosen": 0.038237668573856354,
+      "rewards/margins": 0.006029448006302118,
+      "rewards/rejected": 0.03220822289586067,
+      "step": 130
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.896968147878145e-07,
+      "logits/chosen": -1.7280409336090088,
+      "logits/rejected": -1.7070726156234741,
+      "logps/chosen": -2737.75927734375,
+      "logps/rejected": -2486.45751953125,
+      "loss": 18.5231,
+      "rewards/accuracies": 0.6299999952316284,
+      "rewards/chosen": 0.031596291810274124,
+      "rewards/margins": 0.04216960817575455,
+      "rewards/rejected": -0.010573318228125572,
+      "step": 140
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.861991705437081e-07,
+      "logits/chosen": -1.7859830856323242,
+      "logits/rejected": -1.7191492319107056,
+      "logps/chosen": -2743.43310546875,
+      "logps/rejected": -2297.162109375,
+      "loss": 20.7835,
+      "rewards/accuracies": 0.5800000429153442,
+      "rewards/chosen": 0.0336376316845417,
+      "rewards/margins": 0.011832155287265778,
+      "rewards/rejected": 0.021805476397275925,
+      "step": 150
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.822076832376586e-07,
+      "logits/chosen": -1.8132251501083374,
+      "logits/rejected": -1.7665789127349854,
+      "logps/chosen": -2841.165771484375,
+      "logps/rejected": -2748.486572265625,
+      "loss": 57.9401,
+      "rewards/accuracies": 0.5099999904632568,
+      "rewards/chosen": 0.006118610501289368,
+      "rewards/margins": 0.0022155127953737974,
+      "rewards/rejected": 0.0039030970074236393,
+      "step": 160
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.777306982347594e-07,
+      "logits/chosen": -1.6557657718658447,
+      "logits/rejected": -1.5996118783950806,
+      "logps/chosen": -3055.95361328125,
+      "logps/rejected": -2603.83642578125,
+      "loss": 23.1296,
+      "rewards/accuracies": 0.6200000047683716,
+      "rewards/chosen": 0.028251701965928078,
+      "rewards/margins": 0.020935241132974625,
+      "rewards/rejected": 0.007316464092582464,
+      "step": 170
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.7277757597424075e-07,
+      "logits/chosen": -1.8335905075073242,
+      "logits/rejected": -1.7595329284667969,
+      "logps/chosen": -2963.73779296875,
+      "logps/rejected": -2540.163818359375,
+      "loss": 40.5046,
+      "rewards/accuracies": 0.5400000810623169,
+      "rewards/chosen": 0.018591446802020073,
+      "rewards/margins": -0.0024230503477156162,
+      "rewards/rejected": 0.02101449854671955,
+      "step": 180
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.6735867239874904e-07,
+      "logits/chosen": -1.8637840747833252,
+      "logits/rejected": -1.7640159130096436,
+      "logps/chosen": -3237.434814453125,
+      "logps/rejected": -2429.197998046875,
+      "loss": 36.3042,
+      "rewards/accuracies": 0.6200000047683716,
+      "rewards/chosen": 0.04794805496931076,
+      "rewards/margins": 0.019117821007966995,
+      "rewards/rejected": 0.028830235823988914,
+      "step": 190
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.6148531730223733e-07,
+      "logits/chosen": -1.6909841299057007,
+      "logits/rejected": -1.6915366649627686,
+      "logps/chosen": -2649.89404296875,
+      "logps/rejected": -2436.87353515625,
+      "loss": 28.1241,
+      "rewards/accuracies": 0.5300000309944153,
+      "rewards/chosen": 0.007661645300686359,
+      "rewards/margins": 0.0055509163066744804,
+      "rewards/rejected": 0.0021107294596731663,
+      "step": 200
+    },
+    {
+      "epoch": 0.26,
+      "eval_logits/chosen": -1.681164264678955,
+      "eval_logits/rejected": -1.618328332901001,
+      "eval_logps/chosen": -2808.258056640625,
+      "eval_logps/rejected": -2494.01953125,
+      "eval_loss": 32.517486572265625,
+      "eval_rewards/accuracies": 0.5234375,
+      "eval_rewards/chosen": 0.008527392521500587,
+      "eval_rewards/margins": 0.012391308322548866,
+      "eval_rewards/rejected": -0.0038639232516288757,
+      "eval_runtime": 113.682,
+      "eval_samples_per_second": 17.593,
+      "eval_steps_per_second": 0.281,
+      "step": 200
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.5516979064173524e-07,
+      "logits/chosen": -1.749903917312622,
+      "logits/rejected": -1.7615283727645874,
+      "logps/chosen": -2285.7451171875,
+      "logps/rejected": -2269.229736328125,
+      "loss": 25.9535,
+      "rewards/accuracies": 0.6100000143051147,
+      "rewards/chosen": 0.011981850489974022,
+      "rewards/margins": 0.014764687046408653,
+      "rewards/rejected": -0.0027828349266201258,
+      "step": 210
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.484252968625277e-07,
+      "logits/chosen": -1.716509222984314,
+      "logits/rejected": -1.6396989822387695,
+      "logps/chosen": -2435.95556640625,
+      "logps/rejected": -1922.770751953125,
+      "loss": 28.3739,
+      "rewards/accuracies": 0.6200000047683716,
+      "rewards/chosen": 0.004359879065304995,
+      "rewards/margins": 0.007711753249168396,
+      "rewards/rejected": -0.0033518739510327578,
+      "step": 220
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.4126593729042193e-07,
+      "logits/chosen": -1.799469232559204,
+      "logits/rejected": -1.757004737854004,
+      "logps/chosen": -3254.6396484375,
+      "logps/rejected": -2515.59619140625,
+      "loss": 39.4707,
+      "rewards/accuracies": 0.5900000333786011,
+      "rewards/chosen": 0.03561704605817795,
+      "rewards/margins": 0.019830647855997086,
+      "rewards/rejected": 0.015786398202180862,
+      "step": 230
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 4.3370668064882397e-07,
+      "logits/chosen": -1.7325947284698486,
+      "logits/rejected": -1.7474550008773804,
+      "logps/chosen": -2579.47412109375,
+      "logps/rejected": -2328.500732421875,
+      "loss": 44.2727,
+      "rewards/accuracies": 0.5100000500679016,
+      "rewards/chosen": 0.04269097000360489,
+      "rewards/margins": 0.02060030959546566,
+      "rewards/rejected": 0.02209065482020378,
+      "step": 240
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 4.2576333176226944e-07,
+      "logits/chosen": -1.7366650104522705,
+      "logits/rejected": -1.706789255142212,
+      "logps/chosen": -2479.5576171875,
+      "logps/rejected": -2277.726318359375,
+      "loss": 29.5758,
+      "rewards/accuracies": 0.5300000309944153,
+      "rewards/chosen": 0.1058274507522583,
+      "rewards/margins": 0.013660475611686707,
+      "rewards/rejected": 0.0921669602394104,
+      "step": 250
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 4.17452498511841e-07,
+      "logits/chosen": -1.7807962894439697,
+      "logits/rejected": -1.7134149074554443,
+      "logps/chosen": -2989.12841796875,
+      "logps/rejected": -2354.25830078125,
+      "loss": 38.7316,
+      "rewards/accuracies": 0.5200000405311584,
+      "rewards/chosen": 0.023859605193138123,
+      "rewards/margins": 0.005521018523722887,
+      "rewards/rejected": 0.018338587135076523,
+      "step": 260
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 4.087915571115629e-07,
+      "logits/chosen": -1.8165556192398071,
+      "logits/rejected": -1.7687098979949951,
+      "logps/chosen": -2833.55859375,
+      "logps/rejected": -2183.32470703125,
+      "loss": 330.4642,
+      "rewards/accuracies": 0.5699999928474426,
+      "rewards/chosen": 0.031318746507167816,
+      "rewards/margins": 0.029996121302247047,
+      "rewards/rejected": 0.0013226259034126997,
+      "step": 270
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 3.997986157783715e-07,
+      "logits/chosen": -1.6980018615722656,
+      "logits/rejected": -1.589050531387329,
+      "logps/chosen": -3510.792236328125,
+      "logps/rejected": -2689.208984375,
+      "loss": 58.1646,
+      "rewards/accuracies": 0.5200000405311584,
+      "rewards/chosen": 0.014776378870010376,
+      "rewards/margins": 0.011294273659586906,
+      "rewards/rejected": 0.0034821047447621822,
+      "step": 280
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 3.9049247687162155e-07,
+      "logits/chosen": -1.7791646718978882,
+      "logits/rejected": -1.7399044036865234,
+      "logps/chosen": -2478.590576171875,
+      "logps/rejected": -2269.01416015625,
+      "loss": 31.6725,
+      "rewards/accuracies": 0.5699999928474426,
+      "rewards/chosen": 0.04884537309408188,
+      "rewards/margins": 0.0339895561337471,
+      "rewards/rejected": 0.014855814166367054,
+      "step": 290
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 3.8089259758128543e-07,
+      "logits/chosen": -1.670789361000061,
+      "logits/rejected": -1.6030629873275757,
+      "logps/chosen": -2726.465576171875,
+      "logps/rejected": -2119.26123046875,
+      "loss": 84.7591,
+      "rewards/accuracies": 0.5699999928474426,
+      "rewards/chosen": 0.021672677248716354,
+      "rewards/margins": -0.010785548016428947,
+      "rewards/rejected": 0.03245822712779045,
+      "step": 300
+    },
+    {
+      "epoch": 0.39,
+      "eval_logits/chosen": -1.660080909729004,
+      "eval_logits/rejected": -1.596778154373169,
+      "eval_logps/chosen": -2806.140625,
+      "eval_logps/rejected": -2492.270263671875,
+      "eval_loss": 47.80431365966797,
+      "eval_rewards/accuracies": 0.5390625,
+      "eval_rewards/chosen": 0.029702020809054375,
+      "eval_rewards/margins": 0.01607733778655529,
+      "eval_rewards/rejected": 0.013624681159853935,
+      "eval_runtime": 116.3019,
+      "eval_samples_per_second": 17.197,
+      "eval_steps_per_second": 0.275,
+      "step": 300
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 3.710190492470386e-07,
+      "logits/chosen": -1.6620228290557861,
+      "logits/rejected": -1.7311099767684937,
+      "logps/chosen": -2315.977294921875,
+      "logps/rejected": -2199.08251953125,
+      "loss": 43.6013,
+      "rewards/accuracies": 0.5400000214576721,
+      "rewards/chosen": 0.032384876161813736,
+      "rewards/margins": 0.008921505883336067,
+      "rewards/rejected": 0.02346337027847767,
+      "step": 310
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 3.6089247539328616e-07,
+      "logits/chosen": -1.7675050497055054,
+      "logits/rejected": -1.7156997919082642,
+      "logps/chosen": -2859.810791015625,
+      "logps/rejected": -2569.75537109375,
+      "loss": 38.8904,
+      "rewards/accuracies": 0.559999942779541,
+      "rewards/chosen": 0.020630866289138794,
+      "rewards/margins": 0.0018306337296962738,
+      "rewards/rejected": 0.01880022883415222,
+      "step": 320
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 3.5053404856787166e-07,
+      "logits/chosen": -1.6446609497070312,
+      "logits/rejected": -1.5918724536895752,
+      "logps/chosen": -3104.72802734375,
+      "logps/rejected": -2430.239013671875,
+      "loss": 84.9753,
+      "rewards/accuracies": 0.47999995946884155,
+      "rewards/chosen": 0.053660690784454346,
+      "rewards/margins": -0.005831834394484758,
+      "rewards/rejected": 0.05949252098798752,
+      "step": 330
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 3.399654260747078e-07,
+      "logits/chosen": -1.699196219444275,
+      "logits/rejected": -1.7045748233795166,
+      "logps/chosen": -2584.699462890625,
+      "logps/rejected": -2263.678466796875,
+      "loss": 38.1532,
+      "rewards/accuracies": 0.5300000309944153,
+      "rewards/chosen": 0.02709970250725746,
+      "rewards/margins": 0.01412280835211277,
+      "rewards/rejected": 0.012976895086467266,
+      "step": 340
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 3.2920870469288373e-07,
+      "logits/chosen": -1.7267532348632812,
+      "logits/rejected": -1.6659395694732666,
+      "logps/chosen": -2935.341796875,
+      "logps/rejected": -2503.583984375,
+      "loss": 47.1836,
+      "rewards/accuracies": 0.5199999809265137,
+      "rewards/chosen": 0.031215447932481766,
+      "rewards/margins": 0.022346725687384605,
+      "rewards/rejected": 0.008868719451129436,
+      "step": 350
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 3.182863744769218e-07,
+      "logits/chosen": -1.7288787364959717,
+      "logits/rejected": -1.6928844451904297,
+      "logps/chosen": -2811.489501953125,
+      "logps/rejected": -2596.68310546875,
+      "loss": 36.8176,
+      "rewards/accuracies": 0.5099999904632568,
+      "rewards/chosen": 0.1375296413898468,
+      "rewards/margins": 0.0820910781621933,
+      "rewards/rejected": 0.05543852597475052,
+      "step": 360
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 3.072212717347776e-07,
+      "logits/chosen": -1.7680120468139648,
+      "logits/rejected": -1.6781940460205078,
+      "logps/chosen": -3101.98583984375,
+      "logps/rejected": -2426.4716796875,
+      "loss": 36.7837,
+      "rewards/accuracies": 0.5199999809265137,
+      "rewards/chosen": 0.022122934460639954,
+      "rewards/margins": 0.011781491339206696,
+      "rewards/rejected": 0.010341441258788109,
+      "step": 370
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 2.9603653128189665e-07,
+      "logits/chosen": -1.6812299489974976,
+      "logits/rejected": -1.7215496301651,
+      "logps/chosen": -2823.8291015625,
+      "logps/rejected": -2762.53076171875,
+      "loss": 42.732,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.028385426849126816,
+      "rewards/margins": -0.006515379063785076,
+      "rewards/rejected": 0.03490080684423447,
+      "step": 380
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 2.8475553807115387e-07,
+      "logits/chosen": -1.8070951700210571,
+      "logits/rejected": -1.7426990270614624,
+      "logps/chosen": -2697.833251953125,
+      "logps/rejected": -2263.9990234375,
+      "loss": 55.8683,
+      "rewards/accuracies": 0.5099999904632568,
+      "rewards/chosen": 0.012206131592392921,
+      "rewards/margins": 0.011461116373538971,
+      "rewards/rejected": 0.0007450145785696805,
+      "step": 390
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 2.7340187829980883e-07,
+      "logits/chosen": -1.8249183893203735,
+      "logits/rejected": -1.7130759954452515,
+      "logps/chosen": -2940.11181640625,
+      "logps/rejected": -2463.068359375,
+      "loss": 40.7835,
+      "rewards/accuracies": 0.6000000238418579,
+      "rewards/chosen": 0.0059137181378901005,
+      "rewards/margins": 0.01795141212642193,
+      "rewards/rejected": -0.012037692591547966,
+      "step": 400
+    },
+    {
+      "epoch": 0.52,
+      "eval_logits/chosen": -1.6917269229888916,
+      "eval_logits/rejected": -1.628839373588562,
+      "eval_logps/chosen": -2807.42626953125,
+      "eval_logps/rejected": -2493.92041015625,
+      "eval_loss": 30.672218322753906,
+      "eval_rewards/accuracies": 0.5546875,
+      "eval_rewards/chosen": 0.016848012804985046,
+      "eval_rewards/margins": 0.019721925258636475,
+      "eval_rewards/rejected": -0.0028739143162965775,
+      "eval_runtime": 110.0303,
+      "eval_samples_per_second": 18.177,
+      "eval_steps_per_second": 0.291,
+      "step": 400
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 2.6199929009569996e-07,
+      "logits/chosen": -1.7034717798233032,
+      "logits/rejected": -1.707564353942871,
+      "logps/chosen": -2599.38330078125,
+      "logps/rejected": -2273.864990234375,
+      "loss": 43.9981,
+      "rewards/accuracies": 0.5600000023841858,
+      "rewards/chosen": 0.02160579524934292,
+      "rewards/margins": 0.0038177832029759884,
+      "rewards/rejected": 0.017788011580705643,
+      "step": 410
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 2.5057161388578505e-07,
+      "logits/chosen": -1.7964134216308594,
+      "logits/rejected": -1.730661392211914,
+      "logps/chosen": -3038.08740234375,
+      "logps/rejected": -2405.333740234375,
+      "loss": 31.4477,
+      "rewards/accuracies": 0.5600000619888306,
+      "rewards/chosen": 0.05013390630483627,
+      "rewards/margins": 0.029845798388123512,
+      "rewards/rejected": 0.02028810977935791,
+      "step": 420
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 2.391427425507943e-07,
+      "logits/chosen": -1.6959331035614014,
+      "logits/rejected": -1.6784296035766602,
+      "logps/chosen": -2696.2236328125,
+      "logps/rejected": -2173.48583984375,
+      "loss": 32.2174,
+      "rewards/accuracies": 0.5600000619888306,
+      "rewards/chosen": 0.01727980561554432,
+      "rewards/margins": 0.013602805323898792,
+      "rewards/rejected": 0.0036770000588148832,
+      "step": 430
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 2.2773657147021465e-07,
+      "logits/chosen": -1.8469693660736084,
+      "logits/rejected": -1.7459551095962524,
+      "logps/chosen": -3117.762451171875,
+      "logps/rejected": -2390.564208984375,
+      "loss": 37.6526,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.01079155970364809,
+      "rewards/margins": 0.011436818167567253,
+      "rewards/rejected": -0.00064525764901191,
+      "step": 440
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 2.1637694856204885e-07,
+      "logits/chosen": -1.751587152481079,
+      "logits/rejected": -1.6395552158355713,
+      "logps/chosen": -2887.770751953125,
+      "logps/rejected": -2129.771728515625,
+      "loss": 53.6906,
+      "rewards/accuracies": 0.550000011920929,
+      "rewards/chosen": 0.004514098167419434,
+      "rewards/margins": 0.00042482782737351954,
+      "rewards/rejected": 0.004089272115379572,
+      "step": 450
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 2.0508762442180743e-07,
+      "logits/chosen": -1.8443762063980103,
+      "logits/rejected": -1.792295217514038,
+      "logps/chosen": -2964.06494140625,
+      "logps/rejected": -2577.50244140625,
+      "loss": 62.4137,
+      "rewards/accuracies": 0.5600000023841858,
+      "rewards/chosen": 0.04290894791483879,
+      "rewards/margins": 0.01085699163377285,
+      "rewards/rejected": 0.03205195814371109,
+      "step": 460
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 1.93892202664981e-07,
+      "logits/chosen": -1.6403262615203857,
+      "logits/rejected": -1.712969183921814,
+      "logps/chosen": -2689.706787109375,
+      "logps/rejected": -2513.2998046875,
+      "loss": 31.7885,
+      "rewards/accuracies": 0.5100000500679016,
+      "rewards/chosen": 0.01223880797624588,
+      "rewards/margins": 0.011182873509824276,
+      "rewards/rejected": 0.0010559323709458113,
+      "step": 470
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 1.8281409057681686e-07,
+      "logits/chosen": -1.651449203491211,
+      "logits/rejected": -1.5920675992965698,
+      "logps/chosen": -3211.50341796875,
+      "logps/rejected": -2753.0322265625,
+      "loss": 103.2519,
+      "rewards/accuracies": 0.550000011920929,
+      "rewards/chosen": 0.0340498685836792,
+      "rewards/margins": 0.005449384916573763,
+      "rewards/rejected": 0.028600484132766724,
+      "step": 480
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 1.7187645017258195e-07,
+      "logits/chosen": -1.823428750038147,
+      "logits/rejected": -1.7740917205810547,
+      "logps/chosen": -2745.991455078125,
+      "logps/rejected": -2407.27978515625,
+      "loss": 48.2582,
+      "rewards/accuracies": 0.5300000309944153,
+      "rewards/chosen": 0.03057839907705784,
+      "rewards/margins": 0.003420495195314288,
+      "rewards/rejected": 0.027157902717590332,
+      "step": 490
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 1.6110214977063343e-07,
+      "logits/chosen": -1.7967636585235596,
+      "logits/rejected": -1.7410199642181396,
+      "logps/chosen": -2905.251708984375,
+      "logps/rejected": -2435.401611328125,
+      "loss": 36.2204,
+      "rewards/accuracies": 0.5,
+      "rewards/chosen": 0.013903191313147545,
+      "rewards/margins": 0.00013892585411667824,
+      "rewards/rejected": 0.013764267787337303,
+      "step": 500
+    },
+    {
+      "epoch": 0.65,
+      "eval_logits/chosen": -1.6842743158340454,
+      "eval_logits/rejected": -1.6236169338226318,
+      "eval_logps/chosen": -2806.076171875,
+      "eval_logps/rejected": -2491.544677734375,
+      "eval_loss": 31.220157623291016,
+      "eval_rewards/accuracies": 0.53515625,
+      "eval_rewards/chosen": 0.030346479266881943,
+      "eval_rewards/margins": 0.009465347044169903,
+      "eval_rewards/rejected": 0.020881133154034615,
+      "eval_runtime": 112.3374,
+      "eval_samples_per_second": 17.804,
+      "eval_steps_per_second": 0.285,
+      "step": 500
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 1.5051371617954777e-07,
+      "logits/chosen": -1.6810442209243774,
+      "logits/rejected": -1.6596931219100952,
+      "logps/chosen": -2559.396728515625,
+      "logps/rejected": -2228.120361328125,
+      "loss": 44.2046,
+      "rewards/accuracies": 0.5399999618530273,
+      "rewards/chosen": 0.016220757737755775,
+      "rewards/margins": 0.007862111553549767,
+      "rewards/rejected": 0.00835864432156086,
+      "step": 510
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 1.4013328759927622e-07,
+      "logits/chosen": -1.6315361261367798,
+      "logits/rejected": -1.6191142797470093,
+      "logps/chosen": -2893.280029296875,
+      "logps/rejected": -2805.344970703125,
+      "loss": 31.2508,
+      "rewards/accuracies": 0.6299999952316284,
+      "rewards/chosen": 0.02805119752883911,
+      "rewards/margins": 0.01223050244152546,
+      "rewards/rejected": 0.0158206969499588,
+      "step": 520
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 1.2998256733479896e-07,
+      "logits/chosen": -1.810739278793335,
+      "logits/rejected": -1.8173195123672485,
+      "logps/chosen": -2332.383056640625,
+      "logps/rejected": -1922.5443115234375,
+      "loss": 226.0483,
+      "rewards/accuracies": 0.5600000023841858,
+      "rewards/chosen": 0.020228227600455284,
+      "rewards/margins": 0.009395391680300236,
+      "rewards/rejected": 0.010832836851477623,
+      "step": 530
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 1.200827784190537e-07,
+      "logits/chosen": -1.6795597076416016,
+      "logits/rejected": -1.6883628368377686,
+      "logps/chosen": -3027.91796875,
+      "logps/rejected": -2619.313232421875,
+      "loss": 29.3654,
+      "rewards/accuracies": 0.5600000619888306,
+      "rewards/chosen": 0.01968817412853241,
+      "rewards/margins": 0.008831174112856388,
+      "rewards/rejected": 0.010857000946998596,
+      "step": 540
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 1.1045461924001323e-07,
+      "logits/chosen": -1.791738748550415,
+      "logits/rejected": -1.8031442165374756,
+      "logps/chosen": -2852.6904296875,
+      "logps/rejected": -2462.853271484375,
+      "loss": 45.3966,
+      "rewards/accuracies": 0.46000003814697266,
+      "rewards/chosen": 0.010838394984602928,
+      "rewards/margins": 0.003685446921736002,
+      "rewards/rejected": 0.007152946200221777,
+      "step": 550
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 1.0111822026468514e-07,
+      "logits/chosen": -1.7872514724731445,
+      "logits/rejected": -1.658860445022583,
+      "logps/chosen": -2903.530029296875,
+      "logps/rejected": -2319.64599609375,
+      "loss": 67.4473,
+      "rewards/accuracies": 0.5700000524520874,
+      "rewards/chosen": 0.009608490392565727,
+      "rewards/margins": 0.004570655524730682,
+      "rewards/rejected": 0.0050378344021737576,
+      "step": 560
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 9.209310195051581e-08,
+      "logits/chosen": -1.8252109289169312,
+      "logits/rejected": -1.6855742931365967,
+      "logps/chosen": -2538.860107421875,
+      "logps/rejected": -1955.6048583984375,
+      "loss": 63.0174,
+      "rewards/accuracies": 0.6299999952316284,
+      "rewards/chosen": 0.04829864576458931,
+      "rewards/margins": 0.022980675101280212,
+      "rewards/rejected": 0.025317972525954247,
+      "step": 570
+    },
+    {
+      "epoch": 0.76,
+      "learning_rate": 8.339813393219713e-08,
+      "logits/chosen": -1.739793062210083,
+      "logits/rejected": -1.641005516052246,
+      "logps/chosen": -2791.561767578125,
+      "logps/rejected": -2475.72998046875,
+      "loss": 59.8369,
+      "rewards/accuracies": 0.5699999928474426,
+      "rewards/chosen": 0.05231914669275284,
+      "rewards/margins": 0.021858692169189453,
+      "rewards/rejected": 0.030460450798273087,
+      "step": 580
+    },
+    {
+      "epoch": 0.77,
+      "learning_rate": 7.505149556920698e-08,
+      "logits/chosen": -1.8431494235992432,
+      "logits/rejected": -1.7774893045425415,
+      "logps/chosen": -2542.13427734375,
+      "logps/rejected": -2193.825927734375,
+      "loss": 29.3999,
+      "rewards/accuracies": 0.5800000429153442,
+      "rewards/chosen": 0.04752471297979355,
+      "rewards/margins": 0.017444033175706863,
+      "rewards/rejected": 0.030080681666731834,
+      "step": 590
+    },
+    {
+      "epoch": 0.78,
+      "learning_rate": 6.707063793657064e-08,
+      "logits/chosen": -1.7773969173431396,
+      "logits/rejected": -1.6891686916351318,
+      "logps/chosen": -2942.21240234375,
+      "logps/rejected": -2429.352294921875,
+      "loss": 99.7738,
+      "rewards/accuracies": 0.6200000047683716,
+      "rewards/chosen": 0.03306427597999573,
+      "rewards/margins": 0.01405587512999773,
+      "rewards/rejected": 0.019008399918675423,
+      "step": 600
+    },
+    {
+      "epoch": 0.78,
+      "eval_logits/chosen": -1.6827195882797241,
+      "eval_logits/rejected": -1.6222153902053833,
+      "eval_logps/chosen": -2804.348388671875,
+      "eval_logps/rejected": -2489.908935546875,
+      "eval_loss": 33.74028778076172,
+      "eval_rewards/accuracies": 0.5390625,
+      "eval_rewards/chosen": 0.04762275516986847,
+      "eval_rewards/margins": 0.010385587811470032,
+      "eval_rewards/rejected": 0.037237171083688736,
+      "eval_runtime": 106.3716,
+      "eval_samples_per_second": 18.802,
+      "eval_steps_per_second": 0.301,
+      "step": 600
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 5.947224733831363e-08,
+      "logits/chosen": -1.759399175643921,
+      "logits/rejected": -1.7431520223617554,
+      "logps/chosen": -2756.701416015625,
+      "logps/rejected": -2470.905029296875,
+      "loss": 51.5387,
+      "rewards/accuracies": 0.5199999809265137,
+      "rewards/chosen": 0.011415710672736168,
+      "rewards/margins": 0.009652274660766125,
+      "rewards/rejected": 0.0017634350806474686,
+      "step": 610
+    },
+    {
+      "epoch": 0.81,
+      "learning_rate": 5.227221041988955e-08,
+      "logits/chosen": -1.7857062816619873,
+      "logits/rejected": -1.725630760192871,
+      "logps/chosen": -2520.410400390625,
+      "logps/rejected": -2319.78564453125,
+      "loss": 28.3912,
+      "rewards/accuracies": 0.5900000333786011,
+      "rewards/chosen": 0.01811736635863781,
+      "rewards/margins": 0.01297797542065382,
+      "rewards/rejected": 0.005139390472322702,
+      "step": 620
+    },
+    {
+      "epoch": 0.82,
+      "learning_rate": 4.548558095252758e-08,
+      "logits/chosen": -1.6374757289886475,
+      "logits/rejected": -1.673044204711914,
+      "logps/chosen": -2845.2119140625,
+      "logps/rejected": -2698.294677734375,
+      "loss": 42.0619,
+      "rewards/accuracies": 0.48000001907348633,
+      "rewards/chosen": 0.030720695853233337,
+      "rewards/margins": 0.020541973412036896,
+      "rewards/rejected": 0.010178723372519016,
+      "step": 630
+    },
+    {
+      "epoch": 0.84,
+      "learning_rate": 3.9126548358945635e-08,
+      "logits/chosen": -1.7063062191009521,
+      "logits/rejected": -1.6988853216171265,
+      "logps/chosen": -3136.520263671875,
+      "logps/rejected": -2731.25927734375,
+      "loss": 46.0608,
+      "rewards/accuracies": 0.5600000619888306,
+      "rewards/chosen": 0.029350021854043007,
+      "rewards/margins": 0.010425332933664322,
+      "rewards/rejected": 0.018924688920378685,
+      "step": 640
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 3.3208408046234896e-08,
+      "logits/chosen": -1.8164535760879517,
+      "logits/rejected": -1.7656440734863281,
+      "logps/chosen": -2538.8046875,
+      "logps/rejected": -2061.81591796875,
+      "loss": 40.0838,
+      "rewards/accuracies": 0.550000011920929,
+      "rewards/chosen": 0.01678399369120598,
+      "rewards/margins": 0.009666666388511658,
+      "rewards/rejected": 0.007117328234016895,
+      "step": 650
+    },
+    {
+      "epoch": 0.86,
+      "learning_rate": 2.774353360794493e-08,
+      "logits/chosen": -1.7155154943466187,
+      "logits/rejected": -1.7442939281463623,
+      "logps/chosen": -2761.740966796875,
+      "logps/rejected": -2534.80419921875,
+      "loss": 36.8374,
+      "rewards/accuracies": 0.64000004529953,
+      "rewards/chosen": 0.03588343411684036,
+      "rewards/margins": 0.037469957023859024,
+      "rewards/rejected": -0.0015865217428654432,
+      "step": 660
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 2.2743350953487422e-08,
+      "logits/chosen": -1.6992709636688232,
+      "logits/rejected": -1.7416222095489502,
+      "logps/chosen": -2850.97705078125,
+      "logps/rejected": -2569.76611328125,
+      "loss": 86.4259,
+      "rewards/accuracies": 0.5300000905990601,
+      "rewards/chosen": 0.018378589302301407,
+      "rewards/margins": 0.004103804472833872,
+      "rewards/rejected": 0.014274786226451397,
+      "step": 670
+    },
+    {
+      "epoch": 0.89,
+      "learning_rate": 1.8218314418949387e-08,
+      "logits/chosen": -1.718764305114746,
+      "logits/rejected": -1.6741740703582764,
+      "logps/chosen": -2353.093017578125,
+      "logps/rejected": -2190.77001953125,
+      "loss": 46.1473,
+      "rewards/accuracies": 0.5699999928474426,
+      "rewards/chosen": 0.0031641994137316942,
+      "rewards/margins": 0.0004996396601200104,
+      "rewards/rejected": 0.002664559753611684,
+      "step": 680
+    },
+    {
+      "epoch": 0.9,
+      "learning_rate": 1.4177884909263277e-08,
+      "logits/chosen": -1.6867101192474365,
+      "logits/rejected": -1.652515172958374,
+      "logps/chosen": -2937.97412109375,
+      "logps/rejected": -2552.07177734375,
+      "loss": 37.0029,
+      "rewards/accuracies": 0.5199999809265137,
+      "rewards/chosen": 0.007288885302841663,
+      "rewards/margins": -0.0001541988895041868,
+      "rewards/rejected": 0.007443083915859461,
+      "step": 690
+    },
+    {
+      "epoch": 0.92,
+      "learning_rate": 1.063051011743335e-08,
+      "logits/chosen": -1.7554800510406494,
+      "logits/rejected": -1.7419729232788086,
+      "logps/chosen": -2755.109619140625,
+      "logps/rejected": -2368.246337890625,
+      "loss": 41.8506,
+      "rewards/accuracies": 0.46000003814697266,
+      "rewards/chosen": 0.006798497401177883,
+      "rewards/margins": 0.010020612739026546,
+      "rewards/rejected": -0.0032221146393567324,
+      "step": 700
+    },
+    {
+      "epoch": 0.92,
+      "eval_logits/chosen": -1.682308554649353,
+      "eval_logits/rejected": -1.6210675239562988,
+      "eval_logps/chosen": -2806.1005859375,
+      "eval_logps/rejected": -2491.68505859375,
+      "eval_loss": 32.91334915161133,
+      "eval_rewards/accuracies": 0.5546875,
+      "eval_rewards/chosen": 0.030103469267487526,
+      "eval_rewards/margins": 0.01062812004238367,
+      "eval_rewards/rejected": 0.01947534643113613,
+      "eval_runtime": 110.9725,
+      "eval_samples_per_second": 18.022,
+      "eval_steps_per_second": 0.288,
+      "step": 700
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 7.58360686217671e-09,
+      "logits/chosen": -1.7843902111053467,
+      "logits/rejected": -1.6857761144638062,
+      "logps/chosen": -2821.310546875,
+      "logps/rejected": -2445.586669921875,
+      "loss": 44.0616,
+      "rewards/accuracies": 0.6000000238418579,
+      "rewards/chosen": 0.025277357548475266,
+      "rewards/margins": 0.014521745964884758,
+      "rewards/rejected": 0.010755611583590508,
+      "step": 710
+    },
+    {
+      "epoch": 0.94,
+      "learning_rate": 5.043545580906694e-09,
+      "logits/chosen": -1.7102206945419312,
+      "logits/rejected": -1.6009165048599243,
+      "logps/chosen": -2682.336669921875,
+      "logps/rejected": -2234.36279296875,
+      "loss": 44.9558,
+      "rewards/accuracies": 0.6299999952316284,
+      "rewards/chosen": 0.018401915207505226,
+      "rewards/margins": 0.030992329120635986,
+      "rewards/rejected": -0.012590417638421059,
+      "step": 720
+    },
+    {
+      "epoch": 0.95,
+      "learning_rate": 3.015637010480576e-09,
+      "logits/chosen": -1.7701480388641357,
+      "logits/rejected": -1.7436597347259521,
+      "logps/chosen": -3042.88330078125,
+      "logps/rejected": -2501.55615234375,
+      "loss": 35.3819,
+      "rewards/accuracies": 0.5199999809265137,
+      "rewards/chosen": 0.012798592448234558,
+      "rewards/margins": -0.0020437492057681084,
+      "rewards/rejected": 0.014842341654002666,
+      "step": 730
+    },
+    {
+      "epoch": 0.97,
+      "learning_rate": 1.5041210835596285e-09,
+      "logits/chosen": -1.703537940979004,
+      "logits/rejected": -1.6897165775299072,
+      "logps/chosen": -2817.1484375,
+      "logps/rejected": -2390.23095703125,
+      "loss": 55.7217,
+      "rewards/accuracies": 0.5800000429153442,
+      "rewards/chosen": 0.022753870114684105,
+      "rewards/margins": 0.017634030431509018,
+      "rewards/rejected": 0.005119838751852512,
+      "step": 740
+    },
+    {
+      "epoch": 0.98,
+      "learning_rate": 5.121580637968137e-10,
+      "logits/chosen": -1.7322509288787842,
+      "logits/rejected": -1.6345192193984985,
+      "logps/chosen": -2836.63623046875,
+      "logps/rejected": -2363.93505859375,
+      "loss": 67.2692,
+      "rewards/accuracies": 0.6300000548362732,
+      "rewards/chosen": 0.02889620140194893,
+      "rewards/margins": 0.014323192648589611,
+      "rewards/rejected": 0.014573007822036743,
+      "step": 750
+    },
+    {
+      "epoch": 0.99,
+      "learning_rate": 4.1821938386477075e-11,
+      "logits/chosen": -1.7962977886199951,
+      "logits/rejected": -1.7127879858016968,
+      "logps/chosen": -2801.806640625,
+      "logps/rejected": -2341.844970703125,
+      "loss": 46.0674,
+      "rewards/accuracies": 0.5700000524520874,
+      "rewards/chosen": 0.009056088514626026,
+      "rewards/margins": 0.013271180912852287,
+      "rewards/rejected": -0.004215092398226261,
+      "step": 760
+    },
+    {
+      "epoch": 1.0,
+      "step": 764,
+      "total_flos": 0.0,
+      "train_loss": 46.613421885577296,
+      "train_runtime": 4597.6924,
+      "train_samples_per_second": 13.297,
+      "train_steps_per_second": 0.166
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 764,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 0.0,
+  "train_batch_size": 5,
+  "trial_name": null,
+  "trial_params": null
+}