MilaWang committed on Mar 28

Commit

0f8dd66

verified ·

1 Parent(s): b1a7cb3

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/README.md +203 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/README.md +203 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/trainer_state.json +846 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/README.md +203 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/trainer_state.json +1666 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2318/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/README.md +203 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/trainer_state.json +2486 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3477/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/README.md +203 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4636/trainer_state.json +3306 -0

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/README.md ADDED Viewed

	@@ -0,0 +1,203 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a396c86cea44bd83b9d8a41942862cab1058a81f7839cc727d2809918a91b7a
+size 109069176

	@@ -0,0 +1,203 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a396c86cea44bd83b9d8a41942862cab1058a81f7839cc727d2809918a91b7a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2887b3f056c0ab921703c8c30dcfb58942d353c14d60672daecade1be31a7507
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c032c2ea48bffa522b4314767c489a63854b7728f8a516755a0e0e5a2240712
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c93b6bb2ecd1834fe3da6e50e7142ce04534e75a95b0772916387b7fe3753b6
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,846 @@

+{
+  "best_metric": 1.7723218202590942,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159",
+  "epoch": 1.0,
+  "eval_steps": 10,
+  "global_step": 1159,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008628127696289905,
+      "grad_norm": 1.7177482843399048,
+      "learning_rate": 0.0002,
+      "loss": 2.5586,
+      "step": 10
+    },
+    {
+      "epoch": 0.01725625539257981,
+      "grad_norm": 2.8122410774230957,
+      "learning_rate": 0.0002,
+      "loss": 2.2918,
+      "step": 20
+    },
+    {
+      "epoch": 0.025884383088869714,
+      "grad_norm": 1.6668062210083008,
+      "learning_rate": 0.0002,
+      "loss": 2.0885,
+      "step": 30
+    },
+    {
+      "epoch": 0.03451251078515962,
+      "grad_norm": 1.745869755744934,
+      "learning_rate": 0.0002,
+      "loss": 2.0469,
+      "step": 40
+    },
+    {
+      "epoch": 0.04314063848144953,
+      "grad_norm": 1.807971477508545,
+      "learning_rate": 0.0002,
+      "loss": 2.0754,
+      "step": 50
+    },
+    {
+      "epoch": 0.05176876617773943,
+      "grad_norm": 2.4009974002838135,
+      "learning_rate": 0.0002,
+      "loss": 2.0867,
+      "step": 60
+    },
+    {
+      "epoch": 0.060396893874029335,
+      "grad_norm": 2.0974676609039307,
+      "learning_rate": 0.0002,
+      "loss": 1.8575,
+      "step": 70
+    },
+    {
+      "epoch": 0.06902502157031924,
+      "grad_norm": 1.7705916166305542,
+      "learning_rate": 0.0002,
+      "loss": 1.8921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07765314926660914,
+      "grad_norm": 1.4324289560317993,
+      "learning_rate": 0.0002,
+      "loss": 1.8119,
+      "step": 90
+    },
+    {
+      "epoch": 0.08628127696289906,
+      "grad_norm": 1.2521991729736328,
+      "learning_rate": 0.0002,
+      "loss": 1.8728,
+      "step": 100
+    },
+    {
+      "epoch": 0.09490940465918896,
+      "grad_norm": 1.3328145742416382,
+      "learning_rate": 0.0002,
+      "loss": 1.8168,
+      "step": 110
+    },
+    {
+      "epoch": 0.10353753235547886,
+      "grad_norm": 2.2908742427825928,
+      "learning_rate": 0.0002,
+      "loss": 1.8236,
+      "step": 120
+    },
+    {
+      "epoch": 0.11216566005176877,
+      "grad_norm": 1.540981411933899,
+      "learning_rate": 0.0002,
+      "loss": 1.8732,
+      "step": 130
+    },
+    {
+      "epoch": 0.12079378774805867,
+      "grad_norm": 1.1785069704055786,
+      "learning_rate": 0.0002,
+      "loss": 1.8138,
+      "step": 140
+    },
+    {
+      "epoch": 0.12942191544434858,
+      "grad_norm": 1.3138738870620728,
+      "learning_rate": 0.0002,
+      "loss": 1.8655,
+      "step": 150
+    },
+    {
+      "epoch": 0.13805004314063848,
+      "grad_norm": 1.153215765953064,
+      "learning_rate": 0.0002,
+      "loss": 1.8418,
+      "step": 160
+    },
+    {
+      "epoch": 0.14667817083692838,
+      "grad_norm": 1.2071360349655151,
+      "learning_rate": 0.0002,
+      "loss": 1.8284,
+      "step": 170
+    },
+    {
+      "epoch": 0.15530629853321828,
+      "grad_norm": 1.3546127080917358,
+      "learning_rate": 0.0002,
+      "loss": 1.8645,
+      "step": 180
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 1.1494425535202026,
+      "learning_rate": 0.0002,
+      "loss": 1.8699,
+      "step": 190
+    },
+    {
+      "epoch": 0.1725625539257981,
+      "grad_norm": 0.982718825340271,
+      "learning_rate": 0.0002,
+      "loss": 1.7845,
+      "step": 200
+    },
+    {
+      "epoch": 0.181190681622088,
+      "grad_norm": 1.1329727172851562,
+      "learning_rate": 0.0002,
+      "loss": 1.8237,
+      "step": 210
+    },
+    {
+      "epoch": 0.1898188093183779,
+      "grad_norm": 1.1397384405136108,
+      "learning_rate": 0.0002,
+      "loss": 1.8516,
+      "step": 220
+    },
+    {
+      "epoch": 0.1984469370146678,
+      "grad_norm": 1.2424808740615845,
+      "learning_rate": 0.0002,
+      "loss": 1.7504,
+      "step": 230
+    },
+    {
+      "epoch": 0.2070750647109577,
+      "grad_norm": 1.1463897228240967,
+      "learning_rate": 0.0002,
+      "loss": 1.7626,
+      "step": 240
+    },
+    {
+      "epoch": 0.21570319240724764,
+      "grad_norm": 1.2353036403656006,
+      "learning_rate": 0.0002,
+      "loss": 1.7977,
+      "step": 250
+    },
+    {
+      "epoch": 0.22433132010353754,
+      "grad_norm": 1.0135247707366943,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 260
+    },
+    {
+      "epoch": 0.23295944779982744,
+      "grad_norm": 1.1388282775878906,
+      "learning_rate": 0.0002,
+      "loss": 1.7678,
+      "step": 270
+    },
+    {
+      "epoch": 0.24158757549611734,
+      "grad_norm": 1.1262438297271729,
+      "learning_rate": 0.0002,
+      "loss": 1.7895,
+      "step": 280
+    },
+    {
+      "epoch": 0.25021570319240727,
+      "grad_norm": 1.0581450462341309,
+      "learning_rate": 0.0002,
+      "loss": 1.826,
+      "step": 290
+    },
+    {
+      "epoch": 0.25884383088869717,
+      "grad_norm": 1.1737277507781982,
+      "learning_rate": 0.0002,
+      "loss": 1.7269,
+      "step": 300
+    },
+    {
+      "epoch": 0.26747195858498707,
+      "grad_norm": 1.0906627178192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7975,
+      "step": 310
+    },
+    {
+      "epoch": 0.27610008628127697,
+      "grad_norm": 1.0010069608688354,
+      "learning_rate": 0.0002,
+      "loss": 1.7594,
+      "step": 320
+    },
+    {
+      "epoch": 0.28472821397756687,
+      "grad_norm": 1.2149732112884521,
+      "learning_rate": 0.0002,
+      "loss": 1.7998,
+      "step": 330
+    },
+    {
+      "epoch": 0.29335634167385677,
+      "grad_norm": 1.293990969657898,
+      "learning_rate": 0.0002,
+      "loss": 1.8079,
+      "step": 340
+    },
+    {
+      "epoch": 0.30198446937014667,
+      "grad_norm": 1.0082058906555176,
+      "learning_rate": 0.0002,
+      "loss": 1.7629,
+      "step": 350
+    },
+    {
+      "epoch": 0.31061259706643657,
+      "grad_norm": 1.0307148694992065,
+      "learning_rate": 0.0002,
+      "loss": 1.8001,
+      "step": 360
+    },
+    {
+      "epoch": 0.31924072476272647,
+      "grad_norm": 0.9646756649017334,
+      "learning_rate": 0.0002,
+      "loss": 1.7456,
+      "step": 370
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 1.105623722076416,
+      "learning_rate": 0.0002,
+      "loss": 1.7979,
+      "step": 380
+    },
+    {
+      "epoch": 0.3364969801553063,
+      "grad_norm": 0.9365625977516174,
+      "learning_rate": 0.0002,
+      "loss": 1.7313,
+      "step": 390
+    },
+    {
+      "epoch": 0.3451251078515962,
+      "grad_norm": 1.1378847360610962,
+      "learning_rate": 0.0002,
+      "loss": 1.809,
+      "step": 400
+    },
+    {
+      "epoch": 0.3537532355478861,
+      "grad_norm": 1.1266193389892578,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 410
+    },
+    {
+      "epoch": 0.362381363244176,
+      "grad_norm": 1.0886635780334473,
+      "learning_rate": 0.0002,
+      "loss": 1.8096,
+      "step": 420
+    },
+    {
+      "epoch": 0.3710094909404659,
+      "grad_norm": 1.0463931560516357,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 430
+    },
+    {
+      "epoch": 0.3796376186367558,
+      "grad_norm": 1.0923888683319092,
+      "learning_rate": 0.0002,
+      "loss": 1.7936,
+      "step": 440
+    },
+    {
+      "epoch": 0.3882657463330457,
+      "grad_norm": 1.1386370658874512,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 450
+    },
+    {
+      "epoch": 0.3968938740293356,
+      "grad_norm": 1.0098074674606323,
+      "learning_rate": 0.0002,
+      "loss": 1.7211,
+      "step": 460
+    },
+    {
+      "epoch": 0.4055220017256255,
+      "grad_norm": 1.1237372159957886,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 470
+    },
+    {
+      "epoch": 0.4141501294219154,
+      "grad_norm": 1.0218915939331055,
+      "learning_rate": 0.0002,
+      "loss": 1.854,
+      "step": 480
+    },
+    {
+      "epoch": 0.4227782571182053,
+      "grad_norm": 0.9998831748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.8548,
+      "step": 490
+    },
+    {
+      "epoch": 0.4314063848144953,
+      "grad_norm": 1.0424970388412476,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 500
+    },
+    {
+      "epoch": 0.4400345125107852,
+      "grad_norm": 0.903372585773468,
+      "learning_rate": 0.0002,
+      "loss": 1.788,
+      "step": 510
+    },
+    {
+      "epoch": 0.4486626402070751,
+      "grad_norm": 1.0864766836166382,
+      "learning_rate": 0.0002,
+      "loss": 1.8293,
+      "step": 520
+    },
+    {
+      "epoch": 0.457290767903365,
+      "grad_norm": 0.9694207310676575,
+      "learning_rate": 0.0002,
+      "loss": 1.8402,
+      "step": 530
+    },
+    {
+      "epoch": 0.4659188955996549,
+      "grad_norm": 1.2796396017074585,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 540
+    },
+    {
+      "epoch": 0.4745470232959448,
+      "grad_norm": 1.0316239595413208,
+      "learning_rate": 0.0002,
+      "loss": 1.7716,
+      "step": 550
+    },
+    {
+      "epoch": 0.4831751509922347,
+      "grad_norm": 1.0445313453674316,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 560
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 1.1078376770019531,
+      "learning_rate": 0.0002,
+      "loss": 1.8082,
+      "step": 570
+    },
+    {
+      "epoch": 0.5004314063848145,
+      "grad_norm": 1.0551974773406982,
+      "learning_rate": 0.0002,
+      "loss": 1.7298,
+      "step": 580
+    },
+    {
+      "epoch": 0.5090595340811044,
+      "grad_norm": 1.114853858947754,
+      "learning_rate": 0.0002,
+      "loss": 1.8673,
+      "step": 590
+    },
+    {
+      "epoch": 0.5176876617773943,
+      "grad_norm": 1.0642707347869873,
+      "learning_rate": 0.0002,
+      "loss": 1.7684,
+      "step": 600
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 1.088079810142517,
+      "learning_rate": 0.0002,
+      "loss": 1.8367,
+      "step": 610
+    },
+    {
+      "epoch": 0.5349439171699741,
+      "grad_norm": 1.4029462337493896,
+      "learning_rate": 0.0002,
+      "loss": 1.775,
+      "step": 620
+    },
+    {
+      "epoch": 0.543572044866264,
+      "grad_norm": 1.2136136293411255,
+      "learning_rate": 0.0002,
+      "loss": 1.7771,
+      "step": 630
+    },
+    {
+      "epoch": 0.5522001725625539,
+      "grad_norm": 0.9642075896263123,
+      "learning_rate": 0.0002,
+      "loss": 1.8006,
+      "step": 640
+    },
+    {
+      "epoch": 0.5608283002588438,
+      "grad_norm": 1.0879552364349365,
+      "learning_rate": 0.0002,
+      "loss": 1.7478,
+      "step": 650
+    },
+    {
+      "epoch": 0.5694564279551337,
+      "grad_norm": 1.1766546964645386,
+      "learning_rate": 0.0002,
+      "loss": 1.8427,
+      "step": 660
+    },
+    {
+      "epoch": 0.5780845556514237,
+      "grad_norm": 1.582840085029602,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 670
+    },
+    {
+      "epoch": 0.5867126833477135,
+      "grad_norm": 1.0681092739105225,
+      "learning_rate": 0.0002,
+      "loss": 1.8093,
+      "step": 680
+    },
+    {
+      "epoch": 0.5953408110440035,
+      "grad_norm": 1.103897213935852,
+      "learning_rate": 0.0002,
+      "loss": 1.8067,
+      "step": 690
+    },
+    {
+      "epoch": 0.6039689387402933,
+      "grad_norm": 1.0974211692810059,
+      "learning_rate": 0.0002,
+      "loss": 1.7425,
+      "step": 700
+    },
+    {
+      "epoch": 0.6125970664365833,
+      "grad_norm": 1.1002469062805176,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 710
+    },
+    {
+      "epoch": 0.6212251941328731,
+      "grad_norm": 1.0022329092025757,
+      "learning_rate": 0.0002,
+      "loss": 1.8106,
+      "step": 720
+    },
+    {
+      "epoch": 0.6298533218291631,
+      "grad_norm": 1.0089571475982666,
+      "learning_rate": 0.0002,
+      "loss": 1.7647,
+      "step": 730
+    },
+    {
+      "epoch": 0.6384814495254529,
+      "grad_norm": 0.9531904458999634,
+      "learning_rate": 0.0002,
+      "loss": 1.8033,
+      "step": 740
+    },
+    {
+      "epoch": 0.6471095772217429,
+      "grad_norm": 1.162675380706787,
+      "learning_rate": 0.0002,
+      "loss": 1.7644,
+      "step": 750
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 1.0488134622573853,
+      "learning_rate": 0.0002,
+      "loss": 1.7531,
+      "step": 760
+    },
+    {
+      "epoch": 0.6643658326143227,
+      "grad_norm": 1.12964928150177,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 770
+    },
+    {
+      "epoch": 0.6729939603106126,
+      "grad_norm": 1.0867345333099365,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 780
+    },
+    {
+      "epoch": 0.6816220880069025,
+      "grad_norm": 1.1084282398223877,
+      "learning_rate": 0.0002,
+      "loss": 1.7797,
+      "step": 790
+    },
+    {
+      "epoch": 0.6902502157031924,
+      "grad_norm": 0.9905423521995544,
+      "learning_rate": 0.0002,
+      "loss": 1.7792,
+      "step": 800
+    },
+    {
+      "epoch": 0.6988783433994823,
+      "grad_norm": 1.18604576587677,
+      "learning_rate": 0.0002,
+      "loss": 1.7825,
+      "step": 810
+    },
+    {
+      "epoch": 0.7075064710957722,
+      "grad_norm": 1.0819629430770874,
+      "learning_rate": 0.0002,
+      "loss": 1.8242,
+      "step": 820
+    },
+    {
+      "epoch": 0.7161345987920621,
+      "grad_norm": 2.0091195106506348,
+      "learning_rate": 0.0002,
+      "loss": 1.7916,
+      "step": 830
+    },
+    {
+      "epoch": 0.724762726488352,
+      "grad_norm": 1.0371277332305908,
+      "learning_rate": 0.0002,
+      "loss": 1.8186,
+      "step": 840
+    },
+    {
+      "epoch": 0.7333908541846419,
+      "grad_norm": 1.217102289199829,
+      "learning_rate": 0.0002,
+      "loss": 1.7937,
+      "step": 850
+    },
+    {
+      "epoch": 0.7420189818809318,
+      "grad_norm": 1.0528525114059448,
+      "learning_rate": 0.0002,
+      "loss": 1.7317,
+      "step": 860
+    },
+    {
+      "epoch": 0.7506471095772217,
+      "grad_norm": 1.1398800611495972,
+      "learning_rate": 0.0002,
+      "loss": 1.7757,
+      "step": 870
+    },
+    {
+      "epoch": 0.7592752372735116,
+      "grad_norm": 1.1546001434326172,
+      "learning_rate": 0.0002,
+      "loss": 1.8326,
+      "step": 880
+    },
+    {
+      "epoch": 0.7679033649698016,
+      "grad_norm": 1.0745750665664673,
+      "learning_rate": 0.0002,
+      "loss": 1.7178,
+      "step": 890
+    },
+    {
+      "epoch": 0.7765314926660914,
+      "grad_norm": 1.1739161014556885,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 900
+    },
+    {
+      "epoch": 0.7851596203623814,
+      "grad_norm": 1.1932017803192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 910
+    },
+    {
+      "epoch": 0.7937877480586712,
+      "grad_norm": 1.143064022064209,
+      "learning_rate": 0.0002,
+      "loss": 1.7192,
+      "step": 920
+    },
+    {
+      "epoch": 0.8024158757549612,
+      "grad_norm": 1.200974464416504,
+      "learning_rate": 0.0002,
+      "loss": 1.7342,
+      "step": 930
+    },
+    {
+      "epoch": 0.811044003451251,
+      "grad_norm": 1.0878669023513794,
+      "learning_rate": 0.0002,
+      "loss": 1.7399,
+      "step": 940
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 1.0516951084136963,
+      "learning_rate": 0.0002,
+      "loss": 1.8019,
+      "step": 950
+    },
+    {
+      "epoch": 0.8283002588438308,
+      "grad_norm": 1.2017741203308105,
+      "learning_rate": 0.0002,
+      "loss": 1.7645,
+      "step": 960
+    },
+    {
+      "epoch": 0.8369283865401208,
+      "grad_norm": 0.9762169718742371,
+      "learning_rate": 0.0002,
+      "loss": 1.7367,
+      "step": 970
+    },
+    {
+      "epoch": 0.8455565142364107,
+      "grad_norm": 1.0837513208389282,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 980
+    },
+    {
+      "epoch": 0.8541846419327006,
+      "grad_norm": 1.155504822731018,
+      "learning_rate": 0.0002,
+      "loss": 1.8094,
+      "step": 990
+    },
+    {
+      "epoch": 0.8628127696289906,
+      "grad_norm": 1.067771315574646,
+      "learning_rate": 0.0002,
+      "loss": 1.7633,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8714408973252804,
+      "grad_norm": 1.2283565998077393,
+      "learning_rate": 0.0002,
+      "loss": 1.7993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8800690250215704,
+      "grad_norm": 1.1549772024154663,
+      "learning_rate": 0.0002,
+      "loss": 1.7362,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8886971527178602,
+      "grad_norm": 1.0022625923156738,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8973252804141502,
+      "grad_norm": 1.0237284898757935,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 1040
+    },
+    {
+      "epoch": 0.90595340811044,
+      "grad_norm": 1.1863008737564087,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 1050
+    },
+    {
+      "epoch": 0.91458153580673,
+      "grad_norm": 1.001204013824463,
+      "learning_rate": 0.0002,
+      "loss": 1.6951,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9232096635030198,
+      "grad_norm": 1.2686481475830078,
+      "learning_rate": 0.0002,
+      "loss": 1.7506,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9318377911993098,
+      "grad_norm": 1.0700076818466187,
+      "learning_rate": 0.0002,
+      "loss": 1.7064,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9404659188955996,
+      "grad_norm": 1.05950927734375,
+      "learning_rate": 0.0002,
+      "loss": 1.8015,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9490940465918896,
+      "grad_norm": 0.9669114947319031,
+      "learning_rate": 0.0002,
+      "loss": 1.8155,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9577221742881795,
+      "grad_norm": 1.1823079586029053,
+      "learning_rate": 0.0002,
+      "loss": 1.8074,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9663503019844694,
+      "grad_norm": 1.0857175588607788,
+      "learning_rate": 0.0002,
+      "loss": 1.7636,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9749784296807593,
+      "grad_norm": 1.1258848905563354,
+      "learning_rate": 0.0002,
+      "loss": 1.822,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 1.16336989402771,
+      "learning_rate": 0.0002,
+      "loss": 1.8167,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9922346850733391,
+      "grad_norm": 1.118432879447937,
+      "learning_rate": 0.0002,
+      "loss": 1.7402,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7723218202590942,
+      "eval_runtime": 158.8593,
+      "eval_samples_per_second": 3.355,
+      "eval_steps_per_second": 0.422,
+      "step": 1159
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9272,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.08508082536448e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c51db54c9eeeb9c31c1d5d10a2ca49db936f9c0e6c5697c8941ee541bc7c94
+size 5688

	@@ -0,0 +1,203 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c75103b2c75c2dff928c26b9642800c9637f7b7afc2381be000c736e8ce0c540
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de11a0742ba08e13f1dfa52f4090ddda5e39ed7e48c969b2b8642ccfc0ab309b
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:481b38ae17fbc851d1b1a141d13a25a8e96e39f13f5d4e3bb5d3591b6b680034
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9343f114ec70aae86a46eac6c101eab59fee4d553a4199f8b437d813e37e38bf
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1666 @@

+{
+  "best_metric": 1.7723218202590942,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 2318,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008628127696289905,
+      "grad_norm": 1.7177482843399048,
+      "learning_rate": 0.0002,
+      "loss": 2.5586,
+      "step": 10
+    },
+    {
+      "epoch": 0.01725625539257981,
+      "grad_norm": 2.8122410774230957,
+      "learning_rate": 0.0002,
+      "loss": 2.2918,
+      "step": 20
+    },
+    {
+      "epoch": 0.025884383088869714,
+      "grad_norm": 1.6668062210083008,
+      "learning_rate": 0.0002,
+      "loss": 2.0885,
+      "step": 30
+    },
+    {
+      "epoch": 0.03451251078515962,
+      "grad_norm": 1.745869755744934,
+      "learning_rate": 0.0002,
+      "loss": 2.0469,
+      "step": 40
+    },
+    {
+      "epoch": 0.04314063848144953,
+      "grad_norm": 1.807971477508545,
+      "learning_rate": 0.0002,
+      "loss": 2.0754,
+      "step": 50
+    },
+    {
+      "epoch": 0.05176876617773943,
+      "grad_norm": 2.4009974002838135,
+      "learning_rate": 0.0002,
+      "loss": 2.0867,
+      "step": 60
+    },
+    {
+      "epoch": 0.060396893874029335,
+      "grad_norm": 2.0974676609039307,
+      "learning_rate": 0.0002,
+      "loss": 1.8575,
+      "step": 70
+    },
+    {
+      "epoch": 0.06902502157031924,
+      "grad_norm": 1.7705916166305542,
+      "learning_rate": 0.0002,
+      "loss": 1.8921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07765314926660914,
+      "grad_norm": 1.4324289560317993,
+      "learning_rate": 0.0002,
+      "loss": 1.8119,
+      "step": 90
+    },
+    {
+      "epoch": 0.08628127696289906,
+      "grad_norm": 1.2521991729736328,
+      "learning_rate": 0.0002,
+      "loss": 1.8728,
+      "step": 100
+    },
+    {
+      "epoch": 0.09490940465918896,
+      "grad_norm": 1.3328145742416382,
+      "learning_rate": 0.0002,
+      "loss": 1.8168,
+      "step": 110
+    },
+    {
+      "epoch": 0.10353753235547886,
+      "grad_norm": 2.2908742427825928,
+      "learning_rate": 0.0002,
+      "loss": 1.8236,
+      "step": 120
+    },
+    {
+      "epoch": 0.11216566005176877,
+      "grad_norm": 1.540981411933899,
+      "learning_rate": 0.0002,
+      "loss": 1.8732,
+      "step": 130
+    },
+    {
+      "epoch": 0.12079378774805867,
+      "grad_norm": 1.1785069704055786,
+      "learning_rate": 0.0002,
+      "loss": 1.8138,
+      "step": 140
+    },
+    {
+      "epoch": 0.12942191544434858,
+      "grad_norm": 1.3138738870620728,
+      "learning_rate": 0.0002,
+      "loss": 1.8655,
+      "step": 150
+    },
+    {
+      "epoch": 0.13805004314063848,
+      "grad_norm": 1.153215765953064,
+      "learning_rate": 0.0002,
+      "loss": 1.8418,
+      "step": 160
+    },
+    {
+      "epoch": 0.14667817083692838,
+      "grad_norm": 1.2071360349655151,
+      "learning_rate": 0.0002,
+      "loss": 1.8284,
+      "step": 170
+    },
+    {
+      "epoch": 0.15530629853321828,
+      "grad_norm": 1.3546127080917358,
+      "learning_rate": 0.0002,
+      "loss": 1.8645,
+      "step": 180
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 1.1494425535202026,
+      "learning_rate": 0.0002,
+      "loss": 1.8699,
+      "step": 190
+    },
+    {
+      "epoch": 0.1725625539257981,
+      "grad_norm": 0.982718825340271,
+      "learning_rate": 0.0002,
+      "loss": 1.7845,
+      "step": 200
+    },
+    {
+      "epoch": 0.181190681622088,
+      "grad_norm": 1.1329727172851562,
+      "learning_rate": 0.0002,
+      "loss": 1.8237,
+      "step": 210
+    },
+    {
+      "epoch": 0.1898188093183779,
+      "grad_norm": 1.1397384405136108,
+      "learning_rate": 0.0002,
+      "loss": 1.8516,
+      "step": 220
+    },
+    {
+      "epoch": 0.1984469370146678,
+      "grad_norm": 1.2424808740615845,
+      "learning_rate": 0.0002,
+      "loss": 1.7504,
+      "step": 230
+    },
+    {
+      "epoch": 0.2070750647109577,
+      "grad_norm": 1.1463897228240967,
+      "learning_rate": 0.0002,
+      "loss": 1.7626,
+      "step": 240
+    },
+    {
+      "epoch": 0.21570319240724764,
+      "grad_norm": 1.2353036403656006,
+      "learning_rate": 0.0002,
+      "loss": 1.7977,
+      "step": 250
+    },
+    {
+      "epoch": 0.22433132010353754,
+      "grad_norm": 1.0135247707366943,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 260
+    },
+    {
+      "epoch": 0.23295944779982744,
+      "grad_norm": 1.1388282775878906,
+      "learning_rate": 0.0002,
+      "loss": 1.7678,
+      "step": 270
+    },
+    {
+      "epoch": 0.24158757549611734,
+      "grad_norm": 1.1262438297271729,
+      "learning_rate": 0.0002,
+      "loss": 1.7895,
+      "step": 280
+    },
+    {
+      "epoch": 0.25021570319240727,
+      "grad_norm": 1.0581450462341309,
+      "learning_rate": 0.0002,
+      "loss": 1.826,
+      "step": 290
+    },
+    {
+      "epoch": 0.25884383088869717,
+      "grad_norm": 1.1737277507781982,
+      "learning_rate": 0.0002,
+      "loss": 1.7269,
+      "step": 300
+    },
+    {
+      "epoch": 0.26747195858498707,
+      "grad_norm": 1.0906627178192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7975,
+      "step": 310
+    },
+    {
+      "epoch": 0.27610008628127697,
+      "grad_norm": 1.0010069608688354,
+      "learning_rate": 0.0002,
+      "loss": 1.7594,
+      "step": 320
+    },
+    {
+      "epoch": 0.28472821397756687,
+      "grad_norm": 1.2149732112884521,
+      "learning_rate": 0.0002,
+      "loss": 1.7998,
+      "step": 330
+    },
+    {
+      "epoch": 0.29335634167385677,
+      "grad_norm": 1.293990969657898,
+      "learning_rate": 0.0002,
+      "loss": 1.8079,
+      "step": 340
+    },
+    {
+      "epoch": 0.30198446937014667,
+      "grad_norm": 1.0082058906555176,
+      "learning_rate": 0.0002,
+      "loss": 1.7629,
+      "step": 350
+    },
+    {
+      "epoch": 0.31061259706643657,
+      "grad_norm": 1.0307148694992065,
+      "learning_rate": 0.0002,
+      "loss": 1.8001,
+      "step": 360
+    },
+    {
+      "epoch": 0.31924072476272647,
+      "grad_norm": 0.9646756649017334,
+      "learning_rate": 0.0002,
+      "loss": 1.7456,
+      "step": 370
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 1.105623722076416,
+      "learning_rate": 0.0002,
+      "loss": 1.7979,
+      "step": 380
+    },
+    {
+      "epoch": 0.3364969801553063,
+      "grad_norm": 0.9365625977516174,
+      "learning_rate": 0.0002,
+      "loss": 1.7313,
+      "step": 390
+    },
+    {
+      "epoch": 0.3451251078515962,
+      "grad_norm": 1.1378847360610962,
+      "learning_rate": 0.0002,
+      "loss": 1.809,
+      "step": 400
+    },
+    {
+      "epoch": 0.3537532355478861,
+      "grad_norm": 1.1266193389892578,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 410
+    },
+    {
+      "epoch": 0.362381363244176,
+      "grad_norm": 1.0886635780334473,
+      "learning_rate": 0.0002,
+      "loss": 1.8096,
+      "step": 420
+    },
+    {
+      "epoch": 0.3710094909404659,
+      "grad_norm": 1.0463931560516357,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 430
+    },
+    {
+      "epoch": 0.3796376186367558,
+      "grad_norm": 1.0923888683319092,
+      "learning_rate": 0.0002,
+      "loss": 1.7936,
+      "step": 440
+    },
+    {
+      "epoch": 0.3882657463330457,
+      "grad_norm": 1.1386370658874512,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 450
+    },
+    {
+      "epoch": 0.3968938740293356,
+      "grad_norm": 1.0098074674606323,
+      "learning_rate": 0.0002,
+      "loss": 1.7211,
+      "step": 460
+    },
+    {
+      "epoch": 0.4055220017256255,
+      "grad_norm": 1.1237372159957886,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 470
+    },
+    {
+      "epoch": 0.4141501294219154,
+      "grad_norm": 1.0218915939331055,
+      "learning_rate": 0.0002,
+      "loss": 1.854,
+      "step": 480
+    },
+    {
+      "epoch": 0.4227782571182053,
+      "grad_norm": 0.9998831748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.8548,
+      "step": 490
+    },
+    {
+      "epoch": 0.4314063848144953,
+      "grad_norm": 1.0424970388412476,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 500
+    },
+    {
+      "epoch": 0.4400345125107852,
+      "grad_norm": 0.903372585773468,
+      "learning_rate": 0.0002,
+      "loss": 1.788,
+      "step": 510
+    },
+    {
+      "epoch": 0.4486626402070751,
+      "grad_norm": 1.0864766836166382,
+      "learning_rate": 0.0002,
+      "loss": 1.8293,
+      "step": 520
+    },
+    {
+      "epoch": 0.457290767903365,
+      "grad_norm": 0.9694207310676575,
+      "learning_rate": 0.0002,
+      "loss": 1.8402,
+      "step": 530
+    },
+    {
+      "epoch": 0.4659188955996549,
+      "grad_norm": 1.2796396017074585,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 540
+    },
+    {
+      "epoch": 0.4745470232959448,
+      "grad_norm": 1.0316239595413208,
+      "learning_rate": 0.0002,
+      "loss": 1.7716,
+      "step": 550
+    },
+    {
+      "epoch": 0.4831751509922347,
+      "grad_norm": 1.0445313453674316,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 560
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 1.1078376770019531,
+      "learning_rate": 0.0002,
+      "loss": 1.8082,
+      "step": 570
+    },
+    {
+      "epoch": 0.5004314063848145,
+      "grad_norm": 1.0551974773406982,
+      "learning_rate": 0.0002,
+      "loss": 1.7298,
+      "step": 580
+    },
+    {
+      "epoch": 0.5090595340811044,
+      "grad_norm": 1.114853858947754,
+      "learning_rate": 0.0002,
+      "loss": 1.8673,
+      "step": 590
+    },
+    {
+      "epoch": 0.5176876617773943,
+      "grad_norm": 1.0642707347869873,
+      "learning_rate": 0.0002,
+      "loss": 1.7684,
+      "step": 600
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 1.088079810142517,
+      "learning_rate": 0.0002,
+      "loss": 1.8367,
+      "step": 610
+    },
+    {
+      "epoch": 0.5349439171699741,
+      "grad_norm": 1.4029462337493896,
+      "learning_rate": 0.0002,
+      "loss": 1.775,
+      "step": 620
+    },
+    {
+      "epoch": 0.543572044866264,
+      "grad_norm": 1.2136136293411255,
+      "learning_rate": 0.0002,
+      "loss": 1.7771,
+      "step": 630
+    },
+    {
+      "epoch": 0.5522001725625539,
+      "grad_norm": 0.9642075896263123,
+      "learning_rate": 0.0002,
+      "loss": 1.8006,
+      "step": 640
+    },
+    {
+      "epoch": 0.5608283002588438,
+      "grad_norm": 1.0879552364349365,
+      "learning_rate": 0.0002,
+      "loss": 1.7478,
+      "step": 650
+    },
+    {
+      "epoch": 0.5694564279551337,
+      "grad_norm": 1.1766546964645386,
+      "learning_rate": 0.0002,
+      "loss": 1.8427,
+      "step": 660
+    },
+    {
+      "epoch": 0.5780845556514237,
+      "grad_norm": 1.582840085029602,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 670
+    },
+    {
+      "epoch": 0.5867126833477135,
+      "grad_norm": 1.0681092739105225,
+      "learning_rate": 0.0002,
+      "loss": 1.8093,
+      "step": 680
+    },
+    {
+      "epoch": 0.5953408110440035,
+      "grad_norm": 1.103897213935852,
+      "learning_rate": 0.0002,
+      "loss": 1.8067,
+      "step": 690
+    },
+    {
+      "epoch": 0.6039689387402933,
+      "grad_norm": 1.0974211692810059,
+      "learning_rate": 0.0002,
+      "loss": 1.7425,
+      "step": 700
+    },
+    {
+      "epoch": 0.6125970664365833,
+      "grad_norm": 1.1002469062805176,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 710
+    },
+    {
+      "epoch": 0.6212251941328731,
+      "grad_norm": 1.0022329092025757,
+      "learning_rate": 0.0002,
+      "loss": 1.8106,
+      "step": 720
+    },
+    {
+      "epoch": 0.6298533218291631,
+      "grad_norm": 1.0089571475982666,
+      "learning_rate": 0.0002,
+      "loss": 1.7647,
+      "step": 730
+    },
+    {
+      "epoch": 0.6384814495254529,
+      "grad_norm": 0.9531904458999634,
+      "learning_rate": 0.0002,
+      "loss": 1.8033,
+      "step": 740
+    },
+    {
+      "epoch": 0.6471095772217429,
+      "grad_norm": 1.162675380706787,
+      "learning_rate": 0.0002,
+      "loss": 1.7644,
+      "step": 750
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 1.0488134622573853,
+      "learning_rate": 0.0002,
+      "loss": 1.7531,
+      "step": 760
+    },
+    {
+      "epoch": 0.6643658326143227,
+      "grad_norm": 1.12964928150177,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 770
+    },
+    {
+      "epoch": 0.6729939603106126,
+      "grad_norm": 1.0867345333099365,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 780
+    },
+    {
+      "epoch": 0.6816220880069025,
+      "grad_norm": 1.1084282398223877,
+      "learning_rate": 0.0002,
+      "loss": 1.7797,
+      "step": 790
+    },
+    {
+      "epoch": 0.6902502157031924,
+      "grad_norm": 0.9905423521995544,
+      "learning_rate": 0.0002,
+      "loss": 1.7792,
+      "step": 800
+    },
+    {
+      "epoch": 0.6988783433994823,
+      "grad_norm": 1.18604576587677,
+      "learning_rate": 0.0002,
+      "loss": 1.7825,
+      "step": 810
+    },
+    {
+      "epoch": 0.7075064710957722,
+      "grad_norm": 1.0819629430770874,
+      "learning_rate": 0.0002,
+      "loss": 1.8242,
+      "step": 820
+    },
+    {
+      "epoch": 0.7161345987920621,
+      "grad_norm": 2.0091195106506348,
+      "learning_rate": 0.0002,
+      "loss": 1.7916,
+      "step": 830
+    },
+    {
+      "epoch": 0.724762726488352,
+      "grad_norm": 1.0371277332305908,
+      "learning_rate": 0.0002,
+      "loss": 1.8186,
+      "step": 840
+    },
+    {
+      "epoch": 0.7333908541846419,
+      "grad_norm": 1.217102289199829,
+      "learning_rate": 0.0002,
+      "loss": 1.7937,
+      "step": 850
+    },
+    {
+      "epoch": 0.7420189818809318,
+      "grad_norm": 1.0528525114059448,
+      "learning_rate": 0.0002,
+      "loss": 1.7317,
+      "step": 860
+    },
+    {
+      "epoch": 0.7506471095772217,
+      "grad_norm": 1.1398800611495972,
+      "learning_rate": 0.0002,
+      "loss": 1.7757,
+      "step": 870
+    },
+    {
+      "epoch": 0.7592752372735116,
+      "grad_norm": 1.1546001434326172,
+      "learning_rate": 0.0002,
+      "loss": 1.8326,
+      "step": 880
+    },
+    {
+      "epoch": 0.7679033649698016,
+      "grad_norm": 1.0745750665664673,
+      "learning_rate": 0.0002,
+      "loss": 1.7178,
+      "step": 890
+    },
+    {
+      "epoch": 0.7765314926660914,
+      "grad_norm": 1.1739161014556885,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 900
+    },
+    {
+      "epoch": 0.7851596203623814,
+      "grad_norm": 1.1932017803192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 910
+    },
+    {
+      "epoch": 0.7937877480586712,
+      "grad_norm": 1.143064022064209,
+      "learning_rate": 0.0002,
+      "loss": 1.7192,
+      "step": 920
+    },
+    {
+      "epoch": 0.8024158757549612,
+      "grad_norm": 1.200974464416504,
+      "learning_rate": 0.0002,
+      "loss": 1.7342,
+      "step": 930
+    },
+    {
+      "epoch": 0.811044003451251,
+      "grad_norm": 1.0878669023513794,
+      "learning_rate": 0.0002,
+      "loss": 1.7399,
+      "step": 940
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 1.0516951084136963,
+      "learning_rate": 0.0002,
+      "loss": 1.8019,
+      "step": 950
+    },
+    {
+      "epoch": 0.8283002588438308,
+      "grad_norm": 1.2017741203308105,
+      "learning_rate": 0.0002,
+      "loss": 1.7645,
+      "step": 960
+    },
+    {
+      "epoch": 0.8369283865401208,
+      "grad_norm": 0.9762169718742371,
+      "learning_rate": 0.0002,
+      "loss": 1.7367,
+      "step": 970
+    },
+    {
+      "epoch": 0.8455565142364107,
+      "grad_norm": 1.0837513208389282,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 980
+    },
+    {
+      "epoch": 0.8541846419327006,
+      "grad_norm": 1.155504822731018,
+      "learning_rate": 0.0002,
+      "loss": 1.8094,
+      "step": 990
+    },
+    {
+      "epoch": 0.8628127696289906,
+      "grad_norm": 1.067771315574646,
+      "learning_rate": 0.0002,
+      "loss": 1.7633,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8714408973252804,
+      "grad_norm": 1.2283565998077393,
+      "learning_rate": 0.0002,
+      "loss": 1.7993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8800690250215704,
+      "grad_norm": 1.1549772024154663,
+      "learning_rate": 0.0002,
+      "loss": 1.7362,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8886971527178602,
+      "grad_norm": 1.0022625923156738,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8973252804141502,
+      "grad_norm": 1.0237284898757935,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 1040
+    },
+    {
+      "epoch": 0.90595340811044,
+      "grad_norm": 1.1863008737564087,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 1050
+    },
+    {
+      "epoch": 0.91458153580673,
+      "grad_norm": 1.001204013824463,
+      "learning_rate": 0.0002,
+      "loss": 1.6951,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9232096635030198,
+      "grad_norm": 1.2686481475830078,
+      "learning_rate": 0.0002,
+      "loss": 1.7506,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9318377911993098,
+      "grad_norm": 1.0700076818466187,
+      "learning_rate": 0.0002,
+      "loss": 1.7064,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9404659188955996,
+      "grad_norm": 1.05950927734375,
+      "learning_rate": 0.0002,
+      "loss": 1.8015,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9490940465918896,
+      "grad_norm": 0.9669114947319031,
+      "learning_rate": 0.0002,
+      "loss": 1.8155,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9577221742881795,
+      "grad_norm": 1.1823079586029053,
+      "learning_rate": 0.0002,
+      "loss": 1.8074,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9663503019844694,
+      "grad_norm": 1.0857175588607788,
+      "learning_rate": 0.0002,
+      "loss": 1.7636,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9749784296807593,
+      "grad_norm": 1.1258848905563354,
+      "learning_rate": 0.0002,
+      "loss": 1.822,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 1.16336989402771,
+      "learning_rate": 0.0002,
+      "loss": 1.8167,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9922346850733391,
+      "grad_norm": 1.118432879447937,
+      "learning_rate": 0.0002,
+      "loss": 1.7402,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7723218202590942,
+      "eval_runtime": 158.8593,
+      "eval_samples_per_second": 3.355,
+      "eval_steps_per_second": 0.422,
+      "step": 1159
+    },
+    {
+      "epoch": 1.000862812769629,
+      "grad_norm": 1.1056718826293945,
+      "learning_rate": 0.0002,
+      "loss": 1.7863,
+      "step": 1160
+    },
+    {
+      "epoch": 1.009490940465919,
+      "grad_norm": 1.0352667570114136,
+      "learning_rate": 0.0002,
+      "loss": 1.672,
+      "step": 1170
+    },
+    {
+      "epoch": 1.0181190681622088,
+      "grad_norm": 1.0315937995910645,
+      "learning_rate": 0.0002,
+      "loss": 1.6718,
+      "step": 1180
+    },
+    {
+      "epoch": 1.0267471958584986,
+      "grad_norm": 1.369126558303833,
+      "learning_rate": 0.0002,
+      "loss": 1.6937,
+      "step": 1190
+    },
+    {
+      "epoch": 1.0353753235547887,
+      "grad_norm": 1.330876350402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6732,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0440034512510785,
+      "grad_norm": 1.406552791595459,
+      "learning_rate": 0.0002,
+      "loss": 1.6497,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 1.1256251335144043,
+      "learning_rate": 0.0002,
+      "loss": 1.6873,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0612597066436584,
+      "grad_norm": 1.315566897392273,
+      "learning_rate": 0.0002,
+      "loss": 1.6765,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0698878343399483,
+      "grad_norm": 1.2100263833999634,
+      "learning_rate": 0.0002,
+      "loss": 1.6763,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0785159620362381,
+      "grad_norm": 1.2762185335159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6496,
+      "step": 1250
+    },
+    {
+      "epoch": 1.087144089732528,
+      "grad_norm": 1.2971566915512085,
+      "learning_rate": 0.0002,
+      "loss": 1.6826,
+      "step": 1260
+    },
+    {
+      "epoch": 1.095772217428818,
+      "grad_norm": 1.3685089349746704,
+      "learning_rate": 0.0002,
+      "loss": 1.6721,
+      "step": 1270
+    },
+    {
+      "epoch": 1.1044003451251079,
+      "grad_norm": 1.3135347366333008,
+      "learning_rate": 0.0002,
+      "loss": 1.6399,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1130284728213977,
+      "grad_norm": 1.4514861106872559,
+      "learning_rate": 0.0002,
+      "loss": 1.641,
+      "step": 1290
+    },
+    {
+      "epoch": 1.1216566005176876,
+      "grad_norm": 1.5077004432678223,
+      "learning_rate": 0.0002,
+      "loss": 1.6443,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1302847282139776,
+      "grad_norm": 1.4807840585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.6406,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1389128559102675,
+      "grad_norm": 1.2386537790298462,
+      "learning_rate": 0.0002,
+      "loss": 1.7022,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1475409836065573,
+      "grad_norm": 1.2637739181518555,
+      "learning_rate": 0.0002,
+      "loss": 1.6265,
+      "step": 1330
+    },
+    {
+      "epoch": 1.1561691113028472,
+      "grad_norm": 1.2472519874572754,
+      "learning_rate": 0.0002,
+      "loss": 1.7103,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1647972389991372,
+      "grad_norm": 1.290644884109497,
+      "learning_rate": 0.0002,
+      "loss": 1.676,
+      "step": 1350
+    },
+    {
+      "epoch": 1.173425366695427,
+      "grad_norm": 1.3227870464324951,
+      "learning_rate": 0.0002,
+      "loss": 1.6713,
+      "step": 1360
+    },
+    {
+      "epoch": 1.182053494391717,
+      "grad_norm": 1.3311200141906738,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 1370
+    },
+    {
+      "epoch": 1.190681622088007,
+      "grad_norm": 1.2624584436416626,
+      "learning_rate": 0.0002,
+      "loss": 1.6501,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1993097497842968,
+      "grad_norm": 1.4712986946105957,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 1390
+    },
+    {
+      "epoch": 1.2079378774805867,
+      "grad_norm": 1.416508674621582,
+      "learning_rate": 0.0002,
+      "loss": 1.6818,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2165660051768765,
+      "grad_norm": 1.367967963218689,
+      "learning_rate": 0.0002,
+      "loss": 1.7184,
+      "step": 1410
+    },
+    {
+      "epoch": 1.2251941328731666,
+      "grad_norm": 1.3865700960159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6834,
+      "step": 1420
+    },
+    {
+      "epoch": 1.2338222605694564,
+      "grad_norm": 2.076512336730957,
+      "learning_rate": 0.0002,
+      "loss": 1.7532,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2424503882657463,
+      "grad_norm": 1.305572509765625,
+      "learning_rate": 0.0002,
+      "loss": 1.7448,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2510785159620363,
+      "grad_norm": 1.2752642631530762,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2597066436583262,
+      "grad_norm": 1.1802726984024048,
+      "learning_rate": 0.0002,
+      "loss": 1.7121,
+      "step": 1460
+    },
+    {
+      "epoch": 1.268334771354616,
+      "grad_norm": 1.2195663452148438,
+      "learning_rate": 0.0002,
+      "loss": 1.7617,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2769628990509059,
+      "grad_norm": 1.3073176145553589,
+      "learning_rate": 0.0002,
+      "loss": 1.6022,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2855910267471957,
+      "grad_norm": 1.2829731702804565,
+      "learning_rate": 0.0002,
+      "loss": 1.6472,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2942191544434858,
+      "grad_norm": 1.361060619354248,
+      "learning_rate": 0.0002,
+      "loss": 1.6076,
+      "step": 1500
+    },
+    {
+      "epoch": 1.3028472821397756,
+      "grad_norm": 1.4285917282104492,
+      "learning_rate": 0.0002,
+      "loss": 1.7059,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3114754098360657,
+      "grad_norm": 1.186866283416748,
+      "learning_rate": 0.0002,
+      "loss": 1.696,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3201035375323555,
+      "grad_norm": 1.2615889310836792,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1530
+    },
+    {
+      "epoch": 1.3287316652286454,
+      "grad_norm": 1.2732815742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5797,
+      "step": 1540
+    },
+    {
+      "epoch": 1.3373597929249352,
+      "grad_norm": 1.4152132272720337,
+      "learning_rate": 0.0002,
+      "loss": 1.6623,
+      "step": 1550
+    },
+    {
+      "epoch": 1.345987920621225,
+      "grad_norm": 1.1730318069458008,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3546160483175151,
+      "grad_norm": 1.2282229661941528,
+      "learning_rate": 0.0002,
+      "loss": 1.7247,
+      "step": 1570
+    },
+    {
+      "epoch": 1.363244176013805,
+      "grad_norm": 1.227974534034729,
+      "learning_rate": 0.0002,
+      "loss": 1.7125,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3718723037100948,
+      "grad_norm": 1.3480374813079834,
+      "learning_rate": 0.0002,
+      "loss": 1.622,
+      "step": 1590
+    },
+    {
+      "epoch": 1.380500431406385,
+      "grad_norm": 1.3460094928741455,
+      "learning_rate": 0.0002,
+      "loss": 1.7126,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3891285591026747,
+      "grad_norm": 1.254465937614441,
+      "learning_rate": 0.0002,
+      "loss": 1.6845,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3977566867989646,
+      "grad_norm": 1.4135496616363525,
+      "learning_rate": 0.0002,
+      "loss": 1.643,
+      "step": 1620
+    },
+    {
+      "epoch": 1.4063848144952544,
+      "grad_norm": 1.277063012123108,
+      "learning_rate": 0.0002,
+      "loss": 1.6392,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4150129421915445,
+      "grad_norm": 1.5031940937042236,
+      "learning_rate": 0.0002,
+      "loss": 1.7338,
+      "step": 1640
+    },
+    {
+      "epoch": 1.4236410698878343,
+      "grad_norm": 1.3918952941894531,
+      "learning_rate": 0.0002,
+      "loss": 1.6229,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4322691975841242,
+      "grad_norm": 1.5893778800964355,
+      "learning_rate": 0.0002,
+      "loss": 1.6893,
+      "step": 1660
+    },
+    {
+      "epoch": 1.4408973252804143,
+      "grad_norm": 1.4636809825897217,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 1670
+    },
+    {
+      "epoch": 1.449525452976704,
+      "grad_norm": 1.1985419988632202,
+      "learning_rate": 0.0002,
+      "loss": 1.6481,
+      "step": 1680
+    },
+    {
+      "epoch": 1.458153580672994,
+      "grad_norm": 1.509252905845642,
+      "learning_rate": 0.0002,
+      "loss": 1.7322,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4667817083692838,
+      "grad_norm": 1.4157838821411133,
+      "learning_rate": 0.0002,
+      "loss": 1.6653,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4754098360655736,
+      "grad_norm": 1.3481059074401855,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4840379637618637,
+      "grad_norm": 1.4127949476242065,
+      "learning_rate": 0.0002,
+      "loss": 1.6488,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4926660914581535,
+      "grad_norm": 1.3087295293807983,
+      "learning_rate": 0.0002,
+      "loss": 1.6336,
+      "step": 1730
+    },
+    {
+      "epoch": 1.5012942191544436,
+      "grad_norm": 1.4421851634979248,
+      "learning_rate": 0.0002,
+      "loss": 1.7226,
+      "step": 1740
+    },
+    {
+      "epoch": 1.5099223468507335,
+      "grad_norm": 1.3953148126602173,
+      "learning_rate": 0.0002,
+      "loss": 1.7006,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5185504745470233,
+      "grad_norm": 1.4613851308822632,
+      "learning_rate": 0.0002,
+      "loss": 1.6281,
+      "step": 1760
+    },
+    {
+      "epoch": 1.5271786022433131,
+      "grad_norm": 1.2866744995117188,
+      "learning_rate": 0.0002,
+      "loss": 1.6404,
+      "step": 1770
+    },
+    {
+      "epoch": 1.535806729939603,
+      "grad_norm": 1.2769535779953003,
+      "learning_rate": 0.0002,
+      "loss": 1.628,
+      "step": 1780
+    },
+    {
+      "epoch": 1.544434857635893,
+      "grad_norm": 1.371022343635559,
+      "learning_rate": 0.0002,
+      "loss": 1.6439,
+      "step": 1790
+    },
+    {
+      "epoch": 1.553062985332183,
+      "grad_norm": 1.4434700012207031,
+      "learning_rate": 0.0002,
+      "loss": 1.6363,
+      "step": 1800
+    },
+    {
+      "epoch": 1.561691113028473,
+      "grad_norm": 1.269386887550354,
+      "learning_rate": 0.0002,
+      "loss": 1.6606,
+      "step": 1810
+    },
+    {
+      "epoch": 1.5703192407247628,
+      "grad_norm": 1.2668766975402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6493,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5789473684210527,
+      "grad_norm": 1.4857951402664185,
+      "learning_rate": 0.0002,
+      "loss": 1.7124,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5875754961173425,
+      "grad_norm": 1.330338954925537,
+      "learning_rate": 0.0002,
+      "loss": 1.6474,
+      "step": 1840
+    },
+    {
+      "epoch": 1.5962036238136323,
+      "grad_norm": 1.3832308053970337,
+      "learning_rate": 0.0002,
+      "loss": 1.6412,
+      "step": 1850
+    },
+    {
+      "epoch": 1.6048317515099222,
+      "grad_norm": 1.2697869539260864,
+      "learning_rate": 0.0002,
+      "loss": 1.6988,
+      "step": 1860
+    },
+    {
+      "epoch": 1.6134598792062123,
+      "grad_norm": 1.338875412940979,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 1870
+    },
+    {
+      "epoch": 1.6220880069025023,
+      "grad_norm": 1.4077556133270264,
+      "learning_rate": 0.0002,
+      "loss": 1.7319,
+      "step": 1880
+    },
+    {
+      "epoch": 1.6307161345987922,
+      "grad_norm": 1.40274178981781,
+      "learning_rate": 0.0002,
+      "loss": 1.644,
+      "step": 1890
+    },
+    {
+      "epoch": 1.639344262295082,
+      "grad_norm": 1.416042447090149,
+      "learning_rate": 0.0002,
+      "loss": 1.6648,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6479723899913719,
+      "grad_norm": 1.4196866750717163,
+      "learning_rate": 0.0002,
+      "loss": 1.729,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6566005176876617,
+      "grad_norm": 1.378732681274414,
+      "learning_rate": 0.0002,
+      "loss": 1.7381,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6652286453839515,
+      "grad_norm": 1.544751524925232,
+      "learning_rate": 0.0002,
+      "loss": 1.7804,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6738567730802416,
+      "grad_norm": 1.4318190813064575,
+      "learning_rate": 0.0002,
+      "loss": 1.6563,
+      "step": 1940
+    },
+    {
+      "epoch": 1.6824849007765315,
+      "grad_norm": 1.3794575929641724,
+      "learning_rate": 0.0002,
+      "loss": 1.6806,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6911130284728215,
+      "grad_norm": 1.6301822662353516,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6997411561691114,
+      "grad_norm": 1.3090870380401611,
+      "learning_rate": 0.0002,
+      "loss": 1.6945,
+      "step": 1970
+    },
+    {
+      "epoch": 1.7083692838654012,
+      "grad_norm": 1.4537303447723389,
+      "learning_rate": 0.0002,
+      "loss": 1.6018,
+      "step": 1980
+    },
+    {
+      "epoch": 1.716997411561691,
+      "grad_norm": 1.3618766069412231,
+      "learning_rate": 0.0002,
+      "loss": 1.7225,
+      "step": 1990
+    },
+    {
+      "epoch": 1.725625539257981,
+      "grad_norm": 1.398790955543518,
+      "learning_rate": 0.0002,
+      "loss": 1.6948,
+      "step": 2000
+    },
+    {
+      "epoch": 1.734253666954271,
+      "grad_norm": 1.4606391191482544,
+      "learning_rate": 0.0002,
+      "loss": 1.6963,
+      "step": 2010
+    },
+    {
+      "epoch": 1.7428817946505608,
+      "grad_norm": 1.602010726928711,
+      "learning_rate": 0.0002,
+      "loss": 1.727,
+      "step": 2020
+    },
+    {
+      "epoch": 1.7515099223468509,
+      "grad_norm": 1.4865907430648804,
+      "learning_rate": 0.0002,
+      "loss": 1.7238,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7601380500431407,
+      "grad_norm": 1.5954750776290894,
+      "learning_rate": 0.0002,
+      "loss": 1.713,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7687661777394306,
+      "grad_norm": 1.3561054468154907,
+      "learning_rate": 0.0002,
+      "loss": 1.6794,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7773943054357204,
+      "grad_norm": 1.4540512561798096,
+      "learning_rate": 0.0002,
+      "loss": 1.7058,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7860224331320103,
+      "grad_norm": 1.2661199569702148,
+      "learning_rate": 0.0002,
+      "loss": 1.6187,
+      "step": 2070
+    },
+    {
+      "epoch": 1.7946505608283,
+      "grad_norm": 2.188016176223755,
+      "learning_rate": 0.0002,
+      "loss": 1.6998,
+      "step": 2080
+    },
+    {
+      "epoch": 1.8032786885245902,
+      "grad_norm": 1.4326417446136475,
+      "learning_rate": 0.0002,
+      "loss": 1.6909,
+      "step": 2090
+    },
+    {
+      "epoch": 1.8119068162208802,
+      "grad_norm": 2.2382805347442627,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 2100
+    },
+    {
+      "epoch": 1.82053494391717,
+      "grad_norm": 1.396160364151001,
+      "learning_rate": 0.0002,
+      "loss": 1.7034,
+      "step": 2110
+    },
+    {
+      "epoch": 1.82916307161346,
+      "grad_norm": 1.3848069906234741,
+      "learning_rate": 0.0002,
+      "loss": 1.629,
+      "step": 2120
+    },
+    {
+      "epoch": 1.8377911993097498,
+      "grad_norm": 1.6975245475769043,
+      "learning_rate": 0.0002,
+      "loss": 1.6153,
+      "step": 2130
+    },
+    {
+      "epoch": 1.8464193270060396,
+      "grad_norm": 1.476306676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.6631,
+      "step": 2140
+    },
+    {
+      "epoch": 1.8550474547023295,
+      "grad_norm": 1.5690935850143433,
+      "learning_rate": 0.0002,
+      "loss": 1.646,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8636755823986195,
+      "grad_norm": 1.4900702238082886,
+      "learning_rate": 0.0002,
+      "loss": 1.6989,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8723037100949094,
+      "grad_norm": 1.4173238277435303,
+      "learning_rate": 0.0002,
+      "loss": 1.657,
+      "step": 2170
+    },
+    {
+      "epoch": 1.8809318377911994,
+      "grad_norm": 1.3687001466751099,
+      "learning_rate": 0.0002,
+      "loss": 1.6587,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8895599654874893,
+      "grad_norm": 1.371954321861267,
+      "learning_rate": 0.0002,
+      "loss": 1.6209,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8981880931837791,
+      "grad_norm": 1.5397378206253052,
+      "learning_rate": 0.0002,
+      "loss": 1.6749,
+      "step": 2200
+    },
+    {
+      "epoch": 1.906816220880069,
+      "grad_norm": 1.7145664691925049,
+      "learning_rate": 0.0002,
+      "loss": 1.7149,
+      "step": 2210
+    },
+    {
+      "epoch": 1.9154443485763588,
+      "grad_norm": 1.5490705966949463,
+      "learning_rate": 0.0002,
+      "loss": 1.6663,
+      "step": 2220
+    },
+    {
+      "epoch": 1.9240724762726489,
+      "grad_norm": 1.3237485885620117,
+      "learning_rate": 0.0002,
+      "loss": 1.7056,
+      "step": 2230
+    },
+    {
+      "epoch": 1.9327006039689387,
+      "grad_norm": 1.4739165306091309,
+      "learning_rate": 0.0002,
+      "loss": 1.7613,
+      "step": 2240
+    },
+    {
+      "epoch": 1.9413287316652288,
+      "grad_norm": 1.7177914381027222,
+      "learning_rate": 0.0002,
+      "loss": 1.601,
+      "step": 2250
+    },
+    {
+      "epoch": 1.9499568593615186,
+      "grad_norm": 1.3587760925292969,
+      "learning_rate": 0.0002,
+      "loss": 1.6733,
+      "step": 2260
+    },
+    {
+      "epoch": 1.9585849870578085,
+      "grad_norm": 1.3180559873580933,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9672131147540983,
+      "grad_norm": 1.9988678693771362,
+      "learning_rate": 0.0002,
+      "loss": 1.5875,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9758412424503882,
+      "grad_norm": 1.4148619174957275,
+      "learning_rate": 0.0002,
+      "loss": 1.6516,
+      "step": 2290
+    },
+    {
+      "epoch": 1.984469370146678,
+      "grad_norm": 1.6429015398025513,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 2300
+    },
+    {
+      "epoch": 1.993097497842968,
+      "grad_norm": 1.6742682456970215,
+      "learning_rate": 0.0002,
+      "loss": 1.6504,
+      "step": 2310
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7843003273010254,
+      "eval_runtime": 155.4967,
+      "eval_samples_per_second": 3.428,
+      "eval_steps_per_second": 0.431,
+      "step": 2318
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9272,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.017016165072896e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c51db54c9eeeb9c31c1d5d10a2ca49db936f9c0e6c5697c8941ee541bc7c94
+size 5688

	@@ -0,0 +1,203 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:474f22de69858081a1f0c48a1c61d5aa767ea78b5a9f1bc22a235f5ef12e3664
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7ebc430cb9a17f716b864cf5af7b82317e310c46a4d0a8a9d35f85328075ade
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ac358ec4e63c78d3c1b56c7db81c82cfa7ad16a246072dbf1a7a16b964411cd
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e7414ec39ae7be46fb129cd4f9dc2e9ade69a04c642c601ff8624fdaf00ebee
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2486 @@

+{
+  "best_metric": 1.7723218202590942,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159",
+  "epoch": 3.0,
+  "eval_steps": 10,
+  "global_step": 3477,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008628127696289905,
+      "grad_norm": 1.7177482843399048,
+      "learning_rate": 0.0002,
+      "loss": 2.5586,
+      "step": 10
+    },
+    {
+      "epoch": 0.01725625539257981,
+      "grad_norm": 2.8122410774230957,
+      "learning_rate": 0.0002,
+      "loss": 2.2918,
+      "step": 20
+    },
+    {
+      "epoch": 0.025884383088869714,
+      "grad_norm": 1.6668062210083008,
+      "learning_rate": 0.0002,
+      "loss": 2.0885,
+      "step": 30
+    },
+    {
+      "epoch": 0.03451251078515962,
+      "grad_norm": 1.745869755744934,
+      "learning_rate": 0.0002,
+      "loss": 2.0469,
+      "step": 40
+    },
+    {
+      "epoch": 0.04314063848144953,
+      "grad_norm": 1.807971477508545,
+      "learning_rate": 0.0002,
+      "loss": 2.0754,
+      "step": 50
+    },
+    {
+      "epoch": 0.05176876617773943,
+      "grad_norm": 2.4009974002838135,
+      "learning_rate": 0.0002,
+      "loss": 2.0867,
+      "step": 60
+    },
+    {
+      "epoch": 0.060396893874029335,
+      "grad_norm": 2.0974676609039307,
+      "learning_rate": 0.0002,
+      "loss": 1.8575,
+      "step": 70
+    },
+    {
+      "epoch": 0.06902502157031924,
+      "grad_norm": 1.7705916166305542,
+      "learning_rate": 0.0002,
+      "loss": 1.8921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07765314926660914,
+      "grad_norm": 1.4324289560317993,
+      "learning_rate": 0.0002,
+      "loss": 1.8119,
+      "step": 90
+    },
+    {
+      "epoch": 0.08628127696289906,
+      "grad_norm": 1.2521991729736328,
+      "learning_rate": 0.0002,
+      "loss": 1.8728,
+      "step": 100
+    },
+    {
+      "epoch": 0.09490940465918896,
+      "grad_norm": 1.3328145742416382,
+      "learning_rate": 0.0002,
+      "loss": 1.8168,
+      "step": 110
+    },
+    {
+      "epoch": 0.10353753235547886,
+      "grad_norm": 2.2908742427825928,
+      "learning_rate": 0.0002,
+      "loss": 1.8236,
+      "step": 120
+    },
+    {
+      "epoch": 0.11216566005176877,
+      "grad_norm": 1.540981411933899,
+      "learning_rate": 0.0002,
+      "loss": 1.8732,
+      "step": 130
+    },
+    {
+      "epoch": 0.12079378774805867,
+      "grad_norm": 1.1785069704055786,
+      "learning_rate": 0.0002,
+      "loss": 1.8138,
+      "step": 140
+    },
+    {
+      "epoch": 0.12942191544434858,
+      "grad_norm": 1.3138738870620728,
+      "learning_rate": 0.0002,
+      "loss": 1.8655,
+      "step": 150
+    },
+    {
+      "epoch": 0.13805004314063848,
+      "grad_norm": 1.153215765953064,
+      "learning_rate": 0.0002,
+      "loss": 1.8418,
+      "step": 160
+    },
+    {
+      "epoch": 0.14667817083692838,
+      "grad_norm": 1.2071360349655151,
+      "learning_rate": 0.0002,
+      "loss": 1.8284,
+      "step": 170
+    },
+    {
+      "epoch": 0.15530629853321828,
+      "grad_norm": 1.3546127080917358,
+      "learning_rate": 0.0002,
+      "loss": 1.8645,
+      "step": 180
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 1.1494425535202026,
+      "learning_rate": 0.0002,
+      "loss": 1.8699,
+      "step": 190
+    },
+    {
+      "epoch": 0.1725625539257981,
+      "grad_norm": 0.982718825340271,
+      "learning_rate": 0.0002,
+      "loss": 1.7845,
+      "step": 200
+    },
+    {
+      "epoch": 0.181190681622088,
+      "grad_norm": 1.1329727172851562,
+      "learning_rate": 0.0002,
+      "loss": 1.8237,
+      "step": 210
+    },
+    {
+      "epoch": 0.1898188093183779,
+      "grad_norm": 1.1397384405136108,
+      "learning_rate": 0.0002,
+      "loss": 1.8516,
+      "step": 220
+    },
+    {
+      "epoch": 0.1984469370146678,
+      "grad_norm": 1.2424808740615845,
+      "learning_rate": 0.0002,
+      "loss": 1.7504,
+      "step": 230
+    },
+    {
+      "epoch": 0.2070750647109577,
+      "grad_norm": 1.1463897228240967,
+      "learning_rate": 0.0002,
+      "loss": 1.7626,
+      "step": 240
+    },
+    {
+      "epoch": 0.21570319240724764,
+      "grad_norm": 1.2353036403656006,
+      "learning_rate": 0.0002,
+      "loss": 1.7977,
+      "step": 250
+    },
+    {
+      "epoch": 0.22433132010353754,
+      "grad_norm": 1.0135247707366943,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 260
+    },
+    {
+      "epoch": 0.23295944779982744,
+      "grad_norm": 1.1388282775878906,
+      "learning_rate": 0.0002,
+      "loss": 1.7678,
+      "step": 270
+    },
+    {
+      "epoch": 0.24158757549611734,
+      "grad_norm": 1.1262438297271729,
+      "learning_rate": 0.0002,
+      "loss": 1.7895,
+      "step": 280
+    },
+    {
+      "epoch": 0.25021570319240727,
+      "grad_norm": 1.0581450462341309,
+      "learning_rate": 0.0002,
+      "loss": 1.826,
+      "step": 290
+    },
+    {
+      "epoch": 0.25884383088869717,
+      "grad_norm": 1.1737277507781982,
+      "learning_rate": 0.0002,
+      "loss": 1.7269,
+      "step": 300
+    },
+    {
+      "epoch": 0.26747195858498707,
+      "grad_norm": 1.0906627178192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7975,
+      "step": 310
+    },
+    {
+      "epoch": 0.27610008628127697,
+      "grad_norm": 1.0010069608688354,
+      "learning_rate": 0.0002,
+      "loss": 1.7594,
+      "step": 320
+    },
+    {
+      "epoch": 0.28472821397756687,
+      "grad_norm": 1.2149732112884521,
+      "learning_rate": 0.0002,
+      "loss": 1.7998,
+      "step": 330
+    },
+    {
+      "epoch": 0.29335634167385677,
+      "grad_norm": 1.293990969657898,
+      "learning_rate": 0.0002,
+      "loss": 1.8079,
+      "step": 340
+    },
+    {
+      "epoch": 0.30198446937014667,
+      "grad_norm": 1.0082058906555176,
+      "learning_rate": 0.0002,
+      "loss": 1.7629,
+      "step": 350
+    },
+    {
+      "epoch": 0.31061259706643657,
+      "grad_norm": 1.0307148694992065,
+      "learning_rate": 0.0002,
+      "loss": 1.8001,
+      "step": 360
+    },
+    {
+      "epoch": 0.31924072476272647,
+      "grad_norm": 0.9646756649017334,
+      "learning_rate": 0.0002,
+      "loss": 1.7456,
+      "step": 370
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 1.105623722076416,
+      "learning_rate": 0.0002,
+      "loss": 1.7979,
+      "step": 380
+    },
+    {
+      "epoch": 0.3364969801553063,
+      "grad_norm": 0.9365625977516174,
+      "learning_rate": 0.0002,
+      "loss": 1.7313,
+      "step": 390
+    },
+    {
+      "epoch": 0.3451251078515962,
+      "grad_norm": 1.1378847360610962,
+      "learning_rate": 0.0002,
+      "loss": 1.809,
+      "step": 400
+    },
+    {
+      "epoch": 0.3537532355478861,
+      "grad_norm": 1.1266193389892578,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 410
+    },
+    {
+      "epoch": 0.362381363244176,
+      "grad_norm": 1.0886635780334473,
+      "learning_rate": 0.0002,
+      "loss": 1.8096,
+      "step": 420
+    },
+    {
+      "epoch": 0.3710094909404659,
+      "grad_norm": 1.0463931560516357,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 430
+    },
+    {
+      "epoch": 0.3796376186367558,
+      "grad_norm": 1.0923888683319092,
+      "learning_rate": 0.0002,
+      "loss": 1.7936,
+      "step": 440
+    },
+    {
+      "epoch": 0.3882657463330457,
+      "grad_norm": 1.1386370658874512,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 450
+    },
+    {
+      "epoch": 0.3968938740293356,
+      "grad_norm": 1.0098074674606323,
+      "learning_rate": 0.0002,
+      "loss": 1.7211,
+      "step": 460
+    },
+    {
+      "epoch": 0.4055220017256255,
+      "grad_norm": 1.1237372159957886,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 470
+    },
+    {
+      "epoch": 0.4141501294219154,
+      "grad_norm": 1.0218915939331055,
+      "learning_rate": 0.0002,
+      "loss": 1.854,
+      "step": 480
+    },
+    {
+      "epoch": 0.4227782571182053,
+      "grad_norm": 0.9998831748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.8548,
+      "step": 490
+    },
+    {
+      "epoch": 0.4314063848144953,
+      "grad_norm": 1.0424970388412476,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 500
+    },
+    {
+      "epoch": 0.4400345125107852,
+      "grad_norm": 0.903372585773468,
+      "learning_rate": 0.0002,
+      "loss": 1.788,
+      "step": 510
+    },
+    {
+      "epoch": 0.4486626402070751,
+      "grad_norm": 1.0864766836166382,
+      "learning_rate": 0.0002,
+      "loss": 1.8293,
+      "step": 520
+    },
+    {
+      "epoch": 0.457290767903365,
+      "grad_norm": 0.9694207310676575,
+      "learning_rate": 0.0002,
+      "loss": 1.8402,
+      "step": 530
+    },
+    {
+      "epoch": 0.4659188955996549,
+      "grad_norm": 1.2796396017074585,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 540
+    },
+    {
+      "epoch": 0.4745470232959448,
+      "grad_norm": 1.0316239595413208,
+      "learning_rate": 0.0002,
+      "loss": 1.7716,
+      "step": 550
+    },
+    {
+      "epoch": 0.4831751509922347,
+      "grad_norm": 1.0445313453674316,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 560
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 1.1078376770019531,
+      "learning_rate": 0.0002,
+      "loss": 1.8082,
+      "step": 570
+    },
+    {
+      "epoch": 0.5004314063848145,
+      "grad_norm": 1.0551974773406982,
+      "learning_rate": 0.0002,
+      "loss": 1.7298,
+      "step": 580
+    },
+    {
+      "epoch": 0.5090595340811044,
+      "grad_norm": 1.114853858947754,
+      "learning_rate": 0.0002,
+      "loss": 1.8673,
+      "step": 590
+    },
+    {
+      "epoch": 0.5176876617773943,
+      "grad_norm": 1.0642707347869873,
+      "learning_rate": 0.0002,
+      "loss": 1.7684,
+      "step": 600
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 1.088079810142517,
+      "learning_rate": 0.0002,
+      "loss": 1.8367,
+      "step": 610
+    },
+    {
+      "epoch": 0.5349439171699741,
+      "grad_norm": 1.4029462337493896,
+      "learning_rate": 0.0002,
+      "loss": 1.775,
+      "step": 620
+    },
+    {
+      "epoch": 0.543572044866264,
+      "grad_norm": 1.2136136293411255,
+      "learning_rate": 0.0002,
+      "loss": 1.7771,
+      "step": 630
+    },
+    {
+      "epoch": 0.5522001725625539,
+      "grad_norm": 0.9642075896263123,
+      "learning_rate": 0.0002,
+      "loss": 1.8006,
+      "step": 640
+    },
+    {
+      "epoch": 0.5608283002588438,
+      "grad_norm": 1.0879552364349365,
+      "learning_rate": 0.0002,
+      "loss": 1.7478,
+      "step": 650
+    },
+    {
+      "epoch": 0.5694564279551337,
+      "grad_norm": 1.1766546964645386,
+      "learning_rate": 0.0002,
+      "loss": 1.8427,
+      "step": 660
+    },
+    {
+      "epoch": 0.5780845556514237,
+      "grad_norm": 1.582840085029602,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 670
+    },
+    {
+      "epoch": 0.5867126833477135,
+      "grad_norm": 1.0681092739105225,
+      "learning_rate": 0.0002,
+      "loss": 1.8093,
+      "step": 680
+    },
+    {
+      "epoch": 0.5953408110440035,
+      "grad_norm": 1.103897213935852,
+      "learning_rate": 0.0002,
+      "loss": 1.8067,
+      "step": 690
+    },
+    {
+      "epoch": 0.6039689387402933,
+      "grad_norm": 1.0974211692810059,
+      "learning_rate": 0.0002,
+      "loss": 1.7425,
+      "step": 700
+    },
+    {
+      "epoch": 0.6125970664365833,
+      "grad_norm": 1.1002469062805176,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 710
+    },
+    {
+      "epoch": 0.6212251941328731,
+      "grad_norm": 1.0022329092025757,
+      "learning_rate": 0.0002,
+      "loss": 1.8106,
+      "step": 720
+    },
+    {
+      "epoch": 0.6298533218291631,
+      "grad_norm": 1.0089571475982666,
+      "learning_rate": 0.0002,
+      "loss": 1.7647,
+      "step": 730
+    },
+    {
+      "epoch": 0.6384814495254529,
+      "grad_norm": 0.9531904458999634,
+      "learning_rate": 0.0002,
+      "loss": 1.8033,
+      "step": 740
+    },
+    {
+      "epoch": 0.6471095772217429,
+      "grad_norm": 1.162675380706787,
+      "learning_rate": 0.0002,
+      "loss": 1.7644,
+      "step": 750
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 1.0488134622573853,
+      "learning_rate": 0.0002,
+      "loss": 1.7531,
+      "step": 760
+    },
+    {
+      "epoch": 0.6643658326143227,
+      "grad_norm": 1.12964928150177,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 770
+    },
+    {
+      "epoch": 0.6729939603106126,
+      "grad_norm": 1.0867345333099365,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 780
+    },
+    {
+      "epoch": 0.6816220880069025,
+      "grad_norm": 1.1084282398223877,
+      "learning_rate": 0.0002,
+      "loss": 1.7797,
+      "step": 790
+    },
+    {
+      "epoch": 0.6902502157031924,
+      "grad_norm": 0.9905423521995544,
+      "learning_rate": 0.0002,
+      "loss": 1.7792,
+      "step": 800
+    },
+    {
+      "epoch": 0.6988783433994823,
+      "grad_norm": 1.18604576587677,
+      "learning_rate": 0.0002,
+      "loss": 1.7825,
+      "step": 810
+    },
+    {
+      "epoch": 0.7075064710957722,
+      "grad_norm": 1.0819629430770874,
+      "learning_rate": 0.0002,
+      "loss": 1.8242,
+      "step": 820
+    },
+    {
+      "epoch": 0.7161345987920621,
+      "grad_norm": 2.0091195106506348,
+      "learning_rate": 0.0002,
+      "loss": 1.7916,
+      "step": 830
+    },
+    {
+      "epoch": 0.724762726488352,
+      "grad_norm": 1.0371277332305908,
+      "learning_rate": 0.0002,
+      "loss": 1.8186,
+      "step": 840
+    },
+    {
+      "epoch": 0.7333908541846419,
+      "grad_norm": 1.217102289199829,
+      "learning_rate": 0.0002,
+      "loss": 1.7937,
+      "step": 850
+    },
+    {
+      "epoch": 0.7420189818809318,
+      "grad_norm": 1.0528525114059448,
+      "learning_rate": 0.0002,
+      "loss": 1.7317,
+      "step": 860
+    },
+    {
+      "epoch": 0.7506471095772217,
+      "grad_norm": 1.1398800611495972,
+      "learning_rate": 0.0002,
+      "loss": 1.7757,
+      "step": 870
+    },
+    {
+      "epoch": 0.7592752372735116,
+      "grad_norm": 1.1546001434326172,
+      "learning_rate": 0.0002,
+      "loss": 1.8326,
+      "step": 880
+    },
+    {
+      "epoch": 0.7679033649698016,
+      "grad_norm": 1.0745750665664673,
+      "learning_rate": 0.0002,
+      "loss": 1.7178,
+      "step": 890
+    },
+    {
+      "epoch": 0.7765314926660914,
+      "grad_norm": 1.1739161014556885,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 900
+    },
+    {
+      "epoch": 0.7851596203623814,
+      "grad_norm": 1.1932017803192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 910
+    },
+    {
+      "epoch": 0.7937877480586712,
+      "grad_norm": 1.143064022064209,
+      "learning_rate": 0.0002,
+      "loss": 1.7192,
+      "step": 920
+    },
+    {
+      "epoch": 0.8024158757549612,
+      "grad_norm": 1.200974464416504,
+      "learning_rate": 0.0002,
+      "loss": 1.7342,
+      "step": 930
+    },
+    {
+      "epoch": 0.811044003451251,
+      "grad_norm": 1.0878669023513794,
+      "learning_rate": 0.0002,
+      "loss": 1.7399,
+      "step": 940
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 1.0516951084136963,
+      "learning_rate": 0.0002,
+      "loss": 1.8019,
+      "step": 950
+    },
+    {
+      "epoch": 0.8283002588438308,
+      "grad_norm": 1.2017741203308105,
+      "learning_rate": 0.0002,
+      "loss": 1.7645,
+      "step": 960
+    },
+    {
+      "epoch": 0.8369283865401208,
+      "grad_norm": 0.9762169718742371,
+      "learning_rate": 0.0002,
+      "loss": 1.7367,
+      "step": 970
+    },
+    {
+      "epoch": 0.8455565142364107,
+      "grad_norm": 1.0837513208389282,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 980
+    },
+    {
+      "epoch": 0.8541846419327006,
+      "grad_norm": 1.155504822731018,
+      "learning_rate": 0.0002,
+      "loss": 1.8094,
+      "step": 990
+    },
+    {
+      "epoch": 0.8628127696289906,
+      "grad_norm": 1.067771315574646,
+      "learning_rate": 0.0002,
+      "loss": 1.7633,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8714408973252804,
+      "grad_norm": 1.2283565998077393,
+      "learning_rate": 0.0002,
+      "loss": 1.7993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8800690250215704,
+      "grad_norm": 1.1549772024154663,
+      "learning_rate": 0.0002,
+      "loss": 1.7362,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8886971527178602,
+      "grad_norm": 1.0022625923156738,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8973252804141502,
+      "grad_norm": 1.0237284898757935,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 1040
+    },
+    {
+      "epoch": 0.90595340811044,
+      "grad_norm": 1.1863008737564087,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 1050
+    },
+    {
+      "epoch": 0.91458153580673,
+      "grad_norm": 1.001204013824463,
+      "learning_rate": 0.0002,
+      "loss": 1.6951,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9232096635030198,
+      "grad_norm": 1.2686481475830078,
+      "learning_rate": 0.0002,
+      "loss": 1.7506,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9318377911993098,
+      "grad_norm": 1.0700076818466187,
+      "learning_rate": 0.0002,
+      "loss": 1.7064,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9404659188955996,
+      "grad_norm": 1.05950927734375,
+      "learning_rate": 0.0002,
+      "loss": 1.8015,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9490940465918896,
+      "grad_norm": 0.9669114947319031,
+      "learning_rate": 0.0002,
+      "loss": 1.8155,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9577221742881795,
+      "grad_norm": 1.1823079586029053,
+      "learning_rate": 0.0002,
+      "loss": 1.8074,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9663503019844694,
+      "grad_norm": 1.0857175588607788,
+      "learning_rate": 0.0002,
+      "loss": 1.7636,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9749784296807593,
+      "grad_norm": 1.1258848905563354,
+      "learning_rate": 0.0002,
+      "loss": 1.822,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 1.16336989402771,
+      "learning_rate": 0.0002,
+      "loss": 1.8167,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9922346850733391,
+      "grad_norm": 1.118432879447937,
+      "learning_rate": 0.0002,
+      "loss": 1.7402,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7723218202590942,
+      "eval_runtime": 158.8593,
+      "eval_samples_per_second": 3.355,
+      "eval_steps_per_second": 0.422,
+      "step": 1159
+    },
+    {
+      "epoch": 1.000862812769629,
+      "grad_norm": 1.1056718826293945,
+      "learning_rate": 0.0002,
+      "loss": 1.7863,
+      "step": 1160
+    },
+    {
+      "epoch": 1.009490940465919,
+      "grad_norm": 1.0352667570114136,
+      "learning_rate": 0.0002,
+      "loss": 1.672,
+      "step": 1170
+    },
+    {
+      "epoch": 1.0181190681622088,
+      "grad_norm": 1.0315937995910645,
+      "learning_rate": 0.0002,
+      "loss": 1.6718,
+      "step": 1180
+    },
+    {
+      "epoch": 1.0267471958584986,
+      "grad_norm": 1.369126558303833,
+      "learning_rate": 0.0002,
+      "loss": 1.6937,
+      "step": 1190
+    },
+    {
+      "epoch": 1.0353753235547887,
+      "grad_norm": 1.330876350402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6732,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0440034512510785,
+      "grad_norm": 1.406552791595459,
+      "learning_rate": 0.0002,
+      "loss": 1.6497,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 1.1256251335144043,
+      "learning_rate": 0.0002,
+      "loss": 1.6873,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0612597066436584,
+      "grad_norm": 1.315566897392273,
+      "learning_rate": 0.0002,
+      "loss": 1.6765,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0698878343399483,
+      "grad_norm": 1.2100263833999634,
+      "learning_rate": 0.0002,
+      "loss": 1.6763,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0785159620362381,
+      "grad_norm": 1.2762185335159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6496,
+      "step": 1250
+    },
+    {
+      "epoch": 1.087144089732528,
+      "grad_norm": 1.2971566915512085,
+      "learning_rate": 0.0002,
+      "loss": 1.6826,
+      "step": 1260
+    },
+    {
+      "epoch": 1.095772217428818,
+      "grad_norm": 1.3685089349746704,
+      "learning_rate": 0.0002,
+      "loss": 1.6721,
+      "step": 1270
+    },
+    {
+      "epoch": 1.1044003451251079,
+      "grad_norm": 1.3135347366333008,
+      "learning_rate": 0.0002,
+      "loss": 1.6399,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1130284728213977,
+      "grad_norm": 1.4514861106872559,
+      "learning_rate": 0.0002,
+      "loss": 1.641,
+      "step": 1290
+    },
+    {
+      "epoch": 1.1216566005176876,
+      "grad_norm": 1.5077004432678223,
+      "learning_rate": 0.0002,
+      "loss": 1.6443,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1302847282139776,
+      "grad_norm": 1.4807840585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.6406,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1389128559102675,
+      "grad_norm": 1.2386537790298462,
+      "learning_rate": 0.0002,
+      "loss": 1.7022,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1475409836065573,
+      "grad_norm": 1.2637739181518555,
+      "learning_rate": 0.0002,
+      "loss": 1.6265,
+      "step": 1330
+    },
+    {
+      "epoch": 1.1561691113028472,
+      "grad_norm": 1.2472519874572754,
+      "learning_rate": 0.0002,
+      "loss": 1.7103,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1647972389991372,
+      "grad_norm": 1.290644884109497,
+      "learning_rate": 0.0002,
+      "loss": 1.676,
+      "step": 1350
+    },
+    {
+      "epoch": 1.173425366695427,
+      "grad_norm": 1.3227870464324951,
+      "learning_rate": 0.0002,
+      "loss": 1.6713,
+      "step": 1360
+    },
+    {
+      "epoch": 1.182053494391717,
+      "grad_norm": 1.3311200141906738,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 1370
+    },
+    {
+      "epoch": 1.190681622088007,
+      "grad_norm": 1.2624584436416626,
+      "learning_rate": 0.0002,
+      "loss": 1.6501,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1993097497842968,
+      "grad_norm": 1.4712986946105957,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 1390
+    },
+    {
+      "epoch": 1.2079378774805867,
+      "grad_norm": 1.416508674621582,
+      "learning_rate": 0.0002,
+      "loss": 1.6818,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2165660051768765,
+      "grad_norm": 1.367967963218689,
+      "learning_rate": 0.0002,
+      "loss": 1.7184,
+      "step": 1410
+    },
+    {
+      "epoch": 1.2251941328731666,
+      "grad_norm": 1.3865700960159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6834,
+      "step": 1420
+    },
+    {
+      "epoch": 1.2338222605694564,
+      "grad_norm": 2.076512336730957,
+      "learning_rate": 0.0002,
+      "loss": 1.7532,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2424503882657463,
+      "grad_norm": 1.305572509765625,
+      "learning_rate": 0.0002,
+      "loss": 1.7448,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2510785159620363,
+      "grad_norm": 1.2752642631530762,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2597066436583262,
+      "grad_norm": 1.1802726984024048,
+      "learning_rate": 0.0002,
+      "loss": 1.7121,
+      "step": 1460
+    },
+    {
+      "epoch": 1.268334771354616,
+      "grad_norm": 1.2195663452148438,
+      "learning_rate": 0.0002,
+      "loss": 1.7617,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2769628990509059,
+      "grad_norm": 1.3073176145553589,
+      "learning_rate": 0.0002,
+      "loss": 1.6022,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2855910267471957,
+      "grad_norm": 1.2829731702804565,
+      "learning_rate": 0.0002,
+      "loss": 1.6472,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2942191544434858,
+      "grad_norm": 1.361060619354248,
+      "learning_rate": 0.0002,
+      "loss": 1.6076,
+      "step": 1500
+    },
+    {
+      "epoch": 1.3028472821397756,
+      "grad_norm": 1.4285917282104492,
+      "learning_rate": 0.0002,
+      "loss": 1.7059,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3114754098360657,
+      "grad_norm": 1.186866283416748,
+      "learning_rate": 0.0002,
+      "loss": 1.696,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3201035375323555,
+      "grad_norm": 1.2615889310836792,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1530
+    },
+    {
+      "epoch": 1.3287316652286454,
+      "grad_norm": 1.2732815742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5797,
+      "step": 1540
+    },
+    {
+      "epoch": 1.3373597929249352,
+      "grad_norm": 1.4152132272720337,
+      "learning_rate": 0.0002,
+      "loss": 1.6623,
+      "step": 1550
+    },
+    {
+      "epoch": 1.345987920621225,
+      "grad_norm": 1.1730318069458008,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3546160483175151,
+      "grad_norm": 1.2282229661941528,
+      "learning_rate": 0.0002,
+      "loss": 1.7247,
+      "step": 1570
+    },
+    {
+      "epoch": 1.363244176013805,
+      "grad_norm": 1.227974534034729,
+      "learning_rate": 0.0002,
+      "loss": 1.7125,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3718723037100948,
+      "grad_norm": 1.3480374813079834,
+      "learning_rate": 0.0002,
+      "loss": 1.622,
+      "step": 1590
+    },
+    {
+      "epoch": 1.380500431406385,
+      "grad_norm": 1.3460094928741455,
+      "learning_rate": 0.0002,
+      "loss": 1.7126,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3891285591026747,
+      "grad_norm": 1.254465937614441,
+      "learning_rate": 0.0002,
+      "loss": 1.6845,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3977566867989646,
+      "grad_norm": 1.4135496616363525,
+      "learning_rate": 0.0002,
+      "loss": 1.643,
+      "step": 1620
+    },
+    {
+      "epoch": 1.4063848144952544,
+      "grad_norm": 1.277063012123108,
+      "learning_rate": 0.0002,
+      "loss": 1.6392,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4150129421915445,
+      "grad_norm": 1.5031940937042236,
+      "learning_rate": 0.0002,
+      "loss": 1.7338,
+      "step": 1640
+    },
+    {
+      "epoch": 1.4236410698878343,
+      "grad_norm": 1.3918952941894531,
+      "learning_rate": 0.0002,
+      "loss": 1.6229,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4322691975841242,
+      "grad_norm": 1.5893778800964355,
+      "learning_rate": 0.0002,
+      "loss": 1.6893,
+      "step": 1660
+    },
+    {
+      "epoch": 1.4408973252804143,
+      "grad_norm": 1.4636809825897217,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 1670
+    },
+    {
+      "epoch": 1.449525452976704,
+      "grad_norm": 1.1985419988632202,
+      "learning_rate": 0.0002,
+      "loss": 1.6481,
+      "step": 1680
+    },
+    {
+      "epoch": 1.458153580672994,
+      "grad_norm": 1.509252905845642,
+      "learning_rate": 0.0002,
+      "loss": 1.7322,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4667817083692838,
+      "grad_norm": 1.4157838821411133,
+      "learning_rate": 0.0002,
+      "loss": 1.6653,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4754098360655736,
+      "grad_norm": 1.3481059074401855,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4840379637618637,
+      "grad_norm": 1.4127949476242065,
+      "learning_rate": 0.0002,
+      "loss": 1.6488,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4926660914581535,
+      "grad_norm": 1.3087295293807983,
+      "learning_rate": 0.0002,
+      "loss": 1.6336,
+      "step": 1730
+    },
+    {
+      "epoch": 1.5012942191544436,
+      "grad_norm": 1.4421851634979248,
+      "learning_rate": 0.0002,
+      "loss": 1.7226,
+      "step": 1740
+    },
+    {
+      "epoch": 1.5099223468507335,
+      "grad_norm": 1.3953148126602173,
+      "learning_rate": 0.0002,
+      "loss": 1.7006,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5185504745470233,
+      "grad_norm": 1.4613851308822632,
+      "learning_rate": 0.0002,
+      "loss": 1.6281,
+      "step": 1760
+    },
+    {
+      "epoch": 1.5271786022433131,
+      "grad_norm": 1.2866744995117188,
+      "learning_rate": 0.0002,
+      "loss": 1.6404,
+      "step": 1770
+    },
+    {
+      "epoch": 1.535806729939603,
+      "grad_norm": 1.2769535779953003,
+      "learning_rate": 0.0002,
+      "loss": 1.628,
+      "step": 1780
+    },
+    {
+      "epoch": 1.544434857635893,
+      "grad_norm": 1.371022343635559,
+      "learning_rate": 0.0002,
+      "loss": 1.6439,
+      "step": 1790
+    },
+    {
+      "epoch": 1.553062985332183,
+      "grad_norm": 1.4434700012207031,
+      "learning_rate": 0.0002,
+      "loss": 1.6363,
+      "step": 1800
+    },
+    {
+      "epoch": 1.561691113028473,
+      "grad_norm": 1.269386887550354,
+      "learning_rate": 0.0002,
+      "loss": 1.6606,
+      "step": 1810
+    },
+    {
+      "epoch": 1.5703192407247628,
+      "grad_norm": 1.2668766975402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6493,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5789473684210527,
+      "grad_norm": 1.4857951402664185,
+      "learning_rate": 0.0002,
+      "loss": 1.7124,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5875754961173425,
+      "grad_norm": 1.330338954925537,
+      "learning_rate": 0.0002,
+      "loss": 1.6474,
+      "step": 1840
+    },
+    {
+      "epoch": 1.5962036238136323,
+      "grad_norm": 1.3832308053970337,
+      "learning_rate": 0.0002,
+      "loss": 1.6412,
+      "step": 1850
+    },
+    {
+      "epoch": 1.6048317515099222,
+      "grad_norm": 1.2697869539260864,
+      "learning_rate": 0.0002,
+      "loss": 1.6988,
+      "step": 1860
+    },
+    {
+      "epoch": 1.6134598792062123,
+      "grad_norm": 1.338875412940979,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 1870
+    },
+    {
+      "epoch": 1.6220880069025023,
+      "grad_norm": 1.4077556133270264,
+      "learning_rate": 0.0002,
+      "loss": 1.7319,
+      "step": 1880
+    },
+    {
+      "epoch": 1.6307161345987922,
+      "grad_norm": 1.40274178981781,
+      "learning_rate": 0.0002,
+      "loss": 1.644,
+      "step": 1890
+    },
+    {
+      "epoch": 1.639344262295082,
+      "grad_norm": 1.416042447090149,
+      "learning_rate": 0.0002,
+      "loss": 1.6648,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6479723899913719,
+      "grad_norm": 1.4196866750717163,
+      "learning_rate": 0.0002,
+      "loss": 1.729,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6566005176876617,
+      "grad_norm": 1.378732681274414,
+      "learning_rate": 0.0002,
+      "loss": 1.7381,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6652286453839515,
+      "grad_norm": 1.544751524925232,
+      "learning_rate": 0.0002,
+      "loss": 1.7804,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6738567730802416,
+      "grad_norm": 1.4318190813064575,
+      "learning_rate": 0.0002,
+      "loss": 1.6563,
+      "step": 1940
+    },
+    {
+      "epoch": 1.6824849007765315,
+      "grad_norm": 1.3794575929641724,
+      "learning_rate": 0.0002,
+      "loss": 1.6806,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6911130284728215,
+      "grad_norm": 1.6301822662353516,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6997411561691114,
+      "grad_norm": 1.3090870380401611,
+      "learning_rate": 0.0002,
+      "loss": 1.6945,
+      "step": 1970
+    },
+    {
+      "epoch": 1.7083692838654012,
+      "grad_norm": 1.4537303447723389,
+      "learning_rate": 0.0002,
+      "loss": 1.6018,
+      "step": 1980
+    },
+    {
+      "epoch": 1.716997411561691,
+      "grad_norm": 1.3618766069412231,
+      "learning_rate": 0.0002,
+      "loss": 1.7225,
+      "step": 1990
+    },
+    {
+      "epoch": 1.725625539257981,
+      "grad_norm": 1.398790955543518,
+      "learning_rate": 0.0002,
+      "loss": 1.6948,
+      "step": 2000
+    },
+    {
+      "epoch": 1.734253666954271,
+      "grad_norm": 1.4606391191482544,
+      "learning_rate": 0.0002,
+      "loss": 1.6963,
+      "step": 2010
+    },
+    {
+      "epoch": 1.7428817946505608,
+      "grad_norm": 1.602010726928711,
+      "learning_rate": 0.0002,
+      "loss": 1.727,
+      "step": 2020
+    },
+    {
+      "epoch": 1.7515099223468509,
+      "grad_norm": 1.4865907430648804,
+      "learning_rate": 0.0002,
+      "loss": 1.7238,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7601380500431407,
+      "grad_norm": 1.5954750776290894,
+      "learning_rate": 0.0002,
+      "loss": 1.713,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7687661777394306,
+      "grad_norm": 1.3561054468154907,
+      "learning_rate": 0.0002,
+      "loss": 1.6794,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7773943054357204,
+      "grad_norm": 1.4540512561798096,
+      "learning_rate": 0.0002,
+      "loss": 1.7058,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7860224331320103,
+      "grad_norm": 1.2661199569702148,
+      "learning_rate": 0.0002,
+      "loss": 1.6187,
+      "step": 2070
+    },
+    {
+      "epoch": 1.7946505608283,
+      "grad_norm": 2.188016176223755,
+      "learning_rate": 0.0002,
+      "loss": 1.6998,
+      "step": 2080
+    },
+    {
+      "epoch": 1.8032786885245902,
+      "grad_norm": 1.4326417446136475,
+      "learning_rate": 0.0002,
+      "loss": 1.6909,
+      "step": 2090
+    },
+    {
+      "epoch": 1.8119068162208802,
+      "grad_norm": 2.2382805347442627,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 2100
+    },
+    {
+      "epoch": 1.82053494391717,
+      "grad_norm": 1.396160364151001,
+      "learning_rate": 0.0002,
+      "loss": 1.7034,
+      "step": 2110
+    },
+    {
+      "epoch": 1.82916307161346,
+      "grad_norm": 1.3848069906234741,
+      "learning_rate": 0.0002,
+      "loss": 1.629,
+      "step": 2120
+    },
+    {
+      "epoch": 1.8377911993097498,
+      "grad_norm": 1.6975245475769043,
+      "learning_rate": 0.0002,
+      "loss": 1.6153,
+      "step": 2130
+    },
+    {
+      "epoch": 1.8464193270060396,
+      "grad_norm": 1.476306676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.6631,
+      "step": 2140
+    },
+    {
+      "epoch": 1.8550474547023295,
+      "grad_norm": 1.5690935850143433,
+      "learning_rate": 0.0002,
+      "loss": 1.646,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8636755823986195,
+      "grad_norm": 1.4900702238082886,
+      "learning_rate": 0.0002,
+      "loss": 1.6989,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8723037100949094,
+      "grad_norm": 1.4173238277435303,
+      "learning_rate": 0.0002,
+      "loss": 1.657,
+      "step": 2170
+    },
+    {
+      "epoch": 1.8809318377911994,
+      "grad_norm": 1.3687001466751099,
+      "learning_rate": 0.0002,
+      "loss": 1.6587,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8895599654874893,
+      "grad_norm": 1.371954321861267,
+      "learning_rate": 0.0002,
+      "loss": 1.6209,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8981880931837791,
+      "grad_norm": 1.5397378206253052,
+      "learning_rate": 0.0002,
+      "loss": 1.6749,
+      "step": 2200
+    },
+    {
+      "epoch": 1.906816220880069,
+      "grad_norm": 1.7145664691925049,
+      "learning_rate": 0.0002,
+      "loss": 1.7149,
+      "step": 2210
+    },
+    {
+      "epoch": 1.9154443485763588,
+      "grad_norm": 1.5490705966949463,
+      "learning_rate": 0.0002,
+      "loss": 1.6663,
+      "step": 2220
+    },
+    {
+      "epoch": 1.9240724762726489,
+      "grad_norm": 1.3237485885620117,
+      "learning_rate": 0.0002,
+      "loss": 1.7056,
+      "step": 2230
+    },
+    {
+      "epoch": 1.9327006039689387,
+      "grad_norm": 1.4739165306091309,
+      "learning_rate": 0.0002,
+      "loss": 1.7613,
+      "step": 2240
+    },
+    {
+      "epoch": 1.9413287316652288,
+      "grad_norm": 1.7177914381027222,
+      "learning_rate": 0.0002,
+      "loss": 1.601,
+      "step": 2250
+    },
+    {
+      "epoch": 1.9499568593615186,
+      "grad_norm": 1.3587760925292969,
+      "learning_rate": 0.0002,
+      "loss": 1.6733,
+      "step": 2260
+    },
+    {
+      "epoch": 1.9585849870578085,
+      "grad_norm": 1.3180559873580933,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9672131147540983,
+      "grad_norm": 1.9988678693771362,
+      "learning_rate": 0.0002,
+      "loss": 1.5875,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9758412424503882,
+      "grad_norm": 1.4148619174957275,
+      "learning_rate": 0.0002,
+      "loss": 1.6516,
+      "step": 2290
+    },
+    {
+      "epoch": 1.984469370146678,
+      "grad_norm": 1.6429015398025513,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 2300
+    },
+    {
+      "epoch": 1.993097497842968,
+      "grad_norm": 1.6742682456970215,
+      "learning_rate": 0.0002,
+      "loss": 1.6504,
+      "step": 2310
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7843003273010254,
+      "eval_runtime": 155.4967,
+      "eval_samples_per_second": 3.428,
+      "eval_steps_per_second": 0.431,
+      "step": 2318
+    },
+    {
+      "epoch": 2.001725625539258,
+      "grad_norm": 1.399217128753662,
+      "learning_rate": 0.0002,
+      "loss": 1.6082,
+      "step": 2320
+    },
+    {
+      "epoch": 2.010353753235548,
+      "grad_norm": 1.7028861045837402,
+      "learning_rate": 0.0002,
+      "loss": 1.4883,
+      "step": 2330
+    },
+    {
+      "epoch": 2.018981880931838,
+      "grad_norm": 1.506859540939331,
+      "learning_rate": 0.0002,
+      "loss": 1.4019,
+      "step": 2340
+    },
+    {
+      "epoch": 2.0276100086281277,
+      "grad_norm": 1.3946882486343384,
+      "learning_rate": 0.0002,
+      "loss": 1.482,
+      "step": 2350
+    },
+    {
+      "epoch": 2.0362381363244175,
+      "grad_norm": 1.5871425867080688,
+      "learning_rate": 0.0002,
+      "loss": 1.5225,
+      "step": 2360
+    },
+    {
+      "epoch": 2.0448662640207074,
+      "grad_norm": 1.636025309562683,
+      "learning_rate": 0.0002,
+      "loss": 1.5915,
+      "step": 2370
+    },
+    {
+      "epoch": 2.053494391716997,
+      "grad_norm": 1.971501111984253,
+      "learning_rate": 0.0002,
+      "loss": 1.5434,
+      "step": 2380
+    },
+    {
+      "epoch": 2.0621225194132875,
+      "grad_norm": 1.5961263179779053,
+      "learning_rate": 0.0002,
+      "loss": 1.5265,
+      "step": 2390
+    },
+    {
+      "epoch": 2.0707506471095773,
+      "grad_norm": 1.4916940927505493,
+      "learning_rate": 0.0002,
+      "loss": 1.446,
+      "step": 2400
+    },
+    {
+      "epoch": 2.079378774805867,
+      "grad_norm": 1.6255263090133667,
+      "learning_rate": 0.0002,
+      "loss": 1.528,
+      "step": 2410
+    },
+    {
+      "epoch": 2.088006902502157,
+      "grad_norm": 1.9251011610031128,
+      "learning_rate": 0.0002,
+      "loss": 1.6365,
+      "step": 2420
+    },
+    {
+      "epoch": 2.096635030198447,
+      "grad_norm": 1.6198536157608032,
+      "learning_rate": 0.0002,
+      "loss": 1.5883,
+      "step": 2430
+    },
+    {
+      "epoch": 2.1052631578947367,
+      "grad_norm": 1.6935237646102905,
+      "learning_rate": 0.0002,
+      "loss": 1.4984,
+      "step": 2440
+    },
+    {
+      "epoch": 2.1138912855910266,
+      "grad_norm": 1.5107334852218628,
+      "learning_rate": 0.0002,
+      "loss": 1.5477,
+      "step": 2450
+    },
+    {
+      "epoch": 2.122519413287317,
+      "grad_norm": 1.801699161529541,
+      "learning_rate": 0.0002,
+      "loss": 1.4898,
+      "step": 2460
+    },
+    {
+      "epoch": 2.1311475409836067,
+      "grad_norm": 1.6194193363189697,
+      "learning_rate": 0.0002,
+      "loss": 1.5471,
+      "step": 2470
+    },
+    {
+      "epoch": 2.1397756686798965,
+      "grad_norm": 1.896286964416504,
+      "learning_rate": 0.0002,
+      "loss": 1.4619,
+      "step": 2480
+    },
+    {
+      "epoch": 2.1484037963761864,
+      "grad_norm": 1.9456146955490112,
+      "learning_rate": 0.0002,
+      "loss": 1.5496,
+      "step": 2490
+    },
+    {
+      "epoch": 2.1570319240724762,
+      "grad_norm": 23.566476821899414,
+      "learning_rate": 0.0002,
+      "loss": 1.5449,
+      "step": 2500
+    },
+    {
+      "epoch": 2.165660051768766,
+      "grad_norm": 1.7737925052642822,
+      "learning_rate": 0.0002,
+      "loss": 1.5675,
+      "step": 2510
+    },
+    {
+      "epoch": 2.174288179465056,
+      "grad_norm": 1.7305291891098022,
+      "learning_rate": 0.0002,
+      "loss": 1.4775,
+      "step": 2520
+    },
+    {
+      "epoch": 2.1829163071613458,
+      "grad_norm": 2.130882978439331,
+      "learning_rate": 0.0002,
+      "loss": 1.5051,
+      "step": 2530
+    },
+    {
+      "epoch": 2.191544434857636,
+      "grad_norm": 1.790124535560608,
+      "learning_rate": 0.0002,
+      "loss": 1.4675,
+      "step": 2540
+    },
+    {
+      "epoch": 2.200172562553926,
+      "grad_norm": 1.8408042192459106,
+      "learning_rate": 0.0002,
+      "loss": 1.5208,
+      "step": 2550
+    },
+    {
+      "epoch": 2.2088006902502157,
+      "grad_norm": 1.7635295391082764,
+      "learning_rate": 0.0002,
+      "loss": 1.4732,
+      "step": 2560
+    },
+    {
+      "epoch": 2.2174288179465056,
+      "grad_norm": 1.7026700973510742,
+      "learning_rate": 0.0002,
+      "loss": 1.4604,
+      "step": 2570
+    },
+    {
+      "epoch": 2.2260569456427954,
+      "grad_norm": 1.881218433380127,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 2580
+    },
+    {
+      "epoch": 2.2346850733390853,
+      "grad_norm": 1.9007751941680908,
+      "learning_rate": 0.0002,
+      "loss": 1.4422,
+      "step": 2590
+    },
+    {
+      "epoch": 2.243313201035375,
+      "grad_norm": 1.7862553596496582,
+      "learning_rate": 0.0002,
+      "loss": 1.4695,
+      "step": 2600
+    },
+    {
+      "epoch": 2.2519413287316654,
+      "grad_norm": 1.7117811441421509,
+      "learning_rate": 0.0002,
+      "loss": 1.4731,
+      "step": 2610
+    },
+    {
+      "epoch": 2.2605694564279553,
+      "grad_norm": 1.7809374332427979,
+      "learning_rate": 0.0002,
+      "loss": 1.4951,
+      "step": 2620
+    },
+    {
+      "epoch": 2.269197584124245,
+      "grad_norm": 1.7089564800262451,
+      "learning_rate": 0.0002,
+      "loss": 1.4744,
+      "step": 2630
+    },
+    {
+      "epoch": 2.277825711820535,
+      "grad_norm": 1.7662888765335083,
+      "learning_rate": 0.0002,
+      "loss": 1.5186,
+      "step": 2640
+    },
+    {
+      "epoch": 2.286453839516825,
+      "grad_norm": 1.8892756700515747,
+      "learning_rate": 0.0002,
+      "loss": 1.5468,
+      "step": 2650
+    },
+    {
+      "epoch": 2.2950819672131146,
+      "grad_norm": 1.678238034248352,
+      "learning_rate": 0.0002,
+      "loss": 1.5266,
+      "step": 2660
+    },
+    {
+      "epoch": 2.3037100949094045,
+      "grad_norm": 1.865786075592041,
+      "learning_rate": 0.0002,
+      "loss": 1.4897,
+      "step": 2670
+    },
+    {
+      "epoch": 2.3123382226056943,
+      "grad_norm": 1.9744012355804443,
+      "learning_rate": 0.0002,
+      "loss": 1.5578,
+      "step": 2680
+    },
+    {
+      "epoch": 2.3209663503019846,
+      "grad_norm": 1.884690284729004,
+      "learning_rate": 0.0002,
+      "loss": 1.5021,
+      "step": 2690
+    },
+    {
+      "epoch": 2.3295944779982745,
+      "grad_norm": 1.6391639709472656,
+      "learning_rate": 0.0002,
+      "loss": 1.6071,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3382226056945643,
+      "grad_norm": 1.7777862548828125,
+      "learning_rate": 0.0002,
+      "loss": 1.5721,
+      "step": 2710
+    },
+    {
+      "epoch": 2.346850733390854,
+      "grad_norm": 1.6615192890167236,
+      "learning_rate": 0.0002,
+      "loss": 1.5633,
+      "step": 2720
+    },
+    {
+      "epoch": 2.355478861087144,
+      "grad_norm": 2.2202742099761963,
+      "learning_rate": 0.0002,
+      "loss": 1.5213,
+      "step": 2730
+    },
+    {
+      "epoch": 2.364106988783434,
+      "grad_norm": 2.1986732482910156,
+      "learning_rate": 0.0002,
+      "loss": 1.5443,
+      "step": 2740
+    },
+    {
+      "epoch": 2.372735116479724,
+      "grad_norm": 1.7847017049789429,
+      "learning_rate": 0.0002,
+      "loss": 1.5834,
+      "step": 2750
+    },
+    {
+      "epoch": 2.381363244176014,
+      "grad_norm": 1.8832756280899048,
+      "learning_rate": 0.0002,
+      "loss": 1.4946,
+      "step": 2760
+    },
+    {
+      "epoch": 2.389991371872304,
+      "grad_norm": 1.8374940156936646,
+      "learning_rate": 0.0002,
+      "loss": 1.5725,
+      "step": 2770
+    },
+    {
+      "epoch": 2.3986194995685937,
+      "grad_norm": 1.741965413093567,
+      "learning_rate": 0.0002,
+      "loss": 1.5181,
+      "step": 2780
+    },
+    {
+      "epoch": 2.4072476272648835,
+      "grad_norm": 1.789699673652649,
+      "learning_rate": 0.0002,
+      "loss": 1.5571,
+      "step": 2790
+    },
+    {
+      "epoch": 2.4158757549611733,
+      "grad_norm": 2.0495948791503906,
+      "learning_rate": 0.0002,
+      "loss": 1.4763,
+      "step": 2800
+    },
+    {
+      "epoch": 2.424503882657463,
+      "grad_norm": 1.7399765253067017,
+      "learning_rate": 0.0002,
+      "loss": 1.5129,
+      "step": 2810
+    },
+    {
+      "epoch": 2.433132010353753,
+      "grad_norm": 1.9142578840255737,
+      "learning_rate": 0.0002,
+      "loss": 1.556,
+      "step": 2820
+    },
+    {
+      "epoch": 2.4417601380500433,
+      "grad_norm": 1.920663595199585,
+      "learning_rate": 0.0002,
+      "loss": 1.4848,
+      "step": 2830
+    },
+    {
+      "epoch": 2.450388265746333,
+      "grad_norm": 1.7982150316238403,
+      "learning_rate": 0.0002,
+      "loss": 1.5411,
+      "step": 2840
+    },
+    {
+      "epoch": 2.459016393442623,
+      "grad_norm": 1.7665464878082275,
+      "learning_rate": 0.0002,
+      "loss": 1.5802,
+      "step": 2850
+    },
+    {
+      "epoch": 2.467644521138913,
+      "grad_norm": 1.9115102291107178,
+      "learning_rate": 0.0002,
+      "loss": 1.5433,
+      "step": 2860
+    },
+    {
+      "epoch": 2.4762726488352027,
+      "grad_norm": 1.9024899005889893,
+      "learning_rate": 0.0002,
+      "loss": 1.4518,
+      "step": 2870
+    },
+    {
+      "epoch": 2.4849007765314925,
+      "grad_norm": 1.7804782390594482,
+      "learning_rate": 0.0002,
+      "loss": 1.4797,
+      "step": 2880
+    },
+    {
+      "epoch": 2.4935289042277824,
+      "grad_norm": 2.0264487266540527,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 2890
+    },
+    {
+      "epoch": 2.5021570319240727,
+      "grad_norm": 1.8650445938110352,
+      "learning_rate": 0.0002,
+      "loss": 1.4455,
+      "step": 2900
+    },
+    {
+      "epoch": 2.5107851596203625,
+      "grad_norm": 2.0831475257873535,
+      "learning_rate": 0.0002,
+      "loss": 1.54,
+      "step": 2910
+    },
+    {
+      "epoch": 2.5194132873166524,
+      "grad_norm": 1.9633755683898926,
+      "learning_rate": 0.0002,
+      "loss": 1.6014,
+      "step": 2920
+    },
+    {
+      "epoch": 2.528041415012942,
+      "grad_norm": 2.2055106163024902,
+      "learning_rate": 0.0002,
+      "loss": 1.56,
+      "step": 2930
+    },
+    {
+      "epoch": 2.536669542709232,
+      "grad_norm": 2.1060245037078857,
+      "learning_rate": 0.0002,
+      "loss": 1.492,
+      "step": 2940
+    },
+    {
+      "epoch": 2.545297670405522,
+      "grad_norm": 2.0236003398895264,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 2950
+    },
+    {
+      "epoch": 2.5539257981018118,
+      "grad_norm": 1.898287296295166,
+      "learning_rate": 0.0002,
+      "loss": 1.5186,
+      "step": 2960
+    },
+    {
+      "epoch": 2.5625539257981016,
+      "grad_norm": 1.9526840448379517,
+      "learning_rate": 0.0002,
+      "loss": 1.5441,
+      "step": 2970
+    },
+    {
+      "epoch": 2.5711820534943914,
+      "grad_norm": 1.9538743495941162,
+      "learning_rate": 0.0002,
+      "loss": 1.5608,
+      "step": 2980
+    },
+    {
+      "epoch": 2.5798101811906817,
+      "grad_norm": 1.787394404411316,
+      "learning_rate": 0.0002,
+      "loss": 1.4356,
+      "step": 2990
+    },
+    {
+      "epoch": 2.5884383088869716,
+      "grad_norm": 2.0792672634124756,
+      "learning_rate": 0.0002,
+      "loss": 1.5096,
+      "step": 3000
+    },
+    {
+      "epoch": 2.5970664365832614,
+      "grad_norm": 1.760083556175232,
+      "learning_rate": 0.0002,
+      "loss": 1.5131,
+      "step": 3010
+    },
+    {
+      "epoch": 2.6056945642795513,
+      "grad_norm": 1.8766807317733765,
+      "learning_rate": 0.0002,
+      "loss": 1.5553,
+      "step": 3020
+    },
+    {
+      "epoch": 2.614322691975841,
+      "grad_norm": 1.9650694131851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5381,
+      "step": 3030
+    },
+    {
+      "epoch": 2.6229508196721314,
+      "grad_norm": 1.8143510818481445,
+      "learning_rate": 0.0002,
+      "loss": 1.5263,
+      "step": 3040
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 2.5094006061553955,
+      "learning_rate": 0.0002,
+      "loss": 1.5187,
+      "step": 3050
+    },
+    {
+      "epoch": 2.640207075064711,
+      "grad_norm": 1.852913737297058,
+      "learning_rate": 0.0002,
+      "loss": 1.4729,
+      "step": 3060
+    },
+    {
+      "epoch": 2.648835202761001,
+      "grad_norm": 2.052318811416626,
+      "learning_rate": 0.0002,
+      "loss": 1.5563,
+      "step": 3070
+    },
+    {
+      "epoch": 2.6574633304572908,
+      "grad_norm": 1.8995426893234253,
+      "learning_rate": 0.0002,
+      "loss": 1.5543,
+      "step": 3080
+    },
+    {
+      "epoch": 2.6660914581535806,
+      "grad_norm": 1.979037880897522,
+      "learning_rate": 0.0002,
+      "loss": 1.5357,
+      "step": 3090
+    },
+    {
+      "epoch": 2.6747195858498705,
+      "grad_norm": 1.8179038763046265,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 3100
+    },
+    {
+      "epoch": 2.6833477135461603,
+      "grad_norm": 1.8502779006958008,
+      "learning_rate": 0.0002,
+      "loss": 1.5929,
+      "step": 3110
+    },
+    {
+      "epoch": 2.69197584124245,
+      "grad_norm": 2.0174338817596436,
+      "learning_rate": 0.0002,
+      "loss": 1.5139,
+      "step": 3120
+    },
+    {
+      "epoch": 2.7006039689387404,
+      "grad_norm": 2.1845622062683105,
+      "learning_rate": 0.0002,
+      "loss": 1.5609,
+      "step": 3130
+    },
+    {
+      "epoch": 2.7092320966350303,
+      "grad_norm": 2.1443305015563965,
+      "learning_rate": 0.0002,
+      "loss": 1.5083,
+      "step": 3140
+    },
+    {
+      "epoch": 2.71786022433132,
+      "grad_norm": 2.057907819747925,
+      "learning_rate": 0.0002,
+      "loss": 1.5856,
+      "step": 3150
+    },
+    {
+      "epoch": 2.72648835202761,
+      "grad_norm": 1.9795310497283936,
+      "learning_rate": 0.0002,
+      "loss": 1.5298,
+      "step": 3160
+    },
+    {
+      "epoch": 2.7351164797239,
+      "grad_norm": 1.9476630687713623,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 3170
+    },
+    {
+      "epoch": 2.7437446074201897,
+      "grad_norm": 1.9144753217697144,
+      "learning_rate": 0.0002,
+      "loss": 1.5884,
+      "step": 3180
+    },
+    {
+      "epoch": 2.75237273511648,
+      "grad_norm": 2.0273289680480957,
+      "learning_rate": 0.0002,
+      "loss": 1.554,
+      "step": 3190
+    },
+    {
+      "epoch": 2.76100086281277,
+      "grad_norm": 1.9641752243041992,
+      "learning_rate": 0.0002,
+      "loss": 1.6172,
+      "step": 3200
+    },
+    {
+      "epoch": 2.7696289905090596,
+      "grad_norm": 1.721760630607605,
+      "learning_rate": 0.0002,
+      "loss": 1.525,
+      "step": 3210
+    },
+    {
+      "epoch": 2.7782571182053495,
+      "grad_norm": 1.8093656301498413,
+      "learning_rate": 0.0002,
+      "loss": 1.5414,
+      "step": 3220
+    },
+    {
+      "epoch": 2.7868852459016393,
+      "grad_norm": 1.907056212425232,
+      "learning_rate": 0.0002,
+      "loss": 1.544,
+      "step": 3230
+    },
+    {
+      "epoch": 2.795513373597929,
+      "grad_norm": 2.0488245487213135,
+      "learning_rate": 0.0002,
+      "loss": 1.5911,
+      "step": 3240
+    },
+    {
+      "epoch": 2.804141501294219,
+      "grad_norm": 2.161618232727051,
+      "learning_rate": 0.0002,
+      "loss": 1.5548,
+      "step": 3250
+    },
+    {
+      "epoch": 2.812769628990509,
+      "grad_norm": 1.8043134212493896,
+      "learning_rate": 0.0002,
+      "loss": 1.5549,
+      "step": 3260
+    },
+    {
+      "epoch": 2.8213977566867987,
+      "grad_norm": 1.879629373550415,
+      "learning_rate": 0.0002,
+      "loss": 1.5883,
+      "step": 3270
+    },
+    {
+      "epoch": 2.830025884383089,
+      "grad_norm": 1.9248288869857788,
+      "learning_rate": 0.0002,
+      "loss": 1.5424,
+      "step": 3280
+    },
+    {
+      "epoch": 2.838654012079379,
+      "grad_norm": 1.9379483461380005,
+      "learning_rate": 0.0002,
+      "loss": 1.5166,
+      "step": 3290
+    },
+    {
+      "epoch": 2.8472821397756687,
+      "grad_norm": 1.7068989276885986,
+      "learning_rate": 0.0002,
+      "loss": 1.5575,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8559102674719585,
+      "grad_norm": 1.8729630708694458,
+      "learning_rate": 0.0002,
+      "loss": 1.5513,
+      "step": 3310
+    },
+    {
+      "epoch": 2.8645383951682484,
+      "grad_norm": 1.7893825769424438,
+      "learning_rate": 0.0002,
+      "loss": 1.4364,
+      "step": 3320
+    },
+    {
+      "epoch": 2.8731665228645387,
+      "grad_norm": 1.9462252855300903,
+      "learning_rate": 0.0002,
+      "loss": 1.5439,
+      "step": 3330
+    },
+    {
+      "epoch": 2.8817946505608285,
+      "grad_norm": 1.9320255517959595,
+      "learning_rate": 0.0002,
+      "loss": 1.5171,
+      "step": 3340
+    },
+    {
+      "epoch": 2.8904227782571184,
+      "grad_norm": 1.9695475101470947,
+      "learning_rate": 0.0002,
+      "loss": 1.5695,
+      "step": 3350
+    },
+    {
+      "epoch": 2.899050905953408,
+      "grad_norm": 2.01279354095459,
+      "learning_rate": 0.0002,
+      "loss": 1.5418,
+      "step": 3360
+    },
+    {
+      "epoch": 2.907679033649698,
+      "grad_norm": 1.992236852645874,
+      "learning_rate": 0.0002,
+      "loss": 1.5559,
+      "step": 3370
+    },
+    {
+      "epoch": 2.916307161345988,
+      "grad_norm": 2.3763930797576904,
+      "learning_rate": 0.0002,
+      "loss": 1.5873,
+      "step": 3380
+    },
+    {
+      "epoch": 2.9249352890422777,
+      "grad_norm": 1.91392982006073,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 3390
+    },
+    {
+      "epoch": 2.9335634167385676,
+      "grad_norm": 1.969994306564331,
+      "learning_rate": 0.0002,
+      "loss": 1.5317,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9421915444348574,
+      "grad_norm": 1.9397379159927368,
+      "learning_rate": 0.0002,
+      "loss": 1.4554,
+      "step": 3410
+    },
+    {
+      "epoch": 2.9508196721311473,
+      "grad_norm": 2.1597039699554443,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 3420
+    },
+    {
+      "epoch": 2.9594477998274376,
+      "grad_norm": 1.9564080238342285,
+      "learning_rate": 0.0002,
+      "loss": 1.6098,
+      "step": 3430
+    },
+    {
+      "epoch": 2.9680759275237274,
+      "grad_norm": 1.8007603883743286,
+      "learning_rate": 0.0002,
+      "loss": 1.603,
+      "step": 3440
+    },
+    {
+      "epoch": 2.9767040552200172,
+      "grad_norm": 2.5556256771087646,
+      "learning_rate": 0.0002,
+      "loss": 1.5593,
+      "step": 3450
+    },
+    {
+      "epoch": 2.985332182916307,
+      "grad_norm": 1.96817147731781,
+      "learning_rate": 0.0002,
+      "loss": 1.5564,
+      "step": 3460
+    },
+    {
+      "epoch": 2.993960310612597,
+      "grad_norm": 1.921637773513794,
+      "learning_rate": 0.0002,
+      "loss": 1.5664,
+      "step": 3470
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8276220560073853,
+      "eval_runtime": 148.7597,
+      "eval_samples_per_second": 3.583,
+      "eval_steps_per_second": 0.45,
+      "step": 3477
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9272,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.525524247609344e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c51db54c9eeeb9c31c1d5d10a2ca49db936f9c0e6c5697c8941ee541bc7c94
+size 5688

	@@ -0,0 +1,203 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** mistralai/Mistral-7B-Instruct-v0.3
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:296f3f9cafdb8c14df28327a2d52eb4cd13c72175ad40286b7882015c457217a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7b5f6776b738ca267b8e9de81b69f2c5fa6d58f2824003a2facc3548f0d4208
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c85af1a9dd3cc7fb08f8635eba3e981c95313639e4c46e88abe8f35ee116ae5
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abdd770edb41849c0f08acafa3bb5c5dc1ad2e1043d2a0142089d6a8df92d289
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3306 @@

+{
+  "best_metric": 1.7723218202590942,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1159",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 4636,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008628127696289905,
+      "grad_norm": 1.7177482843399048,
+      "learning_rate": 0.0002,
+      "loss": 2.5586,
+      "step": 10
+    },
+    {
+      "epoch": 0.01725625539257981,
+      "grad_norm": 2.8122410774230957,
+      "learning_rate": 0.0002,
+      "loss": 2.2918,
+      "step": 20
+    },
+    {
+      "epoch": 0.025884383088869714,
+      "grad_norm": 1.6668062210083008,
+      "learning_rate": 0.0002,
+      "loss": 2.0885,
+      "step": 30
+    },
+    {
+      "epoch": 0.03451251078515962,
+      "grad_norm": 1.745869755744934,
+      "learning_rate": 0.0002,
+      "loss": 2.0469,
+      "step": 40
+    },
+    {
+      "epoch": 0.04314063848144953,
+      "grad_norm": 1.807971477508545,
+      "learning_rate": 0.0002,
+      "loss": 2.0754,
+      "step": 50
+    },
+    {
+      "epoch": 0.05176876617773943,
+      "grad_norm": 2.4009974002838135,
+      "learning_rate": 0.0002,
+      "loss": 2.0867,
+      "step": 60
+    },
+    {
+      "epoch": 0.060396893874029335,
+      "grad_norm": 2.0974676609039307,
+      "learning_rate": 0.0002,
+      "loss": 1.8575,
+      "step": 70
+    },
+    {
+      "epoch": 0.06902502157031924,
+      "grad_norm": 1.7705916166305542,
+      "learning_rate": 0.0002,
+      "loss": 1.8921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07765314926660914,
+      "grad_norm": 1.4324289560317993,
+      "learning_rate": 0.0002,
+      "loss": 1.8119,
+      "step": 90
+    },
+    {
+      "epoch": 0.08628127696289906,
+      "grad_norm": 1.2521991729736328,
+      "learning_rate": 0.0002,
+      "loss": 1.8728,
+      "step": 100
+    },
+    {
+      "epoch": 0.09490940465918896,
+      "grad_norm": 1.3328145742416382,
+      "learning_rate": 0.0002,
+      "loss": 1.8168,
+      "step": 110
+    },
+    {
+      "epoch": 0.10353753235547886,
+      "grad_norm": 2.2908742427825928,
+      "learning_rate": 0.0002,
+      "loss": 1.8236,
+      "step": 120
+    },
+    {
+      "epoch": 0.11216566005176877,
+      "grad_norm": 1.540981411933899,
+      "learning_rate": 0.0002,
+      "loss": 1.8732,
+      "step": 130
+    },
+    {
+      "epoch": 0.12079378774805867,
+      "grad_norm": 1.1785069704055786,
+      "learning_rate": 0.0002,
+      "loss": 1.8138,
+      "step": 140
+    },
+    {
+      "epoch": 0.12942191544434858,
+      "grad_norm": 1.3138738870620728,
+      "learning_rate": 0.0002,
+      "loss": 1.8655,
+      "step": 150
+    },
+    {
+      "epoch": 0.13805004314063848,
+      "grad_norm": 1.153215765953064,
+      "learning_rate": 0.0002,
+      "loss": 1.8418,
+      "step": 160
+    },
+    {
+      "epoch": 0.14667817083692838,
+      "grad_norm": 1.2071360349655151,
+      "learning_rate": 0.0002,
+      "loss": 1.8284,
+      "step": 170
+    },
+    {
+      "epoch": 0.15530629853321828,
+      "grad_norm": 1.3546127080917358,
+      "learning_rate": 0.0002,
+      "loss": 1.8645,
+      "step": 180
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 1.1494425535202026,
+      "learning_rate": 0.0002,
+      "loss": 1.8699,
+      "step": 190
+    },
+    {
+      "epoch": 0.1725625539257981,
+      "grad_norm": 0.982718825340271,
+      "learning_rate": 0.0002,
+      "loss": 1.7845,
+      "step": 200
+    },
+    {
+      "epoch": 0.181190681622088,
+      "grad_norm": 1.1329727172851562,
+      "learning_rate": 0.0002,
+      "loss": 1.8237,
+      "step": 210
+    },
+    {
+      "epoch": 0.1898188093183779,
+      "grad_norm": 1.1397384405136108,
+      "learning_rate": 0.0002,
+      "loss": 1.8516,
+      "step": 220
+    },
+    {
+      "epoch": 0.1984469370146678,
+      "grad_norm": 1.2424808740615845,
+      "learning_rate": 0.0002,
+      "loss": 1.7504,
+      "step": 230
+    },
+    {
+      "epoch": 0.2070750647109577,
+      "grad_norm": 1.1463897228240967,
+      "learning_rate": 0.0002,
+      "loss": 1.7626,
+      "step": 240
+    },
+    {
+      "epoch": 0.21570319240724764,
+      "grad_norm": 1.2353036403656006,
+      "learning_rate": 0.0002,
+      "loss": 1.7977,
+      "step": 250
+    },
+    {
+      "epoch": 0.22433132010353754,
+      "grad_norm": 1.0135247707366943,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 260
+    },
+    {
+      "epoch": 0.23295944779982744,
+      "grad_norm": 1.1388282775878906,
+      "learning_rate": 0.0002,
+      "loss": 1.7678,
+      "step": 270
+    },
+    {
+      "epoch": 0.24158757549611734,
+      "grad_norm": 1.1262438297271729,
+      "learning_rate": 0.0002,
+      "loss": 1.7895,
+      "step": 280
+    },
+    {
+      "epoch": 0.25021570319240727,
+      "grad_norm": 1.0581450462341309,
+      "learning_rate": 0.0002,
+      "loss": 1.826,
+      "step": 290
+    },
+    {
+      "epoch": 0.25884383088869717,
+      "grad_norm": 1.1737277507781982,
+      "learning_rate": 0.0002,
+      "loss": 1.7269,
+      "step": 300
+    },
+    {
+      "epoch": 0.26747195858498707,
+      "grad_norm": 1.0906627178192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7975,
+      "step": 310
+    },
+    {
+      "epoch": 0.27610008628127697,
+      "grad_norm": 1.0010069608688354,
+      "learning_rate": 0.0002,
+      "loss": 1.7594,
+      "step": 320
+    },
+    {
+      "epoch": 0.28472821397756687,
+      "grad_norm": 1.2149732112884521,
+      "learning_rate": 0.0002,
+      "loss": 1.7998,
+      "step": 330
+    },
+    {
+      "epoch": 0.29335634167385677,
+      "grad_norm": 1.293990969657898,
+      "learning_rate": 0.0002,
+      "loss": 1.8079,
+      "step": 340
+    },
+    {
+      "epoch": 0.30198446937014667,
+      "grad_norm": 1.0082058906555176,
+      "learning_rate": 0.0002,
+      "loss": 1.7629,
+      "step": 350
+    },
+    {
+      "epoch": 0.31061259706643657,
+      "grad_norm": 1.0307148694992065,
+      "learning_rate": 0.0002,
+      "loss": 1.8001,
+      "step": 360
+    },
+    {
+      "epoch": 0.31924072476272647,
+      "grad_norm": 0.9646756649017334,
+      "learning_rate": 0.0002,
+      "loss": 1.7456,
+      "step": 370
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 1.105623722076416,
+      "learning_rate": 0.0002,
+      "loss": 1.7979,
+      "step": 380
+    },
+    {
+      "epoch": 0.3364969801553063,
+      "grad_norm": 0.9365625977516174,
+      "learning_rate": 0.0002,
+      "loss": 1.7313,
+      "step": 390
+    },
+    {
+      "epoch": 0.3451251078515962,
+      "grad_norm": 1.1378847360610962,
+      "learning_rate": 0.0002,
+      "loss": 1.809,
+      "step": 400
+    },
+    {
+      "epoch": 0.3537532355478861,
+      "grad_norm": 1.1266193389892578,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 410
+    },
+    {
+      "epoch": 0.362381363244176,
+      "grad_norm": 1.0886635780334473,
+      "learning_rate": 0.0002,
+      "loss": 1.8096,
+      "step": 420
+    },
+    {
+      "epoch": 0.3710094909404659,
+      "grad_norm": 1.0463931560516357,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 430
+    },
+    {
+      "epoch": 0.3796376186367558,
+      "grad_norm": 1.0923888683319092,
+      "learning_rate": 0.0002,
+      "loss": 1.7936,
+      "step": 440
+    },
+    {
+      "epoch": 0.3882657463330457,
+      "grad_norm": 1.1386370658874512,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 450
+    },
+    {
+      "epoch": 0.3968938740293356,
+      "grad_norm": 1.0098074674606323,
+      "learning_rate": 0.0002,
+      "loss": 1.7211,
+      "step": 460
+    },
+    {
+      "epoch": 0.4055220017256255,
+      "grad_norm": 1.1237372159957886,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 470
+    },
+    {
+      "epoch": 0.4141501294219154,
+      "grad_norm": 1.0218915939331055,
+      "learning_rate": 0.0002,
+      "loss": 1.854,
+      "step": 480
+    },
+    {
+      "epoch": 0.4227782571182053,
+      "grad_norm": 0.9998831748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.8548,
+      "step": 490
+    },
+    {
+      "epoch": 0.4314063848144953,
+      "grad_norm": 1.0424970388412476,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 500
+    },
+    {
+      "epoch": 0.4400345125107852,
+      "grad_norm": 0.903372585773468,
+      "learning_rate": 0.0002,
+      "loss": 1.788,
+      "step": 510
+    },
+    {
+      "epoch": 0.4486626402070751,
+      "grad_norm": 1.0864766836166382,
+      "learning_rate": 0.0002,
+      "loss": 1.8293,
+      "step": 520
+    },
+    {
+      "epoch": 0.457290767903365,
+      "grad_norm": 0.9694207310676575,
+      "learning_rate": 0.0002,
+      "loss": 1.8402,
+      "step": 530
+    },
+    {
+      "epoch": 0.4659188955996549,
+      "grad_norm": 1.2796396017074585,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 540
+    },
+    {
+      "epoch": 0.4745470232959448,
+      "grad_norm": 1.0316239595413208,
+      "learning_rate": 0.0002,
+      "loss": 1.7716,
+      "step": 550
+    },
+    {
+      "epoch": 0.4831751509922347,
+      "grad_norm": 1.0445313453674316,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 560
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 1.1078376770019531,
+      "learning_rate": 0.0002,
+      "loss": 1.8082,
+      "step": 570
+    },
+    {
+      "epoch": 0.5004314063848145,
+      "grad_norm": 1.0551974773406982,
+      "learning_rate": 0.0002,
+      "loss": 1.7298,
+      "step": 580
+    },
+    {
+      "epoch": 0.5090595340811044,
+      "grad_norm": 1.114853858947754,
+      "learning_rate": 0.0002,
+      "loss": 1.8673,
+      "step": 590
+    },
+    {
+      "epoch": 0.5176876617773943,
+      "grad_norm": 1.0642707347869873,
+      "learning_rate": 0.0002,
+      "loss": 1.7684,
+      "step": 600
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 1.088079810142517,
+      "learning_rate": 0.0002,
+      "loss": 1.8367,
+      "step": 610
+    },
+    {
+      "epoch": 0.5349439171699741,
+      "grad_norm": 1.4029462337493896,
+      "learning_rate": 0.0002,
+      "loss": 1.775,
+      "step": 620
+    },
+    {
+      "epoch": 0.543572044866264,
+      "grad_norm": 1.2136136293411255,
+      "learning_rate": 0.0002,
+      "loss": 1.7771,
+      "step": 630
+    },
+    {
+      "epoch": 0.5522001725625539,
+      "grad_norm": 0.9642075896263123,
+      "learning_rate": 0.0002,
+      "loss": 1.8006,
+      "step": 640
+    },
+    {
+      "epoch": 0.5608283002588438,
+      "grad_norm": 1.0879552364349365,
+      "learning_rate": 0.0002,
+      "loss": 1.7478,
+      "step": 650
+    },
+    {
+      "epoch": 0.5694564279551337,
+      "grad_norm": 1.1766546964645386,
+      "learning_rate": 0.0002,
+      "loss": 1.8427,
+      "step": 660
+    },
+    {
+      "epoch": 0.5780845556514237,
+      "grad_norm": 1.582840085029602,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 670
+    },
+    {
+      "epoch": 0.5867126833477135,
+      "grad_norm": 1.0681092739105225,
+      "learning_rate": 0.0002,
+      "loss": 1.8093,
+      "step": 680
+    },
+    {
+      "epoch": 0.5953408110440035,
+      "grad_norm": 1.103897213935852,
+      "learning_rate": 0.0002,
+      "loss": 1.8067,
+      "step": 690
+    },
+    {
+      "epoch": 0.6039689387402933,
+      "grad_norm": 1.0974211692810059,
+      "learning_rate": 0.0002,
+      "loss": 1.7425,
+      "step": 700
+    },
+    {
+      "epoch": 0.6125970664365833,
+      "grad_norm": 1.1002469062805176,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 710
+    },
+    {
+      "epoch": 0.6212251941328731,
+      "grad_norm": 1.0022329092025757,
+      "learning_rate": 0.0002,
+      "loss": 1.8106,
+      "step": 720
+    },
+    {
+      "epoch": 0.6298533218291631,
+      "grad_norm": 1.0089571475982666,
+      "learning_rate": 0.0002,
+      "loss": 1.7647,
+      "step": 730
+    },
+    {
+      "epoch": 0.6384814495254529,
+      "grad_norm": 0.9531904458999634,
+      "learning_rate": 0.0002,
+      "loss": 1.8033,
+      "step": 740
+    },
+    {
+      "epoch": 0.6471095772217429,
+      "grad_norm": 1.162675380706787,
+      "learning_rate": 0.0002,
+      "loss": 1.7644,
+      "step": 750
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 1.0488134622573853,
+      "learning_rate": 0.0002,
+      "loss": 1.7531,
+      "step": 760
+    },
+    {
+      "epoch": 0.6643658326143227,
+      "grad_norm": 1.12964928150177,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 770
+    },
+    {
+      "epoch": 0.6729939603106126,
+      "grad_norm": 1.0867345333099365,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 780
+    },
+    {
+      "epoch": 0.6816220880069025,
+      "grad_norm": 1.1084282398223877,
+      "learning_rate": 0.0002,
+      "loss": 1.7797,
+      "step": 790
+    },
+    {
+      "epoch": 0.6902502157031924,
+      "grad_norm": 0.9905423521995544,
+      "learning_rate": 0.0002,
+      "loss": 1.7792,
+      "step": 800
+    },
+    {
+      "epoch": 0.6988783433994823,
+      "grad_norm": 1.18604576587677,
+      "learning_rate": 0.0002,
+      "loss": 1.7825,
+      "step": 810
+    },
+    {
+      "epoch": 0.7075064710957722,
+      "grad_norm": 1.0819629430770874,
+      "learning_rate": 0.0002,
+      "loss": 1.8242,
+      "step": 820
+    },
+    {
+      "epoch": 0.7161345987920621,
+      "grad_norm": 2.0091195106506348,
+      "learning_rate": 0.0002,
+      "loss": 1.7916,
+      "step": 830
+    },
+    {
+      "epoch": 0.724762726488352,
+      "grad_norm": 1.0371277332305908,
+      "learning_rate": 0.0002,
+      "loss": 1.8186,
+      "step": 840
+    },
+    {
+      "epoch": 0.7333908541846419,
+      "grad_norm": 1.217102289199829,
+      "learning_rate": 0.0002,
+      "loss": 1.7937,
+      "step": 850
+    },
+    {
+      "epoch": 0.7420189818809318,
+      "grad_norm": 1.0528525114059448,
+      "learning_rate": 0.0002,
+      "loss": 1.7317,
+      "step": 860
+    },
+    {
+      "epoch": 0.7506471095772217,
+      "grad_norm": 1.1398800611495972,
+      "learning_rate": 0.0002,
+      "loss": 1.7757,
+      "step": 870
+    },
+    {
+      "epoch": 0.7592752372735116,
+      "grad_norm": 1.1546001434326172,
+      "learning_rate": 0.0002,
+      "loss": 1.8326,
+      "step": 880
+    },
+    {
+      "epoch": 0.7679033649698016,
+      "grad_norm": 1.0745750665664673,
+      "learning_rate": 0.0002,
+      "loss": 1.7178,
+      "step": 890
+    },
+    {
+      "epoch": 0.7765314926660914,
+      "grad_norm": 1.1739161014556885,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 900
+    },
+    {
+      "epoch": 0.7851596203623814,
+      "grad_norm": 1.1932017803192139,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 910
+    },
+    {
+      "epoch": 0.7937877480586712,
+      "grad_norm": 1.143064022064209,
+      "learning_rate": 0.0002,
+      "loss": 1.7192,
+      "step": 920
+    },
+    {
+      "epoch": 0.8024158757549612,
+      "grad_norm": 1.200974464416504,
+      "learning_rate": 0.0002,
+      "loss": 1.7342,
+      "step": 930
+    },
+    {
+      "epoch": 0.811044003451251,
+      "grad_norm": 1.0878669023513794,
+      "learning_rate": 0.0002,
+      "loss": 1.7399,
+      "step": 940
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 1.0516951084136963,
+      "learning_rate": 0.0002,
+      "loss": 1.8019,
+      "step": 950
+    },
+    {
+      "epoch": 0.8283002588438308,
+      "grad_norm": 1.2017741203308105,
+      "learning_rate": 0.0002,
+      "loss": 1.7645,
+      "step": 960
+    },
+    {
+      "epoch": 0.8369283865401208,
+      "grad_norm": 0.9762169718742371,
+      "learning_rate": 0.0002,
+      "loss": 1.7367,
+      "step": 970
+    },
+    {
+      "epoch": 0.8455565142364107,
+      "grad_norm": 1.0837513208389282,
+      "learning_rate": 0.0002,
+      "loss": 1.7802,
+      "step": 980
+    },
+    {
+      "epoch": 0.8541846419327006,
+      "grad_norm": 1.155504822731018,
+      "learning_rate": 0.0002,
+      "loss": 1.8094,
+      "step": 990
+    },
+    {
+      "epoch": 0.8628127696289906,
+      "grad_norm": 1.067771315574646,
+      "learning_rate": 0.0002,
+      "loss": 1.7633,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8714408973252804,
+      "grad_norm": 1.2283565998077393,
+      "learning_rate": 0.0002,
+      "loss": 1.7993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8800690250215704,
+      "grad_norm": 1.1549772024154663,
+      "learning_rate": 0.0002,
+      "loss": 1.7362,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8886971527178602,
+      "grad_norm": 1.0022625923156738,
+      "learning_rate": 0.0002,
+      "loss": 1.7583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8973252804141502,
+      "grad_norm": 1.0237284898757935,
+      "learning_rate": 0.0002,
+      "loss": 1.7718,
+      "step": 1040
+    },
+    {
+      "epoch": 0.90595340811044,
+      "grad_norm": 1.1863008737564087,
+      "learning_rate": 0.0002,
+      "loss": 1.7457,
+      "step": 1050
+    },
+    {
+      "epoch": 0.91458153580673,
+      "grad_norm": 1.001204013824463,
+      "learning_rate": 0.0002,
+      "loss": 1.6951,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9232096635030198,
+      "grad_norm": 1.2686481475830078,
+      "learning_rate": 0.0002,
+      "loss": 1.7506,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9318377911993098,
+      "grad_norm": 1.0700076818466187,
+      "learning_rate": 0.0002,
+      "loss": 1.7064,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9404659188955996,
+      "grad_norm": 1.05950927734375,
+      "learning_rate": 0.0002,
+      "loss": 1.8015,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9490940465918896,
+      "grad_norm": 0.9669114947319031,
+      "learning_rate": 0.0002,
+      "loss": 1.8155,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9577221742881795,
+      "grad_norm": 1.1823079586029053,
+      "learning_rate": 0.0002,
+      "loss": 1.8074,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9663503019844694,
+      "grad_norm": 1.0857175588607788,
+      "learning_rate": 0.0002,
+      "loss": 1.7636,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9749784296807593,
+      "grad_norm": 1.1258848905563354,
+      "learning_rate": 0.0002,
+      "loss": 1.822,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 1.16336989402771,
+      "learning_rate": 0.0002,
+      "loss": 1.8167,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9922346850733391,
+      "grad_norm": 1.118432879447937,
+      "learning_rate": 0.0002,
+      "loss": 1.7402,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7723218202590942,
+      "eval_runtime": 158.8593,
+      "eval_samples_per_second": 3.355,
+      "eval_steps_per_second": 0.422,
+      "step": 1159
+    },
+    {
+      "epoch": 1.000862812769629,
+      "grad_norm": 1.1056718826293945,
+      "learning_rate": 0.0002,
+      "loss": 1.7863,
+      "step": 1160
+    },
+    {
+      "epoch": 1.009490940465919,
+      "grad_norm": 1.0352667570114136,
+      "learning_rate": 0.0002,
+      "loss": 1.672,
+      "step": 1170
+    },
+    {
+      "epoch": 1.0181190681622088,
+      "grad_norm": 1.0315937995910645,
+      "learning_rate": 0.0002,
+      "loss": 1.6718,
+      "step": 1180
+    },
+    {
+      "epoch": 1.0267471958584986,
+      "grad_norm": 1.369126558303833,
+      "learning_rate": 0.0002,
+      "loss": 1.6937,
+      "step": 1190
+    },
+    {
+      "epoch": 1.0353753235547887,
+      "grad_norm": 1.330876350402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6732,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0440034512510785,
+      "grad_norm": 1.406552791595459,
+      "learning_rate": 0.0002,
+      "loss": 1.6497,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 1.1256251335144043,
+      "learning_rate": 0.0002,
+      "loss": 1.6873,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0612597066436584,
+      "grad_norm": 1.315566897392273,
+      "learning_rate": 0.0002,
+      "loss": 1.6765,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0698878343399483,
+      "grad_norm": 1.2100263833999634,
+      "learning_rate": 0.0002,
+      "loss": 1.6763,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0785159620362381,
+      "grad_norm": 1.2762185335159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6496,
+      "step": 1250
+    },
+    {
+      "epoch": 1.087144089732528,
+      "grad_norm": 1.2971566915512085,
+      "learning_rate": 0.0002,
+      "loss": 1.6826,
+      "step": 1260
+    },
+    {
+      "epoch": 1.095772217428818,
+      "grad_norm": 1.3685089349746704,
+      "learning_rate": 0.0002,
+      "loss": 1.6721,
+      "step": 1270
+    },
+    {
+      "epoch": 1.1044003451251079,
+      "grad_norm": 1.3135347366333008,
+      "learning_rate": 0.0002,
+      "loss": 1.6399,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1130284728213977,
+      "grad_norm": 1.4514861106872559,
+      "learning_rate": 0.0002,
+      "loss": 1.641,
+      "step": 1290
+    },
+    {
+      "epoch": 1.1216566005176876,
+      "grad_norm": 1.5077004432678223,
+      "learning_rate": 0.0002,
+      "loss": 1.6443,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1302847282139776,
+      "grad_norm": 1.4807840585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.6406,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1389128559102675,
+      "grad_norm": 1.2386537790298462,
+      "learning_rate": 0.0002,
+      "loss": 1.7022,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1475409836065573,
+      "grad_norm": 1.2637739181518555,
+      "learning_rate": 0.0002,
+      "loss": 1.6265,
+      "step": 1330
+    },
+    {
+      "epoch": 1.1561691113028472,
+      "grad_norm": 1.2472519874572754,
+      "learning_rate": 0.0002,
+      "loss": 1.7103,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1647972389991372,
+      "grad_norm": 1.290644884109497,
+      "learning_rate": 0.0002,
+      "loss": 1.676,
+      "step": 1350
+    },
+    {
+      "epoch": 1.173425366695427,
+      "grad_norm": 1.3227870464324951,
+      "learning_rate": 0.0002,
+      "loss": 1.6713,
+      "step": 1360
+    },
+    {
+      "epoch": 1.182053494391717,
+      "grad_norm": 1.3311200141906738,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 1370
+    },
+    {
+      "epoch": 1.190681622088007,
+      "grad_norm": 1.2624584436416626,
+      "learning_rate": 0.0002,
+      "loss": 1.6501,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1993097497842968,
+      "grad_norm": 1.4712986946105957,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 1390
+    },
+    {
+      "epoch": 1.2079378774805867,
+      "grad_norm": 1.416508674621582,
+      "learning_rate": 0.0002,
+      "loss": 1.6818,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2165660051768765,
+      "grad_norm": 1.367967963218689,
+      "learning_rate": 0.0002,
+      "loss": 1.7184,
+      "step": 1410
+    },
+    {
+      "epoch": 1.2251941328731666,
+      "grad_norm": 1.3865700960159302,
+      "learning_rate": 0.0002,
+      "loss": 1.6834,
+      "step": 1420
+    },
+    {
+      "epoch": 1.2338222605694564,
+      "grad_norm": 2.076512336730957,
+      "learning_rate": 0.0002,
+      "loss": 1.7532,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2424503882657463,
+      "grad_norm": 1.305572509765625,
+      "learning_rate": 0.0002,
+      "loss": 1.7448,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2510785159620363,
+      "grad_norm": 1.2752642631530762,
+      "learning_rate": 0.0002,
+      "loss": 1.7422,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2597066436583262,
+      "grad_norm": 1.1802726984024048,
+      "learning_rate": 0.0002,
+      "loss": 1.7121,
+      "step": 1460
+    },
+    {
+      "epoch": 1.268334771354616,
+      "grad_norm": 1.2195663452148438,
+      "learning_rate": 0.0002,
+      "loss": 1.7617,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2769628990509059,
+      "grad_norm": 1.3073176145553589,
+      "learning_rate": 0.0002,
+      "loss": 1.6022,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2855910267471957,
+      "grad_norm": 1.2829731702804565,
+      "learning_rate": 0.0002,
+      "loss": 1.6472,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2942191544434858,
+      "grad_norm": 1.361060619354248,
+      "learning_rate": 0.0002,
+      "loss": 1.6076,
+      "step": 1500
+    },
+    {
+      "epoch": 1.3028472821397756,
+      "grad_norm": 1.4285917282104492,
+      "learning_rate": 0.0002,
+      "loss": 1.7059,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3114754098360657,
+      "grad_norm": 1.186866283416748,
+      "learning_rate": 0.0002,
+      "loss": 1.696,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3201035375323555,
+      "grad_norm": 1.2615889310836792,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1530
+    },
+    {
+      "epoch": 1.3287316652286454,
+      "grad_norm": 1.2732815742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5797,
+      "step": 1540
+    },
+    {
+      "epoch": 1.3373597929249352,
+      "grad_norm": 1.4152132272720337,
+      "learning_rate": 0.0002,
+      "loss": 1.6623,
+      "step": 1550
+    },
+    {
+      "epoch": 1.345987920621225,
+      "grad_norm": 1.1730318069458008,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3546160483175151,
+      "grad_norm": 1.2282229661941528,
+      "learning_rate": 0.0002,
+      "loss": 1.7247,
+      "step": 1570
+    },
+    {
+      "epoch": 1.363244176013805,
+      "grad_norm": 1.227974534034729,
+      "learning_rate": 0.0002,
+      "loss": 1.7125,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3718723037100948,
+      "grad_norm": 1.3480374813079834,
+      "learning_rate": 0.0002,
+      "loss": 1.622,
+      "step": 1590
+    },
+    {
+      "epoch": 1.380500431406385,
+      "grad_norm": 1.3460094928741455,
+      "learning_rate": 0.0002,
+      "loss": 1.7126,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3891285591026747,
+      "grad_norm": 1.254465937614441,
+      "learning_rate": 0.0002,
+      "loss": 1.6845,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3977566867989646,
+      "grad_norm": 1.4135496616363525,
+      "learning_rate": 0.0002,
+      "loss": 1.643,
+      "step": 1620
+    },
+    {
+      "epoch": 1.4063848144952544,
+      "grad_norm": 1.277063012123108,
+      "learning_rate": 0.0002,
+      "loss": 1.6392,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4150129421915445,
+      "grad_norm": 1.5031940937042236,
+      "learning_rate": 0.0002,
+      "loss": 1.7338,
+      "step": 1640
+    },
+    {
+      "epoch": 1.4236410698878343,
+      "grad_norm": 1.3918952941894531,
+      "learning_rate": 0.0002,
+      "loss": 1.6229,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4322691975841242,
+      "grad_norm": 1.5893778800964355,
+      "learning_rate": 0.0002,
+      "loss": 1.6893,
+      "step": 1660
+    },
+    {
+      "epoch": 1.4408973252804143,
+      "grad_norm": 1.4636809825897217,
+      "learning_rate": 0.0002,
+      "loss": 1.7129,
+      "step": 1670
+    },
+    {
+      "epoch": 1.449525452976704,
+      "grad_norm": 1.1985419988632202,
+      "learning_rate": 0.0002,
+      "loss": 1.6481,
+      "step": 1680
+    },
+    {
+      "epoch": 1.458153580672994,
+      "grad_norm": 1.509252905845642,
+      "learning_rate": 0.0002,
+      "loss": 1.7322,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4667817083692838,
+      "grad_norm": 1.4157838821411133,
+      "learning_rate": 0.0002,
+      "loss": 1.6653,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4754098360655736,
+      "grad_norm": 1.3481059074401855,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4840379637618637,
+      "grad_norm": 1.4127949476242065,
+      "learning_rate": 0.0002,
+      "loss": 1.6488,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4926660914581535,
+      "grad_norm": 1.3087295293807983,
+      "learning_rate": 0.0002,
+      "loss": 1.6336,
+      "step": 1730
+    },
+    {
+      "epoch": 1.5012942191544436,
+      "grad_norm": 1.4421851634979248,
+      "learning_rate": 0.0002,
+      "loss": 1.7226,
+      "step": 1740
+    },
+    {
+      "epoch": 1.5099223468507335,
+      "grad_norm": 1.3953148126602173,
+      "learning_rate": 0.0002,
+      "loss": 1.7006,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5185504745470233,
+      "grad_norm": 1.4613851308822632,
+      "learning_rate": 0.0002,
+      "loss": 1.6281,
+      "step": 1760
+    },
+    {
+      "epoch": 1.5271786022433131,
+      "grad_norm": 1.2866744995117188,
+      "learning_rate": 0.0002,
+      "loss": 1.6404,
+      "step": 1770
+    },
+    {
+      "epoch": 1.535806729939603,
+      "grad_norm": 1.2769535779953003,
+      "learning_rate": 0.0002,
+      "loss": 1.628,
+      "step": 1780
+    },
+    {
+      "epoch": 1.544434857635893,
+      "grad_norm": 1.371022343635559,
+      "learning_rate": 0.0002,
+      "loss": 1.6439,
+      "step": 1790
+    },
+    {
+      "epoch": 1.553062985332183,
+      "grad_norm": 1.4434700012207031,
+      "learning_rate": 0.0002,
+      "loss": 1.6363,
+      "step": 1800
+    },
+    {
+      "epoch": 1.561691113028473,
+      "grad_norm": 1.269386887550354,
+      "learning_rate": 0.0002,
+      "loss": 1.6606,
+      "step": 1810
+    },
+    {
+      "epoch": 1.5703192407247628,
+      "grad_norm": 1.2668766975402832,
+      "learning_rate": 0.0002,
+      "loss": 1.6493,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5789473684210527,
+      "grad_norm": 1.4857951402664185,
+      "learning_rate": 0.0002,
+      "loss": 1.7124,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5875754961173425,
+      "grad_norm": 1.330338954925537,
+      "learning_rate": 0.0002,
+      "loss": 1.6474,
+      "step": 1840
+    },
+    {
+      "epoch": 1.5962036238136323,
+      "grad_norm": 1.3832308053970337,
+      "learning_rate": 0.0002,
+      "loss": 1.6412,
+      "step": 1850
+    },
+    {
+      "epoch": 1.6048317515099222,
+      "grad_norm": 1.2697869539260864,
+      "learning_rate": 0.0002,
+      "loss": 1.6988,
+      "step": 1860
+    },
+    {
+      "epoch": 1.6134598792062123,
+      "grad_norm": 1.338875412940979,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 1870
+    },
+    {
+      "epoch": 1.6220880069025023,
+      "grad_norm": 1.4077556133270264,
+      "learning_rate": 0.0002,
+      "loss": 1.7319,
+      "step": 1880
+    },
+    {
+      "epoch": 1.6307161345987922,
+      "grad_norm": 1.40274178981781,
+      "learning_rate": 0.0002,
+      "loss": 1.644,
+      "step": 1890
+    },
+    {
+      "epoch": 1.639344262295082,
+      "grad_norm": 1.416042447090149,
+      "learning_rate": 0.0002,
+      "loss": 1.6648,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6479723899913719,
+      "grad_norm": 1.4196866750717163,
+      "learning_rate": 0.0002,
+      "loss": 1.729,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6566005176876617,
+      "grad_norm": 1.378732681274414,
+      "learning_rate": 0.0002,
+      "loss": 1.7381,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6652286453839515,
+      "grad_norm": 1.544751524925232,
+      "learning_rate": 0.0002,
+      "loss": 1.7804,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6738567730802416,
+      "grad_norm": 1.4318190813064575,
+      "learning_rate": 0.0002,
+      "loss": 1.6563,
+      "step": 1940
+    },
+    {
+      "epoch": 1.6824849007765315,
+      "grad_norm": 1.3794575929641724,
+      "learning_rate": 0.0002,
+      "loss": 1.6806,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6911130284728215,
+      "grad_norm": 1.6301822662353516,
+      "learning_rate": 0.0002,
+      "loss": 1.6707,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6997411561691114,
+      "grad_norm": 1.3090870380401611,
+      "learning_rate": 0.0002,
+      "loss": 1.6945,
+      "step": 1970
+    },
+    {
+      "epoch": 1.7083692838654012,
+      "grad_norm": 1.4537303447723389,
+      "learning_rate": 0.0002,
+      "loss": 1.6018,
+      "step": 1980
+    },
+    {
+      "epoch": 1.716997411561691,
+      "grad_norm": 1.3618766069412231,
+      "learning_rate": 0.0002,
+      "loss": 1.7225,
+      "step": 1990
+    },
+    {
+      "epoch": 1.725625539257981,
+      "grad_norm": 1.398790955543518,
+      "learning_rate": 0.0002,
+      "loss": 1.6948,
+      "step": 2000
+    },
+    {
+      "epoch": 1.734253666954271,
+      "grad_norm": 1.4606391191482544,
+      "learning_rate": 0.0002,
+      "loss": 1.6963,
+      "step": 2010
+    },
+    {
+      "epoch": 1.7428817946505608,
+      "grad_norm": 1.602010726928711,
+      "learning_rate": 0.0002,
+      "loss": 1.727,
+      "step": 2020
+    },
+    {
+      "epoch": 1.7515099223468509,
+      "grad_norm": 1.4865907430648804,
+      "learning_rate": 0.0002,
+      "loss": 1.7238,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7601380500431407,
+      "grad_norm": 1.5954750776290894,
+      "learning_rate": 0.0002,
+      "loss": 1.713,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7687661777394306,
+      "grad_norm": 1.3561054468154907,
+      "learning_rate": 0.0002,
+      "loss": 1.6794,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7773943054357204,
+      "grad_norm": 1.4540512561798096,
+      "learning_rate": 0.0002,
+      "loss": 1.7058,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7860224331320103,
+      "grad_norm": 1.2661199569702148,
+      "learning_rate": 0.0002,
+      "loss": 1.6187,
+      "step": 2070
+    },
+    {
+      "epoch": 1.7946505608283,
+      "grad_norm": 2.188016176223755,
+      "learning_rate": 0.0002,
+      "loss": 1.6998,
+      "step": 2080
+    },
+    {
+      "epoch": 1.8032786885245902,
+      "grad_norm": 1.4326417446136475,
+      "learning_rate": 0.0002,
+      "loss": 1.6909,
+      "step": 2090
+    },
+    {
+      "epoch": 1.8119068162208802,
+      "grad_norm": 2.2382805347442627,
+      "learning_rate": 0.0002,
+      "loss": 1.7765,
+      "step": 2100
+    },
+    {
+      "epoch": 1.82053494391717,
+      "grad_norm": 1.396160364151001,
+      "learning_rate": 0.0002,
+      "loss": 1.7034,
+      "step": 2110
+    },
+    {
+      "epoch": 1.82916307161346,
+      "grad_norm": 1.3848069906234741,
+      "learning_rate": 0.0002,
+      "loss": 1.629,
+      "step": 2120
+    },
+    {
+      "epoch": 1.8377911993097498,
+      "grad_norm": 1.6975245475769043,
+      "learning_rate": 0.0002,
+      "loss": 1.6153,
+      "step": 2130
+    },
+    {
+      "epoch": 1.8464193270060396,
+      "grad_norm": 1.476306676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.6631,
+      "step": 2140
+    },
+    {
+      "epoch": 1.8550474547023295,
+      "grad_norm": 1.5690935850143433,
+      "learning_rate": 0.0002,
+      "loss": 1.646,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8636755823986195,
+      "grad_norm": 1.4900702238082886,
+      "learning_rate": 0.0002,
+      "loss": 1.6989,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8723037100949094,
+      "grad_norm": 1.4173238277435303,
+      "learning_rate": 0.0002,
+      "loss": 1.657,
+      "step": 2170
+    },
+    {
+      "epoch": 1.8809318377911994,
+      "grad_norm": 1.3687001466751099,
+      "learning_rate": 0.0002,
+      "loss": 1.6587,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8895599654874893,
+      "grad_norm": 1.371954321861267,
+      "learning_rate": 0.0002,
+      "loss": 1.6209,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8981880931837791,
+      "grad_norm": 1.5397378206253052,
+      "learning_rate": 0.0002,
+      "loss": 1.6749,
+      "step": 2200
+    },
+    {
+      "epoch": 1.906816220880069,
+      "grad_norm": 1.7145664691925049,
+      "learning_rate": 0.0002,
+      "loss": 1.7149,
+      "step": 2210
+    },
+    {
+      "epoch": 1.9154443485763588,
+      "grad_norm": 1.5490705966949463,
+      "learning_rate": 0.0002,
+      "loss": 1.6663,
+      "step": 2220
+    },
+    {
+      "epoch": 1.9240724762726489,
+      "grad_norm": 1.3237485885620117,
+      "learning_rate": 0.0002,
+      "loss": 1.7056,
+      "step": 2230
+    },
+    {
+      "epoch": 1.9327006039689387,
+      "grad_norm": 1.4739165306091309,
+      "learning_rate": 0.0002,
+      "loss": 1.7613,
+      "step": 2240
+    },
+    {
+      "epoch": 1.9413287316652288,
+      "grad_norm": 1.7177914381027222,
+      "learning_rate": 0.0002,
+      "loss": 1.601,
+      "step": 2250
+    },
+    {
+      "epoch": 1.9499568593615186,
+      "grad_norm": 1.3587760925292969,
+      "learning_rate": 0.0002,
+      "loss": 1.6733,
+      "step": 2260
+    },
+    {
+      "epoch": 1.9585849870578085,
+      "grad_norm": 1.3180559873580933,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9672131147540983,
+      "grad_norm": 1.9988678693771362,
+      "learning_rate": 0.0002,
+      "loss": 1.5875,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9758412424503882,
+      "grad_norm": 1.4148619174957275,
+      "learning_rate": 0.0002,
+      "loss": 1.6516,
+      "step": 2290
+    },
+    {
+      "epoch": 1.984469370146678,
+      "grad_norm": 1.6429015398025513,
+      "learning_rate": 0.0002,
+      "loss": 1.6649,
+      "step": 2300
+    },
+    {
+      "epoch": 1.993097497842968,
+      "grad_norm": 1.6742682456970215,
+      "learning_rate": 0.0002,
+      "loss": 1.6504,
+      "step": 2310
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7843003273010254,
+      "eval_runtime": 155.4967,
+      "eval_samples_per_second": 3.428,
+      "eval_steps_per_second": 0.431,
+      "step": 2318
+    },
+    {
+      "epoch": 2.001725625539258,
+      "grad_norm": 1.399217128753662,
+      "learning_rate": 0.0002,
+      "loss": 1.6082,
+      "step": 2320
+    },
+    {
+      "epoch": 2.010353753235548,
+      "grad_norm": 1.7028861045837402,
+      "learning_rate": 0.0002,
+      "loss": 1.4883,
+      "step": 2330
+    },
+    {
+      "epoch": 2.018981880931838,
+      "grad_norm": 1.506859540939331,
+      "learning_rate": 0.0002,
+      "loss": 1.4019,
+      "step": 2340
+    },
+    {
+      "epoch": 2.0276100086281277,
+      "grad_norm": 1.3946882486343384,
+      "learning_rate": 0.0002,
+      "loss": 1.482,
+      "step": 2350
+    },
+    {
+      "epoch": 2.0362381363244175,
+      "grad_norm": 1.5871425867080688,
+      "learning_rate": 0.0002,
+      "loss": 1.5225,
+      "step": 2360
+    },
+    {
+      "epoch": 2.0448662640207074,
+      "grad_norm": 1.636025309562683,
+      "learning_rate": 0.0002,
+      "loss": 1.5915,
+      "step": 2370
+    },
+    {
+      "epoch": 2.053494391716997,
+      "grad_norm": 1.971501111984253,
+      "learning_rate": 0.0002,
+      "loss": 1.5434,
+      "step": 2380
+    },
+    {
+      "epoch": 2.0621225194132875,
+      "grad_norm": 1.5961263179779053,
+      "learning_rate": 0.0002,
+      "loss": 1.5265,
+      "step": 2390
+    },
+    {
+      "epoch": 2.0707506471095773,
+      "grad_norm": 1.4916940927505493,
+      "learning_rate": 0.0002,
+      "loss": 1.446,
+      "step": 2400
+    },
+    {
+      "epoch": 2.079378774805867,
+      "grad_norm": 1.6255263090133667,
+      "learning_rate": 0.0002,
+      "loss": 1.528,
+      "step": 2410
+    },
+    {
+      "epoch": 2.088006902502157,
+      "grad_norm": 1.9251011610031128,
+      "learning_rate": 0.0002,
+      "loss": 1.6365,
+      "step": 2420
+    },
+    {
+      "epoch": 2.096635030198447,
+      "grad_norm": 1.6198536157608032,
+      "learning_rate": 0.0002,
+      "loss": 1.5883,
+      "step": 2430
+    },
+    {
+      "epoch": 2.1052631578947367,
+      "grad_norm": 1.6935237646102905,
+      "learning_rate": 0.0002,
+      "loss": 1.4984,
+      "step": 2440
+    },
+    {
+      "epoch": 2.1138912855910266,
+      "grad_norm": 1.5107334852218628,
+      "learning_rate": 0.0002,
+      "loss": 1.5477,
+      "step": 2450
+    },
+    {
+      "epoch": 2.122519413287317,
+      "grad_norm": 1.801699161529541,
+      "learning_rate": 0.0002,
+      "loss": 1.4898,
+      "step": 2460
+    },
+    {
+      "epoch": 2.1311475409836067,
+      "grad_norm": 1.6194193363189697,
+      "learning_rate": 0.0002,
+      "loss": 1.5471,
+      "step": 2470
+    },
+    {
+      "epoch": 2.1397756686798965,
+      "grad_norm": 1.896286964416504,
+      "learning_rate": 0.0002,
+      "loss": 1.4619,
+      "step": 2480
+    },
+    {
+      "epoch": 2.1484037963761864,
+      "grad_norm": 1.9456146955490112,
+      "learning_rate": 0.0002,
+      "loss": 1.5496,
+      "step": 2490
+    },
+    {
+      "epoch": 2.1570319240724762,
+      "grad_norm": 23.566476821899414,
+      "learning_rate": 0.0002,
+      "loss": 1.5449,
+      "step": 2500
+    },
+    {
+      "epoch": 2.165660051768766,
+      "grad_norm": 1.7737925052642822,
+      "learning_rate": 0.0002,
+      "loss": 1.5675,
+      "step": 2510
+    },
+    {
+      "epoch": 2.174288179465056,
+      "grad_norm": 1.7305291891098022,
+      "learning_rate": 0.0002,
+      "loss": 1.4775,
+      "step": 2520
+    },
+    {
+      "epoch": 2.1829163071613458,
+      "grad_norm": 2.130882978439331,
+      "learning_rate": 0.0002,
+      "loss": 1.5051,
+      "step": 2530
+    },
+    {
+      "epoch": 2.191544434857636,
+      "grad_norm": 1.790124535560608,
+      "learning_rate": 0.0002,
+      "loss": 1.4675,
+      "step": 2540
+    },
+    {
+      "epoch": 2.200172562553926,
+      "grad_norm": 1.8408042192459106,
+      "learning_rate": 0.0002,
+      "loss": 1.5208,
+      "step": 2550
+    },
+    {
+      "epoch": 2.2088006902502157,
+      "grad_norm": 1.7635295391082764,
+      "learning_rate": 0.0002,
+      "loss": 1.4732,
+      "step": 2560
+    },
+    {
+      "epoch": 2.2174288179465056,
+      "grad_norm": 1.7026700973510742,
+      "learning_rate": 0.0002,
+      "loss": 1.4604,
+      "step": 2570
+    },
+    {
+      "epoch": 2.2260569456427954,
+      "grad_norm": 1.881218433380127,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 2580
+    },
+    {
+      "epoch": 2.2346850733390853,
+      "grad_norm": 1.9007751941680908,
+      "learning_rate": 0.0002,
+      "loss": 1.4422,
+      "step": 2590
+    },
+    {
+      "epoch": 2.243313201035375,
+      "grad_norm": 1.7862553596496582,
+      "learning_rate": 0.0002,
+      "loss": 1.4695,
+      "step": 2600
+    },
+    {
+      "epoch": 2.2519413287316654,
+      "grad_norm": 1.7117811441421509,
+      "learning_rate": 0.0002,
+      "loss": 1.4731,
+      "step": 2610
+    },
+    {
+      "epoch": 2.2605694564279553,
+      "grad_norm": 1.7809374332427979,
+      "learning_rate": 0.0002,
+      "loss": 1.4951,
+      "step": 2620
+    },
+    {
+      "epoch": 2.269197584124245,
+      "grad_norm": 1.7089564800262451,
+      "learning_rate": 0.0002,
+      "loss": 1.4744,
+      "step": 2630
+    },
+    {
+      "epoch": 2.277825711820535,
+      "grad_norm": 1.7662888765335083,
+      "learning_rate": 0.0002,
+      "loss": 1.5186,
+      "step": 2640
+    },
+    {
+      "epoch": 2.286453839516825,
+      "grad_norm": 1.8892756700515747,
+      "learning_rate": 0.0002,
+      "loss": 1.5468,
+      "step": 2650
+    },
+    {
+      "epoch": 2.2950819672131146,
+      "grad_norm": 1.678238034248352,
+      "learning_rate": 0.0002,
+      "loss": 1.5266,
+      "step": 2660
+    },
+    {
+      "epoch": 2.3037100949094045,
+      "grad_norm": 1.865786075592041,
+      "learning_rate": 0.0002,
+      "loss": 1.4897,
+      "step": 2670
+    },
+    {
+      "epoch": 2.3123382226056943,
+      "grad_norm": 1.9744012355804443,
+      "learning_rate": 0.0002,
+      "loss": 1.5578,
+      "step": 2680
+    },
+    {
+      "epoch": 2.3209663503019846,
+      "grad_norm": 1.884690284729004,
+      "learning_rate": 0.0002,
+      "loss": 1.5021,
+      "step": 2690
+    },
+    {
+      "epoch": 2.3295944779982745,
+      "grad_norm": 1.6391639709472656,
+      "learning_rate": 0.0002,
+      "loss": 1.6071,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3382226056945643,
+      "grad_norm": 1.7777862548828125,
+      "learning_rate": 0.0002,
+      "loss": 1.5721,
+      "step": 2710
+    },
+    {
+      "epoch": 2.346850733390854,
+      "grad_norm": 1.6615192890167236,
+      "learning_rate": 0.0002,
+      "loss": 1.5633,
+      "step": 2720
+    },
+    {
+      "epoch": 2.355478861087144,
+      "grad_norm": 2.2202742099761963,
+      "learning_rate": 0.0002,
+      "loss": 1.5213,
+      "step": 2730
+    },
+    {
+      "epoch": 2.364106988783434,
+      "grad_norm": 2.1986732482910156,
+      "learning_rate": 0.0002,
+      "loss": 1.5443,
+      "step": 2740
+    },
+    {
+      "epoch": 2.372735116479724,
+      "grad_norm": 1.7847017049789429,
+      "learning_rate": 0.0002,
+      "loss": 1.5834,
+      "step": 2750
+    },
+    {
+      "epoch": 2.381363244176014,
+      "grad_norm": 1.8832756280899048,
+      "learning_rate": 0.0002,
+      "loss": 1.4946,
+      "step": 2760
+    },
+    {
+      "epoch": 2.389991371872304,
+      "grad_norm": 1.8374940156936646,
+      "learning_rate": 0.0002,
+      "loss": 1.5725,
+      "step": 2770
+    },
+    {
+      "epoch": 2.3986194995685937,
+      "grad_norm": 1.741965413093567,
+      "learning_rate": 0.0002,
+      "loss": 1.5181,
+      "step": 2780
+    },
+    {
+      "epoch": 2.4072476272648835,
+      "grad_norm": 1.789699673652649,
+      "learning_rate": 0.0002,
+      "loss": 1.5571,
+      "step": 2790
+    },
+    {
+      "epoch": 2.4158757549611733,
+      "grad_norm": 2.0495948791503906,
+      "learning_rate": 0.0002,
+      "loss": 1.4763,
+      "step": 2800
+    },
+    {
+      "epoch": 2.424503882657463,
+      "grad_norm": 1.7399765253067017,
+      "learning_rate": 0.0002,
+      "loss": 1.5129,
+      "step": 2810
+    },
+    {
+      "epoch": 2.433132010353753,
+      "grad_norm": 1.9142578840255737,
+      "learning_rate": 0.0002,
+      "loss": 1.556,
+      "step": 2820
+    },
+    {
+      "epoch": 2.4417601380500433,
+      "grad_norm": 1.920663595199585,
+      "learning_rate": 0.0002,
+      "loss": 1.4848,
+      "step": 2830
+    },
+    {
+      "epoch": 2.450388265746333,
+      "grad_norm": 1.7982150316238403,
+      "learning_rate": 0.0002,
+      "loss": 1.5411,
+      "step": 2840
+    },
+    {
+      "epoch": 2.459016393442623,
+      "grad_norm": 1.7665464878082275,
+      "learning_rate": 0.0002,
+      "loss": 1.5802,
+      "step": 2850
+    },
+    {
+      "epoch": 2.467644521138913,
+      "grad_norm": 1.9115102291107178,
+      "learning_rate": 0.0002,
+      "loss": 1.5433,
+      "step": 2860
+    },
+    {
+      "epoch": 2.4762726488352027,
+      "grad_norm": 1.9024899005889893,
+      "learning_rate": 0.0002,
+      "loss": 1.4518,
+      "step": 2870
+    },
+    {
+      "epoch": 2.4849007765314925,
+      "grad_norm": 1.7804782390594482,
+      "learning_rate": 0.0002,
+      "loss": 1.4797,
+      "step": 2880
+    },
+    {
+      "epoch": 2.4935289042277824,
+      "grad_norm": 2.0264487266540527,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 2890
+    },
+    {
+      "epoch": 2.5021570319240727,
+      "grad_norm": 1.8650445938110352,
+      "learning_rate": 0.0002,
+      "loss": 1.4455,
+      "step": 2900
+    },
+    {
+      "epoch": 2.5107851596203625,
+      "grad_norm": 2.0831475257873535,
+      "learning_rate": 0.0002,
+      "loss": 1.54,
+      "step": 2910
+    },
+    {
+      "epoch": 2.5194132873166524,
+      "grad_norm": 1.9633755683898926,
+      "learning_rate": 0.0002,
+      "loss": 1.6014,
+      "step": 2920
+    },
+    {
+      "epoch": 2.528041415012942,
+      "grad_norm": 2.2055106163024902,
+      "learning_rate": 0.0002,
+      "loss": 1.56,
+      "step": 2930
+    },
+    {
+      "epoch": 2.536669542709232,
+      "grad_norm": 2.1060245037078857,
+      "learning_rate": 0.0002,
+      "loss": 1.492,
+      "step": 2940
+    },
+    {
+      "epoch": 2.545297670405522,
+      "grad_norm": 2.0236003398895264,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 2950
+    },
+    {
+      "epoch": 2.5539257981018118,
+      "grad_norm": 1.898287296295166,
+      "learning_rate": 0.0002,
+      "loss": 1.5186,
+      "step": 2960
+    },
+    {
+      "epoch": 2.5625539257981016,
+      "grad_norm": 1.9526840448379517,
+      "learning_rate": 0.0002,
+      "loss": 1.5441,
+      "step": 2970
+    },
+    {
+      "epoch": 2.5711820534943914,
+      "grad_norm": 1.9538743495941162,
+      "learning_rate": 0.0002,
+      "loss": 1.5608,
+      "step": 2980
+    },
+    {
+      "epoch": 2.5798101811906817,
+      "grad_norm": 1.787394404411316,
+      "learning_rate": 0.0002,
+      "loss": 1.4356,
+      "step": 2990
+    },
+    {
+      "epoch": 2.5884383088869716,
+      "grad_norm": 2.0792672634124756,
+      "learning_rate": 0.0002,
+      "loss": 1.5096,
+      "step": 3000
+    },
+    {
+      "epoch": 2.5970664365832614,
+      "grad_norm": 1.760083556175232,
+      "learning_rate": 0.0002,
+      "loss": 1.5131,
+      "step": 3010
+    },
+    {
+      "epoch": 2.6056945642795513,
+      "grad_norm": 1.8766807317733765,
+      "learning_rate": 0.0002,
+      "loss": 1.5553,
+      "step": 3020
+    },
+    {
+      "epoch": 2.614322691975841,
+      "grad_norm": 1.9650694131851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5381,
+      "step": 3030
+    },
+    {
+      "epoch": 2.6229508196721314,
+      "grad_norm": 1.8143510818481445,
+      "learning_rate": 0.0002,
+      "loss": 1.5263,
+      "step": 3040
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 2.5094006061553955,
+      "learning_rate": 0.0002,
+      "loss": 1.5187,
+      "step": 3050
+    },
+    {
+      "epoch": 2.640207075064711,
+      "grad_norm": 1.852913737297058,
+      "learning_rate": 0.0002,
+      "loss": 1.4729,
+      "step": 3060
+    },
+    {
+      "epoch": 2.648835202761001,
+      "grad_norm": 2.052318811416626,
+      "learning_rate": 0.0002,
+      "loss": 1.5563,
+      "step": 3070
+    },
+    {
+      "epoch": 2.6574633304572908,
+      "grad_norm": 1.8995426893234253,
+      "learning_rate": 0.0002,
+      "loss": 1.5543,
+      "step": 3080
+    },
+    {
+      "epoch": 2.6660914581535806,
+      "grad_norm": 1.979037880897522,
+      "learning_rate": 0.0002,
+      "loss": 1.5357,
+      "step": 3090
+    },
+    {
+      "epoch": 2.6747195858498705,
+      "grad_norm": 1.8179038763046265,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 3100
+    },
+    {
+      "epoch": 2.6833477135461603,
+      "grad_norm": 1.8502779006958008,
+      "learning_rate": 0.0002,
+      "loss": 1.5929,
+      "step": 3110
+    },
+    {
+      "epoch": 2.69197584124245,
+      "grad_norm": 2.0174338817596436,
+      "learning_rate": 0.0002,
+      "loss": 1.5139,
+      "step": 3120
+    },
+    {
+      "epoch": 2.7006039689387404,
+      "grad_norm": 2.1845622062683105,
+      "learning_rate": 0.0002,
+      "loss": 1.5609,
+      "step": 3130
+    },
+    {
+      "epoch": 2.7092320966350303,
+      "grad_norm": 2.1443305015563965,
+      "learning_rate": 0.0002,
+      "loss": 1.5083,
+      "step": 3140
+    },
+    {
+      "epoch": 2.71786022433132,
+      "grad_norm": 2.057907819747925,
+      "learning_rate": 0.0002,
+      "loss": 1.5856,
+      "step": 3150
+    },
+    {
+      "epoch": 2.72648835202761,
+      "grad_norm": 1.9795310497283936,
+      "learning_rate": 0.0002,
+      "loss": 1.5298,
+      "step": 3160
+    },
+    {
+      "epoch": 2.7351164797239,
+      "grad_norm": 1.9476630687713623,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 3170
+    },
+    {
+      "epoch": 2.7437446074201897,
+      "grad_norm": 1.9144753217697144,
+      "learning_rate": 0.0002,
+      "loss": 1.5884,
+      "step": 3180
+    },
+    {
+      "epoch": 2.75237273511648,
+      "grad_norm": 2.0273289680480957,
+      "learning_rate": 0.0002,
+      "loss": 1.554,
+      "step": 3190
+    },
+    {
+      "epoch": 2.76100086281277,
+      "grad_norm": 1.9641752243041992,
+      "learning_rate": 0.0002,
+      "loss": 1.6172,
+      "step": 3200
+    },
+    {
+      "epoch": 2.7696289905090596,
+      "grad_norm": 1.721760630607605,
+      "learning_rate": 0.0002,
+      "loss": 1.525,
+      "step": 3210
+    },
+    {
+      "epoch": 2.7782571182053495,
+      "grad_norm": 1.8093656301498413,
+      "learning_rate": 0.0002,
+      "loss": 1.5414,
+      "step": 3220
+    },
+    {
+      "epoch": 2.7868852459016393,
+      "grad_norm": 1.907056212425232,
+      "learning_rate": 0.0002,
+      "loss": 1.544,
+      "step": 3230
+    },
+    {
+      "epoch": 2.795513373597929,
+      "grad_norm": 2.0488245487213135,
+      "learning_rate": 0.0002,
+      "loss": 1.5911,
+      "step": 3240
+    },
+    {
+      "epoch": 2.804141501294219,
+      "grad_norm": 2.161618232727051,
+      "learning_rate": 0.0002,
+      "loss": 1.5548,
+      "step": 3250
+    },
+    {
+      "epoch": 2.812769628990509,
+      "grad_norm": 1.8043134212493896,
+      "learning_rate": 0.0002,
+      "loss": 1.5549,
+      "step": 3260
+    },
+    {
+      "epoch": 2.8213977566867987,
+      "grad_norm": 1.879629373550415,
+      "learning_rate": 0.0002,
+      "loss": 1.5883,
+      "step": 3270
+    },
+    {
+      "epoch": 2.830025884383089,
+      "grad_norm": 1.9248288869857788,
+      "learning_rate": 0.0002,
+      "loss": 1.5424,
+      "step": 3280
+    },
+    {
+      "epoch": 2.838654012079379,
+      "grad_norm": 1.9379483461380005,
+      "learning_rate": 0.0002,
+      "loss": 1.5166,
+      "step": 3290
+    },
+    {
+      "epoch": 2.8472821397756687,
+      "grad_norm": 1.7068989276885986,
+      "learning_rate": 0.0002,
+      "loss": 1.5575,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8559102674719585,
+      "grad_norm": 1.8729630708694458,
+      "learning_rate": 0.0002,
+      "loss": 1.5513,
+      "step": 3310
+    },
+    {
+      "epoch": 2.8645383951682484,
+      "grad_norm": 1.7893825769424438,
+      "learning_rate": 0.0002,
+      "loss": 1.4364,
+      "step": 3320
+    },
+    {
+      "epoch": 2.8731665228645387,
+      "grad_norm": 1.9462252855300903,
+      "learning_rate": 0.0002,
+      "loss": 1.5439,
+      "step": 3330
+    },
+    {
+      "epoch": 2.8817946505608285,
+      "grad_norm": 1.9320255517959595,
+      "learning_rate": 0.0002,
+      "loss": 1.5171,
+      "step": 3340
+    },
+    {
+      "epoch": 2.8904227782571184,
+      "grad_norm": 1.9695475101470947,
+      "learning_rate": 0.0002,
+      "loss": 1.5695,
+      "step": 3350
+    },
+    {
+      "epoch": 2.899050905953408,
+      "grad_norm": 2.01279354095459,
+      "learning_rate": 0.0002,
+      "loss": 1.5418,
+      "step": 3360
+    },
+    {
+      "epoch": 2.907679033649698,
+      "grad_norm": 1.992236852645874,
+      "learning_rate": 0.0002,
+      "loss": 1.5559,
+      "step": 3370
+    },
+    {
+      "epoch": 2.916307161345988,
+      "grad_norm": 2.3763930797576904,
+      "learning_rate": 0.0002,
+      "loss": 1.5873,
+      "step": 3380
+    },
+    {
+      "epoch": 2.9249352890422777,
+      "grad_norm": 1.91392982006073,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 3390
+    },
+    {
+      "epoch": 2.9335634167385676,
+      "grad_norm": 1.969994306564331,
+      "learning_rate": 0.0002,
+      "loss": 1.5317,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9421915444348574,
+      "grad_norm": 1.9397379159927368,
+      "learning_rate": 0.0002,
+      "loss": 1.4554,
+      "step": 3410
+    },
+    {
+      "epoch": 2.9508196721311473,
+      "grad_norm": 2.1597039699554443,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 3420
+    },
+    {
+      "epoch": 2.9594477998274376,
+      "grad_norm": 1.9564080238342285,
+      "learning_rate": 0.0002,
+      "loss": 1.6098,
+      "step": 3430
+    },
+    {
+      "epoch": 2.9680759275237274,
+      "grad_norm": 1.8007603883743286,
+      "learning_rate": 0.0002,
+      "loss": 1.603,
+      "step": 3440
+    },
+    {
+      "epoch": 2.9767040552200172,
+      "grad_norm": 2.5556256771087646,
+      "learning_rate": 0.0002,
+      "loss": 1.5593,
+      "step": 3450
+    },
+    {
+      "epoch": 2.985332182916307,
+      "grad_norm": 1.96817147731781,
+      "learning_rate": 0.0002,
+      "loss": 1.5564,
+      "step": 3460
+    },
+    {
+      "epoch": 2.993960310612597,
+      "grad_norm": 1.921637773513794,
+      "learning_rate": 0.0002,
+      "loss": 1.5664,
+      "step": 3470
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8276220560073853,
+      "eval_runtime": 148.7597,
+      "eval_samples_per_second": 3.583,
+      "eval_steps_per_second": 0.45,
+      "step": 3477
+    },
+    {
+      "epoch": 3.0025884383088868,
+      "grad_norm": 1.6692646741867065,
+      "learning_rate": 0.0002,
+      "loss": 1.5993,
+      "step": 3480
+    },
+    {
+      "epoch": 3.011216566005177,
+      "grad_norm": 2.7466068267822266,
+      "learning_rate": 0.0002,
+      "loss": 1.3835,
+      "step": 3490
+    },
+    {
+      "epoch": 3.019844693701467,
+      "grad_norm": 2.2054216861724854,
+      "learning_rate": 0.0002,
+      "loss": 1.4099,
+      "step": 3500
+    },
+    {
+      "epoch": 3.0284728213977568,
+      "grad_norm": 2.306040048599243,
+      "learning_rate": 0.0002,
+      "loss": 1.3087,
+      "step": 3510
+    },
+    {
+      "epoch": 3.0371009490940466,
+      "grad_norm": 3.224613666534424,
+      "learning_rate": 0.0002,
+      "loss": 1.3396,
+      "step": 3520
+    },
+    {
+      "epoch": 3.0457290767903364,
+      "grad_norm": 2.1571338176727295,
+      "learning_rate": 0.0002,
+      "loss": 1.2672,
+      "step": 3530
+    },
+    {
+      "epoch": 3.0543572044866263,
+      "grad_norm": 2.585041046142578,
+      "learning_rate": 0.0002,
+      "loss": 1.2956,
+      "step": 3540
+    },
+    {
+      "epoch": 3.062985332182916,
+      "grad_norm": 2.3463659286499023,
+      "learning_rate": 0.0002,
+      "loss": 1.3168,
+      "step": 3550
+    },
+    {
+      "epoch": 3.0716134598792064,
+      "grad_norm": 2.5111236572265625,
+      "learning_rate": 0.0002,
+      "loss": 1.3237,
+      "step": 3560
+    },
+    {
+      "epoch": 3.0802415875754963,
+      "grad_norm": 2.797116994857788,
+      "learning_rate": 0.0002,
+      "loss": 1.3894,
+      "step": 3570
+    },
+    {
+      "epoch": 3.088869715271786,
+      "grad_norm": 2.4545280933380127,
+      "learning_rate": 0.0002,
+      "loss": 1.3185,
+      "step": 3580
+    },
+    {
+      "epoch": 3.097497842968076,
+      "grad_norm": 2.846592664718628,
+      "learning_rate": 0.0002,
+      "loss": 1.32,
+      "step": 3590
+    },
+    {
+      "epoch": 3.106125970664366,
+      "grad_norm": 2.571178913116455,
+      "learning_rate": 0.0002,
+      "loss": 1.3508,
+      "step": 3600
+    },
+    {
+      "epoch": 3.1147540983606556,
+      "grad_norm": 2.4101171493530273,
+      "learning_rate": 0.0002,
+      "loss": 1.312,
+      "step": 3610
+    },
+    {
+      "epoch": 3.1233822260569455,
+      "grad_norm": 2.7548887729644775,
+      "learning_rate": 0.0002,
+      "loss": 1.3978,
+      "step": 3620
+    },
+    {
+      "epoch": 3.1320103537532358,
+      "grad_norm": 2.4694085121154785,
+      "learning_rate": 0.0002,
+      "loss": 1.3766,
+      "step": 3630
+    },
+    {
+      "epoch": 3.1406384814495256,
+      "grad_norm": 3.227698802947998,
+      "learning_rate": 0.0002,
+      "loss": 1.3498,
+      "step": 3640
+    },
+    {
+      "epoch": 3.1492666091458155,
+      "grad_norm": 2.5243587493896484,
+      "learning_rate": 0.0002,
+      "loss": 1.3334,
+      "step": 3650
+    },
+    {
+      "epoch": 3.1578947368421053,
+      "grad_norm": 2.1482925415039062,
+      "learning_rate": 0.0002,
+      "loss": 1.3546,
+      "step": 3660
+    },
+    {
+      "epoch": 3.166522864538395,
+      "grad_norm": 2.366222858428955,
+      "learning_rate": 0.0002,
+      "loss": 1.3438,
+      "step": 3670
+    },
+    {
+      "epoch": 3.175150992234685,
+      "grad_norm": 3.0339198112487793,
+      "learning_rate": 0.0002,
+      "loss": 1.3117,
+      "step": 3680
+    },
+    {
+      "epoch": 3.183779119930975,
+      "grad_norm": 2.4929068088531494,
+      "learning_rate": 0.0002,
+      "loss": 1.2785,
+      "step": 3690
+    },
+    {
+      "epoch": 3.1924072476272647,
+      "grad_norm": 2.526604652404785,
+      "learning_rate": 0.0002,
+      "loss": 1.3008,
+      "step": 3700
+    },
+    {
+      "epoch": 3.201035375323555,
+      "grad_norm": 2.414598226547241,
+      "learning_rate": 0.0002,
+      "loss": 1.2952,
+      "step": 3710
+    },
+    {
+      "epoch": 3.209663503019845,
+      "grad_norm": 2.5312447547912598,
+      "learning_rate": 0.0002,
+      "loss": 1.3369,
+      "step": 3720
+    },
+    {
+      "epoch": 3.2182916307161347,
+      "grad_norm": 2.694946527481079,
+      "learning_rate": 0.0002,
+      "loss": 1.2423,
+      "step": 3730
+    },
+    {
+      "epoch": 3.2269197584124245,
+      "grad_norm": 2.4538211822509766,
+      "learning_rate": 0.0002,
+      "loss": 1.3589,
+      "step": 3740
+    },
+    {
+      "epoch": 3.2355478861087144,
+      "grad_norm": 3.2487967014312744,
+      "learning_rate": 0.0002,
+      "loss": 1.3872,
+      "step": 3750
+    },
+    {
+      "epoch": 3.244176013805004,
+      "grad_norm": 2.8900558948516846,
+      "learning_rate": 0.0002,
+      "loss": 1.3019,
+      "step": 3760
+    },
+    {
+      "epoch": 3.252804141501294,
+      "grad_norm": 2.325157880783081,
+      "learning_rate": 0.0002,
+      "loss": 1.3032,
+      "step": 3770
+    },
+    {
+      "epoch": 3.2614322691975843,
+      "grad_norm": 2.497964859008789,
+      "learning_rate": 0.0002,
+      "loss": 1.3332,
+      "step": 3780
+    },
+    {
+      "epoch": 3.270060396893874,
+      "grad_norm": 2.4324586391448975,
+      "learning_rate": 0.0002,
+      "loss": 1.3886,
+      "step": 3790
+    },
+    {
+      "epoch": 3.278688524590164,
+      "grad_norm": 2.3308541774749756,
+      "learning_rate": 0.0002,
+      "loss": 1.3841,
+      "step": 3800
+    },
+    {
+      "epoch": 3.287316652286454,
+      "grad_norm": 2.8938093185424805,
+      "learning_rate": 0.0002,
+      "loss": 1.3948,
+      "step": 3810
+    },
+    {
+      "epoch": 3.2959447799827437,
+      "grad_norm": 2.744821548461914,
+      "learning_rate": 0.0002,
+      "loss": 1.3282,
+      "step": 3820
+    },
+    {
+      "epoch": 3.3045729076790336,
+      "grad_norm": 2.3175134658813477,
+      "learning_rate": 0.0002,
+      "loss": 1.3536,
+      "step": 3830
+    },
+    {
+      "epoch": 3.3132010353753234,
+      "grad_norm": 2.424309730529785,
+      "learning_rate": 0.0002,
+      "loss": 1.3267,
+      "step": 3840
+    },
+    {
+      "epoch": 3.3218291630716132,
+      "grad_norm": 2.7309727668762207,
+      "learning_rate": 0.0002,
+      "loss": 1.3312,
+      "step": 3850
+    },
+    {
+      "epoch": 3.3304572907679035,
+      "grad_norm": 2.4642956256866455,
+      "learning_rate": 0.0002,
+      "loss": 1.2985,
+      "step": 3860
+    },
+    {
+      "epoch": 3.3390854184641934,
+      "grad_norm": 2.586395025253296,
+      "learning_rate": 0.0002,
+      "loss": 1.3757,
+      "step": 3870
+    },
+    {
+      "epoch": 3.3477135461604832,
+      "grad_norm": 2.6067605018615723,
+      "learning_rate": 0.0002,
+      "loss": 1.3359,
+      "step": 3880
+    },
+    {
+      "epoch": 3.356341673856773,
+      "grad_norm": 2.7179007530212402,
+      "learning_rate": 0.0002,
+      "loss": 1.3257,
+      "step": 3890
+    },
+    {
+      "epoch": 3.364969801553063,
+      "grad_norm": 2.8866937160491943,
+      "learning_rate": 0.0002,
+      "loss": 1.3374,
+      "step": 3900
+    },
+    {
+      "epoch": 3.3735979292493528,
+      "grad_norm": 3.0192813873291016,
+      "learning_rate": 0.0002,
+      "loss": 1.3636,
+      "step": 3910
+    },
+    {
+      "epoch": 3.382226056945643,
+      "grad_norm": 2.9578323364257812,
+      "learning_rate": 0.0002,
+      "loss": 1.3554,
+      "step": 3920
+    },
+    {
+      "epoch": 3.390854184641933,
+      "grad_norm": 2.4524383544921875,
+      "learning_rate": 0.0002,
+      "loss": 1.3838,
+      "step": 3930
+    },
+    {
+      "epoch": 3.3994823123382227,
+      "grad_norm": 2.681588649749756,
+      "learning_rate": 0.0002,
+      "loss": 1.2733,
+      "step": 3940
+    },
+    {
+      "epoch": 3.4081104400345126,
+      "grad_norm": 2.594383716583252,
+      "learning_rate": 0.0002,
+      "loss": 1.32,
+      "step": 3950
+    },
+    {
+      "epoch": 3.4167385677308024,
+      "grad_norm": 2.364607810974121,
+      "learning_rate": 0.0002,
+      "loss": 1.313,
+      "step": 3960
+    },
+    {
+      "epoch": 3.4253666954270923,
+      "grad_norm": 2.327899217605591,
+      "learning_rate": 0.0002,
+      "loss": 1.4074,
+      "step": 3970
+    },
+    {
+      "epoch": 3.433994823123382,
+      "grad_norm": 2.6864054203033447,
+      "learning_rate": 0.0002,
+      "loss": 1.3655,
+      "step": 3980
+    },
+    {
+      "epoch": 3.442622950819672,
+      "grad_norm": 2.8951292037963867,
+      "learning_rate": 0.0002,
+      "loss": 1.4381,
+      "step": 3990
+    },
+    {
+      "epoch": 3.451251078515962,
+      "grad_norm": 2.7218570709228516,
+      "learning_rate": 0.0002,
+      "loss": 1.3515,
+      "step": 4000
+    },
+    {
+      "epoch": 3.459879206212252,
+      "grad_norm": 2.7867014408111572,
+      "learning_rate": 0.0002,
+      "loss": 1.2841,
+      "step": 4010
+    },
+    {
+      "epoch": 3.468507333908542,
+      "grad_norm": 2.967764377593994,
+      "learning_rate": 0.0002,
+      "loss": 1.3812,
+      "step": 4020
+    },
+    {
+      "epoch": 3.477135461604832,
+      "grad_norm": 2.274709701538086,
+      "learning_rate": 0.0002,
+      "loss": 1.3473,
+      "step": 4030
+    },
+    {
+      "epoch": 3.4857635893011216,
+      "grad_norm": 2.348278522491455,
+      "learning_rate": 0.0002,
+      "loss": 1.3881,
+      "step": 4040
+    },
+    {
+      "epoch": 3.4943917169974115,
+      "grad_norm": 2.4520280361175537,
+      "learning_rate": 0.0002,
+      "loss": 1.3272,
+      "step": 4050
+    },
+    {
+      "epoch": 3.5030198446937013,
+      "grad_norm": 2.3606009483337402,
+      "learning_rate": 0.0002,
+      "loss": 1.3729,
+      "step": 4060
+    },
+    {
+      "epoch": 3.5116479723899916,
+      "grad_norm": 2.5735526084899902,
+      "learning_rate": 0.0002,
+      "loss": 1.3608,
+      "step": 4070
+    },
+    {
+      "epoch": 3.5202761000862814,
+      "grad_norm": 2.7478349208831787,
+      "learning_rate": 0.0002,
+      "loss": 1.417,
+      "step": 4080
+    },
+    {
+      "epoch": 3.5289042277825713,
+      "grad_norm": 2.5038864612579346,
+      "learning_rate": 0.0002,
+      "loss": 1.3551,
+      "step": 4090
+    },
+    {
+      "epoch": 3.537532355478861,
+      "grad_norm": 2.8981692790985107,
+      "learning_rate": 0.0002,
+      "loss": 1.3656,
+      "step": 4100
+    },
+    {
+      "epoch": 3.546160483175151,
+      "grad_norm": 2.4158923625946045,
+      "learning_rate": 0.0002,
+      "loss": 1.3517,
+      "step": 4110
+    },
+    {
+      "epoch": 3.554788610871441,
+      "grad_norm": 2.8376917839050293,
+      "learning_rate": 0.0002,
+      "loss": 1.3701,
+      "step": 4120
+    },
+    {
+      "epoch": 3.5634167385677307,
+      "grad_norm": 2.419693946838379,
+      "learning_rate": 0.0002,
+      "loss": 1.3902,
+      "step": 4130
+    },
+    {
+      "epoch": 3.5720448662640205,
+      "grad_norm": 2.6899116039276123,
+      "learning_rate": 0.0002,
+      "loss": 1.3598,
+      "step": 4140
+    },
+    {
+      "epoch": 3.5806729939603104,
+      "grad_norm": 2.3880856037139893,
+      "learning_rate": 0.0002,
+      "loss": 1.2996,
+      "step": 4150
+    },
+    {
+      "epoch": 3.5893011216566006,
+      "grad_norm": 2.90229868888855,
+      "learning_rate": 0.0002,
+      "loss": 1.3822,
+      "step": 4160
+    },
+    {
+      "epoch": 3.5979292493528905,
+      "grad_norm": 2.4554741382598877,
+      "learning_rate": 0.0002,
+      "loss": 1.3287,
+      "step": 4170
+    },
+    {
+      "epoch": 3.6065573770491803,
+      "grad_norm": 2.473515510559082,
+      "learning_rate": 0.0002,
+      "loss": 1.3742,
+      "step": 4180
+    },
+    {
+      "epoch": 3.61518550474547,
+      "grad_norm": 3.1744322776794434,
+      "learning_rate": 0.0002,
+      "loss": 1.3688,
+      "step": 4190
+    },
+    {
+      "epoch": 3.62381363244176,
+      "grad_norm": 2.62101674079895,
+      "learning_rate": 0.0002,
+      "loss": 1.3488,
+      "step": 4200
+    },
+    {
+      "epoch": 3.6324417601380503,
+      "grad_norm": 3.0989694595336914,
+      "learning_rate": 0.0002,
+      "loss": 1.3455,
+      "step": 4210
+    },
+    {
+      "epoch": 3.64106988783434,
+      "grad_norm": 2.9838531017303467,
+      "learning_rate": 0.0002,
+      "loss": 1.3398,
+      "step": 4220
+    },
+    {
+      "epoch": 3.64969801553063,
+      "grad_norm": 2.748436689376831,
+      "learning_rate": 0.0002,
+      "loss": 1.3907,
+      "step": 4230
+    },
+    {
+      "epoch": 3.65832614322692,
+      "grad_norm": 2.4843106269836426,
+      "learning_rate": 0.0002,
+      "loss": 1.4439,
+      "step": 4240
+    },
+    {
+      "epoch": 3.6669542709232097,
+      "grad_norm": 2.7401318550109863,
+      "learning_rate": 0.0002,
+      "loss": 1.3545,
+      "step": 4250
+    },
+    {
+      "epoch": 3.6755823986194995,
+      "grad_norm": 2.7355165481567383,
+      "learning_rate": 0.0002,
+      "loss": 1.3654,
+      "step": 4260
+    },
+    {
+      "epoch": 3.6842105263157894,
+      "grad_norm": 2.281362533569336,
+      "learning_rate": 0.0002,
+      "loss": 1.3341,
+      "step": 4270
+    },
+    {
+      "epoch": 3.6928386540120792,
+      "grad_norm": 2.718242645263672,
+      "learning_rate": 0.0002,
+      "loss": 1.345,
+      "step": 4280
+    },
+    {
+      "epoch": 3.701466781708369,
+      "grad_norm": 2.622870445251465,
+      "learning_rate": 0.0002,
+      "loss": 1.3801,
+      "step": 4290
+    },
+    {
+      "epoch": 3.710094909404659,
+      "grad_norm": 3.1335484981536865,
+      "learning_rate": 0.0002,
+      "loss": 1.4118,
+      "step": 4300
+    },
+    {
+      "epoch": 3.718723037100949,
+      "grad_norm": 2.8306872844696045,
+      "learning_rate": 0.0002,
+      "loss": 1.396,
+      "step": 4310
+    },
+    {
+      "epoch": 3.727351164797239,
+      "grad_norm": 2.459052562713623,
+      "learning_rate": 0.0002,
+      "loss": 1.3227,
+      "step": 4320
+    },
+    {
+      "epoch": 3.735979292493529,
+      "grad_norm": 2.5947954654693604,
+      "learning_rate": 0.0002,
+      "loss": 1.3956,
+      "step": 4330
+    },
+    {
+      "epoch": 3.7446074201898187,
+      "grad_norm": 2.5057010650634766,
+      "learning_rate": 0.0002,
+      "loss": 1.3452,
+      "step": 4340
+    },
+    {
+      "epoch": 3.7532355478861086,
+      "grad_norm": 2.914073944091797,
+      "learning_rate": 0.0002,
+      "loss": 1.3885,
+      "step": 4350
+    },
+    {
+      "epoch": 3.761863675582399,
+      "grad_norm": 2.6400020122528076,
+      "learning_rate": 0.0002,
+      "loss": 1.3908,
+      "step": 4360
+    },
+    {
+      "epoch": 3.7704918032786887,
+      "grad_norm": 2.4498777389526367,
+      "learning_rate": 0.0002,
+      "loss": 1.425,
+      "step": 4370
+    },
+    {
+      "epoch": 3.7791199309749786,
+      "grad_norm": 2.395721673965454,
+      "learning_rate": 0.0002,
+      "loss": 1.3774,
+      "step": 4380
+    },
+    {
+      "epoch": 3.7877480586712684,
+      "grad_norm": 2.450078248977661,
+      "learning_rate": 0.0002,
+      "loss": 1.4062,
+      "step": 4390
+    },
+    {
+      "epoch": 3.7963761863675582,
+      "grad_norm": 2.56607985496521,
+      "learning_rate": 0.0002,
+      "loss": 1.4131,
+      "step": 4400
+    },
+    {
+      "epoch": 3.805004314063848,
+      "grad_norm": 2.7677228450775146,
+      "learning_rate": 0.0002,
+      "loss": 1.398,
+      "step": 4410
+    },
+    {
+      "epoch": 3.813632441760138,
+      "grad_norm": 2.3147966861724854,
+      "learning_rate": 0.0002,
+      "loss": 1.3271,
+      "step": 4420
+    },
+    {
+      "epoch": 3.822260569456428,
+      "grad_norm": 2.526195764541626,
+      "learning_rate": 0.0002,
+      "loss": 1.4567,
+      "step": 4430
+    },
+    {
+      "epoch": 3.8308886971527176,
+      "grad_norm": 2.689009666442871,
+      "learning_rate": 0.0002,
+      "loss": 1.3538,
+      "step": 4440
+    },
+    {
+      "epoch": 3.839516824849008,
+      "grad_norm": 2.7414004802703857,
+      "learning_rate": 0.0002,
+      "loss": 1.3873,
+      "step": 4450
+    },
+    {
+      "epoch": 3.8481449525452978,
+      "grad_norm": 2.402777910232544,
+      "learning_rate": 0.0002,
+      "loss": 1.3735,
+      "step": 4460
+    },
+    {
+      "epoch": 3.8567730802415876,
+      "grad_norm": 2.724787950515747,
+      "learning_rate": 0.0002,
+      "loss": 1.424,
+      "step": 4470
+    },
+    {
+      "epoch": 3.8654012079378774,
+      "grad_norm": 2.671051025390625,
+      "learning_rate": 0.0002,
+      "loss": 1.365,
+      "step": 4480
+    },
+    {
+      "epoch": 3.8740293356341673,
+      "grad_norm": 2.3963396549224854,
+      "learning_rate": 0.0002,
+      "loss": 1.411,
+      "step": 4490
+    },
+    {
+      "epoch": 3.882657463330457,
+      "grad_norm": 2.740722894668579,
+      "learning_rate": 0.0002,
+      "loss": 1.4021,
+      "step": 4500
+    },
+    {
+      "epoch": 3.8912855910267474,
+      "grad_norm": 2.6354315280914307,
+      "learning_rate": 0.0002,
+      "loss": 1.3137,
+      "step": 4510
+    },
+    {
+      "epoch": 3.8999137187230373,
+      "grad_norm": 3.3707101345062256,
+      "learning_rate": 0.0002,
+      "loss": 1.3835,
+      "step": 4520
+    },
+    {
+      "epoch": 3.908541846419327,
+      "grad_norm": 2.7361974716186523,
+      "learning_rate": 0.0002,
+      "loss": 1.4361,
+      "step": 4530
+    },
+    {
+      "epoch": 3.917169974115617,
+      "grad_norm": 3.0370259284973145,
+      "learning_rate": 0.0002,
+      "loss": 1.371,
+      "step": 4540
+    },
+    {
+      "epoch": 3.925798101811907,
+      "grad_norm": 2.3987460136413574,
+      "learning_rate": 0.0002,
+      "loss": 1.3733,
+      "step": 4550
+    },
+    {
+      "epoch": 3.9344262295081966,
+      "grad_norm": 2.7995121479034424,
+      "learning_rate": 0.0002,
+      "loss": 1.4056,
+      "step": 4560
+    },
+    {
+      "epoch": 3.9430543572044865,
+      "grad_norm": 2.5444767475128174,
+      "learning_rate": 0.0002,
+      "loss": 1.3746,
+      "step": 4570
+    },
+    {
+      "epoch": 3.9516824849007763,
+      "grad_norm": 2.4560024738311768,
+      "learning_rate": 0.0002,
+      "loss": 1.4665,
+      "step": 4580
+    },
+    {
+      "epoch": 3.960310612597066,
+      "grad_norm": 2.787332057952881,
+      "learning_rate": 0.0002,
+      "loss": 1.4489,
+      "step": 4590
+    },
+    {
+      "epoch": 3.9689387402933565,
+      "grad_norm": 2.4420697689056396,
+      "learning_rate": 0.0002,
+      "loss": 1.3774,
+      "step": 4600
+    },
+    {
+      "epoch": 3.9775668679896463,
+      "grad_norm": 2.5259456634521484,
+      "learning_rate": 0.0002,
+      "loss": 1.3966,
+      "step": 4610
+    },
+    {
+      "epoch": 3.986194995685936,
+      "grad_norm": 2.5357017517089844,
+      "learning_rate": 0.0002,
+      "loss": 1.4216,
+      "step": 4620
+    },
+    {
+      "epoch": 3.994823123382226,
+      "grad_norm": 51.29335403442383,
+      "learning_rate": 0.0002,
+      "loss": 1.416,
+      "step": 4630
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9125109910964966,
+      "eval_runtime": 151.0067,
+      "eval_samples_per_second": 3.53,
+      "eval_steps_per_second": 0.444,
+      "step": 4636
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9272,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.034032330145792e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}