Volko76 committed on Apr 11

Commit

4202234

•

1 Parent(s): 24b0b45

Upload folder using huggingface_hub

Browse files

Files changed (41) hide show

README.md +140 -1
adapter_config.json +34 -0
adapter_model.bin +3 -0
added_tokens.json +5 -0
checkpoint-13105/README.md +202 -0
checkpoint-13105/adapter_config.json +34 -0
checkpoint-13105/adapter_model.safetensors +3 -0
checkpoint-13105/added_tokens.json +5 -0
checkpoint-13105/merges.txt +0 -0
checkpoint-13105/optimizer.pt +3 -0
checkpoint-13105/rng_state.pth +3 -0
checkpoint-13105/scheduler.pt +3 -0
checkpoint-13105/special_tokens_map.json +20 -0
checkpoint-13105/tokenizer.json +0 -0
checkpoint-13105/tokenizer_config.json +43 -0
checkpoint-13105/trainer_state.json +0 -0
checkpoint-13105/training_args.bin +3 -0
checkpoint-13105/vocab.json +0 -0
checkpoint-475/README.md +202 -0
checkpoint-475/adapter_config.json +34 -0
checkpoint-475/adapter_model.safetensors +3 -0
checkpoint-475/added_tokens.json +5 -0
checkpoint-475/merges.txt +0 -0
checkpoint-475/optimizer.pt +3 -0
checkpoint-475/rng_state.pth +3 -0
checkpoint-475/scheduler.pt +3 -0
checkpoint-475/special_tokens_map.json +20 -0
checkpoint-475/tokenizer.json +0 -0
checkpoint-475/tokenizer_config.json +43 -0
checkpoint-475/trainer_state.json +3378 -0
checkpoint-475/training_args.bin +3 -0
checkpoint-475/vocab.json +0 -0
config.json +42 -0
merges.txt +0 -0
runs/Apr11_16-32-20_volko-MS-7D09/events.out.tfevents.1712845940.volko-MS-7D09.38265.0 +3 -0
runs/Apr11_16-53-26_volko-MS-7D09/events.out.tfevents.1712847206.volko-MS-7D09.40309.0 +3 -0
runs/Apr11_17-04-13_volko-MS-7D09/events.out.tfevents.1712847853.volko-MS-7D09.41247.0 +3 -0
special_tokens_map.json +20 -0
tokenizer.json +0 -0
tokenizer_config.json +43 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,142 @@
 ---
-license: apache-2.0
 ---

 ---
+license: other
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: Qwen/Qwen1.5-0.5B
+model-index:
+- name: lora-out
+  results: []
 ---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: Qwen/Qwen1.5-0.5B
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+datasets:
+  - path: jpacifico/French-Alpaca-dataset-Instruct-55K
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./lora-out
+sequence_len: 2048  # supports up to 8192
+sample_packing: false
+pad_to_sequence_len:
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+gradient_checkpointing: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention:
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+```
+</details><br>
+# lora-out
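+This model is a fine-tuned version of [Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B) on the [jpacifico/French-Alpaca-dataset-Instruct-55K](https://huggingface.co/datasets/jpacifico/French-Alpaca-dataset-Instruct-55K) dataset.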
+It achieves the following results on the evaluation set:
+- Loss: nan
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 4
+- optimizer: 8-bit AdamW (`adamw_bnb_8bit`) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.4213        | 0.0   | 1    | nan             |
+| 1.0472        | 0.25  | 3277 | nan             |
+| 1.4289        | 0.5   | 6554 | nan             |
+| 1.6165        | 0.75  | 9831 | nan             |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.0.dev0
+- Pytorch 2.2.2
+- Datasets 2.18.0
+- Tokenizers 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen1.5-0.5B",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:807088491fc73daa7477d675f55c710b05b829d1ce2927d1f0b1e102a0264c23
+size 60676170

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-13105/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: Qwen/Qwen1.5-0.5B
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-13105/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen1.5-0.5B",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-13105/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e14394f034d36a1e0688b1db8e5e1eda1f4423fe8ed4a81e678e11adf138382
+size 60599872

checkpoint-13105/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-13105/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-13105/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b73d2dc75c8b2bf0784a6c801bd52b2a187b7d13405a5b08b529cb25fd184a6e
+size 30723092

checkpoint-13105/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3366175c7f29ead4afc790d73217f0ac8231cc0f95b982c050b5022478b06fe4
+size 14244

checkpoint-13105/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e74e7ce9fb51bebfdcf8be91b516547c1f38df077a815b3fe8b31c66169dda21
+size 1064

checkpoint-13105/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-13105/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-13105/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-13105/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-13105/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3f3fad28a028c0f2db58dd0e98057fd1f6d6f14fd12149fd7997f77062ceb2a
+size 5752

checkpoint-13105/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-475/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: Qwen/Qwen1.5-0.5B
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-475/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen1.5-0.5B",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-475/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54b3e967b729f742e1ab902d55c9f10641ebe9f2695aa58d13efe40c3b54f6bc
+size 60599872

checkpoint-475/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-475/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-475/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5c70a89c6440b79ff52bcc3586a2b8b901f33f0b3a9694856d1c4b6e2fa125d
+size 30723092

checkpoint-475/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abeaf8862616c21f7b62fcd82983987d9d6a5df087a0f3263981a478e87dab0e
+size 14244

checkpoint-475/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d3ab8b8dc5babc32b4adc3c596b50dd0fcac27b238d3838d86c3c68054c541d
+size 1064

checkpoint-475/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-475/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-475/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-475/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3378 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 119,
+  "global_step": 475,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.7395525574684143,
+      "learning_rate": 2e-05,
+      "loss": 1.1916,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "eval_loss": 1.3024712800979614,
+      "eval_runtime": 11.3342,
+      "eval_samples_per_second": 8.823,
+      "eval_steps_per_second": 8.823,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.7760837078094482,
+      "learning_rate": 4e-05,
+      "loss": 1.5154,
+      "step": 2
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.630995512008667,
+      "learning_rate": 6e-05,
+      "loss": 2.1425,
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6175426244735718,
+      "learning_rate": 8e-05,
+      "loss": 0.7877,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5972404479980469,
+      "learning_rate": 0.0001,
+      "loss": 1.3798,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5723439455032349,
+      "learning_rate": 0.00012,
+      "loss": 1.0747,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.4761886596679688,
+      "learning_rate": 0.00014,
+      "loss": 1.1005,
+      "step": 7
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.2958413362503052,
+      "learning_rate": 0.00016,
+      "loss": 1.1242,
+      "step": 8
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.9850685000419617,
+      "learning_rate": 0.00018,
+      "loss": 1.1449,
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.666906714439392,
+      "learning_rate": 0.0002,
+      "loss": 1.4931,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5160439014434814,
+      "learning_rate": 0.00019999771775537991,
+      "loss": 1.1436,
+      "step": 11
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.0970433950424194,
+      "learning_rate": 0.00019999087112569246,
+      "loss": 1.6171,
+      "step": 12
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7956830263137817,
+      "learning_rate": 0.00019997946042345127,
+      "loss": 1.3005,
+      "step": 13
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.2406549453735352,
+      "learning_rate": 0.00019996348616949672,
+      "loss": 1.6621,
+      "step": 14
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.932831346988678,
+      "learning_rate": 0.0001999429490929718,
+      "loss": 1.784,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.9084440469741821,
+      "learning_rate": 0.00019991785013128923,
+      "loss": 1.5638,
+      "step": 16
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.491622805595398,
+      "learning_rate": 0.0001998881904300884,
+      "loss": 1.5091,
+      "step": 17
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6921408176422119,
+      "learning_rate": 0.00019985397134318319,
+      "loss": 0.9959,
+      "step": 18
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.7786202430725098,
+      "learning_rate": 0.0001998151944325001,
+      "loss": 1.0953,
+      "step": 19
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5404906272888184,
+      "learning_rate": 0.00019977186146800707,
+      "loss": 2.0195,
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.5230717658996582,
+      "learning_rate": 0.00019972397442763262,
+      "loss": 1.0865,
+      "step": 21
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 6.9937639236450195,
+      "learning_rate": 0.00019967153549717553,
+      "loss": 1.6098,
+      "step": 22
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6447493433952332,
+      "learning_rate": 0.00019961454707020514,
+      "loss": 1.63,
+      "step": 23
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5249179601669312,
+      "learning_rate": 0.00019955301174795208,
+      "loss": 0.9971,
+      "step": 24
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.7006235122680664,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 1.151,
+      "step": 25
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.094476580619812,
+      "learning_rate": 0.00019941631186010494,
+      "loss": 1.0216,
+      "step": 26
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7347458004951477,
+      "learning_rate": 0.0001993411535341625,
+      "loss": 0.8214,
+      "step": 27
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.36545494198799133,
+      "learning_rate": 0.00019926146079195594,
+      "loss": 1.3825,
+      "step": 28
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.0544871091842651,
+      "learning_rate": 0.0001991772372710519,
+      "loss": 1.244,
+      "step": 29
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.498039484024048,
+      "learning_rate": 0.00019908848681582391,
+      "loss": 1.8747,
+      "step": 30
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.9571327567100525,
+      "learning_rate": 0.0001989952134772769,
+      "loss": 1.3877,
+      "step": 31
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.0039713382720947,
+      "learning_rate": 0.00019889742151286247,
+      "loss": 2.0081,
+      "step": 32
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.1989943981170654,
+      "learning_rate": 0.00019879511538628428,
+      "loss": 1.4427,
+      "step": 33
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8076533675193787,
+      "learning_rate": 0.00019868829976729443,
+      "loss": 1.3122,
+      "step": 34
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 3.5690324306488037,
+      "learning_rate": 0.00019857697953148037,
+      "loss": 1.5759,
+      "step": 35
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7935991883277893,
+      "learning_rate": 0.00019846115976004234,
+      "loss": 1.2685,
+      "step": 36
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0418881177902222,
+      "learning_rate": 0.00019834084573956128,
+      "loss": 1.8058,
+      "step": 37
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.13619065284729,
+      "learning_rate": 0.00019821604296175774,
+      "loss": 1.55,
+      "step": 38
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.460271954536438,
+      "learning_rate": 0.00019808675712324107,
+      "loss": 1.3906,
+      "step": 39
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.8569309711456299,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 1.3688,
+      "step": 40
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.2582310438156128,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 1.093,
+      "step": 41
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.4148155450820923,
+      "learning_rate": 0.00019767206127731281,
+      "loss": 1.3204,
+      "step": 42
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.2747466564178467,
+      "learning_rate": 0.00019752490425051743,
+      "loss": 1.0998,
+      "step": 43
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.7309204339981079,
+      "learning_rate": 0.000197373295709961,
+      "loss": 1.6907,
+      "step": 44
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5022898316383362,
+      "learning_rate": 0.00019721724257579907,
+      "loss": 1.4717,
+      "step": 45
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.9706376194953918,
+      "learning_rate": 0.00019705675197106016,
+      "loss": 1.8908,
+      "step": 46
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6191427111625671,
+      "learning_rate": 0.00019689183122132068,
+      "loss": 1.3458,
+      "step": 47
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.8788885474205017,
+      "learning_rate": 0.0001967224878543705,
+      "loss": 1.772,
+      "step": 48
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.3387707471847534,
+      "learning_rate": 0.00019654872959986937,
+      "loss": 1.4979,
+      "step": 49
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.0316563844680786,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 1.2347,
+      "step": 50
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.2246055603027344,
+      "learning_rate": 0.00019618800035407658,
+      "loss": 1.7885,
+      "step": 51
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.443323016166687,
+      "learning_rate": 0.0001960010458282326,
+      "loss": 1.2274,
+      "step": 52
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8868201375007629,
+      "learning_rate": 0.0001958097093449813,
+      "loss": 0.9314,
+      "step": 53
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 2.7264301776885986,
+      "learning_rate": 0.00019561399963785586,
+      "loss": 1.1364,
+      "step": 54
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.450035810470581,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 1.6993,
+      "step": 55
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.0921446084976196,
+      "learning_rate": 0.00019520949648378443,
+      "loss": 1.4098,
+      "step": 56
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.580201268196106,
+      "learning_rate": 0.00019500072150034137,
+      "loss": 1.0974,
+      "step": 57
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.6637622117996216,
+      "learning_rate": 0.00019478761021918728,
+      "loss": 1.4646,
+      "step": 58
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7193565964698792,
+      "learning_rate": 0.00019457017236776373,
+      "loss": 1.2315,
+      "step": 59
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.8072863817214966,
+      "learning_rate": 0.00019434841787099803,
+      "loss": 1.1918,
+      "step": 60
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.3189361095428467,
+      "learning_rate": 0.00019412235685085035,
+      "loss": 1.5442,
+      "step": 61
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.0731955766677856,
+      "learning_rate": 0.00019389199962585157,
+      "loss": 0.9577,
+      "step": 62
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.7714097499847412,
+      "learning_rate": 0.0001936573567106325,
+      "loss": 1.6435,
+      "step": 63
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.1686166524887085,
+      "learning_rate": 0.00019341843881544372,
+      "loss": 1.7296,
+      "step": 64
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8492275476455688,
+      "learning_rate": 0.00019317525684566685,
+      "loss": 1.4221,
+      "step": 65
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8079515099525452,
+      "learning_rate": 0.00019292782190131677,
+      "loss": 1.234,
+      "step": 66
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6675179600715637,
+      "learning_rate": 0.00019267614527653488,
+      "loss": 1.2457,
+      "step": 67
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5377606153488159,
+      "learning_rate": 0.0001924202384590736,
+      "loss": 1.4115,
+      "step": 68
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.6757211089134216,
+      "learning_rate": 0.0001921601131297721,
+      "loss": 1.0735,
+      "step": 69
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.585841178894043,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 1.3859,
+      "step": 70
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.6633312106132507,
+      "learning_rate": 0.00019162725462123072,
+      "loss": 1.1772,
+      "step": 71
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7894064784049988,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9443,
+      "step": 72
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7125688195228577,
+      "learning_rate": 0.00019107766703887764,
+      "loss": 1.2188,
+      "step": 73
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4715336561203003,
+      "learning_rate": 0.00019079663108318302,
+      "loss": 1.3095,
+      "step": 74
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.017150402069092,
+      "learning_rate": 0.00019051145072503215,
+      "loss": 1.704,
+      "step": 75
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5130548477172852,
+      "learning_rate": 0.00019022213898145176,
+      "loss": 1.2175,
+      "step": 76
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.931109607219696,
+      "learning_rate": 0.00018992870905804534,
+      "loss": 0.9057,
+      "step": 77
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.687540292739868,
+      "learning_rate": 0.0001896311743483901,
+      "loss": 1.0161,
+      "step": 78
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5885124802589417,
+      "learning_rate": 0.00018932954843342591,
+      "loss": 1.2787,
+      "step": 79
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.636814534664154,
+      "learning_rate": 0.00018902384508083517,
+      "loss": 1.0253,
+      "step": 80
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.0739303827285767,
+      "learning_rate": 0.0001887140782444145,
+      "loss": 1.1437,
+      "step": 81
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.6316006183624268,
+      "learning_rate": 0.00018840026206343784,
+      "loss": 0.4953,
+      "step": 82
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.1502597332000732,
+      "learning_rate": 0.00018808241086201103,
+      "loss": 1.043,
+      "step": 83
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.3769752979278564,
+      "learning_rate": 0.0001877605391484179,
+      "loss": 1.2975,
+      "step": 84
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.9198704957962036,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 1.3415,
+      "step": 85
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.8441985845565796,
+      "learning_rate": 0.00018710479313477696,
+      "loss": 0.9262,
+      "step": 86
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.6904205679893494,
+      "learning_rate": 0.00018677094876618538,
+      "loss": 1.0266,
+      "step": 87
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.9781098365783691,
+      "learning_rate": 0.00018643314374697378,
+      "loss": 1.8946,
+      "step": 88
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5499415397644043,
+      "learning_rate": 0.00018609139349621588,
+      "loss": 0.8428,
+      "step": 89
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.4500677585601807,
+      "learning_rate": 0.0001857457136130651,
+      "loss": 1.3847,
+      "step": 90
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.2159172296524048,
+      "learning_rate": 0.00018539611987604258,
+      "loss": 1.0733,
+      "step": 91
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.3588328957557678,
+      "learning_rate": 0.00018504262824231674,
+      "loss": 1.3488,
+      "step": 92
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6259031891822815,
+      "learning_rate": 0.00018468525484697525,
+      "loss": 1.9598,
+      "step": 93
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.32454252243042,
+      "learning_rate": 0.00018432401600228823,
+      "loss": 0.7533,
+      "step": 94
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4529890716075897,
+      "learning_rate": 0.00018395892819696389,
+      "loss": 1.816,
+      "step": 95
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5825135707855225,
+      "learning_rate": 0.00018359000809539585,
+      "loss": 1.104,
+      "step": 96
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.4329938888549805,
+      "learning_rate": 0.0001832172725369024,
+      "loss": 1.37,
+      "step": 97
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.0493029356002808,
+      "learning_rate": 0.00018284073853495807,
+      "loss": 1.3342,
+      "step": 98
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.8605116605758667,
+      "learning_rate": 0.00018246042327641678,
+      "loss": 1.26,
+      "step": 99
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.775687575340271,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 0.8628,
+      "step": 100
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 4.881119251251221,
+      "learning_rate": 0.0001816885185991424,
+      "loss": 1.6938,
+      "step": 101
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.4081553220748901,
+      "learning_rate": 0.00018129696441391522,
+      "loss": 1.7014,
+      "step": 102
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.636123538017273,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 1.4061,
+      "step": 103
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.2846980392932892,
+      "learning_rate": 0.00018050274171170836,
+      "loss": 0.5465,
+      "step": 104
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0350561141967773,
+      "learning_rate": 0.00018010010944693848,
+      "loss": 1.7158,
+      "step": 105
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.09168541431427,
+      "learning_rate": 0.0001796938210212915,
+      "loss": 1.0047,
+      "step": 106
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.8954039812088013,
+      "learning_rate": 0.00017928389497975895,
+      "loss": 1.7889,
+      "step": 107
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.9749327301979065,
+      "learning_rate": 0.00017887035003337083,
+      "loss": 1.9958,
+      "step": 108
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.0174291133880615,
+      "learning_rate": 0.00017845320505834175,
+      "loss": 1.5635,
+      "step": 109
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.013778805732727,
+      "learning_rate": 0.0001780324790952092,
+      "loss": 1.1228,
+      "step": 110
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.6990927457809448,
+      "learning_rate": 0.0001776081913479645,
+      "loss": 1.2748,
+      "step": 111
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.4960625171661377,
+      "learning_rate": 0.0001771803611831762,
+      "loss": 1.1509,
+      "step": 112
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7278648018836975,
+      "learning_rate": 0.0001767490081291062,
+      "loss": 1.1221,
+      "step": 113
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7677181363105774,
+      "learning_rate": 0.0001763141518748182,
+      "loss": 1.7333,
+      "step": 114
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.0699597597122192,
+      "learning_rate": 0.0001758758122692791,
+      "loss": 1.3511,
+      "step": 115
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5539786219596863,
+      "learning_rate": 0.00017543400932045307,
+      "loss": 1.171,
+      "step": 116
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.4356689751148224,
+      "learning_rate": 0.0001749887631943882,
+      "loss": 1.2168,
+      "step": 117
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6579217910766602,
+      "learning_rate": 0.00017454009421429597,
+      "loss": 1.331,
+      "step": 118
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.777512311935425,
+      "learning_rate": 0.00017408802285962368,
+      "loss": 1.4826,
+      "step": 119
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.2498269081115723,
+      "eval_runtime": 11.2195,
+      "eval_samples_per_second": 8.913,
+      "eval_steps_per_second": 8.913,
+      "step": 119
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.47785863280296326,
+      "learning_rate": 0.00017363256976511972,
+      "loss": 1.4644,
+      "step": 120
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5930947661399841,
+      "learning_rate": 0.00017317375571989158,
+      "loss": 1.591,
+      "step": 121
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7645952701568604,
+      "learning_rate": 0.00017271160166645695,
+      "loss": 1.2038,
+      "step": 122
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7254743576049805,
+      "learning_rate": 0.0001722461286997879,
+      "loss": 1.1559,
+      "step": 123
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.39301592111587524,
+      "learning_rate": 0.00017177735806634789,
+      "loss": 1.1492,
+      "step": 124
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8771683573722839,
+      "learning_rate": 0.00017130531116312203,
+      "loss": 1.0438,
+      "step": 125
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.6636186242103577,
+      "learning_rate": 0.0001708300095366405,
+      "loss": 1.0158,
+      "step": 126
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.7356956601142883,
+      "learning_rate": 0.00017035147488199482,
+      "loss": 1.2417,
+      "step": 127
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.8965201377868652,
+      "learning_rate": 0.00016986972904184784,
+      "loss": 1.3653,
+      "step": 128
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.6314372420310974,
+      "learning_rate": 0.00016938479400543658,
+      "loss": 0.9501,
+      "step": 129
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.1143038272857666,
+      "learning_rate": 0.00016889669190756868,
+      "loss": 1.2585,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.9126316905021667,
+      "learning_rate": 0.00016840544502761176,
+      "loss": 0.9933,
+      "step": 131
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.7595810294151306,
+      "learning_rate": 0.0001679110757884769,
+      "loss": 1.3224,
+      "step": 132
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.7331580519676208,
+      "learning_rate": 0.00016741360675559473,
+      "loss": 1.451,
+      "step": 133
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.37914204597473145,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 1.0334,
+      "step": 134
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.7935598492622375,
+      "learning_rate": 0.00016640946027672392,
+      "loss": 1.401,
+      "step": 135
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.3880078196525574,
+      "learning_rate": 0.00016590282866489319,
+      "loss": 1.2831,
+      "step": 136
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.7817143201828003,
+      "learning_rate": 0.0001653931889255391,
+      "loss": 1.2609,
+      "step": 137
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.7870498299598694,
+      "learning_rate": 0.0001648805643211127,
+      "loss": 0.8674,
+      "step": 138
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5795213580131531,
+      "learning_rate": 0.00016436497825030884,
+      "loss": 0.9604,
+      "step": 139
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5516975522041321,
+      "learning_rate": 0.00016384645424699835,
+      "loss": 0.6344,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.6294902563095093,
+      "learning_rate": 0.00016332501597915352,
+      "loss": 1.0385,
+      "step": 141
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.3721281886100769,
+      "learning_rate": 0.00016280068724776797,
+      "loss": 1.0667,
+      "step": 142
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.632927656173706,
+      "learning_rate": 0.0001622734919857702,
+      "loss": 0.5996,
+      "step": 143
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.7089216709136963,
+      "learning_rate": 0.0001617434542569313,
+      "loss": 1.0407,
+      "step": 144
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.2346296310424805,
+      "learning_rate": 0.0001612105982547663,
+      "loss": 1.4111,
+      "step": 145
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5850281119346619,
+      "learning_rate": 0.00016067494830143014,
+      "loss": 1.1949,
+      "step": 146
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6999644041061401,
+      "learning_rate": 0.00016013652884660723,
+      "loss": 1.2583,
+      "step": 147
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6640311479568481,
+      "learning_rate": 0.0001595953644663957,
+      "loss": 0.8627,
+      "step": 148
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.467826008796692,
+      "learning_rate": 0.00015905147986218547,
+      "loss": 1.4436,
+      "step": 149
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5436001420021057,
+      "learning_rate": 0.00015850489985953076,
+      "loss": 1.1029,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.3373098373413086,
+      "learning_rate": 0.000157955649407017,
+      "loss": 1.0907,
+      "step": 151
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8601608276367188,
+      "learning_rate": 0.00015740375357512195,
+      "loss": 1.285,
+      "step": 152
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.090238332748413,
+      "learning_rate": 0.0001568492375550715,
+      "loss": 1.1262,
+      "step": 153
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.0742828845977783,
+      "learning_rate": 0.00015629212665768978,
+      "loss": 0.9301,
+      "step": 154
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6687757968902588,
+      "learning_rate": 0.00015573244631224365,
+      "loss": 1.3763,
+      "step": 155
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.322456032037735,
+      "learning_rate": 0.00015517022206528233,
+      "loss": 1.157,
+      "step": 156
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.552617073059082,
+      "learning_rate": 0.00015460547957947104,
+      "loss": 1.5864,
+      "step": 157
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.0862557888031006,
+      "learning_rate": 0.0001540382446324198,
+      "loss": 1.2378,
+      "step": 158
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6755203008651733,
+      "learning_rate": 0.00015346854311550673,
+      "loss": 1.1782,
+      "step": 159
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5506089329719543,
+      "learning_rate": 0.00015289640103269625,
+      "loss": 1.5313,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5224264860153198,
+      "learning_rate": 0.0001523218444993522,
+      "loss": 1.1505,
+      "step": 161
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.8278419971466064,
+      "learning_rate": 0.00015174489974104574,
+      "loss": 1.4319,
+      "step": 162
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.9323415160179138,
+      "learning_rate": 0.00015116559309235825,
+      "loss": 1.3218,
+      "step": 163
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.1334632635116577,
+      "learning_rate": 0.00015058395099567935,
+      "loss": 1.0519,
+      "step": 164
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.3949350118637085,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.0169,
+      "step": 165
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.9846246242523193,
+      "learning_rate": 0.0001494137667597006,
+      "loss": 1.8383,
+      "step": 166
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7132427096366882,
+      "learning_rate": 0.0001488252780333342,
+      "loss": 1.1292,
+      "step": 167
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.259857177734375,
+      "learning_rate": 0.00014823456068240558,
+      "loss": 0.929,
+      "step": 168
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.3703701496124268,
+      "learning_rate": 0.00014764164167014451,
+      "loss": 1.5655,
+      "step": 169
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5980277061462402,
+      "learning_rate": 0.0001470465480602756,
+      "loss": 1.1459,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.2204481363296509,
+      "learning_rate": 0.00014644930701578253,
+      "loss": 0.8177,
+      "step": 171
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.325509637594223,
+      "learning_rate": 0.00014584994579766865,
+      "loss": 1.2372,
+      "step": 172
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.206936001777649,
+      "learning_rate": 0.0001452484917637122,
+      "loss": 1.3148,
+      "step": 173
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.0160785913467407,
+      "learning_rate": 0.00014464497236721778,
+      "loss": 1.1832,
+      "step": 174
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.3579516410827637,
+      "learning_rate": 0.00014403941515576344,
+      "loss": 1.3798,
+      "step": 175
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.47968143224716187,
+      "learning_rate": 0.00014343184776994289,
+      "loss": 1.0797,
+      "step": 176
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.9022389650344849,
+      "learning_rate": 0.00014282229794210404,
+      "loss": 1.3824,
+      "step": 177
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.9592376947402954,
+      "learning_rate": 0.0001422107934950832,
+      "loss": 0.7374,
+      "step": 178
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8611739873886108,
+      "learning_rate": 0.0001415973623409351,
+      "loss": 1.4377,
+      "step": 179
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.4279676079750061,
+      "learning_rate": 0.00014098203247965875,
+      "loss": 1.7355,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.669457733631134,
+      "learning_rate": 0.00014036483199791948,
+      "loss": 1.3662,
+      "step": 181
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8061684370040894,
+      "learning_rate": 0.00013974578906776684,
+      "loss": 1.2989,
+      "step": 182
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.6208318471908569,
+      "learning_rate": 0.00013912493194534874,
+      "loss": 1.4503,
+      "step": 183
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9276289343833923,
+      "learning_rate": 0.0001385022889696218,
+      "loss": 1.7339,
+      "step": 184
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5193164944648743,
+      "learning_rate": 0.0001378778885610576,
+      "loss": 1.0053,
+      "step": 185
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.7712852954864502,
+      "learning_rate": 0.00013725175922034565,
+      "loss": 0.7669,
+      "step": 186
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.6915028691291809,
+      "learning_rate": 0.00013662392952709228,
+      "loss": 1.2908,
+      "step": 187
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.8135898113250732,
+      "learning_rate": 0.00013599442813851632,
+      "loss": 1.2639,
+      "step": 188
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.9134801626205444,
+      "learning_rate": 0.00013536328378814093,
+      "loss": 1.3689,
+      "step": 189
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3660939037799835,
+      "learning_rate": 0.00013473052528448201,
+      "loss": 1.196,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.6361420154571533,
+      "learning_rate": 0.00013409618150973348,
+      "loss": 2.6822,
+      "step": 191
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4731757938861847,
+      "learning_rate": 0.0001334602814184486,
+      "loss": 1.1966,
+      "step": 192
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.3311164081096649,
+      "learning_rate": 0.00013282285403621864,
+      "loss": 1.4858,
+      "step": 193
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.9196639060974121,
+      "learning_rate": 0.00013218392845834787,
+      "loss": 1.2163,
+      "step": 194
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.2580674886703491,
+      "learning_rate": 0.00013154353384852558,
+      "loss": 1.467,
+      "step": 195
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.3060836791992188,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 2.2125,
+      "step": 196
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.5120143294334412,
+      "learning_rate": 0.00013025845452171807,
+      "loss": 1.3174,
+      "step": 197
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.9528945684432983,
+      "learning_rate": 0.00012961382846204055,
+      "loss": 1.7378,
+      "step": 198
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.5604123473167419,
+      "learning_rate": 0.00012896785068234926,
+      "loss": 1.287,
+      "step": 199
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.5821733474731445,
+      "learning_rate": 0.00012832055066823038,
+      "loss": 1.4721,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.4492989778518677,
+      "learning_rate": 0.0001276719579656236,
+      "loss": 1.2461,
+      "step": 201
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.423685908317566,
+      "learning_rate": 0.00012702210217947288,
+      "loss": 0.9973,
+      "step": 202
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.5945116877555847,
+      "learning_rate": 0.0001263710129723757,
+      "loss": 0.6836,
+      "step": 203
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.8192650079727173,
+      "learning_rate": 0.00012571872006322888,
+      "loss": 1.236,
+      "step": 204
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.35752618312835693,
+      "learning_rate": 0.00012506525322587207,
+      "loss": 1.2113,
+      "step": 205
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.5102562308311462,
+      "learning_rate": 0.00012441064228772874,
+      "loss": 0.835,
+      "step": 206
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6736109852790833,
+      "learning_rate": 0.0001237549171284447,
+      "loss": 1.389,
+      "step": 207
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7494972348213196,
+      "learning_rate": 0.00012309810767852433,
+      "loss": 1.0185,
+      "step": 208
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.34725117683410645,
+      "learning_rate": 0.0001224402439179643,
+      "loss": 0.9023,
+      "step": 209
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5315357446670532,
+      "learning_rate": 0.00012178135587488515,
+      "loss": 0.9621,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4834609031677246,
+      "learning_rate": 0.00012112147362416076,
+      "loss": 0.9703,
+      "step": 211
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5364122986793518,
+      "learning_rate": 0.0001204606272860454,
+      "loss": 1.4784,
+      "step": 212
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.008988380432129,
+      "learning_rate": 0.00011979884702479909,
+      "loss": 1.6889,
+      "step": 213
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.8513673543930054,
+      "learning_rate": 0.00011913616304731063,
+      "loss": 1.4391,
+      "step": 214
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.0368411540985107,
+      "learning_rate": 0.00011847260560171896,
+      "loss": 1.4572,
+      "step": 215
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.43699911236763,
+      "learning_rate": 0.00011780820497603215,
+      "loss": 0.9995,
+      "step": 216
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.1479334831237793,
+      "learning_rate": 0.00011714299149674537,
+      "loss": 0.9971,
+      "step": 217
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6493399739265442,
+      "learning_rate": 0.00011647699552745628,
+      "loss": 1.1328,
+      "step": 218
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.8177739977836609,
+      "learning_rate": 0.00011581024746747924,
+      "loss": 1.2741,
+      "step": 219
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.31355175375938416,
+      "learning_rate": 0.00011514277775045768,
+      "loss": 1.2813,
+      "step": 220
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.5200531482696533,
+      "learning_rate": 0.00011447461684297504,
+      "loss": 1.4285,
+      "step": 221
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.6473718881607056,
+      "learning_rate": 0.00011380579524316406,
+      "loss": 1.5263,
+      "step": 222
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.004498243331909,
+      "learning_rate": 0.00011313634347931466,
+      "loss": 0.9576,
+      "step": 223
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.9827050566673279,
+      "learning_rate": 0.00011246629210848061,
+      "loss": 1.3642,
+      "step": 224
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.6069352626800537,
+      "learning_rate": 0.00011179567171508463,
+      "loss": 1.3768,
+      "step": 225
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4674800634384155,
+      "learning_rate": 0.00011112451290952237,
+      "loss": 0.9445,
+      "step": 226
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.1005616188049316,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 1.5748,
+      "step": 227
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.578959584236145,
+      "learning_rate": 0.00010978070262496247,
+      "loss": 1.3462,
+      "step": 228
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9835721254348755,
+      "learning_rate": 0.00010910811248404065,
+      "loss": 2.2544,
+      "step": 229
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.37735217809677124,
+      "learning_rate": 0.00010843510660430447,
+      "loss": 1.585,
+      "step": 230
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.3374781608581543,
+      "learning_rate": 0.00010776171570503499,
+      "loss": 0.7627,
+      "step": 231
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.3360700607299805,
+      "learning_rate": 0.0001070879705230873,
+      "loss": 1.8169,
+      "step": 232
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.6257541179656982,
+      "learning_rate": 0.00010641390181148772,
+      "loss": 1.0015,
+      "step": 233
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.234805703163147,
+      "learning_rate": 0.00010573954033803007,
+      "loss": 1.3024,
+      "step": 234
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 2.442201852798462,
+      "learning_rate": 0.00010506491688387127,
+      "loss": 1.3141,
+      "step": 235
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.39670243859291077,
+      "learning_rate": 0.00010439006224212628,
+      "loss": 0.9339,
+      "step": 236
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.37090030312538147,
+      "learning_rate": 0.00010371500721646261,
+      "loss": 1.3281,
+      "step": 237
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.6054628491401672,
+      "learning_rate": 0.0001030397826196943,
+      "loss": 1.2919,
+      "step": 238
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.2451566457748413,
+      "eval_runtime": 11.2081,
+      "eval_samples_per_second": 8.922,
+      "eval_steps_per_second": 8.922,
+      "step": 238
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8905054926872253,
+      "learning_rate": 0.00010236441927237535,
+      "loss": 1.5113,
+      "step": 239
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.8593491911888123,
+      "learning_rate": 0.0001016889480013931,
+      "loss": 0.9025,
+      "step": 240
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.9700817465782166,
+      "learning_rate": 0.00010101339963856111,
+      "loss": 1.2504,
+      "step": 241
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.2512820959091187,
+      "learning_rate": 0.00010033780501921164,
+      "loss": 1.769,
+      "step": 242
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 3.91457462310791,
+      "learning_rate": 9.966219498078839e-05,
+      "loss": 2.3025,
+      "step": 243
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.128582239151001,
+      "learning_rate": 9.898660036143893e-05,
+      "loss": 2.0598,
+      "step": 244
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3125651478767395,
+      "learning_rate": 9.83110519986069e-05,
+      "loss": 1.0118,
+      "step": 245
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.2437289953231812,
+      "learning_rate": 9.763558072762468e-05,
+      "loss": 1.2138,
+      "step": 246
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.653993546962738,
+      "learning_rate": 9.696021738030575e-05,
+      "loss": 0.983,
+      "step": 247
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.0555764436721802,
+      "learning_rate": 9.62849927835374e-05,
+      "loss": 1.4967,
+      "step": 248
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6005884408950806,
+      "learning_rate": 9.560993775787373e-05,
+      "loss": 0.9319,
+      "step": 249
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.776595413684845,
+      "learning_rate": 9.493508311612874e-05,
+      "loss": 1.2568,
+      "step": 250
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.8884586691856384,
+      "learning_rate": 9.426045966196993e-05,
+      "loss": 1.4376,
+      "step": 251
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.7831926345825195,
+      "learning_rate": 9.358609818851229e-05,
+      "loss": 1.2657,
+      "step": 252
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.2989636659622192,
+      "learning_rate": 9.291202947691271e-05,
+      "loss": 1.9303,
+      "step": 253
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.6510024070739746,
+      "learning_rate": 9.223828429496499e-05,
+      "loss": 1.361,
+      "step": 254
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6142978668212891,
+      "learning_rate": 9.156489339569554e-05,
+      "loss": 1.2702,
+      "step": 255
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.1854060888290405,
+      "learning_rate": 9.089188751595936e-05,
+      "loss": 0.6902,
+      "step": 256
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6005362868309021,
+      "learning_rate": 9.021929737503757e-05,
+      "loss": 1.0575,
+      "step": 257
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7906481027603149,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.9277,
+      "step": 258
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.450592041015625,
+      "learning_rate": 8.887548709047764e-05,
+      "loss": 1.6923,
+      "step": 259
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.780250072479248,
+      "learning_rate": 8.820432828491542e-05,
+      "loss": 1.0708,
+      "step": 260
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.6275424957275391,
+      "learning_rate": 8.753370789151941e-05,
+      "loss": 1.2547,
+      "step": 261
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.6233257055282593,
+      "learning_rate": 8.686365652068535e-05,
+      "loss": 1.2188,
+      "step": 262
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.9274204969406128,
+      "learning_rate": 8.619420475683597e-05,
+      "loss": 1.2182,
+      "step": 263
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7996556162834167,
+      "learning_rate": 8.552538315702498e-05,
+      "loss": 0.953,
+      "step": 264
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8006198406219482,
+      "learning_rate": 8.485722224954237e-05,
+      "loss": 0.8871,
+      "step": 265
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3429837226867676,
+      "learning_rate": 8.418975253252078e-05,
+      "loss": 1.3951,
+      "step": 266
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6119269728660583,
+      "learning_rate": 8.352300447254372e-05,
+      "loss": 1.4484,
+      "step": 267
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8381280303001404,
+      "learning_rate": 8.285700850325467e-05,
+      "loss": 1.4779,
+      "step": 268
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.9022213816642761,
+      "learning_rate": 8.219179502396787e-05,
+      "loss": 1.6646,
+      "step": 269
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.6201476454734802,
+      "learning_rate": 8.15273943982811e-05,
+      "loss": 1.6252,
+      "step": 270
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.8065648674964905,
+      "learning_rate": 8.086383695268938e-05,
+      "loss": 1.0879,
+      "step": 271
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.40957608819007874,
+      "learning_rate": 8.020115297520093e-05,
+      "loss": 1.4822,
+      "step": 272
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 2.3640942573547363,
+      "learning_rate": 7.953937271395464e-05,
+      "loss": 1.4912,
+      "step": 273
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.3901112079620361,
+      "learning_rate": 7.887852637583926e-05,
+      "loss": 1.2811,
+      "step": 274
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.41163328289985657,
+      "learning_rate": 7.821864412511485e-05,
+      "loss": 1.4811,
+      "step": 275
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8646697402000427,
+      "learning_rate": 7.755975608203572e-05,
+      "loss": 1.1372,
+      "step": 276
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.7892725467681885,
+      "learning_rate": 7.690189232147566e-05,
+      "loss": 1.239,
+      "step": 277
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.0216518640518188,
+      "learning_rate": 7.624508287155533e-05,
+      "loss": 1.9391,
+      "step": 278
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.6296453475952148,
+      "learning_rate": 7.558935771227129e-05,
+      "loss": 1.283,
+      "step": 279
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.7754496335983276,
+      "learning_rate": 7.493474677412794e-05,
+      "loss": 1.5106,
+      "step": 280
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.9793290495872498,
+      "learning_rate": 7.428127993677115e-05,
+      "loss": 1.6032,
+      "step": 281
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.9508350491523743,
+      "learning_rate": 7.362898702762433e-05,
+      "loss": 1.4869,
+      "step": 282
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.551931619644165,
+      "learning_rate": 7.297789782052717e-05,
+      "loss": 1.268,
+      "step": 283
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.45385247468948364,
+      "learning_rate": 7.232804203437644e-05,
+      "loss": 1.0128,
+      "step": 284
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.7140306830406189,
+      "learning_rate": 7.16794493317696e-05,
+      "loss": 1.24,
+      "step": 285
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 5.549449920654297,
+      "learning_rate": 7.10321493176508e-05,
+      "loss": 1.6162,
+      "step": 286
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.3684600591659546,
+      "learning_rate": 7.038617153795948e-05,
+      "loss": 1.8522,
+      "step": 287
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.850640058517456,
+      "learning_rate": 6.974154547828191e-05,
+      "loss": 1.9203,
+      "step": 288
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7197821140289307,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 1.2131,
+      "step": 289
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7707614898681641,
+      "learning_rate": 6.845646615147445e-05,
+      "loss": 1.7302,
+      "step": 290
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7183483242988586,
+      "learning_rate": 6.781607154165218e-05,
+      "loss": 0.676,
+      "step": 291
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.40576171875,
+      "learning_rate": 6.717714596378137e-05,
+      "loss": 1.399,
+      "step": 292
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.5833460688591003,
+      "learning_rate": 6.653971858155141e-05,
+      "loss": 1.3112,
+      "step": 293
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.4274028539657593,
+      "learning_rate": 6.590381849026655e-05,
+      "loss": 1.7495,
+      "step": 294
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.43348875641822815,
+      "learning_rate": 6.526947471551798e-05,
+      "loss": 1.5504,
+      "step": 295
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.9081869721412659,
+      "learning_rate": 6.463671621185908e-05,
+      "loss": 1.0873,
+      "step": 296
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.0856722593307495,
+      "learning_rate": 6.40055718614837e-05,
+      "loss": 1.2858,
+      "step": 297
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.3667042851448059,
+      "learning_rate": 6.337607047290774e-05,
+      "loss": 1.1236,
+      "step": 298
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.053734540939331,
+      "learning_rate": 6.274824077965438e-05,
+      "loss": 1.0311,
+      "step": 299
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.639960765838623,
+      "learning_rate": 6.21221114389424e-05,
+      "loss": 1.2877,
+      "step": 300
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.3543897569179535,
+      "learning_rate": 6.149771103037821e-05,
+      "loss": 1.2895,
+      "step": 301
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5615779161453247,
+      "learning_rate": 6.0875068054651266e-05,
+      "loss": 1.3834,
+      "step": 302
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6555135846138,
+      "learning_rate": 6.0254210932233176e-05,
+      "loss": 1.1616,
+      "step": 303
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.647588312625885,
+      "learning_rate": 5.9635168002080564e-05,
+      "loss": 1.3614,
+      "step": 304
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6642404794692993,
+      "learning_rate": 5.901796752034128e-05,
+      "loss": 1.321,
+      "step": 305
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.41330766677856445,
+      "learning_rate": 5.8402637659064895e-05,
+      "loss": 1.1208,
+      "step": 306
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.7018295526504517,
+      "learning_rate": 5.7789206504916816e-05,
+      "loss": 1.408,
+      "step": 307
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.5185045599937439,
+      "learning_rate": 5.717770205789601e-05,
+      "loss": 1.2841,
+      "step": 308
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.5689573287963867,
+      "learning_rate": 5.656815223005714e-05,
+      "loss": 0.8656,
+      "step": 309
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.1489999294281006,
+      "learning_rate": 5.596058484423656e-05,
+      "loss": 1.0203,
+      "step": 310
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6517840027809143,
+      "learning_rate": 5.535502763278222e-05,
+      "loss": 1.2159,
+      "step": 311
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.47839996218681335,
+      "learning_rate": 5.4751508236287865e-05,
+      "loss": 0.9904,
+      "step": 312
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5880618691444397,
+      "learning_rate": 5.415005420233141e-05,
+      "loss": 0.6293,
+      "step": 313
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8745140433311462,
+      "learning_rate": 5.355069298421747e-05,
+      "loss": 0.9696,
+      "step": 314
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5589584112167358,
+      "learning_rate": 5.2953451939724454e-05,
+      "loss": 0.7707,
+      "step": 315
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.5342937111854553,
+      "learning_rate": 5.2358358329855516e-05,
+      "loss": 1.0788,
+      "step": 316
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.537293016910553,
+      "learning_rate": 5.1765439317594466e-05,
+      "loss": 1.4954,
+      "step": 317
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.48457396030426025,
+      "learning_rate": 5.1174721966665774e-05,
+      "loss": 1.0569,
+      "step": 318
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.46171680092811584,
+      "learning_rate": 5.058623324029944e-05,
+      "loss": 1.6199,
+      "step": 319
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.9995810985565186,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.5058,
+      "step": 320
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8011060357093811,
+      "learning_rate": 4.941604900432065e-05,
+      "loss": 1.0958,
+      "step": 321
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5803133845329285,
+      "learning_rate": 4.8834406907641784e-05,
+      "loss": 0.8619,
+      "step": 322
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.2584335803985596,
+      "learning_rate": 4.825510025895429e-05,
+      "loss": 1.4993,
+      "step": 323
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.43015971779823303,
+      "learning_rate": 4.767815550064778e-05,
+      "loss": 0.8414,
+      "step": 324
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.1480287313461304,
+      "learning_rate": 4.710359896730379e-05,
+      "loss": 1.4084,
+      "step": 325
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.6011125445365906,
+      "learning_rate": 4.65314568844933e-05,
+      "loss": 0.9027,
+      "step": 326
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.9287271499633789,
+      "learning_rate": 4.596175536758024e-05,
+      "loss": 1.3626,
+      "step": 327
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.5835272669792175,
+      "learning_rate": 4.539452042052901e-05,
+      "loss": 0.9265,
+      "step": 328
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.9826067686080933,
+      "learning_rate": 4.482977793471769e-05,
+      "loss": 0.9542,
+      "step": 329
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.4515445828437805,
+      "learning_rate": 4.426755368775637e-05,
+      "loss": 1.1758,
+      "step": 330
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.791499674320221,
+      "learning_rate": 4.3707873342310254e-05,
+      "loss": 0.6814,
+      "step": 331
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8545525074005127,
+      "learning_rate": 4.3150762444928473e-05,
+      "loss": 1.4359,
+      "step": 332
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8281897306442261,
+      "learning_rate": 4.259624642487805e-05,
+      "loss": 1.5126,
+      "step": 333
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.3756364583969116,
+      "learning_rate": 4.204435059298303e-05,
+      "loss": 0.8866,
+      "step": 334
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.1280224323272705,
+      "learning_rate": 4.149510014046922e-05,
+      "loss": 0.9567,
+      "step": 335
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.3786430060863495,
+      "learning_rate": 4.094852013781456e-05,
+      "loss": 1.2275,
+      "step": 336
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.3916789293289185,
+      "learning_rate": 4.040463553360431e-05,
+      "loss": 3.2689,
+      "step": 337
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.828298032283783,
+      "learning_rate": 3.9863471153392804e-05,
+      "loss": 1.0928,
+      "step": 338
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.3147966861724854,
+      "learning_rate": 3.9325051698569925e-05,
+      "loss": 1.7501,
+      "step": 339
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8213374018669128,
+      "learning_rate": 3.878940174523371e-05,
+      "loss": 1.5596,
+      "step": 340
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.037734270095825,
+      "learning_rate": 3.8256545743068725e-05,
+      "loss": 1.3332,
+      "step": 341
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9790574908256531,
+      "learning_rate": 3.772650801422982e-05,
+      "loss": 1.0873,
+      "step": 342
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4515441954135895,
+      "learning_rate": 3.719931275223205e-05,
+      "loss": 1.3623,
+      "step": 343
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9941519498825073,
+      "learning_rate": 3.6674984020846504e-05,
+      "loss": 1.0522,
+      "step": 344
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.93306565284729,
+      "learning_rate": 3.615354575300166e-05,
+      "loss": 1.5876,
+      "step": 345
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.297698497772217,
+      "learning_rate": 3.5635021749691166e-05,
+      "loss": 0.8665,
+      "step": 346
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.9025039076805115,
+      "learning_rate": 3.511943567888732e-05,
+      "loss": 0.4777,
+      "step": 347
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.38800546526908875,
+      "learning_rate": 3.460681107446091e-05,
+      "loss": 0.8782,
+      "step": 348
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.42990243434906,
+      "learning_rate": 3.4097171335106824e-05,
+      "loss": 1.1503,
+      "step": 349
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.4309168756008148,
+      "learning_rate": 3.3590539723276083e-05,
+      "loss": 1.4906,
+      "step": 350
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.5898098945617676,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 1.2899,
+      "step": 351
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8075172305107117,
+      "learning_rate": 3.258639324440527e-05,
+      "loss": 0.7296,
+      "step": 352
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.7422662973403931,
+      "learning_rate": 3.2088924211523144e-05,
+      "loss": 1.5174,
+      "step": 353
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.38478884100914,
+      "learning_rate": 3.1594554972388265e-05,
+      "loss": 1.3737,
+      "step": 354
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.49486249685287476,
+      "learning_rate": 3.110330809243134e-05,
+      "loss": 1.3679,
+      "step": 355
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.4216090738773346,
+      "learning_rate": 3.061520599456341e-05,
+      "loss": 1.3637,
+      "step": 356
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.46424904465675354,
+      "learning_rate": 3.0130270958152197e-05,
+      "loss": 1.3429,
+      "step": 357
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.2427868843078613,
+      "eval_runtime": 10.4613,
+      "eval_samples_per_second": 9.559,
+      "eval_steps_per_second": 9.559,
+      "step": 357
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.1968390941619873,
+      "learning_rate": 2.964852511800519e-05,
+      "loss": 1.3649,
+      "step": 358
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.45987701416015625,
+      "learning_rate": 2.9169990463359555e-05,
+      "loss": 1.4336,
+      "step": 359
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5114177465438843,
+      "learning_rate": 2.869468883687798e-05,
+      "loss": 1.4723,
+      "step": 360
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3620365560054779,
+      "learning_rate": 2.8222641933652117e-05,
+      "loss": 1.6468,
+      "step": 361
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9834029078483582,
+      "learning_rate": 2.7753871300212142e-05,
+      "loss": 0.963,
+      "step": 362
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6030866503715515,
+      "learning_rate": 2.7288398333543064e-05,
+      "loss": 1.1532,
+      "step": 363
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.7282407879829407,
+      "learning_rate": 2.6826244280108437e-05,
+      "loss": 1.2677,
+      "step": 364
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.549340009689331,
+      "learning_rate": 2.6367430234880284e-05,
+      "loss": 1.1274,
+      "step": 365
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6679304838180542,
+      "learning_rate": 2.591197714037631e-05,
+      "loss": 1.5468,
+      "step": 366
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.3590414822101593,
+      "learning_rate": 2.5459905785704042e-05,
+      "loss": 1.7081,
+      "step": 367
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.8412752747535706,
+      "learning_rate": 2.5011236805611814e-05,
+      "loss": 1.3058,
+      "step": 368
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.4232923984527588,
+      "learning_rate": 2.4565990679546914e-05,
+      "loss": 1.3649,
+      "step": 369
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.7082968950271606,
+      "learning_rate": 2.4124187730720917e-05,
+      "loss": 1.3421,
+      "step": 370
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.8737981915473938,
+      "learning_rate": 2.368584812518184e-05,
+      "loss": 0.8252,
+      "step": 371
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.6763678193092346,
+      "learning_rate": 2.3250991870893835e-05,
+      "loss": 1.8269,
+      "step": 372
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.49625009298324585,
+      "learning_rate": 2.2819638816823797e-05,
+      "loss": 1.7375,
+      "step": 373
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.235646367073059,
+      "learning_rate": 2.2391808652035517e-05,
+      "loss": 1.0455,
+      "step": 374
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.6838667988777161,
+      "learning_rate": 2.1967520904790827e-05,
+      "loss": 1.2117,
+      "step": 375
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.5402644872665405,
+      "learning_rate": 2.154679494165829e-05,
+      "loss": 1.4113,
+      "step": 376
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.9066751599311829,
+      "learning_rate": 2.1129649966629184e-05,
+      "loss": 1.1857,
+      "step": 377
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.1335337162017822,
+      "learning_rate": 2.0716105020241072e-05,
+      "loss": 1.4199,
+      "step": 378
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7712799310684204,
+      "learning_rate": 2.0306178978708514e-05,
+      "loss": 1.7568,
+      "step": 379
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3476005494594574,
+      "learning_rate": 1.9899890553061562e-05,
+      "loss": 1.4962,
+      "step": 380
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7450062036514282,
+      "learning_rate": 1.9497258288291654e-05,
+      "loss": 1.5029,
+      "step": 381
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9961157441139221,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 1.2156,
+      "step": 382
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5865272283554077,
+      "learning_rate": 1.8703035586084816e-05,
+      "loss": 0.8954,
+      "step": 383
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.7002312541007996,
+      "learning_rate": 1.831148140085762e-05,
+      "loss": 1.3208,
+      "step": 384
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5696095824241638,
+      "learning_rate": 1.7923655879272393e-05,
+      "loss": 1.6606,
+      "step": 385
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.6654199361801147,
+      "learning_rate": 1.753957672358324e-05,
+      "loss": 1.0694,
+      "step": 386
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.4102238714694977,
+      "learning_rate": 1.7159261465041952e-05,
+      "loss": 1.2681,
+      "step": 387
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.5439934134483337,
+      "learning_rate": 1.6782727463097624e-05,
+      "loss": 1.018,
+      "step": 388
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.4591739177703857,
+      "learning_rate": 1.6409991904604173e-05,
+      "loss": 0.8686,
+      "step": 389
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.4289948046207428,
+      "learning_rate": 1.60410718030361e-05,
+      "loss": 1.1516,
+      "step": 390
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.46624648571014404,
+      "learning_rate": 1.5675983997711795e-05,
+      "loss": 1.3106,
+      "step": 391
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.7769433259963989,
+      "learning_rate": 1.5314745153024766e-05,
+      "loss": 1.205,
+      "step": 392
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.6348716020584106,
+      "learning_rate": 1.495737175768326e-05,
+      "loss": 1.0937,
+      "step": 393
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.4135714769363403,
+      "learning_rate": 1.4603880123957447e-05,
+      "loss": 1.0782,
+      "step": 394
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.7596187591552734,
+      "learning_rate": 1.425428638693489e-05,
+      "loss": 1.6273,
+      "step": 395
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.391519546508789,
+      "learning_rate": 1.3908606503784139e-05,
+      "loss": 1.4292,
+      "step": 396
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8692115545272827,
+      "learning_rate": 1.356685625302625e-05,
+      "loss": 0.5871,
+      "step": 397
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.47545358538627625,
+      "learning_rate": 1.3229051233814637e-05,
+      "loss": 1.0054,
+      "step": 398
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.2672554552555084,
+      "learning_rate": 1.2895206865223064e-05,
+      "loss": 0.6172,
+      "step": 399
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6463977694511414,
+      "learning_rate": 1.2565338385541792e-05,
+      "loss": 2.0793,
+      "step": 400
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5812274813652039,
+      "learning_rate": 1.2239460851582118e-05,
+      "loss": 0.8392,
+      "step": 401
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5981879234313965,
+      "learning_rate": 1.1917589137989005e-05,
+      "loss": 1.459,
+      "step": 402
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.9397989511489868,
+      "learning_rate": 1.1599737936562149e-05,
+      "loss": 1.3638,
+      "step": 403
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.3462386429309845,
+      "learning_rate": 1.1285921755585504e-05,
+      "loss": 1.1605,
+      "step": 404
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.203017234802246,
+      "learning_rate": 1.097615491916485e-05,
+      "loss": 1.5022,
+      "step": 405
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7160519957542419,
+      "learning_rate": 1.0670451566574102e-05,
+      "loss": 1.0726,
+      "step": 406
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.9885064959526062,
+      "learning_rate": 1.0368825651609893e-05,
+      "loss": 1.0344,
+      "step": 407
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.1007866859436035,
+      "learning_rate": 1.007129094195468e-05,
+      "loss": 1.4191,
+      "step": 408
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.0664376020431519,
+      "learning_rate": 9.777861018548251e-06,
+      "loss": 1.8957,
+      "step": 409
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.2938302755355835,
+      "learning_rate": 9.488549274967872e-06,
+      "loss": 1.181,
+      "step": 410
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.7518212199211121,
+      "learning_rate": 9.203368916817012e-06,
+      "loss": 1.4975,
+      "step": 411
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.3200393915176392,
+      "learning_rate": 8.92233296112236e-06,
+      "loss": 1.0157,
+      "step": 412
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.677116334438324,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 1.1984,
+      "step": 413
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9666435122489929,
+      "learning_rate": 8.372745378769309e-06,
+      "loss": 1.1112,
+      "step": 414
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8352184295654297,
+      "learning_rate": 8.10421883797694e-06,
+      "loss": 1.0512,
+      "step": 415
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.5026190280914307,
+      "learning_rate": 7.839886870227909e-06,
+      "loss": 1.1279,
+      "step": 416
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.36853912472724915,
+      "learning_rate": 7.5797615409264335e-06,
+      "loss": 1.1051,
+      "step": 417
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5076872110366821,
+      "learning_rate": 7.32385472346514e-06,
+      "loss": 1.597,
+      "step": 418
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.9718438982963562,
+      "learning_rate": 7.072178098683246e-06,
+      "loss": 1.1807,
+      "step": 419
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5310528874397278,
+      "learning_rate": 6.824743154333157e-06,
+      "loss": 1.2764,
+      "step": 420
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.9790662527084351,
+      "learning_rate": 6.581561184556295e-06,
+      "loss": 1.5014,
+      "step": 421
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 2.499530553817749,
+      "learning_rate": 6.342643289367522e-06,
+      "loss": 1.3249,
+      "step": 422
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.6649277210235596,
+      "learning_rate": 6.108000374148448e-06,
+      "loss": 1.432,
+      "step": 423
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.8835659623146057,
+      "learning_rate": 5.87764314914967e-06,
+      "loss": 1.0439,
+      "step": 424
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.7555325627326965,
+      "learning_rate": 5.651582129001986e-06,
+      "loss": 0.9622,
+      "step": 425
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.9321297407150269,
+      "learning_rate": 5.429827632236284e-06,
+      "loss": 1.1174,
+      "step": 426
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.7446428537368774,
+      "learning_rate": 5.212389780812732e-06,
+      "loss": 1.2175,
+      "step": 427
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.4157393276691437,
+      "learning_rate": 4.999278499658666e-06,
+      "loss": 1.3221,
+      "step": 428
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.4439353048801422,
+      "learning_rate": 4.790503516215572e-06,
+      "loss": 1.3804,
+      "step": 429
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6780000329017639,
+      "learning_rate": 4.586074359995119e-06,
+      "loss": 1.5498,
+      "step": 430
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6816204190254211,
+      "learning_rate": 4.386000362144138e-06,
+      "loss": 0.8413,
+      "step": 431
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.4683542847633362,
+      "learning_rate": 4.190290655018736e-06,
+      "loss": 1.5352,
+      "step": 432
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.4905780553817749,
+      "learning_rate": 3.998954171767422e-06,
+      "loss": 1.5878,
+      "step": 433
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6626597046852112,
+      "learning_rate": 3.811999645923414e-06,
+      "loss": 1.5102,
+      "step": 434
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5728758573532104,
+      "learning_rate": 3.6294356110059157e-06,
+      "loss": 1.3155,
+      "step": 435
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.9096332788467407,
+      "learning_rate": 3.451270400130646e-06,
+      "loss": 1.5012,
+      "step": 436
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.9138293266296387,
+      "learning_rate": 3.277512145629502e-06,
+      "loss": 1.0071,
+      "step": 437
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.7803674340248108,
+      "learning_rate": 3.10816877867931e-06,
+      "loss": 1.1,
+      "step": 438
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.2517226934432983,
+      "learning_rate": 2.943248028939838e-06,
+      "loss": 1.2342,
+      "step": 439
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.6347147822380066,
+      "learning_rate": 2.7827574242009437e-06,
+      "loss": 1.1261,
+      "step": 440
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.4368477165699005,
+      "learning_rate": 2.626704290039017e-06,
+      "loss": 1.1669,
+      "step": 441
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.7963775396347046,
+      "learning_rate": 2.4750957494826033e-06,
+      "loss": 1.3863,
+      "step": 442
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.0838688611984253,
+      "learning_rate": 2.327938722687184e-06,
+      "loss": 1.1262,
+      "step": 443
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.48085644841194153,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 1.7383,
+      "step": 444
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.5789294242858887,
+      "learning_rate": 2.0470058747505516e-06,
+      "loss": 1.2789,
+      "step": 445
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.7602370977401733,
+      "learning_rate": 1.9132428767589473e-06,
+      "loss": 2.2284,
+      "step": 446
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.6166039109230042,
+      "learning_rate": 1.7839570382422787e-06,
+      "loss": 1.1418,
+      "step": 447
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.5087316036224365,
+      "learning_rate": 1.6591542604387445e-06,
+      "loss": 0.7367,
+      "step": 448
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 4.2447404861450195,
+      "learning_rate": 1.538840239957684e-06,
+      "loss": 2.1703,
+      "step": 449
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.3775041103363037,
+      "learning_rate": 1.4230204685196203e-06,
+      "loss": 1.4201,
+      "step": 450
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.3511093854904175,
+      "learning_rate": 1.3117002327055927e-06,
+      "loss": 1.5358,
+      "step": 451
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.6467747092247009,
+      "learning_rate": 1.20488461371574e-06,
+      "loss": 1.0678,
+      "step": 452
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7409128546714783,
+      "learning_rate": 1.102578487137529e-06,
+      "loss": 1.5819,
+      "step": 453
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.4174960851669312,
+      "learning_rate": 1.004786522723089e-06,
+      "loss": 1.3788,
+      "step": 454
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.54808509349823,
+      "learning_rate": 9.11513184176116e-07,
+      "loss": 1.1524,
+      "step": 455
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3604092299938202,
+      "learning_rate": 8.227627289481121e-07,
+      "loss": 1.246,
+      "step": 456
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.0738741159439087,
+      "learning_rate": 7.385392080440534e-07,
+      "loss": 0.9884,
+      "step": 457
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.2032325267791748,
+      "learning_rate": 6.588464658374815e-07,
+      "loss": 2.1638,
+      "step": 458
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.5337828993797302,
+      "learning_rate": 5.836881398950667e-07,
+      "loss": 1.1283,
+      "step": 459
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.2100324630737305,
+      "learning_rate": 5.130676608104845e-07,
+      "loss": 1.5026,
+      "step": 460
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.029868483543396,
+      "learning_rate": 4.469882520479196e-07,
+      "loss": 1.5626,
+      "step": 461
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.9376425743103027,
+      "learning_rate": 3.8545292979486057e-07,
+      "loss": 1.4563,
+      "step": 462
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.3371017575263977,
+      "learning_rate": 3.2846450282447703e-07,
+      "loss": 1.0665,
+      "step": 463
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.8099949359893799,
+      "learning_rate": 2.760255723673888e-07,
+      "loss": 1.0242,
+      "step": 464
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.5782180428504944,
+      "learning_rate": 2.2813853199292746e-07,
+      "loss": 1.1465,
+      "step": 465
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.8454908132553101,
+      "learning_rate": 1.8480556749991274e-07,
+      "loss": 1.0718,
+      "step": 466
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.310767650604248,
+      "learning_rate": 1.460286568168212e-07,
+      "loss": 1.0671,
+      "step": 467
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.9640721678733826,
+      "learning_rate": 1.1180956991160286e-07,
+      "loss": 0.9314,
+      "step": 468
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.6716411113739014,
+      "learning_rate": 8.214986871076802e-08,
+      "loss": 1.202,
+      "step": 469
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.5227134823799133,
+      "learning_rate": 5.705090702819993e-08,
+      "loss": 0.6217,
+      "step": 470
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.6409493684768677,
+      "learning_rate": 3.6513830503293045e-08,
+      "loss": 1.3795,
+      "step": 471
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.9884561896324158,
+      "learning_rate": 2.0539576548717076e-08,
+      "loss": 1.0896,
+      "step": 472
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.44622230529785156,
+      "learning_rate": 9.128874307551272e-09,
+      "loss": 1.4132,
+      "step": 473
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5369505286216736,
+      "learning_rate": 2.282244620088747e-09,
+      "loss": 0.7652,
+      "step": 474
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6095150709152222,
+      "learning_rate": 0.0,
+      "loss": 1.8662,
+      "step": 475
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 475,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "total_flos": 777748162805760.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-475/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9a1fb5e26ed8821493786ab87117b1dbfd309c2834aae0cc2b1b60637743893
+size 5752

checkpoint-475/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "_name_or_path": "Qwen/Qwen1.5-0.5B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 2816,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

runs/Apr11_16-32-20_volko-MS-7D09/events.out.tfevents.1712845940.volko-MS-7D09.38265.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a85576b4937d02bc9999cc71ce248f5820388874be11c945cb9a8dc1ffddc10f
+size 107131

runs/Apr11_16-53-26_volko-MS-7D09/events.out.tfevents.1712847206.volko-MS-7D09.40309.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:580c7dbd47b1cfe881cdd965f46548671d6de873ddd41d8be7d4c4118cb844a0
+size 107130

runs/Apr11_17-04-13_volko-MS-7D09/events.out.tfevents.1712847853.volko-MS-7D09.41247.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17e8e4539db8c7bc2a450a68061192458958f1046af43c643afc7cba5e69addf
+size 2772065

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff