Upload folder using huggingface_hub
- README.md +40 -0
- adapter_config.json +33 -0
- adapter_model.safetensors +3 -0
- checkpoint-9676/README.md +202 -0
- checkpoint-9676/adapter_config.json +33 -0
- checkpoint-9676/adapter_model.safetensors +3 -0
- checkpoint-9676/optimizer.pt +3 -0
- checkpoint-9676/pytorch_model.bin +3 -0
- checkpoint-9676/rng_state.pth +3 -0
- checkpoint-9676/scheduler.pt +3 -0
- checkpoint-9676/special_tokens_map.json +24 -0
- checkpoint-9676/tokenizer.json +0 -0
- checkpoint-9676/tokenizer.model +3 -0
- checkpoint-9676/tokenizer_config.json +43 -0
- checkpoint-9676/trainer_state.json +2730 -0
- checkpoint-9676/training_args.bin +3 -0
- handler.py +32 -0
- requirements.txt +2 -0
- runs/Mar21_02-37-23_r-nicolof88-mistral7b-spider-at-188n2lyh-a2f2a-vnlru/events.out.tfevents.1710988652.r-nicolof88-mistral7b-spider-at-188n2lyh-a2f2a-vnlru.98.0 +2 -2
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +43 -0
- training_args.bin +3 -0
- training_params.json +47 -0
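
The commit title says the folder was pushed with the `huggingface_hub` client. For reference, a minimal sketch of how such a commit is typically produced with `HfApi.upload_folder` (the local path and `repo_id` below are hypothetical placeholders, not taken from this page):

```python
# Sketch: push a local training output directory to the Hub in one commit.
# folder_path and repo_id are hypothetical; substitute your own values.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./autotrain-output",      # local directory to upload
    repo_id="your-username/your-model",    # target model repository
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```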
README.md
ADDED
@@ -0,0 +1,40 @@
---
tags:
- autotrain
- text-generation
widget:
- text: "I love AutoTrain because "
license: other
---

# Model Trained Using AutoTrain

This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain).

# Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "PATH_TO_THIS_REPO"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
).eval()

# Prompt content: "hi"
messages = [
    {"role": "user", "content": "hi"},
]

input_ids = tokenizer.apply_chat_template(
    conversation=messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
# Send inputs to wherever the (possibly sharded) model lives, rather than
# hard-coding 'cuda', since the model is loaded with device_map="auto".
output_ids = model.generate(input_ids.to(model.device))
response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# Model response: "Hello! How can I assist you today?"
print(response)
```
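
Since the repository stores a LoRA adapter (`adapter_model.safetensors` plus the `adapter_config.json` shown below) rather than merged weights, the adapter can also be loaded explicitly through `peft`. A minimal sketch, assuming `peft` is installed and `PATH_TO_THIS_REPO` points at this repository:

```python
# Sketch: load the LoRA adapter on top of its base model via peft.
# AutoPeftModelForCausalLM reads adapter_config.json, downloads the base
# model (mistralai/Mistral-7B-Instruct-v0.2), and attaches the adapter.
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "PATH_TO_THIS_REPO",
    device_map="auto",
    torch_dtype="auto",
).eval()

# Optionally fold the adapter into the base weights for faster inference:
model = model.merge_and_unload()
```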
adapter_config.json
ADDED
@@ -0,0 +1,33 @@
```json
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "down_proj",
    "v_proj",
    "up_proj",
    "k_proj",
    "gate_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
```
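
This config describes a rank-64 LoRA applied to every attention and MLP projection of the base model, with adapter dropout of 0.05 and a scaling factor of `lora_alpha / r = 16 / 64 = 0.25` (plain LoRA scaling, since `use_rslora` is false). A sketch of the equivalent `peft` object, with field names mirroring the JSON above:

```python
# Sketch: the LoraConfig corresponding to the adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,                  # adapter rank
    lora_alpha=16,         # effective scaling = alpha / r = 0.25
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    task_type="CAUSAL_LM",
)
```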
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:cf35931b3c5aed981d5e1d9a71d0340a4761c5157b8fccf28d16da2b1c080439
size 671149168
```
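
Binary files appear here as Git LFS pointers: three lines giving the LFS spec version, the SHA-256 of the actual payload, and its size in bytes (the roughly 671 MB here is consistent with rank-64 adapters over all seven projections of a 7B model stored in fp32). A small sketch for checking a downloaded copy against the pointer, using only the standard library:

```python
# Sketch: verify a downloaded file against its git-lfs pointer metadata.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

expected = "cf35931b3c5aed981d5e1d9a71d0340a4761c5157b8fccf28d16da2b1c080439"
assert sha256_of("adapter_model.safetensors") == expected
```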
checkpoint-9676/README.md
ADDED
@@ -0,0 +1,202 @@
---
library_name: peft
base_model: mistralai/Mistral-7B-Instruct-v0.2
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.9.0
checkpoint-9676/adapter_config.json
ADDED
@@ -0,0 +1,33 @@
```json
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "down_proj",
    "v_proj",
    "up_proj",
    "k_proj",
    "gate_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
```
checkpoint-9676/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:cf35931b3c5aed981d5e1d9a71d0340a4761c5157b8fccf28d16da2b1c080439
size 671149168
```
checkpoint-9676/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:fc5176933f9170ac12c5c8c7757752889f45b015e76eac64a6dcc19cfbd31b82
size 1342555602
```
checkpoint-9676/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
size 888
```
checkpoint-9676/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:e3919a6c0eded6d6d7870145bb89f27445b92aae1ef7d2a38f1223b4e7820cfc
size 14244
```
checkpoint-9676/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:60ef1f8511acc4ba8b3aa4c57afaff33980f2570f13a13ce4e7d272a72264763
size 1064
```
checkpoint-9676/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
```json
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "</s>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
```
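
Note that `pad_token` is the plain string `"</s>"`, so the EOS token doubles as the padding token, a common choice for Llama-family tokenizers that ship without a dedicated pad token. A quick sketch of confirming this once the tokenizer is loaded (`PATH_TO_THIS_REPO` is the same placeholder used in the README):

```python
# Sketch: pad and EOS resolve to the same token and the same id (2, per
# the added_tokens_decoder in tokenizer_config.json below).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("PATH_TO_THIS_REPO")
assert tok.pad_token == tok.eos_token == "</s>"
assert tok.pad_token_id == tok.eos_token_id == 2
```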
checkpoint-9676/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
checkpoint-9676/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
```
checkpoint-9676/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
```json
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 2048,
  "pad_token": "</s>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
```
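
The `chat_template` above is the standard Mistral-instruct Jinja template: it prepends BOS, requires strictly alternating user/assistant turns, wraps each user message in `[INST] ... [/INST]`, and appends EOS after each assistant reply. A sketch of rendering it to inspect the exact prompt string:

```python
# Sketch: render the chat template without tokenizing.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("PATH_TO_THIS_REPO")
messages = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {"role": "user", "content": "Write a SQL query."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# -> <s>[INST] hi [/INST]Hello! How can I assist you today?</s>[INST] Write a SQL query. [/INST]
```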
checkpoint-9676/trainer_state.json
ADDED
@@ -0,0 +1,2730 @@
```json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 9676,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.01, "grad_norm": 1.180048942565918, "learning_rate": 7.747933884297521e-07, "loss": 1.2211, "step": 25},
    {"epoch": 0.02, "grad_norm": 0.8116350769996643, "learning_rate": 1.5495867768595043e-06, "loss": 1.1783, "step": 50},
    {"epoch": 0.03, "grad_norm": 0.8945431709289551, "learning_rate": 2.3243801652892563e-06, "loss": 1.0768, "step": 75},
    {"epoch": 0.04, "grad_norm": 0.4761434495449066, "learning_rate": 3.0991735537190086e-06, "loss": 0.8783, "step": 100},
    {"epoch": 0.05, "grad_norm": 0.37374910712242126, "learning_rate": 3.87396694214876e-06, "loss": 0.8531, "step": 125},
    {"epoch": 0.06, "grad_norm": 0.5040513277053833, "learning_rate": 4.648760330578513e-06, "loss": 0.7562, "step": 150},
    {"epoch": 0.07, "grad_norm": 0.3926999270915985, "learning_rate": 5.423553719008265e-06, "loss": 0.6665, "step": 175},
    {"epoch": 0.08, "grad_norm": 0.38587668538093567, "learning_rate": 6.198347107438017e-06, "loss": 0.6037, "step": 200},
    {"epoch": 0.09, "grad_norm": 0.3665236830711365, "learning_rate": 6.9731404958677686e-06, "loss": 0.5731, "step": 225},
    {"epoch": 0.1, "grad_norm": 0.43044623732566833, "learning_rate": 7.74793388429752e-06, "loss": 0.536, "step": 250},
    {"epoch": 0.11, "grad_norm": 0.3693198561668396, "learning_rate": 8.522727272727273e-06, "loss": 0.5333, "step": 275},
    {"epoch": 0.12, "grad_norm": 0.46105554699897766, "learning_rate": 9.297520661157025e-06, "loss": 0.4943, "step": 300},
    {"epoch": 0.13, "grad_norm": 0.5055803656578064, "learning_rate": 1.0072314049586778e-05, "loss": 0.4264, "step": 325},
    {"epoch": 0.14, "grad_norm": 0.5727549195289612, "learning_rate": 1.084710743801653e-05, "loss": 0.4373, "step": 350},
    {"epoch": 0.16, "grad_norm": 0.6658156514167786, "learning_rate": 1.1621900826446282e-05, "loss": 0.3876, "step": 375},
    {"epoch": 0.17, "grad_norm": 0.6771320700645447, "learning_rate": 1.2396694214876034e-05, "loss": 0.3518, "step": 400},
    {"epoch": 0.18, "grad_norm": 0.6591110229492188, "learning_rate": 1.3171487603305787e-05, "loss": 0.3193, "step": 425},
    {"epoch": 0.19, "grad_norm": 0.8635509610176086, "learning_rate": 1.3946280991735537e-05, "loss": 0.3055, "step": 450},
    {"epoch": 0.2, "grad_norm": 1.0686248540878296, "learning_rate": 1.472107438016529e-05, "loss": 0.2824, "step": 475},
    {"epoch": 0.21, "grad_norm": 0.88297039270401, "learning_rate": 1.549586776859504e-05, "loss": 0.2704, "step": 500},
    {"epoch": 0.22, "grad_norm": 1.0282652378082275, "learning_rate": 1.6270661157024794e-05, "loss": 0.2384, "step": 525},
    {"epoch": 0.23, "grad_norm": 1.009024977684021, "learning_rate": 1.7045454545454546e-05, "loss": 0.2241, "step": 550},
    {"epoch": 0.24, "grad_norm": 1.1481122970581055, "learning_rate": 1.78202479338843e-05, "loss": 0.1914, "step": 575},
    {"epoch": 0.25, "grad_norm": 1.3241642713546753, "learning_rate": 1.859504132231405e-05, "loss": 0.1672, "step": 600},
    {"epoch": 0.26, "grad_norm": 1.0569857358932495, "learning_rate": 1.9369834710743803e-05, "loss": 0.1936, "step": 625},
    {"epoch": 0.27, "grad_norm": 0.8776970505714417, "learning_rate": 2.0144628099173555e-05, "loss": 0.1528, "step": 650},
    {"epoch": 0.28, "grad_norm": 0.7247719168663025, "learning_rate": 2.0919421487603307e-05, "loss": 0.1635, "step": 675},
    {"epoch": 0.29, "grad_norm": 1.1265312433242798, "learning_rate": 2.169421487603306e-05, "loss": 0.1432, "step": 700},
    {"epoch": 0.3, "grad_norm": 0.9996057748794556, "learning_rate": 2.2469008264462812e-05, "loss": 0.1448, "step": 725},
    {"epoch": 0.31, "grad_norm": 0.7036225199699402, "learning_rate": 2.3243801652892564e-05, "loss": 0.1505, "step": 750},
    {"epoch": 0.32, "grad_norm": 0.8441881537437439, "learning_rate": 2.4018595041322316e-05, "loss": 0.1446, "step": 775},
    {"epoch": 0.33, "grad_norm": 1.1108651161193848, "learning_rate": 2.479338842975207e-05, "loss": 0.1291, "step": 800},
    {"epoch": 0.34, "grad_norm": 1.3914504051208496, "learning_rate": 2.556818181818182e-05, "loss": 0.1188, "step": 825},
    {"epoch": 0.35, "grad_norm": 1.5808395147323608, "learning_rate": 2.6342975206611573e-05, "loss": 0.1294, "step": 850},
    {"epoch": 0.36, "grad_norm": 0.8349003195762634, "learning_rate": 2.7117768595041322e-05, "loss": 0.1396, "step": 875},
    {"epoch": 0.37, "grad_norm": 0.9605033993721008, "learning_rate": 2.7892561983471074e-05, "loss": 0.1229, "step": 900},
    {"epoch": 0.38, "grad_norm": 0.8516783118247986, "learning_rate": 2.8667355371900826e-05, "loss": 0.116, "step": 925},
    {"epoch": 0.39, "grad_norm": 1.3230878114700317, "learning_rate": 2.944214876033058e-05, "loss": 0.1224, "step": 950},
    {"epoch": 0.4, "grad_norm": 0.7461815476417542, "learning_rate": 2.9975884244372988e-05, "loss": 0.119, "step": 975},
    {"epoch": 0.41, "grad_norm": 0.6247888803482056, "learning_rate": 2.98897565457051e-05, "loss": 0.0974, "step": 1000},
    {"epoch": 0.42, "grad_norm": 0.6625025868415833, "learning_rate": 2.980362884703721e-05, "loss": 0.111, "step": 1025},
    {"epoch": 0.43, "grad_norm": 0.8011340498924255, "learning_rate": 2.9717501148369314e-05, "loss": 0.1013, "step": 1050},
    {"epoch": 0.44, "grad_norm": 0.6088280081748962, "learning_rate": 2.9631373449701425e-05, "loss": 0.1135, "step": 1075},
    {"epoch": 0.45, "grad_norm": 0.32283875346183777, "learning_rate": 2.9545245751033532e-05, "loss": 0.1039, "step": 1100},
    {"epoch": 0.47, "grad_norm": 0.8527675271034241, "learning_rate": 2.945911805236564e-05, "loss": 0.1119, "step": 1125},
    {"epoch": 0.48, "grad_norm": 0.6812451481819153, "learning_rate": 2.937299035369775e-05, "loss": 0.0999, "step": 1150},
    {"epoch": 0.49, "grad_norm": 0.3629147708415985, "learning_rate": 2.928686265502986e-05, "loss": 0.0947, "step": 1175},
    {"epoch": 0.5, "grad_norm": 0.9273631572723389, "learning_rate": 2.9200734956361966e-05, "loss": 0.1101, "step": 1200},
    {"epoch": 0.51, "grad_norm": 0.4564916491508484, "learning_rate": 2.9114607257694073e-05, "loss": 0.1019, "step": 1225},
    {"epoch": 0.52, "grad_norm": 1.147258996963501, "learning_rate": 2.9028479559026184e-05, "loss": 0.0865, "step": 1250},
    {"epoch": 0.53, "grad_norm": 0.4461434781551361, "learning_rate": 2.8942351860358292e-05, "loss": 0.0925, "step": 1275},
    {"epoch": 0.54, "grad_norm": 0.5906102657318115, "learning_rate": 2.88562241616904e-05, "loss": 0.1044, "step": 1300},
    {"epoch": 0.55, "grad_norm": 0.44959595799446106, "learning_rate": 2.877009646302251e-05, "loss": 0.1008, "step": 1325},
    {"epoch": 0.56, "grad_norm": 0.918072521686554, "learning_rate": 2.8683968764354614e-05, "loss": 0.0904, "step": 1350},
    {"epoch": 0.57, "grad_norm": 0.47751984000205994, "learning_rate": 2.8597841065686725e-05, "loss": 0.0821, "step": 1375},
    {"epoch": 0.58, "grad_norm": 0.37669825553894043, "learning_rate": 2.8511713367018833e-05, "loss": 0.0889, "step": 1400},
    {"epoch": 0.59, "grad_norm": 0.5528315305709839, "learning_rate": 2.842558566835094e-05, "loss": 0.0884, "step": 1425},
    {"epoch": 0.6, "grad_norm": 0.7219104170799255, "learning_rate": 2.833945796968305e-05, "loss": 0.0909, "step": 1450},
    {"epoch": 0.61, "grad_norm": 0.5634051561355591, "learning_rate": 2.825333027101516e-05, "loss": 0.0868, "step": 1475},
    {"epoch": 0.62, "grad_norm": 0.33194178342819214, "learning_rate": 2.816720257234727e-05, "loss": 0.0901, "step": 1500},
    {"epoch": 0.63, "grad_norm": 0.6954057812690735, "learning_rate": 2.8081074873679374e-05, "loss": 0.0834, "step": 1525},
    {"epoch": 0.64, "grad_norm": 0.5542522072792053, "learning_rate": 2.7994947175011485e-05, "loss": 0.0923, "step": 1550},
    {"epoch": 0.65, "grad_norm": 0.5636582970619202, "learning_rate": 2.7908819476343596e-05, "loss": 0.0887, "step": 1575},
    {"epoch": 0.66, "grad_norm": 0.6528864502906799, "learning_rate": 2.78226917776757e-05, "loss": 0.0776, "step": 1600},
    {"epoch": 0.67, "grad_norm": 0.3220342695713043, "learning_rate": 2.773656407900781e-05, "loss": 0.0813, "step": 1625},
    {"epoch": 0.68, "grad_norm": 0.4045586884021759, "learning_rate": 2.765043638033992e-05, "loss": 0.0888, "step": 1650},
    {"epoch": 0.69, "grad_norm": 0.7669140100479126, "learning_rate": 2.7564308681672026e-05, "loss": 0.0949, "step": 1675},
    {"epoch": 0.7, "grad_norm": 0.3345740735530853, "learning_rate": 2.7478180983004137e-05, "loss": 0.0876, "step": 1700},
    {"epoch": 0.71, "grad_norm": 0.7757517695426941, "learning_rate": 2.7392053284336244e-05, "loss": 0.0799, "step": 1725},
    {"epoch": 0.72, "grad_norm": 0.5490168929100037, "learning_rate": 2.7305925585668352e-05, "loss": 0.0814, "step": 1750},
    {"epoch": 0.73, "grad_norm": 0.4781877398490906, "learning_rate": 2.721979788700046e-05, "loss": 0.0785, "step": 1775},
    {"epoch": 0.74, "grad_norm": 0.5043837428092957, "learning_rate": 2.713367018833257e-05, "loss": 0.0834, "step": 1800},
    {"epoch": 0.75, "grad_norm": 0.4992827773094177, "learning_rate": 2.7047542489664674e-05, "loss": 0.0921, "step": 1825},
    {"epoch": 0.76, "grad_norm": 0.6074797511100769, "learning_rate": 2.6961414790996785e-05, "loss": 0.0909, "step": 1850},
    {"epoch": 0.78, "grad_norm": 0.6552305817604065, "learning_rate": 2.6875287092328896e-05, "loss": 0.0784, "step": 1875},
    {"epoch": 0.79, "grad_norm": 0.5910527110099792, "learning_rate": 2.6789159393661e-05, "loss": 0.0848, "step": 1900},
    {"epoch": 0.8, "grad_norm": 0.7094011306762695, "learning_rate": 2.670303169499311e-05, "loss": 0.0857, "step": 1925},
    {"epoch": 0.81, "grad_norm": 0.6078013777732849, "learning_rate": 2.661690399632522e-05, "loss": 0.0869, "step": 1950},
    {"epoch": 0.82, "grad_norm": 0.43232443928718567, "learning_rate": 2.6530776297657326e-05, "loss": 0.0834, "step": 1975},
    {"epoch": 0.83, "grad_norm": 0.4060278534889221, "learning_rate": 2.6444648598989437e-05, "loss": 0.0813, "step": 2000},
    {"epoch": 0.84, "grad_norm": 0.6071100234985352, "learning_rate": 2.6358520900321545e-05, "loss": 0.0795, "step": 2025},
    {"epoch": 0.85, "grad_norm": 0.41472747921943665, "learning_rate": 2.6272393201653652e-05, "loss": 0.0828, "step": 2050},
    {"epoch": 0.86, "grad_norm": 0.586531400680542, "learning_rate": 2.618626550298576e-05, "loss": 0.0736, "step": 2075},
    {"epoch": 0.87, "grad_norm": 0.42143017053604126, "learning_rate": 2.610013780431787e-05, "loss": 0.078, "step": 2100},
    {"epoch": 0.88, "grad_norm": 0.35077112913131714, "learning_rate": 2.601401010564998e-05, "loss": 0.0844, "step": 2125},
    {"epoch": 0.89, "grad_norm": 0.5425326824188232, "learning_rate": 2.5927882406982086e-05, "loss": 0.0772, "step": 2150},
    {"epoch": 0.9, "grad_norm": 0.25989875197410583, "learning_rate": 2.5841754708314197e-05, "loss": 0.0749, "step": 2175},
    {"epoch": 0.91, "grad_norm": 0.3507814407348633, "learning_rate": 2.57556270096463e-05, "loss": 0.0859, "step": 2200},
    {"epoch": 0.92, "grad_norm": 0.39133837819099426, "learning_rate": 2.5669499310978412e-05, "loss": 0.0765, "step": 2225},
    {"epoch": 0.93, "grad_norm": 0.3232302963733673, "learning_rate": 2.558337161231052e-05, "loss": 0.0737, "step": 2250},
    {"epoch": 0.94, "grad_norm": 1.1417970657348633, "learning_rate": 2.5497243913642627e-05, "loss": 0.0796, "step": 2275},
    {"epoch": 0.95, "grad_norm": 0.6268133521080017, "learning_rate": 2.5411116214974738e-05, "loss": 0.0723, "step": 2300},
    {"epoch": 0.96, "grad_norm": 0.4392102360725403, "learning_rate": 2.5324988516306845e-05, "loss": 0.0742, "step": 2325},
    {"epoch": 0.97, "grad_norm": 0.3616221249103546, "learning_rate": 2.5238860817638953e-05, "loss": 0.0811, "step": 2350},
    {"epoch": 0.98, "grad_norm": 0.5831270813941956, "learning_rate": 2.515273311897106e-05, "loss": 0.0741, "step": 2375},
    {"epoch": 0.99, "grad_norm": 0.4768739640712738, "learning_rate": 2.506660542030317e-05, "loss": 0.0813, "step": 2400},
    {"epoch": 1.0, "grad_norm": 0.3268464207649231, "learning_rate": 2.498047772163528e-05, "loss": 0.061, "step": 2425},
    {"epoch": 1.01, "grad_norm": 0.5923863649368286, "learning_rate": 2.4894350022967386e-05, "loss": 0.0799, "step": 2450},
    {"epoch": 1.02, "grad_norm": 0.3295147120952606, "learning_rate": 2.4808222324299497e-05, "loss": 0.0742, "step": 2475},
    {"epoch": 1.03, "grad_norm": 0.6255597472190857, "learning_rate": 2.47220946256316e-05, "loss": 0.0685, "step": 2500},
    {"epoch": 1.04, "grad_norm": 0.28953835368156433, "learning_rate": 2.4635966926963712e-05, "loss": 0.0645, "step": 2525},
    {"epoch": 1.05, "grad_norm": 0.28053247928619385, "learning_rate": 2.4549839228295823e-05, "loss": 0.0756, "step": 2550},
    {"epoch": 1.06, "grad_norm": 0.48497211933135986, "learning_rate": 2.4463711529627927e-05, "loss": 0.0659, "step": 2575},
    {"epoch": 1.07, "grad_norm": 0.4199415445327759, "learning_rate": 2.437758383096004e-05, "loss": 0.0783, "step": 2600},
    {"epoch": 1.09, "grad_norm": 0.5894821882247925, "learning_rate": 2.4291456132292146e-05, "loss": 0.0655, "step": 2625},
    {"epoch": 1.1, "grad_norm": 0.31835877895355225, "learning_rate": 2.4205328433624253e-05, "loss": 0.0595, "step": 2650},
    {"epoch": 1.11, "grad_norm": 0.3910796642303467, "learning_rate": 2.411920073495636e-05, "loss": 0.0703, "step": 2675},
    {"epoch": 1.12, "grad_norm": 0.4052046239376068, "learning_rate": 2.4033073036288472e-05, "loss": 0.0751, "step": 2700},
    {"epoch": 1.13, "grad_norm": 0.5519680380821228, "learning_rate": 2.394694533762058e-05, "loss": 0.0643, "step": 2725},
    {"epoch": 1.14, "grad_norm": 0.2853243052959442, "learning_rate": 2.3860817638952687e-05, "loss": 0.0582, "step": 2750},
    {"epoch": 1.15, "grad_norm": 0.3067447245121002, "learning_rate": 2.3774689940284798e-05, "loss": 0.0676, "step": 2775},
    {"epoch": 1.16, "grad_norm": 0.3290148079395294, "learning_rate": 2.3688562241616902e-05, "loss": 0.0679, "step": 2800},
    {"epoch": 1.17, "grad_norm": 0.3863273561000824, "learning_rate": 2.3602434542949013e-05, "loss": 0.0606, "step": 2825},
    {"epoch": 1.18, "grad_norm": 0.4047059416770935, "learning_rate": 2.3516306844281124e-05, "loss": 0.0668, "step": 2850},
    {"epoch": 1.19, "grad_norm": 0.42530855536460876, "learning_rate": 2.3430179145613228e-05, "loss": 0.0606, "step": 2875},
    {"epoch": 1.2, "grad_norm": 0.3491179943084717, "learning_rate": 2.334405144694534e-05, "loss": 0.0762, "step": 2900},
    {"epoch": 1.21, "grad_norm": 0.15498507022857666, "learning_rate": 2.3257923748277446e-05, "loss": 0.0719, "step": 2925},
    {"epoch": 1.22, "grad_norm": 0.4127048850059509, "learning_rate": 2.3171796049609554e-05, "loss": 0.0614, "step": 2950},
    {"epoch": 1.23, "grad_norm": 0.3426351249217987, "learning_rate": 2.3085668350941665e-05, "loss": 0.0624, "step": 2975},
    {"epoch": 1.24, "grad_norm": 0.28411659598350525, "learning_rate": 2.2999540652273772e-05, "loss": 0.0662, "step": 3000},
    {"epoch": 1.25, "grad_norm": 0.16444465517997742, "learning_rate": 2.291341295360588e-05, "loss": 0.0615, "step": 3025},
    {"epoch": 1.26, "grad_norm": 0.3092861771583557, "learning_rate": 2.2827285254937987e-05, "loss": 0.0582, "step": 3050},
    {"epoch": 1.27, "grad_norm": 0.483084499835968, "learning_rate": 2.2741157556270098e-05, "loss": 0.0595, "step": 3075},
    {"epoch": 1.28, "grad_norm": 0.2572250962257385, "learning_rate": 2.2655029857602202e-05, "loss": 0.0699, "step": 3100},
    {"epoch": 1.29, "grad_norm": 0.21841402351856232, "learning_rate": 2.2568902158934313e-05, "loss": 0.0606, "step": 3125},
    {"epoch": 1.3, "grad_norm": 0.33092397451400757, "learning_rate": 2.2482774460266424e-05, "loss": 0.0649, "step": 3150},
    {"epoch": 1.31, "grad_norm": 0.35950547456741333, "learning_rate": 2.239664676159853e-05, "loss": 0.0644, "step": 3175},
    {"epoch": 1.32, "grad_norm": 0.33506807684898376, "learning_rate": 2.231051906293064e-05, "loss": 0.0662, "step": 3200},
    {"epoch": 1.33, "grad_norm": 0.5829468369483948, "learning_rate": 2.2224391364262747e-05, "loss": 0.0643, "step": 3225},
    {"epoch": 1.34, "grad_norm": 0.3790678381919861, "learning_rate": 2.2138263665594854e-05, "loss": 0.0729, "step": 3250},
    {"epoch": 1.35, "grad_norm": 0.34478724002838135, "learning_rate": 2.2052135966926965e-05, "loss": 0.0645, "step": 3275},
    {"epoch": 1.36, "grad_norm": 0.3655211925506592, "learning_rate": 2.1966008268259073e-05, "loss": 0.0756, "step": 3300},
    {"epoch": 1.37, "grad_norm": 0.3092879354953766, "learning_rate": 2.187988056959118e-05, "loss": 0.0677, "step": 3325},
    {"epoch": 1.38, "grad_norm": 0.3613986372947693, "learning_rate": 2.1793752870923288e-05, "loss": 0.0627, "step": 3350},
    {"epoch": 1.4, "grad_norm": 0.6292563080787659, "learning_rate": 2.17076251722554e-05, "loss": 0.0698, "step": 3375},
    {"epoch": 1.41, "grad_norm": 0.24357610940933228, "learning_rate": 2.1621497473587506e-05, "loss": 0.0701, "step": 3400},
    {"epoch": 1.42, "grad_norm": 0.32474613189697266, "learning_rate": 2.1535369774919614e-05, "loss": 0.0662, "step": 3425},
    {"epoch": 1.43, "grad_norm": 0.30164840817451477, "learning_rate": 2.1449242076251725e-05, "loss": 0.0613, "step": 3450},
    {"epoch": 1.44, "grad_norm": 0.25660645961761475, "learning_rate": 2.136311437758383e-05, "loss": 0.0622, "step": 3475},
    {"epoch": 1.45, "grad_norm": 0.4381033480167389, "learning_rate": 2.127698667891594e-05, "loss": 0.0676, "step": 3500},
    {"epoch": 1.46, "grad_norm": 0.36808228492736816, "learning_rate": 2.1190858980248047e-05, "loss": 0.0665, "step": 3525},
    {"epoch": 1.47, "grad_norm": 0.2712726294994354, "learning_rate": 2.1104731281580155e-05, "loss": 0.0728, "step": 3550},
    {"epoch": 1.48, "grad_norm": 0.5337083339691162, "learning_rate": 2.1018603582912266e-05, "loss": 0.0666, "step": 3575},
    {"epoch": 1.49, "grad_norm": 0.33135494589805603, "learning_rate": 2.0932475884244373e-05, "loss": 0.0721, "step": 3600},
    {"epoch": 1.5, "grad_norm": 0.3100278675556183, "learning_rate": 2.084634818557648e-05, "loss": 0.066, "step": 3625},
    {"epoch": 1.51, "grad_norm": 0.4423840045928955, "learning_rate": 2.076022048690859e-05, "loss": 0.0666, "step": 3650},
    {"epoch": 1.52, "grad_norm": 0.3579668700695038, "learning_rate": 2.06740927882407e-05, "loss": 0.0635, "step": 3675},
    {"epoch": 1.53, "grad_norm": 0.4105582535266876, "learning_rate": 2.0587965089572807e-05, "loss": 0.0683, "step": 3700},
    {"epoch": 1.54, "grad_norm": 0.5205901861190796, "learning_rate": 2.0501837390904914e-05, "loss": 0.0725, "step": 3725},
    {"epoch": 1.55, "grad_norm": 0.508314311504364, "learning_rate": 2.0415709692237025e-05, "loss": 0.0751, "step": 3750},
    {"epoch": 1.56, "grad_norm": 0.36034587025642395, "learning_rate": 2.032958199356913e-05, "loss": 0.0685, "step": 3775},
    {"epoch": 1.57, "grad_norm": 0.2791132926940918, "learning_rate": 2.024345429490124e-05, "loss": 0.0642, "step": 3800},
    {"epoch": 1.58, "grad_norm": 0.41801777482032776, "learning_rate": 2.015732659623335e-05, "loss": 0.0552, "step": 3825},
    {"epoch": 1.59, "grad_norm": 0.2085207998752594, "learning_rate": 2.0071198897565455e-05, "loss": 0.057, "step": 3850},
    {"epoch": 1.6, "grad_norm": 0.3542513847351074, "learning_rate": 1.9985071198897566e-05, "loss": 0.0635, "step": 3875},
    {"epoch": 1.61, "grad_norm": 0.21708066761493683, "learning_rate": 1.9898943500229674e-05, "loss": 0.0609, "step": 3900},
    {"epoch": 1.62, "grad_norm": 0.39065513014793396, "learning_rate": 1.9812815801561785e-05, "loss": 0.0638, "step": 3925},
    {"epoch": 1.63, "grad_norm": 0.29231196641921997, "learning_rate": 1.972668810289389e-05, "loss": 0.0665, "step": 3950},
    {"epoch": 1.64, "grad_norm": 0.24261677265167236, "learning_rate": 1.9640560404226e-05, "loss": 0.0621, "step": 3975},
    {"epoch": 1.65, "grad_norm": 0.3188820779323578, "learning_rate": 1.955443270555811e-05, "loss": 0.0579, "step": 4000},
    {"epoch": 1.66, "grad_norm": 0.44512245059013367, "learning_rate": 1.9468305006890215e-05, "loss": 0.0638, "step": 4025},
    {"epoch": 1.67, "grad_norm": 0.3457041084766388, "learning_rate": 1.9382177308222326e-05, "loss": 0.062, "step": 4050},
    {"epoch": 1.68, "grad_norm": 0.20665928721427917, "learning_rate": 1.9296049609554433e-05, "loss": 0.0614, "step": 4075},
    {"epoch": 1.69, "grad_norm": 0.3206002116203308, "learning_rate": 1.920992191088654e-05, "loss": 0.0714, "step": 4100},
    {"epoch": 1.71, "grad_norm": 0.2654290497303009, "learning_rate": 1.9123794212218652e-05, "loss": 0.0603, "step": 4125},
    {"epoch": 1.72, "grad_norm": 0.39113229513168335, "learning_rate": 1.903766651355076e-05, "loss": 0.0635, "step": 4150},
    {"epoch": 1.73, "grad_norm": 0.34004124999046326, "learning_rate": 1.8951538814882867e-05, "loss": 0.0638, "step": 4175},
    {"epoch": 1.74, "grad_norm": 0.27496448159217834,
```
+
"learning_rate": 1.8865411116214974e-05,
|
1184 |
+
"loss": 0.0622,
|
1185 |
+
"step": 4200
|
1186 |
+
},
|
1187 |
+
{
|
1188 |
+
"epoch": 1.75,
|
1189 |
+
"grad_norm": 0.3293686509132385,
|
1190 |
+
"learning_rate": 1.8779283417547085e-05,
|
1191 |
+
"loss": 0.0652,
|
1192 |
+
"step": 4225
|
1193 |
+
},
|
1194 |
+
{
|
1195 |
+
"epoch": 1.76,
|
1196 |
+
"grad_norm": 0.2733684480190277,
|
1197 |
+
"learning_rate": 1.8693155718879193e-05,
|
1198 |
+
"loss": 0.0616,
|
1199 |
+
"step": 4250
|
1200 |
+
},
|
1201 |
+
{
|
1202 |
+
"epoch": 1.77,
|
1203 |
+
"grad_norm": 0.21654804050922394,
|
1204 |
+
"learning_rate": 1.86070280202113e-05,
|
1205 |
+
"loss": 0.0618,
|
1206 |
+
"step": 4275
|
1207 |
+
},
|
1208 |
+
{
|
1209 |
+
"epoch": 1.78,
|
1210 |
+
"grad_norm": 0.24511463940143585,
|
1211 |
+
"learning_rate": 1.852090032154341e-05,
|
1212 |
+
"loss": 0.0579,
|
1213 |
+
"step": 4300
|
1214 |
+
},
|
1215 |
+
{
|
1216 |
+
"epoch": 1.79,
|
1217 |
+
"grad_norm": 0.363006591796875,
|
1218 |
+
"learning_rate": 1.8434772622875515e-05,
|
1219 |
+
"loss": 0.0673,
|
1220 |
+
"step": 4325
|
1221 |
+
},
|
1222 |
+
{
|
1223 |
+
"epoch": 1.8,
|
1224 |
+
"grad_norm": 0.2865668535232544,
|
1225 |
+
"learning_rate": 1.8348644924207626e-05,
|
1226 |
+
"loss": 0.0633,
|
1227 |
+
"step": 4350
|
1228 |
+
},
|
1229 |
+
{
|
1230 |
+
"epoch": 1.81,
|
1231 |
+
"grad_norm": 0.20682819187641144,
|
1232 |
+
"learning_rate": 1.8262517225539734e-05,
|
1233 |
+
"loss": 0.0519,
|
1234 |
+
"step": 4375
|
1235 |
+
},
|
1236 |
+
{
|
1237 |
+
"epoch": 1.82,
|
1238 |
+
"grad_norm": 0.38699063658714294,
|
1239 |
+
"learning_rate": 1.817638952687184e-05,
|
1240 |
+
"loss": 0.0604,
|
1241 |
+
"step": 4400
|
1242 |
+
},
|
1243 |
+
{
|
1244 |
+
"epoch": 1.83,
|
1245 |
+
"grad_norm": 0.35452330112457275,
|
1246 |
+
"learning_rate": 1.8090261828203952e-05,
|
1247 |
+
"loss": 0.0568,
|
1248 |
+
"step": 4425
|
1249 |
+
},
|
1250 |
+
{
|
1251 |
+
"epoch": 1.84,
|
1252 |
+
"grad_norm": 0.2268667072057724,
|
1253 |
+
"learning_rate": 1.800413412953606e-05,
|
1254 |
+
"loss": 0.0653,
|
1255 |
+
"step": 4450
|
1256 |
+
},
|
1257 |
+
{
|
1258 |
+
"epoch": 1.85,
|
1259 |
+
"grad_norm": 0.30717945098876953,
|
1260 |
+
"learning_rate": 1.7918006430868167e-05,
|
1261 |
+
"loss": 0.0664,
|
1262 |
+
"step": 4475
|
1263 |
+
},
|
1264 |
+
{
|
1265 |
+
"epoch": 1.86,
|
1266 |
+
"grad_norm": 0.46489375829696655,
|
1267 |
+
"learning_rate": 1.7831878732200275e-05,
|
1268 |
+
"loss": 0.0562,
|
1269 |
+
"step": 4500
|
1270 |
+
},
|
1271 |
+
{
|
1272 |
+
"epoch": 1.87,
|
1273 |
+
"grad_norm": 0.43247151374816895,
|
1274 |
+
"learning_rate": 1.7745751033532386e-05,
|
1275 |
+
"loss": 0.0573,
|
1276 |
+
"step": 4525
|
1277 |
+
},
|
1278 |
+
{
|
1279 |
+
"epoch": 1.88,
|
1280 |
+
"grad_norm": 0.3387090563774109,
|
1281 |
+
"learning_rate": 1.7659623334864493e-05,
|
1282 |
+
"loss": 0.0519,
|
1283 |
+
"step": 4550
|
1284 |
+
},
|
1285 |
+
{
|
1286 |
+
"epoch": 1.89,
|
1287 |
+
"grad_norm": 0.16439248621463776,
|
1288 |
+
"learning_rate": 1.75734956361966e-05,
|
1289 |
+
"loss": 0.0663,
|
1290 |
+
"step": 4575
|
1291 |
+
},
|
1292 |
+
{
|
1293 |
+
"epoch": 1.9,
|
1294 |
+
"grad_norm": 0.8034338355064392,
|
1295 |
+
"learning_rate": 1.7487367937528712e-05,
|
1296 |
+
"loss": 0.0621,
|
1297 |
+
"step": 4600
|
1298 |
+
},
|
1299 |
+
{
|
1300 |
+
"epoch": 1.91,
|
1301 |
+
"grad_norm": 0.2516898512840271,
|
1302 |
+
"learning_rate": 1.7401240238860816e-05,
|
1303 |
+
"loss": 0.0612,
|
1304 |
+
"step": 4625
|
1305 |
+
},
|
1306 |
+
{
|
1307 |
+
"epoch": 1.92,
|
1308 |
+
"grad_norm": 0.22889916598796844,
|
1309 |
+
"learning_rate": 1.7315112540192927e-05,
|
1310 |
+
"loss": 0.0542,
|
1311 |
+
"step": 4650
|
1312 |
+
},
|
1313 |
+
{
|
1314 |
+
"epoch": 1.93,
|
1315 |
+
"grad_norm": 0.1372820883989334,
|
1316 |
+
"learning_rate": 1.7228984841525038e-05,
|
1317 |
+
"loss": 0.0591,
|
1318 |
+
"step": 4675
|
1319 |
+
},
|
1320 |
+
{
|
1321 |
+
"epoch": 1.94,
|
1322 |
+
"grad_norm": 0.3031134307384491,
|
1323 |
+
"learning_rate": 1.7142857142857142e-05,
|
1324 |
+
"loss": 0.0536,
|
1325 |
+
"step": 4700
|
1326 |
+
},
|
1327 |
+
{
|
1328 |
+
"epoch": 1.95,
|
1329 |
+
"grad_norm": 0.22743794322013855,
|
1330 |
+
"learning_rate": 1.7056729444189253e-05,
|
1331 |
+
"loss": 0.0579,
|
1332 |
+
"step": 4725
|
1333 |
+
},
|
1334 |
+
{
|
1335 |
+
"epoch": 1.96,
|
1336 |
+
"grad_norm": 0.419313907623291,
|
1337 |
+
"learning_rate": 1.697060174552136e-05,
|
1338 |
+
"loss": 0.0681,
|
1339 |
+
"step": 4750
|
1340 |
+
},
|
1341 |
+
{
|
1342 |
+
"epoch": 1.97,
|
1343 |
+
"grad_norm": 0.39816129207611084,
|
1344 |
+
"learning_rate": 1.6884474046853468e-05,
|
1345 |
+
"loss": 0.0629,
|
1346 |
+
"step": 4775
|
1347 |
+
},
|
1348 |
+
{
|
1349 |
+
"epoch": 1.98,
|
1350 |
+
"grad_norm": 0.3139801621437073,
|
1351 |
+
"learning_rate": 1.6798346348185575e-05,
|
1352 |
+
"loss": 0.0547,
|
1353 |
+
"step": 4800
|
1354 |
+
},
|
1355 |
+
{
|
1356 |
+
"epoch": 1.99,
|
1357 |
+
"grad_norm": 0.36901557445526123,
|
1358 |
+
"learning_rate": 1.6712218649517686e-05,
|
1359 |
+
"loss": 0.0631,
|
1360 |
+
"step": 4825
|
1361 |
+
},
|
1362 |
+
{
|
1363 |
+
"epoch": 2.0,
|
1364 |
+
"grad_norm": 0.3577054738998413,
|
1365 |
+
"learning_rate": 1.6626090950849794e-05,
|
1366 |
+
"loss": 0.0625,
|
1367 |
+
"step": 4850
|
1368 |
+
},
|
1369 |
+
{
|
1370 |
+
"epoch": 2.02,
|
1371 |
+
"grad_norm": 0.28602612018585205,
|
1372 |
+
"learning_rate": 1.65399632521819e-05,
|
1373 |
+
"loss": 0.0544,
|
1374 |
+
"step": 4875
|
1375 |
+
},
|
1376 |
+
{
|
1377 |
+
"epoch": 2.03,
|
1378 |
+
"grad_norm": 0.3862529397010803,
|
1379 |
+
"learning_rate": 1.6453835553514012e-05,
|
1380 |
+
"loss": 0.0544,
|
1381 |
+
"step": 4900
|
1382 |
+
},
|
1383 |
+
{
|
1384 |
+
"epoch": 2.04,
|
1385 |
+
"grad_norm": 0.2228946089744568,
|
1386 |
+
"learning_rate": 1.6367707854846116e-05,
|
1387 |
+
"loss": 0.0558,
|
1388 |
+
"step": 4925
|
1389 |
+
},
|
1390 |
+
{
|
1391 |
+
"epoch": 2.05,
|
1392 |
+
"grad_norm": 0.2535618841648102,
|
1393 |
+
"learning_rate": 1.6281580156178227e-05,
|
1394 |
+
"loss": 0.0521,
|
1395 |
+
"step": 4950
|
1396 |
+
},
|
1397 |
+
{
|
1398 |
+
"epoch": 2.06,
|
1399 |
+
"grad_norm": 0.22719667851924896,
|
1400 |
+
"learning_rate": 1.6195452457510338e-05,
|
1401 |
+
"loss": 0.056,
|
1402 |
+
"step": 4975
|
1403 |
+
},
|
1404 |
+
{
|
1405 |
+
"epoch": 2.07,
|
1406 |
+
"grad_norm": 0.3262714445590973,
|
1407 |
+
"learning_rate": 1.6109324758842442e-05,
|
1408 |
+
"loss": 0.0512,
|
1409 |
+
"step": 5000
|
1410 |
+
},
|
1411 |
+
{
|
1412 |
+
"epoch": 2.08,
|
1413 |
+
"grad_norm": 0.2454172968864441,
|
1414 |
+
"learning_rate": 1.6023197060174553e-05,
|
1415 |
+
"loss": 0.0499,
|
1416 |
+
"step": 5025
|
1417 |
+
},
|
1418 |
+
{
|
1419 |
+
"epoch": 2.09,
|
1420 |
+
"grad_norm": 0.6654460430145264,
|
1421 |
+
"learning_rate": 1.593706936150666e-05,
|
1422 |
+
"loss": 0.0516,
|
1423 |
+
"step": 5050
|
1424 |
+
},
|
1425 |
+
{
|
1426 |
+
"epoch": 2.1,
|
1427 |
+
"grad_norm": 0.37515684962272644,
|
1428 |
+
"learning_rate": 1.585094166283877e-05,
|
1429 |
+
"loss": 0.0487,
|
1430 |
+
"step": 5075
|
1431 |
+
},
|
1432 |
+
{
|
1433 |
+
"epoch": 2.11,
|
1434 |
+
"grad_norm": 0.3624023199081421,
|
1435 |
+
"learning_rate": 1.576481396417088e-05,
|
1436 |
+
"loss": 0.051,
|
1437 |
+
"step": 5100
|
1438 |
+
},
|
1439 |
+
{
|
1440 |
+
"epoch": 2.12,
|
1441 |
+
"grad_norm": 0.33498457074165344,
|
1442 |
+
"learning_rate": 1.5678686265502987e-05,
|
1443 |
+
"loss": 0.0489,
|
1444 |
+
"step": 5125
|
1445 |
+
},
|
1446 |
+
{
|
1447 |
+
"epoch": 2.13,
|
1448 |
+
"grad_norm": 0.4283454418182373,
|
1449 |
+
"learning_rate": 1.5592558566835094e-05,
|
1450 |
+
"loss": 0.0497,
|
1451 |
+
"step": 5150
|
1452 |
+
},
|
1453 |
+
{
|
1454 |
+
"epoch": 2.14,
|
1455 |
+
"grad_norm": 0.4568246006965637,
|
1456 |
+
"learning_rate": 1.5506430868167202e-05,
|
1457 |
+
"loss": 0.0552,
|
1458 |
+
"step": 5175
|
1459 |
+
},
|
1460 |
+
{
|
1461 |
+
"epoch": 2.15,
|
1462 |
+
"grad_norm": 0.3240622580051422,
|
1463 |
+
"learning_rate": 1.5420303169499313e-05,
|
1464 |
+
"loss": 0.0524,
|
1465 |
+
"step": 5200
|
1466 |
+
},
|
1467 |
+
{
|
1468 |
+
"epoch": 2.16,
|
1469 |
+
"grad_norm": 0.3433873951435089,
|
1470 |
+
"learning_rate": 1.533417547083142e-05,
|
1471 |
+
"loss": 0.0562,
|
1472 |
+
"step": 5225
|
1473 |
+
},
|
1474 |
+
{
|
1475 |
+
"epoch": 2.17,
|
1476 |
+
"grad_norm": 0.28902968764305115,
|
1477 |
+
"learning_rate": 1.5248047772163528e-05,
|
1478 |
+
"loss": 0.0564,
|
1479 |
+
"step": 5250
|
1480 |
+
},
|
1481 |
+
{
|
1482 |
+
"epoch": 2.18,
|
1483 |
+
"grad_norm": 0.5900773406028748,
|
1484 |
+
"learning_rate": 1.5161920073495637e-05,
|
1485 |
+
"loss": 0.0593,
|
1486 |
+
"step": 5275
|
1487 |
+
},
|
1488 |
+
{
|
1489 |
+
"epoch": 2.19,
|
1490 |
+
"grad_norm": 0.37150174379348755,
|
1491 |
+
"learning_rate": 1.5075792374827745e-05,
|
1492 |
+
"loss": 0.05,
|
1493 |
+
"step": 5300
|
1494 |
+
},
|
1495 |
+
{
|
1496 |
+
"epoch": 2.2,
|
1497 |
+
"grad_norm": 0.2919517159461975,
|
1498 |
+
"learning_rate": 1.4989664676159854e-05,
|
1499 |
+
"loss": 0.057,
|
1500 |
+
"step": 5325
|
1501 |
+
},
|
1502 |
+
{
|
1503 |
+
"epoch": 2.21,
|
1504 |
+
"grad_norm": 0.24746154248714447,
|
1505 |
+
"learning_rate": 1.4903536977491961e-05,
|
1506 |
+
"loss": 0.0549,
|
1507 |
+
"step": 5350
|
1508 |
+
},
|
1509 |
+
{
|
1510 |
+
"epoch": 2.22,
|
1511 |
+
"grad_norm": 0.4665452539920807,
|
1512 |
+
"learning_rate": 1.481740927882407e-05,
|
1513 |
+
"loss": 0.051,
|
1514 |
+
"step": 5375
|
1515 |
+
},
|
1516 |
+
{
|
1517 |
+
"epoch": 2.23,
|
1518 |
+
"grad_norm": 0.32012248039245605,
|
1519 |
+
"learning_rate": 1.4731281580156178e-05,
|
1520 |
+
"loss": 0.0447,
|
1521 |
+
"step": 5400
|
1522 |
+
},
|
1523 |
+
{
|
1524 |
+
"epoch": 2.24,
|
1525 |
+
"grad_norm": 0.3855270445346832,
|
1526 |
+
"learning_rate": 1.4645153881488286e-05,
|
1527 |
+
"loss": 0.0492,
|
1528 |
+
"step": 5425
|
1529 |
+
},
|
1530 |
+
{
|
1531 |
+
"epoch": 2.25,
|
1532 |
+
"grad_norm": 0.238910511136055,
|
1533 |
+
"learning_rate": 1.4559026182820396e-05,
|
1534 |
+
"loss": 0.0527,
|
1535 |
+
"step": 5450
|
1536 |
+
},
|
1537 |
+
{
|
1538 |
+
"epoch": 2.26,
|
1539 |
+
"grad_norm": 0.3347514569759369,
|
1540 |
+
"learning_rate": 1.4472898484152504e-05,
|
1541 |
+
"loss": 0.0514,
|
1542 |
+
"step": 5475
|
1543 |
+
},
|
1544 |
+
{
|
1545 |
+
"epoch": 2.27,
|
1546 |
+
"grad_norm": 0.34102463722229004,
|
1547 |
+
"learning_rate": 1.4386770785484612e-05,
|
1548 |
+
"loss": 0.05,
|
1549 |
+
"step": 5500
|
1550 |
+
},
|
1551 |
+
{
|
1552 |
+
"epoch": 2.28,
|
1553 |
+
"grad_norm": 0.4247712790966034,
|
1554 |
+
"learning_rate": 1.4304088194763437e-05,
|
1555 |
+
"loss": 0.0537,
|
1556 |
+
"step": 5525
|
1557 |
+
},
|
1558 |
+
{
|
1559 |
+
"epoch": 2.29,
|
1560 |
+
"grad_norm": 0.2721666395664215,
|
1561 |
+
"learning_rate": 1.4217960496095544e-05,
|
1562 |
+
"loss": 0.0503,
|
1563 |
+
"step": 5550
|
1564 |
+
},
|
1565 |
+
{
|
1566 |
+
"epoch": 2.3,
|
1567 |
+
"grad_norm": 0.4121835231781006,
|
1568 |
+
"learning_rate": 1.4131832797427654e-05,
|
1569 |
+
"loss": 0.0471,
|
1570 |
+
"step": 5575
|
1571 |
+
},
|
1572 |
+
{
|
1573 |
+
"epoch": 2.32,
|
1574 |
+
"grad_norm": 0.34142744541168213,
|
1575 |
+
"learning_rate": 1.4045705098759761e-05,
|
1576 |
+
"loss": 0.0528,
|
1577 |
+
"step": 5600
|
1578 |
+
},
|
1579 |
+
{
|
1580 |
+
"epoch": 2.33,
|
1581 |
+
"grad_norm": 0.44415149092674255,
|
1582 |
+
"learning_rate": 1.3959577400091869e-05,
|
1583 |
+
"loss": 0.0476,
|
1584 |
+
"step": 5625
|
1585 |
+
},
|
1586 |
+
{
|
1587 |
+
"epoch": 2.34,
|
1588 |
+
"grad_norm": 0.4206011891365051,
|
1589 |
+
"learning_rate": 1.3873449701423978e-05,
|
1590 |
+
"loss": 0.0476,
|
1591 |
+
"step": 5650
|
1592 |
+
},
|
1593 |
+
{
|
1594 |
+
"epoch": 2.35,
|
1595 |
+
"grad_norm": 0.2635906934738159,
|
1596 |
+
"learning_rate": 1.3787322002756087e-05,
|
1597 |
+
"loss": 0.0412,
|
1598 |
+
"step": 5675
|
1599 |
+
},
|
1600 |
+
{
|
1601 |
+
"epoch": 2.36,
|
1602 |
+
"grad_norm": 0.21374674141407013,
|
1603 |
+
"learning_rate": 1.3701194304088195e-05,
|
1604 |
+
"loss": 0.0476,
|
1605 |
+
"step": 5700
|
1606 |
+
},
|
1607 |
+
{
|
1608 |
+
"epoch": 2.37,
|
1609 |
+
"grad_norm": 0.29949426651000977,
|
1610 |
+
"learning_rate": 1.3615066605420304e-05,
|
1611 |
+
"loss": 0.0526,
|
1612 |
+
"step": 5725
|
1613 |
+
},
|
1614 |
+
{
|
1615 |
+
"epoch": 2.38,
|
1616 |
+
"grad_norm": 0.28353065252304077,
|
1617 |
+
"learning_rate": 1.3528938906752411e-05,
|
1618 |
+
"loss": 0.0523,
|
1619 |
+
"step": 5750
|
1620 |
+
},
|
1621 |
+
{
|
1622 |
+
"epoch": 2.39,
|
1623 |
+
"grad_norm": 0.37691453099250793,
|
1624 |
+
"learning_rate": 1.344281120808452e-05,
|
1625 |
+
"loss": 0.0496,
|
1626 |
+
"step": 5775
|
1627 |
+
},
|
1628 |
+
{
|
1629 |
+
"epoch": 2.4,
|
1630 |
+
"grad_norm": 0.20896878838539124,
|
1631 |
+
"learning_rate": 1.3356683509416628e-05,
|
1632 |
+
"loss": 0.0505,
|
1633 |
+
"step": 5800
|
1634 |
+
},
|
1635 |
+
{
|
1636 |
+
"epoch": 2.41,
|
1637 |
+
"grad_norm": 0.2891576588153839,
|
1638 |
+
"learning_rate": 1.3270555810748737e-05,
|
1639 |
+
"loss": 0.0519,
|
1640 |
+
"step": 5825
|
1641 |
+
},
|
1642 |
+
{
|
1643 |
+
"epoch": 2.42,
|
1644 |
+
"grad_norm": 0.32109466195106506,
|
1645 |
+
"learning_rate": 1.3184428112080847e-05,
|
1646 |
+
"loss": 0.0457,
|
1647 |
+
"step": 5850
|
1648 |
+
},
|
1649 |
+
{
|
1650 |
+
"epoch": 2.43,
|
1651 |
+
"grad_norm": 0.37973764538764954,
|
1652 |
+
"learning_rate": 1.3098300413412954e-05,
|
1653 |
+
"loss": 0.0469,
|
1654 |
+
"step": 5875
|
1655 |
+
},
|
1656 |
+
{
|
1657 |
+
"epoch": 2.44,
|
1658 |
+
"grad_norm": 0.31194964051246643,
|
1659 |
+
"learning_rate": 1.3012172714745062e-05,
|
1660 |
+
"loss": 0.0455,
|
1661 |
+
"step": 5900
|
1662 |
+
},
|
1663 |
+
{
|
1664 |
+
"epoch": 2.45,
|
1665 |
+
"grad_norm": 0.46967265009880066,
|
1666 |
+
"learning_rate": 1.2926045016077171e-05,
|
1667 |
+
"loss": 0.0503,
|
1668 |
+
"step": 5925
|
1669 |
+
},
|
1670 |
+
{
|
1671 |
+
"epoch": 2.46,
|
1672 |
+
"grad_norm": 0.23756754398345947,
|
1673 |
+
"learning_rate": 1.283991731740928e-05,
|
1674 |
+
"loss": 0.0534,
|
1675 |
+
"step": 5950
|
1676 |
+
},
|
1677 |
+
{
|
1678 |
+
"epoch": 2.47,
|
1679 |
+
"grad_norm": 0.2805767059326172,
|
1680 |
+
"learning_rate": 1.2753789618741388e-05,
|
1681 |
+
"loss": 0.0535,
|
1682 |
+
"step": 5975
|
1683 |
+
},
|
1684 |
+
{
|
1685 |
+
"epoch": 2.48,
|
1686 |
+
"grad_norm": 0.3886590301990509,
|
1687 |
+
"learning_rate": 1.2667661920073497e-05,
|
1688 |
+
"loss": 0.0483,
|
1689 |
+
"step": 6000
|
1690 |
+
},
|
1691 |
+
{
|
1692 |
+
"epoch": 2.49,
|
1693 |
+
"grad_norm": 0.35990557074546814,
|
1694 |
+
"learning_rate": 1.2581534221405604e-05,
|
1695 |
+
"loss": 0.0442,
|
1696 |
+
"step": 6025
|
1697 |
+
},
|
1698 |
+
{
|
1699 |
+
"epoch": 2.5,
|
1700 |
+
"grad_norm": 0.39960598945617676,
|
1701 |
+
"learning_rate": 1.2495406522737712e-05,
|
1702 |
+
"loss": 0.0517,
|
1703 |
+
"step": 6050
|
1704 |
+
},
|
1705 |
+
{
|
1706 |
+
"epoch": 2.51,
|
1707 |
+
"grad_norm": 0.25265923142433167,
|
1708 |
+
"learning_rate": 1.2409278824069821e-05,
|
1709 |
+
"loss": 0.052,
|
1710 |
+
"step": 6075
|
1711 |
+
},
|
1712 |
+
{
|
1713 |
+
"epoch": 2.52,
|
1714 |
+
"grad_norm": 0.19778184592723846,
|
1715 |
+
"learning_rate": 1.232315112540193e-05,
|
1716 |
+
"loss": 0.0489,
|
1717 |
+
"step": 6100
|
1718 |
+
},
|
1719 |
+
{
|
1720 |
+
"epoch": 2.53,
|
1721 |
+
"grad_norm": 0.3069133758544922,
|
1722 |
+
"learning_rate": 1.2237023426734038e-05,
|
1723 |
+
"loss": 0.05,
|
1724 |
+
"step": 6125
|
1725 |
+
},
|
1726 |
+
{
|
1727 |
+
"epoch": 2.54,
|
1728 |
+
"grad_norm": 0.23380334675312042,
|
1729 |
+
"learning_rate": 1.2150895728066147e-05,
|
1730 |
+
"loss": 0.0456,
|
1731 |
+
"step": 6150
|
1732 |
+
},
|
1733 |
+
{
|
1734 |
+
"epoch": 2.55,
|
1735 |
+
"grad_norm": 0.22880606353282928,
|
1736 |
+
"learning_rate": 1.2064768029398255e-05,
|
1737 |
+
"loss": 0.0567,
|
1738 |
+
"step": 6175
|
1739 |
+
},
|
1740 |
+
{
|
1741 |
+
"epoch": 2.56,
|
1742 |
+
"grad_norm": 0.2991226315498352,
|
1743 |
+
"learning_rate": 1.1978640330730362e-05,
|
1744 |
+
"loss": 0.0537,
|
1745 |
+
"step": 6200
|
1746 |
+
},
|
1747 |
+
{
|
1748 |
+
"epoch": 2.57,
|
1749 |
+
"grad_norm": 0.23710553348064423,
|
1750 |
+
"learning_rate": 1.1892512632062471e-05,
|
1751 |
+
"loss": 0.0453,
|
1752 |
+
"step": 6225
|
1753 |
+
},
|
1754 |
+
{
|
1755 |
+
"epoch": 2.58,
|
1756 |
+
"grad_norm": 0.37337175011634827,
|
1757 |
+
"learning_rate": 1.180638493339458e-05,
|
1758 |
+
"loss": 0.0554,
|
1759 |
+
"step": 6250
|
1760 |
+
},
|
1761 |
+
{
|
1762 |
+
"epoch": 2.59,
|
1763 |
+
"grad_norm": 0.2040768265724182,
|
1764 |
+
"learning_rate": 1.1720257234726688e-05,
|
1765 |
+
"loss": 0.0568,
|
1766 |
+
"step": 6275
|
1767 |
+
},
|
1768 |
+
{
|
1769 |
+
"epoch": 2.6,
|
1770 |
+
"grad_norm": 0.14503860473632812,
|
1771 |
+
"learning_rate": 1.1634129536058797e-05,
|
1772 |
+
"loss": 0.0488,
|
1773 |
+
"step": 6300
|
1774 |
+
},
|
1775 |
+
{
|
1776 |
+
"epoch": 2.61,
|
1777 |
+
"grad_norm": 0.41582000255584717,
|
1778 |
+
"learning_rate": 1.1548001837390905e-05,
|
1779 |
+
"loss": 0.0503,
|
1780 |
+
"step": 6325
|
1781 |
+
},
|
1782 |
+
{
|
1783 |
+
"epoch": 2.63,
|
1784 |
+
"grad_norm": 0.4040040075778961,
|
1785 |
+
"learning_rate": 1.1461874138723012e-05,
|
1786 |
+
"loss": 0.0448,
|
1787 |
+
"step": 6350
|
1788 |
+
},
|
1789 |
+
{
|
1790 |
+
"epoch": 2.64,
|
1791 |
+
"grad_norm": 0.37314775586128235,
|
1792 |
+
"learning_rate": 1.1375746440055123e-05,
|
1793 |
+
"loss": 0.051,
|
1794 |
+
"step": 6375
|
1795 |
+
},
|
1796 |
+
{
|
1797 |
+
"epoch": 2.65,
|
1798 |
+
"grad_norm": 0.37785276770591736,
|
1799 |
+
"learning_rate": 1.128961874138723e-05,
|
1800 |
+
"loss": 0.0524,
|
1801 |
+
"step": 6400
|
1802 |
+
},
|
1803 |
+
{
|
1804 |
+
"epoch": 2.66,
|
1805 |
+
"grad_norm": 0.12397664785385132,
|
1806 |
+
"learning_rate": 1.1203491042719338e-05,
|
1807 |
+
"loss": 0.0554,
|
1808 |
+
"step": 6425
|
1809 |
+
},
|
1810 |
+
{
|
1811 |
+
"epoch": 2.67,
|
1812 |
+
"grad_norm": 0.252644419670105,
|
1813 |
+
"learning_rate": 1.1117363344051448e-05,
|
1814 |
+
"loss": 0.0504,
|
1815 |
+
"step": 6450
|
1816 |
+
},
|
1817 |
+
{
|
1818 |
+
"epoch": 2.68,
|
1819 |
+
"grad_norm": 0.27793678641319275,
|
1820 |
+
"learning_rate": 1.1031235645383555e-05,
|
1821 |
+
"loss": 0.0484,
|
1822 |
+
"step": 6475
|
1823 |
+
},
|
1824 |
+
{
|
1825 |
+
"epoch": 2.69,
|
1826 |
+
"grad_norm": 0.27731946110725403,
|
1827 |
+
"learning_rate": 1.0945107946715663e-05,
|
1828 |
+
"loss": 0.0456,
|
1829 |
+
"step": 6500
|
1830 |
+
},
|
1831 |
+
{
|
1832 |
+
"epoch": 2.7,
|
1833 |
+
"grad_norm": 0.5174788236618042,
|
1834 |
+
"learning_rate": 1.0858980248047774e-05,
|
1835 |
+
"loss": 0.0514,
|
1836 |
+
"step": 6525
|
1837 |
+
},
|
1838 |
+
{
|
1839 |
+
"epoch": 2.71,
|
1840 |
+
"grad_norm": 0.3238148093223572,
|
1841 |
+
"learning_rate": 1.0772852549379881e-05,
|
1842 |
+
"loss": 0.0509,
|
1843 |
+
"step": 6550
|
1844 |
+
},
|
1845 |
+
{
|
1846 |
+
"epoch": 2.72,
|
1847 |
+
"grad_norm": 0.25130346417427063,
|
1848 |
+
"learning_rate": 1.0686724850711989e-05,
|
1849 |
+
"loss": 0.046,
|
1850 |
+
"step": 6575
|
1851 |
+
},
|
1852 |
+
{
|
1853 |
+
"epoch": 2.73,
|
1854 |
+
"grad_norm": 0.2648930549621582,
|
1855 |
+
"learning_rate": 1.0600597152044098e-05,
|
1856 |
+
"loss": 0.0482,
|
1857 |
+
"step": 6600
|
1858 |
+
},
|
1859 |
+
{
|
1860 |
+
"epoch": 2.74,
|
1861 |
+
"grad_norm": 0.23606421053409576,
|
1862 |
+
"learning_rate": 1.0514469453376205e-05,
|
1863 |
+
"loss": 0.049,
|
1864 |
+
"step": 6625
|
1865 |
+
},
|
1866 |
+
{
|
1867 |
+
"epoch": 2.75,
|
1868 |
+
"grad_norm": 0.13270416855812073,
|
1869 |
+
"learning_rate": 1.0428341754708315e-05,
|
1870 |
+
"loss": 0.0494,
|
1871 |
+
"step": 6650
|
1872 |
+
},
|
1873 |
+
{
|
1874 |
+
"epoch": 2.76,
|
1875 |
+
"grad_norm": 0.485784113407135,
|
1876 |
+
"learning_rate": 1.0342214056040424e-05,
|
1877 |
+
"loss": 0.0435,
|
1878 |
+
"step": 6675
|
1879 |
+
},
|
1880 |
+
{
|
1881 |
+
"epoch": 2.77,
|
1882 |
+
"grad_norm": 0.37061092257499695,
|
1883 |
+
"learning_rate": 1.0256086357372531e-05,
|
1884 |
+
"loss": 0.0458,
|
1885 |
+
"step": 6700
|
1886 |
+
},
|
1887 |
+
{
|
1888 |
+
"epoch": 2.78,
|
1889 |
+
"grad_norm": 0.23628602921962738,
|
1890 |
+
"learning_rate": 1.0169958658704639e-05,
|
1891 |
+
"loss": 0.0488,
|
1892 |
+
"step": 6725
|
1893 |
+
},
|
1894 |
+
{
|
1895 |
+
"epoch": 2.79,
|
1896 |
+
"grad_norm": 0.5137524604797363,
|
1897 |
+
"learning_rate": 1.0083830960036748e-05,
|
1898 |
+
"loss": 0.0515,
|
1899 |
+
"step": 6750
|
1900 |
+
},
|
1901 |
+
{
|
1902 |
+
"epoch": 2.8,
|
1903 |
+
"grad_norm": 0.4176236093044281,
|
1904 |
+
"learning_rate": 9.997703261368856e-06,
|
1905 |
+
"loss": 0.0452,
|
1906 |
+
"step": 6775
|
1907 |
+
},
|
1908 |
+
{
|
1909 |
+
"epoch": 2.81,
|
1910 |
+
"grad_norm": 0.3956039547920227,
|
1911 |
+
"learning_rate": 9.911575562700965e-06,
|
1912 |
+
"loss": 0.0499,
|
1913 |
+
"step": 6800
|
1914 |
+
},
|
1915 |
+
{
|
1916 |
+
"epoch": 2.82,
|
1917 |
+
"grad_norm": 0.29062360525131226,
|
1918 |
+
"learning_rate": 9.825447864033074e-06,
|
1919 |
+
"loss": 0.0479,
|
1920 |
+
"step": 6825
|
1921 |
+
},
|
1922 |
+
{
|
1923 |
+
"epoch": 2.83,
|
1924 |
+
"grad_norm": 0.3083875775337219,
|
1925 |
+
"learning_rate": 9.739320165365182e-06,
|
1926 |
+
"loss": 0.0501,
|
1927 |
+
"step": 6850
|
1928 |
+
},
|
1929 |
+
{
|
1930 |
+
"epoch": 2.84,
|
1931 |
+
"grad_norm": 0.43474555015563965,
|
1932 |
+
"learning_rate": 9.653192466697289e-06,
|
1933 |
+
"loss": 0.0562,
|
1934 |
+
"step": 6875
|
1935 |
+
},
|
1936 |
+
{
|
1937 |
+
"epoch": 2.85,
|
1938 |
+
"grad_norm": 0.21572205424308777,
|
1939 |
+
"learning_rate": 9.567064768029398e-06,
|
1940 |
+
"loss": 0.049,
|
1941 |
+
"step": 6900
|
1942 |
+
},
|
1943 |
+
{
|
1944 |
+
"epoch": 2.86,
|
1945 |
+
"grad_norm": 0.17227627336978912,
|
1946 |
+
"learning_rate": 9.480937069361506e-06,
|
1947 |
+
"loss": 0.0455,
|
1948 |
+
"step": 6925
|
1949 |
+
},
|
1950 |
+
{
|
1951 |
+
"epoch": 2.87,
|
1952 |
+
"grad_norm": 0.34257158637046814,
|
1953 |
+
"learning_rate": 9.394809370693617e-06,
|
1954 |
+
"loss": 0.0456,
|
1955 |
+
"step": 6950
|
1956 |
+
},
|
1957 |
+
{
|
1958 |
+
"epoch": 2.88,
|
1959 |
+
"grad_norm": 0.30878308415412903,
|
1960 |
+
"learning_rate": 9.308681672025724e-06,
|
1961 |
+
"loss": 0.0469,
|
1962 |
+
"step": 6975
|
1963 |
+
},
|
1964 |
+
{
|
1965 |
+
"epoch": 2.89,
|
1966 |
+
"grad_norm": 0.338555246591568,
|
1967 |
+
"learning_rate": 9.222553973357832e-06,
|
1968 |
+
"loss": 0.0458,
|
1969 |
+
"step": 7000
|
1970 |
+
},
|
1971 |
+
{
|
1972 |
+
"epoch": 2.9,
|
1973 |
+
"grad_norm": 0.2819863259792328,
|
1974 |
+
"learning_rate": 9.136426274689941e-06,
|
1975 |
+
"loss": 0.0514,
|
1976 |
+
"step": 7025
|
1977 |
+
},
|
1978 |
+
{
|
1979 |
+
"epoch": 2.91,
|
1980 |
+
"grad_norm": 0.27121391892433167,
|
1981 |
+
"learning_rate": 9.050298576022049e-06,
|
1982 |
+
"loss": 0.0574,
|
1983 |
+
"step": 7050
|
1984 |
+
},
|
1985 |
+
{
|
1986 |
+
"epoch": 2.92,
|
1987 |
+
"grad_norm": 0.28231513500213623,
|
1988 |
+
"learning_rate": 8.964170877354158e-06,
|
1989 |
+
"loss": 0.0459,
|
1990 |
+
"step": 7075
|
1991 |
+
},
|
1992 |
+
{
|
1993 |
+
"epoch": 2.94,
|
1994 |
+
"grad_norm": 0.3147279620170593,
|
1995 |
+
"learning_rate": 8.878043178686267e-06,
|
1996 |
+
"loss": 0.055,
|
1997 |
+
"step": 7100
|
1998 |
+
},
|
1999 |
+
{
|
2000 |
+
"epoch": 2.95,
|
2001 |
+
"grad_norm": 0.3944794535636902,
|
2002 |
+
"learning_rate": 8.791915480018375e-06,
|
2003 |
+
"loss": 0.0561,
|
2004 |
+
"step": 7125
|
2005 |
+
},
|
2006 |
+
{
|
2007 |
+
"epoch": 2.96,
|
2008 |
+
"grad_norm": 0.5070275664329529,
|
2009 |
+
"learning_rate": 8.705787781350482e-06,
|
2010 |
+
"loss": 0.0452,
|
2011 |
+
"step": 7150
|
2012 |
+
},
|
2013 |
+
{
|
2014 |
+
"epoch": 2.97,
|
2015 |
+
"grad_norm": 0.29337751865386963,
|
2016 |
+
"learning_rate": 8.619660082682591e-06,
|
2017 |
+
"loss": 0.047,
|
2018 |
+
"step": 7175
|
2019 |
+
},
|
2020 |
+
{
|
2021 |
+
"epoch": 2.98,
|
2022 |
+
"grad_norm": 0.3418371081352234,
|
2023 |
+
"learning_rate": 8.533532384014699e-06,
|
2024 |
+
"loss": 0.0441,
|
2025 |
+
"step": 7200
|
2026 |
+
},
|
2027 |
+
{
|
2028 |
+
"epoch": 2.99,
|
2029 |
+
"grad_norm": 0.34925562143325806,
|
2030 |
+
"learning_rate": 8.447404685346808e-06,
|
2031 |
+
"loss": 0.043,
|
2032 |
+
"step": 7225
|
2033 |
+
},
|
2034 |
+
{
|
2035 |
+
"epoch": 3.0,
|
2036 |
+
"grad_norm": 0.32116442918777466,
|
2037 |
+
"learning_rate": 8.361276986678917e-06,
|
2038 |
+
"loss": 0.0475,
|
2039 |
+
"step": 7250
|
2040 |
+
},
|
2041 |
+
{
|
2042 |
+
"epoch": 3.01,
|
2043 |
+
"grad_norm": 0.3370840549468994,
|
2044 |
+
"learning_rate": 8.275149288011025e-06,
|
2045 |
+
"loss": 0.0454,
|
2046 |
+
"step": 7275
|
2047 |
+
},
|
2048 |
+
{
|
2049 |
+
"epoch": 3.02,
|
2050 |
+
"grad_norm": 0.28512734174728394,
|
2051 |
+
"learning_rate": 8.189021589343132e-06,
|
2052 |
+
"loss": 0.0358,
|
2053 |
+
"step": 7300
|
2054 |
+
},
|
2055 |
+
{
|
2056 |
+
"epoch": 3.03,
|
2057 |
+
"grad_norm": 0.35008323192596436,
|
2058 |
+
"learning_rate": 8.102893890675242e-06,
|
2059 |
+
"loss": 0.0425,
|
2060 |
+
"step": 7325
|
2061 |
+
},
|
2062 |
+
{
|
2063 |
+
"epoch": 3.04,
|
2064 |
+
"grad_norm": 0.32068151235580444,
|
2065 |
+
"learning_rate": 8.016766192007349e-06,
|
2066 |
+
"loss": 0.0413,
|
2067 |
+
"step": 7350
|
2068 |
+
},
|
2069 |
+
{
|
2070 |
+
"epoch": 3.05,
|
2071 |
+
"grad_norm": 0.27377787232398987,
|
2072 |
+
"learning_rate": 7.930638493339458e-06,
|
2073 |
+
"loss": 0.037,
|
2074 |
+
"step": 7375
|
2075 |
+
},
|
2076 |
+
{
|
2077 |
+
"epoch": 3.06,
|
2078 |
+
"grad_norm": 0.1453736424446106,
|
2079 |
+
"learning_rate": 7.844510794671568e-06,
|
2080 |
+
"loss": 0.0364,
|
2081 |
+
"step": 7400
|
2082 |
+
},
|
2083 |
+
{
|
2084 |
+
"epoch": 3.07,
|
2085 |
+
"grad_norm": 0.4593455493450165,
|
2086 |
+
"learning_rate": 7.758383096003675e-06,
|
2087 |
+
"loss": 0.0398,
|
2088 |
+
"step": 7425
|
2089 |
+
},
|
2090 |
+
{
|
2091 |
+
"epoch": 3.08,
|
2092 |
+
"grad_norm": 0.3376910984516144,
|
2093 |
+
"learning_rate": 7.672255397335783e-06,
|
2094 |
+
"loss": 0.0425,
|
2095 |
+
"step": 7450
|
2096 |
+
},
|
2097 |
+
{
|
2098 |
+
"epoch": 3.09,
|
2099 |
+
"grad_norm": 0.4269729554653168,
|
2100 |
+
"learning_rate": 7.586127698667893e-06,
|
2101 |
+
"loss": 0.0421,
|
2102 |
+
"step": 7475
|
2103 |
+
},
|
2104 |
+
{
|
2105 |
+
"epoch": 3.1,
|
2106 |
+
"grad_norm": 0.42669641971588135,
|
2107 |
+
"learning_rate": 7.5e-06,
|
2108 |
+
"loss": 0.0402,
|
2109 |
+
"step": 7500
|
2110 |
+
},
|
2111 |
+
{
|
2112 |
+
"epoch": 3.11,
|
2113 |
+
"grad_norm": 0.3629492223262787,
|
2114 |
+
"learning_rate": 7.4138723013321086e-06,
|
2115 |
+
"loss": 0.0379,
|
2116 |
+
"step": 7525
|
2117 |
+
},
|
2118 |
+
{
|
2119 |
+
"epoch": 3.12,
|
2120 |
+
"grad_norm": 0.24746298789978027,
|
2121 |
+
"learning_rate": 7.327744602664217e-06,
|
2122 |
+
"loss": 0.0404,
|
2123 |
+
"step": 7550
|
2124 |
+
},
|
2125 |
+
{
|
2126 |
+
"epoch": 3.13,
|
2127 |
+
"grad_norm": 0.19069620966911316,
|
2128 |
+
"learning_rate": 7.241616903996325e-06,
|
2129 |
+
"loss": 0.0393,
|
2130 |
+
"step": 7575
|
2131 |
+
},
|
2132 |
+
{
|
2133 |
+
"epoch": 3.14,
|
2134 |
+
"grad_norm": 0.2659347951412201,
|
2135 |
+
"learning_rate": 7.155489205328434e-06,
|
2136 |
+
"loss": 0.0404,
|
2137 |
+
"step": 7600
|
2138 |
+
},
|
2139 |
+
{
|
2140 |
+
"epoch": 3.15,
|
2141 |
+
"grad_norm": 0.15745492279529572,
|
2142 |
+
"learning_rate": 7.069361506660542e-06,
|
2143 |
+
"loss": 0.0392,
|
2144 |
+
"step": 7625
|
2145 |
+
},
|
2146 |
+
{
|
2147 |
+
"epoch": 3.16,
|
2148 |
+
"grad_norm": 0.20051732659339905,
|
2149 |
+
"learning_rate": 6.9866789159393665e-06,
|
2150 |
+
"loss": 0.0361,
|
2151 |
+
"step": 7650
|
2152 |
+
},
|
2153 |
+
{
|
2154 |
+
"epoch": 3.17,
|
2155 |
+
"grad_norm": 0.3713512122631073,
|
2156 |
+
"learning_rate": 6.900551217271474e-06,
|
2157 |
+
"loss": 0.0386,
|
2158 |
+
"step": 7675
|
2159 |
+
},
|
2160 |
+
{
|
2161 |
+
"epoch": 3.18,
|
2162 |
+
"grad_norm": 0.42229992151260376,
|
2163 |
+
"learning_rate": 6.814423518603583e-06,
|
2164 |
+
"loss": 0.0361,
|
2165 |
+
"step": 7700
|
2166 |
+
},
|
2167 |
+
{
|
2168 |
+
"epoch": 3.19,
|
2169 |
+
"grad_norm": 0.2441110610961914,
|
2170 |
+
"learning_rate": 6.728295819935692e-06,
|
2171 |
+
"loss": 0.037,
|
2172 |
+
"step": 7725
|
2173 |
+
},
|
2174 |
+
{
|
2175 |
+
"epoch": 3.2,
|
2176 |
+
"grad_norm": 0.18802495300769806,
|
2177 |
+
"learning_rate": 6.642168121267799e-06,
|
2178 |
+
"loss": 0.042,
|
2179 |
+
"step": 7750
|
2180 |
+
},
|
2181 |
+
{
|
2182 |
+
"epoch": 3.21,
|
2183 |
+
"grad_norm": 0.25679391622543335,
|
2184 |
+
"learning_rate": 6.556040422599908e-06,
|
2185 |
+
"loss": 0.0393,
|
2186 |
+
"step": 7775
|
2187 |
+
},
|
2188 |
+
{
|
2189 |
+
"epoch": 3.22,
|
2190 |
+
"grad_norm": 0.29285240173339844,
|
2191 |
+
"learning_rate": 6.469912723932017e-06,
|
2192 |
+
"loss": 0.0419,
|
2193 |
+
"step": 7800
|
2194 |
+
},
|
2195 |
+
{
|
2196 |
+
"epoch": 3.23,
|
2197 |
+
"grad_norm": 0.3169197738170624,
|
2198 |
+
"learning_rate": 6.383785025264124e-06,
|
2199 |
+
"loss": 0.0409,
|
2200 |
+
"step": 7825
|
2201 |
+
},
|
2202 |
+
{
|
2203 |
+
"epoch": 3.25,
|
2204 |
+
"grad_norm": 0.32817521691322327,
|
2205 |
+
"learning_rate": 6.2976573265962335e-06,
|
2206 |
+
"loss": 0.0403,
|
2207 |
+
"step": 7850
|
2208 |
+
},
|
2209 |
+
{
|
2210 |
+
"epoch": 3.26,
|
2211 |
+
"grad_norm": 0.29750293493270874,
|
2212 |
+
"learning_rate": 6.211529627928342e-06,
|
2213 |
+
"loss": 0.0387,
|
2214 |
+
"step": 7875
|
2215 |
+
},
|
2216 |
+
{
|
2217 |
+
"epoch": 3.27,
|
2218 |
+
"grad_norm": 0.44555339217185974,
|
2219 |
+
"learning_rate": 6.125401929260451e-06,
|
2220 |
+
"loss": 0.0413,
|
2221 |
+
"step": 7900
|
2222 |
+
},
|
2223 |
+
{
|
2224 |
+
"epoch": 3.28,
|
2225 |
+
"grad_norm": 0.34628069400787354,
|
2226 |
+
"learning_rate": 6.039274230592559e-06,
|
2227 |
+
"loss": 0.0405,
|
2228 |
+
"step": 7925
|
2229 |
+
},
|
2230 |
+
{
|
2231 |
+
"epoch": 3.29,
|
2232 |
+
"grad_norm": 0.36807015538215637,
|
2233 |
+
"learning_rate": 5.953146531924667e-06,
|
2234 |
+
"loss": 0.0399,
|
2235 |
+
"step": 7950
|
2236 |
+
},
|
2237 |
+
{
|
2238 |
+
"epoch": 3.3,
|
2239 |
+
"grad_norm": 0.2611570656299591,
|
2240 |
+
"learning_rate": 5.867018833256776e-06,
|
2241 |
+
"loss": 0.0384,
|
2242 |
+
"step": 7975
|
2243 |
+
},
|
2244 |
+
{
|
2245 |
+
"epoch": 3.31,
|
2246 |
+
"grad_norm": 0.2191769927740097,
|
2247 |
+
"learning_rate": 5.780891134588884e-06,
|
2248 |
+
"loss": 0.0387,
|
2249 |
+
"step": 8000
|
2250 |
+
},
|
2251 |
+
{
|
2252 |
+
"epoch": 3.32,
|
2253 |
+
"grad_norm": 0.38859057426452637,
|
2254 |
+
"learning_rate": 5.694763435920992e-06,
|
2255 |
+
"loss": 0.0451,
|
2256 |
+
"step": 8025
|
2257 |
+
},
|
2258 |
+
{
|
2259 |
+
"epoch": 3.33,
|
2260 |
+
"grad_norm": 0.24283719062805176,
|
2261 |
+
"learning_rate": 5.608635737253101e-06,
|
2262 |
+
"loss": 0.0378,
|
2263 |
+
"step": 8050
|
2264 |
+
},
|
2265 |
+
{
|
2266 |
+
"epoch": 3.34,
|
2267 |
+
"grad_norm": 0.3681369423866272,
|
2268 |
+
"learning_rate": 5.522508038585209e-06,
|
2269 |
+
"loss": 0.0438,
|
2270 |
+
"step": 8075
|
2271 |
+
},
|
2272 |
+
{
|
2273 |
+
"epoch": 3.35,
|
2274 |
+
"grad_norm": 0.3467373251914978,
|
2275 |
+
"learning_rate": 5.436380339917317e-06,
|
2276 |
+
"loss": 0.0416,
|
2277 |
+
"step": 8100
|
2278 |
+
},
|
2279 |
+
{
|
2280 |
+
"epoch": 3.36,
|
2281 |
+
"grad_norm": 0.3951582610607147,
|
2282 |
+
"learning_rate": 5.3502526412494265e-06,
|
2283 |
+
"loss": 0.0407,
|
2284 |
+
"step": 8125
|
2285 |
+
},
|
2286 |
+
{
|
2287 |
+
"epoch": 3.37,
|
2288 |
+
"grad_norm": 0.581302285194397,
|
2289 |
+
"learning_rate": 5.264124942581534e-06,
|
2290 |
+
"loss": 0.0391,
|
2291 |
+
"step": 8150
|
2292 |
+
},
|
2293 |
+
{
|
2294 |
+
"epoch": 3.38,
|
2295 |
+
"grad_norm": 0.34989818930625916,
|
2296 |
+
"learning_rate": 5.177997243913642e-06,
|
2297 |
+
"loss": 0.0415,
|
2298 |
+
"step": 8175
|
2299 |
+
},
|
2300 |
+
{
|
2301 |
+
"epoch": 3.39,
|
2302 |
+
"grad_norm": 0.4126095771789551,
|
2303 |
+
"learning_rate": 5.091869545245752e-06,
|
2304 |
+
"loss": 0.0364,
|
2305 |
+
"step": 8200
|
2306 |
+
},
|
2307 |
+
{
|
2308 |
+
"epoch": 3.4,
|
2309 |
+
"grad_norm": 0.3435223698616028,
|
2310 |
+
"learning_rate": 5.005741846577859e-06,
|
2311 |
+
"loss": 0.0375,
|
2312 |
+
"step": 8225
|
2313 |
+
},
|
2314 |
+
{
|
2315 |
+
"epoch": 3.41,
|
2316 |
+
"grad_norm": 0.42563363909721375,
|
2317 |
+
"learning_rate": 4.9196141479099675e-06,
|
2318 |
+
"loss": 0.0404,
|
2319 |
+
"step": 8250
|
2320 |
+
},
|
2321 |
+
{
|
2322 |
+
"epoch": 3.42,
|
2323 |
+
"grad_norm": 0.2580004632472992,
|
2324 |
+
"learning_rate": 4.833486449242077e-06,
|
2325 |
+
"loss": 0.0381,
|
2326 |
+
"step": 8275
|
2327 |
+
},
|
2328 |
+
{
|
2329 |
+
"epoch": 3.43,
|
2330 |
+
"grad_norm": 0.37050294876098633,
|
2331 |
+
"learning_rate": 4.747358750574184e-06,
|
2332 |
+
"loss": 0.042,
|
2333 |
+
"step": 8300
|
2334 |
+
},
|
2335 |
+
{
|
2336 |
+
"epoch": 3.44,
|
2337 |
+
"grad_norm": 0.31261393427848816,
|
2338 |
+
"learning_rate": 4.6612310519062935e-06,
|
2339 |
+
"loss": 0.0411,
|
2340 |
+
"step": 8325
|
2341 |
+
},
|
2342 |
+
{
|
2343 |
+
"epoch": 3.45,
|
2344 |
+
"grad_norm": 0.33394724130630493,
|
2345 |
+
"learning_rate": 4.575103353238402e-06,
|
2346 |
+
"loss": 0.0445,
|
2347 |
+
"step": 8350
|
2348 |
+
},
|
2349 |
+
{
|
2350 |
+
"epoch": 3.46,
|
2351 |
+
"grad_norm": 0.6875510811805725,
|
2352 |
+
"learning_rate": 4.4889756545705094e-06,
|
2353 |
+
"loss": 0.0407,
|
2354 |
+
"step": 8375
|
2355 |
+
},
|
2356 |
+
{
|
2357 |
+
"epoch": 3.47,
|
2358 |
+
"grad_norm": 0.4609951674938202,
|
2359 |
+
"learning_rate": 4.402847955902619e-06,
|
2360 |
+
"loss": 0.0355,
|
2361 |
+
"step": 8400
|
2362 |
+
},
|
2363 |
+
{
|
2364 |
+
"epoch": 3.48,
|
2365 |
+
"grad_norm": 0.36755943298339844,
|
2366 |
+
"learning_rate": 4.316720257234727e-06,
|
2367 |
+
"loss": 0.0412,
|
2368 |
+
"step": 8425
|
2369 |
+
},
|
2370 |
+
{
|
2371 |
+
"epoch": 3.49,
|
2372 |
+
"grad_norm": 0.3488776385784149,
|
2373 |
+
"learning_rate": 4.2305925585668345e-06,
|
2374 |
+
"loss": 0.0432,
|
2375 |
+
"step": 8450
|
2376 |
+
},
|
2377 |
+
{
|
2378 |
+
"epoch": 3.5,
|
2379 |
+
"grad_norm": 0.2769566476345062,
|
2380 |
+
"learning_rate": 4.144464859898944e-06,
|
2381 |
+
"loss": 0.034,
|
2382 |
+
"step": 8475
|
2383 |
+
},
|
2384 |
+
{
|
2385 |
+
"epoch": 3.51,
|
2386 |
+
"grad_norm": 0.3229292333126068,
|
2387 |
+
"learning_rate": 4.058337161231052e-06,
|
2388 |
+
"loss": 0.0351,
|
2389 |
+
"step": 8500
|
2390 |
+
},
|
2391 |
+
{
|
2392 |
+
"epoch": 3.52,
|
2393 |
+
"grad_norm": 0.39641430974006653,
|
2394 |
+
"learning_rate": 3.9722094625631605e-06,
|
2395 |
+
"loss": 0.0365,
|
2396 |
+
"step": 8525
|
2397 |
+
},
|
2398 |
+
{
|
2399 |
+
"epoch": 3.53,
|
2400 |
+
"grad_norm": 0.3574431240558624,
|
2401 |
+
"learning_rate": 3.886081763895269e-06,
|
2402 |
+
"loss": 0.04,
|
2403 |
+
"step": 8550
|
2404 |
+
},
|
2405 |
+
{
|
2406 |
+
"epoch": 3.54,
|
2407 |
+
"grad_norm": 0.3420943319797516,
|
2408 |
+
"learning_rate": 3.7999540652273773e-06,
|
2409 |
+
"loss": 0.039,
|
2410 |
+
"step": 8575
|
2411 |
+
},
|
2412 |
+
{
|
2413 |
+
"epoch": 3.56,
|
2414 |
+
"grad_norm": 0.3045111894607544,
|
2415 |
+
"learning_rate": 3.7138263665594856e-06,
|
2416 |
+
"loss": 0.0394,
|
2417 |
+
"step": 8600
|
2418 |
+
},
|
2419 |
+
{
|
2420 |
+
"epoch": 3.57,
|
2421 |
+
"grad_norm": 0.5104286670684814,
|
2422 |
+
"learning_rate": 3.627698667891594e-06,
|
2423 |
+
"loss": 0.0358,
|
2424 |
+
"step": 8625
|
2425 |
+
},
|
2426 |
+
{
|
2427 |
+
"epoch": 3.58,
|
2428 |
+
"grad_norm": 0.2855064272880554,
|
2429 |
+
"learning_rate": 3.5415709692237024e-06,
|
2430 |
+
"loss": 0.0426,
|
2431 |
+
"step": 8650
|
2432 |
+
},
|
2433 |
+
{
|
2434 |
+
"epoch": 3.59,
|
2435 |
+
"grad_norm": 0.3009551167488098,
|
2436 |
+
"learning_rate": 3.4554432705558108e-06,
|
2437 |
+
"loss": 0.0412,
|
2438 |
+
"step": 8675
|
2439 |
+
},
|
2440 |
+
{
|
2441 |
+
"epoch": 3.6,
|
2442 |
+
"grad_norm": 0.24823172390460968,
|
2443 |
+
"learning_rate": 3.3693155718879196e-06,
|
2444 |
+
"loss": 0.0408,
|
2445 |
+
"step": 8700
|
2446 |
+
},
|
2447 |
+
{
|
2448 |
+
"epoch": 3.61,
|
2449 |
+
"grad_norm": 0.3101685643196106,
|
2450 |
+
"learning_rate": 3.2831878732200275e-06,
|
2451 |
+
"loss": 0.0385,
|
2452 |
+
"step": 8725
|
2453 |
+
},
|
2454 |
+
{
|
2455 |
+
"epoch": 3.62,
|
2456 |
+
"grad_norm": 0.2525741159915924,
|
2457 |
+
"learning_rate": 3.197060174552136e-06,
|
2458 |
+
"loss": 0.0405,
|
2459 |
+
"step": 8750
|
2460 |
+
},
|
2461 |
+
{
|
2462 |
+
"epoch": 3.63,
|
2463 |
+
"grad_norm": 0.27567043900489807,
|
2464 |
+
"learning_rate": 3.1109324758842447e-06,
|
2465 |
+
"loss": 0.0402,
|
2466 |
+
"step": 8775
|
2467 |
+
},
|
2468 |
+
{
|
2469 |
+
"epoch": 3.64,
|
2470 |
+
"grad_norm": 0.3665957748889923,
|
2471 |
+
"learning_rate": 3.0248047772163526e-06,
|
2472 |
+
"loss": 0.0369,
|
2473 |
+
"step": 8800
|
2474 |
+
},
|
2475 |
+
{
|
2476 |
+
"epoch": 3.65,
|
2477 |
+
"grad_norm": 0.32239583134651184,
|
2478 |
+
"learning_rate": 2.938677078548461e-06,
|
2479 |
+
"loss": 0.0366,
|
2480 |
+
"step": 8825
|
2481 |
+
},
|
2482 |
+
{
|
2483 |
+
"epoch": 3.66,
|
2484 |
+
"grad_norm": 0.2736150026321411,
|
2485 |
+
"learning_rate": 2.85254937988057e-06,
|
2486 |
+
"loss": 0.0379,
|
2487 |
+
"step": 8850
|
2488 |
+
},
|
2489 |
+
{
|
2490 |
+
"epoch": 3.67,
|
2491 |
+
"grad_norm": 0.6374419331550598,
|
2492 |
+
"learning_rate": 2.766421681212678e-06,
|
2493 |
+
"loss": 0.0386,
|
2494 |
+
"step": 8875
|
2495 |
+
},
|
2496 |
+
{
|
2497 |
+
"epoch": 3.68,
|
2498 |
+
"grad_norm": 0.17933352291584015,
|
2499 |
+
"learning_rate": 2.680293982544786e-06,
|
2500 |
+
"loss": 0.0403,
|
2501 |
+
"step": 8900
|
2502 |
+
},
|
2503 |
+
{
|
2504 |
+
"epoch": 3.69,
|
2505 |
+
"grad_norm": 0.429572731256485,
|
2506 |
+
"learning_rate": 2.594166283876895e-06,
|
2507 |
+
"loss": 0.041,
|
2508 |
+
"step": 8925
|
2509 |
+
},
|
2510 |
+
{
|
2511 |
+
"epoch": 3.7,
|
2512 |
+
"grad_norm": 0.343364953994751,
|
2513 |
+
"learning_rate": 2.5080385852090033e-06,
|
2514 |
+
"loss": 0.0415,
|
2515 |
+
"step": 8950
|
2516 |
+
},
|
2517 |
+
{
|
2518 |
+
"epoch": 3.71,
|
2519 |
+
"grad_norm": 0.3911416530609131,
|
2520 |
+
"learning_rate": 2.4219108865411117e-06,
|
2521 |
+
"loss": 0.0337,
|
2522 |
+
"step": 8975
|
2523 |
+
},
|
2524 |
+
{
|
2525 |
+
"epoch": 3.72,
|
2526 |
+
"grad_norm": 0.43486475944519043,
|
2527 |
+
"learning_rate": 2.33578318787322e-06,
|
2528 |
+
"loss": 0.0415,
|
2529 |
+
"step": 9000
|
2530 |
+
},
|
2531 |
+
{
|
2532 |
+
"epoch": 3.73,
|
2533 |
+
"grad_norm": 0.22443470358848572,
|
2534 |
+
"learning_rate": 2.2496554892053284e-06,
|
2535 |
+
"loss": 0.0389,
|
2536 |
+
"step": 9025
|
2537 |
+
},
|
2538 |
+
{
|
2539 |
+
"epoch": 3.74,
|
2540 |
+
"grad_norm": 0.36421477794647217,
|
2541 |
+
"learning_rate": 2.1635277905374372e-06,
|
2542 |
+
"loss": 0.0407,
|
2543 |
+
"step": 9050
|
2544 |
+
},
|
2545 |
+
{
|
2546 |
+
"epoch": 3.75,
|
2547 |
+
"grad_norm": 0.382355660200119,
|
2548 |
+
"learning_rate": 2.077400091869545e-06,
|
2549 |
+
"loss": 0.0394,
|
2550 |
+
"step": 9075
|
2551 |
+
},
|
2552 |
+
{
|
2553 |
+
"epoch": 3.76,
|
2554 |
+
"grad_norm": 0.2957639694213867,
|
2555 |
+
"learning_rate": 1.9912723932016536e-06,
|
2556 |
+
"loss": 0.0425,
|
2557 |
+
"step": 9100
|
2558 |
+
},
|
2559 |
+
{
|
2560 |
+
"epoch": 3.77,
|
2561 |
+
"grad_norm": 0.44382327795028687,
|
2562 |
+
"learning_rate": 1.9051446945337622e-06,
|
2563 |
+
"loss": 0.0392,
|
2564 |
+
"step": 9125
|
2565 |
+
},
|
2566 |
+
{
|
2567 |
+
"epoch": 3.78,
|
2568 |
+
"grad_norm": 0.3849116265773773,
|
2569 |
+
"learning_rate": 1.8190169958658705e-06,
|
2570 |
+
"loss": 0.0376,
|
2571 |
+
"step": 9150
|
2572 |
+
},
|
2573 |
+
{
|
2574 |
+
"epoch": 3.79,
|
2575 |
+
"grad_norm": 0.37635913491249084,
|
2576 |
+
"learning_rate": 1.732889297197979e-06,
|
2577 |
+
"loss": 0.0364,
|
2578 |
+
"step": 9175
|
2579 |
+
},
|
2580 |
+
{
|
2581 |
+
"epoch": 3.8,
|
2582 |
+
"grad_norm": 0.2440154105424881,
|
2583 |
+
"learning_rate": 1.6467615985300875e-06,
|
2584 |
+
"loss": 0.0353,
|
2585 |
+
"step": 9200
|
2586 |
+
},
|
2587 |
+
{
|
2588 |
+
"epoch": 3.81,
|
2589 |
+
"grad_norm": 0.3721398711204529,
|
2590 |
+
"learning_rate": 1.5606338998621957e-06,
|
2591 |
+
"loss": 0.0456,
|
2592 |
+
"step": 9225
|
2593 |
+
},
|
2594 |
+
{
|
2595 |
+
"epoch": 3.82,
|
2596 |
+
"grad_norm": 0.42328134179115295,
|
2597 |
+
"learning_rate": 1.474506201194304e-06,
|
2598 |
+
"loss": 0.0381,
|
2599 |
+
"step": 9250
|
2600 |
+
},
|
2601 |
+
{
|
2602 |
+
"epoch": 3.83,
|
2603 |
+
"grad_norm": 0.3101789653301239,
|
2604 |
+
"learning_rate": 1.3883785025264126e-06,
|
2605 |
+
"loss": 0.0355,
|
2606 |
+
"step": 9275
|
2607 |
+
},
|
2608 |
+
{
|
2609 |
+
"epoch": 3.84,
|
2610 |
+
"grad_norm": 0.41222986578941345,
|
2611 |
+
"learning_rate": 1.302250803858521e-06,
|
2612 |
+
"loss": 0.0379,
|
2613 |
+
"step": 9300
|
2614 |
+
},
|
2615 |
+
{
|
2616 |
+
"epoch": 3.85,
|
2617 |
+
"grad_norm": 0.38471460342407227,
|
2618 |
+
"learning_rate": 1.2161231051906294e-06,
|
2619 |
+
"loss": 0.04,
|
2620 |
+
"step": 9325
|
2621 |
+
},
|
2622 |
+
{
|
2623 |
+
"epoch": 3.87,
|
2624 |
+
"grad_norm": 0.3404221534729004,
|
2625 |
+
"learning_rate": 1.1299954065227377e-06,
|
2626 |
+
"loss": 0.0429,
|
2627 |
+
"step": 9350
|
2628 |
+
},
|
2629 |
+
{
|
2630 |
+
"epoch": 3.88,
|
2631 |
+
"grad_norm": 0.3508870303630829,
|
2632 |
+
"learning_rate": 1.0438677078548461e-06,
|
2633 |
+
"loss": 0.0422,
|
2634 |
+
"step": 9375
|
2635 |
+
},
|
2636 |
+
{
|
2637 |
+
"epoch": 3.89,
|
2638 |
+
"grad_norm": 0.3830946385860443,
|
2639 |
+
"learning_rate": 9.577400091869545e-07,
|
2640 |
+
"loss": 0.0354,
|
2641 |
+
"step": 9400
|
2642 |
+
},
|
2643 |
+
{
|
2644 |
+
"epoch": 3.9,
|
2645 |
+
"grad_norm": 0.2902699410915375,
|
2646 |
+
"learning_rate": 8.71612310519063e-07,
|
2647 |
+
"loss": 0.0383,
|
2648 |
+
"step": 9425
|
2649 |
+
},
|
2650 |
+
{
|
2651 |
+
"epoch": 3.91,
|
2652 |
+
"grad_norm": 0.6969540119171143,
|
2653 |
+
"learning_rate": 7.854846118511714e-07,
|
2654 |
+
"loss": 0.0388,
|
2655 |
+
"step": 9450
|
2656 |
+
},
|
2657 |
+
{
|
2658 |
+
"epoch": 3.92,
|
2659 |
+
"grad_norm": 0.23226848244667053,
|
2660 |
+
"learning_rate": 6.993569131832797e-07,
|
2661 |
+
"loss": 0.0399,
|
2662 |
+
"step": 9475
|
2663 |
+
},
|
2664 |
+
{
|
2665 |
+
"epoch": 3.93,
|
2666 |
+
"grad_norm": 0.22804780304431915,
|
2667 |
+
"learning_rate": 6.132292145153882e-07,
|
2668 |
+
"loss": 0.0403,
|
2669 |
+
"step": 9500
|
2670 |
+
},
|
2671 |
+
{
|
2672 |
+
"epoch": 3.94,
|
2673 |
+
"grad_norm": 0.46192172169685364,
|
2674 |
+
"learning_rate": 5.271015158474966e-07,
|
2675 |
+
"loss": 0.0385,
|
2676 |
+
"step": 9525
|
2677 |
+
},
|
2678 |
+
{
|
2679 |
+
"epoch": 3.95,
|
2680 |
+
"grad_norm": 0.24761895835399628,
|
2681 |
+
"learning_rate": 4.4097381717960496e-07,
|
2682 |
+
"loss": 0.0372,
|
2683 |
+
"step": 9550
|
2684 |
+
},
|
2685 |
+
{
|
2686 |
+
"epoch": 3.96,
|
2687 |
+
"grad_norm": 0.3965108096599579,
|
2688 |
+
"learning_rate": 3.548461185117134e-07,
|
2689 |
+
"loss": 0.0389,
|
2690 |
+
"step": 9575
|
2691 |
+
},
|
2692 |
+
{
|
2693 |
+
"epoch": 3.97,
|
2694 |
+
"grad_norm": 0.24169424176216125,
|
2695 |
+
"learning_rate": 2.687184198438218e-07,
|
2696 |
+
"loss": 0.0393,
|
2697 |
+
"step": 9600
|
2698 |
+
},
|
2699 |
+
{
|
2700 |
+
"epoch": 3.98,
|
2701 |
+
"grad_norm": 0.4271424412727356,
|
2702 |
+
"learning_rate": 1.825907211759302e-07,
|
2703 |
+
"loss": 0.0385,
|
2704 |
+
"step": 9625
|
2705 |
+
},
|
2706 |
+
{
|
2707 |
+
"epoch": 3.99,
|
2708 |
+
"grad_norm": 0.3440055847167969,
|
2709 |
+
"learning_rate": 9.646302250803859e-08,
|
2710 |
+
"loss": 0.0394,
|
2711 |
+
"step": 9650
|
2712 |
+
},
|
2713 |
+
{
|
2714 |
+
"epoch": 4.0,
|
2715 |
+
"grad_norm": 0.43433114886283875,
|
2716 |
+
"learning_rate": 1.0335323840146991e-08,
|
2717 |
+
"loss": 0.036,
|
2718 |
+
"step": 9675
|
2719 |
+
}
|
2720 |
+
],
|
2721 |
+
"logging_steps": 25,
|
2722 |
+
"max_steps": 9676,
|
2723 |
+
"num_input_tokens_seen": 0,
|
2724 |
+
"num_train_epochs": 4,
|
2725 |
+
"save_steps": 500,
|
2726 |
+
"total_flos": 8.653960459352801e+17,
|
2727 |
+
"train_batch_size": 2,
|
2728 |
+
"trial_name": null,
|
2729 |
+
"trial_params": null
|
2730 |
+
}
|
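Since trainer_state.json is plain JSON, the logged metrics above can be pulled out programmatically. A minimal sketch, assuming the standard `log_history` key that `transformers`' TrainerState uses for these entries:

```python
# Recover the logged training loss curve from the checkpoint's trainer state.
import json

with open("checkpoint-9676/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "loss" in entry:  # skip any entries that carry only eval metrics
        print(entry["step"], entry["loss"])
```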
checkpoint-9676/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:471278576427115d6d17bfb25bab0702b85641a0b161bfd1e29e51fbba4b4223
size 4984
handler.py
ADDED
@@ -0,0 +1,32 @@
from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from peft import PeftModel
import json
import os


class EndpointHandler():
    def __init__(self, path=""):
        # The base model id is read from the training_params.json saved alongside the adapter.
        base_model_path = json.load(open(os.path.join(path, "training_params.json")))["model"]
        model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        model.resize_token_embeddings(len(tokenizer))
        # Attach the LoRA adapter from this repository, then fold it into the base weights.
        model = PeftModel.from_pretrained(model, path)
        model = model.merge_and_unload()
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(inputs)
        return prediction
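For a local smoke test of this handler, something like the following should work (a sketch; the working directory and the SQL-style prompt are illustrative, not part of the repository):

```python
# Instantiate the handler from a checkout containing training_params.json,
# the tokenizer files, and the adapter weights.
from handler import EndpointHandler

handler = EndpointHandler(path=".")
output = handler({
    "inputs": "[INST] Write a SQL query that counts all singers. [/INST]",
    "parameters": {"max_new_tokens": 128},
})
print(output)
```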
requirements.txt
ADDED
@@ -0,0 +1,2 @@
peft==0.9.0
transformers==4.38.2
runs/Mar21_02-37-23_r-nicolof88-mistral7b-spider-at-188n2lyh-a2f2a-vnlru/events.out.tfevents.1710988652.r-nicolof88-mistral7b-spider-at-188n2lyh-a2f2a-vnlru.98.0
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a68f799653a345af0889cf7ca9e4f06849262070d3ead3f614d9ca0904a4d1b1
+size 87075
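This is the TensorBoard event file written during training (`"log": "tensorboard"` in training_params.json below). After cloning the repository, the curves can be inspected with the standard CLI:

```
tensorboard --logdir runs/
```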
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "</s>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 2048,
  "pad_token": "</s>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:471278576427115d6d17bfb25bab0702b85641a0b161bfd1e29e51fbba4b4223
size 4984
training_params.json
ADDED
@@ -0,0 +1,47 @@
{
  "model": "mistralai/Mistral-7B-Instruct-v0.2",
  "project_name": "mistral7b-spider-ft",
  "data_path": "mistral7b-spider-ft/autotrain-data",
  "train_split": "train",
  "valid_split": null,
  "add_eos_token": true,
  "block_size": 1024,
  "model_max_length": 2048,
  "padding": "right",
  "trainer": "sft",
  "use_flash_attention_2": false,
  "log": "tensorboard",
  "disable_gradient_checkpointing": false,
  "logging_steps": -1,
  "evaluation_strategy": "epoch",
  "save_total_limit": 1,
  "save_strategy": "epoch",
  "auto_find_batch_size": false,
  "mixed_precision": "fp16",
  "lr": 3e-05,
  "epochs": 4,
  "batch_size": 2,
  "warmup_ratio": 0.1,
  "gradient_accumulation": 1,
  "optimizer": "adamw_torch",
  "scheduler": "linear",
  "weight_decay": 0.0,
  "max_grad_norm": 1.0,
  "seed": 42,
  "chat_template": "none",
  "quantization": "int4",
  "target_modules": "all-linear",
  "merge_adapter": false,
  "peft": true,
  "lora_r": 64,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "model_ref": null,
  "dpo_beta": 0.1,
  "prompt_text_column": "autotrain_prompt",
  "text_column": "autotrain_text",
  "rejected_text_column": "autotrain_rejected_text",
  "push_to_hub": true,
  "repo_id": "nicolof88/mistral7b-spider-ft",
  "username": "nicolof88"
}
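Given these parameters, the repository holds a LoRA adapter (`"peft": true`, `"merge_adapter": false`) on top of `mistralai/Mistral-7B-Instruct-v0.2` rather than full model weights. A sketch of loading it explicitly with `peft`, as an alternative to the merge-and-unload path used in handler.py (fp16 inference here is an assumption; training used int4 quantization):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model named in training_params.json, then attach the adapter
# published under the "repo_id" above.
base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("nicolof88/mistral7b-spider-ft")
model = PeftModel.from_pretrained(base, "nicolof88/mistral7b-spider-ft")
model.eval()
```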