nntoan209 committed on May 29

Commit

989fa49

•

1 Parent(s): df21bfa

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

.gitattributes +1 -0
1_Pooling/config.json +10 -0
README.md +57 -0
colbert_linear.pt +3 -0
config.json +28 -0
config_sentence_transformers.json +9 -0
latest +1 -0
model.safetensors +3 -0
modules.json +20 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
sentence_bert_config.json +4 -0
sentencepiece.bpe.model +3 -0
sparse_linear.pt +3 -0
special_tokens_map.json +51 -0
tokenizer.json +3 -0
tokenizer_config.json +55 -0
trainer_state.json +3192 -0
training_args.bin +3 -0
zero_to_fp32.py +592 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+---
+# {MODEL_NAME}
+This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+<!--- Describe your model here -->
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('{MODEL_NAME}')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Evaluation Results
+<!--- Describe how your model was evaluated -->
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+## Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+## Citing & Authors
+<!--- Describe where people can find more information -->

colbert_linear.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07fb4c11dbf7a594afafb841e4c0f2eabd9a0a44679b5d64407c038798e5cee7
+size 2100674

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "saved_models/bgem3_sft_20240528/tmp-checkpoint-4532",
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.5.1",
+    "transformers": "4.38.2",
+    "pytorch": "2.1.0+cu121"
+  },
+  "prompts": {},
+  "default_prompt_name": null
+}

latest ADDED Viewed

	@@ -0,0 +1 @@


+global_step4532

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67be58be92274efa45c9e04c701e70f27270a8f02515bf18805994a96963f7f2
+size 2271064456

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd17d388b5d19dbff8498fb5d5b9492821580b117eff968d193690cb88c3941b
+size 14512

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b22f2ba964ee6780fa354c19ec2585c8483517d67068144dd1ac1c860817236
+size 14512

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051

sparse_linear.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3edb1e33b5841b67152515c0d2bbdaec52e10f1c61faa4139978e0df8fa04d37
+size 3516

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69564b696052886ed0ac63fa393e928384e0f8caada38c1f4864a9bfbf379c15
+size 17098273

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3192 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9181523500810372,
+  "eval_steps": 500,
+  "global_step": 4532,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.73312117404433,
+      "learning_rate": 4.4048582995951427e-07,
+      "loss": 0.5001,
+      "step": 10
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 10.320516533751473,
+      "learning_rate": 7.076923076923077e-07,
+      "loss": 0.4459,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 10.604032764201449,
+      "learning_rate": 9.748987854251014e-07,
+      "loss": 0.5337,
+      "step": 30
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 9.501814819458504,
+      "learning_rate": 1.2421052631578948e-06,
+      "loss": 0.4867,
+      "step": 40
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.2055395221098815,
+      "learning_rate": 1.5093117408906883e-06,
+      "loss": 0.4851,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.16903562483095,
+      "learning_rate": 1.776518218623482e-06,
+      "loss": 0.4368,
+      "step": 60
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.746924199634763,
+      "learning_rate": 2.0437246963562754e-06,
+      "loss": 0.4289,
+      "step": 70
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 5.827693974405843,
+      "learning_rate": 2.3109311740890693e-06,
+      "loss": 0.3924,
+      "step": 80
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 7.020982629413032,
+      "learning_rate": 2.5781376518218628e-06,
+      "loss": 0.4108,
+      "step": 90
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 6.01679137180817,
+      "learning_rate": 2.8453441295546562e-06,
+      "loss": 0.4146,
+      "step": 100
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 5.448653259468705,
+      "learning_rate": 3.11255060728745e-06,
+      "loss": 0.3843,
+      "step": 110
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 4.431377216444548,
+      "learning_rate": 3.379757085020243e-06,
+      "loss": 0.3752,
+      "step": 120
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 4.706847685087678,
+      "learning_rate": 3.646963562753037e-06,
+      "loss": 0.373,
+      "step": 130
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 4.800217647386447,
+      "learning_rate": 3.9141700404858305e-06,
+      "loss": 0.3189,
+      "step": 140
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 4.55540918616468,
+      "learning_rate": 4.1813765182186235e-06,
+      "loss": 0.3337,
+      "step": 150
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 5.728866550792321,
+      "learning_rate": 4.448582995951417e-06,
+      "loss": 0.2853,
+      "step": 160
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 4.866232005178406,
+      "learning_rate": 4.71578947368421e-06,
+      "loss": 0.335,
+      "step": 170
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 5.153227503990654,
+      "learning_rate": 4.982995951417004e-06,
+      "loss": 0.3563,
+      "step": 180
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 4.833149588589071,
+      "learning_rate": 5.250202429149799e-06,
+      "loss": 0.3363,
+      "step": 190
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 5.057205257076979,
+      "learning_rate": 5.517408906882591e-06,
+      "loss": 0.3718,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 5.623057559852214,
+      "learning_rate": 5.784615384615385e-06,
+      "loss": 0.3425,
+      "step": 210
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 5.285732195613963,
+      "learning_rate": 6.05182186234818e-06,
+      "loss": 0.3323,
+      "step": 220
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 5.287687993596724,
+      "learning_rate": 6.319028340080971e-06,
+      "loss": 0.3509,
+      "step": 230
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 5.422212177972575,
+      "learning_rate": 6.586234817813766e-06,
+      "loss": 0.3967,
+      "step": 240
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 4.898901033752465,
+      "learning_rate": 6.853441295546559e-06,
+      "loss": 0.304,
+      "step": 250
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 5.807565871352443,
+      "learning_rate": 7.120647773279354e-06,
+      "loss": 0.3138,
+      "step": 260
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 4.706658034348249,
+      "learning_rate": 7.387854251012147e-06,
+      "loss": 0.2711,
+      "step": 270
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 4.330996445669558,
+      "learning_rate": 7.65506072874494e-06,
+      "loss": 0.3013,
+      "step": 280
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 5.040245023856746,
+      "learning_rate": 7.922267206477734e-06,
+      "loss": 0.2845,
+      "step": 290
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 5.377102462850796,
+      "learning_rate": 8.189473684210527e-06,
+      "loss": 0.2274,
+      "step": 300
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 5.49394013615042,
+      "learning_rate": 8.45668016194332e-06,
+      "loss": 0.3002,
+      "step": 310
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 7.592214672466295,
+      "learning_rate": 8.723886639676115e-06,
+      "loss": 0.2683,
+      "step": 320
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 5.640000073588486,
+      "learning_rate": 8.991093117408907e-06,
+      "loss": 0.2753,
+      "step": 330
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 4.527692569068709,
+      "learning_rate": 9.258299595141701e-06,
+      "loss": 0.2799,
+      "step": 340
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 4.146002735657809,
+      "learning_rate": 9.525506072874495e-06,
+      "loss": 0.2735,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 7.711163580304015,
+      "learning_rate": 9.792712550607289e-06,
+      "loss": 0.2735,
+      "step": 360
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 6.7114456570561005,
+      "learning_rate": 1.0059919028340081e-05,
+      "loss": 0.299,
+      "step": 370
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 9.166917647191482,
+      "learning_rate": 1.0327125506072877e-05,
+      "loss": 0.2633,
+      "step": 380
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 4.804784956773296,
+      "learning_rate": 1.0594331983805667e-05,
+      "loss": 0.3049,
+      "step": 390
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 5.407897698128889,
+      "learning_rate": 1.0861538461538461e-05,
+      "loss": 0.2665,
+      "step": 400
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 9.363037208047256,
+      "learning_rate": 1.1128744939271257e-05,
+      "loss": 0.2891,
+      "step": 410
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 7.40785296512954,
+      "learning_rate": 1.1395951417004049e-05,
+      "loss": 0.305,
+      "step": 420
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 4.832043881349699,
+      "learning_rate": 1.1663157894736843e-05,
+      "loss": 0.2275,
+      "step": 430
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 5.819742592169462,
+      "learning_rate": 1.1930364372469638e-05,
+      "loss": 0.2919,
+      "step": 440
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 4.641758430345508,
+      "learning_rate": 1.2197570850202429e-05,
+      "loss": 0.2943,
+      "step": 450
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 6.619813436316814,
+      "learning_rate": 1.2464777327935223e-05,
+      "loss": 0.2852,
+      "step": 460
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 4.1511414761990375,
+      "learning_rate": 1.2731983805668018e-05,
+      "loss": 0.2886,
+      "step": 470
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 13.864446251944102,
+      "learning_rate": 1.299919028340081e-05,
+      "loss": 0.2418,
+      "step": 480
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 4.2334846039024985,
+      "learning_rate": 1.3266396761133604e-05,
+      "loss": 0.3023,
+      "step": 490
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 8.049684733976,
+      "learning_rate": 1.3533603238866397e-05,
+      "loss": 0.2475,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 10.458696782052067,
+      "learning_rate": 1.3800809716599192e-05,
+      "loss": 0.25,
+      "step": 510
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 4.63729749376232,
+      "learning_rate": 1.4068016194331984e-05,
+      "loss": 0.2871,
+      "step": 520
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 7.406867021700095,
+      "learning_rate": 1.4335222672064776e-05,
+      "loss": 0.2932,
+      "step": 530
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 5.89092417799605,
+      "learning_rate": 1.4602429149797572e-05,
+      "loss": 0.2841,
+      "step": 540
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 4.784763615906999,
+      "learning_rate": 1.4869635627530366e-05,
+      "loss": 0.2583,
+      "step": 550
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 5.425616026327031,
+      "learning_rate": 1.5136842105263158e-05,
+      "loss": 0.2633,
+      "step": 560
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 3.972773037284401,
+      "learning_rate": 1.5404048582995954e-05,
+      "loss": 0.2528,
+      "step": 570
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 4.221584018079482,
+      "learning_rate": 1.5671255060728746e-05,
+      "loss": 0.2633,
+      "step": 580
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 7.132176048070046,
+      "learning_rate": 1.5938461538461538e-05,
+      "loss": 0.2136,
+      "step": 590
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 5.736662281189121,
+      "learning_rate": 1.6205668016194334e-05,
+      "loss": 0.2639,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 4.001622347847798,
+      "learning_rate": 1.6472874493927126e-05,
+      "loss": 0.315,
+      "step": 610
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 4.526524467163638,
+      "learning_rate": 1.674008097165992e-05,
+      "loss": 0.2424,
+      "step": 620
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 9.626596293663294,
+      "learning_rate": 1.7007287449392717e-05,
+      "loss": 0.2417,
+      "step": 630
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 4.2292844007418555,
+      "learning_rate": 1.727449392712551e-05,
+      "loss": 0.2534,
+      "step": 640
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 4.808396608849778,
+      "learning_rate": 1.75417004048583e-05,
+      "loss": 0.2501,
+      "step": 650
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 3.79257297513555,
+      "learning_rate": 1.7808906882591094e-05,
+      "loss": 0.2583,
+      "step": 660
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.8750772880448703,
+      "learning_rate": 1.807611336032389e-05,
+      "loss": 0.2296,
+      "step": 670
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 4.439196839762044,
+      "learning_rate": 1.834331983805668e-05,
+      "loss": 0.2439,
+      "step": 680
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 4.3898290815513965,
+      "learning_rate": 1.8610526315789473e-05,
+      "loss": 0.2899,
+      "step": 690
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 4.42001400664859,
+      "learning_rate": 1.887773279352227e-05,
+      "loss": 0.2616,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 5.324629277322316,
+      "learning_rate": 1.9144939271255065e-05,
+      "loss": 0.2547,
+      "step": 710
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 5.181807370455961,
+      "learning_rate": 1.9412145748987857e-05,
+      "loss": 0.2746,
+      "step": 720
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 6.195963917912233,
+      "learning_rate": 1.967935222672065e-05,
+      "loss": 0.2269,
+      "step": 730
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 21.9010566073324,
+      "learning_rate": 1.994655870445344e-05,
+      "loss": 0.2214,
+      "step": 740
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 8.220773890093973,
+      "learning_rate": 1.9999979802007072e-05,
+      "loss": 0.3237,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 7.431578258197571,
+      "learning_rate": 1.9999909981889357e-05,
+      "loss": 0.2585,
+      "step": 760
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 5.277053700798239,
+      "learning_rate": 1.999979029063708e-05,
+      "loss": 0.2865,
+      "step": 770
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 4.434866755224634,
+      "learning_rate": 1.9999620728847215e-05,
+      "loss": 0.2355,
+      "step": 780
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 5.028960471152081,
+      "learning_rate": 1.9999401297365485e-05,
+      "loss": 0.2896,
+      "step": 790
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 4.533763530364985,
+      "learning_rate": 1.999913199728633e-05,
+      "loss": 0.3033,
+      "step": 800
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.554515792556175,
+      "learning_rate": 1.9998812829952933e-05,
+      "loss": 0.2617,
+      "step": 810
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 4.026912751246585,
+      "learning_rate": 1.999844379695719e-05,
+      "loss": 0.2924,
+      "step": 820
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 8.551197472310484,
+      "learning_rate": 1.999802490013971e-05,
+      "loss": 0.2658,
+      "step": 830
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 4.390661707823087,
+      "learning_rate": 1.9997556141589807e-05,
+      "loss": 0.2386,
+      "step": 840
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 3.697035277012858,
+      "learning_rate": 1.9997037523645485e-05,
+      "loss": 0.2787,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 4.624963502482154,
+      "learning_rate": 1.9996469048893438e-05,
+      "loss": 0.2885,
+      "step": 860
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 4.519073120214446,
+      "learning_rate": 1.999585072016902e-05,
+      "loss": 0.2184,
+      "step": 870
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 4.45710292962464,
+      "learning_rate": 1.9995182540556242e-05,
+      "loss": 0.2465,
+      "step": 880
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 4.907767674612729,
+      "learning_rate": 1.9994464513387758e-05,
+      "loss": 0.2579,
+      "step": 890
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 4.569838800905319,
+      "learning_rate": 1.999369664224484e-05,
+      "loss": 0.3058,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 10.180778298594696,
+      "learning_rate": 1.9992878930957364e-05,
+      "loss": 0.2722,
+      "step": 910
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 3.9044365907967737,
+      "learning_rate": 1.9992011383603794e-05,
+      "loss": 0.2905,
+      "step": 920
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 4.204036344405577,
+      "learning_rate": 1.999109400451116e-05,
+      "loss": 0.2597,
+      "step": 930
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 4.483086909407923,
+      "learning_rate": 1.9990126798255032e-05,
+      "loss": 0.2527,
+      "step": 940
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 5.650984362708689,
+      "learning_rate": 1.9989109769659506e-05,
+      "loss": 0.2924,
+      "step": 950
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 4.289005355593254,
+      "learning_rate": 1.9988042923797176e-05,
+      "loss": 0.2248,
+      "step": 960
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 5.879170337714607,
+      "learning_rate": 1.9986926265989092e-05,
+      "loss": 0.2313,
+      "step": 970
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.7369398894003716,
+      "learning_rate": 1.9985759801804768e-05,
+      "loss": 0.2655,
+      "step": 980
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 3.9309313039975713,
+      "learning_rate": 1.998454353706213e-05,
+      "loss": 0.2413,
+      "step": 990
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 3.853949683327624,
+      "learning_rate": 1.998327747782748e-05,
+      "loss": 0.2626,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 4.233603769710417,
+      "learning_rate": 1.9981961630415495e-05,
+      "loss": 0.2813,
+      "step": 1010
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 3.4712931614995775,
+      "learning_rate": 1.9980596001389173e-05,
+      "loss": 0.2804,
+      "step": 1020
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 3.5161721036970564,
+      "learning_rate": 1.9979180597559795e-05,
+      "loss": 0.2498,
+      "step": 1030
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 3.914986811551112,
+      "learning_rate": 1.997771542598691e-05,
+      "loss": 0.2638,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 3.2174038340462405,
+      "learning_rate": 1.9976200493978302e-05,
+      "loss": 0.2188,
+      "step": 1050
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 5.034094723003907,
+      "learning_rate": 1.9974635809089923e-05,
+      "loss": 0.2574,
+      "step": 1060
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 3.9061787102869756,
+      "learning_rate": 1.9973021379125887e-05,
+      "loss": 0.2823,
+      "step": 1070
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 3.749214471559017,
+      "learning_rate": 1.9971357212138418e-05,
+      "loss": 0.2158,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 4.438077647261819,
+      "learning_rate": 1.9969643316427806e-05,
+      "loss": 0.2373,
+      "step": 1090
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 3.7946665428127644,
+      "learning_rate": 1.9967879700542382e-05,
+      "loss": 0.266,
+      "step": 1100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 4.020417555953455,
+      "learning_rate": 1.996606637327846e-05,
+      "loss": 0.2692,
+      "step": 1110
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 5.291843241151047,
+      "learning_rate": 1.9964203343680284e-05,
+      "loss": 0.2477,
+      "step": 1120
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 3.3437519964764775,
+      "learning_rate": 1.996229062104001e-05,
+      "loss": 0.2507,
+      "step": 1130
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 3.3879034836678033,
+      "learning_rate": 1.996032821489765e-05,
+      "loss": 0.2476,
+      "step": 1140
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 3.7050532861499375,
+      "learning_rate": 1.9958316135041e-05,
+      "loss": 0.2224,
+      "step": 1150
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 3.8739888502442037,
+      "learning_rate": 1.995625439150564e-05,
+      "loss": 0.2419,
+      "step": 1160
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 10.982270952246383,
+      "learning_rate": 1.9954142994574825e-05,
+      "loss": 0.2242,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 9.832505746272833,
+      "learning_rate": 1.9951981954779488e-05,
+      "loss": 0.2591,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 4.369158741790222,
+      "learning_rate": 1.9949771282898153e-05,
+      "loss": 0.2374,
+      "step": 1190
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 5.630104122791308,
+      "learning_rate": 1.994751098995689e-05,
+      "loss": 0.2442,
+      "step": 1200
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.220933137387155,
+      "learning_rate": 1.9945201087229272e-05,
+      "loss": 0.2431,
+      "step": 1210
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 5.346564014873171,
+      "learning_rate": 1.9942841586236297e-05,
+      "loss": 0.2787,
+      "step": 1220
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 3.884615013459559,
+      "learning_rate": 1.9940432498746342e-05,
+      "loss": 0.1948,
+      "step": 1230
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.199936121500256,
+      "learning_rate": 1.993797383677512e-05,
+      "loss": 0.2605,
+      "step": 1240
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 3.7958971813092854,
+      "learning_rate": 1.9935465612585588e-05,
+      "loss": 0.2077,
+      "step": 1250
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.513168141785257,
+      "learning_rate": 1.993290783868791e-05,
+      "loss": 0.2584,
+      "step": 1260
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.7824378439198805,
+      "learning_rate": 1.993030052783938e-05,
+      "loss": 0.3022,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.207953986768479,
+      "learning_rate": 1.992764369304438e-05,
+      "loss": 0.2308,
+      "step": 1280
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 17.62536360994611,
+      "learning_rate": 1.9924937347554282e-05,
+      "loss": 0.2761,
+      "step": 1290
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.270009899619283,
+      "learning_rate": 1.9922181504867414e-05,
+      "loss": 0.2427,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 4.922976222292152,
+      "learning_rate": 1.9919376178728975e-05,
+      "loss": 0.2306,
+      "step": 1310
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 4.215458242682613,
+      "learning_rate": 1.9916521383130965e-05,
+      "loss": 0.2339,
+      "step": 1320
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.559875114528341,
+      "learning_rate": 1.9913617132312132e-05,
+      "loss": 0.1979,
+      "step": 1330
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.911384564892276,
+      "learning_rate": 1.9910663440757878e-05,
+      "loss": 0.2474,
+      "step": 1340
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 3.5797528557741036,
+      "learning_rate": 1.9907660323200207e-05,
+      "loss": 0.223,
+      "step": 1350
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 7.031043833783013,
+      "learning_rate": 1.9904607794617635e-05,
+      "loss": 0.2141,
+      "step": 1360
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.721340014798071,
+      "learning_rate": 1.9901505870235137e-05,
+      "loss": 0.2039,
+      "step": 1370
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.889394419978848,
+      "learning_rate": 1.989835456552404e-05,
+      "loss": 0.3015,
+      "step": 1380
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 4.368266346017717,
+      "learning_rate": 1.9895153896201977e-05,
+      "loss": 0.2278,
+      "step": 1390
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 4.736620682008376,
+      "learning_rate": 1.9891903878232782e-05,
+      "loss": 0.246,
+      "step": 1400
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 3.8257038790244744,
+      "learning_rate": 1.9888604527826435e-05,
+      "loss": 0.2758,
+      "step": 1410
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 3.1454697624115093,
+      "learning_rate": 1.9885255861438966e-05,
+      "loss": 0.2253,
+      "step": 1420
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 4.241144266291823,
+      "learning_rate": 1.988185789577237e-05,
+      "loss": 0.2545,
+      "step": 1430
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 4.303491443553284,
+      "learning_rate": 1.987841064777454e-05,
+      "loss": 0.2547,
+      "step": 1440
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 7.233119649704639,
+      "learning_rate": 1.9874914134639163e-05,
+      "loss": 0.2697,
+      "step": 1450
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 3.1627622593633133,
+      "learning_rate": 1.987136837380565e-05,
+      "loss": 0.2072,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 4.803567895467617,
+      "learning_rate": 1.986777338295904e-05,
+      "loss": 0.2504,
+      "step": 1470
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 4.645148028590643,
+      "learning_rate": 1.9864129180029915e-05,
+      "loss": 0.2957,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 4.372735663683347,
+      "learning_rate": 1.9860435783194306e-05,
+      "loss": 0.2342,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 4.244572708317798,
+      "learning_rate": 1.9856693210873616e-05,
+      "loss": 0.1971,
+      "step": 1500
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.150358082839134,
+      "learning_rate": 1.9852901481734505e-05,
+      "loss": 0.2601,
+      "step": 1510
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.8063005165293173,
+      "learning_rate": 1.9849060614688825e-05,
+      "loss": 0.2419,
+      "step": 1520
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 7.0543295261475105,
+      "learning_rate": 1.98451706288935e-05,
+      "loss": 0.2637,
+      "step": 1530
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 3.795035839087491,
+      "learning_rate": 1.9841231543750446e-05,
+      "loss": 0.2632,
+      "step": 1540
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 4.609631133641002,
+      "learning_rate": 1.983724337890647e-05,
+      "loss": 0.234,
+      "step": 1550
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 4.678952778756691,
+      "learning_rate": 1.9833206154253165e-05,
+      "loss": 0.2096,
+      "step": 1560
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 3.695528116851361,
+      "learning_rate": 1.9829119889926836e-05,
+      "loss": 0.2585,
+      "step": 1570
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 3.787900010945368,
+      "learning_rate": 1.9824984606308356e-05,
+      "loss": 0.2201,
+      "step": 1580
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 4.851549133184082,
+      "learning_rate": 1.982080032402311e-05,
+      "loss": 0.2625,
+      "step": 1590
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 4.285022938688077,
+      "learning_rate": 1.9816567063940856e-05,
+      "loss": 0.1898,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 4.576105603313343,
+      "learning_rate": 1.981228484717565e-05,
+      "loss": 0.3178,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 2.614430082371516,
+      "learning_rate": 1.980795369508572e-05,
+      "loss": 0.2022,
+      "step": 1620
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 4.348898710957631,
+      "learning_rate": 1.9803573629273364e-05,
+      "loss": 0.2426,
+      "step": 1630
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 3.7753032234127812,
+      "learning_rate": 1.9799144671584853e-05,
+      "loss": 0.2547,
+      "step": 1640
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 5.096970178030036,
+      "learning_rate": 1.9794666844110303e-05,
+      "loss": 0.209,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.8017117810837084,
+      "learning_rate": 1.979014016918359e-05,
+      "loss": 0.2149,
+      "step": 1660
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 4.186597128936523,
+      "learning_rate": 1.978556466938221e-05,
+      "loss": 0.2464,
+      "step": 1670
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 3.696513394158522,
+      "learning_rate": 1.978094036752719e-05,
+      "loss": 0.2568,
+      "step": 1680
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 4.675325784194562,
+      "learning_rate": 1.9776267286682965e-05,
+      "loss": 0.2323,
+      "step": 1690
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 5.427689218899514,
+      "learning_rate": 1.9771545450157254e-05,
+      "loss": 0.2631,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 3.1730771347081395,
+      "learning_rate": 1.9766774881500958e-05,
+      "loss": 0.216,
+      "step": 1710
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 3.581631337579299,
+      "learning_rate": 1.9761955604508043e-05,
+      "loss": 0.2327,
+      "step": 1720
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 4.216013154865277,
+      "learning_rate": 1.975708764321541e-05,
+      "loss": 0.2737,
+      "step": 1730
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 3.616181664241495,
+      "learning_rate": 1.975217102190278e-05,
+      "loss": 0.2531,
+      "step": 1740
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 4.944731426602014,
+      "learning_rate": 1.974720576509257e-05,
+      "loss": 0.2329,
+      "step": 1750
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.4547474359483075,
+      "learning_rate": 1.9742191897549783e-05,
+      "loss": 0.2082,
+      "step": 1760
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 16.606773079360572,
+      "learning_rate": 1.973712944428187e-05,
+      "loss": 0.2476,
+      "step": 1770
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.224499798212032,
+      "learning_rate": 1.9732018430538613e-05,
+      "loss": 0.2574,
+      "step": 1780
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 3.5823954456349805,
+      "learning_rate": 1.9726858881811992e-05,
+      "loss": 0.2242,
+      "step": 1790
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 4.249584626487707,
+      "learning_rate": 1.9721650823836074e-05,
+      "loss": 0.1894,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.805651655743975,
+      "learning_rate": 1.971639428258686e-05,
+      "loss": 0.2224,
+      "step": 1810
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 6.825631461083238,
+      "learning_rate": 1.971108928428218e-05,
+      "loss": 0.206,
+      "step": 1820
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 3.668175314712389,
+      "learning_rate": 1.9705735855381544e-05,
+      "loss": 0.228,
+      "step": 1830
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.8084463789135126,
+      "learning_rate": 1.9700334022586016e-05,
+      "loss": 0.2313,
+      "step": 1840
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 3.566250082150367,
+      "learning_rate": 1.9694883812838095e-05,
+      "loss": 0.2291,
+      "step": 1850
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 3.9864026464050437,
+      "learning_rate": 1.9689385253321548e-05,
+      "loss": 0.2312,
+      "step": 1860
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 4.0540053541677885,
+      "learning_rate": 1.9683838371461315e-05,
+      "loss": 0.2687,
+      "step": 1870
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 3.6121079968078234,
+      "learning_rate": 1.9678243194923333e-05,
+      "loss": 0.2403,
+      "step": 1880
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 4.062791315050937,
+      "learning_rate": 1.9672599751614427e-05,
+      "loss": 0.2225,
+      "step": 1890
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 11.916356720706089,
+      "learning_rate": 1.966690806968216e-05,
+      "loss": 0.2517,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 9.070397034086051,
+      "learning_rate": 1.9661168177514683e-05,
+      "loss": 0.2386,
+      "step": 1910
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 4.223253798926425,
+      "learning_rate": 1.9655380103740618e-05,
+      "loss": 0.2409,
+      "step": 1920
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 3.338130757023872,
+      "learning_rate": 1.9649543877228886e-05,
+      "loss": 0.2368,
+      "step": 1930
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 3.545188334725578,
+      "learning_rate": 1.9643659527088587e-05,
+      "loss": 0.1738,
+      "step": 1940
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.408623244835359,
+      "learning_rate": 1.963772708266884e-05,
+      "loss": 0.198,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.900722651898139,
+      "learning_rate": 1.9631746573558646e-05,
+      "loss": 0.1959,
+      "step": 1960
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.692501620096666,
+      "learning_rate": 1.9625718029586732e-05,
+      "loss": 0.2356,
+      "step": 1970
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.655015133723331,
+      "learning_rate": 1.9619641480821407e-05,
+      "loss": 0.242,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 13.072999667731198,
+      "learning_rate": 1.9613516957570416e-05,
+      "loss": 0.2206,
+      "step": 1990
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 3.6663532701131696,
+      "learning_rate": 1.9607344490380778e-05,
+      "loss": 0.2006,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.9175793161114592,
+      "learning_rate": 1.9601124110038647e-05,
+      "loss": 0.2138,
+      "step": 2010
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.244205112045336,
+      "learning_rate": 1.9594855847569144e-05,
+      "loss": 0.2025,
+      "step": 2020
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.896116682683314,
+      "learning_rate": 1.9588539734236213e-05,
+      "loss": 0.1844,
+      "step": 2030
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 2.0057399160737157,
+      "learning_rate": 1.958217580154246e-05,
+      "loss": 0.1576,
+      "step": 2040
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.1453450088649535,
+      "learning_rate": 1.9575764081229004e-05,
+      "loss": 0.2168,
+      "step": 2050
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.030575801761766,
+      "learning_rate": 1.9569304605275304e-05,
+      "loss": 0.1728,
+      "step": 2060
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.5496294537760775,
+      "learning_rate": 1.9562797405899012e-05,
+      "loss": 0.2134,
+      "step": 2070
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.242162724268332,
+      "learning_rate": 1.955624251555581e-05,
+      "loss": 0.201,
+      "step": 2080
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.9933952709556402,
+      "learning_rate": 1.954963996693924e-05,
+      "loss": 0.2179,
+      "step": 2090
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.240735158513416,
+      "learning_rate": 1.954298979298055e-05,
+      "loss": 0.2494,
+      "step": 2100
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.397210518680832,
+      "learning_rate": 1.953629202684853e-05,
+      "loss": 0.2077,
+      "step": 2110
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 4.51567146250967,
+      "learning_rate": 1.9529546701949338e-05,
+      "loss": 0.1859,
+      "step": 2120
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.6048501115252622,
+      "learning_rate": 1.952275385192635e-05,
+      "loss": 0.1858,
+      "step": 2130
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 3.266632654700684,
+      "learning_rate": 1.951591351065996e-05,
+      "loss": 0.2065,
+      "step": 2140
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.9963705041975857,
+      "learning_rate": 1.950902571226745e-05,
+      "loss": 0.2395,
+      "step": 2150
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.47269279368607,
+      "learning_rate": 1.9502090491102805e-05,
+      "loss": 0.2234,
+      "step": 2160
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.4172687657411283,
+      "learning_rate": 1.949510788175652e-05,
+      "loss": 0.2054,
+      "step": 2170
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.24131454859904,
+      "learning_rate": 1.948807791905546e-05,
+      "loss": 0.2036,
+      "step": 2180
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.7824059130997916,
+      "learning_rate": 1.9481000638062667e-05,
+      "loss": 0.211,
+      "step": 2190
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.5084669738687966,
+      "learning_rate": 1.9473876074077193e-05,
+      "loss": 0.1987,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.374685869021834,
+      "learning_rate": 1.946670426263392e-05,
+      "loss": 0.18,
+      "step": 2210
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.1982980430417665,
+      "learning_rate": 1.9459485239503385e-05,
+      "loss": 0.1883,
+      "step": 2220
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2072115394074947,
+      "learning_rate": 1.9452219040691604e-05,
+      "loss": 0.1962,
+      "step": 2230
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.616971043987373,
+      "learning_rate": 1.9444905702439874e-05,
+      "loss": 0.2126,
+      "step": 2240
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.1674222240111525,
+      "learning_rate": 1.943754526122463e-05,
+      "loss": 0.1622,
+      "step": 2250
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.201552458849196,
+      "learning_rate": 1.9430137753757222e-05,
+      "loss": 0.2293,
+      "step": 2260
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.5879310691120314,
+      "learning_rate": 1.9422683216983766e-05,
+      "loss": 0.2008,
+      "step": 2270
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.113474838782104,
+      "learning_rate": 1.9415181688084922e-05,
+      "loss": 0.1622,
+      "step": 2280
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 2.05705061518402,
+      "learning_rate": 1.9407633204475756e-05,
+      "loss": 0.1674,
+      "step": 2290
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 3.3556666998600018,
+      "learning_rate": 1.940003780380551e-05,
+      "loss": 0.1765,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.332083901173591,
+      "learning_rate": 1.9392395523957438e-05,
+      "loss": 0.1656,
+      "step": 2310
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.3736488112606757,
+      "learning_rate": 1.9384706403048618e-05,
+      "loss": 0.2206,
+      "step": 2320
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.6382812391557318,
+      "learning_rate": 1.937697047942974e-05,
+      "loss": 0.1645,
+      "step": 2330
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.6013892863578194,
+      "learning_rate": 1.9369187791684943e-05,
+      "loss": 0.1705,
+      "step": 2340
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.1145392602796855,
+      "learning_rate": 1.9361358378631604e-05,
+      "loss": 0.206,
+      "step": 2350
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.200460676164746,
+      "learning_rate": 1.9353482279320154e-05,
+      "loss": 0.2172,
+      "step": 2360
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.3806064086412455,
+      "learning_rate": 1.9345559533033867e-05,
+      "loss": 0.1837,
+      "step": 2370
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.53378489217648,
+      "learning_rate": 1.9337590179288694e-05,
+      "loss": 0.1962,
+      "step": 2380
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 13.445648419685131,
+      "learning_rate": 1.9329574257833035e-05,
+      "loss": 0.1332,
+      "step": 2390
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.7839934654329992,
+      "learning_rate": 1.932151180864756e-05,
+      "loss": 0.1713,
+      "step": 2400
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.9367486050463985,
+      "learning_rate": 1.9313402871945e-05,
+      "loss": 0.207,
+      "step": 2410
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 3.9679375965918084,
+      "learning_rate": 1.930524748816995e-05,
+      "loss": 0.1766,
+      "step": 2420
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 2.12675067368969,
+      "learning_rate": 1.9297045697998667e-05,
+      "loss": 0.2119,
+      "step": 2430
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 2.330245927266684,
+      "learning_rate": 1.9288797542338875e-05,
+      "loss": 0.192,
+      "step": 2440
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.2758592638707986,
+      "learning_rate": 1.9280503062329537e-05,
+      "loss": 0.2218,
+      "step": 2450
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 3.420743432917777,
+      "learning_rate": 1.9272162299340675e-05,
+      "loss": 0.1517,
+      "step": 2460
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.4543338792444171,
+      "learning_rate": 1.9263775294973168e-05,
+      "loss": 0.1854,
+      "step": 2470
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.1078454017192123,
+      "learning_rate": 1.92553420910585e-05,
+      "loss": 0.2071,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.24945739985224,
+      "learning_rate": 1.9246862729658616e-05,
+      "loss": 0.1434,
+      "step": 2490
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.313736662246618,
+      "learning_rate": 1.9238337253065655e-05,
+      "loss": 0.2095,
+      "step": 2500
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.7303675467709847,
+      "learning_rate": 1.922976570380177e-05,
+      "loss": 0.2015,
+      "step": 2510
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 3.1027147281182703,
+      "learning_rate": 1.9221148124618915e-05,
+      "loss": 0.1902,
+      "step": 2520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9571047100967691,
+      "learning_rate": 1.921248455849862e-05,
+      "loss": 0.163,
+      "step": 2530
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.258815028020344,
+      "learning_rate": 1.9203775048651776e-05,
+      "loss": 0.159,
+      "step": 2540
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 3.119730206342172,
+      "learning_rate": 1.9195019638518437e-05,
+      "loss": 0.1766,
+      "step": 2550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 4.155231092799811,
+      "learning_rate": 1.9186218371767587e-05,
+      "loss": 0.1999,
+      "step": 2560
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 3.938178594333825,
+      "learning_rate": 1.9177371292296926e-05,
+      "loss": 0.1967,
+      "step": 2570
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 2.264714656649641,
+      "learning_rate": 1.916847844423265e-05,
+      "loss": 0.1873,
+      "step": 2580
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 2.3278837142950835,
+      "learning_rate": 1.915953987192924e-05,
+      "loss": 0.198,
+      "step": 2590
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.3796897562558557,
+      "learning_rate": 1.9150555619969228e-05,
+      "loss": 0.1591,
+      "step": 2600
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.2146800627013359,
+      "learning_rate": 1.914152573316298e-05,
+      "loss": 0.1772,
+      "step": 2610
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.284220185885104,
+      "learning_rate": 1.9132450256548482e-05,
+      "loss": 0.1924,
+      "step": 2620
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.3132986803051954,
+      "learning_rate": 1.912332923539109e-05,
+      "loss": 0.1575,
+      "step": 2630
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 3.3347587557674214,
+      "learning_rate": 1.9114162715183338e-05,
+      "loss": 0.2016,
+      "step": 2640
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 4.1416219410825565,
+      "learning_rate": 1.9104950741644682e-05,
+      "loss": 0.1841,
+      "step": 2650
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9784717611438265,
+      "learning_rate": 1.9095693360721288e-05,
+      "loss": 0.1988,
+      "step": 2660
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9851485419245707,
+      "learning_rate": 1.90863906185858e-05,
+      "loss": 0.2306,
+      "step": 2670
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.7507079599778372,
+      "learning_rate": 1.90770425616371e-05,
+      "loss": 0.1812,
+      "step": 2680
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.4418225191285714,
+      "learning_rate": 1.90676492365001e-05,
+      "loss": 0.1724,
+      "step": 2690
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 6.358944590106823,
+      "learning_rate": 1.905821069002548e-05,
+      "loss": 0.2033,
+      "step": 2700
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.569265877186431,
+      "learning_rate": 1.9048726969289472e-05,
+      "loss": 0.1962,
+      "step": 2710
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.143027189864614,
+      "learning_rate": 1.9039198121593623e-05,
+      "loss": 0.2134,
+      "step": 2720
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.3661795002812593,
+      "learning_rate": 1.9029624194464562e-05,
+      "loss": 0.1594,
+      "step": 2730
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.2244765834008855,
+      "learning_rate": 1.9020005235653752e-05,
+      "loss": 0.1797,
+      "step": 2740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.48042520800243,
+      "learning_rate": 1.9010341293137265e-05,
+      "loss": 0.1992,
+      "step": 2750
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.4141715312149854,
+      "learning_rate": 1.9000632415115526e-05,
+      "loss": 0.194,
+      "step": 2760
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.1311219610420453,
+      "learning_rate": 1.8990878650013095e-05,
+      "loss": 0.2152,
+      "step": 2770
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.2386424254232806,
+      "learning_rate": 1.8981080046478408e-05,
+      "loss": 0.1678,
+      "step": 2780
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 2.1290366036640167,
+      "learning_rate": 1.8971236653383534e-05,
+      "loss": 0.1815,
+      "step": 2790
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 2.332990235126333,
+      "learning_rate": 1.896134851982395e-05,
+      "loss": 0.1601,
+      "step": 2800
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9313701752737014,
+      "learning_rate": 1.895141569511827e-05,
+      "loss": 0.1913,
+      "step": 2810
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 2.0079851484471387,
+      "learning_rate": 1.8941438228808023e-05,
+      "loss": 0.2147,
+      "step": 2820
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 3.300791940163902,
+      "learning_rate": 1.8931416170657383e-05,
+      "loss": 0.198,
+      "step": 2830
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.005476843621061,
+      "learning_rate": 1.892134957065295e-05,
+      "loss": 0.1835,
+      "step": 2840
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.6085877287350487,
+      "learning_rate": 1.8911238479003464e-05,
+      "loss": 0.1898,
+      "step": 2850
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.5393547946239683,
+      "learning_rate": 1.8901082946139585e-05,
+      "loss": 0.2094,
+      "step": 2860
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.1428525152610582,
+      "learning_rate": 1.8890883022713635e-05,
+      "loss": 0.1754,
+      "step": 2870
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.9149274467688195,
+      "learning_rate": 1.8880638759599327e-05,
+      "loss": 0.2039,
+      "step": 2880
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 3.51632507135593,
+      "learning_rate": 1.8870350207891536e-05,
+      "loss": 0.1857,
+      "step": 2890
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.4186516754116196,
+      "learning_rate": 1.8860017418906028e-05,
+      "loss": 0.1374,
+      "step": 2900
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.0258647249782333,
+      "learning_rate": 1.884964044417921e-05,
+      "loss": 0.2084,
+      "step": 2910
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.0985094320845574,
+      "learning_rate": 1.8839219335467886e-05,
+      "loss": 0.1484,
+      "step": 2920
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.19199552449932,
+      "learning_rate": 1.8828754144748958e-05,
+      "loss": 0.2029,
+      "step": 2930
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.9095904364534577,
+      "learning_rate": 1.8818244924219217e-05,
+      "loss": 0.1997,
+      "step": 2940
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.052834839544693,
+      "learning_rate": 1.8807691726295053e-05,
+      "loss": 0.1536,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.179267323237696,
+      "learning_rate": 1.8797094603612192e-05,
+      "loss": 0.2086,
+      "step": 2960
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.5098689789030355,
+      "learning_rate": 1.878645360902546e-05,
+      "loss": 0.1994,
+      "step": 2970
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 4.400137699746838,
+      "learning_rate": 1.8775768795608472e-05,
+      "loss": 0.1606,
+      "step": 2980
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.590590977646675,
+      "learning_rate": 1.8765040216653427e-05,
+      "loss": 0.1897,
+      "step": 2990
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.1321993774458954,
+      "learning_rate": 1.8754267925670796e-05,
+      "loss": 0.1531,
+      "step": 3000
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.5568642398269334,
+      "learning_rate": 1.8743451976389068e-05,
+      "loss": 0.178,
+      "step": 3010
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.9214830076333376,
+      "learning_rate": 1.8732592422754495e-05,
+      "loss": 0.1897,
+      "step": 3020
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.969751254803088,
+      "learning_rate": 1.8721689318930806e-05,
+      "loss": 0.1502,
+      "step": 3030
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 5.017433292114636,
+      "learning_rate": 1.871074271929894e-05,
+      "loss": 0.1982,
+      "step": 3040
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9369614375603101,
+      "learning_rate": 1.8699752678456788e-05,
+      "loss": 0.1719,
+      "step": 3050
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.006594277192507,
+      "learning_rate": 1.86887192512189e-05,
+      "loss": 0.1932,
+      "step": 3060
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.715514310451893,
+      "learning_rate": 1.8677642492616236e-05,
+      "loss": 0.1801,
+      "step": 3070
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.9267545952361633,
+      "learning_rate": 1.8666522457895862e-05,
+      "loss": 0.1893,
+      "step": 3080
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.1874472475503106,
+      "learning_rate": 1.86553592025207e-05,
+      "loss": 0.1767,
+      "step": 3090
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.8843605357258664,
+      "learning_rate": 1.8644152782169247e-05,
+      "loss": 0.1802,
+      "step": 3100
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.3298848304608684,
+      "learning_rate": 1.8632903252735276e-05,
+      "loss": 0.1667,
+      "step": 3110
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.0245193476149312,
+      "learning_rate": 1.862161067032759e-05,
+      "loss": 0.1834,
+      "step": 3120
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.239252595358892,
+      "learning_rate": 1.861027509126971e-05,
+      "loss": 0.2083,
+      "step": 3130
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.000691294407405,
+      "learning_rate": 1.8598896572099624e-05,
+      "loss": 0.1739,
+      "step": 3140
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.656761882061858,
+      "learning_rate": 1.8587475169569483e-05,
+      "loss": 0.1815,
+      "step": 3150
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.1245215382192346,
+      "learning_rate": 1.8576010940645325e-05,
+      "loss": 0.1606,
+      "step": 3160
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.7762853803876042,
+      "learning_rate": 1.856450394250679e-05,
+      "loss": 0.1969,
+      "step": 3170
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.8830827622510697,
+      "learning_rate": 1.855295423254685e-05,
+      "loss": 0.1555,
+      "step": 3180
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.1861197340441234,
+      "learning_rate": 1.854136186837149e-05,
+      "loss": 0.1889,
+      "step": 3190
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.3198392751530705,
+      "learning_rate": 1.8529726907799444e-05,
+      "loss": 0.1943,
+      "step": 3200
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.9214231378576725,
+      "learning_rate": 1.8518049408861915e-05,
+      "loss": 0.1831,
+      "step": 3210
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.398048593011276,
+      "learning_rate": 1.850632942980226e-05,
+      "loss": 0.2029,
+      "step": 3220
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.2143080398207573,
+      "learning_rate": 1.8494567029075714e-05,
+      "loss": 0.1718,
+      "step": 3230
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.126646974386725,
+      "learning_rate": 1.84827622653491e-05,
+      "loss": 0.1302,
+      "step": 3240
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 11.599444777256583,
+      "learning_rate": 1.847091519750053e-05,
+      "loss": 0.1809,
+      "step": 3250
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.3991674250656403,
+      "learning_rate": 1.8459025884619125e-05,
+      "loss": 0.1865,
+      "step": 3260
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 6.4503963141704155,
+      "learning_rate": 1.844709438600469e-05,
+      "loss": 0.2251,
+      "step": 3270
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.4044007961395626,
+      "learning_rate": 1.8435120761167453e-05,
+      "loss": 0.1708,
+      "step": 3280
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.5993439730175965,
+      "learning_rate": 1.8423105069827753e-05,
+      "loss": 0.1651,
+      "step": 3290
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9740878207799195,
+      "learning_rate": 1.8411047371915737e-05,
+      "loss": 0.1955,
+      "step": 3300
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.244083998109266,
+      "learning_rate": 1.839894772757106e-05,
+      "loss": 0.2106,
+      "step": 3310
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.2654343097283434,
+      "learning_rate": 1.8386806197142607e-05,
+      "loss": 0.1331,
+      "step": 3320
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.9373155321474749,
+      "learning_rate": 1.837462284118817e-05,
+      "loss": 0.143,
+      "step": 3330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.041586413720046,
+      "learning_rate": 1.8362397720474144e-05,
+      "loss": 0.1805,
+      "step": 3340
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.311869767372086,
+      "learning_rate": 1.8350130895975247e-05,
+      "loss": 0.1748,
+      "step": 3350
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.0358596142782224,
+      "learning_rate": 1.8337822428874187e-05,
+      "loss": 0.197,
+      "step": 3360
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 12.765829395166001,
+      "learning_rate": 1.8325472380561382e-05,
+      "loss": 0.2043,
+      "step": 3370
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.283832499199408,
+      "learning_rate": 1.831308081263464e-05,
+      "loss": 0.1696,
+      "step": 3380
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 2.099264143128957,
+      "learning_rate": 1.8300647786898843e-05,
+      "loss": 0.1772,
+      "step": 3390
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 3.0418348048172117,
+      "learning_rate": 1.8288173365365675e-05,
+      "loss": 0.1679,
+      "step": 3400
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 2.928400270597656,
+      "learning_rate": 1.827565761025326e-05,
+      "loss": 0.1839,
+      "step": 3410
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.9489192388525756,
+      "learning_rate": 1.82631005839859e-05,
+      "loss": 0.1702,
+      "step": 3420
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.866673546449154,
+      "learning_rate": 1.825050234919374e-05,
+      "loss": 0.1913,
+      "step": 3430
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.9782480885566223,
+      "learning_rate": 1.8237862968712442e-05,
+      "loss": 0.189,
+      "step": 3440
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.072252048803903,
+      "learning_rate": 1.8225182505582918e-05,
+      "loss": 0.1908,
+      "step": 3450
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.1333806325715523,
+      "learning_rate": 1.821246102305096e-05,
+      "loss": 0.204,
+      "step": 3460
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.0590278992877113,
+      "learning_rate": 1.8199698584566967e-05,
+      "loss": 0.1833,
+      "step": 3470
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.2552945044942474,
+      "learning_rate": 1.8186895253785603e-05,
+      "loss": 0.2076,
+      "step": 3480
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.9634969960872868,
+      "learning_rate": 1.8174051094565484e-05,
+      "loss": 0.2097,
+      "step": 3490
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.2410044456550184,
+      "learning_rate": 1.816116617096889e-05,
+      "loss": 0.2217,
+      "step": 3500
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.1336586639575237,
+      "learning_rate": 1.8148240547261387e-05,
+      "loss": 0.18,
+      "step": 3510
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.173336388150898,
+      "learning_rate": 1.813527428791156e-05,
+      "loss": 0.1756,
+      "step": 3520
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.671364540282752,
+      "learning_rate": 1.812226745759066e-05,
+      "loss": 0.1863,
+      "step": 3530
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.9057012299641733,
+      "learning_rate": 1.8109220121172306e-05,
+      "loss": 0.2206,
+      "step": 3540
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1813973982571034,
+      "learning_rate": 1.8096132343732135e-05,
+      "loss": 0.1462,
+      "step": 3550
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.8892866709987723,
+      "learning_rate": 1.808300419054749e-05,
+      "loss": 0.1803,
+      "step": 3560
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.003086926459966,
+      "learning_rate": 1.80698357270971e-05,
+      "loss": 0.2032,
+      "step": 3570
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 3.2221389262807874,
+      "learning_rate": 1.8056627019060738e-05,
+      "loss": 0.1631,
+      "step": 3580
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.2538102728217932,
+      "learning_rate": 1.8043378132318927e-05,
+      "loss": 0.1692,
+      "step": 3590
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 4.563350232475948,
+      "learning_rate": 1.8030089132952557e-05,
+      "loss": 0.1727,
+      "step": 3600
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.000374282147408,
+      "learning_rate": 1.8016760087242605e-05,
+      "loss": 0.1733,
+      "step": 3610
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.9278853800801559,
+      "learning_rate": 1.800339106166978e-05,
+      "loss": 0.1852,
+      "step": 3620
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.2241283295196514,
+      "learning_rate": 1.79899821229142e-05,
+      "loss": 0.143,
+      "step": 3630
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.4864913275556133,
+      "learning_rate": 1.7976533337855053e-05,
+      "loss": 0.1884,
+      "step": 3640
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 3.860042375322093,
+      "learning_rate": 1.7963044773570265e-05,
+      "loss": 0.1641,
+      "step": 3650
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.0454176745318677,
+      "learning_rate": 1.7949516497336176e-05,
+      "loss": 0.1864,
+      "step": 3660
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 2.46822670327846,
+      "learning_rate": 1.793594857662718e-05,
+      "loss": 0.1924,
+      "step": 3670
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.4872265651073158,
+      "learning_rate": 1.792234107911542e-05,
+      "loss": 0.1546,
+      "step": 3680
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.0379945728128663,
+      "learning_rate": 1.7908694072670426e-05,
+      "loss": 0.1711,
+      "step": 3690
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.407085620398544,
+      "learning_rate": 1.7895007625358783e-05,
+      "loss": 0.1701,
+      "step": 3700
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 4.042625051438552,
+      "learning_rate": 1.7881281805443805e-05,
+      "loss": 0.195,
+      "step": 3710
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9781223218758097,
+      "learning_rate": 1.786751668138517e-05,
+      "loss": 0.1621,
+      "step": 3720
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 4.657693319193961,
+      "learning_rate": 1.7853712321838602e-05,
+      "loss": 0.1968,
+      "step": 3730
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 17.520636897246522,
+      "learning_rate": 1.7839868795655507e-05,
+      "loss": 0.1475,
+      "step": 3740
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.1710924893238626,
+      "learning_rate": 1.782598617188265e-05,
+      "loss": 0.1686,
+      "step": 3750
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.205220842093223,
+      "learning_rate": 1.78120645197618e-05,
+      "loss": 0.1587,
+      "step": 3760
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.95114988022513,
+      "learning_rate": 1.7798103908729377e-05,
+      "loss": 0.1743,
+      "step": 3770
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.7456615210938744,
+      "learning_rate": 1.778410440841613e-05,
+      "loss": 0.1468,
+      "step": 3780
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.8973758532017895,
+      "learning_rate": 1.7770066088646767e-05,
+      "loss": 0.1883,
+      "step": 3790
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.5445827161522616,
+      "learning_rate": 1.7755989019439607e-05,
+      "loss": 0.1895,
+      "step": 3800
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.169743981267003,
+      "learning_rate": 1.774187327100625e-05,
+      "loss": 0.1716,
+      "step": 3810
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.1135486144554387,
+      "learning_rate": 1.7727718913751207e-05,
+      "loss": 0.1164,
+      "step": 3820
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 2.0715545958907535,
+      "learning_rate": 1.7713526018271558e-05,
+      "loss": 0.1717,
+      "step": 3830
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 8.108185256384777,
+      "learning_rate": 1.76992946553566e-05,
+      "loss": 0.1662,
+      "step": 3840
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 2.4906125249035713,
+      "learning_rate": 1.7685024895987494e-05,
+      "loss": 0.184,
+      "step": 3850
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 3.009141821458063,
+      "learning_rate": 1.7670716811336902e-05,
+      "loss": 0.2086,
+      "step": 3860
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.935774373100715,
+      "learning_rate": 1.7656370472768648e-05,
+      "loss": 0.1566,
+      "step": 3870
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 6.368484758089139,
+      "learning_rate": 1.7641985951837347e-05,
+      "loss": 0.2127,
+      "step": 3880
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.2821667018184533,
+      "learning_rate": 1.7627563320288056e-05,
+      "loss": 0.1822,
+      "step": 3890
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.7481761010849746,
+      "learning_rate": 1.7613102650055925e-05,
+      "loss": 0.2264,
+      "step": 3900
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.111098426089184,
+      "learning_rate": 1.759860401326581e-05,
+      "loss": 0.1838,
+      "step": 3910
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 2.3865926806030204,
+      "learning_rate": 1.758406748223194e-05,
+      "loss": 0.1779,
+      "step": 3920
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.1732390806125,
+      "learning_rate": 1.7569493129457554e-05,
+      "loss": 0.1713,
+      "step": 3930
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8814065202760815,
+      "learning_rate": 1.7554881027634516e-05,
+      "loss": 0.178,
+      "step": 3940
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.16916361006078,
+      "learning_rate": 1.754023124964299e-05,
+      "loss": 0.1475,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.27943306248829,
+      "learning_rate": 1.7525543868551045e-05,
+      "loss": 0.1997,
+      "step": 3960
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 3.117493119710199,
+      "learning_rate": 1.7510818957614292e-05,
+      "loss": 0.1475,
+      "step": 3970
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 2.3033259103584567,
+      "learning_rate": 1.7496056590275546e-05,
+      "loss": 0.1853,
+      "step": 3980
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 4.045509608812605,
+      "learning_rate": 1.7481256840164436e-05,
+      "loss": 0.171,
+      "step": 3990
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7320172203917021,
+      "learning_rate": 1.7466419781097038e-05,
+      "loss": 0.1619,
+      "step": 4000
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 2.1721310910520772,
+      "learning_rate": 1.745154548707551e-05,
+      "loss": 0.1614,
+      "step": 4010
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 3.4362498003979374,
+      "learning_rate": 1.7436634032287735e-05,
+      "loss": 0.1885,
+      "step": 4020
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.203441191364378,
+      "learning_rate": 1.7421685491106933e-05,
+      "loss": 0.1746,
+      "step": 4030
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.2235513235782136,
+      "learning_rate": 1.740669993809131e-05,
+      "loss": 0.1958,
+      "step": 4040
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.662744062478203,
+      "learning_rate": 1.7391677447983663e-05,
+      "loss": 0.168,
+      "step": 4050
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9923877091876279,
+      "learning_rate": 1.7376618095711018e-05,
+      "loss": 0.1718,
+      "step": 4060
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.426593736593661,
+      "learning_rate": 1.7361521956384264e-05,
+      "loss": 0.1741,
+      "step": 4070
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7464227217148067,
+      "learning_rate": 1.7346389105297766e-05,
+      "loss": 0.1726,
+      "step": 4080
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.3749413734026383,
+      "learning_rate": 1.7331219617928997e-05,
+      "loss": 0.1583,
+      "step": 4090
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.114701165986187,
+      "learning_rate": 1.7316013569938154e-05,
+      "loss": 0.2066,
+      "step": 4100
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.8180037497973824,
+      "learning_rate": 1.73007710371678e-05,
+      "loss": 0.188,
+      "step": 4110
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.2556879416055726,
+      "learning_rate": 1.7285492095642455e-05,
+      "loss": 0.1824,
+      "step": 4120
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.9441561651729724,
+      "learning_rate": 1.7270176821568244e-05,
+      "loss": 0.1828,
+      "step": 4130
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 3.863082155472389,
+      "learning_rate": 1.72548252913325e-05,
+      "loss": 0.1929,
+      "step": 4140
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 2.1777659050408067,
+      "learning_rate": 1.72394375815034e-05,
+      "loss": 0.1872,
+      "step": 4150
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 2.289054063384357,
+      "learning_rate": 1.722401376882955e-05,
+      "loss": 0.1619,
+      "step": 4160
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 7.1273553535612,
+      "learning_rate": 1.7208553930239655e-05,
+      "loss": 0.1752,
+      "step": 4170
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.4202257016442523,
+      "learning_rate": 1.7193058142842076e-05,
+      "loss": 0.1966,
+      "step": 4180
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.599358985816695,
+      "learning_rate": 1.7177526483924492e-05,
+      "loss": 0.1739,
+      "step": 4190
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.145207088406254,
+      "learning_rate": 1.7161959030953498e-05,
+      "loss": 0.1606,
+      "step": 4200
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.4989297483292643,
+      "learning_rate": 1.71463558615742e-05,
+      "loss": 0.1441,
+      "step": 4210
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.333564701416749,
+      "learning_rate": 1.713071705360987e-05,
+      "loss": 0.1697,
+      "step": 4220
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6007875381874954,
+      "learning_rate": 1.7115042685061507e-05,
+      "loss": 0.1801,
+      "step": 4230
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.0001777331534516,
+      "learning_rate": 1.7099332834107497e-05,
+      "loss": 0.1236,
+      "step": 4240
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.6336492006976187,
+      "learning_rate": 1.7083587579103187e-05,
+      "loss": 0.166,
+      "step": 4250
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.181960694578793,
+      "learning_rate": 1.7067806998580507e-05,
+      "loss": 0.1997,
+      "step": 4260
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 3.2024986585127366,
+      "learning_rate": 1.7051991171247582e-05,
+      "loss": 0.1508,
+      "step": 4270
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 2.0206614417941258,
+      "learning_rate": 1.7036140175988344e-05,
+      "loss": 0.1471,
+      "step": 4280
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 2.047393965997251,
+      "learning_rate": 1.702025409186211e-05,
+      "loss": 0.1777,
+      "step": 4290
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 2.6683142062789713,
+      "learning_rate": 1.7004332998103232e-05,
+      "loss": 0.1769,
+      "step": 4300
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6732439515126907,
+      "learning_rate": 1.698837697412066e-05,
+      "loss": 0.1268,
+      "step": 4310
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 3.001202183493077,
+      "learning_rate": 1.697238609949757e-05,
+      "loss": 0.1489,
+      "step": 4320
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9957859824768167,
+      "learning_rate": 1.6956360453990964e-05,
+      "loss": 0.1536,
+      "step": 4330
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 3.2825840735727154,
+      "learning_rate": 1.694030011753127e-05,
+      "loss": 0.2101,
+      "step": 4340
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.0765138274517088,
+      "learning_rate": 1.6924205170221933e-05,
+      "loss": 0.1811,
+      "step": 4350
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.1133828976030595,
+      "learning_rate": 1.6908075692339035e-05,
+      "loss": 0.1728,
+      "step": 4360
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.1354696745558366,
+      "learning_rate": 1.6891911764330887e-05,
+      "loss": 0.1663,
+      "step": 4370
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 2.131963092613327,
+      "learning_rate": 1.6875713466817608e-05,
+      "loss": 0.1971,
+      "step": 4380
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 6.080414407059132,
+      "learning_rate": 1.6859480880590755e-05,
+      "loss": 0.1518,
+      "step": 4390
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.9313104885467733,
+      "learning_rate": 1.684321408661291e-05,
+      "loss": 0.1726,
+      "step": 4400
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.891911281896888,
+      "learning_rate": 1.6826913166017257e-05,
+      "loss": 0.2049,
+      "step": 4410
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.8833660404212225,
+      "learning_rate": 1.68105782001072e-05,
+      "loss": 0.1628,
+      "step": 4420
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 2.043852343315347,
+      "learning_rate": 1.6794209270355946e-05,
+      "loss": 0.1975,
+      "step": 4430
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.4598525807667742,
+      "learning_rate": 1.677780645840611e-05,
+      "loss": 0.159,
+      "step": 4440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.6015094283079794,
+      "learning_rate": 1.6761369846069292e-05,
+      "loss": 0.157,
+      "step": 4450
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 2.0628558921093125,
+      "learning_rate": 1.6744899515325674e-05,
+      "loss": 0.1748,
+      "step": 4460
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.076881323364807,
+      "learning_rate": 1.672839554832362e-05,
+      "loss": 0.1966,
+      "step": 4470
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.0325935028828135,
+      "learning_rate": 1.671185802737926e-05,
+      "loss": 0.1885,
+      "step": 4480
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.280549242220261,
+      "learning_rate": 1.6695287034976078e-05,
+      "loss": 0.1624,
+      "step": 4490
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.08609705396923,
+      "learning_rate": 1.6678682653764502e-05,
+      "loss": 0.1631,
+      "step": 4500
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.2526237237040903,
+      "learning_rate": 1.666204496656149e-05,
+      "loss": 0.145,
+      "step": 4510
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.9007711000245981,
+      "learning_rate": 1.6645374056350128e-05,
+      "loss": 0.173,
+      "step": 4520
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.9372171995443488,
+      "learning_rate": 1.6628670006279194e-05,
+      "loss": 0.142,
+      "step": 4530
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 14808,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 4532,
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5a9a77cca320ca6b5e0711cab13402d6db40c25a953099f160649f67f0e98c2
+size 6712

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,592 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+debug = 0  # any truthy value turns on the diagnostic prints guarded by `if debug:` below
+# load to cpu
+device = torch.device('cpu')  # handed to torch.load as map_location when reading checkpoint files
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)