UltimoUno commited on
Commit
479d4eb
1 Parent(s): b03f73d

Uploaded checkpoint-25000

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +3511 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bcfce4195a15d1c67172a7aefe67af94867820d68efa2e46ab93cd2b5fa134b
3
  size 2692969128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bb301bf8fc5272c04a72e51a13f9b832cd846a9c51360f628cb3fc9a1ac42a2
3
  size 2692969128
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc261511118721b030d66cbd19ed9d28203a26f346b752eaad9f2d558bde7468
3
  size 5386075202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0211b169e1bc00bdf338234616fcee78f3b3fa8789f060393af487bb73e8419c
3
  size 5386075202
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15a3a2ff59d530809f33e75e185c52fdceb84e6eba7c55faa5cf42d910644089
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c92a95a97d689d636b085d406167a1d143dce26fb83ee64d21cf4b37a120302
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3bdbaa37c77733a3ea9eb90a36bc290f4f5b9f56abe23cc6586cbaa459f92c6
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.3379485607147217,
3
  "best_model_checkpoint": "runs/deepseek_20240422-210351/checkpoint-15000",
4
- "epoch": 0.5,
5
  "eval_steps": 5000,
6
- "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -14039,6 +14039,3514 @@
14039
  "eval_samples_per_second": 16.881,
14040
  "eval_steps_per_second": 16.881,
14041
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14042
  }
14043
  ],
14044
  "logging_steps": 10,
@@ -14046,7 +17554,7 @@
14046
  "num_input_tokens_seen": 0,
14047
  "num_train_epochs": 1,
14048
  "save_steps": 5000,
14049
- "total_flos": 3.1467396661248e+17,
14050
  "train_batch_size": 1,
14051
  "trial_name": null,
14052
  "trial_params": null
 
1
  {
2
  "best_metric": 1.3379485607147217,
3
  "best_model_checkpoint": "runs/deepseek_20240422-210351/checkpoint-15000",
4
+ "epoch": 0.625,
5
  "eval_steps": 5000,
6
+ "global_step": 25000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
14039
  "eval_samples_per_second": 16.881,
14040
  "eval_steps_per_second": 16.881,
14041
  "step": 20000
14042
+ },
14043
+ {
14044
+ "epoch": 0.5,
14045
+ "grad_norm": 15.5,
14046
+ "learning_rate": 6.772881355932204e-06,
14047
+ "loss": 1.4218,
14048
+ "step": 20010
14049
+ },
14050
+ {
14051
+ "epoch": 0.5,
14052
+ "grad_norm": 10.75,
14053
+ "learning_rate": 6.766101694915255e-06,
14054
+ "loss": 1.4617,
14055
+ "step": 20020
14056
+ },
14057
+ {
14058
+ "epoch": 0.5,
14059
+ "grad_norm": 9.0,
14060
+ "learning_rate": 6.759322033898306e-06,
14061
+ "loss": 1.3029,
14062
+ "step": 20030
14063
+ },
14064
+ {
14065
+ "epoch": 0.5,
14066
+ "grad_norm": 12.3125,
14067
+ "learning_rate": 6.7525423728813565e-06,
14068
+ "loss": 1.2876,
14069
+ "step": 20040
14070
+ },
14071
+ {
14072
+ "epoch": 0.5,
14073
+ "grad_norm": 15.3125,
14074
+ "learning_rate": 6.745762711864408e-06,
14075
+ "loss": 1.4408,
14076
+ "step": 20050
14077
+ },
14078
+ {
14079
+ "epoch": 0.5,
14080
+ "grad_norm": 35.25,
14081
+ "learning_rate": 6.7389830508474585e-06,
14082
+ "loss": 1.2928,
14083
+ "step": 20060
14084
+ },
14085
+ {
14086
+ "epoch": 0.5,
14087
+ "grad_norm": 23.25,
14088
+ "learning_rate": 6.73220338983051e-06,
14089
+ "loss": 1.3227,
14090
+ "step": 20070
14091
+ },
14092
+ {
14093
+ "epoch": 0.5,
14094
+ "grad_norm": 24.75,
14095
+ "learning_rate": 6.7254237288135604e-06,
14096
+ "loss": 1.3638,
14097
+ "step": 20080
14098
+ },
14099
+ {
14100
+ "epoch": 0.5,
14101
+ "grad_norm": 19.125,
14102
+ "learning_rate": 6.71864406779661e-06,
14103
+ "loss": 1.3714,
14104
+ "step": 20090
14105
+ },
14106
+ {
14107
+ "epoch": 0.5,
14108
+ "grad_norm": 12.625,
14109
+ "learning_rate": 6.7118644067796615e-06,
14110
+ "loss": 1.4923,
14111
+ "step": 20100
14112
+ },
14113
+ {
14114
+ "epoch": 0.5,
14115
+ "grad_norm": 26.25,
14116
+ "learning_rate": 6.705084745762712e-06,
14117
+ "loss": 1.4869,
14118
+ "step": 20110
14119
+ },
14120
+ {
14121
+ "epoch": 0.5,
14122
+ "grad_norm": 17.75,
14123
+ "learning_rate": 6.6983050847457635e-06,
14124
+ "loss": 1.3427,
14125
+ "step": 20120
14126
+ },
14127
+ {
14128
+ "epoch": 0.5,
14129
+ "grad_norm": 33.5,
14130
+ "learning_rate": 6.691525423728814e-06,
14131
+ "loss": 1.3611,
14132
+ "step": 20130
14133
+ },
14134
+ {
14135
+ "epoch": 0.5,
14136
+ "grad_norm": 14.1875,
14137
+ "learning_rate": 6.6847457627118655e-06,
14138
+ "loss": 1.4874,
14139
+ "step": 20140
14140
+ },
14141
+ {
14142
+ "epoch": 0.5,
14143
+ "grad_norm": 28.25,
14144
+ "learning_rate": 6.677966101694916e-06,
14145
+ "loss": 1.368,
14146
+ "step": 20150
14147
+ },
14148
+ {
14149
+ "epoch": 0.5,
14150
+ "grad_norm": 51.5,
14151
+ "learning_rate": 6.6711864406779666e-06,
14152
+ "loss": 1.3734,
14153
+ "step": 20160
14154
+ },
14155
+ {
14156
+ "epoch": 0.5,
14157
+ "grad_norm": 44.5,
14158
+ "learning_rate": 6.664406779661018e-06,
14159
+ "loss": 1.3609,
14160
+ "step": 20170
14161
+ },
14162
+ {
14163
+ "epoch": 0.5,
14164
+ "grad_norm": 41.0,
14165
+ "learning_rate": 6.6576271186440685e-06,
14166
+ "loss": 1.4929,
14167
+ "step": 20180
14168
+ },
14169
+ {
14170
+ "epoch": 0.5,
14171
+ "grad_norm": 49.5,
14172
+ "learning_rate": 6.650847457627119e-06,
14173
+ "loss": 1.3619,
14174
+ "step": 20190
14175
+ },
14176
+ {
14177
+ "epoch": 0.51,
14178
+ "grad_norm": 24.125,
14179
+ "learning_rate": 6.64406779661017e-06,
14180
+ "loss": 1.3749,
14181
+ "step": 20200
14182
+ },
14183
+ {
14184
+ "epoch": 0.51,
14185
+ "grad_norm": 31.75,
14186
+ "learning_rate": 6.637288135593221e-06,
14187
+ "loss": 1.4952,
14188
+ "step": 20210
14189
+ },
14190
+ {
14191
+ "epoch": 0.51,
14192
+ "grad_norm": 21.0,
14193
+ "learning_rate": 6.6305084745762716e-06,
14194
+ "loss": 1.3311,
14195
+ "step": 20220
14196
+ },
14197
+ {
14198
+ "epoch": 0.51,
14199
+ "grad_norm": 28.5,
14200
+ "learning_rate": 6.623728813559322e-06,
14201
+ "loss": 1.4247,
14202
+ "step": 20230
14203
+ },
14204
+ {
14205
+ "epoch": 0.51,
14206
+ "grad_norm": 20.375,
14207
+ "learning_rate": 6.6169491525423735e-06,
14208
+ "loss": 1.4556,
14209
+ "step": 20240
14210
+ },
14211
+ {
14212
+ "epoch": 0.51,
14213
+ "grad_norm": 28.25,
14214
+ "learning_rate": 6.610169491525424e-06,
14215
+ "loss": 1.4477,
14216
+ "step": 20250
14217
+ },
14218
+ {
14219
+ "epoch": 0.51,
14220
+ "grad_norm": 21.0,
14221
+ "learning_rate": 6.6033898305084755e-06,
14222
+ "loss": 1.3582,
14223
+ "step": 20260
14224
+ },
14225
+ {
14226
+ "epoch": 0.51,
14227
+ "grad_norm": 55.75,
14228
+ "learning_rate": 6.596610169491526e-06,
14229
+ "loss": 1.3473,
14230
+ "step": 20270
14231
+ },
14232
+ {
14233
+ "epoch": 0.51,
14234
+ "grad_norm": 9.1875,
14235
+ "learning_rate": 6.5898305084745774e-06,
14236
+ "loss": 1.4635,
14237
+ "step": 20280
14238
+ },
14239
+ {
14240
+ "epoch": 0.51,
14241
+ "grad_norm": 8.3125,
14242
+ "learning_rate": 6.583050847457627e-06,
14243
+ "loss": 1.249,
14244
+ "step": 20290
14245
+ },
14246
+ {
14247
+ "epoch": 0.51,
14248
+ "grad_norm": 65.0,
14249
+ "learning_rate": 6.576271186440678e-06,
14250
+ "loss": 1.3769,
14251
+ "step": 20300
14252
+ },
14253
+ {
14254
+ "epoch": 0.51,
14255
+ "grad_norm": 14.125,
14256
+ "learning_rate": 6.569491525423729e-06,
14257
+ "loss": 1.4276,
14258
+ "step": 20310
14259
+ },
14260
+ {
14261
+ "epoch": 0.51,
14262
+ "grad_norm": 44.0,
14263
+ "learning_rate": 6.56271186440678e-06,
14264
+ "loss": 1.4521,
14265
+ "step": 20320
14266
+ },
14267
+ {
14268
+ "epoch": 0.51,
14269
+ "grad_norm": 43.5,
14270
+ "learning_rate": 6.555932203389831e-06,
14271
+ "loss": 1.3885,
14272
+ "step": 20330
14273
+ },
14274
+ {
14275
+ "epoch": 0.51,
14276
+ "grad_norm": 52.25,
14277
+ "learning_rate": 6.549152542372882e-06,
14278
+ "loss": 1.345,
14279
+ "step": 20340
14280
+ },
14281
+ {
14282
+ "epoch": 0.51,
14283
+ "grad_norm": 27.625,
14284
+ "learning_rate": 6.542372881355933e-06,
14285
+ "loss": 1.3546,
14286
+ "step": 20350
14287
+ },
14288
+ {
14289
+ "epoch": 0.51,
14290
+ "grad_norm": 42.0,
14291
+ "learning_rate": 6.5355932203389836e-06,
14292
+ "loss": 1.3781,
14293
+ "step": 20360
14294
+ },
14295
+ {
14296
+ "epoch": 0.51,
14297
+ "grad_norm": 10.625,
14298
+ "learning_rate": 6.528813559322035e-06,
14299
+ "loss": 1.3201,
14300
+ "step": 20370
14301
+ },
14302
+ {
14303
+ "epoch": 0.51,
14304
+ "grad_norm": 34.25,
14305
+ "learning_rate": 6.5220338983050855e-06,
14306
+ "loss": 1.3468,
14307
+ "step": 20380
14308
+ },
14309
+ {
14310
+ "epoch": 0.51,
14311
+ "grad_norm": 27.125,
14312
+ "learning_rate": 6.515254237288137e-06,
14313
+ "loss": 1.3345,
14314
+ "step": 20390
14315
+ },
14316
+ {
14317
+ "epoch": 0.51,
14318
+ "grad_norm": 13.625,
14319
+ "learning_rate": 6.508474576271187e-06,
14320
+ "loss": 1.2565,
14321
+ "step": 20400
14322
+ },
14323
+ {
14324
+ "epoch": 0.51,
14325
+ "grad_norm": 17.625,
14326
+ "learning_rate": 6.501694915254237e-06,
14327
+ "loss": 1.3067,
14328
+ "step": 20410
14329
+ },
14330
+ {
14331
+ "epoch": 0.51,
14332
+ "grad_norm": 16.0,
14333
+ "learning_rate": 6.4949152542372886e-06,
14334
+ "loss": 1.4644,
14335
+ "step": 20420
14336
+ },
14337
+ {
14338
+ "epoch": 0.51,
14339
+ "grad_norm": 29.25,
14340
+ "learning_rate": 6.488135593220339e-06,
14341
+ "loss": 1.3739,
14342
+ "step": 20430
14343
+ },
14344
+ {
14345
+ "epoch": 0.51,
14346
+ "grad_norm": 22.875,
14347
+ "learning_rate": 6.4813559322033905e-06,
14348
+ "loss": 1.4237,
14349
+ "step": 20440
14350
+ },
14351
+ {
14352
+ "epoch": 0.51,
14353
+ "grad_norm": 14.0,
14354
+ "learning_rate": 6.474576271186441e-06,
14355
+ "loss": 1.4379,
14356
+ "step": 20450
14357
+ },
14358
+ {
14359
+ "epoch": 0.51,
14360
+ "grad_norm": 67.5,
14361
+ "learning_rate": 6.4677966101694925e-06,
14362
+ "loss": 1.2757,
14363
+ "step": 20460
14364
+ },
14365
+ {
14366
+ "epoch": 0.51,
14367
+ "grad_norm": 15.5,
14368
+ "learning_rate": 6.461016949152543e-06,
14369
+ "loss": 1.4572,
14370
+ "step": 20470
14371
+ },
14372
+ {
14373
+ "epoch": 0.51,
14374
+ "grad_norm": 15.5625,
14375
+ "learning_rate": 6.4542372881355944e-06,
14376
+ "loss": 1.3073,
14377
+ "step": 20480
14378
+ },
14379
+ {
14380
+ "epoch": 0.51,
14381
+ "grad_norm": 19.5,
14382
+ "learning_rate": 6.447457627118645e-06,
14383
+ "loss": 1.4111,
14384
+ "step": 20490
14385
+ },
14386
+ {
14387
+ "epoch": 0.51,
14388
+ "grad_norm": 31.5,
14389
+ "learning_rate": 6.440677966101695e-06,
14390
+ "loss": 1.4395,
14391
+ "step": 20500
14392
+ },
14393
+ {
14394
+ "epoch": 0.51,
14395
+ "grad_norm": 99.5,
14396
+ "learning_rate": 6.433898305084746e-06,
14397
+ "loss": 1.2802,
14398
+ "step": 20510
14399
+ },
14400
+ {
14401
+ "epoch": 0.51,
14402
+ "grad_norm": 39.25,
14403
+ "learning_rate": 6.427118644067797e-06,
14404
+ "loss": 1.3699,
14405
+ "step": 20520
14406
+ },
14407
+ {
14408
+ "epoch": 0.51,
14409
+ "grad_norm": 8.0,
14410
+ "learning_rate": 6.420338983050848e-06,
14411
+ "loss": 1.311,
14412
+ "step": 20530
14413
+ },
14414
+ {
14415
+ "epoch": 0.51,
14416
+ "grad_norm": 32.25,
14417
+ "learning_rate": 6.413559322033899e-06,
14418
+ "loss": 1.2298,
14419
+ "step": 20540
14420
+ },
14421
+ {
14422
+ "epoch": 0.51,
14423
+ "grad_norm": 65.5,
14424
+ "learning_rate": 6.40677966101695e-06,
14425
+ "loss": 1.1912,
14426
+ "step": 20550
14427
+ },
14428
+ {
14429
+ "epoch": 0.51,
14430
+ "grad_norm": 22.375,
14431
+ "learning_rate": 6.4000000000000006e-06,
14432
+ "loss": 1.2825,
14433
+ "step": 20560
14434
+ },
14435
+ {
14436
+ "epoch": 0.51,
14437
+ "grad_norm": 6.25,
14438
+ "learning_rate": 6.393220338983052e-06,
14439
+ "loss": 1.3518,
14440
+ "step": 20570
14441
+ },
14442
+ {
14443
+ "epoch": 0.51,
14444
+ "grad_norm": 38.0,
14445
+ "learning_rate": 6.3864406779661025e-06,
14446
+ "loss": 1.3,
14447
+ "step": 20580
14448
+ },
14449
+ {
14450
+ "epoch": 0.51,
14451
+ "grad_norm": 14.125,
14452
+ "learning_rate": 6.379661016949154e-06,
14453
+ "loss": 1.2622,
14454
+ "step": 20590
14455
+ },
14456
+ {
14457
+ "epoch": 0.52,
14458
+ "grad_norm": 52.75,
14459
+ "learning_rate": 6.372881355932204e-06,
14460
+ "loss": 1.1673,
14461
+ "step": 20600
14462
+ },
14463
+ {
14464
+ "epoch": 0.52,
14465
+ "grad_norm": 38.25,
14466
+ "learning_rate": 6.366101694915254e-06,
14467
+ "loss": 1.3722,
14468
+ "step": 20610
14469
+ },
14470
+ {
14471
+ "epoch": 0.52,
14472
+ "grad_norm": 13.8125,
14473
+ "learning_rate": 6.3593220338983056e-06,
14474
+ "loss": 1.2911,
14475
+ "step": 20620
14476
+ },
14477
+ {
14478
+ "epoch": 0.52,
14479
+ "grad_norm": 19.875,
14480
+ "learning_rate": 6.352542372881356e-06,
14481
+ "loss": 1.4929,
14482
+ "step": 20630
14483
+ },
14484
+ {
14485
+ "epoch": 0.52,
14486
+ "grad_norm": 15.625,
14487
+ "learning_rate": 6.3457627118644075e-06,
14488
+ "loss": 1.2885,
14489
+ "step": 20640
14490
+ },
14491
+ {
14492
+ "epoch": 0.52,
14493
+ "grad_norm": 9.375,
14494
+ "learning_rate": 6.338983050847458e-06,
14495
+ "loss": 1.4606,
14496
+ "step": 20650
14497
+ },
14498
+ {
14499
+ "epoch": 0.52,
14500
+ "grad_norm": 13.8125,
14501
+ "learning_rate": 6.3322033898305095e-06,
14502
+ "loss": 1.398,
14503
+ "step": 20660
14504
+ },
14505
+ {
14506
+ "epoch": 0.52,
14507
+ "grad_norm": 63.5,
14508
+ "learning_rate": 6.32542372881356e-06,
14509
+ "loss": 1.309,
14510
+ "step": 20670
14511
+ },
14512
+ {
14513
+ "epoch": 0.52,
14514
+ "grad_norm": 34.75,
14515
+ "learning_rate": 6.318644067796611e-06,
14516
+ "loss": 1.4643,
14517
+ "step": 20680
14518
+ },
14519
+ {
14520
+ "epoch": 0.52,
14521
+ "grad_norm": 17.875,
14522
+ "learning_rate": 6.311864406779662e-06,
14523
+ "loss": 1.4006,
14524
+ "step": 20690
14525
+ },
14526
+ {
14527
+ "epoch": 0.52,
14528
+ "grad_norm": 36.5,
14529
+ "learning_rate": 6.3050847457627125e-06,
14530
+ "loss": 1.4196,
14531
+ "step": 20700
14532
+ },
14533
+ {
14534
+ "epoch": 0.52,
14535
+ "grad_norm": 26.75,
14536
+ "learning_rate": 6.298305084745763e-06,
14537
+ "loss": 1.5026,
14538
+ "step": 20710
14539
+ },
14540
+ {
14541
+ "epoch": 0.52,
14542
+ "grad_norm": 52.75,
14543
+ "learning_rate": 6.291525423728814e-06,
14544
+ "loss": 1.3181,
14545
+ "step": 20720
14546
+ },
14547
+ {
14548
+ "epoch": 0.52,
14549
+ "grad_norm": 18.0,
14550
+ "learning_rate": 6.284745762711865e-06,
14551
+ "loss": 1.2799,
14552
+ "step": 20730
14553
+ },
14554
+ {
14555
+ "epoch": 0.52,
14556
+ "grad_norm": 13.0625,
14557
+ "learning_rate": 6.277966101694916e-06,
14558
+ "loss": 1.5509,
14559
+ "step": 20740
14560
+ },
14561
+ {
14562
+ "epoch": 0.52,
14563
+ "grad_norm": 14.625,
14564
+ "learning_rate": 6.271186440677966e-06,
14565
+ "loss": 1.1255,
14566
+ "step": 20750
14567
+ },
14568
+ {
14569
+ "epoch": 0.52,
14570
+ "grad_norm": 11.9375,
14571
+ "learning_rate": 6.2644067796610176e-06,
14572
+ "loss": 1.2412,
14573
+ "step": 20760
14574
+ },
14575
+ {
14576
+ "epoch": 0.52,
14577
+ "grad_norm": 40.25,
14578
+ "learning_rate": 6.257627118644068e-06,
14579
+ "loss": 1.3701,
14580
+ "step": 20770
14581
+ },
14582
+ {
14583
+ "epoch": 0.52,
14584
+ "grad_norm": 25.875,
14585
+ "learning_rate": 6.2508474576271195e-06,
14586
+ "loss": 1.3874,
14587
+ "step": 20780
14588
+ },
14589
+ {
14590
+ "epoch": 0.52,
14591
+ "grad_norm": 11.8125,
14592
+ "learning_rate": 6.24406779661017e-06,
14593
+ "loss": 1.3743,
14594
+ "step": 20790
14595
+ },
14596
+ {
14597
+ "epoch": 0.52,
14598
+ "grad_norm": 9.25,
14599
+ "learning_rate": 6.2372881355932215e-06,
14600
+ "loss": 1.4481,
14601
+ "step": 20800
14602
+ },
14603
+ {
14604
+ "epoch": 0.52,
14605
+ "grad_norm": 26.875,
14606
+ "learning_rate": 6.230508474576271e-06,
14607
+ "loss": 1.381,
14608
+ "step": 20810
14609
+ },
14610
+ {
14611
+ "epoch": 0.52,
14612
+ "grad_norm": 14.1875,
14613
+ "learning_rate": 6.223728813559322e-06,
14614
+ "loss": 1.0738,
14615
+ "step": 20820
14616
+ },
14617
+ {
14618
+ "epoch": 0.52,
14619
+ "grad_norm": 24.75,
14620
+ "learning_rate": 6.216949152542373e-06,
14621
+ "loss": 1.2937,
14622
+ "step": 20830
14623
+ },
14624
+ {
14625
+ "epoch": 0.52,
14626
+ "grad_norm": 28.625,
14627
+ "learning_rate": 6.210169491525424e-06,
14628
+ "loss": 1.3835,
14629
+ "step": 20840
14630
+ },
14631
+ {
14632
+ "epoch": 0.52,
14633
+ "grad_norm": 47.0,
14634
+ "learning_rate": 6.203389830508475e-06,
14635
+ "loss": 1.565,
14636
+ "step": 20850
14637
+ },
14638
+ {
14639
+ "epoch": 0.52,
14640
+ "grad_norm": 7.6875,
14641
+ "learning_rate": 6.196610169491526e-06,
14642
+ "loss": 1.3831,
14643
+ "step": 20860
14644
+ },
14645
+ {
14646
+ "epoch": 0.52,
14647
+ "grad_norm": 7.53125,
14648
+ "learning_rate": 6.189830508474577e-06,
14649
+ "loss": 1.2528,
14650
+ "step": 20870
14651
+ },
14652
+ {
14653
+ "epoch": 0.52,
14654
+ "grad_norm": 20.75,
14655
+ "learning_rate": 6.183050847457628e-06,
14656
+ "loss": 1.4723,
14657
+ "step": 20880
14658
+ },
14659
+ {
14660
+ "epoch": 0.52,
14661
+ "grad_norm": 14.3125,
14662
+ "learning_rate": 6.176271186440679e-06,
14663
+ "loss": 1.4442,
14664
+ "step": 20890
14665
+ },
14666
+ {
14667
+ "epoch": 0.52,
14668
+ "grad_norm": 12.9375,
14669
+ "learning_rate": 6.1694915254237295e-06,
14670
+ "loss": 1.3466,
14671
+ "step": 20900
14672
+ },
14673
+ {
14674
+ "epoch": 0.52,
14675
+ "grad_norm": 38.0,
14676
+ "learning_rate": 6.162711864406781e-06,
14677
+ "loss": 1.3282,
14678
+ "step": 20910
14679
+ },
14680
+ {
14681
+ "epoch": 0.52,
14682
+ "grad_norm": 24.625,
14683
+ "learning_rate": 6.155932203389831e-06,
14684
+ "loss": 1.3331,
14685
+ "step": 20920
14686
+ },
14687
+ {
14688
+ "epoch": 0.52,
14689
+ "grad_norm": 30.25,
14690
+ "learning_rate": 6.149152542372881e-06,
14691
+ "loss": 1.4733,
14692
+ "step": 20930
14693
+ },
14694
+ {
14695
+ "epoch": 0.52,
14696
+ "grad_norm": 25.625,
14697
+ "learning_rate": 6.142372881355933e-06,
14698
+ "loss": 1.558,
14699
+ "step": 20940
14700
+ },
14701
+ {
14702
+ "epoch": 0.52,
14703
+ "grad_norm": 16.75,
14704
+ "learning_rate": 6.135593220338983e-06,
14705
+ "loss": 1.225,
14706
+ "step": 20950
14707
+ },
14708
+ {
14709
+ "epoch": 0.52,
14710
+ "grad_norm": 18.5,
14711
+ "learning_rate": 6.1288135593220346e-06,
14712
+ "loss": 1.3408,
14713
+ "step": 20960
14714
+ },
14715
+ {
14716
+ "epoch": 0.52,
14717
+ "grad_norm": 25.125,
14718
+ "learning_rate": 6.122033898305085e-06,
14719
+ "loss": 1.424,
14720
+ "step": 20970
14721
+ },
14722
+ {
14723
+ "epoch": 0.52,
14724
+ "grad_norm": 67.0,
14725
+ "learning_rate": 6.1152542372881365e-06,
14726
+ "loss": 1.2106,
14727
+ "step": 20980
14728
+ },
14729
+ {
14730
+ "epoch": 0.52,
14731
+ "grad_norm": 17.25,
14732
+ "learning_rate": 6.108474576271187e-06,
14733
+ "loss": 1.3747,
14734
+ "step": 20990
14735
+ },
14736
+ {
14737
+ "epoch": 0.53,
14738
+ "grad_norm": 20.625,
14739
+ "learning_rate": 6.1016949152542385e-06,
14740
+ "loss": 1.3211,
14741
+ "step": 21000
14742
+ },
14743
+ {
14744
+ "epoch": 0.53,
14745
+ "grad_norm": 40.0,
14746
+ "learning_rate": 6.094915254237289e-06,
14747
+ "loss": 1.3193,
14748
+ "step": 21010
14749
+ },
14750
+ {
14751
+ "epoch": 0.53,
14752
+ "grad_norm": 52.0,
14753
+ "learning_rate": 6.088135593220339e-06,
14754
+ "loss": 1.4027,
14755
+ "step": 21020
14756
+ },
14757
+ {
14758
+ "epoch": 0.53,
14759
+ "grad_norm": 12.375,
14760
+ "learning_rate": 6.08135593220339e-06,
14761
+ "loss": 1.3687,
14762
+ "step": 21030
14763
+ },
14764
+ {
14765
+ "epoch": 0.53,
14766
+ "grad_norm": 23.0,
14767
+ "learning_rate": 6.074576271186441e-06,
14768
+ "loss": 1.4436,
14769
+ "step": 21040
14770
+ },
14771
+ {
14772
+ "epoch": 0.53,
14773
+ "grad_norm": 46.75,
14774
+ "learning_rate": 6.067796610169492e-06,
14775
+ "loss": 1.3343,
14776
+ "step": 21050
14777
+ },
14778
+ {
14779
+ "epoch": 0.53,
14780
+ "grad_norm": 21.0,
14781
+ "learning_rate": 6.061016949152543e-06,
14782
+ "loss": 1.3544,
14783
+ "step": 21060
14784
+ },
14785
+ {
14786
+ "epoch": 0.53,
14787
+ "grad_norm": 16.875,
14788
+ "learning_rate": 6.054237288135594e-06,
14789
+ "loss": 1.3887,
14790
+ "step": 21070
14791
+ },
14792
+ {
14793
+ "epoch": 0.53,
14794
+ "grad_norm": 14.9375,
14795
+ "learning_rate": 6.047457627118645e-06,
14796
+ "loss": 1.385,
14797
+ "step": 21080
14798
+ },
14799
+ {
14800
+ "epoch": 0.53,
14801
+ "grad_norm": 78.0,
14802
+ "learning_rate": 6.040677966101696e-06,
14803
+ "loss": 1.4487,
14804
+ "step": 21090
14805
+ },
14806
+ {
14807
+ "epoch": 0.53,
14808
+ "grad_norm": 16.625,
14809
+ "learning_rate": 6.0338983050847465e-06,
14810
+ "loss": 1.4415,
14811
+ "step": 21100
14812
+ },
14813
+ {
14814
+ "epoch": 0.53,
14815
+ "grad_norm": 33.0,
14816
+ "learning_rate": 6.027118644067798e-06,
14817
+ "loss": 1.3929,
14818
+ "step": 21110
14819
+ },
14820
+ {
14821
+ "epoch": 0.53,
14822
+ "grad_norm": 17.375,
14823
+ "learning_rate": 6.020338983050848e-06,
14824
+ "loss": 1.2834,
14825
+ "step": 21120
14826
+ },
14827
+ {
14828
+ "epoch": 0.53,
14829
+ "grad_norm": 14.8125,
14830
+ "learning_rate": 6.013559322033898e-06,
14831
+ "loss": 1.4886,
14832
+ "step": 21130
14833
+ },
14834
+ {
14835
+ "epoch": 0.53,
14836
+ "grad_norm": 22.0,
14837
+ "learning_rate": 6.00677966101695e-06,
14838
+ "loss": 1.401,
14839
+ "step": 21140
14840
+ },
14841
+ {
14842
+ "epoch": 0.53,
14843
+ "grad_norm": 26.875,
14844
+ "learning_rate": 6e-06,
14845
+ "loss": 1.2837,
14846
+ "step": 21150
14847
+ },
14848
+ {
14849
+ "epoch": 0.53,
14850
+ "grad_norm": 27.75,
14851
+ "learning_rate": 5.9932203389830516e-06,
14852
+ "loss": 1.1739,
14853
+ "step": 21160
14854
+ },
14855
+ {
14856
+ "epoch": 0.53,
14857
+ "grad_norm": 27.0,
14858
+ "learning_rate": 5.986440677966102e-06,
14859
+ "loss": 1.4129,
14860
+ "step": 21170
14861
+ },
14862
+ {
14863
+ "epoch": 0.53,
14864
+ "grad_norm": 37.0,
14865
+ "learning_rate": 5.9796610169491535e-06,
14866
+ "loss": 1.3065,
14867
+ "step": 21180
14868
+ },
14869
+ {
14870
+ "epoch": 0.53,
14871
+ "grad_norm": 15.625,
14872
+ "learning_rate": 5.972881355932204e-06,
14873
+ "loss": 1.4291,
14874
+ "step": 21190
14875
+ },
14876
+ {
14877
+ "epoch": 0.53,
14878
+ "grad_norm": 17.25,
14879
+ "learning_rate": 5.9661016949152555e-06,
14880
+ "loss": 1.3072,
14881
+ "step": 21200
14882
+ },
14883
+ {
14884
+ "epoch": 0.53,
14885
+ "grad_norm": 11.375,
14886
+ "learning_rate": 5.959322033898306e-06,
14887
+ "loss": 1.2729,
14888
+ "step": 21210
14889
+ },
14890
+ {
14891
+ "epoch": 0.53,
14892
+ "grad_norm": 23.75,
14893
+ "learning_rate": 5.9525423728813566e-06,
14894
+ "loss": 1.3186,
14895
+ "step": 21220
14896
+ },
14897
+ {
14898
+ "epoch": 0.53,
14899
+ "grad_norm": 14.125,
14900
+ "learning_rate": 5.945762711864407e-06,
14901
+ "loss": 1.3264,
14902
+ "step": 21230
14903
+ },
14904
+ {
14905
+ "epoch": 0.53,
14906
+ "grad_norm": 36.5,
14907
+ "learning_rate": 5.938983050847458e-06,
14908
+ "loss": 1.4535,
14909
+ "step": 21240
14910
+ },
14911
+ {
14912
+ "epoch": 0.53,
14913
+ "grad_norm": 12.0625,
14914
+ "learning_rate": 5.932203389830509e-06,
14915
+ "loss": 1.4143,
14916
+ "step": 21250
14917
+ },
14918
+ {
14919
+ "epoch": 0.53,
14920
+ "grad_norm": 30.75,
14921
+ "learning_rate": 5.92542372881356e-06,
14922
+ "loss": 1.2338,
14923
+ "step": 21260
14924
+ },
14925
+ {
14926
+ "epoch": 0.53,
14927
+ "grad_norm": 31.5,
14928
+ "learning_rate": 5.91864406779661e-06,
14929
+ "loss": 1.3512,
14930
+ "step": 21270
14931
+ },
14932
+ {
14933
+ "epoch": 0.53,
14934
+ "grad_norm": 16.25,
14935
+ "learning_rate": 5.911864406779662e-06,
14936
+ "loss": 1.4743,
14937
+ "step": 21280
14938
+ },
14939
+ {
14940
+ "epoch": 0.53,
14941
+ "grad_norm": 30.625,
14942
+ "learning_rate": 5.905084745762712e-06,
14943
+ "loss": 1.5777,
14944
+ "step": 21290
14945
+ },
14946
+ {
14947
+ "epoch": 0.53,
14948
+ "grad_norm": 15.875,
14949
+ "learning_rate": 5.8983050847457635e-06,
14950
+ "loss": 1.4545,
14951
+ "step": 21300
14952
+ },
14953
+ {
14954
+ "epoch": 0.53,
14955
+ "grad_norm": 48.25,
14956
+ "learning_rate": 5.891525423728814e-06,
14957
+ "loss": 1.3676,
14958
+ "step": 21310
14959
+ },
14960
+ {
14961
+ "epoch": 0.53,
14962
+ "grad_norm": 70.5,
14963
+ "learning_rate": 5.8847457627118655e-06,
14964
+ "loss": 1.2971,
14965
+ "step": 21320
14966
+ },
14967
+ {
14968
+ "epoch": 0.53,
14969
+ "grad_norm": 18.375,
14970
+ "learning_rate": 5.877966101694915e-06,
14971
+ "loss": 1.3533,
14972
+ "step": 21330
14973
+ },
14974
+ {
14975
+ "epoch": 0.53,
14976
+ "grad_norm": 17.5,
14977
+ "learning_rate": 5.871186440677966e-06,
14978
+ "loss": 1.4603,
14979
+ "step": 21340
14980
+ },
14981
+ {
14982
+ "epoch": 0.53,
14983
+ "grad_norm": 14.3125,
14984
+ "learning_rate": 5.864406779661017e-06,
14985
+ "loss": 1.2108,
14986
+ "step": 21350
14987
+ },
14988
+ {
14989
+ "epoch": 0.53,
14990
+ "grad_norm": 22.875,
14991
+ "learning_rate": 5.857627118644068e-06,
14992
+ "loss": 1.4043,
14993
+ "step": 21360
14994
+ },
14995
+ {
14996
+ "epoch": 0.53,
14997
+ "grad_norm": 29.5,
14998
+ "learning_rate": 5.850847457627119e-06,
14999
+ "loss": 1.4477,
15000
+ "step": 21370
15001
+ },
15002
+ {
15003
+ "epoch": 0.53,
15004
+ "grad_norm": 51.25,
15005
+ "learning_rate": 5.84406779661017e-06,
15006
+ "loss": 1.1954,
15007
+ "step": 21380
15008
+ },
15009
+ {
15010
+ "epoch": 0.53,
15011
+ "grad_norm": 17.375,
15012
+ "learning_rate": 5.837288135593221e-06,
15013
+ "loss": 1.4261,
15014
+ "step": 21390
15015
+ },
15016
+ {
15017
+ "epoch": 0.54,
15018
+ "grad_norm": 16.75,
15019
+ "learning_rate": 5.830508474576272e-06,
15020
+ "loss": 1.2177,
15021
+ "step": 21400
15022
+ },
15023
+ {
15024
+ "epoch": 0.54,
15025
+ "grad_norm": 31.375,
15026
+ "learning_rate": 5.823728813559323e-06,
15027
+ "loss": 1.4369,
15028
+ "step": 21410
15029
+ },
15030
+ {
15031
+ "epoch": 0.54,
15032
+ "grad_norm": 38.5,
15033
+ "learning_rate": 5.8169491525423736e-06,
15034
+ "loss": 1.524,
15035
+ "step": 21420
15036
+ },
15037
+ {
15038
+ "epoch": 0.54,
15039
+ "grad_norm": 24.125,
15040
+ "learning_rate": 5.810169491525425e-06,
15041
+ "loss": 1.287,
15042
+ "step": 21430
15043
+ },
15044
+ {
15045
+ "epoch": 0.54,
15046
+ "grad_norm": 27.25,
15047
+ "learning_rate": 5.803389830508475e-06,
15048
+ "loss": 1.331,
15049
+ "step": 21440
15050
+ },
15051
+ {
15052
+ "epoch": 0.54,
15053
+ "grad_norm": 26.25,
15054
+ "learning_rate": 5.796610169491525e-06,
15055
+ "loss": 1.2623,
15056
+ "step": 21450
15057
+ },
15058
+ {
15059
+ "epoch": 0.54,
15060
+ "grad_norm": 27.0,
15061
+ "learning_rate": 5.789830508474577e-06,
15062
+ "loss": 1.4062,
15063
+ "step": 21460
15064
+ },
15065
+ {
15066
+ "epoch": 0.54,
15067
+ "grad_norm": 33.0,
15068
+ "learning_rate": 5.783050847457627e-06,
15069
+ "loss": 1.282,
15070
+ "step": 21470
15071
+ },
15072
+ {
15073
+ "epoch": 0.54,
15074
+ "grad_norm": 10.0,
15075
+ "learning_rate": 5.776271186440679e-06,
15076
+ "loss": 1.1568,
15077
+ "step": 21480
15078
+ },
15079
+ {
15080
+ "epoch": 0.54,
15081
+ "grad_norm": 32.5,
15082
+ "learning_rate": 5.769491525423729e-06,
15083
+ "loss": 1.3498,
15084
+ "step": 21490
15085
+ },
15086
+ {
15087
+ "epoch": 0.54,
15088
+ "grad_norm": 108.5,
15089
+ "learning_rate": 5.7627118644067805e-06,
15090
+ "loss": 1.527,
15091
+ "step": 21500
15092
+ },
15093
+ {
15094
+ "epoch": 0.54,
15095
+ "grad_norm": 32.75,
15096
+ "learning_rate": 5.755932203389831e-06,
15097
+ "loss": 1.2589,
15098
+ "step": 21510
15099
+ },
15100
+ {
15101
+ "epoch": 0.54,
15102
+ "grad_norm": 17.5,
15103
+ "learning_rate": 5.7491525423728825e-06,
15104
+ "loss": 1.2433,
15105
+ "step": 21520
15106
+ },
15107
+ {
15108
+ "epoch": 0.54,
15109
+ "grad_norm": 18.25,
15110
+ "learning_rate": 5.742372881355933e-06,
15111
+ "loss": 1.2189,
15112
+ "step": 21530
15113
+ },
15114
+ {
15115
+ "epoch": 0.54,
15116
+ "grad_norm": 23.125,
15117
+ "learning_rate": 5.735593220338983e-06,
15118
+ "loss": 1.1866,
15119
+ "step": 21540
15120
+ },
15121
+ {
15122
+ "epoch": 0.54,
15123
+ "grad_norm": 22.625,
15124
+ "learning_rate": 5.728813559322034e-06,
15125
+ "loss": 1.2153,
15126
+ "step": 21550
15127
+ },
15128
+ {
15129
+ "epoch": 0.54,
15130
+ "grad_norm": 87.5,
15131
+ "learning_rate": 5.722033898305085e-06,
15132
+ "loss": 1.4075,
15133
+ "step": 21560
15134
+ },
15135
+ {
15136
+ "epoch": 0.54,
15137
+ "grad_norm": 21.5,
15138
+ "learning_rate": 5.715254237288136e-06,
15139
+ "loss": 1.3385,
15140
+ "step": 21570
15141
+ },
15142
+ {
15143
+ "epoch": 0.54,
15144
+ "grad_norm": 18.125,
15145
+ "learning_rate": 5.708474576271187e-06,
15146
+ "loss": 1.2616,
15147
+ "step": 21580
15148
+ },
15149
+ {
15150
+ "epoch": 0.54,
15151
+ "grad_norm": 17.625,
15152
+ "learning_rate": 5.701694915254238e-06,
15153
+ "loss": 1.2999,
15154
+ "step": 21590
15155
+ },
15156
+ {
15157
+ "epoch": 0.54,
15158
+ "grad_norm": 27.375,
15159
+ "learning_rate": 5.694915254237289e-06,
15160
+ "loss": 1.3824,
15161
+ "step": 21600
15162
+ },
15163
+ {
15164
+ "epoch": 0.54,
15165
+ "grad_norm": 16.875,
15166
+ "learning_rate": 5.68813559322034e-06,
15167
+ "loss": 1.2897,
15168
+ "step": 21610
15169
+ },
15170
+ {
15171
+ "epoch": 0.54,
15172
+ "grad_norm": 37.75,
15173
+ "learning_rate": 5.6813559322033906e-06,
15174
+ "loss": 1.2475,
15175
+ "step": 21620
15176
+ },
15177
+ {
15178
+ "epoch": 0.54,
15179
+ "grad_norm": 32.0,
15180
+ "learning_rate": 5.674576271186442e-06,
15181
+ "loss": 1.4961,
15182
+ "step": 21630
15183
+ },
15184
+ {
15185
+ "epoch": 0.54,
15186
+ "grad_norm": 28.625,
15187
+ "learning_rate": 5.667796610169492e-06,
15188
+ "loss": 1.5183,
15189
+ "step": 21640
15190
+ },
15191
+ {
15192
+ "epoch": 0.54,
15193
+ "grad_norm": 34.5,
15194
+ "learning_rate": 5.661016949152542e-06,
15195
+ "loss": 1.5494,
15196
+ "step": 21650
15197
+ },
15198
+ {
15199
+ "epoch": 0.54,
15200
+ "grad_norm": 20.25,
15201
+ "learning_rate": 5.654237288135594e-06,
15202
+ "loss": 1.424,
15203
+ "step": 21660
15204
+ },
15205
+ {
15206
+ "epoch": 0.54,
15207
+ "grad_norm": 27.75,
15208
+ "learning_rate": 5.647457627118644e-06,
15209
+ "loss": 1.3111,
15210
+ "step": 21670
15211
+ },
15212
+ {
15213
+ "epoch": 0.54,
15214
+ "grad_norm": 17.125,
15215
+ "learning_rate": 5.640677966101696e-06,
15216
+ "loss": 1.3682,
15217
+ "step": 21680
15218
+ },
15219
+ {
15220
+ "epoch": 0.54,
15221
+ "grad_norm": 24.625,
15222
+ "learning_rate": 5.633898305084746e-06,
15223
+ "loss": 1.2766,
15224
+ "step": 21690
15225
+ },
15226
+ {
15227
+ "epoch": 0.54,
15228
+ "grad_norm": 24.875,
15229
+ "learning_rate": 5.6271186440677975e-06,
15230
+ "loss": 1.3045,
15231
+ "step": 21700
15232
+ },
15233
+ {
15234
+ "epoch": 0.54,
15235
+ "grad_norm": 12.3125,
15236
+ "learning_rate": 5.620338983050848e-06,
15237
+ "loss": 1.4166,
15238
+ "step": 21710
15239
+ },
15240
+ {
15241
+ "epoch": 0.54,
15242
+ "grad_norm": 13.0,
15243
+ "learning_rate": 5.6135593220338995e-06,
15244
+ "loss": 1.4751,
15245
+ "step": 21720
15246
+ },
15247
+ {
15248
+ "epoch": 0.54,
15249
+ "grad_norm": 21.125,
15250
+ "learning_rate": 5.60677966101695e-06,
15251
+ "loss": 1.2947,
15252
+ "step": 21730
15253
+ },
15254
+ {
15255
+ "epoch": 0.54,
15256
+ "grad_norm": 19.0,
15257
+ "learning_rate": 5.600000000000001e-06,
15258
+ "loss": 1.3368,
15259
+ "step": 21740
15260
+ },
15261
+ {
15262
+ "epoch": 0.54,
15263
+ "grad_norm": 20.625,
15264
+ "learning_rate": 5.593220338983051e-06,
15265
+ "loss": 1.3925,
15266
+ "step": 21750
15267
+ },
15268
+ {
15269
+ "epoch": 0.54,
15270
+ "grad_norm": 19.25,
15271
+ "learning_rate": 5.586440677966102e-06,
15272
+ "loss": 1.2478,
15273
+ "step": 21760
15274
+ },
15275
+ {
15276
+ "epoch": 0.54,
15277
+ "grad_norm": 50.5,
15278
+ "learning_rate": 5.579661016949153e-06,
15279
+ "loss": 1.3246,
15280
+ "step": 21770
15281
+ },
15282
+ {
15283
+ "epoch": 0.54,
15284
+ "grad_norm": 66.0,
15285
+ "learning_rate": 5.572881355932204e-06,
15286
+ "loss": 1.2474,
15287
+ "step": 21780
15288
+ },
15289
+ {
15290
+ "epoch": 0.54,
15291
+ "grad_norm": 22.75,
15292
+ "learning_rate": 5.566101694915255e-06,
15293
+ "loss": 1.6124,
15294
+ "step": 21790
15295
+ },
15296
+ {
15297
+ "epoch": 0.55,
15298
+ "grad_norm": 24.125,
15299
+ "learning_rate": 5.559322033898306e-06,
15300
+ "loss": 1.4545,
15301
+ "step": 21800
15302
+ },
15303
+ {
15304
+ "epoch": 0.55,
15305
+ "grad_norm": 15.5,
15306
+ "learning_rate": 5.552542372881356e-06,
15307
+ "loss": 1.1902,
15308
+ "step": 21810
15309
+ },
15310
+ {
15311
+ "epoch": 0.55,
15312
+ "grad_norm": 12.625,
15313
+ "learning_rate": 5.5457627118644076e-06,
15314
+ "loss": 1.2157,
15315
+ "step": 21820
15316
+ },
15317
+ {
15318
+ "epoch": 0.55,
15319
+ "grad_norm": 12.5,
15320
+ "learning_rate": 5.538983050847458e-06,
15321
+ "loss": 1.3625,
15322
+ "step": 21830
15323
+ },
15324
+ {
15325
+ "epoch": 0.55,
15326
+ "grad_norm": 19.0,
15327
+ "learning_rate": 5.5322033898305095e-06,
15328
+ "loss": 1.3041,
15329
+ "step": 21840
15330
+ },
15331
+ {
15332
+ "epoch": 0.55,
15333
+ "grad_norm": 47.0,
15334
+ "learning_rate": 5.525423728813559e-06,
15335
+ "loss": 1.1826,
15336
+ "step": 21850
15337
+ },
15338
+ {
15339
+ "epoch": 0.55,
15340
+ "grad_norm": 24.875,
15341
+ "learning_rate": 5.518644067796611e-06,
15342
+ "loss": 1.412,
15343
+ "step": 21860
15344
+ },
15345
+ {
15346
+ "epoch": 0.55,
15347
+ "grad_norm": 10.0625,
15348
+ "learning_rate": 5.511864406779661e-06,
15349
+ "loss": 1.3009,
15350
+ "step": 21870
15351
+ },
15352
+ {
15353
+ "epoch": 0.55,
15354
+ "grad_norm": 17.5,
15355
+ "learning_rate": 5.505084745762712e-06,
15356
+ "loss": 1.3198,
15357
+ "step": 21880
15358
+ },
15359
+ {
15360
+ "epoch": 0.55,
15361
+ "grad_norm": 31.5,
15362
+ "learning_rate": 5.498305084745763e-06,
15363
+ "loss": 1.3365,
15364
+ "step": 21890
15365
+ },
15366
+ {
15367
+ "epoch": 0.55,
15368
+ "grad_norm": 36.0,
15369
+ "learning_rate": 5.491525423728814e-06,
15370
+ "loss": 1.4381,
15371
+ "step": 21900
15372
+ },
15373
+ {
15374
+ "epoch": 0.55,
15375
+ "grad_norm": 21.625,
15376
+ "learning_rate": 5.484745762711865e-06,
15377
+ "loss": 1.3239,
15378
+ "step": 21910
15379
+ },
15380
+ {
15381
+ "epoch": 0.55,
15382
+ "grad_norm": 45.0,
15383
+ "learning_rate": 5.477966101694916e-06,
15384
+ "loss": 1.5603,
15385
+ "step": 21920
15386
+ },
15387
+ {
15388
+ "epoch": 0.55,
15389
+ "grad_norm": 32.5,
15390
+ "learning_rate": 5.471186440677967e-06,
15391
+ "loss": 1.3348,
15392
+ "step": 21930
15393
+ },
15394
+ {
15395
+ "epoch": 0.55,
15396
+ "grad_norm": 20.375,
15397
+ "learning_rate": 5.464406779661018e-06,
15398
+ "loss": 1.3331,
15399
+ "step": 21940
15400
+ },
15401
+ {
15402
+ "epoch": 0.55,
15403
+ "grad_norm": 25.0,
15404
+ "learning_rate": 5.457627118644067e-06,
15405
+ "loss": 1.4195,
15406
+ "step": 21950
15407
+ },
15408
+ {
15409
+ "epoch": 0.55,
15410
+ "grad_norm": 39.25,
15411
+ "learning_rate": 5.450847457627119e-06,
15412
+ "loss": 1.5658,
15413
+ "step": 21960
15414
+ },
15415
+ {
15416
+ "epoch": 0.55,
15417
+ "grad_norm": 46.5,
15418
+ "learning_rate": 5.444067796610169e-06,
15419
+ "loss": 1.3294,
15420
+ "step": 21970
15421
+ },
15422
+ {
15423
+ "epoch": 0.55,
15424
+ "grad_norm": 37.75,
15425
+ "learning_rate": 5.437288135593221e-06,
15426
+ "loss": 1.5364,
15427
+ "step": 21980
15428
+ },
15429
+ {
15430
+ "epoch": 0.55,
15431
+ "grad_norm": 16.25,
15432
+ "learning_rate": 5.430508474576271e-06,
15433
+ "loss": 1.4479,
15434
+ "step": 21990
15435
+ },
15436
+ {
15437
+ "epoch": 0.55,
15438
+ "grad_norm": 45.25,
15439
+ "learning_rate": 5.423728813559323e-06,
15440
+ "loss": 1.2464,
15441
+ "step": 22000
15442
+ },
15443
+ {
15444
+ "epoch": 0.55,
15445
+ "grad_norm": 21.375,
15446
+ "learning_rate": 5.416949152542373e-06,
15447
+ "loss": 1.1403,
15448
+ "step": 22010
15449
+ },
15450
+ {
15451
+ "epoch": 0.55,
15452
+ "grad_norm": 23.5,
15453
+ "learning_rate": 5.4101694915254246e-06,
15454
+ "loss": 1.3901,
15455
+ "step": 22020
15456
+ },
15457
+ {
15458
+ "epoch": 0.55,
15459
+ "grad_norm": 9.1875,
15460
+ "learning_rate": 5.403389830508475e-06,
15461
+ "loss": 1.394,
15462
+ "step": 22030
15463
+ },
15464
+ {
15465
+ "epoch": 0.55,
15466
+ "grad_norm": 10.0625,
15467
+ "learning_rate": 5.3966101694915265e-06,
15468
+ "loss": 1.0434,
15469
+ "step": 22040
15470
+ },
15471
+ {
15472
+ "epoch": 0.55,
15473
+ "grad_norm": 18.375,
15474
+ "learning_rate": 5.389830508474577e-06,
15475
+ "loss": 1.3842,
15476
+ "step": 22050
15477
+ },
15478
+ {
15479
+ "epoch": 0.55,
15480
+ "grad_norm": 1.734375,
15481
+ "learning_rate": 5.383050847457627e-06,
15482
+ "loss": 1.2067,
15483
+ "step": 22060
15484
+ },
15485
+ {
15486
+ "epoch": 0.55,
15487
+ "grad_norm": 13.5,
15488
+ "learning_rate": 5.376271186440678e-06,
15489
+ "loss": 1.3957,
15490
+ "step": 22070
15491
+ },
15492
+ {
15493
+ "epoch": 0.55,
15494
+ "grad_norm": 63.5,
15495
+ "learning_rate": 5.369491525423729e-06,
15496
+ "loss": 1.2194,
15497
+ "step": 22080
15498
+ },
15499
+ {
15500
+ "epoch": 0.55,
15501
+ "grad_norm": 32.0,
15502
+ "learning_rate": 5.36271186440678e-06,
15503
+ "loss": 1.0633,
15504
+ "step": 22090
15505
+ },
15506
+ {
15507
+ "epoch": 0.55,
15508
+ "grad_norm": 20.625,
15509
+ "learning_rate": 5.355932203389831e-06,
15510
+ "loss": 1.2579,
15511
+ "step": 22100
15512
+ },
15513
+ {
15514
+ "epoch": 0.55,
15515
+ "grad_norm": 8.3125,
15516
+ "learning_rate": 5.349152542372882e-06,
15517
+ "loss": 1.1221,
15518
+ "step": 22110
15519
+ },
15520
+ {
15521
+ "epoch": 0.55,
15522
+ "grad_norm": 25.375,
15523
+ "learning_rate": 5.342372881355933e-06,
15524
+ "loss": 1.2434,
15525
+ "step": 22120
15526
+ },
15527
+ {
15528
+ "epoch": 0.55,
15529
+ "grad_norm": 36.0,
15530
+ "learning_rate": 5.335593220338984e-06,
15531
+ "loss": 1.5282,
15532
+ "step": 22130
15533
+ },
15534
+ {
15535
+ "epoch": 0.55,
15536
+ "grad_norm": 13.4375,
15537
+ "learning_rate": 5.328813559322035e-06,
15538
+ "loss": 1.3162,
15539
+ "step": 22140
15540
+ },
15541
+ {
15542
+ "epoch": 0.55,
15543
+ "grad_norm": 33.25,
15544
+ "learning_rate": 5.322033898305086e-06,
15545
+ "loss": 1.2288,
15546
+ "step": 22150
15547
+ },
15548
+ {
15549
+ "epoch": 0.55,
15550
+ "grad_norm": 50.0,
15551
+ "learning_rate": 5.315254237288136e-06,
15552
+ "loss": 1.5033,
15553
+ "step": 22160
15554
+ },
15555
+ {
15556
+ "epoch": 0.55,
15557
+ "grad_norm": 19.75,
15558
+ "learning_rate": 5.308474576271186e-06,
15559
+ "loss": 1.3759,
15560
+ "step": 22170
15561
+ },
15562
+ {
15563
+ "epoch": 0.55,
15564
+ "grad_norm": 56.0,
15565
+ "learning_rate": 5.301694915254238e-06,
15566
+ "loss": 1.342,
15567
+ "step": 22180
15568
+ },
15569
+ {
15570
+ "epoch": 0.55,
15571
+ "grad_norm": 23.375,
15572
+ "learning_rate": 5.294915254237288e-06,
15573
+ "loss": 1.3466,
15574
+ "step": 22190
15575
+ },
15576
+ {
15577
+ "epoch": 0.56,
15578
+ "grad_norm": 60.0,
15579
+ "learning_rate": 5.28813559322034e-06,
15580
+ "loss": 1.3293,
15581
+ "step": 22200
15582
+ },
15583
+ {
15584
+ "epoch": 0.56,
15585
+ "grad_norm": 30.625,
15586
+ "learning_rate": 5.28135593220339e-06,
15587
+ "loss": 1.4459,
15588
+ "step": 22210
15589
+ },
15590
+ {
15591
+ "epoch": 0.56,
15592
+ "grad_norm": 53.5,
15593
+ "learning_rate": 5.2745762711864416e-06,
15594
+ "loss": 1.3259,
15595
+ "step": 22220
15596
+ },
15597
+ {
15598
+ "epoch": 0.56,
15599
+ "grad_norm": 45.25,
15600
+ "learning_rate": 5.267796610169492e-06,
15601
+ "loss": 1.273,
15602
+ "step": 22230
15603
+ },
15604
+ {
15605
+ "epoch": 0.56,
15606
+ "grad_norm": 35.25,
15607
+ "learning_rate": 5.2610169491525435e-06,
15608
+ "loss": 1.2938,
15609
+ "step": 22240
15610
+ },
15611
+ {
15612
+ "epoch": 0.56,
15613
+ "grad_norm": 5.125,
15614
+ "learning_rate": 5.254237288135594e-06,
15615
+ "loss": 1.3133,
15616
+ "step": 22250
15617
+ },
15618
+ {
15619
+ "epoch": 0.56,
15620
+ "grad_norm": 47.25,
15621
+ "learning_rate": 5.247457627118645e-06,
15622
+ "loss": 1.4179,
15623
+ "step": 22260
15624
+ },
15625
+ {
15626
+ "epoch": 0.56,
15627
+ "grad_norm": 14.4375,
15628
+ "learning_rate": 5.240677966101695e-06,
15629
+ "loss": 1.2183,
15630
+ "step": 22270
15631
+ },
15632
+ {
15633
+ "epoch": 0.56,
15634
+ "grad_norm": 12.125,
15635
+ "learning_rate": 5.233898305084746e-06,
15636
+ "loss": 1.3146,
15637
+ "step": 22280
15638
+ },
15639
+ {
15640
+ "epoch": 0.56,
15641
+ "grad_norm": 16.5,
15642
+ "learning_rate": 5.227118644067797e-06,
15643
+ "loss": 1.3791,
15644
+ "step": 22290
15645
+ },
15646
+ {
15647
+ "epoch": 0.56,
15648
+ "grad_norm": 16.75,
15649
+ "learning_rate": 5.220338983050848e-06,
15650
+ "loss": 1.4089,
15651
+ "step": 22300
15652
+ },
15653
+ {
15654
+ "epoch": 0.56,
15655
+ "grad_norm": 15.25,
15656
+ "learning_rate": 5.213559322033899e-06,
15657
+ "loss": 1.2884,
15658
+ "step": 22310
15659
+ },
15660
+ {
15661
+ "epoch": 0.56,
15662
+ "grad_norm": 15.5625,
15663
+ "learning_rate": 5.20677966101695e-06,
15664
+ "loss": 1.358,
15665
+ "step": 22320
15666
+ },
15667
+ {
15668
+ "epoch": 0.56,
15669
+ "grad_norm": 50.5,
15670
+ "learning_rate": 5.2e-06,
15671
+ "loss": 1.5178,
15672
+ "step": 22330
15673
+ },
15674
+ {
15675
+ "epoch": 0.56,
15676
+ "grad_norm": 16.875,
15677
+ "learning_rate": 5.193220338983052e-06,
15678
+ "loss": 1.4127,
15679
+ "step": 22340
15680
+ },
15681
+ {
15682
+ "epoch": 0.56,
15683
+ "grad_norm": 14.3125,
15684
+ "learning_rate": 5.186440677966102e-06,
15685
+ "loss": 1.2716,
15686
+ "step": 22350
15687
+ },
15688
+ {
15689
+ "epoch": 0.56,
15690
+ "grad_norm": 34.25,
15691
+ "learning_rate": 5.1796610169491535e-06,
15692
+ "loss": 1.2392,
15693
+ "step": 22360
15694
+ },
15695
+ {
15696
+ "epoch": 0.56,
15697
+ "grad_norm": 68.0,
15698
+ "learning_rate": 5.172881355932203e-06,
15699
+ "loss": 1.2263,
15700
+ "step": 22370
15701
+ },
15702
+ {
15703
+ "epoch": 0.56,
15704
+ "grad_norm": 29.875,
15705
+ "learning_rate": 5.166101694915255e-06,
15706
+ "loss": 1.5481,
15707
+ "step": 22380
15708
+ },
15709
+ {
15710
+ "epoch": 0.56,
15711
+ "grad_norm": 27.5,
15712
+ "learning_rate": 5.159322033898305e-06,
15713
+ "loss": 1.4132,
15714
+ "step": 22390
15715
+ },
15716
+ {
15717
+ "epoch": 0.56,
15718
+ "grad_norm": 9.6875,
15719
+ "learning_rate": 5.152542372881356e-06,
15720
+ "loss": 1.2786,
15721
+ "step": 22400
15722
+ },
15723
+ {
15724
+ "epoch": 0.56,
15725
+ "grad_norm": 15.1875,
15726
+ "learning_rate": 5.145762711864407e-06,
15727
+ "loss": 1.3105,
15728
+ "step": 22410
15729
+ },
15730
+ {
15731
+ "epoch": 0.56,
15732
+ "grad_norm": 15.3125,
15733
+ "learning_rate": 5.138983050847458e-06,
15734
+ "loss": 1.2716,
15735
+ "step": 22420
15736
+ },
15737
+ {
15738
+ "epoch": 0.56,
15739
+ "grad_norm": 11.5,
15740
+ "learning_rate": 5.132203389830509e-06,
15741
+ "loss": 1.381,
15742
+ "step": 22430
15743
+ },
15744
+ {
15745
+ "epoch": 0.56,
15746
+ "grad_norm": 28.75,
15747
+ "learning_rate": 5.12542372881356e-06,
15748
+ "loss": 1.2154,
15749
+ "step": 22440
15750
+ },
15751
+ {
15752
+ "epoch": 0.56,
15753
+ "grad_norm": 34.75,
15754
+ "learning_rate": 5.118644067796611e-06,
15755
+ "loss": 1.2588,
15756
+ "step": 22450
15757
+ },
15758
+ {
15759
+ "epoch": 0.56,
15760
+ "grad_norm": 27.625,
15761
+ "learning_rate": 5.111864406779662e-06,
15762
+ "loss": 1.2042,
15763
+ "step": 22460
15764
+ },
15765
+ {
15766
+ "epoch": 0.56,
15767
+ "grad_norm": 21.75,
15768
+ "learning_rate": 5.105084745762711e-06,
15769
+ "loss": 1.3014,
15770
+ "step": 22470
15771
+ },
15772
+ {
15773
+ "epoch": 0.56,
15774
+ "grad_norm": 16.25,
15775
+ "learning_rate": 5.098305084745763e-06,
15776
+ "loss": 1.2872,
15777
+ "step": 22480
15778
+ },
15779
+ {
15780
+ "epoch": 0.56,
15781
+ "grad_norm": 6.34375,
15782
+ "learning_rate": 5.091525423728813e-06,
15783
+ "loss": 1.3333,
15784
+ "step": 22490
15785
+ },
15786
+ {
15787
+ "epoch": 0.56,
15788
+ "grad_norm": 24.375,
15789
+ "learning_rate": 5.084745762711865e-06,
15790
+ "loss": 1.2379,
15791
+ "step": 22500
15792
+ },
15793
+ {
15794
+ "epoch": 0.56,
15795
+ "grad_norm": 15.9375,
15796
+ "learning_rate": 5.077966101694915e-06,
15797
+ "loss": 1.3752,
15798
+ "step": 22510
15799
+ },
15800
+ {
15801
+ "epoch": 0.56,
15802
+ "grad_norm": 36.0,
15803
+ "learning_rate": 5.071186440677967e-06,
15804
+ "loss": 1.3533,
15805
+ "step": 22520
15806
+ },
15807
+ {
15808
+ "epoch": 0.56,
15809
+ "grad_norm": 15.6875,
15810
+ "learning_rate": 5.064406779661017e-06,
15811
+ "loss": 1.2247,
15812
+ "step": 22530
15813
+ },
15814
+ {
15815
+ "epoch": 0.56,
15816
+ "grad_norm": 30.125,
15817
+ "learning_rate": 5.057627118644069e-06,
15818
+ "loss": 1.3658,
15819
+ "step": 22540
15820
+ },
15821
+ {
15822
+ "epoch": 0.56,
15823
+ "grad_norm": 16.0,
15824
+ "learning_rate": 5.050847457627119e-06,
15825
+ "loss": 1.4848,
15826
+ "step": 22550
15827
+ },
15828
+ {
15829
+ "epoch": 0.56,
15830
+ "grad_norm": 17.625,
15831
+ "learning_rate": 5.0440677966101705e-06,
15832
+ "loss": 1.3547,
15833
+ "step": 22560
15834
+ },
15835
+ {
15836
+ "epoch": 0.56,
15837
+ "grad_norm": 30.5,
15838
+ "learning_rate": 5.037288135593221e-06,
15839
+ "loss": 1.3617,
15840
+ "step": 22570
15841
+ },
15842
+ {
15843
+ "epoch": 0.56,
15844
+ "grad_norm": 11.8125,
15845
+ "learning_rate": 5.030508474576271e-06,
15846
+ "loss": 1.3552,
15847
+ "step": 22580
15848
+ },
15849
+ {
15850
+ "epoch": 0.56,
15851
+ "grad_norm": 29.375,
15852
+ "learning_rate": 5.023728813559322e-06,
15853
+ "loss": 1.4876,
15854
+ "step": 22590
15855
+ },
15856
+ {
15857
+ "epoch": 0.56,
15858
+ "grad_norm": 32.75,
15859
+ "learning_rate": 5.016949152542373e-06,
15860
+ "loss": 1.2906,
15861
+ "step": 22600
15862
+ },
15863
+ {
15864
+ "epoch": 0.57,
15865
+ "grad_norm": 28.625,
15866
+ "learning_rate": 5.010169491525424e-06,
15867
+ "loss": 1.3796,
15868
+ "step": 22610
15869
+ },
15870
+ {
15871
+ "epoch": 0.57,
15872
+ "grad_norm": 47.75,
15873
+ "learning_rate": 5.003389830508475e-06,
15874
+ "loss": 1.4018,
15875
+ "step": 22620
15876
+ },
15877
+ {
15878
+ "epoch": 0.57,
15879
+ "grad_norm": 29.625,
15880
+ "learning_rate": 4.996610169491526e-06,
15881
+ "loss": 1.1936,
15882
+ "step": 22630
15883
+ },
15884
+ {
15885
+ "epoch": 0.57,
15886
+ "grad_norm": 62.0,
15887
+ "learning_rate": 4.989830508474577e-06,
15888
+ "loss": 1.4104,
15889
+ "step": 22640
15890
+ },
15891
+ {
15892
+ "epoch": 0.57,
15893
+ "grad_norm": 16.0,
15894
+ "learning_rate": 4.983050847457628e-06,
15895
+ "loss": 1.2701,
15896
+ "step": 22650
15897
+ },
15898
+ {
15899
+ "epoch": 0.57,
15900
+ "grad_norm": 37.5,
15901
+ "learning_rate": 4.976271186440678e-06,
15902
+ "loss": 1.3059,
15903
+ "step": 22660
15904
+ },
15905
+ {
15906
+ "epoch": 0.57,
15907
+ "grad_norm": 16.5,
15908
+ "learning_rate": 4.969491525423729e-06,
15909
+ "loss": 1.313,
15910
+ "step": 22670
15911
+ },
15912
+ {
15913
+ "epoch": 0.57,
15914
+ "grad_norm": 6.75,
15915
+ "learning_rate": 4.96271186440678e-06,
15916
+ "loss": 1.3277,
15917
+ "step": 22680
15918
+ },
15919
+ {
15920
+ "epoch": 0.57,
15921
+ "grad_norm": 13.25,
15922
+ "learning_rate": 4.955932203389831e-06,
15923
+ "loss": 1.1521,
15924
+ "step": 22690
15925
+ },
15926
+ {
15927
+ "epoch": 0.57,
15928
+ "grad_norm": 13.4375,
15929
+ "learning_rate": 4.949152542372882e-06,
15930
+ "loss": 1.4065,
15931
+ "step": 22700
15932
+ },
15933
+ {
15934
+ "epoch": 0.57,
15935
+ "grad_norm": 25.0,
15936
+ "learning_rate": 4.942372881355932e-06,
15937
+ "loss": 1.3772,
15938
+ "step": 22710
15939
+ },
15940
+ {
15941
+ "epoch": 0.57,
15942
+ "grad_norm": 31.125,
15943
+ "learning_rate": 4.935593220338984e-06,
15944
+ "loss": 1.3597,
15945
+ "step": 22720
15946
+ },
15947
+ {
15948
+ "epoch": 0.57,
15949
+ "grad_norm": 12.625,
15950
+ "learning_rate": 4.928813559322034e-06,
15951
+ "loss": 1.3534,
15952
+ "step": 22730
15953
+ },
15954
+ {
15955
+ "epoch": 0.57,
15956
+ "grad_norm": 15.5,
15957
+ "learning_rate": 4.922033898305086e-06,
15958
+ "loss": 1.4141,
15959
+ "step": 22740
15960
+ },
15961
+ {
15962
+ "epoch": 0.57,
15963
+ "grad_norm": 26.375,
15964
+ "learning_rate": 4.915254237288136e-06,
15965
+ "loss": 1.4019,
15966
+ "step": 22750
15967
+ },
15968
+ {
15969
+ "epoch": 0.57,
15970
+ "grad_norm": 40.25,
15971
+ "learning_rate": 4.908474576271187e-06,
15972
+ "loss": 1.273,
15973
+ "step": 22760
15974
+ },
15975
+ {
15976
+ "epoch": 0.57,
15977
+ "grad_norm": 19.25,
15978
+ "learning_rate": 4.901694915254237e-06,
15979
+ "loss": 1.3954,
15980
+ "step": 22770
15981
+ },
15982
+ {
15983
+ "epoch": 0.57,
15984
+ "grad_norm": 26.875,
15985
+ "learning_rate": 4.894915254237289e-06,
15986
+ "loss": 1.4013,
15987
+ "step": 22780
15988
+ },
15989
+ {
15990
+ "epoch": 0.57,
15991
+ "grad_norm": 28.0,
15992
+ "learning_rate": 4.888135593220339e-06,
15993
+ "loss": 1.428,
15994
+ "step": 22790
15995
+ },
15996
+ {
15997
+ "epoch": 0.57,
15998
+ "grad_norm": 27.0,
15999
+ "learning_rate": 4.881355932203391e-06,
16000
+ "loss": 1.1701,
16001
+ "step": 22800
16002
+ },
16003
+ {
16004
+ "epoch": 0.57,
16005
+ "grad_norm": 78.0,
16006
+ "learning_rate": 4.874576271186441e-06,
16007
+ "loss": 1.3312,
16008
+ "step": 22810
16009
+ },
16010
+ {
16011
+ "epoch": 0.57,
16012
+ "grad_norm": 26.375,
16013
+ "learning_rate": 4.867796610169492e-06,
16014
+ "loss": 1.2204,
16015
+ "step": 22820
16016
+ },
16017
+ {
16018
+ "epoch": 0.57,
16019
+ "grad_norm": 32.5,
16020
+ "learning_rate": 4.861016949152543e-06,
16021
+ "loss": 1.4364,
16022
+ "step": 22830
16023
+ },
16024
+ {
16025
+ "epoch": 0.57,
16026
+ "grad_norm": 65.5,
16027
+ "learning_rate": 4.854237288135594e-06,
16028
+ "loss": 1.5421,
16029
+ "step": 22840
16030
+ },
16031
+ {
16032
+ "epoch": 0.57,
16033
+ "grad_norm": 48.25,
16034
+ "learning_rate": 4.847457627118645e-06,
16035
+ "loss": 1.4326,
16036
+ "step": 22850
16037
+ },
16038
+ {
16039
+ "epoch": 0.57,
16040
+ "grad_norm": 47.25,
16041
+ "learning_rate": 4.840677966101695e-06,
16042
+ "loss": 1.243,
16043
+ "step": 22860
16044
+ },
16045
+ {
16046
+ "epoch": 0.57,
16047
+ "grad_norm": 12.1875,
16048
+ "learning_rate": 4.833898305084746e-06,
16049
+ "loss": 1.3397,
16050
+ "step": 22870
16051
+ },
16052
+ {
16053
+ "epoch": 0.57,
16054
+ "grad_norm": 20.5,
16055
+ "learning_rate": 4.827118644067797e-06,
16056
+ "loss": 1.2613,
16057
+ "step": 22880
16058
+ },
16059
+ {
16060
+ "epoch": 0.57,
16061
+ "grad_norm": 37.5,
16062
+ "learning_rate": 4.820338983050848e-06,
16063
+ "loss": 1.3743,
16064
+ "step": 22890
16065
+ },
16066
+ {
16067
+ "epoch": 0.57,
16068
+ "grad_norm": 43.25,
16069
+ "learning_rate": 4.813559322033899e-06,
16070
+ "loss": 1.4017,
16071
+ "step": 22900
16072
+ },
16073
+ {
16074
+ "epoch": 0.57,
16075
+ "grad_norm": 11.375,
16076
+ "learning_rate": 4.80677966101695e-06,
16077
+ "loss": 1.268,
16078
+ "step": 22910
16079
+ },
16080
+ {
16081
+ "epoch": 0.57,
16082
+ "grad_norm": 59.0,
16083
+ "learning_rate": 4.800000000000001e-06,
16084
+ "loss": 1.347,
16085
+ "step": 22920
16086
+ },
16087
+ {
16088
+ "epoch": 0.57,
16089
+ "grad_norm": 42.75,
16090
+ "learning_rate": 4.793220338983051e-06,
16091
+ "loss": 1.3816,
16092
+ "step": 22930
16093
+ },
16094
+ {
16095
+ "epoch": 0.57,
16096
+ "grad_norm": 38.5,
16097
+ "learning_rate": 4.786440677966102e-06,
16098
+ "loss": 1.4551,
16099
+ "step": 22940
16100
+ },
16101
+ {
16102
+ "epoch": 0.57,
16103
+ "grad_norm": 28.875,
16104
+ "learning_rate": 4.779661016949153e-06,
16105
+ "loss": 1.3871,
16106
+ "step": 22950
16107
+ },
16108
+ {
16109
+ "epoch": 0.57,
16110
+ "grad_norm": 14.375,
16111
+ "learning_rate": 4.772881355932204e-06,
16112
+ "loss": 1.2011,
16113
+ "step": 22960
16114
+ },
16115
+ {
16116
+ "epoch": 0.57,
16117
+ "grad_norm": 31.875,
16118
+ "learning_rate": 4.766101694915254e-06,
16119
+ "loss": 1.3172,
16120
+ "step": 22970
16121
+ },
16122
+ {
16123
+ "epoch": 0.57,
16124
+ "grad_norm": 41.75,
16125
+ "learning_rate": 4.759322033898306e-06,
16126
+ "loss": 1.1602,
16127
+ "step": 22980
16128
+ },
16129
+ {
16130
+ "epoch": 0.57,
16131
+ "grad_norm": 27.875,
16132
+ "learning_rate": 4.752542372881356e-06,
16133
+ "loss": 1.1804,
16134
+ "step": 22990
16135
+ },
16136
+ {
16137
+ "epoch": 0.57,
16138
+ "grad_norm": 17.0,
16139
+ "learning_rate": 4.745762711864408e-06,
16140
+ "loss": 1.4837,
16141
+ "step": 23000
16142
+ },
16143
+ {
16144
+ "epoch": 0.58,
16145
+ "grad_norm": 61.75,
16146
+ "learning_rate": 4.738983050847458e-06,
16147
+ "loss": 1.4385,
16148
+ "step": 23010
16149
+ },
16150
+ {
16151
+ "epoch": 0.58,
16152
+ "grad_norm": 44.0,
16153
+ "learning_rate": 4.732203389830509e-06,
16154
+ "loss": 1.3842,
16155
+ "step": 23020
16156
+ },
16157
+ {
16158
+ "epoch": 0.58,
16159
+ "grad_norm": 51.25,
16160
+ "learning_rate": 4.725423728813559e-06,
16161
+ "loss": 1.4563,
16162
+ "step": 23030
16163
+ },
16164
+ {
16165
+ "epoch": 0.58,
16166
+ "grad_norm": 28.875,
16167
+ "learning_rate": 4.718644067796611e-06,
16168
+ "loss": 1.3894,
16169
+ "step": 23040
16170
+ },
16171
+ {
16172
+ "epoch": 0.58,
16173
+ "grad_norm": 144.0,
16174
+ "learning_rate": 4.711864406779661e-06,
16175
+ "loss": 1.266,
16176
+ "step": 23050
16177
+ },
16178
+ {
16179
+ "epoch": 0.58,
16180
+ "grad_norm": 9.8125,
16181
+ "learning_rate": 4.705084745762713e-06,
16182
+ "loss": 1.446,
16183
+ "step": 23060
16184
+ },
16185
+ {
16186
+ "epoch": 0.58,
16187
+ "grad_norm": 17.125,
16188
+ "learning_rate": 4.698305084745763e-06,
16189
+ "loss": 1.2639,
16190
+ "step": 23070
16191
+ },
16192
+ {
16193
+ "epoch": 0.58,
16194
+ "grad_norm": 11.125,
16195
+ "learning_rate": 4.691525423728814e-06,
16196
+ "loss": 1.3835,
16197
+ "step": 23080
16198
+ },
16199
+ {
16200
+ "epoch": 0.58,
16201
+ "grad_norm": 30.75,
16202
+ "learning_rate": 4.684745762711865e-06,
16203
+ "loss": 1.3717,
16204
+ "step": 23090
16205
+ },
16206
+ {
16207
+ "epoch": 0.58,
16208
+ "grad_norm": 28.125,
16209
+ "learning_rate": 4.677966101694916e-06,
16210
+ "loss": 1.4138,
16211
+ "step": 23100
16212
+ },
16213
+ {
16214
+ "epoch": 0.58,
16215
+ "grad_norm": 10.75,
16216
+ "learning_rate": 4.671186440677967e-06,
16217
+ "loss": 1.4599,
16218
+ "step": 23110
16219
+ },
16220
+ {
16221
+ "epoch": 0.58,
16222
+ "grad_norm": 36.75,
16223
+ "learning_rate": 4.664406779661017e-06,
16224
+ "loss": 1.2974,
16225
+ "step": 23120
16226
+ },
16227
+ {
16228
+ "epoch": 0.58,
16229
+ "grad_norm": 11.9375,
16230
+ "learning_rate": 4.657627118644068e-06,
16231
+ "loss": 1.4729,
16232
+ "step": 23130
16233
+ },
16234
+ {
16235
+ "epoch": 0.58,
16236
+ "grad_norm": 19.25,
16237
+ "learning_rate": 4.650847457627119e-06,
16238
+ "loss": 1.4423,
16239
+ "step": 23140
16240
+ },
16241
+ {
16242
+ "epoch": 0.58,
16243
+ "grad_norm": 22.375,
16244
+ "learning_rate": 4.64406779661017e-06,
16245
+ "loss": 1.2612,
16246
+ "step": 23150
16247
+ },
16248
+ {
16249
+ "epoch": 0.58,
16250
+ "grad_norm": 40.5,
16251
+ "learning_rate": 4.637288135593221e-06,
16252
+ "loss": 1.1974,
16253
+ "step": 23160
16254
+ },
16255
+ {
16256
+ "epoch": 0.58,
16257
+ "grad_norm": 28.75,
16258
+ "learning_rate": 4.630508474576272e-06,
16259
+ "loss": 1.3801,
16260
+ "step": 23170
16261
+ },
16262
+ {
16263
+ "epoch": 0.58,
16264
+ "grad_norm": 33.75,
16265
+ "learning_rate": 4.623728813559323e-06,
16266
+ "loss": 1.248,
16267
+ "step": 23180
16268
+ },
16269
+ {
16270
+ "epoch": 0.58,
16271
+ "grad_norm": 34.0,
16272
+ "learning_rate": 4.616949152542373e-06,
16273
+ "loss": 1.4849,
16274
+ "step": 23190
16275
+ },
16276
+ {
16277
+ "epoch": 0.58,
16278
+ "grad_norm": 31.25,
16279
+ "learning_rate": 4.610169491525424e-06,
16280
+ "loss": 1.2063,
16281
+ "step": 23200
16282
+ },
16283
+ {
16284
+ "epoch": 0.58,
16285
+ "grad_norm": 33.25,
16286
+ "learning_rate": 4.603389830508475e-06,
16287
+ "loss": 1.449,
16288
+ "step": 23210
16289
+ },
16290
+ {
16291
+ "epoch": 0.58,
16292
+ "grad_norm": 24.875,
16293
+ "learning_rate": 4.596610169491526e-06,
16294
+ "loss": 1.327,
16295
+ "step": 23220
16296
+ },
16297
+ {
16298
+ "epoch": 0.58,
16299
+ "grad_norm": 14.5,
16300
+ "learning_rate": 4.589830508474576e-06,
16301
+ "loss": 1.2771,
16302
+ "step": 23230
16303
+ },
16304
+ {
16305
+ "epoch": 0.58,
16306
+ "grad_norm": 16.75,
16307
+ "learning_rate": 4.583050847457628e-06,
16308
+ "loss": 1.1508,
16309
+ "step": 23240
16310
+ },
16311
+ {
16312
+ "epoch": 0.58,
16313
+ "grad_norm": 31.125,
16314
+ "learning_rate": 4.576271186440678e-06,
16315
+ "loss": 1.2125,
16316
+ "step": 23250
16317
+ },
16318
+ {
16319
+ "epoch": 0.58,
16320
+ "grad_norm": 17.625,
16321
+ "learning_rate": 4.56949152542373e-06,
16322
+ "loss": 1.3442,
16323
+ "step": 23260
16324
+ },
16325
+ {
16326
+ "epoch": 0.58,
16327
+ "grad_norm": 37.25,
16328
+ "learning_rate": 4.56271186440678e-06,
16329
+ "loss": 1.238,
16330
+ "step": 23270
16331
+ },
16332
+ {
16333
+ "epoch": 0.58,
16334
+ "grad_norm": 21.0,
16335
+ "learning_rate": 4.555932203389831e-06,
16336
+ "loss": 1.3523,
16337
+ "step": 23280
16338
+ },
16339
+ {
16340
+ "epoch": 0.58,
16341
+ "grad_norm": 16.375,
16342
+ "learning_rate": 4.549152542372881e-06,
16343
+ "loss": 1.3336,
16344
+ "step": 23290
16345
+ },
16346
+ {
16347
+ "epoch": 0.58,
16348
+ "grad_norm": 28.75,
16349
+ "learning_rate": 4.542372881355933e-06,
16350
+ "loss": 1.1508,
16351
+ "step": 23300
16352
+ },
16353
+ {
16354
+ "epoch": 0.58,
16355
+ "grad_norm": 47.25,
16356
+ "learning_rate": 4.535593220338983e-06,
16357
+ "loss": 1.3125,
16358
+ "step": 23310
16359
+ },
16360
+ {
16361
+ "epoch": 0.58,
16362
+ "grad_norm": 23.875,
16363
+ "learning_rate": 4.528813559322035e-06,
16364
+ "loss": 1.2634,
16365
+ "step": 23320
16366
+ },
16367
+ {
16368
+ "epoch": 0.58,
16369
+ "grad_norm": 23.25,
16370
+ "learning_rate": 4.522033898305085e-06,
16371
+ "loss": 1.2934,
16372
+ "step": 23330
16373
+ },
16374
+ {
16375
+ "epoch": 0.58,
16376
+ "grad_norm": 58.5,
16377
+ "learning_rate": 4.515254237288136e-06,
16378
+ "loss": 1.2535,
16379
+ "step": 23340
16380
+ },
16381
+ {
16382
+ "epoch": 0.58,
16383
+ "grad_norm": 17.5,
16384
+ "learning_rate": 4.508474576271187e-06,
16385
+ "loss": 1.1762,
16386
+ "step": 23350
16387
+ },
16388
+ {
16389
+ "epoch": 0.58,
16390
+ "grad_norm": 25.5,
16391
+ "learning_rate": 4.501694915254238e-06,
16392
+ "loss": 1.396,
16393
+ "step": 23360
16394
+ },
16395
+ {
16396
+ "epoch": 0.58,
16397
+ "grad_norm": 63.5,
16398
+ "learning_rate": 4.494915254237289e-06,
16399
+ "loss": 1.3609,
16400
+ "step": 23370
16401
+ },
16402
+ {
16403
+ "epoch": 0.58,
16404
+ "grad_norm": 42.5,
16405
+ "learning_rate": 4.488135593220339e-06,
16406
+ "loss": 1.3194,
16407
+ "step": 23380
16408
+ },
16409
+ {
16410
+ "epoch": 0.58,
16411
+ "grad_norm": 50.5,
16412
+ "learning_rate": 4.48135593220339e-06,
16413
+ "loss": 1.4122,
16414
+ "step": 23390
16415
+ },
16416
+ {
16417
+ "epoch": 0.58,
16418
+ "grad_norm": 50.5,
16419
+ "learning_rate": 4.474576271186441e-06,
16420
+ "loss": 1.4809,
16421
+ "step": 23400
16422
+ },
16423
+ {
16424
+ "epoch": 0.59,
16425
+ "grad_norm": 44.25,
16426
+ "learning_rate": 4.467796610169492e-06,
16427
+ "loss": 1.2724,
16428
+ "step": 23410
16429
+ },
16430
+ {
16431
+ "epoch": 0.59,
16432
+ "grad_norm": 28.375,
16433
+ "learning_rate": 4.461016949152543e-06,
16434
+ "loss": 1.3694,
16435
+ "step": 23420
16436
+ },
16437
+ {
16438
+ "epoch": 0.59,
16439
+ "grad_norm": 10.3125,
16440
+ "learning_rate": 4.454237288135594e-06,
16441
+ "loss": 1.4755,
16442
+ "step": 23430
16443
+ },
16444
+ {
16445
+ "epoch": 0.59,
16446
+ "grad_norm": 16.375,
16447
+ "learning_rate": 4.447457627118645e-06,
16448
+ "loss": 1.2465,
16449
+ "step": 23440
16450
+ },
16451
+ {
16452
+ "epoch": 0.59,
16453
+ "grad_norm": 32.25,
16454
+ "learning_rate": 4.440677966101695e-06,
16455
+ "loss": 1.302,
16456
+ "step": 23450
16457
+ },
16458
+ {
16459
+ "epoch": 0.59,
16460
+ "grad_norm": 44.75,
16461
+ "learning_rate": 4.433898305084746e-06,
16462
+ "loss": 1.1708,
16463
+ "step": 23460
16464
+ },
16465
+ {
16466
+ "epoch": 0.59,
16467
+ "grad_norm": 55.5,
16468
+ "learning_rate": 4.427118644067797e-06,
16469
+ "loss": 1.4715,
16470
+ "step": 23470
16471
+ },
16472
+ {
16473
+ "epoch": 0.59,
16474
+ "grad_norm": 29.625,
16475
+ "learning_rate": 4.420338983050848e-06,
16476
+ "loss": 1.316,
16477
+ "step": 23480
16478
+ },
16479
+ {
16480
+ "epoch": 0.59,
16481
+ "grad_norm": 15.625,
16482
+ "learning_rate": 4.413559322033898e-06,
16483
+ "loss": 1.4471,
16484
+ "step": 23490
16485
+ },
16486
+ {
16487
+ "epoch": 0.59,
16488
+ "grad_norm": 27.625,
16489
+ "learning_rate": 4.40677966101695e-06,
16490
+ "loss": 1.1327,
16491
+ "step": 23500
16492
+ },
16493
+ {
16494
+ "epoch": 0.59,
16495
+ "grad_norm": 21.25,
16496
+ "learning_rate": 4.4e-06,
16497
+ "loss": 1.1244,
16498
+ "step": 23510
16499
+ },
16500
+ {
16501
+ "epoch": 0.59,
16502
+ "grad_norm": 15.8125,
16503
+ "learning_rate": 4.393220338983052e-06,
16504
+ "loss": 1.278,
16505
+ "step": 23520
16506
+ },
16507
+ {
16508
+ "epoch": 0.59,
16509
+ "grad_norm": 14.1875,
16510
+ "learning_rate": 4.386440677966102e-06,
16511
+ "loss": 1.3043,
16512
+ "step": 23530
16513
+ },
16514
+ {
16515
+ "epoch": 0.59,
16516
+ "grad_norm": 22.5,
16517
+ "learning_rate": 4.379661016949153e-06,
16518
+ "loss": 1.3161,
16519
+ "step": 23540
16520
+ },
16521
+ {
16522
+ "epoch": 0.59,
16523
+ "grad_norm": 14.8125,
16524
+ "learning_rate": 4.372881355932203e-06,
16525
+ "loss": 1.5512,
16526
+ "step": 23550
16527
+ },
16528
+ {
16529
+ "epoch": 0.59,
16530
+ "grad_norm": 46.25,
16531
+ "learning_rate": 4.366101694915255e-06,
16532
+ "loss": 1.2756,
16533
+ "step": 23560
16534
+ },
16535
+ {
16536
+ "epoch": 0.59,
16537
+ "grad_norm": 37.75,
16538
+ "learning_rate": 4.359322033898305e-06,
16539
+ "loss": 1.2049,
16540
+ "step": 23570
16541
+ },
16542
+ {
16543
+ "epoch": 0.59,
16544
+ "grad_norm": 14.4375,
16545
+ "learning_rate": 4.352542372881357e-06,
16546
+ "loss": 1.5071,
16547
+ "step": 23580
16548
+ },
16549
+ {
16550
+ "epoch": 0.59,
16551
+ "grad_norm": 24.75,
16552
+ "learning_rate": 4.345762711864407e-06,
16553
+ "loss": 1.3423,
16554
+ "step": 23590
16555
+ },
16556
+ {
16557
+ "epoch": 0.59,
16558
+ "grad_norm": 24.75,
16559
+ "learning_rate": 4.338983050847458e-06,
16560
+ "loss": 1.1705,
16561
+ "step": 23600
16562
+ },
16563
+ {
16564
+ "epoch": 0.59,
16565
+ "grad_norm": 39.25,
16566
+ "learning_rate": 4.332203389830509e-06,
16567
+ "loss": 1.2632,
16568
+ "step": 23610
16569
+ },
16570
+ {
16571
+ "epoch": 0.59,
16572
+ "grad_norm": 17.5,
16573
+ "learning_rate": 4.32542372881356e-06,
16574
+ "loss": 1.4266,
16575
+ "step": 23620
16576
+ },
16577
+ {
16578
+ "epoch": 0.59,
16579
+ "grad_norm": 51.0,
16580
+ "learning_rate": 4.318644067796611e-06,
16581
+ "loss": 1.4648,
16582
+ "step": 23630
16583
+ },
16584
+ {
16585
+ "epoch": 0.59,
16586
+ "grad_norm": 19.25,
16587
+ "learning_rate": 4.311864406779661e-06,
16588
+ "loss": 1.3572,
16589
+ "step": 23640
16590
+ },
16591
+ {
16592
+ "epoch": 0.59,
16593
+ "grad_norm": 11.75,
16594
+ "learning_rate": 4.305084745762712e-06,
16595
+ "loss": 1.3391,
16596
+ "step": 23650
16597
+ },
16598
+ {
16599
+ "epoch": 0.59,
16600
+ "grad_norm": 24.0,
16601
+ "learning_rate": 4.298305084745763e-06,
16602
+ "loss": 1.3104,
16603
+ "step": 23660
16604
+ },
16605
+ {
16606
+ "epoch": 0.59,
16607
+ "grad_norm": 15.25,
16608
+ "learning_rate": 4.291525423728814e-06,
16609
+ "loss": 1.2981,
16610
+ "step": 23670
16611
+ },
16612
+ {
16613
+ "epoch": 0.59,
16614
+ "grad_norm": 14.875,
16615
+ "learning_rate": 4.284745762711865e-06,
16616
+ "loss": 1.3867,
16617
+ "step": 23680
16618
+ },
16619
+ {
16620
+ "epoch": 0.59,
16621
+ "grad_norm": 12.875,
16622
+ "learning_rate": 4.277966101694915e-06,
16623
+ "loss": 1.2998,
16624
+ "step": 23690
16625
+ },
16626
+ {
16627
+ "epoch": 0.59,
16628
+ "grad_norm": 25.75,
16629
+ "learning_rate": 4.271186440677967e-06,
16630
+ "loss": 1.4142,
16631
+ "step": 23700
16632
+ },
16633
+ {
16634
+ "epoch": 0.59,
16635
+ "grad_norm": 11.0625,
16636
+ "learning_rate": 4.264406779661017e-06,
16637
+ "loss": 1.4814,
16638
+ "step": 23710
16639
+ },
16640
+ {
16641
+ "epoch": 0.59,
16642
+ "grad_norm": 21.375,
16643
+ "learning_rate": 4.257627118644068e-06,
16644
+ "loss": 1.527,
16645
+ "step": 23720
16646
+ },
16647
+ {
16648
+ "epoch": 0.59,
16649
+ "grad_norm": 11.375,
16650
+ "learning_rate": 4.250847457627119e-06,
16651
+ "loss": 1.2311,
16652
+ "step": 23730
16653
+ },
16654
+ {
16655
+ "epoch": 0.59,
16656
+ "grad_norm": 35.75,
16657
+ "learning_rate": 4.24406779661017e-06,
16658
+ "loss": 1.4337,
16659
+ "step": 23740
16660
+ },
16661
+ {
16662
+ "epoch": 0.59,
16663
+ "grad_norm": 12.1875,
16664
+ "learning_rate": 4.23728813559322e-06,
16665
+ "loss": 1.2348,
16666
+ "step": 23750
16667
+ },
16668
+ {
16669
+ "epoch": 0.59,
16670
+ "grad_norm": 11.5,
16671
+ "learning_rate": 4.230508474576272e-06,
16672
+ "loss": 1.2744,
16673
+ "step": 23760
16674
+ },
16675
+ {
16676
+ "epoch": 0.59,
16677
+ "grad_norm": 24.875,
16678
+ "learning_rate": 4.223728813559322e-06,
16679
+ "loss": 1.1926,
16680
+ "step": 23770
16681
+ },
16682
+ {
16683
+ "epoch": 0.59,
16684
+ "grad_norm": 33.5,
16685
+ "learning_rate": 4.216949152542374e-06,
16686
+ "loss": 1.3709,
16687
+ "step": 23780
16688
+ },
16689
+ {
16690
+ "epoch": 0.59,
16691
+ "grad_norm": 44.0,
16692
+ "learning_rate": 4.210169491525424e-06,
16693
+ "loss": 1.2787,
16694
+ "step": 23790
16695
+ },
16696
+ {
16697
+ "epoch": 0.59,
16698
+ "grad_norm": 60.5,
16699
+ "learning_rate": 4.203389830508475e-06,
16700
+ "loss": 1.5057,
16701
+ "step": 23800
16702
+ },
16703
+ {
16704
+ "epoch": 0.6,
16705
+ "grad_norm": 24.875,
16706
+ "learning_rate": 4.196610169491525e-06,
16707
+ "loss": 1.3983,
16708
+ "step": 23810
16709
+ },
16710
+ {
16711
+ "epoch": 0.6,
16712
+ "grad_norm": 87.0,
16713
+ "learning_rate": 4.189830508474577e-06,
16714
+ "loss": 1.266,
16715
+ "step": 23820
16716
+ },
16717
+ {
16718
+ "epoch": 0.6,
16719
+ "grad_norm": 35.75,
16720
+ "learning_rate": 4.183050847457627e-06,
16721
+ "loss": 1.5536,
16722
+ "step": 23830
16723
+ },
16724
+ {
16725
+ "epoch": 0.6,
16726
+ "grad_norm": 13.25,
16727
+ "learning_rate": 4.176271186440679e-06,
16728
+ "loss": 1.4579,
16729
+ "step": 23840
16730
+ },
16731
+ {
16732
+ "epoch": 0.6,
16733
+ "grad_norm": 13.0625,
16734
+ "learning_rate": 4.169491525423729e-06,
16735
+ "loss": 1.3378,
16736
+ "step": 23850
16737
+ },
16738
+ {
16739
+ "epoch": 0.6,
16740
+ "grad_norm": 11.6875,
16741
+ "learning_rate": 4.16271186440678e-06,
16742
+ "loss": 1.4234,
16743
+ "step": 23860
16744
+ },
16745
+ {
16746
+ "epoch": 0.6,
16747
+ "grad_norm": 36.75,
16748
+ "learning_rate": 4.155932203389831e-06,
16749
+ "loss": 1.1912,
16750
+ "step": 23870
16751
+ },
16752
+ {
16753
+ "epoch": 0.6,
16754
+ "grad_norm": 26.5,
16755
+ "learning_rate": 4.149152542372882e-06,
16756
+ "loss": 1.3767,
16757
+ "step": 23880
16758
+ },
16759
+ {
16760
+ "epoch": 0.6,
16761
+ "grad_norm": 12.375,
16762
+ "learning_rate": 4.142372881355933e-06,
16763
+ "loss": 1.4622,
16764
+ "step": 23890
16765
+ },
16766
+ {
16767
+ "epoch": 0.6,
16768
+ "grad_norm": 42.25,
16769
+ "learning_rate": 4.135593220338983e-06,
16770
+ "loss": 1.4049,
16771
+ "step": 23900
16772
+ },
16773
+ {
16774
+ "epoch": 0.6,
16775
+ "grad_norm": 61.5,
16776
+ "learning_rate": 4.128813559322034e-06,
16777
+ "loss": 1.4161,
16778
+ "step": 23910
16779
+ },
16780
+ {
16781
+ "epoch": 0.6,
16782
+ "grad_norm": 12.5625,
16783
+ "learning_rate": 4.122033898305085e-06,
16784
+ "loss": 1.2148,
16785
+ "step": 23920
16786
+ },
16787
+ {
16788
+ "epoch": 0.6,
16789
+ "grad_norm": 43.0,
16790
+ "learning_rate": 4.115254237288136e-06,
16791
+ "loss": 1.4088,
16792
+ "step": 23930
16793
+ },
16794
+ {
16795
+ "epoch": 0.6,
16796
+ "grad_norm": 34.25,
16797
+ "learning_rate": 4.108474576271187e-06,
16798
+ "loss": 1.4125,
16799
+ "step": 23940
16800
+ },
16801
+ {
16802
+ "epoch": 0.6,
16803
+ "grad_norm": 69.0,
16804
+ "learning_rate": 4.101694915254237e-06,
16805
+ "loss": 1.4205,
16806
+ "step": 23950
16807
+ },
16808
+ {
16809
+ "epoch": 0.6,
16810
+ "grad_norm": 29.375,
16811
+ "learning_rate": 4.094915254237289e-06,
16812
+ "loss": 1.1675,
16813
+ "step": 23960
16814
+ },
16815
+ {
16816
+ "epoch": 0.6,
16817
+ "grad_norm": 27.125,
16818
+ "learning_rate": 4.088135593220339e-06,
16819
+ "loss": 1.2709,
16820
+ "step": 23970
16821
+ },
16822
+ {
16823
+ "epoch": 0.6,
16824
+ "grad_norm": 6.84375,
16825
+ "learning_rate": 4.081355932203391e-06,
16826
+ "loss": 1.236,
16827
+ "step": 23980
16828
+ },
16829
+ {
16830
+ "epoch": 0.6,
16831
+ "grad_norm": 40.75,
16832
+ "learning_rate": 4.074576271186441e-06,
16833
+ "loss": 1.2984,
16834
+ "step": 23990
16835
+ },
16836
+ {
16837
+ "epoch": 0.6,
16838
+ "grad_norm": 24.5,
16839
+ "learning_rate": 4.067796610169492e-06,
16840
+ "loss": 1.2543,
16841
+ "step": 24000
16842
+ },
16843
+ {
16844
+ "epoch": 0.6,
16845
+ "grad_norm": 17.0,
16846
+ "learning_rate": 4.061016949152542e-06,
16847
+ "loss": 1.1977,
16848
+ "step": 24010
16849
+ },
16850
+ {
16851
+ "epoch": 0.6,
16852
+ "grad_norm": 9.8125,
16853
+ "learning_rate": 4.054237288135594e-06,
16854
+ "loss": 1.2005,
16855
+ "step": 24020
16856
+ },
16857
+ {
16858
+ "epoch": 0.6,
16859
+ "grad_norm": 28.875,
16860
+ "learning_rate": 4.047457627118644e-06,
16861
+ "loss": 1.3207,
16862
+ "step": 24030
16863
+ },
16864
+ {
16865
+ "epoch": 0.6,
16866
+ "grad_norm": 19.25,
16867
+ "learning_rate": 4.040677966101696e-06,
16868
+ "loss": 1.2664,
16869
+ "step": 24040
16870
+ },
16871
+ {
16872
+ "epoch": 0.6,
16873
+ "grad_norm": 26.0,
16874
+ "learning_rate": 4.033898305084746e-06,
16875
+ "loss": 1.2086,
16876
+ "step": 24050
16877
+ },
16878
+ {
16879
+ "epoch": 0.6,
16880
+ "grad_norm": 36.5,
16881
+ "learning_rate": 4.027118644067797e-06,
16882
+ "loss": 1.4535,
16883
+ "step": 24060
16884
+ },
16885
+ {
16886
+ "epoch": 0.6,
16887
+ "grad_norm": 22.25,
16888
+ "learning_rate": 4.020338983050847e-06,
16889
+ "loss": 1.411,
16890
+ "step": 24070
16891
+ },
16892
+ {
16893
+ "epoch": 0.6,
16894
+ "grad_norm": 7.90625,
16895
+ "learning_rate": 4.013559322033899e-06,
16896
+ "loss": 1.3128,
16897
+ "step": 24080
16898
+ },
16899
+ {
16900
+ "epoch": 0.6,
16901
+ "grad_norm": 11.875,
16902
+ "learning_rate": 4.006779661016949e-06,
16903
+ "loss": 1.1817,
16904
+ "step": 24090
16905
+ },
16906
+ {
16907
+ "epoch": 0.6,
16908
+ "grad_norm": 11.6875,
16909
+ "learning_rate": 4.000000000000001e-06,
16910
+ "loss": 1.382,
16911
+ "step": 24100
16912
+ },
16913
+ {
16914
+ "epoch": 0.6,
16915
+ "grad_norm": 9.4375,
16916
+ "learning_rate": 3.993220338983051e-06,
16917
+ "loss": 1.2053,
16918
+ "step": 24110
16919
+ },
16920
+ {
16921
+ "epoch": 0.6,
16922
+ "grad_norm": 32.5,
16923
+ "learning_rate": 3.986440677966102e-06,
16924
+ "loss": 1.4975,
16925
+ "step": 24120
16926
+ },
16927
+ {
16928
+ "epoch": 0.6,
16929
+ "grad_norm": 13.9375,
16930
+ "learning_rate": 3.979661016949153e-06,
16931
+ "loss": 1.3662,
16932
+ "step": 24130
16933
+ },
16934
+ {
16935
+ "epoch": 0.6,
16936
+ "grad_norm": 13.375,
16937
+ "learning_rate": 3.972881355932204e-06,
16938
+ "loss": 1.1596,
16939
+ "step": 24140
16940
+ },
16941
+ {
16942
+ "epoch": 0.6,
16943
+ "grad_norm": 16.75,
16944
+ "learning_rate": 3.966101694915255e-06,
16945
+ "loss": 1.4116,
16946
+ "step": 24150
16947
+ },
16948
+ {
16949
+ "epoch": 0.6,
16950
+ "grad_norm": 13.1875,
16951
+ "learning_rate": 3.959322033898305e-06,
16952
+ "loss": 1.4245,
16953
+ "step": 24160
16954
+ },
16955
+ {
16956
+ "epoch": 0.6,
16957
+ "grad_norm": 14.125,
16958
+ "learning_rate": 3.952542372881356e-06,
16959
+ "loss": 1.5293,
16960
+ "step": 24170
16961
+ },
16962
+ {
16963
+ "epoch": 0.6,
16964
+ "grad_norm": 16.125,
16965
+ "learning_rate": 3.945762711864407e-06,
16966
+ "loss": 1.324,
16967
+ "step": 24180
16968
+ },
16969
+ {
16970
+ "epoch": 0.6,
16971
+ "grad_norm": 15.5625,
16972
+ "learning_rate": 3.938983050847458e-06,
16973
+ "loss": 1.3789,
16974
+ "step": 24190
16975
+ },
16976
+ {
16977
+ "epoch": 0.6,
16978
+ "grad_norm": 23.125,
16979
+ "learning_rate": 3.932203389830509e-06,
16980
+ "loss": 1.4892,
16981
+ "step": 24200
16982
+ },
16983
+ {
16984
+ "epoch": 0.61,
16985
+ "grad_norm": 18.125,
16986
+ "learning_rate": 3.925423728813559e-06,
16987
+ "loss": 1.3624,
16988
+ "step": 24210
16989
+ },
16990
+ {
16991
+ "epoch": 0.61,
16992
+ "grad_norm": 32.5,
16993
+ "learning_rate": 3.918644067796611e-06,
16994
+ "loss": 1.3137,
16995
+ "step": 24220
16996
+ },
16997
+ {
16998
+ "epoch": 0.61,
16999
+ "grad_norm": 32.0,
17000
+ "learning_rate": 3.911864406779661e-06,
17001
+ "loss": 1.5286,
17002
+ "step": 24230
17003
+ },
17004
+ {
17005
+ "epoch": 0.61,
17006
+ "grad_norm": 52.75,
17007
+ "learning_rate": 3.905084745762713e-06,
17008
+ "loss": 1.4272,
17009
+ "step": 24240
17010
+ },
17011
+ {
17012
+ "epoch": 0.61,
17013
+ "grad_norm": 54.25,
17014
+ "learning_rate": 3.898305084745763e-06,
17015
+ "loss": 1.3123,
17016
+ "step": 24250
17017
+ },
17018
+ {
17019
+ "epoch": 0.61,
17020
+ "grad_norm": 79.5,
17021
+ "learning_rate": 3.891525423728814e-06,
17022
+ "loss": 1.2272,
17023
+ "step": 24260
17024
+ },
17025
+ {
17026
+ "epoch": 0.61,
17027
+ "grad_norm": 22.375,
17028
+ "learning_rate": 3.884745762711864e-06,
17029
+ "loss": 1.369,
17030
+ "step": 24270
17031
+ },
17032
+ {
17033
+ "epoch": 0.61,
17034
+ "grad_norm": 25.5,
17035
+ "learning_rate": 3.877966101694916e-06,
17036
+ "loss": 1.3113,
17037
+ "step": 24280
17038
+ },
17039
+ {
17040
+ "epoch": 0.61,
17041
+ "grad_norm": 30.375,
17042
+ "learning_rate": 3.871186440677966e-06,
17043
+ "loss": 1.3477,
17044
+ "step": 24290
17045
+ },
17046
+ {
17047
+ "epoch": 0.61,
17048
+ "grad_norm": 24.125,
17049
+ "learning_rate": 3.864406779661018e-06,
17050
+ "loss": 1.3172,
17051
+ "step": 24300
17052
+ },
17053
+ {
17054
+ "epoch": 0.61,
17055
+ "grad_norm": 19.75,
17056
+ "learning_rate": 3.857627118644068e-06,
17057
+ "loss": 1.3028,
17058
+ "step": 24310
17059
+ },
17060
+ {
17061
+ "epoch": 0.61,
17062
+ "grad_norm": 30.375,
17063
+ "learning_rate": 3.850847457627119e-06,
17064
+ "loss": 1.4591,
17065
+ "step": 24320
17066
+ },
17067
+ {
17068
+ "epoch": 0.61,
17069
+ "grad_norm": 35.25,
17070
+ "learning_rate": 3.844067796610169e-06,
17071
+ "loss": 1.3009,
17072
+ "step": 24330
17073
+ },
17074
+ {
17075
+ "epoch": 0.61,
17076
+ "grad_norm": 24.375,
17077
+ "learning_rate": 3.837288135593221e-06,
17078
+ "loss": 1.35,
17079
+ "step": 24340
17080
+ },
17081
+ {
17082
+ "epoch": 0.61,
17083
+ "grad_norm": 31.0,
17084
+ "learning_rate": 3.830508474576271e-06,
17085
+ "loss": 1.4239,
17086
+ "step": 24350
17087
+ },
17088
+ {
17089
+ "epoch": 0.61,
17090
+ "grad_norm": 50.25,
17091
+ "learning_rate": 3.823728813559323e-06,
17092
+ "loss": 1.2657,
17093
+ "step": 24360
17094
+ },
17095
+ {
17096
+ "epoch": 0.61,
17097
+ "grad_norm": 45.75,
17098
+ "learning_rate": 3.816949152542373e-06,
17099
+ "loss": 1.3529,
17100
+ "step": 24370
17101
+ },
17102
+ {
17103
+ "epoch": 0.61,
17104
+ "grad_norm": 31.5,
17105
+ "learning_rate": 3.8101694915254238e-06,
17106
+ "loss": 1.3574,
17107
+ "step": 24380
17108
+ },
17109
+ {
17110
+ "epoch": 0.61,
17111
+ "grad_norm": 21.125,
17112
+ "learning_rate": 3.8033898305084748e-06,
17113
+ "loss": 1.2754,
17114
+ "step": 24390
17115
+ },
17116
+ {
17117
+ "epoch": 0.61,
17118
+ "grad_norm": 11.3125,
17119
+ "learning_rate": 3.7966101694915257e-06,
17120
+ "loss": 1.2281,
17121
+ "step": 24400
17122
+ },
17123
+ {
17124
+ "epoch": 0.61,
17125
+ "grad_norm": 22.125,
17126
+ "learning_rate": 3.7898305084745767e-06,
17127
+ "loss": 1.3828,
17128
+ "step": 24410
17129
+ },
17130
+ {
17131
+ "epoch": 0.61,
17132
+ "grad_norm": 13.0,
17133
+ "learning_rate": 3.7830508474576273e-06,
17134
+ "loss": 1.4197,
17135
+ "step": 24420
17136
+ },
17137
+ {
17138
+ "epoch": 0.61,
17139
+ "grad_norm": 33.0,
17140
+ "learning_rate": 3.7762711864406782e-06,
17141
+ "loss": 1.2412,
17142
+ "step": 24430
17143
+ },
17144
+ {
17145
+ "epoch": 0.61,
17146
+ "grad_norm": 22.375,
17147
+ "learning_rate": 3.7694915254237292e-06,
17148
+ "loss": 1.1763,
17149
+ "step": 24440
17150
+ },
17151
+ {
17152
+ "epoch": 0.61,
17153
+ "grad_norm": 29.625,
17154
+ "learning_rate": 3.76271186440678e-06,
17155
+ "loss": 1.2184,
17156
+ "step": 24450
17157
+ },
17158
+ {
17159
+ "epoch": 0.61,
17160
+ "grad_norm": 15.375,
17161
+ "learning_rate": 3.755932203389831e-06,
17162
+ "loss": 1.381,
17163
+ "step": 24460
17164
+ },
17165
+ {
17166
+ "epoch": 0.61,
17167
+ "grad_norm": 12.9375,
17168
+ "learning_rate": 3.7491525423728813e-06,
17169
+ "loss": 1.2347,
17170
+ "step": 24470
17171
+ },
17172
+ {
17173
+ "epoch": 0.61,
17174
+ "grad_norm": 18.125,
17175
+ "learning_rate": 3.7423728813559323e-06,
17176
+ "loss": 1.2788,
17177
+ "step": 24480
17178
+ },
17179
+ {
17180
+ "epoch": 0.61,
17181
+ "grad_norm": 9.0625,
17182
+ "learning_rate": 3.7355932203389833e-06,
17183
+ "loss": 1.3998,
17184
+ "step": 24490
17185
+ },
17186
+ {
17187
+ "epoch": 0.61,
17188
+ "grad_norm": 19.375,
17189
+ "learning_rate": 3.7288135593220342e-06,
17190
+ "loss": 1.489,
17191
+ "step": 24500
17192
+ },
17193
+ {
17194
+ "epoch": 0.61,
17195
+ "grad_norm": 50.0,
17196
+ "learning_rate": 3.7220338983050852e-06,
17197
+ "loss": 1.3069,
17198
+ "step": 24510
17199
+ },
17200
+ {
17201
+ "epoch": 0.61,
17202
+ "grad_norm": 21.25,
17203
+ "learning_rate": 3.715254237288136e-06,
17204
+ "loss": 1.2861,
17205
+ "step": 24520
17206
+ },
17207
+ {
17208
+ "epoch": 0.61,
17209
+ "grad_norm": 18.875,
17210
+ "learning_rate": 3.7084745762711867e-06,
17211
+ "loss": 1.2172,
17212
+ "step": 24530
17213
+ },
17214
+ {
17215
+ "epoch": 0.61,
17216
+ "grad_norm": 56.75,
17217
+ "learning_rate": 3.7016949152542377e-06,
17218
+ "loss": 1.2433,
17219
+ "step": 24540
17220
+ },
17221
+ {
17222
+ "epoch": 0.61,
17223
+ "grad_norm": 54.25,
17224
+ "learning_rate": 3.6949152542372883e-06,
17225
+ "loss": 1.3888,
17226
+ "step": 24550
17227
+ },
17228
+ {
17229
+ "epoch": 0.61,
17230
+ "grad_norm": 25.25,
17231
+ "learning_rate": 3.6881355932203393e-06,
17232
+ "loss": 1.3927,
17233
+ "step": 24560
17234
+ },
17235
+ {
17236
+ "epoch": 0.61,
17237
+ "grad_norm": 48.25,
17238
+ "learning_rate": 3.6813559322033902e-06,
17239
+ "loss": 1.3315,
17240
+ "step": 24570
17241
+ },
17242
+ {
17243
+ "epoch": 0.61,
17244
+ "grad_norm": 76.0,
17245
+ "learning_rate": 3.6745762711864408e-06,
17246
+ "loss": 1.3999,
17247
+ "step": 24580
17248
+ },
17249
+ {
17250
+ "epoch": 0.61,
17251
+ "grad_norm": 50.5,
17252
+ "learning_rate": 3.6677966101694918e-06,
17253
+ "loss": 1.3443,
17254
+ "step": 24590
17255
+ },
17256
+ {
17257
+ "epoch": 0.61,
17258
+ "grad_norm": 25.375,
17259
+ "learning_rate": 3.6610169491525427e-06,
17260
+ "loss": 1.4185,
17261
+ "step": 24600
17262
+ },
17263
+ {
17264
+ "epoch": 0.62,
17265
+ "grad_norm": 14.3125,
17266
+ "learning_rate": 3.6542372881355937e-06,
17267
+ "loss": 1.2463,
17268
+ "step": 24610
17269
+ },
17270
+ {
17271
+ "epoch": 0.62,
17272
+ "grad_norm": 13.25,
17273
+ "learning_rate": 3.6474576271186447e-06,
17274
+ "loss": 1.1997,
17275
+ "step": 24620
17276
+ },
17277
+ {
17278
+ "epoch": 0.62,
17279
+ "grad_norm": 16.875,
17280
+ "learning_rate": 3.640677966101695e-06,
17281
+ "loss": 1.3394,
17282
+ "step": 24630
17283
+ },
17284
+ {
17285
+ "epoch": 0.62,
17286
+ "grad_norm": 32.25,
17287
+ "learning_rate": 3.633898305084746e-06,
17288
+ "loss": 1.2986,
17289
+ "step": 24640
17290
+ },
17291
+ {
17292
+ "epoch": 0.62,
17293
+ "grad_norm": 13.25,
17294
+ "learning_rate": 3.6271186440677968e-06,
17295
+ "loss": 1.4319,
17296
+ "step": 24650
17297
+ },
17298
+ {
17299
+ "epoch": 0.62,
17300
+ "grad_norm": 17.75,
17301
+ "learning_rate": 3.6203389830508478e-06,
17302
+ "loss": 1.2953,
17303
+ "step": 24660
17304
+ },
17305
+ {
17306
+ "epoch": 0.62,
17307
+ "grad_norm": 7.84375,
17308
+ "learning_rate": 3.6135593220338987e-06,
17309
+ "loss": 1.4395,
17310
+ "step": 24670
17311
+ },
17312
+ {
17313
+ "epoch": 0.62,
17314
+ "grad_norm": 12.5,
17315
+ "learning_rate": 3.6067796610169493e-06,
17316
+ "loss": 1.294,
17317
+ "step": 24680
17318
+ },
17319
+ {
17320
+ "epoch": 0.62,
17321
+ "grad_norm": 23.75,
17322
+ "learning_rate": 3.6000000000000003e-06,
17323
+ "loss": 1.3854,
17324
+ "step": 24690
17325
+ },
17326
+ {
17327
+ "epoch": 0.62,
17328
+ "grad_norm": 24.5,
17329
+ "learning_rate": 3.5932203389830512e-06,
17330
+ "loss": 1.2867,
17331
+ "step": 24700
17332
+ },
17333
+ {
17334
+ "epoch": 0.62,
17335
+ "grad_norm": 27.0,
17336
+ "learning_rate": 3.5864406779661022e-06,
17337
+ "loss": 1.2254,
17338
+ "step": 24710
17339
+ },
17340
+ {
17341
+ "epoch": 0.62,
17342
+ "grad_norm": 10.125,
17343
+ "learning_rate": 3.579661016949153e-06,
17344
+ "loss": 1.3512,
17345
+ "step": 24720
17346
+ },
17347
+ {
17348
+ "epoch": 0.62,
17349
+ "grad_norm": 59.75,
17350
+ "learning_rate": 3.5728813559322033e-06,
17351
+ "loss": 1.3847,
17352
+ "step": 24730
17353
+ },
17354
+ {
17355
+ "epoch": 0.62,
17356
+ "grad_norm": 51.0,
17357
+ "learning_rate": 3.5661016949152543e-06,
17358
+ "loss": 1.3099,
17359
+ "step": 24740
17360
+ },
17361
+ {
17362
+ "epoch": 0.62,
17363
+ "grad_norm": 29.625,
17364
+ "learning_rate": 3.5593220338983053e-06,
17365
+ "loss": 1.3505,
17366
+ "step": 24750
17367
+ },
17368
+ {
17369
+ "epoch": 0.62,
17370
+ "grad_norm": 43.5,
17371
+ "learning_rate": 3.5525423728813563e-06,
17372
+ "loss": 1.055,
17373
+ "step": 24760
17374
+ },
17375
+ {
17376
+ "epoch": 0.62,
17377
+ "grad_norm": 14.1875,
17378
+ "learning_rate": 3.5457627118644072e-06,
17379
+ "loss": 1.4283,
17380
+ "step": 24770
17381
+ },
17382
+ {
17383
+ "epoch": 0.62,
17384
+ "grad_norm": 31.625,
17385
+ "learning_rate": 3.538983050847458e-06,
17386
+ "loss": 1.5257,
17387
+ "step": 24780
17388
+ },
17389
+ {
17390
+ "epoch": 0.62,
17391
+ "grad_norm": 17.5,
17392
+ "learning_rate": 3.5322033898305088e-06,
17393
+ "loss": 1.3837,
17394
+ "step": 24790
17395
+ },
17396
+ {
17397
+ "epoch": 0.62,
17398
+ "grad_norm": 26.875,
17399
+ "learning_rate": 3.5254237288135597e-06,
17400
+ "loss": 1.3111,
17401
+ "step": 24800
17402
+ },
17403
+ {
17404
+ "epoch": 0.62,
17405
+ "grad_norm": 13.9375,
17406
+ "learning_rate": 3.5186440677966103e-06,
17407
+ "loss": 1.4159,
17408
+ "step": 24810
17409
+ },
17410
+ {
17411
+ "epoch": 0.62,
17412
+ "grad_norm": 47.25,
17413
+ "learning_rate": 3.5118644067796613e-06,
17414
+ "loss": 1.4506,
17415
+ "step": 24820
17416
+ },
17417
+ {
17418
+ "epoch": 0.62,
17419
+ "grad_norm": 28.375,
17420
+ "learning_rate": 3.5050847457627122e-06,
17421
+ "loss": 1.235,
17422
+ "step": 24830
17423
+ },
17424
+ {
17425
+ "epoch": 0.62,
17426
+ "grad_norm": 41.25,
17427
+ "learning_rate": 3.498305084745763e-06,
17428
+ "loss": 1.3643,
17429
+ "step": 24840
17430
+ },
17431
+ {
17432
+ "epoch": 0.62,
17433
+ "grad_norm": 15.5625,
17434
+ "learning_rate": 3.4915254237288138e-06,
17435
+ "loss": 1.1906,
17436
+ "step": 24850
17437
+ },
17438
+ {
17439
+ "epoch": 0.62,
17440
+ "grad_norm": 29.625,
17441
+ "learning_rate": 3.4847457627118648e-06,
17442
+ "loss": 1.3653,
17443
+ "step": 24860
17444
+ },
17445
+ {
17446
+ "epoch": 0.62,
17447
+ "grad_norm": 19.625,
17448
+ "learning_rate": 3.4779661016949157e-06,
17449
+ "loss": 1.2675,
17450
+ "step": 24870
17451
+ },
17452
+ {
17453
+ "epoch": 0.62,
17454
+ "grad_norm": 18.375,
17455
+ "learning_rate": 3.4711864406779667e-06,
17456
+ "loss": 1.4436,
17457
+ "step": 24880
17458
+ },
17459
+ {
17460
+ "epoch": 0.62,
17461
+ "grad_norm": 20.5,
17462
+ "learning_rate": 3.464406779661017e-06,
17463
+ "loss": 1.3635,
17464
+ "step": 24890
17465
+ },
17466
+ {
17467
+ "epoch": 0.62,
17468
+ "grad_norm": 18.125,
17469
+ "learning_rate": 3.457627118644068e-06,
17470
+ "loss": 1.4103,
17471
+ "step": 24900
17472
+ },
17473
+ {
17474
+ "epoch": 0.62,
17475
+ "grad_norm": 28.5,
17476
+ "learning_rate": 3.450847457627119e-06,
17477
+ "loss": 1.2253,
17478
+ "step": 24910
17479
+ },
17480
+ {
17481
+ "epoch": 0.62,
17482
+ "grad_norm": 41.25,
17483
+ "learning_rate": 3.4440677966101698e-06,
17484
+ "loss": 1.3705,
17485
+ "step": 24920
17486
+ },
17487
+ {
17488
+ "epoch": 0.62,
17489
+ "grad_norm": 39.25,
17490
+ "learning_rate": 3.4372881355932207e-06,
17491
+ "loss": 1.4641,
17492
+ "step": 24930
17493
+ },
17494
+ {
17495
+ "epoch": 0.62,
17496
+ "grad_norm": 30.875,
17497
+ "learning_rate": 3.4305084745762713e-06,
17498
+ "loss": 1.4391,
17499
+ "step": 24940
17500
+ },
17501
+ {
17502
+ "epoch": 0.62,
17503
+ "grad_norm": 33.0,
17504
+ "learning_rate": 3.4237288135593223e-06,
17505
+ "loss": 1.2609,
17506
+ "step": 24950
17507
+ },
17508
+ {
17509
+ "epoch": 0.62,
17510
+ "grad_norm": 13.0,
17511
+ "learning_rate": 3.4169491525423733e-06,
17512
+ "loss": 1.4069,
17513
+ "step": 24960
17514
+ },
17515
+ {
17516
+ "epoch": 0.62,
17517
+ "grad_norm": 30.125,
17518
+ "learning_rate": 3.4101694915254242e-06,
17519
+ "loss": 1.3547,
17520
+ "step": 24970
17521
+ },
17522
+ {
17523
+ "epoch": 0.62,
17524
+ "grad_norm": 33.5,
17525
+ "learning_rate": 3.403389830508475e-06,
17526
+ "loss": 1.0802,
17527
+ "step": 24980
17528
+ },
17529
+ {
17530
+ "epoch": 0.62,
17531
+ "grad_norm": 16.625,
17532
+ "learning_rate": 3.3966101694915253e-06,
17533
+ "loss": 1.4374,
17534
+ "step": 24990
17535
+ },
17536
+ {
17537
+ "epoch": 0.62,
17538
+ "grad_norm": 18.375,
17539
+ "learning_rate": 3.3898305084745763e-06,
17540
+ "loss": 1.3566,
17541
+ "step": 25000
17542
+ },
17543
+ {
17544
+ "epoch": 0.62,
17545
+ "eval_loss": 1.3512009382247925,
17546
+ "eval_runtime": 59.1664,
17547
+ "eval_samples_per_second": 16.901,
17548
+ "eval_steps_per_second": 16.901,
17549
+ "step": 25000
17550
  }
17551
  ],
17552
  "logging_steps": 10,
 
17554
  "num_input_tokens_seen": 0,
17555
  "num_train_epochs": 1,
17556
  "save_steps": 5000,
17557
+ "total_flos": 3.933424582656e+17,
17558
  "train_batch_size": 1,
17559
  "trial_name": null,
17560
  "trial_params": null