UltimoUno commited on
Commit
7bf31b9
1 Parent(s): 479d4eb

Uploaded checkpoint-30000

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +3511 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bb301bf8fc5272c04a72e51a13f9b832cd846a9c51360f628cb3fc9a1ac42a2
3
  size 2692969128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e9607fb1911b956c4d02106ef08e76a51729b53898739d1cc111b25cd67ecd4
3
  size 2692969128
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0211b169e1bc00bdf338234616fcee78f3b3fa8789f060393af487bb73e8419c
3
  size 5386075202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1578e699caa99ef0803efbd0253276112c1bf03f598f3a83293db1a03c64c90
3
  size 5386075202
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c92a95a97d689d636b085d406167a1d143dce26fb83ee64d21cf4b37a120302
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce3f1fd7866b93bfea3f328fd35124d6c8818c1dd3b24d163380d4a576714e9
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3bdbaa37c77733a3ea9eb90a36bc290f4f5b9f56abe23cc6586cbaa459f92c6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b36093e06845c6146f3175c64f0e8bdb441d4f7fc67a6962ed0b80b6725daf1
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.3379485607147217,
3
  "best_model_checkpoint": "runs/deepseek_20240422-210351/checkpoint-15000",
4
- "epoch": 0.625,
5
  "eval_steps": 5000,
6
- "global_step": 25000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -17547,6 +17547,3514 @@
17547
  "eval_samples_per_second": 16.901,
17548
  "eval_steps_per_second": 16.901,
17549
  "step": 25000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17550
  }
17551
  ],
17552
  "logging_steps": 10,
@@ -17554,7 +21062,7 @@
17554
  "num_input_tokens_seen": 0,
17555
  "num_train_epochs": 1,
17556
  "save_steps": 5000,
17557
- "total_flos": 3.933424582656e+17,
17558
  "train_batch_size": 1,
17559
  "trial_name": null,
17560
  "trial_params": null
 
1
  {
2
  "best_metric": 1.3379485607147217,
3
  "best_model_checkpoint": "runs/deepseek_20240422-210351/checkpoint-15000",
4
+ "epoch": 0.75,
5
  "eval_steps": 5000,
6
+ "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
17547
  "eval_samples_per_second": 16.901,
17548
  "eval_steps_per_second": 16.901,
17549
  "step": 25000
17550
+ },
17551
+ {
17552
+ "epoch": 0.63,
17553
+ "grad_norm": 36.25,
17554
+ "learning_rate": 3.3830508474576273e-06,
17555
+ "loss": 1.5974,
17556
+ "step": 25010
17557
+ },
17558
+ {
17559
+ "epoch": 0.63,
17560
+ "grad_norm": 21.125,
17561
+ "learning_rate": 3.3762711864406783e-06,
17562
+ "loss": 1.2943,
17563
+ "step": 25020
17564
+ },
17565
+ {
17566
+ "epoch": 0.63,
17567
+ "grad_norm": 13.9375,
17568
+ "learning_rate": 3.3694915254237292e-06,
17569
+ "loss": 1.2029,
17570
+ "step": 25030
17571
+ },
17572
+ {
17573
+ "epoch": 0.63,
17574
+ "grad_norm": 25.375,
17575
+ "learning_rate": 3.3627118644067802e-06,
17576
+ "loss": 1.4546,
17577
+ "step": 25040
17578
+ },
17579
+ {
17580
+ "epoch": 0.63,
17581
+ "grad_norm": 23.5,
17582
+ "learning_rate": 3.3559322033898308e-06,
17583
+ "loss": 1.4137,
17584
+ "step": 25050
17585
+ },
17586
+ {
17587
+ "epoch": 0.63,
17588
+ "grad_norm": 17.125,
17589
+ "learning_rate": 3.3491525423728817e-06,
17590
+ "loss": 1.3464,
17591
+ "step": 25060
17592
+ },
17593
+ {
17594
+ "epoch": 0.63,
17595
+ "grad_norm": 16.75,
17596
+ "learning_rate": 3.3423728813559327e-06,
17597
+ "loss": 1.3147,
17598
+ "step": 25070
17599
+ },
17600
+ {
17601
+ "epoch": 0.63,
17602
+ "grad_norm": 14.5,
17603
+ "learning_rate": 3.3355932203389833e-06,
17604
+ "loss": 1.3724,
17605
+ "step": 25080
17606
+ },
17607
+ {
17608
+ "epoch": 0.63,
17609
+ "grad_norm": 33.5,
17610
+ "learning_rate": 3.3288135593220343e-06,
17611
+ "loss": 1.4675,
17612
+ "step": 25090
17613
+ },
17614
+ {
17615
+ "epoch": 0.63,
17616
+ "grad_norm": 9.4375,
17617
+ "learning_rate": 3.322033898305085e-06,
17618
+ "loss": 1.337,
17619
+ "step": 25100
17620
+ },
17621
+ {
17622
+ "epoch": 0.63,
17623
+ "grad_norm": 19.875,
17624
+ "learning_rate": 3.3152542372881358e-06,
17625
+ "loss": 1.3212,
17626
+ "step": 25110
17627
+ },
17628
+ {
17629
+ "epoch": 0.63,
17630
+ "grad_norm": 26.375,
17631
+ "learning_rate": 3.3084745762711868e-06,
17632
+ "loss": 1.3812,
17633
+ "step": 25120
17634
+ },
17635
+ {
17636
+ "epoch": 0.63,
17637
+ "grad_norm": 14.4375,
17638
+ "learning_rate": 3.3016949152542377e-06,
17639
+ "loss": 1.2622,
17640
+ "step": 25130
17641
+ },
17642
+ {
17643
+ "epoch": 0.63,
17644
+ "grad_norm": 51.25,
17645
+ "learning_rate": 3.2949152542372887e-06,
17646
+ "loss": 1.3327,
17647
+ "step": 25140
17648
+ },
17649
+ {
17650
+ "epoch": 0.63,
17651
+ "grad_norm": 7.03125,
17652
+ "learning_rate": 3.288135593220339e-06,
17653
+ "loss": 1.3353,
17654
+ "step": 25150
17655
+ },
17656
+ {
17657
+ "epoch": 0.63,
17658
+ "grad_norm": 11.25,
17659
+ "learning_rate": 3.28135593220339e-06,
17660
+ "loss": 1.1754,
17661
+ "step": 25160
17662
+ },
17663
+ {
17664
+ "epoch": 0.63,
17665
+ "grad_norm": 12.8125,
17666
+ "learning_rate": 3.274576271186441e-06,
17667
+ "loss": 1.4266,
17668
+ "step": 25170
17669
+ },
17670
+ {
17671
+ "epoch": 0.63,
17672
+ "grad_norm": 20.25,
17673
+ "learning_rate": 3.2677966101694918e-06,
17674
+ "loss": 1.2969,
17675
+ "step": 25180
17676
+ },
17677
+ {
17678
+ "epoch": 0.63,
17679
+ "grad_norm": 44.0,
17680
+ "learning_rate": 3.2610169491525428e-06,
17681
+ "loss": 1.4229,
17682
+ "step": 25190
17683
+ },
17684
+ {
17685
+ "epoch": 0.63,
17686
+ "grad_norm": 28.875,
17687
+ "learning_rate": 3.2542372881355933e-06,
17688
+ "loss": 1.2987,
17689
+ "step": 25200
17690
+ },
17691
+ {
17692
+ "epoch": 0.63,
17693
+ "grad_norm": 31.0,
17694
+ "learning_rate": 3.2474576271186443e-06,
17695
+ "loss": 1.1972,
17696
+ "step": 25210
17697
+ },
17698
+ {
17699
+ "epoch": 0.63,
17700
+ "grad_norm": 48.75,
17701
+ "learning_rate": 3.2406779661016953e-06,
17702
+ "loss": 1.3853,
17703
+ "step": 25220
17704
+ },
17705
+ {
17706
+ "epoch": 0.63,
17707
+ "grad_norm": 38.5,
17708
+ "learning_rate": 3.2338983050847462e-06,
17709
+ "loss": 1.1954,
17710
+ "step": 25230
17711
+ },
17712
+ {
17713
+ "epoch": 0.63,
17714
+ "grad_norm": 28.625,
17715
+ "learning_rate": 3.2271186440677972e-06,
17716
+ "loss": 1.4971,
17717
+ "step": 25240
17718
+ },
17719
+ {
17720
+ "epoch": 0.63,
17721
+ "grad_norm": 20.0,
17722
+ "learning_rate": 3.2203389830508473e-06,
17723
+ "loss": 1.4897,
17724
+ "step": 25250
17725
+ },
17726
+ {
17727
+ "epoch": 0.63,
17728
+ "grad_norm": 25.625,
17729
+ "learning_rate": 3.2135593220338983e-06,
17730
+ "loss": 1.2711,
17731
+ "step": 25260
17732
+ },
17733
+ {
17734
+ "epoch": 0.63,
17735
+ "grad_norm": 13.1875,
17736
+ "learning_rate": 3.2067796610169493e-06,
17737
+ "loss": 1.4647,
17738
+ "step": 25270
17739
+ },
17740
+ {
17741
+ "epoch": 0.63,
17742
+ "grad_norm": 25.125,
17743
+ "learning_rate": 3.2000000000000003e-06,
17744
+ "loss": 1.3632,
17745
+ "step": 25280
17746
+ },
17747
+ {
17748
+ "epoch": 0.63,
17749
+ "grad_norm": 23.125,
17750
+ "learning_rate": 3.1932203389830513e-06,
17751
+ "loss": 1.3991,
17752
+ "step": 25290
17753
+ },
17754
+ {
17755
+ "epoch": 0.63,
17756
+ "grad_norm": 31.75,
17757
+ "learning_rate": 3.186440677966102e-06,
17758
+ "loss": 1.4431,
17759
+ "step": 25300
17760
+ },
17761
+ {
17762
+ "epoch": 0.63,
17763
+ "grad_norm": 23.625,
17764
+ "learning_rate": 3.1796610169491528e-06,
17765
+ "loss": 1.2923,
17766
+ "step": 25310
17767
+ },
17768
+ {
17769
+ "epoch": 0.63,
17770
+ "grad_norm": 20.25,
17771
+ "learning_rate": 3.1728813559322038e-06,
17772
+ "loss": 1.3391,
17773
+ "step": 25320
17774
+ },
17775
+ {
17776
+ "epoch": 0.63,
17777
+ "grad_norm": 17.5,
17778
+ "learning_rate": 3.1661016949152547e-06,
17779
+ "loss": 1.3396,
17780
+ "step": 25330
17781
+ },
17782
+ {
17783
+ "epoch": 0.63,
17784
+ "grad_norm": 12.6875,
17785
+ "learning_rate": 3.1593220338983053e-06,
17786
+ "loss": 1.451,
17787
+ "step": 25340
17788
+ },
17789
+ {
17790
+ "epoch": 0.63,
17791
+ "grad_norm": 52.75,
17792
+ "learning_rate": 3.1525423728813563e-06,
17793
+ "loss": 1.1884,
17794
+ "step": 25350
17795
+ },
17796
+ {
17797
+ "epoch": 0.63,
17798
+ "grad_norm": 12.1875,
17799
+ "learning_rate": 3.145762711864407e-06,
17800
+ "loss": 1.4216,
17801
+ "step": 25360
17802
+ },
17803
+ {
17804
+ "epoch": 0.63,
17805
+ "grad_norm": 18.5,
17806
+ "learning_rate": 3.138983050847458e-06,
17807
+ "loss": 1.2726,
17808
+ "step": 25370
17809
+ },
17810
+ {
17811
+ "epoch": 0.63,
17812
+ "grad_norm": 20.75,
17813
+ "learning_rate": 3.1322033898305088e-06,
17814
+ "loss": 1.1607,
17815
+ "step": 25380
17816
+ },
17817
+ {
17818
+ "epoch": 0.63,
17819
+ "grad_norm": 24.375,
17820
+ "learning_rate": 3.1254237288135598e-06,
17821
+ "loss": 1.3368,
17822
+ "step": 25390
17823
+ },
17824
+ {
17825
+ "epoch": 0.64,
17826
+ "grad_norm": 22.5,
17827
+ "learning_rate": 3.1186440677966107e-06,
17828
+ "loss": 1.3483,
17829
+ "step": 25400
17830
+ },
17831
+ {
17832
+ "epoch": 0.64,
17833
+ "grad_norm": 15.3125,
17834
+ "learning_rate": 3.111864406779661e-06,
17835
+ "loss": 1.376,
17836
+ "step": 25410
17837
+ },
17838
+ {
17839
+ "epoch": 0.64,
17840
+ "grad_norm": 30.625,
17841
+ "learning_rate": 3.105084745762712e-06,
17842
+ "loss": 1.5073,
17843
+ "step": 25420
17844
+ },
17845
+ {
17846
+ "epoch": 0.64,
17847
+ "grad_norm": 20.375,
17848
+ "learning_rate": 3.098305084745763e-06,
17849
+ "loss": 1.2712,
17850
+ "step": 25430
17851
+ },
17852
+ {
17853
+ "epoch": 0.64,
17854
+ "grad_norm": 30.0,
17855
+ "learning_rate": 3.091525423728814e-06,
17856
+ "loss": 1.4744,
17857
+ "step": 25440
17858
+ },
17859
+ {
17860
+ "epoch": 0.64,
17861
+ "grad_norm": 45.75,
17862
+ "learning_rate": 3.0847457627118648e-06,
17863
+ "loss": 1.1748,
17864
+ "step": 25450
17865
+ },
17866
+ {
17867
+ "epoch": 0.64,
17868
+ "grad_norm": 17.75,
17869
+ "learning_rate": 3.0779661016949153e-06,
17870
+ "loss": 1.3216,
17871
+ "step": 25460
17872
+ },
17873
+ {
17874
+ "epoch": 0.64,
17875
+ "grad_norm": 10.25,
17876
+ "learning_rate": 3.0711864406779663e-06,
17877
+ "loss": 1.3373,
17878
+ "step": 25470
17879
+ },
17880
+ {
17881
+ "epoch": 0.64,
17882
+ "grad_norm": 22.25,
17883
+ "learning_rate": 3.0644067796610173e-06,
17884
+ "loss": 1.1978,
17885
+ "step": 25480
17886
+ },
17887
+ {
17888
+ "epoch": 0.64,
17889
+ "grad_norm": 30.125,
17890
+ "learning_rate": 3.0576271186440683e-06,
17891
+ "loss": 1.2682,
17892
+ "step": 25490
17893
+ },
17894
+ {
17895
+ "epoch": 0.64,
17896
+ "grad_norm": 16.0,
17897
+ "learning_rate": 3.0508474576271192e-06,
17898
+ "loss": 1.1775,
17899
+ "step": 25500
17900
+ },
17901
+ {
17902
+ "epoch": 0.64,
17903
+ "grad_norm": 23.125,
17904
+ "learning_rate": 3.0440677966101694e-06,
17905
+ "loss": 1.1059,
17906
+ "step": 25510
17907
+ },
17908
+ {
17909
+ "epoch": 0.64,
17910
+ "grad_norm": 25.5,
17911
+ "learning_rate": 3.0372881355932203e-06,
17912
+ "loss": 1.2425,
17913
+ "step": 25520
17914
+ },
17915
+ {
17916
+ "epoch": 0.64,
17917
+ "grad_norm": 14.3125,
17918
+ "learning_rate": 3.0305084745762713e-06,
17919
+ "loss": 1.4788,
17920
+ "step": 25530
17921
+ },
17922
+ {
17923
+ "epoch": 0.64,
17924
+ "grad_norm": 41.25,
17925
+ "learning_rate": 3.0237288135593223e-06,
17926
+ "loss": 1.4762,
17927
+ "step": 25540
17928
+ },
17929
+ {
17930
+ "epoch": 0.64,
17931
+ "grad_norm": 19.375,
17932
+ "learning_rate": 3.0169491525423733e-06,
17933
+ "loss": 1.3921,
17934
+ "step": 25550
17935
+ },
17936
+ {
17937
+ "epoch": 0.64,
17938
+ "grad_norm": 28.125,
17939
+ "learning_rate": 3.010169491525424e-06,
17940
+ "loss": 1.4866,
17941
+ "step": 25560
17942
+ },
17943
+ {
17944
+ "epoch": 0.64,
17945
+ "grad_norm": 76.5,
17946
+ "learning_rate": 3.003389830508475e-06,
17947
+ "loss": 1.2202,
17948
+ "step": 25570
17949
+ },
17950
+ {
17951
+ "epoch": 0.64,
17952
+ "grad_norm": 49.5,
17953
+ "learning_rate": 2.9966101694915258e-06,
17954
+ "loss": 1.3687,
17955
+ "step": 25580
17956
+ },
17957
+ {
17958
+ "epoch": 0.64,
17959
+ "grad_norm": 25.0,
17960
+ "learning_rate": 2.9898305084745768e-06,
17961
+ "loss": 1.5083,
17962
+ "step": 25590
17963
+ },
17964
+ {
17965
+ "epoch": 0.64,
17966
+ "grad_norm": 20.5,
17967
+ "learning_rate": 2.9830508474576277e-06,
17968
+ "loss": 1.3506,
17969
+ "step": 25600
17970
+ },
17971
+ {
17972
+ "epoch": 0.64,
17973
+ "grad_norm": 40.5,
17974
+ "learning_rate": 2.9762711864406783e-06,
17975
+ "loss": 1.5132,
17976
+ "step": 25610
17977
+ },
17978
+ {
17979
+ "epoch": 0.64,
17980
+ "grad_norm": 40.75,
17981
+ "learning_rate": 2.969491525423729e-06,
17982
+ "loss": 1.3987,
17983
+ "step": 25620
17984
+ },
17985
+ {
17986
+ "epoch": 0.64,
17987
+ "grad_norm": 37.25,
17988
+ "learning_rate": 2.96271186440678e-06,
17989
+ "loss": 1.3267,
17990
+ "step": 25630
17991
+ },
17992
+ {
17993
+ "epoch": 0.64,
17994
+ "grad_norm": 17.625,
17995
+ "learning_rate": 2.955932203389831e-06,
17996
+ "loss": 1.2356,
17997
+ "step": 25640
17998
+ },
17999
+ {
18000
+ "epoch": 0.64,
18001
+ "grad_norm": 23.875,
18002
+ "learning_rate": 2.9491525423728818e-06,
18003
+ "loss": 1.5515,
18004
+ "step": 25650
18005
+ },
18006
+ {
18007
+ "epoch": 0.64,
18008
+ "grad_norm": 32.25,
18009
+ "learning_rate": 2.9423728813559327e-06,
18010
+ "loss": 1.3458,
18011
+ "step": 25660
18012
+ },
18013
+ {
18014
+ "epoch": 0.64,
18015
+ "grad_norm": 24.25,
18016
+ "learning_rate": 2.935593220338983e-06,
18017
+ "loss": 1.3487,
18018
+ "step": 25670
18019
+ },
18020
+ {
18021
+ "epoch": 0.64,
18022
+ "grad_norm": 10.0,
18023
+ "learning_rate": 2.928813559322034e-06,
18024
+ "loss": 1.2515,
18025
+ "step": 25680
18026
+ },
18027
+ {
18028
+ "epoch": 0.64,
18029
+ "grad_norm": 11.75,
18030
+ "learning_rate": 2.922033898305085e-06,
18031
+ "loss": 1.4637,
18032
+ "step": 25690
18033
+ },
18034
+ {
18035
+ "epoch": 0.64,
18036
+ "grad_norm": 33.0,
18037
+ "learning_rate": 2.915254237288136e-06,
18038
+ "loss": 1.2734,
18039
+ "step": 25700
18040
+ },
18041
+ {
18042
+ "epoch": 0.64,
18043
+ "grad_norm": 31.125,
18044
+ "learning_rate": 2.9084745762711868e-06,
18045
+ "loss": 1.3431,
18046
+ "step": 25710
18047
+ },
18048
+ {
18049
+ "epoch": 0.64,
18050
+ "grad_norm": 20.375,
18051
+ "learning_rate": 2.9016949152542373e-06,
18052
+ "loss": 1.2379,
18053
+ "step": 25720
18054
+ },
18055
+ {
18056
+ "epoch": 0.64,
18057
+ "grad_norm": 27.5,
18058
+ "learning_rate": 2.8949152542372883e-06,
18059
+ "loss": 1.4185,
18060
+ "step": 25730
18061
+ },
18062
+ {
18063
+ "epoch": 0.64,
18064
+ "grad_norm": 23.625,
18065
+ "learning_rate": 2.8881355932203393e-06,
18066
+ "loss": 1.3128,
18067
+ "step": 25740
18068
+ },
18069
+ {
18070
+ "epoch": 0.64,
18071
+ "grad_norm": 16.875,
18072
+ "learning_rate": 2.8813559322033903e-06,
18073
+ "loss": 1.3461,
18074
+ "step": 25750
18075
+ },
18076
+ {
18077
+ "epoch": 0.64,
18078
+ "grad_norm": 29.375,
18079
+ "learning_rate": 2.8745762711864412e-06,
18080
+ "loss": 1.3569,
18081
+ "step": 25760
18082
+ },
18083
+ {
18084
+ "epoch": 0.64,
18085
+ "grad_norm": 40.5,
18086
+ "learning_rate": 2.8677966101694914e-06,
18087
+ "loss": 1.1441,
18088
+ "step": 25770
18089
+ },
18090
+ {
18091
+ "epoch": 0.64,
18092
+ "grad_norm": 20.75,
18093
+ "learning_rate": 2.8610169491525424e-06,
18094
+ "loss": 1.2368,
18095
+ "step": 25780
18096
+ },
18097
+ {
18098
+ "epoch": 0.64,
18099
+ "grad_norm": 23.5,
18100
+ "learning_rate": 2.8542372881355933e-06,
18101
+ "loss": 1.167,
18102
+ "step": 25790
18103
+ },
18104
+ {
18105
+ "epoch": 0.65,
18106
+ "grad_norm": 16.625,
18107
+ "learning_rate": 2.8474576271186443e-06,
18108
+ "loss": 1.2205,
18109
+ "step": 25800
18110
+ },
18111
+ {
18112
+ "epoch": 0.65,
18113
+ "grad_norm": 7.40625,
18114
+ "learning_rate": 2.8406779661016953e-06,
18115
+ "loss": 1.2521,
18116
+ "step": 25810
18117
+ },
18118
+ {
18119
+ "epoch": 0.65,
18120
+ "grad_norm": 15.4375,
18121
+ "learning_rate": 2.833898305084746e-06,
18122
+ "loss": 1.4285,
18123
+ "step": 25820
18124
+ },
18125
+ {
18126
+ "epoch": 0.65,
18127
+ "grad_norm": 25.875,
18128
+ "learning_rate": 2.827118644067797e-06,
18129
+ "loss": 1.4341,
18130
+ "step": 25830
18131
+ },
18132
+ {
18133
+ "epoch": 0.65,
18134
+ "grad_norm": 41.25,
18135
+ "learning_rate": 2.820338983050848e-06,
18136
+ "loss": 1.3918,
18137
+ "step": 25840
18138
+ },
18139
+ {
18140
+ "epoch": 0.65,
18141
+ "grad_norm": 17.0,
18142
+ "learning_rate": 2.8135593220338988e-06,
18143
+ "loss": 1.3055,
18144
+ "step": 25850
18145
+ },
18146
+ {
18147
+ "epoch": 0.65,
18148
+ "grad_norm": 22.375,
18149
+ "learning_rate": 2.8067796610169497e-06,
18150
+ "loss": 1.3505,
18151
+ "step": 25860
18152
+ },
18153
+ {
18154
+ "epoch": 0.65,
18155
+ "grad_norm": 16.75,
18156
+ "learning_rate": 2.8000000000000003e-06,
18157
+ "loss": 1.3144,
18158
+ "step": 25870
18159
+ },
18160
+ {
18161
+ "epoch": 0.65,
18162
+ "grad_norm": 32.25,
18163
+ "learning_rate": 2.793220338983051e-06,
18164
+ "loss": 1.3681,
18165
+ "step": 25880
18166
+ },
18167
+ {
18168
+ "epoch": 0.65,
18169
+ "grad_norm": 50.75,
18170
+ "learning_rate": 2.786440677966102e-06,
18171
+ "loss": 1.3335,
18172
+ "step": 25890
18173
+ },
18174
+ {
18175
+ "epoch": 0.65,
18176
+ "grad_norm": 23.5,
18177
+ "learning_rate": 2.779661016949153e-06,
18178
+ "loss": 1.493,
18179
+ "step": 25900
18180
+ },
18181
+ {
18182
+ "epoch": 0.65,
18183
+ "grad_norm": 32.5,
18184
+ "learning_rate": 2.7728813559322038e-06,
18185
+ "loss": 1.574,
18186
+ "step": 25910
18187
+ },
18188
+ {
18189
+ "epoch": 0.65,
18190
+ "grad_norm": 17.375,
18191
+ "learning_rate": 2.7661016949152548e-06,
18192
+ "loss": 1.415,
18193
+ "step": 25920
18194
+ },
18195
+ {
18196
+ "epoch": 0.65,
18197
+ "grad_norm": 13.75,
18198
+ "learning_rate": 2.7593220338983053e-06,
18199
+ "loss": 1.1766,
18200
+ "step": 25930
18201
+ },
18202
+ {
18203
+ "epoch": 0.65,
18204
+ "grad_norm": 20.875,
18205
+ "learning_rate": 2.752542372881356e-06,
18206
+ "loss": 1.3461,
18207
+ "step": 25940
18208
+ },
18209
+ {
18210
+ "epoch": 0.65,
18211
+ "grad_norm": 36.0,
18212
+ "learning_rate": 2.745762711864407e-06,
18213
+ "loss": 1.4238,
18214
+ "step": 25950
18215
+ },
18216
+ {
18217
+ "epoch": 0.65,
18218
+ "grad_norm": 24.5,
18219
+ "learning_rate": 2.738983050847458e-06,
18220
+ "loss": 1.3314,
18221
+ "step": 25960
18222
+ },
18223
+ {
18224
+ "epoch": 0.65,
18225
+ "grad_norm": 12.0,
18226
+ "learning_rate": 2.732203389830509e-06,
18227
+ "loss": 1.4093,
18228
+ "step": 25970
18229
+ },
18230
+ {
18231
+ "epoch": 0.65,
18232
+ "grad_norm": 47.75,
18233
+ "learning_rate": 2.7254237288135593e-06,
18234
+ "loss": 1.1776,
18235
+ "step": 25980
18236
+ },
18237
+ {
18238
+ "epoch": 0.65,
18239
+ "grad_norm": 9.5,
18240
+ "learning_rate": 2.7186440677966103e-06,
18241
+ "loss": 1.3146,
18242
+ "step": 25990
18243
+ },
18244
+ {
18245
+ "epoch": 0.65,
18246
+ "grad_norm": 26.0,
18247
+ "learning_rate": 2.7118644067796613e-06,
18248
+ "loss": 1.4606,
18249
+ "step": 26000
18250
+ },
18251
+ {
18252
+ "epoch": 0.65,
18253
+ "grad_norm": 10.375,
18254
+ "learning_rate": 2.7050847457627123e-06,
18255
+ "loss": 1.2379,
18256
+ "step": 26010
18257
+ },
18258
+ {
18259
+ "epoch": 0.65,
18260
+ "grad_norm": 21.0,
18261
+ "learning_rate": 2.6983050847457633e-06,
18262
+ "loss": 1.3822,
18263
+ "step": 26020
18264
+ },
18265
+ {
18266
+ "epoch": 0.65,
18267
+ "grad_norm": 14.5625,
18268
+ "learning_rate": 2.6915254237288134e-06,
18269
+ "loss": 1.3943,
18270
+ "step": 26030
18271
+ },
18272
+ {
18273
+ "epoch": 0.65,
18274
+ "grad_norm": 25.625,
18275
+ "learning_rate": 2.6847457627118644e-06,
18276
+ "loss": 1.4265,
18277
+ "step": 26040
18278
+ },
18279
+ {
18280
+ "epoch": 0.65,
18281
+ "grad_norm": 24.125,
18282
+ "learning_rate": 2.6779661016949153e-06,
18283
+ "loss": 1.3966,
18284
+ "step": 26050
18285
+ },
18286
+ {
18287
+ "epoch": 0.65,
18288
+ "grad_norm": 21.875,
18289
+ "learning_rate": 2.6711864406779663e-06,
18290
+ "loss": 1.4399,
18291
+ "step": 26060
18292
+ },
18293
+ {
18294
+ "epoch": 0.65,
18295
+ "grad_norm": 38.75,
18296
+ "learning_rate": 2.6644067796610173e-06,
18297
+ "loss": 1.3613,
18298
+ "step": 26070
18299
+ },
18300
+ {
18301
+ "epoch": 0.65,
18302
+ "grad_norm": 42.25,
18303
+ "learning_rate": 2.657627118644068e-06,
18304
+ "loss": 1.2613,
18305
+ "step": 26080
18306
+ },
18307
+ {
18308
+ "epoch": 0.65,
18309
+ "grad_norm": 43.25,
18310
+ "learning_rate": 2.650847457627119e-06,
18311
+ "loss": 1.2263,
18312
+ "step": 26090
18313
+ },
18314
+ {
18315
+ "epoch": 0.65,
18316
+ "grad_norm": 44.25,
18317
+ "learning_rate": 2.64406779661017e-06,
18318
+ "loss": 1.2948,
18319
+ "step": 26100
18320
+ },
18321
+ {
18322
+ "epoch": 0.65,
18323
+ "grad_norm": 19.0,
18324
+ "learning_rate": 2.6372881355932208e-06,
18325
+ "loss": 1.3783,
18326
+ "step": 26110
18327
+ },
18328
+ {
18329
+ "epoch": 0.65,
18330
+ "grad_norm": 35.5,
18331
+ "learning_rate": 2.6305084745762718e-06,
18332
+ "loss": 1.3806,
18333
+ "step": 26120
18334
+ },
18335
+ {
18336
+ "epoch": 0.65,
18337
+ "grad_norm": 20.0,
18338
+ "learning_rate": 2.6237288135593223e-06,
18339
+ "loss": 1.3714,
18340
+ "step": 26130
18341
+ },
18342
+ {
18343
+ "epoch": 0.65,
18344
+ "grad_norm": 28.0,
18345
+ "learning_rate": 2.616949152542373e-06,
18346
+ "loss": 1.2989,
18347
+ "step": 26140
18348
+ },
18349
+ {
18350
+ "epoch": 0.65,
18351
+ "grad_norm": 28.875,
18352
+ "learning_rate": 2.610169491525424e-06,
18353
+ "loss": 1.3626,
18354
+ "step": 26150
18355
+ },
18356
+ {
18357
+ "epoch": 0.65,
18358
+ "grad_norm": 23.125,
18359
+ "learning_rate": 2.603389830508475e-06,
18360
+ "loss": 1.3074,
18361
+ "step": 26160
18362
+ },
18363
+ {
18364
+ "epoch": 0.65,
18365
+ "grad_norm": 11.625,
18366
+ "learning_rate": 2.596610169491526e-06,
18367
+ "loss": 1.5129,
18368
+ "step": 26170
18369
+ },
18370
+ {
18371
+ "epoch": 0.65,
18372
+ "grad_norm": 11.0625,
18373
+ "learning_rate": 2.5898305084745768e-06,
18374
+ "loss": 1.3508,
18375
+ "step": 26180
18376
+ },
18377
+ {
18378
+ "epoch": 0.65,
18379
+ "grad_norm": 7.25,
18380
+ "learning_rate": 2.5830508474576273e-06,
18381
+ "loss": 1.3963,
18382
+ "step": 26190
18383
+ },
18384
+ {
18385
+ "epoch": 0.66,
18386
+ "grad_norm": 7.0625,
18387
+ "learning_rate": 2.576271186440678e-06,
18388
+ "loss": 1.4194,
18389
+ "step": 26200
18390
+ },
18391
+ {
18392
+ "epoch": 0.66,
18393
+ "grad_norm": 34.0,
18394
+ "learning_rate": 2.569491525423729e-06,
18395
+ "loss": 1.2905,
18396
+ "step": 26210
18397
+ },
18398
+ {
18399
+ "epoch": 0.66,
18400
+ "grad_norm": 12.875,
18401
+ "learning_rate": 2.56271186440678e-06,
18402
+ "loss": 1.554,
18403
+ "step": 26220
18404
+ },
18405
+ {
18406
+ "epoch": 0.66,
18407
+ "grad_norm": 24.375,
18408
+ "learning_rate": 2.555932203389831e-06,
18409
+ "loss": 1.1871,
18410
+ "step": 26230
18411
+ },
18412
+ {
18413
+ "epoch": 0.66,
18414
+ "grad_norm": 62.75,
18415
+ "learning_rate": 2.5491525423728814e-06,
18416
+ "loss": 1.3252,
18417
+ "step": 26240
18418
+ },
18419
+ {
18420
+ "epoch": 0.66,
18421
+ "grad_norm": 52.0,
18422
+ "learning_rate": 2.5423728813559323e-06,
18423
+ "loss": 1.3434,
18424
+ "step": 26250
18425
+ },
18426
+ {
18427
+ "epoch": 0.66,
18428
+ "grad_norm": 32.0,
18429
+ "learning_rate": 2.5355932203389833e-06,
18430
+ "loss": 1.5209,
18431
+ "step": 26260
18432
+ },
18433
+ {
18434
+ "epoch": 0.66,
18435
+ "grad_norm": 34.0,
18436
+ "learning_rate": 2.5288135593220343e-06,
18437
+ "loss": 1.2505,
18438
+ "step": 26270
18439
+ },
18440
+ {
18441
+ "epoch": 0.66,
18442
+ "grad_norm": 23.875,
18443
+ "learning_rate": 2.5220338983050853e-06,
18444
+ "loss": 1.3513,
18445
+ "step": 26280
18446
+ },
18447
+ {
18448
+ "epoch": 0.66,
18449
+ "grad_norm": 26.625,
18450
+ "learning_rate": 2.5152542372881354e-06,
18451
+ "loss": 1.4316,
18452
+ "step": 26290
18453
+ },
18454
+ {
18455
+ "epoch": 0.66,
18456
+ "grad_norm": 29.25,
18457
+ "learning_rate": 2.5084745762711864e-06,
18458
+ "loss": 1.367,
18459
+ "step": 26300
18460
+ },
18461
+ {
18462
+ "epoch": 0.66,
18463
+ "grad_norm": 22.75,
18464
+ "learning_rate": 2.5016949152542374e-06,
18465
+ "loss": 1.3901,
18466
+ "step": 26310
18467
+ },
18468
+ {
18469
+ "epoch": 0.66,
18470
+ "grad_norm": 26.75,
18471
+ "learning_rate": 2.4949152542372883e-06,
18472
+ "loss": 1.4063,
18473
+ "step": 26320
18474
+ },
18475
+ {
18476
+ "epoch": 0.66,
18477
+ "grad_norm": 12.0625,
18478
+ "learning_rate": 2.488135593220339e-06,
18479
+ "loss": 1.4608,
18480
+ "step": 26330
18481
+ },
18482
+ {
18483
+ "epoch": 0.66,
18484
+ "grad_norm": 16.375,
18485
+ "learning_rate": 2.48135593220339e-06,
18486
+ "loss": 1.1745,
18487
+ "step": 26340
18488
+ },
18489
+ {
18490
+ "epoch": 0.66,
18491
+ "grad_norm": 23.25,
18492
+ "learning_rate": 2.474576271186441e-06,
18493
+ "loss": 1.4735,
18494
+ "step": 26350
18495
+ },
18496
+ {
18497
+ "epoch": 0.66,
18498
+ "grad_norm": 38.0,
18499
+ "learning_rate": 2.467796610169492e-06,
18500
+ "loss": 1.1955,
18501
+ "step": 26360
18502
+ },
18503
+ {
18504
+ "epoch": 0.66,
18505
+ "grad_norm": 34.0,
18506
+ "learning_rate": 2.461016949152543e-06,
18507
+ "loss": 1.3215,
18508
+ "step": 26370
18509
+ },
18510
+ {
18511
+ "epoch": 0.66,
18512
+ "grad_norm": 30.875,
18513
+ "learning_rate": 2.4542372881355933e-06,
18514
+ "loss": 1.3611,
18515
+ "step": 26380
18516
+ },
18517
+ {
18518
+ "epoch": 0.66,
18519
+ "grad_norm": 28.625,
18520
+ "learning_rate": 2.4474576271186443e-06,
18521
+ "loss": 1.2287,
18522
+ "step": 26390
18523
+ },
18524
+ {
18525
+ "epoch": 0.66,
18526
+ "grad_norm": 26.625,
18527
+ "learning_rate": 2.4406779661016953e-06,
18528
+ "loss": 1.2022,
18529
+ "step": 26400
18530
+ },
18531
+ {
18532
+ "epoch": 0.66,
18533
+ "grad_norm": 30.625,
18534
+ "learning_rate": 2.433898305084746e-06,
18535
+ "loss": 1.3603,
18536
+ "step": 26410
18537
+ },
18538
+ {
18539
+ "epoch": 0.66,
18540
+ "grad_norm": 13.8125,
18541
+ "learning_rate": 2.427118644067797e-06,
18542
+ "loss": 1.463,
18543
+ "step": 26420
18544
+ },
18545
+ {
18546
+ "epoch": 0.66,
18547
+ "grad_norm": 14.9375,
18548
+ "learning_rate": 2.4203389830508474e-06,
18549
+ "loss": 1.3554,
18550
+ "step": 26430
18551
+ },
18552
+ {
18553
+ "epoch": 0.66,
18554
+ "grad_norm": 17.125,
18555
+ "learning_rate": 2.4135593220338984e-06,
18556
+ "loss": 1.4788,
18557
+ "step": 26440
18558
+ },
18559
+ {
18560
+ "epoch": 0.66,
18561
+ "grad_norm": 40.75,
18562
+ "learning_rate": 2.4067796610169493e-06,
18563
+ "loss": 1.3611,
18564
+ "step": 26450
18565
+ },
18566
+ {
18567
+ "epoch": 0.66,
18568
+ "grad_norm": 28.0,
18569
+ "learning_rate": 2.4000000000000003e-06,
18570
+ "loss": 1.3523,
18571
+ "step": 26460
18572
+ },
18573
+ {
18574
+ "epoch": 0.66,
18575
+ "grad_norm": 23.375,
18576
+ "learning_rate": 2.393220338983051e-06,
18577
+ "loss": 1.3743,
18578
+ "step": 26470
18579
+ },
18580
+ {
18581
+ "epoch": 0.66,
18582
+ "grad_norm": 54.25,
18583
+ "learning_rate": 2.386440677966102e-06,
18584
+ "loss": 1.3022,
18585
+ "step": 26480
18586
+ },
18587
+ {
18588
+ "epoch": 0.66,
18589
+ "grad_norm": 13.0,
18590
+ "learning_rate": 2.379661016949153e-06,
18591
+ "loss": 1.3887,
18592
+ "step": 26490
18593
+ },
18594
+ {
18595
+ "epoch": 0.66,
18596
+ "grad_norm": 15.9375,
18597
+ "learning_rate": 2.372881355932204e-06,
18598
+ "loss": 1.4052,
18599
+ "step": 26500
18600
+ },
18601
+ {
18602
+ "epoch": 0.66,
18603
+ "grad_norm": 29.25,
18604
+ "learning_rate": 2.3661016949152544e-06,
18605
+ "loss": 1.3447,
18606
+ "step": 26510
18607
+ },
18608
+ {
18609
+ "epoch": 0.66,
18610
+ "grad_norm": 22.375,
18611
+ "learning_rate": 2.3593220338983053e-06,
18612
+ "loss": 1.3072,
18613
+ "step": 26520
18614
+ },
18615
+ {
18616
+ "epoch": 0.66,
18617
+ "grad_norm": 14.125,
18618
+ "learning_rate": 2.3525423728813563e-06,
18619
+ "loss": 1.4062,
18620
+ "step": 26530
18621
+ },
18622
+ {
18623
+ "epoch": 0.66,
18624
+ "grad_norm": 41.75,
18625
+ "learning_rate": 2.345762711864407e-06,
18626
+ "loss": 1.3262,
18627
+ "step": 26540
18628
+ },
18629
+ {
18630
+ "epoch": 0.66,
18631
+ "grad_norm": 24.25,
18632
+ "learning_rate": 2.338983050847458e-06,
18633
+ "loss": 1.4441,
18634
+ "step": 26550
18635
+ },
18636
+ {
18637
+ "epoch": 0.66,
18638
+ "grad_norm": 15.3125,
18639
+ "learning_rate": 2.3322033898305084e-06,
18640
+ "loss": 1.3227,
18641
+ "step": 26560
18642
+ },
18643
+ {
18644
+ "epoch": 0.66,
18645
+ "grad_norm": 52.25,
18646
+ "learning_rate": 2.3254237288135594e-06,
18647
+ "loss": 1.3981,
18648
+ "step": 26570
18649
+ },
18650
+ {
18651
+ "epoch": 0.66,
18652
+ "grad_norm": 30.125,
18653
+ "learning_rate": 2.3186440677966103e-06,
18654
+ "loss": 1.4248,
18655
+ "step": 26580
18656
+ },
18657
+ {
18658
+ "epoch": 0.66,
18659
+ "grad_norm": 70.0,
18660
+ "learning_rate": 2.3118644067796613e-06,
18661
+ "loss": 1.3605,
18662
+ "step": 26590
18663
+ },
18664
+ {
18665
+ "epoch": 0.67,
18666
+ "grad_norm": 35.5,
18667
+ "learning_rate": 2.305084745762712e-06,
18668
+ "loss": 1.2909,
18669
+ "step": 26600
18670
+ },
18671
+ {
18672
+ "epoch": 0.67,
18673
+ "grad_norm": 13.0,
18674
+ "learning_rate": 2.298305084745763e-06,
18675
+ "loss": 1.3771,
18676
+ "step": 26610
18677
+ },
18678
+ {
18679
+ "epoch": 0.67,
18680
+ "grad_norm": 49.25,
18681
+ "learning_rate": 2.291525423728814e-06,
18682
+ "loss": 1.2699,
18683
+ "step": 26620
18684
+ },
18685
+ {
18686
+ "epoch": 0.67,
18687
+ "grad_norm": 24.5,
18688
+ "learning_rate": 2.284745762711865e-06,
18689
+ "loss": 1.1961,
18690
+ "step": 26630
18691
+ },
18692
+ {
18693
+ "epoch": 0.67,
18694
+ "grad_norm": 38.5,
18695
+ "learning_rate": 2.2779661016949154e-06,
18696
+ "loss": 1.3764,
18697
+ "step": 26640
18698
+ },
18699
+ {
18700
+ "epoch": 0.67,
18701
+ "grad_norm": 35.25,
18702
+ "learning_rate": 2.2711864406779663e-06,
18703
+ "loss": 1.2731,
18704
+ "step": 26650
18705
+ },
18706
+ {
18707
+ "epoch": 0.67,
18708
+ "grad_norm": 20.875,
18709
+ "learning_rate": 2.2644067796610173e-06,
18710
+ "loss": 1.2816,
18711
+ "step": 26660
18712
+ },
18713
+ {
18714
+ "epoch": 0.67,
18715
+ "grad_norm": 46.75,
18716
+ "learning_rate": 2.257627118644068e-06,
18717
+ "loss": 1.2159,
18718
+ "step": 26670
18719
+ },
18720
+ {
18721
+ "epoch": 0.67,
18722
+ "grad_norm": 9.25,
18723
+ "learning_rate": 2.250847457627119e-06,
18724
+ "loss": 1.371,
18725
+ "step": 26680
18726
+ },
18727
+ {
18728
+ "epoch": 0.67,
18729
+ "grad_norm": 25.625,
18730
+ "learning_rate": 2.2440677966101694e-06,
18731
+ "loss": 1.5302,
18732
+ "step": 26690
18733
+ },
18734
+ {
18735
+ "epoch": 0.67,
18736
+ "grad_norm": 15.0,
18737
+ "learning_rate": 2.2372881355932204e-06,
18738
+ "loss": 1.3585,
18739
+ "step": 26700
18740
+ },
18741
+ {
18742
+ "epoch": 0.67,
18743
+ "grad_norm": 16.5,
18744
+ "learning_rate": 2.2305084745762714e-06,
18745
+ "loss": 1.201,
18746
+ "step": 26710
18747
+ },
18748
+ {
18749
+ "epoch": 0.67,
18750
+ "grad_norm": 27.25,
18751
+ "learning_rate": 2.2237288135593223e-06,
18752
+ "loss": 1.3247,
18753
+ "step": 26720
18754
+ },
18755
+ {
18756
+ "epoch": 0.67,
18757
+ "grad_norm": 17.75,
18758
+ "learning_rate": 2.216949152542373e-06,
18759
+ "loss": 1.4727,
18760
+ "step": 26730
18761
+ },
18762
+ {
18763
+ "epoch": 0.67,
18764
+ "grad_norm": 22.5,
18765
+ "learning_rate": 2.210169491525424e-06,
18766
+ "loss": 1.3304,
18767
+ "step": 26740
18768
+ },
18769
+ {
18770
+ "epoch": 0.67,
18771
+ "grad_norm": 25.125,
18772
+ "learning_rate": 2.203389830508475e-06,
18773
+ "loss": 1.3933,
18774
+ "step": 26750
18775
+ },
18776
+ {
18777
+ "epoch": 0.67,
18778
+ "grad_norm": 19.75,
18779
+ "learning_rate": 2.196610169491526e-06,
18780
+ "loss": 1.3612,
18781
+ "step": 26760
18782
+ },
18783
+ {
18784
+ "epoch": 0.67,
18785
+ "grad_norm": 30.0,
18786
+ "learning_rate": 2.1898305084745764e-06,
18787
+ "loss": 1.3657,
18788
+ "step": 26770
18789
+ },
18790
+ {
18791
+ "epoch": 0.67,
18792
+ "grad_norm": 43.5,
18793
+ "learning_rate": 2.1830508474576273e-06,
18794
+ "loss": 1.4132,
18795
+ "step": 26780
18796
+ },
18797
+ {
18798
+ "epoch": 0.67,
18799
+ "grad_norm": 37.5,
18800
+ "learning_rate": 2.1762711864406783e-06,
18801
+ "loss": 1.5674,
18802
+ "step": 26790
18803
+ },
18804
+ {
18805
+ "epoch": 0.67,
18806
+ "grad_norm": 24.625,
18807
+ "learning_rate": 2.169491525423729e-06,
18808
+ "loss": 1.2488,
18809
+ "step": 26800
18810
+ },
18811
+ {
18812
+ "epoch": 0.67,
18813
+ "grad_norm": 27.625,
18814
+ "learning_rate": 2.16271186440678e-06,
18815
+ "loss": 1.3611,
18816
+ "step": 26810
18817
+ },
18818
+ {
18819
+ "epoch": 0.67,
18820
+ "grad_norm": 35.75,
18821
+ "learning_rate": 2.1559322033898304e-06,
18822
+ "loss": 1.4191,
18823
+ "step": 26820
18824
+ },
18825
+ {
18826
+ "epoch": 0.67,
18827
+ "grad_norm": 19.625,
18828
+ "learning_rate": 2.1491525423728814e-06,
18829
+ "loss": 1.3845,
18830
+ "step": 26830
18831
+ },
18832
+ {
18833
+ "epoch": 0.67,
18834
+ "grad_norm": 20.0,
18835
+ "learning_rate": 2.1423728813559324e-06,
18836
+ "loss": 1.2857,
18837
+ "step": 26840
18838
+ },
18839
+ {
18840
+ "epoch": 0.67,
18841
+ "grad_norm": 13.875,
18842
+ "learning_rate": 2.1355932203389833e-06,
18843
+ "loss": 1.4068,
18844
+ "step": 26850
18845
+ },
18846
+ {
18847
+ "epoch": 0.67,
18848
+ "grad_norm": 20.625,
18849
+ "learning_rate": 2.128813559322034e-06,
18850
+ "loss": 1.5439,
18851
+ "step": 26860
18852
+ },
18853
+ {
18854
+ "epoch": 0.67,
18855
+ "grad_norm": 42.5,
18856
+ "learning_rate": 2.122033898305085e-06,
18857
+ "loss": 1.2449,
18858
+ "step": 26870
18859
+ },
18860
+ {
18861
+ "epoch": 0.67,
18862
+ "grad_norm": 32.75,
18863
+ "learning_rate": 2.115254237288136e-06,
18864
+ "loss": 1.2669,
18865
+ "step": 26880
18866
+ },
18867
+ {
18868
+ "epoch": 0.67,
18869
+ "grad_norm": 11.625,
18870
+ "learning_rate": 2.108474576271187e-06,
18871
+ "loss": 1.2574,
18872
+ "step": 26890
18873
+ },
18874
+ {
18875
+ "epoch": 0.67,
18876
+ "grad_norm": 20.25,
18877
+ "learning_rate": 2.1016949152542374e-06,
18878
+ "loss": 1.4733,
18879
+ "step": 26900
18880
+ },
18881
+ {
18882
+ "epoch": 0.67,
18883
+ "grad_norm": 41.75,
18884
+ "learning_rate": 2.0949152542372883e-06,
18885
+ "loss": 1.1929,
18886
+ "step": 26910
18887
+ },
18888
+ {
18889
+ "epoch": 0.67,
18890
+ "grad_norm": 14.625,
18891
+ "learning_rate": 2.0881355932203393e-06,
18892
+ "loss": 1.0785,
18893
+ "step": 26920
18894
+ },
18895
+ {
18896
+ "epoch": 0.67,
18897
+ "grad_norm": 14.6875,
18898
+ "learning_rate": 2.08135593220339e-06,
18899
+ "loss": 1.6247,
18900
+ "step": 26930
18901
+ },
18902
+ {
18903
+ "epoch": 0.67,
18904
+ "grad_norm": 36.5,
18905
+ "learning_rate": 2.074576271186441e-06,
18906
+ "loss": 1.451,
18907
+ "step": 26940
18908
+ },
18909
+ {
18910
+ "epoch": 0.67,
18911
+ "grad_norm": 31.5,
18912
+ "learning_rate": 2.0677966101694914e-06,
18913
+ "loss": 1.2365,
18914
+ "step": 26950
18915
+ },
18916
+ {
18917
+ "epoch": 0.67,
18918
+ "grad_norm": 16.125,
18919
+ "learning_rate": 2.0610169491525424e-06,
18920
+ "loss": 1.4036,
18921
+ "step": 26960
18922
+ },
18923
+ {
18924
+ "epoch": 0.67,
18925
+ "grad_norm": 47.75,
18926
+ "learning_rate": 2.0542372881355934e-06,
18927
+ "loss": 1.402,
18928
+ "step": 26970
18929
+ },
18930
+ {
18931
+ "epoch": 0.67,
18932
+ "grad_norm": 31.25,
18933
+ "learning_rate": 2.0474576271186443e-06,
18934
+ "loss": 1.3498,
18935
+ "step": 26980
18936
+ },
18937
+ {
18938
+ "epoch": 0.67,
18939
+ "grad_norm": 23.0,
18940
+ "learning_rate": 2.0406779661016953e-06,
18941
+ "loss": 1.2872,
18942
+ "step": 26990
18943
+ },
18944
+ {
18945
+ "epoch": 0.68,
18946
+ "grad_norm": 26.625,
18947
+ "learning_rate": 2.033898305084746e-06,
18948
+ "loss": 1.4502,
18949
+ "step": 27000
18950
+ },
18951
+ {
18952
+ "epoch": 0.68,
18953
+ "grad_norm": 14.4375,
18954
+ "learning_rate": 2.027118644067797e-06,
18955
+ "loss": 1.3091,
18956
+ "step": 27010
18957
+ },
18958
+ {
18959
+ "epoch": 0.68,
18960
+ "grad_norm": 22.875,
18961
+ "learning_rate": 2.020338983050848e-06,
18962
+ "loss": 1.3883,
18963
+ "step": 27020
18964
+ },
18965
+ {
18966
+ "epoch": 0.68,
18967
+ "grad_norm": 16.0,
18968
+ "learning_rate": 2.0135593220338984e-06,
18969
+ "loss": 1.3933,
18970
+ "step": 27030
18971
+ },
18972
+ {
18973
+ "epoch": 0.68,
18974
+ "grad_norm": 14.3125,
18975
+ "learning_rate": 2.0067796610169494e-06,
18976
+ "loss": 1.4509,
18977
+ "step": 27040
18978
+ },
18979
+ {
18980
+ "epoch": 0.68,
18981
+ "grad_norm": 12.5625,
18982
+ "learning_rate": 2.0000000000000003e-06,
18983
+ "loss": 1.2744,
18984
+ "step": 27050
18985
+ },
18986
+ {
18987
+ "epoch": 0.68,
18988
+ "grad_norm": 43.5,
18989
+ "learning_rate": 1.993220338983051e-06,
18990
+ "loss": 1.2796,
18991
+ "step": 27060
18992
+ },
18993
+ {
18994
+ "epoch": 0.68,
18995
+ "grad_norm": 25.875,
18996
+ "learning_rate": 1.986440677966102e-06,
18997
+ "loss": 1.3602,
18998
+ "step": 27070
18999
+ },
19000
+ {
19001
+ "epoch": 0.68,
19002
+ "grad_norm": 20.5,
19003
+ "learning_rate": 1.9796610169491524e-06,
19004
+ "loss": 1.312,
19005
+ "step": 27080
19006
+ },
19007
+ {
19008
+ "epoch": 0.68,
19009
+ "grad_norm": 15.75,
19010
+ "learning_rate": 1.9728813559322034e-06,
19011
+ "loss": 1.2828,
19012
+ "step": 27090
19013
+ },
19014
+ {
19015
+ "epoch": 0.68,
19016
+ "grad_norm": 43.0,
19017
+ "learning_rate": 1.9661016949152544e-06,
19018
+ "loss": 1.448,
19019
+ "step": 27100
19020
+ },
19021
+ {
19022
+ "epoch": 0.68,
19023
+ "grad_norm": 7.28125,
19024
+ "learning_rate": 1.9593220338983053e-06,
19025
+ "loss": 1.3548,
19026
+ "step": 27110
19027
+ },
19028
+ {
19029
+ "epoch": 0.68,
19030
+ "grad_norm": 20.5,
19031
+ "learning_rate": 1.9525423728813563e-06,
19032
+ "loss": 1.4944,
19033
+ "step": 27120
19034
+ },
19035
+ {
19036
+ "epoch": 0.68,
19037
+ "grad_norm": 59.25,
19038
+ "learning_rate": 1.945762711864407e-06,
19039
+ "loss": 1.4544,
19040
+ "step": 27130
19041
+ },
19042
+ {
19043
+ "epoch": 0.68,
19044
+ "grad_norm": 13.25,
19045
+ "learning_rate": 1.938983050847458e-06,
19046
+ "loss": 1.4506,
19047
+ "step": 27140
19048
+ },
19049
+ {
19050
+ "epoch": 0.68,
19051
+ "grad_norm": 35.5,
19052
+ "learning_rate": 1.932203389830509e-06,
19053
+ "loss": 1.3822,
19054
+ "step": 27150
19055
+ },
19056
+ {
19057
+ "epoch": 0.68,
19058
+ "grad_norm": 23.125,
19059
+ "learning_rate": 1.9254237288135594e-06,
19060
+ "loss": 1.3075,
19061
+ "step": 27160
19062
+ },
19063
+ {
19064
+ "epoch": 0.68,
19065
+ "grad_norm": 27.125,
19066
+ "learning_rate": 1.9186440677966104e-06,
19067
+ "loss": 1.2924,
19068
+ "step": 27170
19069
+ },
19070
+ {
19071
+ "epoch": 0.68,
19072
+ "grad_norm": 12.5625,
19073
+ "learning_rate": 1.9118644067796613e-06,
19074
+ "loss": 1.3947,
19075
+ "step": 27180
19076
+ },
19077
+ {
19078
+ "epoch": 0.68,
19079
+ "grad_norm": 40.75,
19080
+ "learning_rate": 1.9050847457627119e-06,
19081
+ "loss": 1.4555,
19082
+ "step": 27190
19083
+ },
19084
+ {
19085
+ "epoch": 0.68,
19086
+ "grad_norm": 8.9375,
19087
+ "learning_rate": 1.8983050847457629e-06,
19088
+ "loss": 1.3187,
19089
+ "step": 27200
19090
+ },
19091
+ {
19092
+ "epoch": 0.68,
19093
+ "grad_norm": 33.5,
19094
+ "learning_rate": 1.8915254237288136e-06,
19095
+ "loss": 1.2239,
19096
+ "step": 27210
19097
+ },
19098
+ {
19099
+ "epoch": 0.68,
19100
+ "grad_norm": 43.5,
19101
+ "learning_rate": 1.8847457627118646e-06,
19102
+ "loss": 1.4288,
19103
+ "step": 27220
19104
+ },
19105
+ {
19106
+ "epoch": 0.68,
19107
+ "grad_norm": 43.5,
19108
+ "learning_rate": 1.8779661016949156e-06,
19109
+ "loss": 1.2993,
19110
+ "step": 27230
19111
+ },
19112
+ {
19113
+ "epoch": 0.68,
19114
+ "grad_norm": 38.75,
19115
+ "learning_rate": 1.8711864406779661e-06,
19116
+ "loss": 1.4127,
19117
+ "step": 27240
19118
+ },
19119
+ {
19120
+ "epoch": 0.68,
19121
+ "grad_norm": 17.75,
19122
+ "learning_rate": 1.8644067796610171e-06,
19123
+ "loss": 1.3089,
19124
+ "step": 27250
19125
+ },
19126
+ {
19127
+ "epoch": 0.68,
19128
+ "grad_norm": 41.25,
19129
+ "learning_rate": 1.857627118644068e-06,
19130
+ "loss": 1.3116,
19131
+ "step": 27260
19132
+ },
19133
+ {
19134
+ "epoch": 0.68,
19135
+ "grad_norm": 15.5,
19136
+ "learning_rate": 1.8508474576271189e-06,
19137
+ "loss": 1.4069,
19138
+ "step": 27270
19139
+ },
19140
+ {
19141
+ "epoch": 0.68,
19142
+ "grad_norm": 8.375,
19143
+ "learning_rate": 1.8440677966101696e-06,
19144
+ "loss": 1.4496,
19145
+ "step": 27280
19146
+ },
19147
+ {
19148
+ "epoch": 0.68,
19149
+ "grad_norm": 33.0,
19150
+ "learning_rate": 1.8372881355932204e-06,
19151
+ "loss": 1.3556,
19152
+ "step": 27290
19153
+ },
19154
+ {
19155
+ "epoch": 0.68,
19156
+ "grad_norm": 17.375,
19157
+ "learning_rate": 1.8305084745762714e-06,
19158
+ "loss": 1.4954,
19159
+ "step": 27300
19160
+ },
19161
+ {
19162
+ "epoch": 0.68,
19163
+ "grad_norm": 20.25,
19164
+ "learning_rate": 1.8237288135593223e-06,
19165
+ "loss": 1.439,
19166
+ "step": 27310
19167
+ },
19168
+ {
19169
+ "epoch": 0.68,
19170
+ "grad_norm": 34.25,
19171
+ "learning_rate": 1.816949152542373e-06,
19172
+ "loss": 1.1777,
19173
+ "step": 27320
19174
+ },
19175
+ {
19176
+ "epoch": 0.68,
19177
+ "grad_norm": 27.875,
19178
+ "learning_rate": 1.8101694915254239e-06,
19179
+ "loss": 1.4065,
19180
+ "step": 27330
19181
+ },
19182
+ {
19183
+ "epoch": 0.68,
19184
+ "grad_norm": 14.25,
19185
+ "learning_rate": 1.8033898305084746e-06,
19186
+ "loss": 1.2855,
19187
+ "step": 27340
19188
+ },
19189
+ {
19190
+ "epoch": 0.68,
19191
+ "grad_norm": 13.625,
19192
+ "learning_rate": 1.7966101694915256e-06,
19193
+ "loss": 1.2503,
19194
+ "step": 27350
19195
+ },
19196
+ {
19197
+ "epoch": 0.68,
19198
+ "grad_norm": 16.625,
19199
+ "learning_rate": 1.7898305084745766e-06,
19200
+ "loss": 1.3458,
19201
+ "step": 27360
19202
+ },
19203
+ {
19204
+ "epoch": 0.68,
19205
+ "grad_norm": 11.6875,
19206
+ "learning_rate": 1.7830508474576271e-06,
19207
+ "loss": 1.2963,
19208
+ "step": 27370
19209
+ },
19210
+ {
19211
+ "epoch": 0.68,
19212
+ "grad_norm": 23.625,
19213
+ "learning_rate": 1.7762711864406781e-06,
19214
+ "loss": 1.4009,
19215
+ "step": 27380
19216
+ },
19217
+ {
19218
+ "epoch": 0.68,
19219
+ "grad_norm": 33.75,
19220
+ "learning_rate": 1.769491525423729e-06,
19221
+ "loss": 1.444,
19222
+ "step": 27390
19223
+ },
19224
+ {
19225
+ "epoch": 0.69,
19226
+ "grad_norm": 16.25,
19227
+ "learning_rate": 1.7627118644067799e-06,
19228
+ "loss": 1.4343,
19229
+ "step": 27400
19230
+ },
19231
+ {
19232
+ "epoch": 0.69,
19233
+ "grad_norm": 16.0,
19234
+ "learning_rate": 1.7559322033898306e-06,
19235
+ "loss": 1.2348,
19236
+ "step": 27410
19237
+ },
19238
+ {
19239
+ "epoch": 0.69,
19240
+ "grad_norm": 21.25,
19241
+ "learning_rate": 1.7491525423728814e-06,
19242
+ "loss": 1.4386,
19243
+ "step": 27420
19244
+ },
19245
+ {
19246
+ "epoch": 0.69,
19247
+ "grad_norm": 20.625,
19248
+ "learning_rate": 1.7423728813559324e-06,
19249
+ "loss": 1.3253,
19250
+ "step": 27430
19251
+ },
19252
+ {
19253
+ "epoch": 0.69,
19254
+ "grad_norm": 19.875,
19255
+ "learning_rate": 1.7355932203389834e-06,
19256
+ "loss": 1.3057,
19257
+ "step": 27440
19258
+ },
19259
+ {
19260
+ "epoch": 0.69,
19261
+ "grad_norm": 35.75,
19262
+ "learning_rate": 1.728813559322034e-06,
19263
+ "loss": 1.2917,
19264
+ "step": 27450
19265
+ },
19266
+ {
19267
+ "epoch": 0.69,
19268
+ "grad_norm": 29.625,
19269
+ "learning_rate": 1.7220338983050849e-06,
19270
+ "loss": 1.4069,
19271
+ "step": 27460
19272
+ },
19273
+ {
19274
+ "epoch": 0.69,
19275
+ "grad_norm": 72.5,
19276
+ "learning_rate": 1.7152542372881356e-06,
19277
+ "loss": 1.274,
19278
+ "step": 27470
19279
+ },
19280
+ {
19281
+ "epoch": 0.69,
19282
+ "grad_norm": 13.25,
19283
+ "learning_rate": 1.7084745762711866e-06,
19284
+ "loss": 1.4452,
19285
+ "step": 27480
19286
+ },
19287
+ {
19288
+ "epoch": 0.69,
19289
+ "grad_norm": 21.75,
19290
+ "learning_rate": 1.7016949152542376e-06,
19291
+ "loss": 1.3647,
19292
+ "step": 27490
19293
+ },
19294
+ {
19295
+ "epoch": 0.69,
19296
+ "grad_norm": 35.0,
19297
+ "learning_rate": 1.6949152542372882e-06,
19298
+ "loss": 1.3659,
19299
+ "step": 27500
19300
+ },
19301
+ {
19302
+ "epoch": 0.69,
19303
+ "grad_norm": 20.125,
19304
+ "learning_rate": 1.6881355932203391e-06,
19305
+ "loss": 1.3932,
19306
+ "step": 27510
19307
+ },
19308
+ {
19309
+ "epoch": 0.69,
19310
+ "grad_norm": 12.875,
19311
+ "learning_rate": 1.6813559322033901e-06,
19312
+ "loss": 1.2859,
19313
+ "step": 27520
19314
+ },
19315
+ {
19316
+ "epoch": 0.69,
19317
+ "grad_norm": 13.875,
19318
+ "learning_rate": 1.6745762711864409e-06,
19319
+ "loss": 1.48,
19320
+ "step": 27530
19321
+ },
19322
+ {
19323
+ "epoch": 0.69,
19324
+ "grad_norm": 27.5,
19325
+ "learning_rate": 1.6677966101694916e-06,
19326
+ "loss": 1.4621,
19327
+ "step": 27540
19328
+ },
19329
+ {
19330
+ "epoch": 0.69,
19331
+ "grad_norm": 12.25,
19332
+ "learning_rate": 1.6610169491525424e-06,
19333
+ "loss": 1.3061,
19334
+ "step": 27550
19335
+ },
19336
+ {
19337
+ "epoch": 0.69,
19338
+ "grad_norm": 26.625,
19339
+ "learning_rate": 1.6542372881355934e-06,
19340
+ "loss": 1.272,
19341
+ "step": 27560
19342
+ },
19343
+ {
19344
+ "epoch": 0.69,
19345
+ "grad_norm": 15.8125,
19346
+ "learning_rate": 1.6474576271186444e-06,
19347
+ "loss": 1.3301,
19348
+ "step": 27570
19349
+ },
19350
+ {
19351
+ "epoch": 0.69,
19352
+ "grad_norm": 11.625,
19353
+ "learning_rate": 1.640677966101695e-06,
19354
+ "loss": 1.5456,
19355
+ "step": 27580
19356
+ },
19357
+ {
19358
+ "epoch": 0.69,
19359
+ "grad_norm": 17.5,
19360
+ "learning_rate": 1.6338983050847459e-06,
19361
+ "loss": 1.3963,
19362
+ "step": 27590
19363
+ },
19364
+ {
19365
+ "epoch": 0.69,
19366
+ "grad_norm": 34.5,
19367
+ "learning_rate": 1.6271186440677967e-06,
19368
+ "loss": 1.2749,
19369
+ "step": 27600
19370
+ },
19371
+ {
19372
+ "epoch": 0.69,
19373
+ "grad_norm": 21.375,
19374
+ "learning_rate": 1.6203389830508476e-06,
19375
+ "loss": 1.3479,
19376
+ "step": 27610
19377
+ },
19378
+ {
19379
+ "epoch": 0.69,
19380
+ "grad_norm": 21.375,
19381
+ "learning_rate": 1.6135593220338986e-06,
19382
+ "loss": 1.3659,
19383
+ "step": 27620
19384
+ },
19385
+ {
19386
+ "epoch": 0.69,
19387
+ "grad_norm": 20.5,
19388
+ "learning_rate": 1.6067796610169492e-06,
19389
+ "loss": 1.4773,
19390
+ "step": 27630
19391
+ },
19392
+ {
19393
+ "epoch": 0.69,
19394
+ "grad_norm": 27.25,
19395
+ "learning_rate": 1.6000000000000001e-06,
19396
+ "loss": 1.3929,
19397
+ "step": 27640
19398
+ },
19399
+ {
19400
+ "epoch": 0.69,
19401
+ "grad_norm": 20.875,
19402
+ "learning_rate": 1.593220338983051e-06,
19403
+ "loss": 1.2243,
19404
+ "step": 27650
19405
+ },
19406
+ {
19407
+ "epoch": 0.69,
19408
+ "grad_norm": 24.125,
19409
+ "learning_rate": 1.5864406779661019e-06,
19410
+ "loss": 1.3985,
19411
+ "step": 27660
19412
+ },
19413
+ {
19414
+ "epoch": 0.69,
19415
+ "grad_norm": 35.75,
19416
+ "learning_rate": 1.5796610169491526e-06,
19417
+ "loss": 1.3317,
19418
+ "step": 27670
19419
+ },
19420
+ {
19421
+ "epoch": 0.69,
19422
+ "grad_norm": 29.125,
19423
+ "learning_rate": 1.5728813559322034e-06,
19424
+ "loss": 1.3862,
19425
+ "step": 27680
19426
+ },
19427
+ {
19428
+ "epoch": 0.69,
19429
+ "grad_norm": 29.875,
19430
+ "learning_rate": 1.5661016949152544e-06,
19431
+ "loss": 1.2297,
19432
+ "step": 27690
19433
+ },
19434
+ {
19435
+ "epoch": 0.69,
19436
+ "grad_norm": 25.625,
19437
+ "learning_rate": 1.5593220338983054e-06,
19438
+ "loss": 1.2997,
19439
+ "step": 27700
19440
+ },
19441
+ {
19442
+ "epoch": 0.69,
19443
+ "grad_norm": 27.5,
19444
+ "learning_rate": 1.552542372881356e-06,
19445
+ "loss": 1.2711,
19446
+ "step": 27710
19447
+ },
19448
+ {
19449
+ "epoch": 0.69,
19450
+ "grad_norm": 12.9375,
19451
+ "learning_rate": 1.545762711864407e-06,
19452
+ "loss": 1.3008,
19453
+ "step": 27720
19454
+ },
19455
+ {
19456
+ "epoch": 0.69,
19457
+ "grad_norm": 27.875,
19458
+ "learning_rate": 1.5389830508474577e-06,
19459
+ "loss": 1.2319,
19460
+ "step": 27730
19461
+ },
19462
+ {
19463
+ "epoch": 0.69,
19464
+ "grad_norm": 53.75,
19465
+ "learning_rate": 1.5322033898305086e-06,
19466
+ "loss": 1.323,
19467
+ "step": 27740
19468
+ },
19469
+ {
19470
+ "epoch": 0.69,
19471
+ "grad_norm": 24.875,
19472
+ "learning_rate": 1.5254237288135596e-06,
19473
+ "loss": 1.4262,
19474
+ "step": 27750
19475
+ },
19476
+ {
19477
+ "epoch": 0.69,
19478
+ "grad_norm": 23.5,
19479
+ "learning_rate": 1.5186440677966102e-06,
19480
+ "loss": 1.2772,
19481
+ "step": 27760
19482
+ },
19483
+ {
19484
+ "epoch": 0.69,
19485
+ "grad_norm": 23.625,
19486
+ "learning_rate": 1.5118644067796611e-06,
19487
+ "loss": 1.3022,
19488
+ "step": 27770
19489
+ },
19490
+ {
19491
+ "epoch": 0.69,
19492
+ "grad_norm": 53.25,
19493
+ "learning_rate": 1.505084745762712e-06,
19494
+ "loss": 1.3384,
19495
+ "step": 27780
19496
+ },
19497
+ {
19498
+ "epoch": 0.69,
19499
+ "grad_norm": 34.0,
19500
+ "learning_rate": 1.4983050847457629e-06,
19501
+ "loss": 1.335,
19502
+ "step": 27790
19503
+ },
19504
+ {
19505
+ "epoch": 0.69,
19506
+ "grad_norm": 20.75,
19507
+ "learning_rate": 1.4915254237288139e-06,
19508
+ "loss": 1.4021,
19509
+ "step": 27800
19510
+ },
19511
+ {
19512
+ "epoch": 0.7,
19513
+ "grad_norm": 23.0,
19514
+ "learning_rate": 1.4847457627118644e-06,
19515
+ "loss": 1.417,
19516
+ "step": 27810
19517
+ },
19518
+ {
19519
+ "epoch": 0.7,
19520
+ "grad_norm": 39.0,
19521
+ "learning_rate": 1.4779661016949154e-06,
19522
+ "loss": 1.3886,
19523
+ "step": 27820
19524
+ },
19525
+ {
19526
+ "epoch": 0.7,
19527
+ "grad_norm": 30.75,
19528
+ "learning_rate": 1.4711864406779664e-06,
19529
+ "loss": 1.3769,
19530
+ "step": 27830
19531
+ },
19532
+ {
19533
+ "epoch": 0.7,
19534
+ "grad_norm": 27.75,
19535
+ "learning_rate": 1.464406779661017e-06,
19536
+ "loss": 1.2846,
19537
+ "step": 27840
19538
+ },
19539
+ {
19540
+ "epoch": 0.7,
19541
+ "grad_norm": 42.5,
19542
+ "learning_rate": 1.457627118644068e-06,
19543
+ "loss": 1.4831,
19544
+ "step": 27850
19545
+ },
19546
+ {
19547
+ "epoch": 0.7,
19548
+ "grad_norm": 24.25,
19549
+ "learning_rate": 1.4508474576271187e-06,
19550
+ "loss": 1.3099,
19551
+ "step": 27860
19552
+ },
19553
+ {
19554
+ "epoch": 0.7,
19555
+ "grad_norm": 27.0,
19556
+ "learning_rate": 1.4440677966101696e-06,
19557
+ "loss": 1.3877,
19558
+ "step": 27870
19559
+ },
19560
+ {
19561
+ "epoch": 0.7,
19562
+ "grad_norm": 15.75,
19563
+ "learning_rate": 1.4372881355932206e-06,
19564
+ "loss": 1.4706,
19565
+ "step": 27880
19566
+ },
19567
+ {
19568
+ "epoch": 0.7,
19569
+ "grad_norm": 17.875,
19570
+ "learning_rate": 1.4305084745762712e-06,
19571
+ "loss": 1.2598,
19572
+ "step": 27890
19573
+ },
19574
+ {
19575
+ "epoch": 0.7,
19576
+ "grad_norm": 14.5625,
19577
+ "learning_rate": 1.4237288135593222e-06,
19578
+ "loss": 1.3585,
19579
+ "step": 27900
19580
+ },
19581
+ {
19582
+ "epoch": 0.7,
19583
+ "grad_norm": 29.125,
19584
+ "learning_rate": 1.416949152542373e-06,
19585
+ "loss": 1.3198,
19586
+ "step": 27910
19587
+ },
19588
+ {
19589
+ "epoch": 0.7,
19590
+ "grad_norm": 43.0,
19591
+ "learning_rate": 1.410169491525424e-06,
19592
+ "loss": 1.3106,
19593
+ "step": 27920
19594
+ },
19595
+ {
19596
+ "epoch": 0.7,
19597
+ "grad_norm": 9.25,
19598
+ "learning_rate": 1.4033898305084749e-06,
19599
+ "loss": 1.2443,
19600
+ "step": 27930
19601
+ },
19602
+ {
19603
+ "epoch": 0.7,
19604
+ "grad_norm": 9.5,
19605
+ "learning_rate": 1.3966101694915254e-06,
19606
+ "loss": 1.4339,
19607
+ "step": 27940
19608
+ },
19609
+ {
19610
+ "epoch": 0.7,
19611
+ "grad_norm": 29.125,
19612
+ "learning_rate": 1.3898305084745764e-06,
19613
+ "loss": 1.3682,
19614
+ "step": 27950
19615
+ },
19616
+ {
19617
+ "epoch": 0.7,
19618
+ "grad_norm": 21.875,
19619
+ "learning_rate": 1.3830508474576274e-06,
19620
+ "loss": 1.3385,
19621
+ "step": 27960
19622
+ },
19623
+ {
19624
+ "epoch": 0.7,
19625
+ "grad_norm": 33.5,
19626
+ "learning_rate": 1.376271186440678e-06,
19627
+ "loss": 1.2603,
19628
+ "step": 27970
19629
+ },
19630
+ {
19631
+ "epoch": 0.7,
19632
+ "grad_norm": 38.25,
19633
+ "learning_rate": 1.369491525423729e-06,
19634
+ "loss": 1.3336,
19635
+ "step": 27980
19636
+ },
19637
+ {
19638
+ "epoch": 0.7,
19639
+ "grad_norm": 18.125,
19640
+ "learning_rate": 1.3627118644067797e-06,
19641
+ "loss": 1.291,
19642
+ "step": 27990
19643
+ },
19644
+ {
19645
+ "epoch": 0.7,
19646
+ "grad_norm": 72.0,
19647
+ "learning_rate": 1.3559322033898307e-06,
19648
+ "loss": 1.3963,
19649
+ "step": 28000
19650
+ },
19651
+ {
19652
+ "epoch": 0.7,
19653
+ "grad_norm": 17.125,
19654
+ "learning_rate": 1.3491525423728816e-06,
19655
+ "loss": 1.2018,
19656
+ "step": 28010
19657
+ },
19658
+ {
19659
+ "epoch": 0.7,
19660
+ "grad_norm": 18.375,
19661
+ "learning_rate": 1.3423728813559322e-06,
19662
+ "loss": 1.4179,
19663
+ "step": 28020
19664
+ },
19665
+ {
19666
+ "epoch": 0.7,
19667
+ "grad_norm": 16.25,
19668
+ "learning_rate": 1.3355932203389832e-06,
19669
+ "loss": 1.1736,
19670
+ "step": 28030
19671
+ },
19672
+ {
19673
+ "epoch": 0.7,
19674
+ "grad_norm": 34.0,
19675
+ "learning_rate": 1.328813559322034e-06,
19676
+ "loss": 1.4772,
19677
+ "step": 28040
19678
+ },
19679
+ {
19680
+ "epoch": 0.7,
19681
+ "grad_norm": 39.75,
19682
+ "learning_rate": 1.322033898305085e-06,
19683
+ "loss": 1.4616,
19684
+ "step": 28050
19685
+ },
19686
+ {
19687
+ "epoch": 0.7,
19688
+ "grad_norm": 8.8125,
19689
+ "learning_rate": 1.3152542372881359e-06,
19690
+ "loss": 1.3562,
19691
+ "step": 28060
19692
+ },
19693
+ {
19694
+ "epoch": 0.7,
19695
+ "grad_norm": 11.25,
19696
+ "learning_rate": 1.3084745762711864e-06,
19697
+ "loss": 1.2376,
19698
+ "step": 28070
19699
+ },
19700
+ {
19701
+ "epoch": 0.7,
19702
+ "grad_norm": 25.5,
19703
+ "learning_rate": 1.3016949152542374e-06,
19704
+ "loss": 1.4133,
19705
+ "step": 28080
19706
+ },
19707
+ {
19708
+ "epoch": 0.7,
19709
+ "grad_norm": 33.5,
19710
+ "learning_rate": 1.2949152542372884e-06,
19711
+ "loss": 1.303,
19712
+ "step": 28090
19713
+ },
19714
+ {
19715
+ "epoch": 0.7,
19716
+ "grad_norm": 26.375,
19717
+ "learning_rate": 1.288135593220339e-06,
19718
+ "loss": 1.2879,
19719
+ "step": 28100
19720
+ },
19721
+ {
19722
+ "epoch": 0.7,
19723
+ "grad_norm": 15.625,
19724
+ "learning_rate": 1.28135593220339e-06,
19725
+ "loss": 1.3765,
19726
+ "step": 28110
19727
+ },
19728
+ {
19729
+ "epoch": 0.7,
19730
+ "grad_norm": 19.375,
19731
+ "learning_rate": 1.2745762711864407e-06,
19732
+ "loss": 1.4118,
19733
+ "step": 28120
19734
+ },
19735
+ {
19736
+ "epoch": 0.7,
19737
+ "grad_norm": 46.25,
19738
+ "learning_rate": 1.2677966101694917e-06,
19739
+ "loss": 1.191,
19740
+ "step": 28130
19741
+ },
19742
+ {
19743
+ "epoch": 0.7,
19744
+ "grad_norm": 25.25,
19745
+ "learning_rate": 1.2610169491525426e-06,
19746
+ "loss": 1.5409,
19747
+ "step": 28140
19748
+ },
19749
+ {
19750
+ "epoch": 0.7,
19751
+ "grad_norm": 24.5,
19752
+ "learning_rate": 1.2542372881355932e-06,
19753
+ "loss": 1.4233,
19754
+ "step": 28150
19755
+ },
19756
+ {
19757
+ "epoch": 0.7,
19758
+ "grad_norm": 22.0,
19759
+ "learning_rate": 1.2474576271186442e-06,
19760
+ "loss": 1.2828,
19761
+ "step": 28160
19762
+ },
19763
+ {
19764
+ "epoch": 0.7,
19765
+ "grad_norm": 21.5,
19766
+ "learning_rate": 1.240677966101695e-06,
19767
+ "loss": 1.3378,
19768
+ "step": 28170
19769
+ },
19770
+ {
19771
+ "epoch": 0.7,
19772
+ "grad_norm": 18.375,
19773
+ "learning_rate": 1.233898305084746e-06,
19774
+ "loss": 1.3357,
19775
+ "step": 28180
19776
+ },
19777
+ {
19778
+ "epoch": 0.7,
19779
+ "grad_norm": 42.5,
19780
+ "learning_rate": 1.2271186440677967e-06,
19781
+ "loss": 1.4315,
19782
+ "step": 28190
19783
+ },
19784
+ {
19785
+ "epoch": 0.7,
19786
+ "grad_norm": 19.5,
19787
+ "learning_rate": 1.2203389830508477e-06,
19788
+ "loss": 1.392,
19789
+ "step": 28200
19790
+ },
19791
+ {
19792
+ "epoch": 0.71,
19793
+ "grad_norm": 18.0,
19794
+ "learning_rate": 1.2135593220338984e-06,
19795
+ "loss": 1.4079,
19796
+ "step": 28210
19797
+ },
19798
+ {
19799
+ "epoch": 0.71,
19800
+ "grad_norm": 16.875,
19801
+ "learning_rate": 1.2067796610169492e-06,
19802
+ "loss": 1.2263,
19803
+ "step": 28220
19804
+ },
19805
+ {
19806
+ "epoch": 0.71,
19807
+ "grad_norm": 13.8125,
19808
+ "learning_rate": 1.2000000000000002e-06,
19809
+ "loss": 1.2481,
19810
+ "step": 28230
19811
+ },
19812
+ {
19813
+ "epoch": 0.71,
19814
+ "grad_norm": 6.84375,
19815
+ "learning_rate": 1.193220338983051e-06,
19816
+ "loss": 1.3511,
19817
+ "step": 28240
19818
+ },
19819
+ {
19820
+ "epoch": 0.71,
19821
+ "grad_norm": 24.875,
19822
+ "learning_rate": 1.186440677966102e-06,
19823
+ "loss": 1.3507,
19824
+ "step": 28250
19825
+ },
19826
+ {
19827
+ "epoch": 0.71,
19828
+ "grad_norm": 32.25,
19829
+ "learning_rate": 1.1796610169491527e-06,
19830
+ "loss": 1.2826,
19831
+ "step": 28260
19832
+ },
19833
+ {
19834
+ "epoch": 0.71,
19835
+ "grad_norm": 15.4375,
19836
+ "learning_rate": 1.1728813559322034e-06,
19837
+ "loss": 1.4907,
19838
+ "step": 28270
19839
+ },
19840
+ {
19841
+ "epoch": 0.71,
19842
+ "grad_norm": 12.4375,
19843
+ "learning_rate": 1.1661016949152542e-06,
19844
+ "loss": 1.5181,
19845
+ "step": 28280
19846
+ },
19847
+ {
19848
+ "epoch": 0.71,
19849
+ "grad_norm": 27.125,
19850
+ "learning_rate": 1.1593220338983052e-06,
19851
+ "loss": 1.2625,
19852
+ "step": 28290
19853
+ },
19854
+ {
19855
+ "epoch": 0.71,
19856
+ "grad_norm": 14.5,
19857
+ "learning_rate": 1.152542372881356e-06,
19858
+ "loss": 1.199,
19859
+ "step": 28300
19860
+ },
19861
+ {
19862
+ "epoch": 0.71,
19863
+ "grad_norm": 23.75,
19864
+ "learning_rate": 1.145762711864407e-06,
19865
+ "loss": 1.3387,
19866
+ "step": 28310
19867
+ },
19868
+ {
19869
+ "epoch": 0.71,
19870
+ "grad_norm": 8.875,
19871
+ "learning_rate": 1.1389830508474577e-06,
19872
+ "loss": 1.3493,
19873
+ "step": 28320
19874
+ },
19875
+ {
19876
+ "epoch": 0.71,
19877
+ "grad_norm": 19.75,
19878
+ "learning_rate": 1.1322033898305087e-06,
19879
+ "loss": 1.3027,
19880
+ "step": 28330
19881
+ },
19882
+ {
19883
+ "epoch": 0.71,
19884
+ "grad_norm": 51.5,
19885
+ "learning_rate": 1.1254237288135594e-06,
19886
+ "loss": 1.3349,
19887
+ "step": 28340
19888
+ },
19889
+ {
19890
+ "epoch": 0.71,
19891
+ "grad_norm": 78.5,
19892
+ "learning_rate": 1.1186440677966102e-06,
19893
+ "loss": 1.2642,
19894
+ "step": 28350
19895
+ },
19896
+ {
19897
+ "epoch": 0.71,
19898
+ "grad_norm": 37.25,
19899
+ "learning_rate": 1.1118644067796612e-06,
19900
+ "loss": 1.3023,
19901
+ "step": 28360
19902
+ },
19903
+ {
19904
+ "epoch": 0.71,
19905
+ "grad_norm": 51.0,
19906
+ "learning_rate": 1.105084745762712e-06,
19907
+ "loss": 1.3719,
19908
+ "step": 28370
19909
+ },
19910
+ {
19911
+ "epoch": 0.71,
19912
+ "grad_norm": 16.5,
19913
+ "learning_rate": 1.098305084745763e-06,
19914
+ "loss": 1.2886,
19915
+ "step": 28380
19916
+ },
19917
+ {
19918
+ "epoch": 0.71,
19919
+ "grad_norm": 31.25,
19920
+ "learning_rate": 1.0915254237288137e-06,
19921
+ "loss": 1.2252,
19922
+ "step": 28390
19923
+ },
19924
+ {
19925
+ "epoch": 0.71,
19926
+ "grad_norm": 26.875,
19927
+ "learning_rate": 1.0847457627118644e-06,
19928
+ "loss": 1.2735,
19929
+ "step": 28400
19930
+ },
19931
+ {
19932
+ "epoch": 0.71,
19933
+ "grad_norm": 17.75,
19934
+ "learning_rate": 1.0779661016949152e-06,
19935
+ "loss": 1.4295,
19936
+ "step": 28410
19937
+ },
19938
+ {
19939
+ "epoch": 0.71,
19940
+ "grad_norm": 22.25,
19941
+ "learning_rate": 1.0711864406779662e-06,
19942
+ "loss": 1.1851,
19943
+ "step": 28420
19944
+ },
19945
+ {
19946
+ "epoch": 0.71,
19947
+ "grad_norm": 10.4375,
19948
+ "learning_rate": 1.064406779661017e-06,
19949
+ "loss": 1.3764,
19950
+ "step": 28430
19951
+ },
19952
+ {
19953
+ "epoch": 0.71,
19954
+ "grad_norm": 28.875,
19955
+ "learning_rate": 1.057627118644068e-06,
19956
+ "loss": 1.2924,
19957
+ "step": 28440
19958
+ },
19959
+ {
19960
+ "epoch": 0.71,
19961
+ "grad_norm": 23.375,
19962
+ "learning_rate": 1.0508474576271187e-06,
19963
+ "loss": 1.4333,
19964
+ "step": 28450
19965
+ },
19966
+ {
19967
+ "epoch": 0.71,
19968
+ "grad_norm": 26.5,
19969
+ "learning_rate": 1.0440677966101697e-06,
19970
+ "loss": 1.3717,
19971
+ "step": 28460
19972
+ },
19973
+ {
19974
+ "epoch": 0.71,
19975
+ "grad_norm": 44.75,
19976
+ "learning_rate": 1.0372881355932204e-06,
19977
+ "loss": 1.2711,
19978
+ "step": 28470
19979
+ },
19980
+ {
19981
+ "epoch": 0.71,
19982
+ "grad_norm": 16.5,
19983
+ "learning_rate": 1.0305084745762712e-06,
19984
+ "loss": 1.3858,
19985
+ "step": 28480
19986
+ },
19987
+ {
19988
+ "epoch": 0.71,
19989
+ "grad_norm": 28.875,
19990
+ "learning_rate": 1.0237288135593222e-06,
19991
+ "loss": 1.4771,
19992
+ "step": 28490
19993
+ },
19994
+ {
19995
+ "epoch": 0.71,
19996
+ "grad_norm": 15.375,
19997
+ "learning_rate": 1.016949152542373e-06,
19998
+ "loss": 1.3463,
19999
+ "step": 28500
20000
+ },
20001
+ {
20002
+ "epoch": 0.71,
20003
+ "grad_norm": 57.75,
20004
+ "learning_rate": 1.010169491525424e-06,
20005
+ "loss": 1.292,
20006
+ "step": 28510
20007
+ },
20008
+ {
20009
+ "epoch": 0.71,
20010
+ "grad_norm": 21.0,
20011
+ "learning_rate": 1.0033898305084747e-06,
20012
+ "loss": 1.2532,
20013
+ "step": 28520
20014
+ },
20015
+ {
20016
+ "epoch": 0.71,
20017
+ "grad_norm": 25.0,
20018
+ "learning_rate": 9.966101694915254e-07,
20019
+ "loss": 1.4307,
20020
+ "step": 28530
20021
+ },
20022
+ {
20023
+ "epoch": 0.71,
20024
+ "grad_norm": 21.375,
20025
+ "learning_rate": 9.898305084745762e-07,
20026
+ "loss": 1.5372,
20027
+ "step": 28540
20028
+ },
20029
+ {
20030
+ "epoch": 0.71,
20031
+ "grad_norm": 24.875,
20032
+ "learning_rate": 9.830508474576272e-07,
20033
+ "loss": 1.5059,
20034
+ "step": 28550
20035
+ },
20036
+ {
20037
+ "epoch": 0.71,
20038
+ "grad_norm": 62.5,
20039
+ "learning_rate": 9.762711864406782e-07,
20040
+ "loss": 1.262,
20041
+ "step": 28560
20042
+ },
20043
+ {
20044
+ "epoch": 0.71,
20045
+ "grad_norm": 29.125,
20046
+ "learning_rate": 9.69491525423729e-07,
20047
+ "loss": 1.2998,
20048
+ "step": 28570
20049
+ },
20050
+ {
20051
+ "epoch": 0.71,
20052
+ "grad_norm": 25.875,
20053
+ "learning_rate": 9.627118644067797e-07,
20054
+ "loss": 1.3971,
20055
+ "step": 28580
20056
+ },
20057
+ {
20058
+ "epoch": 0.71,
20059
+ "grad_norm": 33.25,
20060
+ "learning_rate": 9.559322033898307e-07,
20061
+ "loss": 1.4679,
20062
+ "step": 28590
20063
+ },
20064
+ {
20065
+ "epoch": 0.71,
20066
+ "grad_norm": 12.125,
20067
+ "learning_rate": 9.491525423728814e-07,
20068
+ "loss": 1.3046,
20069
+ "step": 28600
20070
+ },
20071
+ {
20072
+ "epoch": 0.72,
20073
+ "grad_norm": 37.25,
20074
+ "learning_rate": 9.423728813559323e-07,
20075
+ "loss": 1.3883,
20076
+ "step": 28610
20077
+ },
20078
+ {
20079
+ "epoch": 0.72,
20080
+ "grad_norm": 62.75,
20081
+ "learning_rate": 9.355932203389831e-07,
20082
+ "loss": 1.1382,
20083
+ "step": 28620
20084
+ },
20085
+ {
20086
+ "epoch": 0.72,
20087
+ "grad_norm": 27.375,
20088
+ "learning_rate": 9.28813559322034e-07,
20089
+ "loss": 1.361,
20090
+ "step": 28630
20091
+ },
20092
+ {
20093
+ "epoch": 0.72,
20094
+ "grad_norm": 36.75,
20095
+ "learning_rate": 9.220338983050848e-07,
20096
+ "loss": 1.3811,
20097
+ "step": 28640
20098
+ },
20099
+ {
20100
+ "epoch": 0.72,
20101
+ "grad_norm": 10.1875,
20102
+ "learning_rate": 9.152542372881357e-07,
20103
+ "loss": 1.3962,
20104
+ "step": 28650
20105
+ },
20106
+ {
20107
+ "epoch": 0.72,
20108
+ "grad_norm": 38.75,
20109
+ "learning_rate": 9.084745762711864e-07,
20110
+ "loss": 1.3429,
20111
+ "step": 28660
20112
+ },
20113
+ {
20114
+ "epoch": 0.72,
20115
+ "grad_norm": 19.625,
20116
+ "learning_rate": 9.016949152542373e-07,
20117
+ "loss": 1.2986,
20118
+ "step": 28670
20119
+ },
20120
+ {
20121
+ "epoch": 0.72,
20122
+ "grad_norm": 33.75,
20123
+ "learning_rate": 8.949152542372883e-07,
20124
+ "loss": 1.3135,
20125
+ "step": 28680
20126
+ },
20127
+ {
20128
+ "epoch": 0.72,
20129
+ "grad_norm": 24.75,
20130
+ "learning_rate": 8.881355932203391e-07,
20131
+ "loss": 1.4679,
20132
+ "step": 28690
20133
+ },
20134
+ {
20135
+ "epoch": 0.72,
20136
+ "grad_norm": 7.21875,
20137
+ "learning_rate": 8.813559322033899e-07,
20138
+ "loss": 1.4693,
20139
+ "step": 28700
20140
+ },
20141
+ {
20142
+ "epoch": 0.72,
20143
+ "grad_norm": 19.125,
20144
+ "learning_rate": 8.745762711864407e-07,
20145
+ "loss": 1.3307,
20146
+ "step": 28710
20147
+ },
20148
+ {
20149
+ "epoch": 0.72,
20150
+ "grad_norm": 17.0,
20151
+ "learning_rate": 8.677966101694917e-07,
20152
+ "loss": 1.4457,
20153
+ "step": 28720
20154
+ },
20155
+ {
20156
+ "epoch": 0.72,
20157
+ "grad_norm": 5.75,
20158
+ "learning_rate": 8.610169491525424e-07,
20159
+ "loss": 1.282,
20160
+ "step": 28730
20161
+ },
20162
+ {
20163
+ "epoch": 0.72,
20164
+ "grad_norm": 18.625,
20165
+ "learning_rate": 8.542372881355933e-07,
20166
+ "loss": 1.3471,
20167
+ "step": 28740
20168
+ },
20169
+ {
20170
+ "epoch": 0.72,
20171
+ "grad_norm": 24.375,
20172
+ "learning_rate": 8.474576271186441e-07,
20173
+ "loss": 1.2901,
20174
+ "step": 28750
20175
+ },
20176
+ {
20177
+ "epoch": 0.72,
20178
+ "grad_norm": 13.9375,
20179
+ "learning_rate": 8.406779661016951e-07,
20180
+ "loss": 1.2879,
20181
+ "step": 28760
20182
+ },
20183
+ {
20184
+ "epoch": 0.72,
20185
+ "grad_norm": 40.5,
20186
+ "learning_rate": 8.338983050847458e-07,
20187
+ "loss": 1.4003,
20188
+ "step": 28770
20189
+ },
20190
+ {
20191
+ "epoch": 0.72,
20192
+ "grad_norm": 40.5,
20193
+ "learning_rate": 8.271186440677967e-07,
20194
+ "loss": 1.3961,
20195
+ "step": 28780
20196
+ },
20197
+ {
20198
+ "epoch": 0.72,
20199
+ "grad_norm": 36.5,
20200
+ "learning_rate": 8.203389830508475e-07,
20201
+ "loss": 1.3868,
20202
+ "step": 28790
20203
+ },
20204
+ {
20205
+ "epoch": 0.72,
20206
+ "grad_norm": 9.875,
20207
+ "learning_rate": 8.135593220338983e-07,
20208
+ "loss": 1.4772,
20209
+ "step": 28800
20210
+ },
20211
+ {
20212
+ "epoch": 0.72,
20213
+ "grad_norm": 24.0,
20214
+ "learning_rate": 8.067796610169493e-07,
20215
+ "loss": 1.3712,
20216
+ "step": 28810
20217
+ },
20218
+ {
20219
+ "epoch": 0.72,
20220
+ "grad_norm": 12.9375,
20221
+ "learning_rate": 8.000000000000001e-07,
20222
+ "loss": 1.3414,
20223
+ "step": 28820
20224
+ },
20225
+ {
20226
+ "epoch": 0.72,
20227
+ "grad_norm": 9.75,
20228
+ "learning_rate": 7.932203389830509e-07,
20229
+ "loss": 1.2895,
20230
+ "step": 28830
20231
+ },
20232
+ {
20233
+ "epoch": 0.72,
20234
+ "grad_norm": 41.5,
20235
+ "learning_rate": 7.864406779661017e-07,
20236
+ "loss": 1.438,
20237
+ "step": 28840
20238
+ },
20239
+ {
20240
+ "epoch": 0.72,
20241
+ "grad_norm": 14.5,
20242
+ "learning_rate": 7.796610169491527e-07,
20243
+ "loss": 1.289,
20244
+ "step": 28850
20245
+ },
20246
+ {
20247
+ "epoch": 0.72,
20248
+ "grad_norm": 20.125,
20249
+ "learning_rate": 7.728813559322034e-07,
20250
+ "loss": 1.4131,
20251
+ "step": 28860
20252
+ },
20253
+ {
20254
+ "epoch": 0.72,
20255
+ "grad_norm": 31.25,
20256
+ "learning_rate": 7.661016949152543e-07,
20257
+ "loss": 1.2683,
20258
+ "step": 28870
20259
+ },
20260
+ {
20261
+ "epoch": 0.72,
20262
+ "grad_norm": 18.75,
20263
+ "learning_rate": 7.593220338983051e-07,
20264
+ "loss": 1.2794,
20265
+ "step": 28880
20266
+ },
20267
+ {
20268
+ "epoch": 0.72,
20269
+ "grad_norm": 31.375,
20270
+ "learning_rate": 7.52542372881356e-07,
20271
+ "loss": 1.36,
20272
+ "step": 28890
20273
+ },
20274
+ {
20275
+ "epoch": 0.72,
20276
+ "grad_norm": 18.5,
20277
+ "learning_rate": 7.457627118644069e-07,
20278
+ "loss": 1.25,
20279
+ "step": 28900
20280
+ },
20281
+ {
20282
+ "epoch": 0.72,
20283
+ "grad_norm": 33.5,
20284
+ "learning_rate": 7.389830508474577e-07,
20285
+ "loss": 1.2343,
20286
+ "step": 28910
20287
+ },
20288
+ {
20289
+ "epoch": 0.72,
20290
+ "grad_norm": 39.75,
20291
+ "learning_rate": 7.322033898305085e-07,
20292
+ "loss": 1.3837,
20293
+ "step": 28920
20294
+ },
20295
+ {
20296
+ "epoch": 0.72,
20297
+ "grad_norm": 21.125,
20298
+ "learning_rate": 7.254237288135593e-07,
20299
+ "loss": 1.4617,
20300
+ "step": 28930
20301
+ },
20302
+ {
20303
+ "epoch": 0.72,
20304
+ "grad_norm": 31.875,
20305
+ "learning_rate": 7.186440677966103e-07,
20306
+ "loss": 1.3333,
20307
+ "step": 28940
20308
+ },
20309
+ {
20310
+ "epoch": 0.72,
20311
+ "grad_norm": 19.875,
20312
+ "learning_rate": 7.118644067796611e-07,
20313
+ "loss": 1.3642,
20314
+ "step": 28950
20315
+ },
20316
+ {
20317
+ "epoch": 0.72,
20318
+ "grad_norm": 11.4375,
20319
+ "learning_rate": 7.05084745762712e-07,
20320
+ "loss": 1.3345,
20321
+ "step": 28960
20322
+ },
20323
+ {
20324
+ "epoch": 0.72,
20325
+ "grad_norm": 43.25,
20326
+ "learning_rate": 6.983050847457627e-07,
20327
+ "loss": 1.2499,
20328
+ "step": 28970
20329
+ },
20330
+ {
20331
+ "epoch": 0.72,
20332
+ "grad_norm": 10.9375,
20333
+ "learning_rate": 6.915254237288137e-07,
20334
+ "loss": 1.3462,
20335
+ "step": 28980
20336
+ },
20337
+ {
20338
+ "epoch": 0.72,
20339
+ "grad_norm": 21.25,
20340
+ "learning_rate": 6.847457627118645e-07,
20341
+ "loss": 1.3835,
20342
+ "step": 28990
20343
+ },
20344
+ {
20345
+ "epoch": 0.72,
20346
+ "grad_norm": 17.0,
20347
+ "learning_rate": 6.779661016949153e-07,
20348
+ "loss": 1.3421,
20349
+ "step": 29000
20350
+ },
20351
+ {
20352
+ "epoch": 0.73,
20353
+ "grad_norm": 29.875,
20354
+ "learning_rate": 6.711864406779661e-07,
20355
+ "loss": 1.3094,
20356
+ "step": 29010
20357
+ },
20358
+ {
20359
+ "epoch": 0.73,
20360
+ "grad_norm": 15.8125,
20361
+ "learning_rate": 6.64406779661017e-07,
20362
+ "loss": 1.3585,
20363
+ "step": 29020
20364
+ },
20365
+ {
20366
+ "epoch": 0.73,
20367
+ "grad_norm": 13.8125,
20368
+ "learning_rate": 6.576271186440679e-07,
20369
+ "loss": 1.4441,
20370
+ "step": 29030
20371
+ },
20372
+ {
20373
+ "epoch": 0.73,
20374
+ "grad_norm": 28.25,
20375
+ "learning_rate": 6.508474576271187e-07,
20376
+ "loss": 1.3683,
20377
+ "step": 29040
20378
+ },
20379
+ {
20380
+ "epoch": 0.73,
20381
+ "grad_norm": 22.75,
20382
+ "learning_rate": 6.440677966101695e-07,
20383
+ "loss": 1.2842,
20384
+ "step": 29050
20385
+ },
20386
+ {
20387
+ "epoch": 0.73,
20388
+ "grad_norm": 25.875,
20389
+ "learning_rate": 6.372881355932203e-07,
20390
+ "loss": 1.3392,
20391
+ "step": 29060
20392
+ },
20393
+ {
20394
+ "epoch": 0.73,
20395
+ "grad_norm": 15.75,
20396
+ "learning_rate": 6.305084745762713e-07,
20397
+ "loss": 1.3556,
20398
+ "step": 29070
20399
+ },
20400
+ {
20401
+ "epoch": 0.73,
20402
+ "grad_norm": 16.75,
20403
+ "learning_rate": 6.237288135593221e-07,
20404
+ "loss": 1.221,
20405
+ "step": 29080
20406
+ },
20407
+ {
20408
+ "epoch": 0.73,
20409
+ "grad_norm": 18.375,
20410
+ "learning_rate": 6.16949152542373e-07,
20411
+ "loss": 1.4398,
20412
+ "step": 29090
20413
+ },
20414
+ {
20415
+ "epoch": 0.73,
20416
+ "grad_norm": 29.625,
20417
+ "learning_rate": 6.101694915254238e-07,
20418
+ "loss": 1.5108,
20419
+ "step": 29100
20420
+ },
20421
+ {
20422
+ "epoch": 0.73,
20423
+ "grad_norm": 41.5,
20424
+ "learning_rate": 6.033898305084746e-07,
20425
+ "loss": 1.0557,
20426
+ "step": 29110
20427
+ },
20428
+ {
20429
+ "epoch": 0.73,
20430
+ "grad_norm": 21.75,
20431
+ "learning_rate": 5.966101694915255e-07,
20432
+ "loss": 1.2235,
20433
+ "step": 29120
20434
+ },
20435
+ {
20436
+ "epoch": 0.73,
20437
+ "grad_norm": 18.875,
20438
+ "learning_rate": 5.898305084745763e-07,
20439
+ "loss": 1.3943,
20440
+ "step": 29130
20441
+ },
20442
+ {
20443
+ "epoch": 0.73,
20444
+ "grad_norm": 12.9375,
20445
+ "learning_rate": 5.830508474576271e-07,
20446
+ "loss": 1.5561,
20447
+ "step": 29140
20448
+ },
20449
+ {
20450
+ "epoch": 0.73,
20451
+ "grad_norm": 36.5,
20452
+ "learning_rate": 5.76271186440678e-07,
20453
+ "loss": 1.3348,
20454
+ "step": 29150
20455
+ },
20456
+ {
20457
+ "epoch": 0.73,
20458
+ "grad_norm": 31.0,
20459
+ "learning_rate": 5.694915254237288e-07,
20460
+ "loss": 1.4021,
20461
+ "step": 29160
20462
+ },
20463
+ {
20464
+ "epoch": 0.73,
20465
+ "grad_norm": 23.0,
20466
+ "learning_rate": 5.627118644067797e-07,
20467
+ "loss": 1.2516,
20468
+ "step": 29170
20469
+ },
20470
+ {
20471
+ "epoch": 0.73,
20472
+ "grad_norm": 53.5,
20473
+ "learning_rate": 5.559322033898306e-07,
20474
+ "loss": 1.3736,
20475
+ "step": 29180
20476
+ },
20477
+ {
20478
+ "epoch": 0.73,
20479
+ "grad_norm": 21.375,
20480
+ "learning_rate": 5.491525423728815e-07,
20481
+ "loss": 1.3918,
20482
+ "step": 29190
20483
+ },
20484
+ {
20485
+ "epoch": 0.73,
20486
+ "grad_norm": 59.25,
20487
+ "learning_rate": 5.423728813559322e-07,
20488
+ "loss": 1.2499,
20489
+ "step": 29200
20490
+ },
20491
+ {
20492
+ "epoch": 0.73,
20493
+ "grad_norm": 33.5,
20494
+ "learning_rate": 5.355932203389831e-07,
20495
+ "loss": 1.4177,
20496
+ "step": 29210
20497
+ },
20498
+ {
20499
+ "epoch": 0.73,
20500
+ "grad_norm": 26.25,
20501
+ "learning_rate": 5.28813559322034e-07,
20502
+ "loss": 1.2788,
20503
+ "step": 29220
20504
+ },
20505
+ {
20506
+ "epoch": 0.73,
20507
+ "grad_norm": 27.0,
20508
+ "learning_rate": 5.220338983050848e-07,
20509
+ "loss": 1.4001,
20510
+ "step": 29230
20511
+ },
20512
+ {
20513
+ "epoch": 0.73,
20514
+ "grad_norm": 24.75,
20515
+ "learning_rate": 5.152542372881356e-07,
20516
+ "loss": 1.3464,
20517
+ "step": 29240
20518
+ },
20519
+ {
20520
+ "epoch": 0.73,
20521
+ "grad_norm": 13.0,
20522
+ "learning_rate": 5.084745762711865e-07,
20523
+ "loss": 1.2711,
20524
+ "step": 29250
20525
+ },
20526
+ {
20527
+ "epoch": 0.73,
20528
+ "grad_norm": 8.875,
20529
+ "learning_rate": 5.016949152542373e-07,
20530
+ "loss": 1.3788,
20531
+ "step": 29260
20532
+ },
20533
+ {
20534
+ "epoch": 0.73,
20535
+ "grad_norm": 29.25,
20536
+ "learning_rate": 4.949152542372881e-07,
20537
+ "loss": 1.3081,
20538
+ "step": 29270
20539
+ },
20540
+ {
20541
+ "epoch": 0.73,
20542
+ "grad_norm": 65.0,
20543
+ "learning_rate": 4.881355932203391e-07,
20544
+ "loss": 1.1399,
20545
+ "step": 29280
20546
+ },
20547
+ {
20548
+ "epoch": 0.73,
20549
+ "grad_norm": 65.5,
20550
+ "learning_rate": 4.813559322033898e-07,
20551
+ "loss": 1.1244,
20552
+ "step": 29290
20553
+ },
20554
+ {
20555
+ "epoch": 0.73,
20556
+ "grad_norm": 14.8125,
20557
+ "learning_rate": 4.745762711864407e-07,
20558
+ "loss": 1.2739,
20559
+ "step": 29300
20560
+ },
20561
+ {
20562
+ "epoch": 0.73,
20563
+ "grad_norm": 18.125,
20564
+ "learning_rate": 4.6779661016949154e-07,
20565
+ "loss": 1.3434,
20566
+ "step": 29310
20567
+ },
20568
+ {
20569
+ "epoch": 0.73,
20570
+ "grad_norm": 47.0,
20571
+ "learning_rate": 4.610169491525424e-07,
20572
+ "loss": 1.3668,
20573
+ "step": 29320
20574
+ },
20575
+ {
20576
+ "epoch": 0.73,
20577
+ "grad_norm": 16.875,
20578
+ "learning_rate": 4.542372881355932e-07,
20579
+ "loss": 1.3071,
20580
+ "step": 29330
20581
+ },
20582
+ {
20583
+ "epoch": 0.73,
20584
+ "grad_norm": 19.75,
20585
+ "learning_rate": 4.4745762711864415e-07,
20586
+ "loss": 1.4213,
20587
+ "step": 29340
20588
+ },
20589
+ {
20590
+ "epoch": 0.73,
20591
+ "grad_norm": 11.625,
20592
+ "learning_rate": 4.4067796610169497e-07,
20593
+ "loss": 1.2687,
20594
+ "step": 29350
20595
+ },
20596
+ {
20597
+ "epoch": 0.73,
20598
+ "grad_norm": 32.25,
20599
+ "learning_rate": 4.3389830508474584e-07,
20600
+ "loss": 1.2956,
20601
+ "step": 29360
20602
+ },
20603
+ {
20604
+ "epoch": 0.73,
20605
+ "grad_norm": 12.3125,
20606
+ "learning_rate": 4.2711864406779666e-07,
20607
+ "loss": 1.3848,
20608
+ "step": 29370
20609
+ },
20610
+ {
20611
+ "epoch": 0.73,
20612
+ "grad_norm": 40.5,
20613
+ "learning_rate": 4.2033898305084753e-07,
20614
+ "loss": 1.3565,
20615
+ "step": 29380
20616
+ },
20617
+ {
20618
+ "epoch": 0.73,
20619
+ "grad_norm": 52.5,
20620
+ "learning_rate": 4.1355932203389835e-07,
20621
+ "loss": 1.407,
20622
+ "step": 29390
20623
+ },
20624
+ {
20625
+ "epoch": 0.73,
20626
+ "grad_norm": 25.25,
20627
+ "learning_rate": 4.0677966101694916e-07,
20628
+ "loss": 1.3811,
20629
+ "step": 29400
20630
+ },
20631
+ {
20632
+ "epoch": 0.74,
20633
+ "grad_norm": 11.1875,
20634
+ "learning_rate": 4.0000000000000003e-07,
20635
+ "loss": 1.3421,
20636
+ "step": 29410
20637
+ },
20638
+ {
20639
+ "epoch": 0.74,
20640
+ "grad_norm": 14.625,
20641
+ "learning_rate": 3.9322033898305085e-07,
20642
+ "loss": 1.3286,
20643
+ "step": 29420
20644
+ },
20645
+ {
20646
+ "epoch": 0.74,
20647
+ "grad_norm": 24.5,
20648
+ "learning_rate": 3.864406779661017e-07,
20649
+ "loss": 1.3828,
20650
+ "step": 29430
20651
+ },
20652
+ {
20653
+ "epoch": 0.74,
20654
+ "grad_norm": 24.25,
20655
+ "learning_rate": 3.7966101694915254e-07,
20656
+ "loss": 1.255,
20657
+ "step": 29440
20658
+ },
20659
+ {
20660
+ "epoch": 0.74,
20661
+ "grad_norm": 13.8125,
20662
+ "learning_rate": 3.7288135593220347e-07,
20663
+ "loss": 1.2879,
20664
+ "step": 29450
20665
+ },
20666
+ {
20667
+ "epoch": 0.74,
20668
+ "grad_norm": 26.375,
20669
+ "learning_rate": 3.6610169491525423e-07,
20670
+ "loss": 1.2621,
20671
+ "step": 29460
20672
+ },
20673
+ {
20674
+ "epoch": 0.74,
20675
+ "grad_norm": 10.0,
20676
+ "learning_rate": 3.5932203389830516e-07,
20677
+ "loss": 1.5059,
20678
+ "step": 29470
20679
+ },
20680
+ {
20681
+ "epoch": 0.74,
20682
+ "grad_norm": 40.75,
20683
+ "learning_rate": 3.52542372881356e-07,
20684
+ "loss": 1.332,
20685
+ "step": 29480
20686
+ },
20687
+ {
20688
+ "epoch": 0.74,
20689
+ "grad_norm": 24.375,
20690
+ "learning_rate": 3.4576271186440684e-07,
20691
+ "loss": 1.3809,
20692
+ "step": 29490
20693
+ },
20694
+ {
20695
+ "epoch": 0.74,
20696
+ "grad_norm": 37.0,
20697
+ "learning_rate": 3.3898305084745766e-07,
20698
+ "loss": 1.3389,
20699
+ "step": 29500
20700
+ },
20701
+ {
20702
+ "epoch": 0.74,
20703
+ "grad_norm": 30.0,
20704
+ "learning_rate": 3.322033898305085e-07,
20705
+ "loss": 1.2023,
20706
+ "step": 29510
20707
+ },
20708
+ {
20709
+ "epoch": 0.74,
20710
+ "grad_norm": 16.625,
20711
+ "learning_rate": 3.2542372881355935e-07,
20712
+ "loss": 1.3672,
20713
+ "step": 29520
20714
+ },
20715
+ {
20716
+ "epoch": 0.74,
20717
+ "grad_norm": 28.5,
20718
+ "learning_rate": 3.1864406779661017e-07,
20719
+ "loss": 1.5764,
20720
+ "step": 29530
20721
+ },
20722
+ {
20723
+ "epoch": 0.74,
20724
+ "grad_norm": 14.625,
20725
+ "learning_rate": 3.1186440677966104e-07,
20726
+ "loss": 1.5877,
20727
+ "step": 29540
20728
+ },
20729
+ {
20730
+ "epoch": 0.74,
20731
+ "grad_norm": 22.875,
20732
+ "learning_rate": 3.050847457627119e-07,
20733
+ "loss": 1.4373,
20734
+ "step": 29550
20735
+ },
20736
+ {
20737
+ "epoch": 0.74,
20738
+ "grad_norm": 25.875,
20739
+ "learning_rate": 2.9830508474576273e-07,
20740
+ "loss": 1.3222,
20741
+ "step": 29560
20742
+ },
20743
+ {
20744
+ "epoch": 0.74,
20745
+ "grad_norm": 38.5,
20746
+ "learning_rate": 2.9152542372881355e-07,
20747
+ "loss": 1.2743,
20748
+ "step": 29570
20749
+ },
20750
+ {
20751
+ "epoch": 0.74,
20752
+ "grad_norm": 46.75,
20753
+ "learning_rate": 2.847457627118644e-07,
20754
+ "loss": 1.3351,
20755
+ "step": 29580
20756
+ },
20757
+ {
20758
+ "epoch": 0.74,
20759
+ "grad_norm": 55.75,
20760
+ "learning_rate": 2.779661016949153e-07,
20761
+ "loss": 1.1375,
20762
+ "step": 29590
20763
+ },
20764
+ {
20765
+ "epoch": 0.74,
20766
+ "grad_norm": 19.0,
20767
+ "learning_rate": 2.711864406779661e-07,
20768
+ "loss": 1.1009,
20769
+ "step": 29600
20770
+ },
20771
+ {
20772
+ "epoch": 0.74,
20773
+ "grad_norm": 9.9375,
20774
+ "learning_rate": 2.64406779661017e-07,
20775
+ "loss": 1.1769,
20776
+ "step": 29610
20777
+ },
20778
+ {
20779
+ "epoch": 0.74,
20780
+ "grad_norm": 43.5,
20781
+ "learning_rate": 2.576271186440678e-07,
20782
+ "loss": 1.2881,
20783
+ "step": 29620
20784
+ },
20785
+ {
20786
+ "epoch": 0.74,
20787
+ "grad_norm": 23.5,
20788
+ "learning_rate": 2.5084745762711867e-07,
20789
+ "loss": 1.3584,
20790
+ "step": 29630
20791
+ },
20792
+ {
20793
+ "epoch": 0.74,
20794
+ "grad_norm": 29.875,
20795
+ "learning_rate": 2.4406779661016954e-07,
20796
+ "loss": 1.4624,
20797
+ "step": 29640
20798
+ },
20799
+ {
20800
+ "epoch": 0.74,
20801
+ "grad_norm": 14.25,
20802
+ "learning_rate": 2.3728813559322036e-07,
20803
+ "loss": 1.6026,
20804
+ "step": 29650
20805
+ },
20806
+ {
20807
+ "epoch": 0.74,
20808
+ "grad_norm": 71.5,
20809
+ "learning_rate": 2.305084745762712e-07,
20810
+ "loss": 1.1366,
20811
+ "step": 29660
20812
+ },
20813
+ {
20814
+ "epoch": 0.74,
20815
+ "grad_norm": 42.0,
20816
+ "learning_rate": 2.2372881355932207e-07,
20817
+ "loss": 1.378,
20818
+ "step": 29670
20819
+ },
20820
+ {
20821
+ "epoch": 0.74,
20822
+ "grad_norm": 34.5,
20823
+ "learning_rate": 2.1694915254237292e-07,
20824
+ "loss": 1.2709,
20825
+ "step": 29680
20826
+ },
20827
+ {
20828
+ "epoch": 0.74,
20829
+ "grad_norm": 30.25,
20830
+ "learning_rate": 2.1016949152542376e-07,
20831
+ "loss": 1.4227,
20832
+ "step": 29690
20833
+ },
20834
+ {
20835
+ "epoch": 0.74,
20836
+ "grad_norm": 23.0,
20837
+ "learning_rate": 2.0338983050847458e-07,
20838
+ "loss": 1.3385,
20839
+ "step": 29700
20840
+ },
20841
+ {
20842
+ "epoch": 0.74,
20843
+ "grad_norm": 18.875,
20844
+ "learning_rate": 1.9661016949152543e-07,
20845
+ "loss": 1.0732,
20846
+ "step": 29710
20847
+ },
20848
+ {
20849
+ "epoch": 0.74,
20850
+ "grad_norm": 12.5625,
20851
+ "learning_rate": 1.8983050847457627e-07,
20852
+ "loss": 1.4202,
20853
+ "step": 29720
20854
+ },
20855
+ {
20856
+ "epoch": 0.74,
20857
+ "grad_norm": 33.75,
20858
+ "learning_rate": 1.8305084745762712e-07,
20859
+ "loss": 1.1954,
20860
+ "step": 29730
20861
+ },
20862
+ {
20863
+ "epoch": 0.74,
20864
+ "grad_norm": 31.875,
20865
+ "learning_rate": 1.76271186440678e-07,
20866
+ "loss": 1.2623,
20867
+ "step": 29740
20868
+ },
20869
+ {
20870
+ "epoch": 0.74,
20871
+ "grad_norm": 18.375,
20872
+ "learning_rate": 1.6949152542372883e-07,
20873
+ "loss": 1.4002,
20874
+ "step": 29750
20875
+ },
20876
+ {
20877
+ "epoch": 0.74,
20878
+ "grad_norm": 37.5,
20879
+ "learning_rate": 1.6271186440677968e-07,
20880
+ "loss": 1.3116,
20881
+ "step": 29760
20882
+ },
20883
+ {
20884
+ "epoch": 0.74,
20885
+ "grad_norm": 21.0,
20886
+ "learning_rate": 1.5593220338983052e-07,
20887
+ "loss": 1.4163,
20888
+ "step": 29770
20889
+ },
20890
+ {
20891
+ "epoch": 0.74,
20892
+ "grad_norm": 41.75,
20893
+ "learning_rate": 1.4915254237288137e-07,
20894
+ "loss": 1.3354,
20895
+ "step": 29780
20896
+ },
20897
+ {
20898
+ "epoch": 0.74,
20899
+ "grad_norm": 48.75,
20900
+ "learning_rate": 1.423728813559322e-07,
20901
+ "loss": 1.36,
20902
+ "step": 29790
20903
+ },
20904
+ {
20905
+ "epoch": 0.74,
20906
+ "grad_norm": 15.6875,
20907
+ "learning_rate": 1.3559322033898305e-07,
20908
+ "loss": 1.3334,
20909
+ "step": 29800
20910
+ },
20911
+ {
20912
+ "epoch": 0.75,
20913
+ "grad_norm": 60.75,
20914
+ "learning_rate": 1.288135593220339e-07,
20915
+ "loss": 1.3058,
20916
+ "step": 29810
20917
+ },
20918
+ {
20919
+ "epoch": 0.75,
20920
+ "grad_norm": 24.625,
20921
+ "learning_rate": 1.2203389830508477e-07,
20922
+ "loss": 1.3963,
20923
+ "step": 29820
20924
+ },
20925
+ {
20926
+ "epoch": 0.75,
20927
+ "grad_norm": 15.3125,
20928
+ "learning_rate": 1.152542372881356e-07,
20929
+ "loss": 1.3757,
20930
+ "step": 29830
20931
+ },
20932
+ {
20933
+ "epoch": 0.75,
20934
+ "grad_norm": 18.375,
20935
+ "learning_rate": 1.0847457627118646e-07,
20936
+ "loss": 1.4074,
20937
+ "step": 29840
20938
+ },
20939
+ {
20940
+ "epoch": 0.75,
20941
+ "grad_norm": 17.0,
20942
+ "learning_rate": 1.0169491525423729e-07,
20943
+ "loss": 1.502,
20944
+ "step": 29850
20945
+ },
20946
+ {
20947
+ "epoch": 0.75,
20948
+ "grad_norm": 67.5,
20949
+ "learning_rate": 9.491525423728814e-08,
20950
+ "loss": 1.4699,
20951
+ "step": 29860
20952
+ },
20953
+ {
20954
+ "epoch": 0.75,
20955
+ "grad_norm": 15.875,
20956
+ "learning_rate": 8.8135593220339e-08,
20957
+ "loss": 1.3926,
20958
+ "step": 29870
20959
+ },
20960
+ {
20961
+ "epoch": 0.75,
20962
+ "grad_norm": 37.75,
20963
+ "learning_rate": 8.135593220338984e-08,
20964
+ "loss": 1.411,
20965
+ "step": 29880
20966
+ },
20967
+ {
20968
+ "epoch": 0.75,
20969
+ "grad_norm": 21.375,
20970
+ "learning_rate": 7.457627118644068e-08,
20971
+ "loss": 1.3486,
20972
+ "step": 29890
20973
+ },
20974
+ {
20975
+ "epoch": 0.75,
20976
+ "grad_norm": 47.0,
20977
+ "learning_rate": 6.779661016949153e-08,
20978
+ "loss": 1.4179,
20979
+ "step": 29900
20980
+ },
20981
+ {
20982
+ "epoch": 0.75,
20983
+ "grad_norm": 17.0,
20984
+ "learning_rate": 6.101694915254239e-08,
20985
+ "loss": 1.2906,
20986
+ "step": 29910
20987
+ },
20988
+ {
20989
+ "epoch": 0.75,
20990
+ "grad_norm": 11.75,
20991
+ "learning_rate": 5.423728813559323e-08,
20992
+ "loss": 1.3449,
20993
+ "step": 29920
20994
+ },
20995
+ {
20996
+ "epoch": 0.75,
20997
+ "grad_norm": 17.375,
20998
+ "learning_rate": 4.745762711864407e-08,
20999
+ "loss": 1.4177,
21000
+ "step": 29930
21001
+ },
21002
+ {
21003
+ "epoch": 0.75,
21004
+ "grad_norm": 18.0,
21005
+ "learning_rate": 4.067796610169492e-08,
21006
+ "loss": 1.4116,
21007
+ "step": 29940
21008
+ },
21009
+ {
21010
+ "epoch": 0.75,
21011
+ "grad_norm": 18.0,
21012
+ "learning_rate": 3.3898305084745764e-08,
21013
+ "loss": 1.1822,
21014
+ "step": 29950
21015
+ },
21016
+ {
21017
+ "epoch": 0.75,
21018
+ "grad_norm": 16.375,
21019
+ "learning_rate": 2.7118644067796615e-08,
21020
+ "loss": 1.2627,
21021
+ "step": 29960
21022
+ },
21023
+ {
21024
+ "epoch": 0.75,
21025
+ "grad_norm": 23.5,
21026
+ "learning_rate": 2.033898305084746e-08,
21027
+ "loss": 1.3393,
21028
+ "step": 29970
21029
+ },
21030
+ {
21031
+ "epoch": 0.75,
21032
+ "grad_norm": 8.9375,
21033
+ "learning_rate": 1.3559322033898307e-08,
21034
+ "loss": 1.3589,
21035
+ "step": 29980
21036
+ },
21037
+ {
21038
+ "epoch": 0.75,
21039
+ "grad_norm": 36.25,
21040
+ "learning_rate": 6.779661016949154e-09,
21041
+ "loss": 1.3054,
21042
+ "step": 29990
21043
+ },
21044
+ {
21045
+ "epoch": 0.75,
21046
+ "grad_norm": 21.25,
21047
+ "learning_rate": 0.0,
21048
+ "loss": 1.3149,
21049
+ "step": 30000
21050
+ },
21051
+ {
21052
+ "epoch": 0.75,
21053
+ "eval_loss": 1.3585624694824219,
21054
+ "eval_runtime": 59.2167,
21055
+ "eval_samples_per_second": 16.887,
21056
+ "eval_steps_per_second": 16.887,
21057
+ "step": 30000
21058
  }
21059
  ],
21060
  "logging_steps": 10,
 
21062
  "num_input_tokens_seen": 0,
21063
  "num_train_epochs": 1,
21064
  "save_steps": 5000,
21065
+ "total_flos": 4.7201094991872e+17,
21066
  "train_batch_size": 1,
21067
  "trial_name": null,
21068
  "trial_params": null